summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_addr.c6
-rw-r--r--fs/9p/vfs_dir.c72
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Kconfig.binfmt14
-rw-r--r--fs/Makefile11
-rw-r--r--fs/adfs/dir.c48
-rw-r--r--fs/affs/dir.c69
-rw-r--r--fs/affs/namei.c26
-rw-r--r--fs/afs/dir.c99
-rw-r--r--fs/afs/file.c10
-rw-r--r--fs/afs/flock.c7
-rw-r--r--fs/afs/proc.c8
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/aio.c1581
-rw-r--r--fs/autofs4/expire.c17
-rw-r--r--fs/autofs4/root.c8
-rw-r--r--fs/bad_inode.c4
-rw-r--r--fs/befs/btree.c3
-rw-r--r--fs/befs/linuxvfs.c40
-rw-r--r--fs/bfs/dir.c35
-rw-r--r--fs/binfmt_aout.c28
-rw-r--r--fs/binfmt_elf.c11
-rw-r--r--fs/binfmt_elf_fdpic.c17
-rw-r--r--fs/binfmt_flat.c37
-rw-r--r--fs/binfmt_misc.c24
-rw-r--r--fs/bio-integrity.c144
-rw-r--r--fs/bio.c367
-rw-r--r--fs/block_dev.c51
-rw-r--r--fs/btrfs/Kconfig22
-rw-r--r--fs/btrfs/backref.c160
-rw-r--r--fs/btrfs/backref.h5
-rw-r--r--fs/btrfs/btrfs_inode.h2
-rw-r--r--fs/btrfs/check-integrity.c2
-rw-r--r--fs/btrfs/compression.c14
-rw-r--r--fs/btrfs/compression.h2
-rw-r--r--fs/btrfs/ctree.c492
-rw-r--r--fs/btrfs/ctree.h258
-rw-r--r--fs/btrfs/delayed-inode.c89
-rw-r--r--fs/btrfs/delayed-inode.h3
-rw-r--r--fs/btrfs/delayed-ref.c30
-rw-r--r--fs/btrfs/delayed-ref.h1
-rw-r--r--fs/btrfs/dev-replace.c11
-rw-r--r--fs/btrfs/dir-item.c11
-rw-r--r--fs/btrfs/disk-io.c889
-rw-r--r--fs/btrfs/disk-io.h37
-rw-r--r--fs/btrfs/export.c5
-rw-r--r--fs/btrfs/extent-tree.c926
-rw-r--r--fs/btrfs/extent_io.c498
-rw-r--r--fs/btrfs/extent_io.h47
-rw-r--r--fs/btrfs/extent_map.c23
-rw-r--r--fs/btrfs/extent_map.h3
-rw-r--r--fs/btrfs/file-item.c244
-rw-r--r--fs/btrfs/file.c206
-rw-r--r--fs/btrfs/free-space-cache.c642
-rw-r--r--fs/btrfs/free-space-cache.h5
-rw-r--r--fs/btrfs/inode-item.c17
-rw-r--r--fs/btrfs/inode-map.c8
-rw-r--r--fs/btrfs/inode.c793
-rw-r--r--fs/btrfs/ioctl.c196
-rw-r--r--fs/btrfs/locking.c4
-rw-r--r--fs/btrfs/lzo.c4
-rw-r--r--fs/btrfs/ordered-data.c142
-rw-r--r--fs/btrfs/ordered-data.h30
-rw-r--r--fs/btrfs/print-tree.c9
-rw-r--r--fs/btrfs/print-tree.h2
-rw-r--r--fs/btrfs/qgroup.c965
-rw-r--r--fs/btrfs/raid56.c16
-rw-r--r--fs/btrfs/reada.c5
-rw-r--r--fs/btrfs/relocation.c217
-rw-r--r--fs/btrfs/root-tree.c206
-rw-r--r--fs/btrfs/scrub.c230
-rw-r--r--fs/btrfs/send.c267
-rw-r--r--fs/btrfs/send.h1
-rw-r--r--fs/btrfs/super.c127
-rw-r--r--fs/btrfs/transaction.c399
-rw-r--r--fs/btrfs/transaction.h53
-rw-r--r--fs/btrfs/tree-log.c425
-rw-r--r--fs/btrfs/tree-log.h3
-rw-r--r--fs/btrfs/ulist.c73
-rw-r--r--fs/btrfs/ulist.h6
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/volumes.c504
-rw-r--r--fs/btrfs/volumes.h40
-rw-r--r--fs/btrfs/xattr.c4
-rw-r--r--fs/buffer.c72
-rw-r--r--fs/cachefiles/interface.c13
-rw-r--r--fs/cachefiles/namei.c10
-rw-r--r--fs/cachefiles/rdwr.c32
-rw-r--r--fs/cachefiles/xattr.c6
-rw-r--r--fs/ceph/addr.c271
-rw-r--r--fs/ceph/caps.c133
-rw-r--r--fs/ceph/dir.c164
-rw-r--r--fs/ceph/file.c249
-rw-r--r--fs/ceph/inode.c49
-rw-r--r--fs/ceph/ioctl.c5
-rw-r--r--fs/ceph/locks.c75
-rw-r--r--fs/ceph/mds_client.c152
-rw-r--r--fs/ceph/mdsmap.c50
-rw-r--r--fs/ceph/snap.c3
-rw-r--r--fs/ceph/super.c9
-rw-r--r--fs/ceph/super.h74
-rw-r--r--fs/ceph/xattr.c9
-rw-r--r--fs/cifs/Kconfig1
-rw-r--r--fs/cifs/asn1.c40
-rw-r--r--fs/cifs/cache.c6
-rw-r--r--fs/cifs/cifs_debug.c113
-rw-r--r--fs/cifs/cifs_debug.h70
-rw-r--r--fs/cifs/cifs_dfs_ref.c178
-rw-r--r--fs/cifs/cifs_spnego.c5
-rw-r--r--fs/cifs/cifs_unicode.c4
-rw-r--r--fs/cifs/cifs_unicode.h8
-rw-r--r--fs/cifs/cifsacl.c130
-rw-r--r--fs/cifs/cifsencrypt.c146
-rw-r--r--fs/cifs/cifsfs.c54
-rw-r--r--fs/cifs/cifsfs.h4
-rw-r--r--fs/cifs/cifsglob.h48
-rw-r--r--fs/cifs/cifspdu.h17
-rw-r--r--fs/cifs/cifsproto.h14
-rw-r--r--fs/cifs/cifssmb.c879
-rw-r--r--fs/cifs/connect.c548
-rw-r--r--fs/cifs/dir.c57
-rw-r--r--fs/cifs/dns_resolve.c21
-rw-r--r--fs/cifs/export.c2
-rw-r--r--fs/cifs/file.c189
-rw-r--r--fs/cifs/fscache.c58
-rw-r--r--fs/cifs/inode.c112
-rw-r--r--fs/cifs/ioctl.c6
-rw-r--r--fs/cifs/link.c24
-rw-r--r--fs/cifs/misc.c81
-rw-r--r--fs/cifs/netmisc.c16
-rw-r--r--fs/cifs/readdir.c260
-rw-r--r--fs/cifs/sess.c172
-rw-r--r--fs/cifs/smb1ops.c73
-rw-r--r--fs/cifs/smb2file.c8
-rw-r--r--fs/cifs/smb2glob.h2
-rw-r--r--fs/cifs/smb2inode.c2
-rw-r--r--fs/cifs/smb2maperror.c2
-rw-r--r--fs/cifs/smb2misc.c100
-rw-r--r--fs/cifs/smb2ops.c64
-rw-r--r--fs/cifs/smb2pdu.c423
-rw-r--r--fs/cifs/smb2pdu.h100
-rw-r--r--fs/cifs/smb2proto.h4
-rw-r--r--fs/cifs/smb2transport.c175
-rw-r--r--fs/cifs/smbencrypt.c17
-rw-r--r--fs/cifs/smbfsctl.h27
-rw-r--r--fs/cifs/transport.c90
-rw-r--r--fs/cifs/xattr.c54
-rw-r--r--fs/coda/dir.c76
-rw-r--r--fs/coda/file.c2
-rw-r--r--fs/compat.c351
-rw-r--r--fs/compat_ioctl.c4
-rw-r--r--fs/configfs/dir.c124
-rw-r--r--fs/configfs/file.c2
-rw-r--r--fs/coredump.c212
-rw-r--r--fs/cramfs/inode.c21
-rw-r--r--fs/dcache.c115
-rw-r--r--fs/dcookies.c15
-rw-r--r--fs/debugfs/file.c43
-rw-r--r--fs/direct-io.c40
-rw-r--r--fs/dlm/config.c5
-rw-r--r--fs/dlm/lock.c8
-rw-r--r--fs/dlm/lockspace.c9
-rw-r--r--fs/dlm/lowcomms.c177
-rw-r--r--fs/dlm/plock.c18
-rw-r--r--fs/ecryptfs/crypto.c387
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h5
-rw-r--r--fs/ecryptfs/file.c58
-rw-r--r--fs/ecryptfs/inode.c4
-rw-r--r--fs/ecryptfs/main.c7
-rw-r--r--fs/ecryptfs/messaging.c3
-rw-r--r--fs/ecryptfs/read_write.c9
-rw-r--r--fs/efivarfs/Kconfig12
-rw-r--r--fs/efivarfs/Makefile7
-rw-r--r--fs/efivarfs/file.c116
-rw-r--r--fs/efivarfs/inode.c174
-rw-r--r--fs/efivarfs/internal.h22
-rw-r--r--fs/efivarfs/super.c269
-rw-r--r--fs/efs/dir.c75
-rw-r--r--fs/eventpoll.c185
-rw-r--r--fs/exec.c59
-rw-r--r--fs/exofs/dir.c38
-rw-r--r--fs/exofs/inode.c6
-rw-r--r--fs/exofs/ore.c2
-rw-r--r--fs/exofs/ore_raid.c2
-rw-r--r--fs/exportfs/expfs.c14
-rw-r--r--fs/ext2/dir.c27
-rw-r--r--fs/ext2/inode.c1
-rw-r--r--fs/ext2/namei.c24
-rw-r--r--fs/ext3/dir.c157
-rw-r--r--fs/ext3/fsync.c8
-rw-r--r--fs/ext3/inode.c14
-rw-r--r--fs/ext3/namei.c54
-rw-r--r--fs/ext3/super.c25
-rw-r--r--fs/ext4/Kconfig3
-rw-r--r--fs/ext4/balloc.c67
-rw-r--r--fs/ext4/dir.c176
-rw-r--r--fs/ext4/ext4.h282
-rw-r--r--fs/ext4/ext4_extents.h5
-rw-r--r--fs/ext4/ext4_jbd2.c64
-rw-r--r--fs/ext4/ext4_jbd2.h41
-rw-r--r--fs/ext4/extents.c724
-rw-r--r--fs/ext4/extents_status.c92
-rw-r--r--fs/ext4/extents_status.h8
-rw-r--r--fs/ext4/file.c43
-rw-r--r--fs/ext4/fsync.c55
-rw-r--r--fs/ext4/ialloc.c91
-rw-r--r--fs/ext4/indirect.c514
-rw-r--r--fs/ext4/inline.c288
-rw-r--r--fs/ext4/inode.c2043
-rw-r--r--fs/ext4/ioctl.c218
-rw-r--r--fs/ext4/mballoc.c282
-rw-r--r--fs/ext4/migrate.c62
-rw-r--r--fs/ext4/mmp.c6
-rw-r--r--fs/ext4/move_extent.c76
-rw-r--r--fs/ext4/namei.c102
-rw-r--r--fs/ext4/page-io.c413
-rw-r--r--fs/ext4/resize.c40
-rw-r--r--fs/ext4/super.c295
-rw-r--r--fs/ext4/xattr.c13
-rw-r--r--fs/ext4/xattr.h1
-rw-r--r--fs/f2fs/Kconfig12
-rw-r--r--fs/f2fs/acl.c4
-rw-r--r--fs/f2fs/checkpoint.c162
-rw-r--r--fs/f2fs/data.c272
-rw-r--r--fs/f2fs/debug.c14
-rw-r--r--fs/f2fs/dir.c257
-rw-r--r--fs/f2fs/f2fs.h153
-rw-r--r--fs/f2fs/file.c156
-rw-r--r--fs/f2fs/gc.c165
-rw-r--r--fs/f2fs/gc.h12
-rw-r--r--fs/f2fs/inode.c81
-rw-r--r--fs/f2fs/namei.c81
-rw-r--r--fs/f2fs/node.c448
-rw-r--r--fs/f2fs/node.h72
-rw-r--r--fs/f2fs/recovery.c199
-rw-r--r--fs/f2fs/segment.c184
-rw-r--r--fs/f2fs/segment.h41
-rw-r--r--fs/f2fs/super.c320
-rw-r--r--fs/f2fs/xattr.c94
-rw-r--r--fs/f2fs/xattr.h24
-rw-r--r--fs/fat/dir.c127
-rw-r--r--fs/fat/fat.h41
-rw-r--r--fs/fat/file.c13
-rw-r--r--fs/fat/inode.c102
-rw-r--r--fs/fat/misc.c5
-rw-r--r--fs/fat/namei_msdos.c6
-rw-r--r--fs/fat/namei_vfat.c12
-rw-r--r--fs/fat/nfs.c221
-rw-r--r--fs/fifo.c153
-rw-r--r--fs/file.c68
-rw-r--r--fs/file_table.c21
-rw-r--r--fs/freevxfs/vxfs_lookup.c55
-rw-r--r--fs/fs-writeback.c120
-rw-r--r--fs/fscache/cache.c34
-rw-r--r--fs/fscache/cookie.c93
-rw-r--r--fs/fscache/fsdef.c1
-rw-r--r--fs/fscache/internal.h11
-rw-r--r--fs/fscache/main.c11
-rw-r--r--fs/fscache/netfs.c1
-rw-r--r--fs/fscache/object-list.c103
-rw-r--r--fs/fscache/object.c1106
-rw-r--r--fs/fscache/operation.c37
-rw-r--r--fs/fscache/page.c65
-rw-r--r--fs/fscache/stats.c2
-rw-r--r--fs/fuse/cuse.c11
-rw-r--r--fs/fuse/dev.c83
-rw-r--r--fs/fuse/dir.c66
-rw-r--r--fs/fuse/file.c326
-rw-r--r--fs/fuse/fuse_i.h36
-rw-r--r--fs/fuse/inode.c21
-rw-r--r--fs/gfs2/Kconfig5
-rw-r--r--fs/gfs2/aops.c20
-rw-r--r--fs/gfs2/bmap.c23
-rw-r--r--fs/gfs2/dentry.c3
-rw-r--r--fs/gfs2/dir.c123
-rw-r--r--fs/gfs2/dir.h7
-rw-r--r--fs/gfs2/export.c10
-rw-r--r--fs/gfs2/file.c114
-rw-r--r--fs/gfs2/glock.c32
-rw-r--r--fs/gfs2/glock.h1
-rw-r--r--fs/gfs2/glops.c12
-rw-r--r--fs/gfs2/incore.h19
-rw-r--r--fs/gfs2/inode.c333
-rw-r--r--fs/gfs2/inode.h1
-rw-r--r--fs/gfs2/log.c180
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/lops.c62
-rw-r--r--fs/gfs2/lops.h6
-rw-r--r--fs/gfs2/meta_io.c6
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/quota.c11
-rw-r--r--fs/gfs2/rgrp.c51
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/gfs2/super.c8
-rw-r--r--fs/gfs2/trace_gfs2.h11
-rw-r--r--fs/gfs2/trans.c16
-rw-r--r--fs/hfs/bfind.c10
-rw-r--r--fs/hfs/bitmap.c4
-rw-r--r--fs/hfs/bnode.c45
-rw-r--r--fs/hfs/brec.c19
-rw-r--r--fs/hfs/btree.c31
-rw-r--r--fs/hfs/catalog.c24
-rw-r--r--fs/hfs/dir.c71
-rw-r--r--fs/hfs/extent.c68
-rw-r--r--fs/hfs/hfs_fs.h29
-rw-r--r--fs/hfs/inode.c16
-rw-r--r--fs/hfs/mdb.c23
-rw-r--r--fs/hfs/string.c6
-rw-r--r--fs/hfs/super.c47
-rw-r--r--fs/hfsplus/attributes.c26
-rw-r--r--fs/hfsplus/bfind.c14
-rw-r--r--fs/hfsplus/bitmap.c13
-rw-r--r--fs/hfsplus/bnode.c36
-rw-r--r--fs/hfsplus/brec.c14
-rw-r--r--fs/hfsplus/btree.c29
-rw-r--r--fs/hfsplus/catalog.c11
-rw-r--r--fs/hfsplus/dir.c64
-rw-r--r--fs/hfsplus/extents.c53
-rw-r--r--fs/hfsplus/hfsplus_fs.h27
-rw-r--r--fs/hfsplus/inode.c5
-rw-r--r--fs/hfsplus/options.c22
-rw-r--r--fs/hfsplus/super.c56
-rw-r--r--fs/hfsplus/unicode.c7
-rw-r--r--fs/hfsplus/wrapper.c8
-rw-r--r--fs/hfsplus/xattr.c41
-rw-r--r--fs/hostfs/hostfs_kern.c21
-rw-r--r--fs/hpfs/buffer.c33
-rw-r--r--fs/hpfs/dentry.c7
-rw-r--r--fs/hpfs/dir.c66
-rw-r--r--fs/hpfs/file.c80
-rw-r--r--fs/hpfs/hpfs_fn.h7
-rw-r--r--fs/hpfs/map.c22
-rw-r--r--fs/hpfs/super.c17
-rw-r--r--fs/hppfs/hppfs.c64
-rw-r--r--fs/hugetlbfs/inode.c24
-rw-r--r--fs/inode.c6
-rw-r--r--fs/internal.h17
-rw-r--r--fs/isofs/dir.c42
-rw-r--r--fs/isofs/inode.c48
-rw-r--r--fs/isofs/namei.c3
-rw-r--r--fs/jbd/commit.c25
-rw-r--r--fs/jbd/journal.c21
-rw-r--r--fs/jbd/transaction.c20
-rw-r--r--fs/jbd2/Kconfig6
-rw-r--r--fs/jbd2/checkpoint.c22
-rw-r--r--fs/jbd2/commit.c234
-rw-r--r--fs/jbd2/journal.c201
-rw-r--r--fs/jbd2/recovery.c11
-rw-r--r--fs/jbd2/revoke.c49
-rw-r--r--fs/jbd2/transaction.c535
-rw-r--r--fs/jffs2/dir.c52
-rw-r--r--fs/jfs/inode.c3
-rw-r--r--fs/jfs/jfs_dmap.c70
-rw-r--r--fs/jfs/jfs_dtree.c100
-rw-r--r--fs/jfs/jfs_dtree.h2
-rw-r--r--fs/jfs/jfs_extent.c2
-rw-r--r--fs/jfs/jfs_imap.c71
-rw-r--r--fs/jfs/jfs_logmgr.c13
-rw-r--r--fs/jfs/jfs_metapage.c10
-rw-r--r--fs/jfs/jfs_superblock.h1
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/jfs_xtree.c62
-rw-r--r--fs/jfs/namei.c11
-rw-r--r--fs/jfs/resize.c2
-rw-r--r--fs/jfs/super.c54
-rw-r--r--fs/jfs/xattr.c8
-rw-r--r--fs/libfs.c80
-rw-r--r--fs/lockd/clntlock.c3
-rw-r--r--fs/lockd/clntproc.c3
-rw-r--r--fs/lockd/svc.c2
-rw-r--r--fs/lockd/svclock.c14
-rw-r--r--fs/lockd/svcsubs.c12
-rw-r--r--fs/locks.c328
-rw-r--r--fs/logfs/dev_bdev.c5
-rw-r--r--fs/logfs/dir.c49
-rw-r--r--fs/logfs/file.c3
-rw-r--r--fs/logfs/segment.c3
-rw-r--r--fs/minix/dir.c42
-rw-r--r--fs/minix/namei.c13
-rw-r--r--fs/mount.h7
-rw-r--r--fs/namei.c119
-rw-r--r--fs/namespace.c348
-rw-r--r--fs/ncpfs/dir.c132
-rw-r--r--fs/ncpfs/inode.c16
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/nfs/Kconfig14
-rw-r--r--fs/nfs/Makefile6
-rw-r--r--fs/nfs/blocklayout/blocklayout.c3
-rw-r--r--fs/nfs/blocklayout/blocklayout.h2
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c4
-rw-r--r--fs/nfs/blocklayout/blocklayoutdm.c8
-rw-r--r--fs/nfs/callback.c9
-rw-r--r--fs/nfs/callback.h3
-rw-r--r--fs/nfs/callback_proc.c7
-rw-r--r--fs/nfs/callback_xdr.c54
-rw-r--r--fs/nfs/client.c6
-rw-r--r--fs/nfs/delegation.c131
-rw-r--r--fs/nfs/delegation.h5
-rw-r--r--fs/nfs/dir.c141
-rw-r--r--fs/nfs/dns_resolve.c32
-rw-r--r--fs/nfs/file.c47
-rw-r--r--fs/nfs/getroot.c2
-rw-r--r--fs/nfs/idmap.c56
-rw-r--r--fs/nfs/inode.c146
-rw-r--r--fs/nfs/internal.h10
-rw-r--r--fs/nfs/mount_clnt.c14
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs3proc.c9
-rw-r--r--fs/nfs/nfs4_fs.h27
-rw-r--r--fs/nfs/nfs4client.c45
-rw-r--r--fs/nfs/nfs4file.c1
-rw-r--r--fs/nfs/nfs4filelayout.c37
-rw-r--r--fs/nfs/nfs4filelayout.h5
-rw-r--r--fs/nfs/nfs4filelayoutdev.c34
-rw-r--r--fs/nfs/nfs4namespace.c43
-rw-r--r--fs/nfs/nfs4proc.c1359
-rw-r--r--fs/nfs/nfs4session.c44
-rw-r--r--fs/nfs/nfs4session.h20
-rw-r--r--fs/nfs/nfs4state.c213
-rw-r--r--fs/nfs/nfs4super.c16
-rw-r--r--fs/nfs/nfs4xdr.c356
-rw-r--r--fs/nfs/objlayout/objio_osd.c2
-rw-r--r--fs/nfs/objlayout/objlayout.c4
-rw-r--r--fs/nfs/objlayout/objlayout.h2
-rw-r--r--fs/nfs/pagelist.c53
-rw-r--r--fs/nfs/pnfs.c44
-rw-r--r--fs/nfs/pnfs.h6
-rw-r--r--fs/nfs/proc.c13
-rw-r--r--fs/nfs/read.c2
-rw-r--r--fs/nfs/super.c182
-rw-r--r--fs/nfs/unlink.c2
-rw-r--r--fs/nfs/write.c2
-rw-r--r--fs/nfsd/Kconfig16
-rw-r--r--fs/nfsd/cache.h1
-rw-r--r--fs/nfsd/netns.h1
-rw-r--r--fs/nfsd/nfs4callback.c33
-rw-r--r--fs/nfsd/nfs4proc.c156
-rw-r--r--fs/nfsd/nfs4recover.c32
-rw-r--r--fs/nfsd/nfs4state.c951
-rw-r--r--fs/nfsd/nfs4xdr.c290
-rw-r--r--fs/nfsd/nfscache.c197
-rw-r--r--fs/nfsd/nfsctl.c17
-rw-r--r--fs/nfsd/nfsd.h26
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/state.h28
-rw-r--r--fs/nfsd/vfs.c43
-rw-r--r--fs/nfsd/vfs.h7
-rw-r--r--fs/nfsd/xdr4.h7
-rw-r--r--fs/nfsd/xdr4cb.h23
-rw-r--r--fs/nilfs2/alloc.c63
-rw-r--r--fs/nilfs2/alloc.h2
-rw-r--r--fs/nilfs2/dir.c48
-rw-r--r--fs/nilfs2/ifile.c22
-rw-r--r--fs/nilfs2/ifile.h2
-rw-r--r--fs/nilfs2/inode.c54
-rw-r--r--fs/nilfs2/mdt.c19
-rw-r--r--fs/nilfs2/page.c70
-rw-r--r--fs/nilfs2/page.h3
-rw-r--r--fs/nilfs2/segment.c4
-rw-r--r--fs/nilfs2/super.c33
-rw-r--r--fs/nilfs2/the_nilfs.c4
-rw-r--r--fs/nilfs2/the_nilfs.h4
-rw-r--r--fs/notify/dnotify/dnotify.c25
-rw-r--r--fs/notify/fanotify/fanotify_user.c118
-rw-r--r--fs/notify/inotify/inotify_user.c30
-rw-r--r--fs/notify/mark.c50
-rw-r--r--fs/ntfs/aops.c2
-rw-r--r--fs/ntfs/dir.c84
-rw-r--r--fs/ntfs/file.c3
-rw-r--r--fs/ntfs/inode.c1
-rw-r--r--fs/ocfs2/alloc.c8
-rw-r--r--fs/ocfs2/aops.c5
-rw-r--r--fs/ocfs2/aops.h2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c19
-rw-r--r--fs/ocfs2/cluster/quorum.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c29
-rw-r--r--fs/ocfs2/dir.c151
-rw-r--r--fs/ocfs2/dir.h5
-rw-r--r--fs/ocfs2/dlm/dlmlock.c1
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c14
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/extent_map.c2
-rw-r--r--fs/ocfs2/file.c27
-rw-r--r--fs/ocfs2/inode.h2
-rw-r--r--fs/ocfs2/ioctl.c22
-rw-r--r--fs/ocfs2/journal.c14
-rw-r--r--fs/ocfs2/journal.h3
-rw-r--r--fs/ocfs2/move_extents.c53
-rw-r--r--fs/ocfs2/namei.c74
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/ocfs2/suballoc.c37
-rw-r--r--fs/ocfs2/super.c6
-rw-r--r--fs/ocfs2/xattr.c18
-rw-r--r--fs/omfs/dir.c94
-rw-r--r--fs/open.c115
-rw-r--r--fs/openpromfs/inode.c95
-rw-r--r--fs/pipe.c459
-rw-r--r--fs/pnode.c13
-rw-r--r--fs/pnode.h7
-rw-r--r--fs/proc/Makefile2
-rw-r--r--fs/proc/base.c614
-rw-r--r--fs/proc/fd.c114
-rw-r--r--fs/proc/fd.h5
-rw-r--r--fs/proc/generic.c477
-rw-r--r--fs/proc/inode.c283
-rw-r--r--fs/proc/internal.h329
-rw-r--r--fs/proc/kcore.c13
-rw-r--r--fs/proc/kmsg.c10
-rw-r--r--fs/proc/meminfo.c1
-rw-r--r--fs/proc/mmu.c60
-rw-r--r--fs/proc/namespaces.c104
-rw-r--r--fs/proc/proc_devtree.c4
-rw-r--r--fs/proc/proc_net.c13
-rw-r--r--fs/proc/proc_sysctl.c78
-rw-r--r--fs/proc/root.c21
-rw-r--r--fs/proc/self.c47
-rw-r--r--fs/proc/stat.c2
-rw-r--r--fs/proc/task_mmu.c145
-rw-r--r--fs/proc/uptime.c3
-rw-r--r--fs/proc/vmcore.c699
-rw-r--r--fs/pstore/ftrace.c2
-rw-r--r--fs/pstore/inode.c11
-rw-r--r--fs/pstore/platform.c21
-rw-r--r--fs/pstore/ram.c33
-rw-r--r--fs/pstore/ram_core.c120
-rw-r--r--fs/qnx4/dir.c66
-rw-r--r--fs/qnx6/dir.c31
-rw-r--r--fs/quota/dquot.c6
-rw-r--r--fs/read_write.c400
-rw-r--r--fs/read_write.h16
-rw-r--r--fs/readdir.c56
-rw-r--r--fs/reiserfs/dir.c38
-rw-r--r--fs/reiserfs/file.c61
-rw-r--r--fs/reiserfs/inode.c22
-rw-r--r--fs/reiserfs/journal.c16
-rw-r--r--fs/reiserfs/procfs.c62
-rw-r--r--fs/reiserfs/reiserfs.h2
-rw-r--r--fs/reiserfs/xattr.c47
-rw-r--r--fs/reiserfs/xattr_acl.c3
-rw-r--r--fs/romfs/mmap-nommu.c5
-rw-r--r--fs/romfs/super.c21
-rw-r--r--fs/select.c66
-rw-r--r--fs/seq_file.c72
-rw-r--r--fs/signalfd.c31
-rw-r--r--fs/splice.c96
-rw-r--r--fs/squashfs/dir.c40
-rw-r--r--fs/sync.c26
-rw-r--r--fs/sysfs/dir.c87
-rw-r--r--fs/sysfs/file.c10
-rw-r--r--fs/sysfs/inode.c2
-rw-r--r--fs/sysv/dir.c37
-rw-r--r--fs/sysv/namei.c3
-rw-r--r--fs/timerfd.c131
-rw-r--r--fs/ubifs/dir.c69
-rw-r--r--fs/ubifs/file.c6
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/udf/dir.c63
-rw-r--r--fs/udf/inode.c1
-rw-r--r--fs/udf/namei.c24
-rw-r--r--fs/ufs/dir.c28
-rw-r--r--fs/ufs/util.c3
-rw-r--r--fs/xfs/Kconfig13
-rw-r--r--fs/xfs/Makefile7
-rw-r--r--fs/xfs/mrlock.h12
-rw-r--r--fs/xfs/xfs.h5
-rw-r--r--fs/xfs/xfs_acl.c31
-rw-r--r--fs/xfs/xfs_acl.h31
-rw-r--r--fs/xfs/xfs_ag.h56
-rw-r--r--fs/xfs/xfs_alloc.c225
-rw-r--r--fs/xfs/xfs_alloc_btree.c109
-rw-r--r--fs/xfs/xfs_alloc_btree.h12
-rw-r--r--fs/xfs/xfs_aops.c46
-rw-r--r--fs/xfs/xfs_attr.c454
-rw-r--r--fs/xfs/xfs_attr.h1
-rw-r--r--fs/xfs/xfs_attr_leaf.c1851
-rw-r--r--fs/xfs/xfs_attr_leaf.h123
-rw-r--r--fs/xfs/xfs_attr_remote.c631
-rw-r--r--fs/xfs/xfs_attr_remote.h56
-rw-r--r--fs/xfs/xfs_bmap.c4314
-rw-r--r--fs/xfs/xfs_bmap_btree.c114
-rw-r--r--fs/xfs/xfs_bmap_btree.h21
-rw-r--r--fs/xfs/xfs_btree.c266
-rw-r--r--fs/xfs/xfs_btree.h66
-rw-r--r--fs/xfs/xfs_buf.c7
-rw-r--r--fs/xfs/xfs_buf_item.c94
-rw-r--r--fs/xfs/xfs_buf_item.h68
-rw-r--r--fs/xfs/xfs_da_btree.c1508
-rw-r--r--fs/xfs/xfs_da_btree.h130
-rw-r--r--fs/xfs/xfs_dfrag.c16
-rw-r--r--fs/xfs/xfs_dinode.h43
-rw-r--r--fs/xfs/xfs_dir2.c13
-rw-r--r--fs/xfs/xfs_dir2_block.c196
-rw-r--r--fs/xfs/xfs_dir2_data.c266
-rw-r--r--fs/xfs/xfs_dir2_format.h280
-rw-r--r--fs/xfs/xfs_dir2_leaf.c921
-rw-r--r--fs/xfs/xfs_dir2_node.c1014
-rw-r--r--fs/xfs/xfs_dir2_priv.h61
-rw-r--r--fs/xfs/xfs_dir2_sf.c43
-rw-r--r--fs/xfs/xfs_dquot.c171
-rw-r--r--fs/xfs/xfs_dquot.h20
-rw-r--r--fs/xfs/xfs_error.c4
-rw-r--r--fs/xfs/xfs_extfree_item.c28
-rw-r--r--fs/xfs/xfs_extfree_item.h14
-rw-r--r--fs/xfs/xfs_file.c24
-rw-r--r--fs/xfs/xfs_fs.h1
-rw-r--r--fs/xfs/xfs_fsops.c40
-rw-r--r--fs/xfs/xfs_ialloc.c173
-rw-r--r--fs/xfs/xfs_ialloc.h8
-rw-r--r--fs/xfs/xfs_ialloc_btree.c91
-rw-r--r--fs/xfs/xfs_ialloc_btree.h9
-rw-r--r--fs/xfs/xfs_icache.c3
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_icreate_item.c195
-rw-r--r--fs/xfs/xfs_icreate_item.h52
-rw-r--r--fs/xfs/xfs_inode.c298
-rw-r--r--fs/xfs/xfs_inode.h31
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_ioctl.c14
-rw-r--r--fs/xfs/xfs_ioctl32.c14
-rw-r--r--fs/xfs/xfs_iomap.c176
-rw-r--r--fs/xfs/xfs_iops.c50
-rw-r--r--fs/xfs/xfs_itable.c5
-rw-r--r--fs/xfs/xfs_linux.h25
-rw-r--r--fs/xfs/xfs_log.c24
-rw-r--r--fs/xfs/xfs_log.h5
-rw-r--r--fs/xfs/xfs_log_cil.c81
-rw-r--r--fs/xfs/xfs_log_priv.h1
-rw-r--r--fs/xfs/xfs_log_recover.c460
-rw-r--r--fs/xfs/xfs_message.c8
-rw-r--r--fs/xfs/xfs_message.h27
-rw-r--r--fs/xfs/xfs_mount.c246
-rw-r--r--fs/xfs/xfs_mount.h6
-rw-r--r--fs/xfs/xfs_qm.c232
-rw-r--r--fs/xfs/xfs_qm.h87
-rw-r--r--fs/xfs/xfs_qm_syscalls.c100
-rw-r--r--fs/xfs/xfs_quota.h60
-rw-r--r--fs/xfs/xfs_quotaops.c6
-rw-r--r--fs/xfs/xfs_sb.h172
-rw-r--r--fs/xfs/xfs_super.c50
-rw-r--r--fs/xfs/xfs_symlink.c767
-rw-r--r--fs/xfs/xfs_symlink.h66
-rw-r--r--fs/xfs/xfs_sysctl.c26
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trace.h44
-rw-r--r--fs/xfs/xfs_trans.c118
-rw-r--r--fs/xfs/xfs_trans.h20
-rw-r--r--fs/xfs/xfs_trans_buf.c97
-rw-r--r--fs/xfs/xfs_trans_dquot.c97
-rw-r--r--fs/xfs/xfs_trans_inode.c11
-rw-r--r--fs/xfs/xfs_vnodeops.c497
-rw-r--r--fs/xfs/xfs_vnodeops.h3
651 files changed, 39615 insertions, 27783 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 0ad61c6a65a5..9ff073f4090a 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -33,6 +33,7 @@
#include <linux/pagemap.h>
#include <linux/idr.h>
#include <linux/sched.h>
+#include <linux/aio.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -147,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
* @offset: offset in the page
*/
-static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+static void v9fs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
/*
* If called with zero offset, we should release
* the private state assocated with the page
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
v9fs_fscache_invalidate_page(page);
}
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index be1e34adc3c6..4d0c2e0be7e5 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -101,16 +101,15 @@ static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
}
/**
- * v9fs_dir_readdir - read a directory
- * @filp: opened file structure
- * @dirent: directory structure ???
- * @filldir: function to populate directory structure ???
+ * v9fs_dir_readdir - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
*
*/
-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
{
- int over;
+ bool over;
struct p9_wstat st;
int err = 0;
struct p9_fid *fid;
@@ -118,19 +117,19 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
int reclen = 0;
struct p9_rdir *rdir;
- p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
- fid = filp->private_data;
+ p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+ fid = file->private_data;
buflen = fid->clnt->msize - P9_IOHDRSZ;
- rdir = v9fs_alloc_rdir_buf(filp, buflen);
+ rdir = v9fs_alloc_rdir_buf(file, buflen);
if (!rdir)
return -ENOMEM;
while (1) {
if (rdir->tail == rdir->head) {
- err = v9fs_file_readn(filp, rdir->buf, NULL,
- buflen, filp->f_pos);
+ err = v9fs_file_readn(file, rdir->buf, NULL,
+ buflen, ctx->pos);
if (err <= 0)
return err;
@@ -148,51 +147,45 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
reclen = st.size+2;
- over = filldir(dirent, st.name, strlen(st.name),
- filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
-
+ over = !dir_emit(ctx, st.name, strlen(st.name),
+ v9fs_qid2ino(&st.qid), dt_type(&st));
p9stat_free(&st);
-
if (over)
return 0;
rdir->head += reclen;
- filp->f_pos += reclen;
+ ctx->pos += reclen;
}
}
}
/**
- * v9fs_dir_readdir_dotl - read a directory
- * @filp: opened file structure
- * @dirent: buffer to fill dirent structures
- * @filldir: function to populate dirent structures
+ * v9fs_dir_readdir_dotl - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
*
*/
-static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
- filldir_t filldir)
+static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
{
- int over;
int err = 0;
struct p9_fid *fid;
int buflen;
struct p9_rdir *rdir;
struct p9_dirent curdirent;
- u64 oldoffset = 0;
- p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
- fid = filp->private_data;
+ p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+ fid = file->private_data;
buflen = fid->clnt->msize - P9_READDIRHDRSZ;
- rdir = v9fs_alloc_rdir_buf(filp, buflen);
+ rdir = v9fs_alloc_rdir_buf(file, buflen);
if (!rdir)
return -ENOMEM;
while (1) {
if (rdir->tail == rdir->head) {
err = p9_client_readdir(fid, rdir->buf, buflen,
- filp->f_pos);
+ ctx->pos);
if (err <= 0)
return err;
@@ -210,22 +203,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
return -EIO;
}
- /* d_off in dirent structure tracks the offset into
- * the next dirent in the dir. However, filldir()
- * expects offset into the current dirent. Hence
- * while calling filldir send the offset from the
- * previous dirent structure.
- */
- over = filldir(dirent, curdirent.d_name,
- strlen(curdirent.d_name),
- oldoffset, v9fs_qid2ino(&curdirent.qid),
- curdirent.d_type);
- oldoffset = curdirent.d_off;
-
- if (over)
+ if (!dir_emit(ctx, curdirent.d_name,
+ strlen(curdirent.d_name),
+ v9fs_qid2ino(&curdirent.qid),
+ curdirent.d_type))
return 0;
- filp->f_pos = curdirent.d_off;
+ ctx->pos = curdirent.d_off;
rdir->head += err;
}
}
@@ -254,7 +238,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
const struct file_operations v9fs_dir_operations = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .readdir = v9fs_dir_readdir,
+ .iterate = v9fs_dir_readdir,
.open = v9fs_file_open,
.release = v9fs_dir_release,
};
@@ -262,7 +246,7 @@ const struct file_operations v9fs_dir_operations = {
const struct file_operations v9fs_dir_operations_dotl = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .readdir = v9fs_dir_readdir_dotl,
+ .iterate = v9fs_dir_readdir_dotl,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.fsync = v9fs_file_fsync_dotl,
diff --git a/fs/Kconfig b/fs/Kconfig
index 780725a463b1..c229f828eb01 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -211,6 +211,7 @@ source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/exofs/Kconfig"
source "fs/f2fs/Kconfig"
+source "fs/efivarfs/Kconfig"
endif # MISC_FILESYSTEMS
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 0efd1524b977..370b24cee4d8 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -65,6 +65,20 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS
This config option changes the default setting of coredump_filter
seen at boot time. If unsure, say Y.
+config BINFMT_SCRIPT
+ tristate "Kernel support for scripts starting with #!"
+ default y
+ help
+ Say Y here if you want to execute interpreted scripts starting with
+ #! followed by the path to an interpreter.
+
+ You can build this support as a module; however, until that module
+ gets loaded, you cannot run scripts. Thus, if you want to load this
+ module from an initramfs, the portion of the initramfs before loading
+ this module must consist of compiled binaries only.
+
+ Most systems will not boot if you say M or N here. If unsure, say Y.
+
config BINFMT_FLAT
bool "Kernel support for flat binaries"
depends on !MMU && (!FRV || BROKEN)
diff --git a/fs/Makefile b/fs/Makefile
index 9d53192236fc..4fe6df3ec28f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -7,10 +7,10 @@
obj-y := open.o read_write.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
- ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
+ ioctl.o readdir.o select.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
- pnode.o drop_caches.o splice.o sync.o utimes.o \
+ pnode.o splice.o sync.o utimes.o \
stack.o fs_struct.o statfs.o
ifeq ($(CONFIG_BLOCK),y)
@@ -34,10 +34,7 @@ obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
-
-# binfmt_script is always there
-obj-y += binfmt_script.o
-
+obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
@@ -49,6 +46,7 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
obj-$(CONFIG_NFS_COMMON) += nfs_common/
obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
obj-$(CONFIG_COREDUMP) += coredump.o
+obj-$(CONFIG_SYSCTL) += drop_caches.o
obj-$(CONFIG_FHANDLE) += fhandle.o
@@ -127,3 +125,4 @@ obj-$(CONFIG_F2FS_FS) += f2fs/
obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
+obj-$(CONFIG_EFIVAR_FS) += efivarfs/
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 9cf874ce8336..0d138c0de293 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -17,47 +17,43 @@
static DEFINE_RWLOCK(adfs_dir_lock);
static int
-adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+adfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct object_info obj;
struct adfs_dir dir;
int ret = 0;
- if (filp->f_pos >> 32)
- goto out;
+ if (ctx->pos >> 32)
+ return 0;
ret = ops->read(sb, inode->i_ino, inode->i_size, &dir);
if (ret)
- goto out;
+ return ret;
- switch ((unsigned long)filp->f_pos) {
- case 0:
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(file, ctx))
goto free_out;
- filp->f_pos += 1;
-
- case 1:
- if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) < 0)
+ ctx->pos = 1;
+ }
+ if (ctx->pos == 1) {
+ if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR))
goto free_out;
- filp->f_pos += 1;
-
- default:
- break;
+ ctx->pos = 2;
}
read_lock(&adfs_dir_lock);
- ret = ops->setpos(&dir, filp->f_pos - 2);
+ ret = ops->setpos(&dir, ctx->pos - 2);
if (ret)
goto unlock_out;
while (ops->getnext(&dir, &obj) == 0) {
- if (filldir(dirent, obj.name, obj.name_len,
- filp->f_pos, obj.file_id, DT_UNKNOWN) < 0)
- goto unlock_out;
- filp->f_pos += 1;
+ if (!dir_emit(ctx, obj.name, obj.name_len,
+ obj.file_id, DT_UNKNOWN))
+ break;
+ ctx->pos++;
}
unlock_out:
@@ -65,8 +61,6 @@ unlock_out:
free_out:
ops->free(&dir);
-
-out:
return ret;
}
@@ -192,13 +186,12 @@ out:
const struct file_operations adfs_dir_operations = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .readdir = adfs_readdir,
+ .iterate = adfs_readdir,
.fsync = generic_file_fsync,
};
static int
-adfs_hash(const struct dentry *parent, const struct inode *inode,
- struct qstr *qstr)
+adfs_hash(const struct dentry *parent, struct qstr *qstr)
{
const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
const unsigned char *name;
@@ -234,8 +227,7 @@ adfs_hash(const struct dentry *parent, const struct inode *inode,
* requirements of the underlying filesystem.
*/
static int
-adfs_compare(const struct dentry *parent, const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+adfs_compare(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
int i;
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index fd11a6d608ee..f1eba8c3644e 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -15,12 +15,12 @@
#include "affs.h"
-static int affs_readdir(struct file *, void *, filldir_t);
+static int affs_readdir(struct file *, struct dir_context *);
const struct file_operations affs_dir_operations = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .readdir = affs_readdir,
+ .iterate = affs_readdir,
.fsync = affs_file_fsync,
};
@@ -40,52 +40,35 @@ const struct inode_operations affs_dir_inode_operations = {
};
static int
-affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+affs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- struct buffer_head *dir_bh;
- struct buffer_head *fh_bh;
+ struct buffer_head *dir_bh = NULL;
+ struct buffer_head *fh_bh = NULL;
unsigned char *name;
int namelen;
u32 i;
int hash_pos;
int chain_pos;
- u32 f_pos;
u32 ino;
- int stored;
- int res;
- pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)filp->f_pos);
+ pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos);
- stored = 0;
- res = -EIO;
- dir_bh = NULL;
- fh_bh = NULL;
- f_pos = filp->f_pos;
-
- if (f_pos == 0) {
- filp->private_data = (void *)0;
- if (filldir(dirent, ".", 1, f_pos, inode->i_ino, DT_DIR) < 0)
+ if (ctx->pos < 2) {
+ file->private_data = (void *)0;
+ if (!dir_emit_dots(file, ctx))
return 0;
- filp->f_pos = f_pos = 1;
- stored++;
- }
- if (f_pos == 1) {
- if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0)
- return stored;
- filp->f_pos = f_pos = 2;
- stored++;
}
affs_lock_dir(inode);
- chain_pos = (f_pos - 2) & 0xffff;
- hash_pos = (f_pos - 2) >> 16;
+ chain_pos = (ctx->pos - 2) & 0xffff;
+ hash_pos = (ctx->pos - 2) >> 16;
if (chain_pos == 0xffff) {
affs_warning(sb, "readdir", "More than 65535 entries in chain");
chain_pos = 0;
hash_pos++;
- filp->f_pos = ((hash_pos << 16) | chain_pos) + 2;
+ ctx->pos = ((hash_pos << 16) | chain_pos) + 2;
}
dir_bh = affs_bread(sb, inode->i_ino);
if (!dir_bh)
@@ -94,8 +77,8 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* If the directory hasn't changed since the last call to readdir(),
* we can jump directly to where we left off.
*/
- ino = (u32)(long)filp->private_data;
- if (ino && filp->f_version == inode->i_version) {
+ ino = (u32)(long)file->private_data;
+ if (ino && file->f_version == inode->i_version) {
pr_debug("AFFS: readdir() left off=%d\n", ino);
goto inside;
}
@@ -105,7 +88,7 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
fh_bh = affs_bread(sb, ino);
if (!fh_bh) {
affs_error(sb, "readdir","Cannot read block %d", i);
- goto readdir_out;
+ return -EIO;
}
ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
affs_brelse(fh_bh);
@@ -119,38 +102,34 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]);
if (!ino)
continue;
- f_pos = (hash_pos << 16) + 2;
+ ctx->pos = (hash_pos << 16) + 2;
inside:
do {
fh_bh = affs_bread(sb, ino);
if (!fh_bh) {
affs_error(sb, "readdir","Cannot read block %d", ino);
- goto readdir_done;
+ break;
}
namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
name = AFFS_TAIL(sb, fh_bh)->name + 1;
pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n",
- namelen, name, ino, hash_pos, f_pos);
- if (filldir(dirent, name, namelen, f_pos, ino, DT_UNKNOWN) < 0)
+ namelen, name, ino, hash_pos, (u32)ctx->pos);
+ if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
goto readdir_done;
- stored++;
- f_pos++;
+ ctx->pos++;
ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
affs_brelse(fh_bh);
fh_bh = NULL;
} while (ino);
}
readdir_done:
- filp->f_pos = f_pos;
- filp->f_version = inode->i_version;
- filp->private_data = (void *)(long)ino;
- res = stored;
+ file->f_version = inode->i_version;
+ file->private_data = (void *)(long)ino;
readdir_out:
affs_brelse(dir_bh);
affs_brelse(fh_bh);
affs_unlock_dir(inode);
- pr_debug("AFFS: readdir()=%d\n", stored);
- return res;
+ return 0;
}
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index ff65884a7839..c36cbb4537a2 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,18 +13,12 @@
typedef int (*toupper_t)(int);
static int affs_toupper(int ch);
-static int affs_hash_dentry(const struct dentry *,
- const struct inode *, struct qstr *);
-static int affs_compare_dentry(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+static int affs_hash_dentry(const struct dentry *, struct qstr *);
+static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
static int affs_intl_toupper(int ch);
-static int affs_intl_hash_dentry(const struct dentry *,
- const struct inode *, struct qstr *);
-static int affs_intl_compare_dentry(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+static int affs_intl_hash_dentry(const struct dentry *, struct qstr *);
+static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
const struct dentry_operations affs_dentry_operations = {
@@ -86,14 +80,12 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
}
static int
-affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
- struct qstr *qstr)
+affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
{
return __affs_hash_dentry(qstr, affs_toupper);
}
static int
-affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
- struct qstr *qstr)
+affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
{
return __affs_hash_dentry(qstr, affs_intl_toupper);
}
@@ -131,15 +123,13 @@ static inline int __affs_compare_dentry(unsigned int len,
}
static int
-affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
return __affs_compare_dentry(len, str, name, affs_toupper);
}
static int
-affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
return __affs_compare_dentry(len, str, name, affs_intl_toupper);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 7a465ed04444..34494fbead0a 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -22,7 +22,7 @@
static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
static int afs_dir_open(struct inode *inode, struct file *file);
-static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
+static int afs_readdir(struct file *file, struct dir_context *ctx);
static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
static int afs_d_delete(const struct dentry *dentry);
static void afs_d_release(struct dentry *dentry);
@@ -43,7 +43,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
const struct file_operations afs_dir_file_operations = {
.open = afs_dir_open,
.release = afs_release,
- .readdir = afs_readdir,
+ .iterate = afs_readdir,
.lock = afs_lock,
.llseek = generic_file_llseek,
};
@@ -119,9 +119,9 @@ struct afs_dir_page {
};
struct afs_lookup_cookie {
+ struct dir_context ctx;
struct afs_fid fid;
- const char *name;
- size_t nlen;
+ struct qstr name;
int found;
};
@@ -228,20 +228,18 @@ static int afs_dir_open(struct inode *inode, struct file *file)
/*
* deal with one block in an AFS directory
*/
-static int afs_dir_iterate_block(unsigned *fpos,
+static int afs_dir_iterate_block(struct dir_context *ctx,
union afs_dir_block *block,
- unsigned blkoff,
- void *cookie,
- filldir_t filldir)
+ unsigned blkoff)
{
union afs_dirent *dire;
unsigned offset, next, curr;
size_t nlen;
- int tmp, ret;
+ int tmp;
- _enter("%u,%x,%p,,",*fpos,blkoff,block);
+ _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block);
- curr = (*fpos - blkoff) / sizeof(union afs_dirent);
+ curr = (ctx->pos - blkoff) / sizeof(union afs_dirent);
/* walk through the block, an entry at a time */
for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries;
@@ -256,7 +254,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
_debug("ENT[%Zu.%u]: unused",
blkoff / sizeof(union afs_dir_block), offset);
if (offset >= curr)
- *fpos = blkoff +
+ ctx->pos = blkoff +
next * sizeof(union afs_dirent);
continue;
}
@@ -302,19 +300,15 @@ static int afs_dir_iterate_block(unsigned *fpos,
continue;
/* found the next entry */
- ret = filldir(cookie,
- dire->u.name,
- nlen,
- blkoff + offset * sizeof(union afs_dirent),
+ if (!dir_emit(ctx, dire->u.name, nlen,
ntohl(dire->u.vnode),
- filldir == afs_lookup_filldir ?
- ntohl(dire->u.unique) : DT_UNKNOWN);
- if (ret < 0) {
+ ctx->actor == afs_lookup_filldir ?
+ ntohl(dire->u.unique) : DT_UNKNOWN)) {
_leave(" = 0 [full]");
return 0;
}
- *fpos = blkoff + next * sizeof(union afs_dirent);
+ ctx->pos = blkoff + next * sizeof(union afs_dirent);
}
_leave(" = 1 [more]");
@@ -324,8 +318,8 @@ static int afs_dir_iterate_block(unsigned *fpos,
/*
* iterate through the data blob that lists the contents of an AFS directory
*/
-static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
- filldir_t filldir, struct key *key)
+static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
+ struct key *key)
{
union afs_dir_block *dblock;
struct afs_dir_page *dbuf;
@@ -333,7 +327,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
unsigned blkoff, limit;
int ret;
- _enter("{%lu},%u,,", dir->i_ino, *fpos);
+ _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);
if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
_leave(" = -ESTALE");
@@ -341,13 +335,13 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
}
/* round the file position up to the next entry boundary */
- *fpos += sizeof(union afs_dirent) - 1;
- *fpos &= ~(sizeof(union afs_dirent) - 1);
+ ctx->pos += sizeof(union afs_dirent) - 1;
+ ctx->pos &= ~(sizeof(union afs_dirent) - 1);
/* walk through the blocks in sequence */
ret = 0;
- while (*fpos < dir->i_size) {
- blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1);
+ while (ctx->pos < dir->i_size) {
+ blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1);
/* fetch the appropriate page from the directory */
page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
@@ -364,8 +358,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
do {
dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
sizeof(union afs_dir_block)];
- ret = afs_dir_iterate_block(fpos, dblock, blkoff,
- cookie, filldir);
+ ret = afs_dir_iterate_block(ctx, dblock, blkoff);
if (ret != 1) {
afs_dir_put_page(page);
goto out;
@@ -373,7 +366,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
blkoff += sizeof(union afs_dir_block);
- } while (*fpos < dir->i_size && blkoff < limit);
+ } while (ctx->pos < dir->i_size && blkoff < limit);
afs_dir_put_page(page);
ret = 0;
@@ -387,23 +380,10 @@ out:
/*
* read an AFS directory
*/
-static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
+static int afs_readdir(struct file *file, struct dir_context *ctx)
{
- unsigned fpos;
- int ret;
-
- _enter("{%Ld,{%lu}}",
- file->f_pos, file_inode(file)->i_ino);
-
- ASSERT(file->private_data != NULL);
-
- fpos = file->f_pos;
- ret = afs_dir_iterate(file_inode(file), &fpos,
- cookie, filldir, file->private_data);
- file->f_pos = fpos;
-
- _leave(" = %d", ret);
- return ret;
+ return afs_dir_iterate(file_inode(file),
+ ctx, file->private_data);
}
/*
@@ -416,15 +396,16 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
{
struct afs_lookup_cookie *cookie = _cookie;
- _enter("{%s,%Zu},%s,%u,,%llu,%u",
- cookie->name, cookie->nlen, name, nlen,
+ _enter("{%s,%u},%s,%u,,%llu,%u",
+ cookie->name.name, cookie->name.len, name, nlen,
(unsigned long long) ino, dtype);
/* insanity checks first */
BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
- if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) {
+ if (cookie->name.len != nlen ||
+ memcmp(cookie->name.name, name, nlen) != 0) {
_leave(" = 0 [no]");
return 0;
}
@@ -444,24 +425,18 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
struct afs_fid *fid, struct key *key)
{
- struct afs_lookup_cookie cookie;
- struct afs_super_info *as;
- unsigned fpos;
+ struct afs_super_info *as = dir->i_sb->s_fs_info;
+ struct afs_lookup_cookie cookie = {
+ .ctx.actor = afs_lookup_filldir,
+ .name = dentry->d_name,
+ .fid.vid = as->volume->vid
+ };
int ret;
_enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);
- as = dir->i_sb->s_fs_info;
-
/* search the directory */
- cookie.name = dentry->d_name.name;
- cookie.nlen = dentry->d_name.len;
- cookie.fid.vid = as->volume->vid;
- cookie.found = 0;
-
- fpos = 0;
- ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir,
- key);
+ ret = afs_dir_iterate(dir, &cookie.ctx, key);
if (ret < 0) {
_leave(" = %d [iter]", ret);
return ret;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 8f6e9234d565..66d50fe2ee45 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,8 @@
#include "internal.h"
static int afs_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned long offset);
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
static int afs_launder_page(struct page *page);
@@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page)
* - release a page and clean up its private data if offset is 0 (indicating
* the entire page)
*/
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
- _enter("{%lu},%lu", page->index, offset);
+ _enter("{%lu},%u,%u", page->index, offset, length);
BUG_ON(!PageLocked(page));
/* we clean up only if the entire page is being invalidated */
- if (offset == 0) {
+ if (offset == 0 && length == PAGE_CACHE_SIZE) {
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page)) {
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 2497bf306c70..a8cf2cff836c 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -252,7 +252,8 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
*/
static int afs_do_setlk(struct file *file, struct file_lock *fl)
{
- struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+ struct inode *inode = file_inode(file);
+ struct afs_vnode *vnode = AFS_FS_I(inode);
afs_lock_type_t type;
struct key *key = file->private_data;
int ret;
@@ -273,7 +274,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
- lock_flocks();
+ spin_lock(&inode->i_lock);
/* make sure we've got a callback on this file and that our view of the
* data version is up to date */
@@ -420,7 +421,7 @@ given_lock:
afs_vnode_fetch_status(vnode, NULL, key);
error:
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
_leave(" = %d", ret);
return ret;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 096b23f821a1..526e4bbbde59 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -190,7 +190,7 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
return ret;
m = file->private_data;
- m->private = PDE(inode)->data;
+ m->private = PDE_DATA(inode);
return 0;
}
@@ -448,7 +448,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret;
- cell = PDE(inode)->data;
+ cell = PDE_DATA(inode);
if (!cell)
return -ENOENT;
@@ -554,7 +554,7 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret;
- cell = PDE(inode)->data;
+ cell = PDE_DATA(inode);
if (!cell)
return -ENOENT;
@@ -659,7 +659,7 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret;
- cell = PDE(inode)->data;
+ cell = PDE_DATA(inode);
if (!cell)
return -ENOENT;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 7e03eadb40c0..a890db4b9898 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
+#include <linux/aio.h>
#include "internal.h"
static int afs_write_back_from_locked_page(struct afs_writeback *wb,
diff --git a/fs/aio.c b/fs/aio.c
index 1dc8786f4588..9b5ca1137419 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -8,6 +8,8 @@
*
* See ../COPYING for licensing terms.
*/
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
@@ -18,8 +20,6 @@
#include <linux/backing-dev.h>
#include <linux/uio.h>
-#define DEBUG 0
-
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
@@ -39,11 +39,78 @@
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
-#if DEBUG > 1
-#define dprintk printk
-#else
-#define dprintk(x...) do { ; } while (0)
-#endif
+#include "internal.h"
+
+#define AIO_RING_MAGIC 0xa10a10a1
+#define AIO_RING_COMPAT_FEATURES 1
+#define AIO_RING_INCOMPAT_FEATURES 0
+struct aio_ring {
+ unsigned id; /* kernel internal index number */
+ unsigned nr; /* number of io_events */
+ unsigned head;
+ unsigned tail;
+
+ unsigned magic;
+ unsigned compat_features;
+ unsigned incompat_features;
+ unsigned header_length; /* size of aio_ring */
+
+
+ struct io_event io_events[0];
+}; /* 128 bytes + ring size */
+
+#define AIO_RING_PAGES 8
+
+struct kioctx {
+ atomic_t users;
+ atomic_t dead;
+
+ /* This needs improving */
+ unsigned long user_id;
+ struct hlist_node list;
+
+ /*
+ * This is what userspace passed to io_setup(), it's not used for
+ * anything but counting against the global max_reqs quota.
+ *
+ * The real limit is nr_events - 1, which will be larger (see
+ * aio_setup_ring())
+ */
+ unsigned max_reqs;
+
+ /* Size of ringbuffer, in units of struct io_event */
+ unsigned nr_events;
+
+ unsigned long mmap_base;
+ unsigned long mmap_size;
+
+ struct page **ring_pages;
+ long nr_pages;
+
+ struct rcu_head rcu_head;
+ struct work_struct rcu_work;
+
+ struct {
+ atomic_t reqs_active;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ spinlock_t ctx_lock;
+ struct list_head active_reqs; /* used for cancellation */
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ struct mutex ring_lock;
+ wait_queue_head_t wait;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ unsigned tail;
+ spinlock_t completion_lock;
+ } ____cacheline_aligned_in_smp;
+
+ struct page *internal_pages[AIO_RING_PAGES];
+};
/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
@@ -54,11 +121,6 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request
static struct kmem_cache *kiocb_cachep;
static struct kmem_cache *kioctx_cachep;
-static struct workqueue_struct *aio_wq;
-
-static void aio_kick_handler(struct work_struct *);
-static void aio_queue_work(struct kioctx *);
-
/* aio_setup
* Creates the slab caches used by the aio routines, panic on
* failure as this is done early during the boot sequence.
@@ -68,10 +130,7 @@ static int __init aio_setup(void)
kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
- aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */
- BUG_ON(!aio_wq);
-
- pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
+ pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
return 0;
}
@@ -79,28 +138,20 @@ __initcall(aio_setup);
static void aio_free_ring(struct kioctx *ctx)
{
- struct aio_ring_info *info = &ctx->ring_info;
long i;
- for (i=0; i<info->nr_pages; i++)
- put_page(info->ring_pages[i]);
+ for (i = 0; i < ctx->nr_pages; i++)
+ put_page(ctx->ring_pages[i]);
- if (info->mmap_size) {
- BUG_ON(ctx->mm != current->mm);
- vm_munmap(info->mmap_base, info->mmap_size);
- }
-
- if (info->ring_pages && info->ring_pages != info->internal_pages)
- kfree(info->ring_pages);
- info->ring_pages = NULL;
- info->nr = 0;
+ if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
+ kfree(ctx->ring_pages);
}
static int aio_setup_ring(struct kioctx *ctx)
{
struct aio_ring *ring;
- struct aio_ring_info *info = &ctx->ring_info;
unsigned nr_events = ctx->max_reqs;
+ struct mm_struct *mm = current->mm;
unsigned long size, populate;
int nr_pages;
@@ -116,46 +167,44 @@ static int aio_setup_ring(struct kioctx *ctx)
nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
- info->nr = 0;
- info->ring_pages = info->internal_pages;
+ ctx->nr_events = 0;
+ ctx->ring_pages = ctx->internal_pages;
if (nr_pages > AIO_RING_PAGES) {
- info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!info->ring_pages)
+ ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!ctx->ring_pages)
return -ENOMEM;
}
- info->mmap_size = nr_pages * PAGE_SIZE;
- dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
- down_write(&ctx->mm->mmap_sem);
- info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,
- PROT_READ|PROT_WRITE,
- MAP_ANONYMOUS|MAP_PRIVATE, 0,
- &populate);
- if (IS_ERR((void *)info->mmap_base)) {
- up_write(&ctx->mm->mmap_sem);
- info->mmap_size = 0;
+ ctx->mmap_size = nr_pages * PAGE_SIZE;
+ pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+ down_write(&mm->mmap_sem);
+ ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
+ PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+ if (IS_ERR((void *)ctx->mmap_base)) {
+ up_write(&mm->mmap_sem);
+ ctx->mmap_size = 0;
aio_free_ring(ctx);
return -EAGAIN;
}
- dprintk("mmap address: 0x%08lx\n", info->mmap_base);
- info->nr_pages = get_user_pages(current, ctx->mm,
- info->mmap_base, nr_pages,
- 1, 0, info->ring_pages, NULL);
- up_write(&ctx->mm->mmap_sem);
+ pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
+ ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
+ 1, 0, ctx->ring_pages, NULL);
+ up_write(&mm->mmap_sem);
- if (unlikely(info->nr_pages != nr_pages)) {
+ if (unlikely(ctx->nr_pages != nr_pages)) {
aio_free_ring(ctx);
return -EAGAIN;
}
if (populate)
- mm_populate(info->mmap_base, populate);
-
- ctx->user_id = info->mmap_base;
+ mm_populate(ctx->mmap_base, populate);
- info->nr = nr_events; /* trusted copy */
+ ctx->user_id = ctx->mmap_base;
+ ctx->nr_events = nr_events; /* trusted copy */
- ring = kmap_atomic(info->ring_pages[0]);
+ ring = kmap_atomic(ctx->ring_pages[0]);
ring->nr = nr_events; /* user copy */
ring->id = ctx->user_id;
ring->head = ring->tail = 0;
@@ -164,72 +213,130 @@ static int aio_setup_ring(struct kioctx *ctx)
ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
ring->header_length = sizeof(struct aio_ring);
kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
return 0;
}
-
-/* aio_ring_event: returns a pointer to the event at the given index from
- * kmap_atomic(). Release the pointer with put_aio_ring_event();
- */
#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
-#define aio_ring_event(info, nr) ({ \
- unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
- struct io_event *__event; \
- __event = kmap_atomic( \
- (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \
- __event += pos % AIO_EVENTS_PER_PAGE; \
- __event; \
-})
-
-#define put_aio_ring_event(event) do { \
- struct io_event *__event = (event); \
- (void)__event; \
- kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \
-} while(0)
-
-static void ctx_rcu_free(struct rcu_head *head)
+void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
+{
+ struct kioctx *ctx = req->ki_ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+
+ if (!req->ki_list.next)
+ list_add(&req->ki_list, &ctx->active_reqs);
+
+ req->ki_cancel = cancel;
+
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
+EXPORT_SYMBOL(kiocb_set_cancel_fn);
+
+static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
+ struct io_event *res)
+{
+ kiocb_cancel_fn *old, *cancel;
+ int ret = -EINVAL;
+
+ /*
+ * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
+ * actually has a cancel function, hence the cmpxchg()
+ */
+
+ cancel = ACCESS_ONCE(kiocb->ki_cancel);
+ do {
+ if (!cancel || cancel == KIOCB_CANCELLED)
+ return ret;
+
+ old = cancel;
+ cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
+ } while (cancel != old);
+
+ atomic_inc(&kiocb->ki_users);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ memset(res, 0, sizeof(*res));
+ res->obj = (u64)(unsigned long)kiocb->ki_obj.user;
+ res->data = kiocb->ki_user_data;
+ ret = cancel(kiocb, res);
+
+ spin_lock_irq(&ctx->ctx_lock);
+
+ return ret;
+}
+
+static void free_ioctx_rcu(struct rcu_head *head)
{
struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
kmem_cache_free(kioctx_cachep, ctx);
}
-/* __put_ioctx
- * Called when the last user of an aio context has gone away,
- * and the struct needs to be freed.
+/*
+ * When this function runs, the kioctx has been removed from the "hash table"
+ * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
+ * now it's safe to cancel any that need to be.
*/
-static void __put_ioctx(struct kioctx *ctx)
+static void free_ioctx(struct kioctx *ctx)
{
- unsigned nr_events = ctx->max_reqs;
- BUG_ON(ctx->reqs_active);
+ struct aio_ring *ring;
+ struct io_event res;
+ struct kiocb *req;
+ unsigned head, avail;
- cancel_delayed_work_sync(&ctx->wq);
- aio_free_ring(ctx);
- mmdrop(ctx->mm);
- ctx->mm = NULL;
- if (nr_events) {
- spin_lock(&aio_nr_lock);
- BUG_ON(aio_nr - nr_events > aio_nr);
- aio_nr -= nr_events;
- spin_unlock(&aio_nr_lock);
+ spin_lock_irq(&ctx->ctx_lock);
+
+ while (!list_empty(&ctx->active_reqs)) {
+ req = list_first_entry(&ctx->active_reqs,
+ struct kiocb, ki_list);
+
+ list_del_init(&req->ki_list);
+ kiocb_cancel(ctx, req, &res);
}
- pr_debug("__put_ioctx: freeing %p\n", ctx);
- call_rcu(&ctx->rcu_head, ctx_rcu_free);
-}
-static inline int try_get_ioctx(struct kioctx *kioctx)
-{
- return atomic_inc_not_zero(&kioctx->users);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
+ kunmap_atomic(ring);
+
+ while (atomic_read(&ctx->reqs_active) > 0) {
+ wait_event(ctx->wait,
+ head != ctx->tail ||
+ atomic_read(&ctx->reqs_active) <= 0);
+
+ avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
+
+ atomic_sub(avail, &ctx->reqs_active);
+ head += avail;
+ head %= ctx->nr_events;
+ }
+
+ WARN_ON(atomic_read(&ctx->reqs_active) < 0);
+
+ aio_free_ring(ctx);
+
+ pr_debug("freeing %p\n", ctx);
+
+ /*
+ * Here the call_rcu() is between the wait_event() for reqs_active to
+ * hit 0, and freeing the ioctx.
+ *
+ * aio_complete() decrements reqs_active, but it has to touch the ioctx
+ * after to issue a wakeup so we use rcu.
+ */
+ call_rcu(&ctx->rcu_head, free_ioctx_rcu);
}
-static inline void put_ioctx(struct kioctx *kioctx)
+static void put_ioctx(struct kioctx *ctx)
{
- BUG_ON(atomic_read(&kioctx->users) <= 0);
- if (unlikely(atomic_dec_and_test(&kioctx->users)))
- __put_ioctx(kioctx);
+ if (unlikely(atomic_dec_and_test(&ctx->users)))
+ free_ioctx(ctx);
}
/* ioctx_alloc
@@ -237,7 +344,7 @@ static inline void put_ioctx(struct kioctx *kioctx)
*/
static struct kioctx *ioctx_alloc(unsigned nr_events)
{
- struct mm_struct *mm;
+ struct mm_struct *mm = current->mm;
struct kioctx *ctx;
int err = -ENOMEM;
@@ -256,17 +363,15 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
return ERR_PTR(-ENOMEM);
ctx->max_reqs = nr_events;
- mm = ctx->mm = current->mm;
- atomic_inc(&mm->mm_count);
atomic_set(&ctx->users, 2);
+ atomic_set(&ctx->dead, 0);
spin_lock_init(&ctx->ctx_lock);
- spin_lock_init(&ctx->ring_info.ring_lock);
+ spin_lock_init(&ctx->completion_lock);
+ mutex_init(&ctx->ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
- INIT_LIST_HEAD(&ctx->run_list);
- INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler);
if (aio_setup_ring(ctx) < 0)
goto out_freectx;
@@ -286,64 +391,63 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
spin_unlock(&mm->ioctx_lock);
- dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
- ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
+ pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
+ ctx, ctx->user_id, mm, ctx->nr_events);
return ctx;
out_cleanup:
err = -EAGAIN;
aio_free_ring(ctx);
out_freectx:
- mmdrop(mm);
kmem_cache_free(kioctx_cachep, ctx);
- dprintk("aio: error allocating ioctx %d\n", err);
+ pr_debug("error allocating ioctx %d\n", err);
return ERR_PTR(err);
}
-/* kill_ctx
- * Cancels all outstanding aio requests on an aio context. Used
- * when the processes owning a context have all exited to encourage
+static void kill_ioctx_work(struct work_struct *work)
+{
+ struct kioctx *ctx = container_of(work, struct kioctx, rcu_work);
+
+ wake_up_all(&ctx->wait);
+ put_ioctx(ctx);
+}
+
+static void kill_ioctx_rcu(struct rcu_head *head)
+{
+ struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+
+ INIT_WORK(&ctx->rcu_work, kill_ioctx_work);
+ schedule_work(&ctx->rcu_work);
+}
+
+/* kill_ioctx
+ * Cancels all outstanding aio requests on an aio context. Used
+ * when the processes owning a context have all exited to encourage
* the rapid destruction of the kioctx.
*/
-static void kill_ctx(struct kioctx *ctx)
+static void kill_ioctx(struct kioctx *ctx)
{
- int (*cancel)(struct kiocb *, struct io_event *);
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
- struct io_event res;
+ if (!atomic_xchg(&ctx->dead, 1)) {
+ hlist_del_rcu(&ctx->list);
- spin_lock_irq(&ctx->ctx_lock);
- ctx->dead = 1;
- while (!list_empty(&ctx->active_reqs)) {
- struct list_head *pos = ctx->active_reqs.next;
- struct kiocb *iocb = list_kiocb(pos);
- list_del_init(&iocb->ki_list);
- cancel = iocb->ki_cancel;
- kiocbSetCancelled(iocb);
- if (cancel) {
- iocb->ki_users++;
- spin_unlock_irq(&ctx->ctx_lock);
- cancel(iocb, &res);
- spin_lock_irq(&ctx->ctx_lock);
- }
- }
+ /*
+ * It'd be more correct to do this in free_ioctx(), after all
+ * the outstanding kiocbs have finished - but by then io_destroy
+ * has already returned, so io_setup() could potentially return
+ * -EAGAIN with no ioctxs actually in use (as far as userspace
+ * could tell).
+ */
+ spin_lock(&aio_nr_lock);
+ BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
+ aio_nr -= ctx->max_reqs;
+ spin_unlock(&aio_nr_lock);
- if (!ctx->reqs_active)
- goto out;
+ if (ctx->mmap_size)
+ vm_munmap(ctx->mmap_base, ctx->mmap_size);
- add_wait_queue(&ctx->wait, &wait);
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- while (ctx->reqs_active) {
- spin_unlock_irq(&ctx->ctx_lock);
- io_schedule();
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- spin_lock_irq(&ctx->ctx_lock);
+ /* Between hlist_del_rcu() and dropping the initial ref */
+ call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
}
- __set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&ctx->wait, &wait);
-
-out:
- spin_unlock_irq(&ctx->ctx_lock);
}
/* wait_on_sync_kiocb:
@@ -351,9 +455,9 @@ out:
*/
ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
{
- while (iocb->ki_users) {
+ while (atomic_read(&iocb->ki_users)) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (!iocb->ki_users)
+ if (!atomic_read(&iocb->ki_users))
break;
io_schedule();
}
@@ -362,28 +466,26 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
}
EXPORT_SYMBOL(wait_on_sync_kiocb);
-/* exit_aio: called when the last user of mm goes away. At this point,
- * there is no way for any new requests to be submited or any of the
- * io_* syscalls to be called on the context. However, there may be
- * outstanding requests which hold references to the context; as they
- * go away, they will call put_ioctx and release any pinned memory
- * associated with the request (held via struct page * references).
+/*
+ * exit_aio: called when the last user of mm goes away. At this point, there is
+ * no way for any new requests to be submited or any of the io_* syscalls to be
+ * called on the context.
+ *
+ * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
+ * them.
*/
void exit_aio(struct mm_struct *mm)
{
struct kioctx *ctx;
+ struct hlist_node *n;
- while (!hlist_empty(&mm->ioctx_list)) {
- ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
- hlist_del_rcu(&ctx->list);
-
- kill_ctx(ctx);
-
+ hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
if (1 != atomic_read(&ctx->users))
printk(KERN_DEBUG
"exit_aio:ioctx still alive: %d %d %d\n",
- atomic_read(&ctx->users), ctx->dead,
- ctx->reqs_active);
+ atomic_read(&ctx->users),
+ atomic_read(&ctx->dead),
+ atomic_read(&ctx->reqs_active));
/*
* We don't need to bother with munmap() here -
* exit_mmap(mm) is coming and it'll unmap everything.
@@ -391,150 +493,50 @@ void exit_aio(struct mm_struct *mm)
* as indicator that it needs to unmap the area,
* just set it to 0; aio_free_ring() is the only
* place that uses ->mmap_size, so it's safe.
- * That way we get all munmap done to current->mm -
- * all other callers have ctx->mm == current->mm.
*/
- ctx->ring_info.mmap_size = 0;
- put_ioctx(ctx);
+ ctx->mmap_size = 0;
+
+ kill_ioctx(ctx);
}
}
/* aio_get_req
- * Allocate a slot for an aio request. Increments the users count
+ * Allocate a slot for an aio request. Increments the reqs_active count
* of the kioctx so that the kioctx stays around until all requests are
* complete. Returns NULL if no requests are free.
*
- * Returns with kiocb->users set to 2. The io submit code path holds
+ * Returns with kiocb->ki_users set to 2. The io submit code path holds
* an extra reference while submitting the i/o.
* This prevents races between the aio code path referencing the
* req (after submitting it) and aio_complete() freeing the req.
*/
-static struct kiocb *__aio_get_req(struct kioctx *ctx)
+static inline struct kiocb *aio_get_req(struct kioctx *ctx)
{
- struct kiocb *req = NULL;
+ struct kiocb *req;
- req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
- if (unlikely(!req))
+ if (atomic_read(&ctx->reqs_active) >= ctx->nr_events)
return NULL;
- req->ki_flags = 0;
- req->ki_users = 2;
- req->ki_key = 0;
- req->ki_ctx = ctx;
- req->ki_cancel = NULL;
- req->ki_retry = NULL;
- req->ki_dtor = NULL;
- req->private = NULL;
- req->ki_iovec = NULL;
- INIT_LIST_HEAD(&req->ki_run_list);
- req->ki_eventfd = NULL;
-
- return req;
-}
-
-/*
- * struct kiocb's are allocated in batches to reduce the number of
- * times the ctx lock is acquired and released.
- */
-#define KIOCB_BATCH_SIZE 32L
-struct kiocb_batch {
- struct list_head head;
- long count; /* number of requests left to allocate */
-};
-
-static void kiocb_batch_init(struct kiocb_batch *batch, long total)
-{
- INIT_LIST_HEAD(&batch->head);
- batch->count = total;
-}
-
-static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)
-{
- struct kiocb *req, *n;
-
- if (list_empty(&batch->head))
- return;
-
- spin_lock_irq(&ctx->ctx_lock);
- list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
- list_del(&req->ki_batch);
- list_del(&req->ki_list);
- kmem_cache_free(kiocb_cachep, req);
- ctx->reqs_active--;
- }
- if (unlikely(!ctx->reqs_active && ctx->dead))
- wake_up_all(&ctx->wait);
- spin_unlock_irq(&ctx->ctx_lock);
-}
-
-/*
- * Allocate a batch of kiocbs. This avoids taking and dropping the
- * context lock a lot during setup.
- */
-static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
-{
- unsigned short allocated, to_alloc;
- long avail;
- struct kiocb *req, *n;
- struct aio_ring *ring;
-
- to_alloc = min(batch->count, KIOCB_BATCH_SIZE);
- for (allocated = 0; allocated < to_alloc; allocated++) {
- req = __aio_get_req(ctx);
- if (!req)
- /* allocation failed, go with what we've got */
- break;
- list_add(&req->ki_batch, &batch->head);
- }
-
- if (allocated == 0)
- goto out;
-
- spin_lock_irq(&ctx->ctx_lock);
- ring = kmap_atomic(ctx->ring_info.ring_pages[0]);
-
- avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active;
- BUG_ON(avail < 0);
- if (avail < allocated) {
- /* Trim back the number of requests. */
- list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
- list_del(&req->ki_batch);
- kmem_cache_free(kiocb_cachep, req);
- if (--allocated <= avail)
- break;
- }
- }
-
- batch->count -= allocated;
- list_for_each_entry(req, &batch->head, ki_batch) {
- list_add(&req->ki_list, &ctx->active_reqs);
- ctx->reqs_active++;
- }
-
- kunmap_atomic(ring);
- spin_unlock_irq(&ctx->ctx_lock);
+ if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1)
+ goto out_put;
-out:
- return allocated;
-}
+ req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
+ if (unlikely(!req))
+ goto out_put;
-static inline struct kiocb *aio_get_req(struct kioctx *ctx,
- struct kiocb_batch *batch)
-{
- struct kiocb *req;
+ atomic_set(&req->ki_users, 2);
+ req->ki_ctx = ctx;
- if (list_empty(&batch->head))
- if (kiocb_batch_refill(ctx, batch) == 0)
- return NULL;
- req = list_first_entry(&batch->head, struct kiocb, ki_batch);
- list_del(&req->ki_batch);
return req;
+out_put:
+ atomic_dec(&ctx->reqs_active);
+ return NULL;
}
-static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
+static void kiocb_free(struct kiocb *req)
{
- assert_spin_locked(&ctx->ctx_lock);
-
+ if (req->ki_filp)
+ fput(req->ki_filp);
if (req->ki_eventfd != NULL)
eventfd_ctx_put(req->ki_eventfd);
if (req->ki_dtor)
@@ -542,48 +544,12 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
if (req->ki_iovec != &req->ki_inline_vec)
kfree(req->ki_iovec);
kmem_cache_free(kiocb_cachep, req);
- ctx->reqs_active--;
-
- if (unlikely(!ctx->reqs_active && ctx->dead))
- wake_up_all(&ctx->wait);
}
-/* __aio_put_req
- * Returns true if this put was the last user of the request.
- */
-static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
+void aio_put_req(struct kiocb *req)
{
- dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
- req, atomic_long_read(&req->ki_filp->f_count));
-
- assert_spin_locked(&ctx->ctx_lock);
-
- req->ki_users--;
- BUG_ON(req->ki_users < 0);
- if (likely(req->ki_users))
- return 0;
- list_del(&req->ki_list); /* remove from active_reqs */
- req->ki_cancel = NULL;
- req->ki_retry = NULL;
-
- fput(req->ki_filp);
- req->ki_filp = NULL;
- really_put_req(ctx, req);
- return 1;
-}
-
-/* aio_put_req
- * Returns true if this put was the last user of the kiocb,
- * false if the request is still in use.
- */
-int aio_put_req(struct kiocb *req)
-{
- struct kioctx *ctx = req->ki_ctx;
- int ret;
- spin_lock_irq(&ctx->ctx_lock);
- ret = __aio_put_req(ctx, req);
- spin_unlock_irq(&ctx->ctx_lock);
- return ret;
+ if (atomic_dec_and_test(&req->ki_users))
+ kiocb_free(req);
}
EXPORT_SYMBOL(aio_put_req);
@@ -595,13 +561,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
rcu_read_lock();
hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
- /*
- * RCU protects us against accessing freed memory but
- * we have to be careful not to get a reference when the
- * reference count already dropped to 0 (ctx->dead test
- * is unreliable because of races).
- */
- if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
+ if (ctx->user_id == ctx_id) {
+ atomic_inc(&ctx->users);
ret = ctx;
break;
}
@@ -611,295 +572,16 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
return ret;
}
-/*
- * Queue up a kiocb to be retried. Assumes that the kiocb
- * has already been marked as kicked, and places it on
- * the retry run list for the corresponding ioctx, if it
- * isn't already queued. Returns 1 if it actually queued
- * the kiocb (to tell the caller to activate the work
- * queue to process it), or 0, if it found that it was
- * already queued.
- */
-static inline int __queue_kicked_iocb(struct kiocb *iocb)
-{
- struct kioctx *ctx = iocb->ki_ctx;
-
- assert_spin_locked(&ctx->ctx_lock);
-
- if (list_empty(&iocb->ki_run_list)) {
- list_add_tail(&iocb->ki_run_list,
- &ctx->run_list);
- return 1;
- }
- return 0;
-}
-
-/* aio_run_iocb
- * This is the core aio execution routine. It is
- * invoked both for initial i/o submission and
- * subsequent retries via the aio_kick_handler.
- * Expects to be invoked with iocb->ki_ctx->lock
- * already held. The lock is released and reacquired
- * as needed during processing.
- *
- * Calls the iocb retry method (already setup for the
- * iocb on initial submission) for operation specific
- * handling, but takes care of most of common retry
- * execution details for a given iocb. The retry method
- * needs to be non-blocking as far as possible, to avoid
- * holding up other iocbs waiting to be serviced by the
- * retry kernel thread.
- *
- * The trickier parts in this code have to do with
- * ensuring that only one retry instance is in progress
- * for a given iocb at any time. Providing that guarantee
- * simplifies the coding of individual aio operations as
- * it avoids various potential races.
- */
-static ssize_t aio_run_iocb(struct kiocb *iocb)
-{
- struct kioctx *ctx = iocb->ki_ctx;
- ssize_t (*retry)(struct kiocb *);
- ssize_t ret;
-
- if (!(retry = iocb->ki_retry)) {
- printk("aio_run_iocb: iocb->ki_retry = NULL\n");
- return 0;
- }
-
- /*
- * We don't want the next retry iteration for this
- * operation to start until this one has returned and
- * updated the iocb state. However, wait_queue functions
- * can trigger a kick_iocb from interrupt context in the
- * meantime, indicating that data is available for the next
- * iteration. We want to remember that and enable the
- * next retry iteration _after_ we are through with
- * this one.
- *
- * So, in order to be able to register a "kick", but
- * prevent it from being queued now, we clear the kick
- * flag, but make the kick code *think* that the iocb is
- * still on the run list until we are actually done.
- * When we are done with this iteration, we check if
- * the iocb was kicked in the meantime and if so, queue
- * it up afresh.
- */
-
- kiocbClearKicked(iocb);
-
- /*
- * This is so that aio_complete knows it doesn't need to
- * pull the iocb off the run list (We can't just call
- * INIT_LIST_HEAD because we don't want a kick_iocb to
- * queue this on the run list yet)
- */
- iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL;
- spin_unlock_irq(&ctx->ctx_lock);
-
- /* Quit retrying if the i/o has been cancelled */
- if (kiocbIsCancelled(iocb)) {
- ret = -EINTR;
- aio_complete(iocb, ret, 0);
- /* must not access the iocb after this */
- goto out;
- }
-
- /*
- * Now we are all set to call the retry method in async
- * context.
- */
- ret = retry(iocb);
-
- if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
- /*
- * There's no easy way to restart the syscall since other AIO's
- * may be already running. Just fail this IO with EINTR.
- */
- if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
- ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
- ret = -EINTR;
- aio_complete(iocb, ret, 0);
- }
-out:
- spin_lock_irq(&ctx->ctx_lock);
-
- if (-EIOCBRETRY == ret) {
- /*
- * OK, now that we are done with this iteration
- * and know that there is more left to go,
- * this is where we let go so that a subsequent
- * "kick" can start the next iteration
- */
-
- /* will make __queue_kicked_iocb succeed from here on */
- INIT_LIST_HEAD(&iocb->ki_run_list);
- /* we must queue the next iteration ourselves, if it
- * has already been kicked */
- if (kiocbIsKicked(iocb)) {
- __queue_kicked_iocb(iocb);
-
- /*
- * __queue_kicked_iocb will always return 1 here, because
- * iocb->ki_run_list is empty at this point so it should
- * be safe to unconditionally queue the context into the
- * work queue.
- */
- aio_queue_work(ctx);
- }
- }
- return ret;
-}
-
-/*
- * __aio_run_iocbs:
- * Process all pending retries queued on the ioctx
- * run list.
- * Assumes it is operating within the aio issuer's mm
- * context.
- */
-static int __aio_run_iocbs(struct kioctx *ctx)
-{
- struct kiocb *iocb;
- struct list_head run_list;
-
- assert_spin_locked(&ctx->ctx_lock);
-
- list_replace_init(&ctx->run_list, &run_list);
- while (!list_empty(&run_list)) {
- iocb = list_entry(run_list.next, struct kiocb,
- ki_run_list);
- list_del(&iocb->ki_run_list);
- /*
- * Hold an extra reference while retrying i/o.
- */
- iocb->ki_users++; /* grab extra reference */
- aio_run_iocb(iocb);
- __aio_put_req(ctx, iocb);
- }
- if (!list_empty(&ctx->run_list))
- return 1;
- return 0;
-}
-
-static void aio_queue_work(struct kioctx * ctx)
-{
- unsigned long timeout;
- /*
- * if someone is waiting, get the work started right
- * away, otherwise, use a longer delay
- */
- smp_mb();
- if (waitqueue_active(&ctx->wait))
- timeout = 1;
- else
- timeout = HZ/10;
- queue_delayed_work(aio_wq, &ctx->wq, timeout);
-}
-
-/*
- * aio_run_all_iocbs:
- * Process all pending retries queued on the ioctx
- * run list, and keep running them until the list
- * stays empty.
- * Assumes it is operating within the aio issuer's mm context.
- */
-static inline void aio_run_all_iocbs(struct kioctx *ctx)
-{
- spin_lock_irq(&ctx->ctx_lock);
- while (__aio_run_iocbs(ctx))
- ;
- spin_unlock_irq(&ctx->ctx_lock);
-}
-
-/*
- * aio_kick_handler:
- * Work queue handler triggered to process pending
- * retries on an ioctx. Takes on the aio issuer's
- * mm context before running the iocbs, so that
- * copy_xxx_user operates on the issuer's address
- * space.
- * Run on aiod's context.
- */
-static void aio_kick_handler(struct work_struct *work)
-{
- struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
- mm_segment_t oldfs = get_fs();
- struct mm_struct *mm;
- int requeue;
-
- set_fs(USER_DS);
- use_mm(ctx->mm);
- spin_lock_irq(&ctx->ctx_lock);
- requeue =__aio_run_iocbs(ctx);
- mm = ctx->mm;
- spin_unlock_irq(&ctx->ctx_lock);
- unuse_mm(mm);
- set_fs(oldfs);
- /*
- * we're in a worker thread already; no point using non-zero delay
- */
- if (requeue)
- queue_delayed_work(aio_wq, &ctx->wq, 0);
-}
-
-
-/*
- * Called by kick_iocb to queue the kiocb for retry
- * and if required activate the aio work queue to process
- * it
- */
-static void try_queue_kicked_iocb(struct kiocb *iocb)
-{
- struct kioctx *ctx = iocb->ki_ctx;
- unsigned long flags;
- int run = 0;
-
- spin_lock_irqsave(&ctx->ctx_lock, flags);
- /* set this inside the lock so that we can't race with aio_run_iocb()
- * testing it and putting the iocb on the run list under the lock */
- if (!kiocbTryKick(iocb))
- run = __queue_kicked_iocb(iocb);
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- if (run)
- aio_queue_work(ctx);
-}
-
-/*
- * kick_iocb:
- * Called typically from a wait queue callback context
- * to trigger a retry of the iocb.
- * The retry is usually executed by aio workqueue
- * threads (See aio_kick_handler).
- */
-void kick_iocb(struct kiocb *iocb)
-{
- /* sync iocbs are easy: they can only ever be executing from a
- * single context. */
- if (is_sync_kiocb(iocb)) {
- kiocbSetKicked(iocb);
- wake_up_process(iocb->ki_obj.tsk);
- return;
- }
-
- try_queue_kicked_iocb(iocb);
-}
-EXPORT_SYMBOL(kick_iocb);
-
/* aio_complete
* Called when the io request on the given iocb is complete.
- * Returns true if this is the last user of the request. The
- * only other user of the request can be the cancellation code.
*/
-int aio_complete(struct kiocb *iocb, long res, long res2)
+void aio_complete(struct kiocb *iocb, long res, long res2)
{
struct kioctx *ctx = iocb->ki_ctx;
- struct aio_ring_info *info;
struct aio_ring *ring;
- struct io_event *event;
+ struct io_event *ev_page, *event;
unsigned long flags;
- unsigned long tail;
- int ret;
+ unsigned tail, pos;
/*
* Special case handling for sync iocbs:
@@ -909,61 +591,81 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
* - the sync task helpfully left a reference to itself in the iocb
*/
if (is_sync_kiocb(iocb)) {
- BUG_ON(iocb->ki_users != 1);
+ BUG_ON(atomic_read(&iocb->ki_users) != 1);
iocb->ki_user_data = res;
- iocb->ki_users = 0;
+ atomic_set(&iocb->ki_users, 0);
wake_up_process(iocb->ki_obj.tsk);
- return 1;
+ return;
}
- info = &ctx->ring_info;
-
- /* add a completion event to the ring buffer.
- * must be done holding ctx->ctx_lock to prevent
- * other code from messing with the tail
- * pointer since we might be called from irq
- * context.
+ /*
+ * Take rcu_read_lock() in case the kioctx is being destroyed, as we
+ * need to issue a wakeup after decrementing reqs_active.
*/
- spin_lock_irqsave(&ctx->ctx_lock, flags);
+ rcu_read_lock();
- if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
- list_del_init(&iocb->ki_run_list);
+ if (iocb->ki_list.next) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+ list_del(&iocb->ki_list);
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+ }
/*
* cancelled requests don't get events, userland was given one
* when the event got cancelled.
*/
- if (kiocbIsCancelled(iocb))
+ if (unlikely(xchg(&iocb->ki_cancel,
+ KIOCB_CANCELLED) == KIOCB_CANCELLED)) {
+ atomic_dec(&ctx->reqs_active);
+ /* Still need the wake_up in case free_ioctx is waiting */
goto put_rq;
+ }
- ring = kmap_atomic(info->ring_pages[0]);
+ /*
+ * Add a completion event to the ring buffer. Must be done holding
+ * ctx->completion_lock to prevent other code from messing with the tail
+ * pointer since we might be called from irq context.
+ */
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+
+ tail = ctx->tail;
+ pos = tail + AIO_EVENTS_OFFSET;
- tail = info->tail;
- event = aio_ring_event(info, tail);
- if (++tail >= info->nr)
+ if (++tail >= ctx->nr_events)
tail = 0;
+ ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+ event = ev_page + pos % AIO_EVENTS_PER_PAGE;
+
event->obj = (u64)(unsigned long)iocb->ki_obj.user;
event->data = iocb->ki_user_data;
event->res = res;
event->res2 = res2;
- dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
- ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
- res, res2);
+ kunmap_atomic(ev_page);
+ flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+
+ pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
+ ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+ res, res2);
/* after flagging the request as done, we
* must never even look at it again
*/
smp_wmb(); /* make event visible before updating tail */
- info->tail = tail;
- ring->tail = tail;
+ ctx->tail = tail;
- put_aio_ring_event(event);
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ ring->tail = tail;
kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
- pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+ pr_debug("added to ring %p at [%u]\n", iocb, tail);
/*
* Check if the user asked us to deliver the result through an
@@ -975,7 +677,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
put_rq:
/* everything turned out well, dispose of the aiocb. */
- ret = __aio_put_req(ctx, iocb);
+ aio_put_req(iocb);
/*
* We have to order our ring_info tail store above and test
@@ -988,233 +690,133 @@ put_rq:
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- return ret;
+ rcu_read_unlock();
}
EXPORT_SYMBOL(aio_complete);
-/* aio_read_evt
- * Pull an event off of the ioctx's event ring. Returns the number of
- * events fetched (0 or 1 ;-)
- * FIXME: make this use cmpxchg.
- * TODO: make the ringbuffer user mmap()able (requires FIXME).
+/* aio_read_events_ring
+ * Pull up to nr events off of the ioctx's event ring. Returns the number of
+ * events fetched, or -EFAULT if copying them to userspace fails.
*/
-static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+static long aio_read_events_ring(struct kioctx *ctx,
+ struct io_event __user *event, long nr)
{
- struct aio_ring_info *info = &ioctx->ring_info;
struct aio_ring *ring;
- unsigned long head;
- int ret = 0;
-
- ring = kmap_atomic(info->ring_pages[0]);
- dprintk("in aio_read_evt h%lu t%lu m%lu\n",
- (unsigned long)ring->head, (unsigned long)ring->tail,
- (unsigned long)ring->nr);
-
- if (ring->head == ring->tail)
- goto out;
+ unsigned head, pos;
+ long ret = 0;
+ int copy_ret;
- spin_lock(&info->ring_lock);
-
- head = ring->head % info->nr;
- if (head != ring->tail) {
- struct io_event *evp = aio_ring_event(info, head);
- *ent = *evp;
- head = (head + 1) % info->nr;
- smp_mb(); /* finish reading the event before updatng the head */
- ring->head = head;
- ret = 1;
- put_aio_ring_event(evp);
- }
- spin_unlock(&info->ring_lock);
+ mutex_lock(&ctx->ring_lock);
-out:
- dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
- (unsigned long)ring->head, (unsigned long)ring->tail);
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
kunmap_atomic(ring);
- return ret;
-}
-struct aio_timeout {
- struct timer_list timer;
- int timed_out;
- struct task_struct *p;
-};
+ pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events);
-static void timeout_func(unsigned long data)
-{
- struct aio_timeout *to = (struct aio_timeout *)data;
+ if (head == ctx->tail)
+ goto out;
- to->timed_out = 1;
- wake_up_process(to->p);
-}
+ while (ret < nr) {
+ long avail;
+ struct io_event *ev;
+ struct page *page;
-static inline void init_timeout(struct aio_timeout *to)
-{
- setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to);
- to->timed_out = 0;
- to->p = current;
-}
+ avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
+ if (head == ctx->tail)
+ break;
-static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
- const struct timespec *ts)
-{
- to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
- if (time_after(to->timer.expires, jiffies))
- add_timer(&to->timer);
- else
- to->timed_out = 1;
-}
+ avail = min(avail, nr - ret);
+ avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
+ ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
-static inline void clear_timeout(struct aio_timeout *to)
-{
- del_singleshot_timer_sync(&to->timer);
-}
+ pos = head + AIO_EVENTS_OFFSET;
+ page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+ pos %= AIO_EVENTS_PER_PAGE;
-static int read_events(struct kioctx *ctx,
- long min_nr, long nr,
- struct io_event __user *event,
- struct timespec __user *timeout)
-{
- long start_jiffies = jiffies;
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
- int ret;
- int i = 0;
- struct io_event ent;
- struct aio_timeout to;
- int retry = 0;
-
- /* needed to zero any padding within an entry (there shouldn't be
- * any, but C is fun!
- */
- memset(&ent, 0, sizeof(ent));
-retry:
- ret = 0;
- while (likely(i < nr)) {
- ret = aio_read_evt(ctx, &ent);
- if (unlikely(ret <= 0))
- break;
+ ev = kmap(page);
+ copy_ret = copy_to_user(event + ret, ev + pos,
+ sizeof(*ev) * avail);
+ kunmap(page);
- dprintk("read event: %Lx %Lx %Lx %Lx\n",
- ent.data, ent.obj, ent.res, ent.res2);
-
- /* Could we split the check in two? */
- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- dprintk("aio: lost an event due to EFAULT.\n");
- break;
+ if (unlikely(copy_ret)) {
+ ret = -EFAULT;
+ goto out;
}
- ret = 0;
- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
+ ret += avail;
+ head += avail;
+ head %= ctx->nr_events;
}
- if (min_nr <= i)
- return i;
- if (ret)
- return ret;
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ ring->head = head;
+ kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
- /* End fast path */
+ pr_debug("%li h%u t%u\n", ret, head, ctx->tail);
- /* racey check, but it gets redone */
- if (!retry && unlikely(!list_empty(&ctx->run_list))) {
- retry = 1;
- aio_run_all_iocbs(ctx);
- goto retry;
- }
+ atomic_sub(ret, &ctx->reqs_active);
+out:
+ mutex_unlock(&ctx->ring_lock);
- init_timeout(&to);
- if (timeout) {
- struct timespec ts;
- ret = -EFAULT;
- if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
- goto out;
+ return ret;
+}
- set_timeout(start_jiffies, &to, &ts);
- }
+static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
+ struct io_event __user *event, long *i)
+{
+ long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
- while (likely(i < nr)) {
- add_wait_queue_exclusive(&ctx->wait, &wait);
- do {
- set_task_state(tsk, TASK_INTERRUPTIBLE);
- ret = aio_read_evt(ctx, &ent);
- if (ret)
- break;
- if (min_nr <= i)
- break;
- if (unlikely(ctx->dead)) {
- ret = -EINVAL;
- break;
- }
- if (to.timed_out) /* Only check after read evt */
- break;
- /* Try to only show up in io wait if there are ops
- * in flight */
- if (ctx->reqs_active)
- io_schedule();
- else
- schedule();
- if (signal_pending(tsk)) {
- ret = -EINTR;
- break;
- }
- /*ret = aio_read_evt(ctx, &ent);*/
- } while (1) ;
-
- set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&ctx->wait, &wait);
-
- if (unlikely(ret <= 0))
- break;
+ if (ret > 0)
+ *i += ret;
- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- dprintk("aio: lost an event due to EFAULT.\n");
- break;
- }
+ if (unlikely(atomic_read(&ctx->dead)))
+ ret = -EINVAL;
- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
- }
+ if (!*i)
+ *i = ret;
- if (timeout)
- clear_timeout(&to);
-out:
- destroy_timer_on_stack(&to.timer);
- return i ? i : ret;
+ return ret < 0 || *i >= min_nr;
}
-/* Take an ioctx and remove it from the list of ioctx's. Protects
- * against races with itself via ->dead.
- */
-static void io_destroy(struct kioctx *ioctx)
+static long read_events(struct kioctx *ctx, long min_nr, long nr,
+ struct io_event __user *event,
+ struct timespec __user *timeout)
{
- struct mm_struct *mm = current->mm;
- int was_dead;
+ ktime_t until = { .tv64 = KTIME_MAX };
+ long ret = 0;
- /* delete the entry from the list is someone else hasn't already */
- spin_lock(&mm->ioctx_lock);
- was_dead = ioctx->dead;
- ioctx->dead = 1;
- hlist_del_rcu(&ioctx->list);
- spin_unlock(&mm->ioctx_lock);
+ if (timeout) {
+ struct timespec ts;
- dprintk("aio_release(%p)\n", ioctx);
- if (likely(!was_dead))
- put_ioctx(ioctx); /* twice for the list */
+ if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
+ return -EFAULT;
- kill_ctx(ioctx);
+ until = timespec_to_ktime(ts);
+ }
/*
- * Wake up any waiters. The setting of ctx->dead must be seen
- * by other CPUs at this point. Right now, we rely on the
- * locking done by the above calls to ensure this consistency.
+ * Note that aio_read_events() is being called as the conditional - i.e.
+ * we're calling it after prepare_to_wait() has set task state to
+ * TASK_INTERRUPTIBLE.
+ *
+ * But aio_read_events() can block, and if it blocks it's going to flip
+ * the task state back to TASK_RUNNING.
+ *
+ * This should be ok, provided it doesn't flip the state back to
+ * TASK_RUNNING and return 0 too much - that causes us to spin. That
+ * will only happen if the mutex_lock() call blocks, and we then find
+ * the ringbuffer empty. So in practice we should be ok, but it's
+ * something to be aware of when touching this code.
*/
- wake_up_all(&ioctx->wait);
+ wait_event_interruptible_hrtimeout(ctx->wait,
+ aio_read_events(ctx, min_nr, nr, event, &ret), until);
+
+ if (!ret && signal_pending(current))
+ ret = -EINTR;
+
+ return ret;
}
/* sys_io_setup:
@@ -1252,7 +854,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
if (ret)
- io_destroy(ioctx);
+ kill_ioctx(ioctx);
put_ioctx(ioctx);
}
@@ -1270,7 +872,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
{
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
- io_destroy(ioctx);
+ kill_ioctx(ioctx);
put_ioctx(ioctx);
return 0;
}
@@ -1301,29 +903,22 @@ static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
BUG_ON(ret > 0 && iocb->ki_left == 0);
}
-static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
+typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
+
+static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
- unsigned long, loff_t);
ssize_t ret = 0;
- unsigned short opcode;
-
- if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
- (iocb->ki_opcode == IOCB_CMD_PREAD)) {
- rw_op = file->f_op->aio_read;
- opcode = IOCB_CMD_PREADV;
- } else {
- rw_op = file->f_op->aio_write;
- opcode = IOCB_CMD_PWRITEV;
- }
/* This matches the pread()/pwrite() logic */
if (iocb->ki_pos < 0)
return -EINVAL;
+ if (rw == WRITE)
+ file_start_write(file);
do {
ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
iocb->ki_nr_segs - iocb->ki_cur_seg,
@@ -1334,8 +929,10 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
/* retry all partial writes. retry partial reads as long as its a
* regular file. */
} while (ret > 0 && iocb->ki_left > 0 &&
- (opcode == IOCB_CMD_PWRITEV ||
+ (rw == WRITE ||
(!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
+ if (rw == WRITE)
+ file_end_write(file);
/* This means we must have transferred all that we could */
/* No need to retry anymore */
@@ -1344,81 +941,49 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
/* If we managed to write some out we return that, rather than
* the eventual error. */
- if (opcode == IOCB_CMD_PWRITEV
- && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY
+ if (rw == WRITE
+ && ret < 0 && ret != -EIOCBQUEUED
&& iocb->ki_nbytes - iocb->ki_left)
ret = iocb->ki_nbytes - iocb->ki_left;
return ret;
}
-static ssize_t aio_fdsync(struct kiocb *iocb)
-{
- struct file *file = iocb->ki_filp;
- ssize_t ret = -EINVAL;
-
- if (file->f_op->aio_fsync)
- ret = file->f_op->aio_fsync(iocb, 1);
- return ret;
-}
-
-static ssize_t aio_fsync(struct kiocb *iocb)
-{
- struct file *file = iocb->ki_filp;
- ssize_t ret = -EINVAL;
-
- if (file->f_op->aio_fsync)
- ret = file->f_op->aio_fsync(iocb, 0);
- return ret;
-}
-
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
+static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat)
{
ssize_t ret;
+ kiocb->ki_nr_segs = kiocb->ki_nbytes;
+
#ifdef CONFIG_COMPAT
if (compat)
- ret = compat_rw_copy_check_uvector(type,
+ ret = compat_rw_copy_check_uvector(rw,
(struct compat_iovec __user *)kiocb->ki_buf,
- kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+ kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
&kiocb->ki_iovec);
else
#endif
- ret = rw_copy_check_uvector(type,
+ ret = rw_copy_check_uvector(rw,
(struct iovec __user *)kiocb->ki_buf,
- kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+ kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
&kiocb->ki_iovec);
if (ret < 0)
- goto out;
-
- ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret);
- if (ret < 0)
- goto out;
+ return ret;
- kiocb->ki_nr_segs = kiocb->ki_nbytes;
- kiocb->ki_cur_seg = 0;
- /* ki_nbytes/left now reflect bytes instead of segs */
+ /* ki_nbytes now reflects bytes instead of segs */
kiocb->ki_nbytes = ret;
- kiocb->ki_left = ret;
-
- ret = 0;
-out:
- return ret;
+ return 0;
}
-static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb)
+static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
{
- int bytes;
-
- bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left);
- if (bytes < 0)
- return bytes;
+ if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes)))
+ return -EFAULT;
kiocb->ki_iovec = &kiocb->ki_inline_vec;
kiocb->ki_iovec->iov_base = kiocb->ki_buf;
- kiocb->ki_iovec->iov_len = bytes;
+ kiocb->ki_iovec->iov_len = kiocb->ki_nbytes;
kiocb->ki_nr_segs = 1;
- kiocb->ki_cur_seg = 0;
return 0;
}
@@ -1427,96 +992,95 @@ static ssize_t aio_setup_single_vector(int type, struct file * file, struct kioc
* Performs the initial checks and aio retry method
* setup for the kiocb at the time of io submission.
*/
-static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
+static ssize_t aio_run_iocb(struct kiocb *req, bool compat)
{
- struct file *file = kiocb->ki_filp;
- ssize_t ret = 0;
+ struct file *file = req->ki_filp;
+ ssize_t ret;
+ int rw;
+ fmode_t mode;
+ aio_rw_op *rw_op;
- switch (kiocb->ki_opcode) {
+ switch (req->ki_opcode) {
case IOCB_CMD_PREAD:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_READ)))
- break;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf,
- kiocb->ki_left)))
- break;
- ret = aio_setup_single_vector(READ, file, kiocb);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_read)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
- case IOCB_CMD_PWRITE:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_WRITE)))
- break;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf,
- kiocb->ki_left)))
- break;
- ret = aio_setup_single_vector(WRITE, file, kiocb);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_write)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
case IOCB_CMD_PREADV:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_READ)))
- break;
- ret = aio_setup_vectored_rw(READ, kiocb, compat);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_read)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
+ mode = FMODE_READ;
+ rw = READ;
+ rw_op = file->f_op->aio_read;
+ goto rw_common;
+
+ case IOCB_CMD_PWRITE:
case IOCB_CMD_PWRITEV:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_WRITE)))
- break;
- ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
+ mode = FMODE_WRITE;
+ rw = WRITE;
+ rw_op = file->f_op->aio_write;
+ goto rw_common;
+rw_common:
+ if (unlikely(!(file->f_mode & mode)))
+ return -EBADF;
+
+ if (!rw_op)
+ return -EINVAL;
+
+ ret = (req->ki_opcode == IOCB_CMD_PREADV ||
+ req->ki_opcode == IOCB_CMD_PWRITEV)
+ ? aio_setup_vectored_rw(rw, req, compat)
+ : aio_setup_single_vector(rw, req);
if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_write)
- kiocb->ki_retry = aio_rw_vect_retry;
+ return ret;
+
+ ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+ if (ret < 0)
+ return ret;
+
+ req->ki_nbytes = ret;
+ req->ki_left = ret;
+
+ ret = aio_rw_vect_retry(req, rw, rw_op);
break;
+
case IOCB_CMD_FDSYNC:
- ret = -EINVAL;
- if (file->f_op->aio_fsync)
- kiocb->ki_retry = aio_fdsync;
+ if (!file->f_op->aio_fsync)
+ return -EINVAL;
+
+ ret = file->f_op->aio_fsync(req, 1);
break;
+
case IOCB_CMD_FSYNC:
- ret = -EINVAL;
- if (file->f_op->aio_fsync)
- kiocb->ki_retry = aio_fsync;
+ if (!file->f_op->aio_fsync)
+ return -EINVAL;
+
+ ret = file->f_op->aio_fsync(req, 0);
break;
+
default:
- dprintk("EINVAL: io_submit: no operation provided\n");
- ret = -EINVAL;
+ pr_debug("EINVAL: no operation provided\n");
+ return -EINVAL;
}
- if (!kiocb->ki_retry)
- return ret;
+ if (ret != -EIOCBQUEUED) {
+ /*
+ * There's no easy way to restart the syscall since other AIO's
+ * may be already running. Just fail this IO with EINTR.
+ */
+ if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+ ret == -ERESTARTNOHAND ||
+ ret == -ERESTART_RESTARTBLOCK))
+ ret = -EINTR;
+ aio_complete(req, ret, 0);
+ }
return 0;
}
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- struct iocb *iocb, struct kiocb_batch *batch,
- bool compat)
+ struct iocb *iocb, bool compat)
{
struct kiocb *req;
- struct file *file;
ssize_t ret;
/* enforce forwards compatibility on users */
if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
- pr_debug("EINVAL: io_submit: reserve field set\n");
+ pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
@@ -1530,16 +1094,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
return -EINVAL;
}
- file = fget(iocb->aio_fildes);
- if (unlikely(!file))
- return -EBADF;
-
- req = aio_get_req(ctx, batch); /* returns with 2 references to req */
- if (unlikely(!req)) {
- fput(file);
+ req = aio_get_req(ctx);
+ if (unlikely(!req))
return -EAGAIN;
+
+ req->ki_filp = fget(iocb->aio_fildes);
+ if (unlikely(!req->ki_filp)) {
+ ret = -EBADF;
+ goto out_put_req;
}
- req->ki_filp = file;
+
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -1555,9 +1119,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
}
}
- ret = put_user(req->ki_key, &user_iocb->aio_key);
+ ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
if (unlikely(ret)) {
- dprintk("EFAULT: aio_key\n");
+ pr_debug("EFAULT: aio_key\n");
goto out_put_req;
}
@@ -1569,41 +1133,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
req->ki_opcode = iocb->aio_lio_opcode;
- ret = aio_setup_iocb(req, compat);
-
+ ret = aio_run_iocb(req, compat);
if (ret)
goto out_put_req;
- spin_lock_irq(&ctx->ctx_lock);
- /*
- * We could have raced with io_destroy() and are currently holding a
- * reference to ctx which should be destroyed. We cannot submit IO
- * since ctx gets freed as soon as io_submit() puts its reference. The
- * check here is reliable: io_destroy() sets ctx->dead before waiting
- * for outstanding IO and the barrier between these two is realized by
- * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we
- * increment ctx->reqs_active before checking for ctx->dead and the
- * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
- * don't see ctx->dead set here, io_destroy() waits for our IO to
- * finish.
- */
- if (ctx->dead) {
- spin_unlock_irq(&ctx->ctx_lock);
- ret = -EINVAL;
- goto out_put_req;
- }
- aio_run_iocb(req);
- if (!list_empty(&ctx->run_list)) {
- /* drain the run list */
- while (__aio_run_iocbs(ctx))
- ;
- }
- spin_unlock_irq(&ctx->ctx_lock);
-
aio_put_req(req); /* drop extra ref to req */
return 0;
-
out_put_req:
+ atomic_dec(&ctx->reqs_active);
aio_put_req(req); /* drop extra ref to req */
aio_put_req(req); /* drop i/o ref to req */
return ret;
@@ -1616,7 +1153,6 @@ long do_io_submit(aio_context_t ctx_id, long nr,
long ret = 0;
int i = 0;
struct blk_plug plug;
- struct kiocb_batch batch;
if (unlikely(nr < 0))
return -EINVAL;
@@ -1629,12 +1165,10 @@ long do_io_submit(aio_context_t ctx_id, long nr,
ctx = lookup_ioctx(ctx_id);
if (unlikely(!ctx)) {
- pr_debug("EINVAL: io_submit: invalid context id\n");
+ pr_debug("EINVAL: invalid context id\n");
return -EINVAL;
}
- kiocb_batch_init(&batch, nr);
-
blk_start_plug(&plug);
/*
@@ -1655,13 +1189,12 @@ long do_io_submit(aio_context_t ctx_id, long nr,
break;
}
- ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat);
+ ret = io_submit_one(ctx, user_iocb, &tmp, compat);
if (ret)
break;
}
blk_finish_plug(&plug);
- kiocb_batch_free(ctx, &batch);
put_ioctx(ctx);
return i ? i : ret;
}
@@ -1694,10 +1227,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
assert_spin_locked(&ctx->ctx_lock);
+ if (key != KIOCB_KEY)
+ return NULL;
+
/* TODO: use a hash or array, this sucks. */
list_for_each(pos, &ctx->active_reqs) {
struct kiocb *kiocb = list_kiocb(pos);
- if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key)
+ if (kiocb->ki_obj.user == iocb)
return kiocb;
}
return NULL;
@@ -1716,7 +1252,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
struct io_event __user *, result)
{
- int (*cancel)(struct kiocb *iocb, struct io_event *res);
+ struct io_event res;
struct kioctx *ctx;
struct kiocb *kiocb;
u32 key;
@@ -1731,32 +1267,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
return -EINVAL;
spin_lock_irq(&ctx->ctx_lock);
- ret = -EAGAIN;
+
kiocb = lookup_kiocb(ctx, iocb, key);
- if (kiocb && kiocb->ki_cancel) {
- cancel = kiocb->ki_cancel;
- kiocb->ki_users ++;
- kiocbSetCancelled(kiocb);
- } else
- cancel = NULL;
+ if (kiocb)
+ ret = kiocb_cancel(ctx, kiocb, &res);
+ else
+ ret = -EINVAL;
+
spin_unlock_irq(&ctx->ctx_lock);
- if (NULL != cancel) {
- struct io_event tmp;
- pr_debug("calling cancel\n");
- memset(&tmp, 0, sizeof(tmp));
- tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
- tmp.data = kiocb->ki_user_data;
- ret = cancel(kiocb, &tmp);
- if (!ret) {
- /* Cancellation succeeded -- copy the result
- * into the user's buffer.
- */
- if (copy_to_user(result, &tmp, sizeof(tmp)))
- ret = -EFAULT;
- }
- } else
- ret = -EINVAL;
+ if (!ret) {
+ /* Cancellation succeeded -- copy the result
+ * into the user's buffer.
+ */
+ if (copy_to_user(result, &res, sizeof(res)))
+ ret = -EFAULT;
+ }
put_ioctx(ctx);
@@ -1773,8 +1299,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
* < min_nr if the timeout specified by timeout has elapsed
* before sufficient events are available, where timeout == NULL
* specifies an infinite timeout. Note that the timeout pointed to by
- * timeout is relative and will be updated if not NULL and the
- * operation blocks. Will fail with -ENOSYS if not implemented.
+ * timeout is relative. Will fail with -ENOSYS if not implemented.
*/
SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
long, min_nr,
@@ -1790,7 +1315,5 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
ret = read_events(ioctx, min_nr, nr, events, timeout);
put_ioctx(ioctx);
}
-
- asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout);
return ret;
}
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 01443ce43ee7..3d9d3f5d5dda 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -61,15 +61,6 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
/* This is an autofs submount, we can't expire it */
if (autofs_type_indirect(sbi->type))
goto done;
-
- /*
- * Otherwise it's an offset mount and we need to check
- * if we can umount its mount, if there is one.
- */
- if (!d_mountpoint(path.dentry)) {
- status = 0;
- goto done;
- }
}
/* Update the expiry counter if fs is busy */
@@ -118,7 +109,7 @@ cont:
spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
/* Already gone or negative dentry (under construction) - try next */
- if (q->d_count == 0 || !simple_positive(q)) {
+ if (!d_count(q) || !simple_positive(q)) {
spin_unlock(&q->d_lock);
next = q->d_u.d_child.next;
goto cont;
@@ -276,7 +267,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
else
ino_count++;
- if (p->d_count > ino_count) {
+ if (d_count(p) > ino_count) {
top_ino->last_used = jiffies;
dput(p);
return 1;
@@ -418,7 +409,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
if (!exp_leaves) {
/* Path walk currently on this dentry? */
ino_count = atomic_read(&ino->count) + 1;
- if (dentry->d_count > ino_count)
+ if (d_count(dentry) > ino_count)
goto next;
if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -432,7 +423,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
} else {
/* Path walk currently on this dentry? */
ino_count = atomic_read(&ino->count) + 1;
- if (dentry->d_count > ino_count)
+ if (d_count(dentry) > ino_count)
goto next;
expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 9bd16255dd9c..92ef341ba0cf 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -41,7 +41,7 @@ const struct file_operations autofs4_root_operations = {
.open = dcache_dir_open,
.release = dcache_dir_close,
.read = generic_read_dir,
- .readdir = dcache_readdir,
+ .iterate = dcache_readdir,
.llseek = dcache_dir_lseek,
.unlocked_ioctl = autofs4_root_ioctl,
#ifdef CONFIG_COMPAT
@@ -53,7 +53,7 @@ const struct file_operations autofs4_dir_operations = {
.open = autofs4_dir_open,
.release = dcache_dir_close,
.read = generic_read_dir,
- .readdir = dcache_readdir,
+ .iterate = dcache_readdir,
.llseek = dcache_dir_lseek,
};
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
spin_lock(&active->d_lock);
/* Already gone? */
- if (active->d_count == 0)
+ if (!d_count(active))
goto next;
qstr = &active->d_name;
@@ -408,7 +408,7 @@ done:
return NULL;
}
-int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
+static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 922ad460bff9..7c93953030fb 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -45,7 +45,7 @@ static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return -EIO;
}
-static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int bad_file_readdir(struct file *file, struct dir_context *ctx)
{
return -EIO;
}
@@ -152,7 +152,7 @@ static const struct file_operations bad_file_ops =
.write = bad_file_write,
.aio_read = bad_file_aio_read,
.aio_write = bad_file_aio_write,
- .readdir = bad_file_readdir,
+ .iterate = bad_file_readdir,
.poll = bad_file_poll,
.unlocked_ioctl = bad_file_unlocked_ioctl,
.compat_ioctl = bad_file_compat_ioctl,
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index a66c9b1136e0..74e397db0b8b 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -436,8 +436,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
goto error;
}
- if ((this_node = (befs_btree_node *)
- kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
+ if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
befs_error(sb, "befs_btree_read() failed to allocate %u "
"bytes of memory", sizeof (befs_btree_node));
goto error;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 8615ee89ab55..e9c75e20db32 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -31,7 +31,7 @@ MODULE_LICENSE("GPL");
/* The units the vfs expects inode->i_blocks to be in */
#define VFS_BLOCK_SIZE 512
-static int befs_readdir(struct file *, void *, filldir_t);
+static int befs_readdir(struct file *, struct dir_context *);
static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
static int befs_readpage(struct file *file, struct page *page);
static sector_t befs_bmap(struct address_space *mapping, sector_t block);
@@ -66,7 +66,7 @@ static struct kmem_cache *befs_inode_cachep;
static const struct file_operations befs_dir_operations = {
.read = generic_read_dir,
- .readdir = befs_readdir,
+ .iterate = befs_readdir,
.llseek = generic_file_llseek,
};
@@ -211,9 +211,9 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
}
static int
-befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+befs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
befs_off_t value;
@@ -221,15 +221,14 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
size_t keysize;
unsigned char d_type;
char keybuf[BEFS_NAME_LEN + 1];
- char *nlsname;
- int nlsnamelen;
- const char *dirname = filp->f_path.dentry->d_name.name;
+ const char *dirname = file->f_path.dentry->d_name.name;
befs_debug(sb, "---> befs_readdir() "
- "name %s, inode %ld, filp->f_pos %Ld",
- dirname, inode->i_ino, filp->f_pos);
+ "name %s, inode %ld, ctx->pos %Ld",
+ dirname, inode->i_ino, ctx->pos);
- result = befs_btree_read(sb, ds, filp->f_pos, BEFS_NAME_LEN + 1,
+more:
+ result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
keybuf, &keysize, &value);
if (result == BEFS_ERR) {
@@ -251,24 +250,29 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* Convert to NLS */
if (BEFS_SB(sb)->nls) {
+ char *nlsname;
+ int nlsnamelen;
result =
befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
if (result < 0) {
befs_debug(sb, "<--- befs_readdir() ERROR");
return result;
}
- result = filldir(dirent, nlsname, nlsnamelen, filp->f_pos,
- (ino_t) value, d_type);
+ if (!dir_emit(ctx, nlsname, nlsnamelen,
+ (ino_t) value, d_type)) {
+ kfree(nlsname);
+ return 0;
+ }
kfree(nlsname);
-
} else {
- result = filldir(dirent, keybuf, keysize, filp->f_pos,
- (ino_t) value, d_type);
+ if (!dir_emit(ctx, keybuf, keysize,
+ (ino_t) value, d_type))
+ return 0;
}
+ ctx->pos++;
+ goto more;
- filp->f_pos++;
-
- befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos);
+ befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos);
return 0;
}
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 3f422f6bb5ca..a399e6d9dc74 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -26,58 +26,51 @@ static struct buffer_head *bfs_find_entry(struct inode *dir,
const unsigned char *name, int namelen,
struct bfs_dirent **res_dir);
-static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
+static int bfs_readdir(struct file *f, struct dir_context *ctx)
{
struct inode *dir = file_inode(f);
struct buffer_head *bh;
struct bfs_dirent *de;
- struct bfs_sb_info *info = BFS_SB(dir->i_sb);
unsigned int offset;
int block;
- mutex_lock(&info->bfs_lock);
-
- if (f->f_pos & (BFS_DIRENT_SIZE - 1)) {
+ if (ctx->pos & (BFS_DIRENT_SIZE - 1)) {
printf("Bad f_pos=%08lx for %s:%08lx\n",
- (unsigned long)f->f_pos,
+ (unsigned long)ctx->pos,
dir->i_sb->s_id, dir->i_ino);
- mutex_unlock(&info->bfs_lock);
- return -EBADF;
+ return -EINVAL;
}
- while (f->f_pos < dir->i_size) {
- offset = f->f_pos & (BFS_BSIZE - 1);
- block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS);
+ while (ctx->pos < dir->i_size) {
+ offset = ctx->pos & (BFS_BSIZE - 1);
+ block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS);
bh = sb_bread(dir->i_sb, block);
if (!bh) {
- f->f_pos += BFS_BSIZE - offset;
+ ctx->pos += BFS_BSIZE - offset;
continue;
}
do {
de = (struct bfs_dirent *)(bh->b_data + offset);
if (de->ino) {
int size = strnlen(de->name, BFS_NAMELEN);
- if (filldir(dirent, de->name, size, f->f_pos,
+ if (!dir_emit(ctx, de->name, size,
le16_to_cpu(de->ino),
- DT_UNKNOWN) < 0) {
+ DT_UNKNOWN)) {
brelse(bh);
- mutex_unlock(&info->bfs_lock);
return 0;
}
}
offset += BFS_DIRENT_SIZE;
- f->f_pos += BFS_DIRENT_SIZE;
- } while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size));
+ ctx->pos += BFS_DIRENT_SIZE;
+ } while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size));
brelse(bh);
}
-
- mutex_unlock(&info->bfs_lock);
- return 0;
+ return 0;
}
const struct file_operations bfs_dir_operations = {
.read = generic_read_dir,
- .readdir = bfs_readdir,
+ .iterate = bfs_readdir,
.fsync = generic_file_fsync,
.llseek = generic_file_llseek,
};
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index bbc8f8827eac..89dec7f789a4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -62,7 +62,6 @@ static int aout_core_dump(struct coredump_params *cprm)
fs = get_fs();
set_fs(KERNEL_DS);
has_dumped = 1;
- current->flags |= PF_DUMPCORE;
strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
dump.u_ar0 = offsetof(struct user, regs);
dump.signal = cprm->siginfo->si_signo;
@@ -256,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
(current->mm->start_data = N_DATADDR(ex));
current->mm->brk = ex.a_bss +
(current->mm->start_brk = N_BSSADDR(ex));
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
if (retval < 0) {
@@ -287,15 +284,12 @@ static int load_aout_binary(struct linux_binprm * bprm)
return error;
}
- error = bprm->file->f_op->read(bprm->file,
- (char __user *)text_addr,
- ex.a_text+ex.a_data, &pos);
+ error = read_code(bprm->file, text_addr, pos,
+ ex.a_text+ex.a_data);
if ((signed long)error < 0) {
send_sig(SIGKILL, current, 0);
return error;
}
-
- flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
} else {
if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
(N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
@@ -311,14 +305,9 @@ static int load_aout_binary(struct linux_binprm * bprm)
}
if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
- loff_t pos = fd_offset;
vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
- bprm->file->f_op->read(bprm->file,
- (char __user *)N_TXTADDR(ex),
- ex.a_text+ex.a_data, &pos);
- flush_icache_range((unsigned long) N_TXTADDR(ex),
- (unsigned long) N_TXTADDR(ex) +
- ex.a_text+ex.a_data);
+ read_code(bprm->file, N_TXTADDR(ex), fd_offset,
+ ex.a_text + ex.a_data);
goto beyond_if;
}
@@ -397,8 +386,6 @@ static int load_aout_library(struct file *file)
start_addr = ex.a_entry & 0xfffff000;
if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
- loff_t pos = N_TXTOFF(ex);
-
if (printk_ratelimit())
{
printk(KERN_WARNING
@@ -407,11 +394,8 @@ static int load_aout_library(struct file *file)
}
vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
- file->f_op->read(file, (char __user *)start_addr,
- ex.a_text + ex.a_data, &pos);
- flush_icache_range((unsigned long) start_addr,
- (unsigned long) start_addr + ex.a_text + ex.a_data);
-
+ read_code(file, start_addr, N_TXTOFF(ex),
+ ex.a_text + ex.a_data);
retval = 0;
goto out;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 86af964c2425..100edcc5e312 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -240,6 +240,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid));
NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
+#ifdef ELF_HWCAP2
+ NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
+#endif
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
if (k_platform) {
NEW_AUX_ENT(AT_PLATFORM,
@@ -735,8 +738,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
/* Do this so that we can load the interpreter, if need be. We will
change some of these later */
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
if (retval < 0) {
@@ -803,7 +804,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
* follow the loader, and is not movable. */
#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
/* Memory randomization might have been switched off
- * in runtime via sysctl.
+ * in runtime via sysctl or explicit setting of
+ * personality flags.
* If that is the case, retain the original non-zero
* load_bias value in order to establish proper
* non-randomized mappings.
@@ -2091,8 +2093,7 @@ static int elf_core_dump(struct coredump_params *cprm)
goto cleanup;
has_dumped = 1;
- current->flags |= PF_DUMPCORE;
-
+
fs = get_fs();
set_fs(KERNEL_DS);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 9c13e023e2b7..c166f325a183 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -483,7 +483,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
size_t platform_len = 0, len;
char *k_platform, *k_base_platform;
char __user *u_platform, *u_base_platform, *p;
- long hwcap;
int loop;
int nr; /* reset for each csp adjustment */
@@ -502,8 +501,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
return -EFAULT;
#endif
- hwcap = ELF_HWCAP;
-
/*
* If this architecture has a platform capability string, copy it
* to userspace. In some cases (Sparc), this info is impossible
@@ -617,7 +614,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
nr = 0;
csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long);
- NEW_AUX_ENT(AT_HWCAP, hwcap);
+ NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
+#ifdef ELF_HWCAP2
+ NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
+#endif
NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr);
@@ -926,7 +926,6 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
struct elf32_fdpic_loadseg *seg;
struct elf32_phdr *phdr;
unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags;
- loff_t fpos;
int loop, ret;
load_addr = params->load_addr;
@@ -964,14 +963,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
if (params->phdrs[loop].p_type != PT_LOAD)
continue;
- fpos = phdr->p_offset;
-
seg->addr = maddr + (phdr->p_vaddr - base);
seg->p_vaddr = phdr->p_vaddr;
seg->p_memsz = phdr->p_memsz;
- ret = file->f_op->read(file, (void *) seg->addr,
- phdr->p_filesz, &fpos);
+ ret = read_code(file, seg->addr, phdr->p_offset,
+ phdr->p_filesz);
if (ret < 0)
return ret;
@@ -1687,8 +1684,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
fill_elf_fdpic_header(elf, e_phnum);
has_dumped = 1;
- current->flags |= PF_DUMPCORE;
-
/*
* Set up the notes in similar form to SVR4 core dumps made
* with info from their /proc.
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 2036d21baaef..d50bbe59da1e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -207,11 +207,12 @@ static int decompress_exec(
/* Read in first chunk of data and parse gzip header. */
fpos = offset;
- ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
+ ret = kernel_read(bprm->file, offset, buf, LBUFSIZE);
strm.next_in = buf;
strm.avail_in = ret;
strm.total_in = 0;
+ fpos += ret;
retval = -ENOEXEC;
@@ -277,7 +278,7 @@ static int decompress_exec(
}
while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) {
- ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
+ ret = kernel_read(bprm->file, fpos, buf, LBUFSIZE);
if (ret <= 0)
break;
len -= ret;
@@ -285,6 +286,7 @@ static int decompress_exec(
strm.next_in = buf;
strm.avail_in = ret;
strm.total_in = 0;
+ fpos += ret;
}
if (ret < 0) {
@@ -428,6 +430,7 @@ static int load_flat_file(struct linux_binprm * bprm,
unsigned long textpos = 0, datapos = 0, result;
unsigned long realdatastart = 0;
unsigned long text_len, data_len, bss_len, stack_len, flags;
+ unsigned long full_data;
unsigned long len, memp = 0;
unsigned long memp_size, extra, rlim;
unsigned long *reloc = 0, *rp;
@@ -451,6 +454,7 @@ static int load_flat_file(struct linux_binprm * bprm,
relocs = ntohl(hdr->reloc_count);
flags = ntohl(hdr->flags);
rev = ntohl(hdr->rev);
+ full_data = data_len + relocs * sizeof(unsigned long);
if (strncmp(hdr->magic, "bFLT", 4)) {
/*
@@ -577,12 +581,12 @@ static int load_flat_file(struct linux_binprm * bprm,
#ifdef CONFIG_BINFMT_ZFLAT
if (flags & FLAT_FLAG_GZDATA) {
result = decompress_exec(bprm, fpos, (char *) datapos,
- data_len + (relocs * sizeof(unsigned long)), 0);
+ full_data, 0);
} else
#endif
{
- result = bprm->file->f_op->read(bprm->file, (char *) datapos,
- data_len + (relocs * sizeof(unsigned long)), &fpos);
+ result = read_code(bprm->file, datapos, fpos,
+ full_data);
}
if (IS_ERR_VALUE(result)) {
printk("Unable to read data+bss, errno %d\n", (int)-result);
@@ -627,30 +631,25 @@ static int load_flat_file(struct linux_binprm * bprm,
if (flags & FLAT_FLAG_GZIP) {
result = decompress_exec(bprm, sizeof (struct flat_hdr),
(((char *) textpos) + sizeof (struct flat_hdr)),
- (text_len + data_len + (relocs * sizeof(unsigned long))
+ (text_len + full_data
- sizeof (struct flat_hdr)),
0);
memmove((void *) datapos, (void *) realdatastart,
- data_len + (relocs * sizeof(unsigned long)));
+ full_data);
} else if (flags & FLAT_FLAG_GZDATA) {
- fpos = 0;
- result = bprm->file->f_op->read(bprm->file,
- (char *) textpos, text_len, &fpos);
+ result = read_code(bprm->file, textpos, 0, text_len);
if (!IS_ERR_VALUE(result))
result = decompress_exec(bprm, text_len, (char *) datapos,
- data_len + (relocs * sizeof(unsigned long)), 0);
+ full_data, 0);
}
else
#endif
{
- fpos = 0;
- result = bprm->file->f_op->read(bprm->file,
- (char *) textpos, text_len, &fpos);
- if (!IS_ERR_VALUE(result)) {
- fpos = ntohl(hdr->data_start);
- result = bprm->file->f_op->read(bprm->file, (char *) datapos,
- data_len + (relocs * sizeof(unsigned long)), &fpos);
- }
+ result = read_code(bprm->file, textpos, 0, text_len);
+ if (!IS_ERR_VALUE(result))
+ result = read_code(bprm->file, datapos,
+ ntohl(hdr->data_start),
+ full_data);
}
if (IS_ERR_VALUE(result)) {
printk("Unable to read code+data+bss, errno %d\n",(int)-result);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 751df5e4f61a..1c740e152f38 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -23,6 +23,7 @@
#include <linux/binfmts.h>
#include <linux/slab.h>
#include <linux/ctype.h>
+#include <linux/string_helpers.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
@@ -234,24 +235,6 @@ static char *scanarg(char *s, char del)
return s;
}
-static int unquote(char *from)
-{
- char c = 0, *s = from, *p = from;
-
- while ((c = *s++) != '\0') {
- if (c == '\\' && *s == 'x') {
- s++;
- c = toupper(*s++);
- *p = (c - (isdigit(c) ? '0' : 'A' - 10)) << 4;
- c = toupper(*s++);
- *p++ |= c - (isdigit(c) ? '0' : 'A' - 10);
- continue;
- }
- *p++ = c;
- }
- return p - from;
-}
-
static char * check_special_flags (char * sfs, Node * e)
{
char * p = sfs;
@@ -354,8 +337,9 @@ static Node *create_entry(const char __user *buffer, size_t count)
p[-1] = '\0';
if (!e->mask[0])
e->mask = NULL;
- e->size = unquote(e->magic);
- if (e->mask && unquote(e->mask) != e->size)
+ e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX);
+ if (e->mask &&
+ string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size)
goto Einval;
if (e->size + e->offset > BINPRM_BUF_SIZE)
goto Einval;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index a3f28f331b2b..8fb42916d8a2 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -27,48 +27,11 @@
#include <linux/workqueue.h>
#include <linux/slab.h>
-struct integrity_slab {
- struct kmem_cache *slab;
- unsigned short nr_vecs;
- char name[8];
-};
-
-#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) }
-struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = {
- IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES),
-};
-#undef IS
+#define BIP_INLINE_VECS 4
+static struct kmem_cache *bip_slab;
static struct workqueue_struct *kintegrityd_wq;
-static inline unsigned int vecs_to_idx(unsigned int nr)
-{
- switch (nr) {
- case 1:
- return 0;
- case 2 ... 4:
- return 1;
- case 5 ... 16:
- return 2;
- case 17 ... 64:
- return 3;
- case 65 ... 128:
- return 4;
- case 129 ... BIO_MAX_PAGES:
- return 5;
- default:
- BUG();
- }
-}
-
-static inline int use_bip_pool(unsigned int idx)
-{
- if (idx == BIOVEC_MAX_IDX)
- return 1;
-
- return 0;
-}
-
/**
* bio_integrity_alloc - Allocate integrity payload and attach it to bio
* @bio: bio to attach integrity metadata to
@@ -84,37 +47,41 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
unsigned int nr_vecs)
{
struct bio_integrity_payload *bip;
- unsigned int idx = vecs_to_idx(nr_vecs);
struct bio_set *bs = bio->bi_pool;
-
- if (!bs)
- bs = fs_bio_set;
-
- BUG_ON(bio == NULL);
- bip = NULL;
-
- /* Lower order allocations come straight from slab */
- if (!use_bip_pool(idx))
- bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask);
-
- /* Use mempool if lower order alloc failed or max vecs were requested */
- if (bip == NULL) {
- idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */
+ unsigned long idx = BIO_POOL_NONE;
+ unsigned inline_vecs;
+
+ if (!bs) {
+ bip = kmalloc(sizeof(struct bio_integrity_payload) +
+ sizeof(struct bio_vec) * nr_vecs, gfp_mask);
+ inline_vecs = nr_vecs;
+ } else {
bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
-
- if (unlikely(bip == NULL)) {
- printk(KERN_ERR "%s: could not alloc bip\n", __func__);
- return NULL;
- }
+ inline_vecs = BIP_INLINE_VECS;
}
+ if (unlikely(!bip))
+ return NULL;
+
memset(bip, 0, sizeof(*bip));
+ if (nr_vecs > inline_vecs) {
+ bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
+ bs->bvec_integrity_pool);
+ if (!bip->bip_vec)
+ goto err;
+ } else {
+ bip->bip_vec = bip->bip_inline_vecs;
+ }
+
bip->bip_slab = idx;
bip->bip_bio = bio;
bio->bi_integrity = bip;
return bip;
+err:
+ mempool_free(bip, bs->bio_integrity_pool);
+ return NULL;
}
EXPORT_SYMBOL(bio_integrity_alloc);
@@ -130,20 +97,18 @@ void bio_integrity_free(struct bio *bio)
struct bio_integrity_payload *bip = bio->bi_integrity;
struct bio_set *bs = bio->bi_pool;
- if (!bs)
- bs = fs_bio_set;
-
- BUG_ON(bip == NULL);
-
- /* A cloned bio doesn't own the integrity metadata */
- if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
- && bip->bip_buf != NULL)
+ if (bip->bip_owns_buf)
kfree(bip->bip_buf);
- if (use_bip_pool(bip->bip_slab))
+ if (bs) {
+ if (bip->bip_slab != BIO_POOL_NONE)
+ bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
+ bip->bip_slab);
+
mempool_free(bip, bs->bio_integrity_pool);
- else
- kmem_cache_free(bip_slab[bip->bip_slab].slab, bip);
+ } else {
+ kfree(bip);
+ }
bio->bi_integrity = NULL;
}
@@ -419,6 +384,7 @@ int bio_integrity_prep(struct bio *bio)
return -EIO;
}
+ bip->bip_owns_buf = 1;
bip->bip_buf = buf;
bip->bip_size = len;
bip->bip_sector = bio->bi_sector;
@@ -694,11 +660,11 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
bp->bio1.bi_integrity = &bp->bip1;
bp->bio2.bi_integrity = &bp->bip2;
- bp->iv1 = bip->bip_vec[0];
- bp->iv2 = bip->bip_vec[0];
+ bp->iv1 = bip->bip_vec[bip->bip_idx];
+ bp->iv2 = bip->bip_vec[bip->bip_idx];
- bp->bip1.bip_vec[0] = bp->iv1;
- bp->bip2.bip_vec[0] = bp->iv2;
+ bp->bip1.bip_vec = &bp->iv1;
+ bp->bip2.bip_vec = &bp->iv2;
bp->iv1.bv_len = sectors * bi->tuple_size;
bp->iv2.bv_offset += sectors * bi->tuple_size;
@@ -746,13 +712,14 @@ EXPORT_SYMBOL(bio_integrity_clone);
int bioset_integrity_create(struct bio_set *bs, int pool_size)
{
- unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
-
if (bs->bio_integrity_pool)
return 0;
- bs->bio_integrity_pool =
- mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
+ bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
+
+ bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
+ if (!bs->bvec_integrity_pool)
+ return -1;
if (!bs->bio_integrity_pool)
return -1;
@@ -765,13 +732,14 @@ void bioset_integrity_free(struct bio_set *bs)
{
if (bs->bio_integrity_pool)
mempool_destroy(bs->bio_integrity_pool);
+
+ /*
+ * The integrity bvec pool is a separate mempool: destroy that pool
+ * here, not bio_integrity_pool a second time.
+ */
+ if (bs->bvec_integrity_pool)
+ mempool_destroy(bs->bvec_integrity_pool);
}
EXPORT_SYMBOL(bioset_integrity_free);
void __init bio_integrity_init(void)
{
- unsigned int i;
-
/*
* kintegrityd won't block much but may burn a lot of CPU cycles.
* Make it highpri CPU intensive wq with max concurrency of 1.
@@ -781,14 +749,10 @@ void __init bio_integrity_init(void)
if (!kintegrityd_wq)
panic("Failed to create kintegrityd\n");
- for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) {
- unsigned int size;
-
- size = sizeof(struct bio_integrity_payload)
- + bip_slab[i].nr_vecs * sizeof(struct bio_vec);
-
- bip_slab[i].slab =
- kmem_cache_create(bip_slab[i].name, size, 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
- }
+ bip_slab = kmem_cache_create("bio_integrity_payload",
+ sizeof(struct bio_integrity_payload) +
+ sizeof(struct bio_vec) * BIP_INLINE_VECS,
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ if (!bip_slab)
+ panic("Failed to create slab\n");
}
diff --git a/fs/bio.c b/fs/bio.c
index b96fc6ce4855..94bbc04dba77 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -19,6 +19,7 @@
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/uio.h>
#include <linux/iocontext.h>
#include <linux/slab.h>
#include <linux/init.h>
@@ -160,12 +161,12 @@ unsigned int bvec_nr_vecs(unsigned short idx)
return bvec_slabs[idx].nr_vecs;
}
-void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
+void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
{
BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
if (idx == BIOVEC_MAX_IDX)
- mempool_free(bv, bs->bvec_pool);
+ mempool_free(bv, pool);
else {
struct biovec_slab *bvs = bvec_slabs + idx;
@@ -173,8 +174,8 @@ void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
}
}
-struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
- struct bio_set *bs)
+struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
+ mempool_t *pool)
{
struct bio_vec *bvl;
@@ -210,7 +211,7 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
*/
if (*idx == BIOVEC_MAX_IDX) {
fallback:
- bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
+ bvl = mempool_alloc(pool, gfp_mask);
} else {
struct biovec_slab *bvs = bvec_slabs + *idx;
gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
@@ -252,8 +253,8 @@ static void bio_free(struct bio *bio)
__bio_free(bio);
if (bs) {
- if (bio_has_allocated_vec(bio))
- bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
+ if (bio_flagged(bio, BIO_OWNS_VEC))
+ bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
/*
* If we have front padding, adjust the bio pointer before freeing
@@ -297,6 +298,54 @@ void bio_reset(struct bio *bio)
}
EXPORT_SYMBOL(bio_reset);
+/*
+ * Rescuer work item of a bio_set: drain bs->rescue_list and resubmit every
+ * bio found on it. The list is only touched under bs->rescue_lock, and the
+ * lock is dropped again before a bio is passed to generic_make_request().
+ */
+static void bio_alloc_rescue(struct work_struct *work)
+{
+	struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
+	struct bio *next;
+
+	for (;;) {
+		spin_lock(&bs->rescue_lock);
+		next = bio_list_pop(&bs->rescue_list);
+		spin_unlock(&bs->rescue_lock);
+
+		/* An empty list means there is nothing left to rescue. */
+		if (!next)
+			return;
+
+		generic_make_request(next);
+	}
+}
+
+/*
+ * Hand the bios on current->bio_list that were allocated from @bs over to the
+ * bio_set's rescue list, then queue the rescuer work item.
+ *
+ * To guarantee forward progress only bios allocated from @bs may be punted:
+ * a bio queued for a stacking driver higher up in the stack could itself
+ * need allocations from @bs while being processed, and doing that from our
+ * own rescuer would be bad.
+ */
+static void punt_bios_to_rescuer(struct bio_set *bs)
+{
+	struct bio_list ours, others;
+	struct bio *bio;
+
+	bio_list_init(&ours);
+	bio_list_init(&others);
+
+	/*
+	 * bio lists are singly linked, so instead of unlinking from the
+	 * middle, pop every entry and sort it onto one of two lists.
+	 */
+	while ((bio = bio_list_pop(current->bio_list))) {
+		if (bio->bi_pool == bs)
+			bio_list_add(&ours, bio);
+		else
+			bio_list_add(&others, bio);
+	}
+
+	/* Bios from other bio_sets stay queued on the current task. */
+	*current->bio_list = others;
+
+	spin_lock(&bs->rescue_lock);
+	bio_list_merge(&bs->rescue_list, &ours);
+	spin_unlock(&bs->rescue_lock);
+
+	queue_work(bs->rescue_workqueue, &bs->rescue_work);
+}
+
/**
* bio_alloc_bioset - allocate a bio for I/O
* @gfp_mask: the GFP_ mask given to the slab allocator
@@ -314,11 +363,27 @@ EXPORT_SYMBOL(bio_reset);
* previously allocated bio for IO before attempting to allocate a new one.
* Failure to do so can cause deadlocks under memory pressure.
*
+ * Note that when running under generic_make_request() (i.e. any block
+ * driver), bios are not submitted until after you return - see the code in
+ * generic_make_request() that converts recursion into iteration, to prevent
+ * stack overflows.
+ *
+ * This would normally mean allocating multiple bios under
+ * generic_make_request() would be susceptible to deadlocks, but we have
+ * deadlock avoidance code that resubmits any blocked bios from a rescuer
+ * thread.
+ *
+ * However, we do not guarantee forward progress for allocations from other
+ * mempools. Doing multiple allocations from the same mempool under
+ * generic_make_request() should be avoided - instead, use bio_set's front_pad
+ * for per bio allocations.
+ *
* RETURNS:
* Pointer to new bio on success, NULL on failure.
*/
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
+ gfp_t saved_gfp = gfp_mask;
unsigned front_pad;
unsigned inline_vecs;
unsigned long idx = BIO_POOL_NONE;
@@ -336,7 +401,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
front_pad = 0;
inline_vecs = nr_iovecs;
} else {
+ /*
+ * generic_make_request() converts recursion to iteration; this
+ * means if we're running beneath it, any bios we allocate and
+ * submit will not be submitted (and thus freed) until after we
+ * return.
+ *
+ * This exposes us to a potential deadlock if we allocate
+ * multiple bios from the same bio_set() while running
+ * underneath generic_make_request(). If we were to allocate
+ * multiple bios (say a stacking block driver that was splitting
+ * bios), we would deadlock if we exhausted the mempool's
+ * reserve.
+ *
+ * We solve this, and guarantee forward progress, with a rescuer
+ * workqueue per bio_set. If we go to allocate and there are
+ * bios on current->bio_list, we first try the allocation
+ * without __GFP_WAIT; if that fails, we punt those bios we
+ * would be blocking to the rescuer workqueue before we retry
+ * with the original gfp_flags.
+ */
+
+ if (current->bio_list && !bio_list_empty(current->bio_list))
+ gfp_mask &= ~__GFP_WAIT;
+
p = mempool_alloc(bs->bio_pool, gfp_mask);
+ if (!p && gfp_mask != saved_gfp) {
+ punt_bios_to_rescuer(bs);
+ gfp_mask = saved_gfp;
+ p = mempool_alloc(bs->bio_pool, gfp_mask);
+ }
+
front_pad = bs->front_pad;
inline_vecs = BIO_INLINE_VECS;
}
@@ -348,9 +443,17 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
bio_init(bio);
if (nr_iovecs > inline_vecs) {
- bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
+ bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+ if (!bvl && gfp_mask != saved_gfp) {
+ punt_bios_to_rescuer(bs);
+ gfp_mask = saved_gfp;
+ bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+ }
+
if (unlikely(!bvl))
goto err_free;
+
+ bio->bi_flags |= 1 << BIO_OWNS_VEC;
} else if (nr_iovecs) {
bvl = bio->bi_inline_vecs;
}
@@ -652,6 +755,181 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
}
EXPORT_SYMBOL(bio_add_page);
+/*
+ * Context shared between submit_bio_wait() and its completion callback
+ * submit_bio_wait_endio(); the callback reaches it through bio->bi_private.
+ */
+struct submit_bio_ret {
+ struct completion event; /* completed by submit_bio_wait_endio() */
+ int error; /* error value passed to submit_bio_wait_endio() */
+};
+
+/*
+ * Completion callback installed by submit_bio_wait(): store @error in the
+ * submit_bio_ret found in bio->bi_private, then signal its completion.
+ */
+static void submit_bio_wait_endio(struct bio *bio, int error)
+{
+	struct submit_bio_ret *waiter = bio->bi_private;
+
+	waiter->error = error;
+	complete(&waiter->event);
+}
+
+/**
+ * submit_bio_wait - submit a bio and block until it has completed
+ * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
+ * @bio: The &struct bio which describes the I/O
+ *
+ * Synchronous wrapper around submit_bio(). REQ_SYNC is OR'ed into @rw, and
+ * @bio's bi_private and bi_end_io are overwritten by this function.
+ *
+ * Returns 0 on success, or the error from bio_endio() on failure.
+ */
+int submit_bio_wait(int rw, struct bio *bio)
+{
+	struct submit_bio_ret wait;
+
+	init_completion(&wait.event);
+
+	/* The end_io callback finds the on-stack context via bi_private. */
+	bio->bi_end_io = submit_bio_wait_endio;
+	bio->bi_private = &wait;
+
+	submit_bio(rw | REQ_SYNC, bio);
+	wait_for_completion(&wait.event);
+
+	return wait.error;
+}
+EXPORT_SYMBOL(submit_bio_wait);
+
+/**
+ * bio_advance - increment/complete a bio by some number of bytes
+ * @bio: bio to advance
+ * @bytes: number of bytes to complete
+ *
+ * Updates bi_sector, bi_size and bi_idx. When @bytes ends in the middle of a
+ * bvec, that bvec's bv_len and bv_offset are adjusted as well, so that @bio
+ * afterwards describes only the remaining, uncompleted part of the io.
+ *
+ * Any integrity payload is advanced first. For bios whose bi_rw matches
+ * BIO_NO_ADVANCE_ITER_MASK only bi_sector and bi_size are updated.
+ */
+void bio_advance(struct bio *bio, unsigned bytes)
+{
+	if (bio_integrity(bio))
+		bio_integrity_advance(bio, bytes);
+
+	bio->bi_sector += bytes >> 9;
+	bio->bi_size -= bytes;
+
+	if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK)
+		return;
+
+	while (bytes) {
+		if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
+			WARN_ONCE(1, "bio idx %d >= vcnt %d\n",
+				  bio->bi_idx, bio->bi_vcnt);
+			return;
+		}
+
+		if (bytes < bio_iovec(bio)->bv_len) {
+			/* Ends inside this bvec: trim its front and stop. */
+			bio_iovec(bio)->bv_len -= bytes;
+			bio_iovec(bio)->bv_offset += bytes;
+			return;
+		}
+
+		/* The whole bvec is consumed; step to the next one. */
+		bytes -= bio_iovec(bio)->bv_len;
+		bio->bi_idx++;
+	}
+}
+EXPORT_SYMBOL(bio_advance);
+
+/**
+ * bio_alloc_pages - allocates a single page for each bvec in a bio
+ * @bio: bio to allocate pages for
+ * @gfp_mask: flags for allocation
+ *
+ * Allocates pages up to @bio->bi_vcnt, storing each one in the bvec's
+ * bv_page.
+ *
+ * Returns 0 on success, -ENOMEM on failure. On failure, the pages allocated
+ * for the preceding bvecs are freed again.
+ */
+int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+{
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_segment_all(bv, bio, i) {
+		bv->bv_page = alloc_page(gfp_mask);
+		if (bv->bv_page)
+			continue;
+
+		/* Unwind: walk back over the bvecs that did get a page. */
+		while (--bv >= bio->bi_io_vec)
+			__free_page(bv->bv_page);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(bio_alloc_pages);
+
+/**
+ * bio_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @dst: destination bio list
+ * @src: source bio list
+ *
+ * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
+ * @src and @dst as linked lists of bios.
+ *
+ * Stops when it reaches the end of either @src or @dst - that is, copies
+ * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+ */
+void bio_copy_data(struct bio *dst, struct bio *src)
+{
+	struct bio_vec *src_bv, *dst_bv;
+	unsigned src_offset, dst_offset, bytes;
+	void *src_p, *dst_p;
+
+	src_bv = bio_iovec(src);
+	dst_bv = bio_iovec(dst);
+
+	/* Running positions inside the current source/destination pages. */
+	src_offset = src_bv->bv_offset;
+	dst_offset = dst_bv->bv_offset;
+
+	while (1) {
+		/* Source bvec used up: go to the next bvec, or the next bio. */
+		if (src_offset == src_bv->bv_offset + src_bv->bv_len) {
+			src_bv++;
+			if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) {
+				src = src->bi_next;
+				if (!src)
+					break;
+
+				src_bv = bio_iovec(src);
+			}
+
+			src_offset = src_bv->bv_offset;
+		}
+
+		/* Same bookkeeping for the destination side. */
+		if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) {
+			dst_bv++;
+			if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) {
+				dst = dst->bi_next;
+				if (!dst)
+					break;
+
+				dst_bv = bio_iovec(dst);
+			}
+
+			dst_offset = dst_bv->bv_offset;
+		}
+
+		/* Copy as much as both current bvecs still have room for. */
+		bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset,
+			    src_bv->bv_offset + src_bv->bv_len - src_offset);
+
+		src_p = kmap_atomic(src_bv->bv_page);
+		dst_p = kmap_atomic(dst_bv->bv_page);
+
+		/*
+		 * Copy from the running offsets, not from bv_offset: when the
+		 * source and destination bvecs differ in length, a bvec is
+		 * consumed over several iterations and each one must continue
+		 * where the previous copy stopped.
+		 */
+		memcpy(dst_p + dst_offset,
+		       src_p + src_offset,
+		       bytes);
+
+		kunmap_atomic(dst_p);
+		kunmap_atomic(src_p);
+
+		src_offset += bytes;
+		dst_offset += bytes;
+	}
+}
+EXPORT_SYMBOL(bio_copy_data);
+
struct bio_map_data {
struct bio_vec *iovecs;
struct sg_iovec *sgvecs;
@@ -714,7 +992,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
int iov_idx = 0;
unsigned int iov_off = 0;
- __bio_for_each_segment(bvec, bio, i, 0) {
+ bio_for_each_segment_all(bvec, bio, i) {
char *bv_addr = page_address(bvec->bv_page);
unsigned int bv_len = iovecs[i].bv_len;
@@ -896,7 +1174,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
return bio;
cleanup:
if (!map_data)
- bio_for_each_segment(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i)
__free_page(bvec->bv_page);
bio_put(bio);
@@ -1110,7 +1388,7 @@ static void __bio_unmap_user(struct bio *bio)
/*
* make sure we dirty pages we wrote to
*/
- __bio_for_each_segment(bvec, bio, i, 0) {
+ bio_for_each_segment_all(bvec, bio, i) {
if (bio_data_dir(bio) == READ)
set_page_dirty_lock(bvec->bv_page);
@@ -1216,7 +1494,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
int i;
char *p = bmd->sgvecs[0].iov_base;
- __bio_for_each_segment(bvec, bio, i, 0) {
+ bio_for_each_segment_all(bvec, bio, i) {
char *addr = page_address(bvec->bv_page);
int len = bmd->iovecs[i].bv_len;
@@ -1256,7 +1534,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
if (!reading) {
void *p = data;
- bio_for_each_segment(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i) {
char *addr = page_address(bvec->bv_page);
memcpy(addr, p, bvec->bv_len);
@@ -1301,11 +1579,11 @@ EXPORT_SYMBOL(bio_copy_kern);
*/
void bio_set_pages_dirty(struct bio *bio)
{
- struct bio_vec *bvec = bio->bi_io_vec;
+ struct bio_vec *bvec;
int i;
- for (i = 0; i < bio->bi_vcnt; i++) {
- struct page *page = bvec[i].bv_page;
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
if (page && !PageCompound(page))
set_page_dirty_lock(page);
@@ -1314,11 +1592,11 @@ void bio_set_pages_dirty(struct bio *bio)
static void bio_release_pages(struct bio *bio)
{
- struct bio_vec *bvec = bio->bi_io_vec;
+ struct bio_vec *bvec;
int i;
- for (i = 0; i < bio->bi_vcnt; i++) {
- struct page *page = bvec[i].bv_page;
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
if (page)
put_page(page);
@@ -1367,16 +1645,16 @@ static void bio_dirty_fn(struct work_struct *work)
void bio_check_pages_dirty(struct bio *bio)
{
- struct bio_vec *bvec = bio->bi_io_vec;
+ struct bio_vec *bvec;
int nr_clean_pages = 0;
int i;
- for (i = 0; i < bio->bi_vcnt; i++) {
- struct page *page = bvec[i].bv_page;
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
if (PageDirty(page) || PageCompound(page)) {
page_cache_release(page);
- bvec[i].bv_page = NULL;
+ bvec->bv_page = NULL;
} else {
nr_clean_pages++;
}
@@ -1477,8 +1755,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
bi->bi_sector + first_sectors);
- BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
- BUG_ON(bi->bi_idx != 0);
+ BUG_ON(bio_segments(bi) > 1);
atomic_set(&bp->cnt, 3);
bp->error = 0;
bp->bio1 = *bi;
@@ -1488,8 +1765,8 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
bp->bio1.bi_size = first_sectors << 9;
if (bi->bi_vcnt != 0) {
- bp->bv1 = bi->bi_io_vec[0];
- bp->bv2 = bi->bi_io_vec[0];
+ bp->bv1 = *bio_iovec(bi);
+ bp->bv2 = *bio_iovec(bi);
if (bio_is_rw(bi)) {
bp->bv2.bv_offset += first_sectors << 9;
@@ -1541,7 +1818,7 @@ sector_t bio_sector_offset(struct bio *bio, unsigned short index,
if (index >= bio->bi_idx)
index = bio->bi_vcnt - 1;
- __bio_for_each_segment(bv, bio, i, 0) {
+ bio_for_each_segment_all(bv, bio, i) {
if (i == index) {
if (offset > bv->bv_offset)
sectors += (offset - bv->bv_offset) / sector_sz;
@@ -1559,29 +1836,25 @@ EXPORT_SYMBOL(bio_sector_offset);
* create memory pools for biovec's in a bio_set.
* use the global biovec slabs created for general use.
*/
-static int biovec_create_pools(struct bio_set *bs, int pool_entries)
+mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries)
{
struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
- bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
- if (!bs->bvec_pool)
- return -ENOMEM;
-
- return 0;
-}
-
-static void biovec_free_pools(struct bio_set *bs)
-{
- mempool_destroy(bs->bvec_pool);
+ return mempool_create_slab_pool(pool_entries, bp->slab);
}
void bioset_free(struct bio_set *bs)
{
+ if (bs->rescue_workqueue)
+ destroy_workqueue(bs->rescue_workqueue);
+
if (bs->bio_pool)
mempool_destroy(bs->bio_pool);
+ if (bs->bvec_pool)
+ mempool_destroy(bs->bvec_pool);
+
bioset_integrity_free(bs);
- biovec_free_pools(bs);
bio_put_slab(bs);
kfree(bs);
@@ -1612,6 +1885,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
bs->front_pad = front_pad;
+ spin_lock_init(&bs->rescue_lock);
+ bio_list_init(&bs->rescue_list);
+ INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
+
bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
if (!bs->bio_slab) {
kfree(bs);
@@ -1622,9 +1899,15 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
if (!bs->bio_pool)
goto bad;
- if (!biovec_create_pools(bs, pool_size))
- return bs;
+ bs->bvec_pool = biovec_create_pool(bs, pool_size);
+ if (!bs->bvec_pool)
+ goto bad;
+
+ bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
+ if (!bs->rescue_workqueue)
+ goto bad;
+ return bs;
bad:
bioset_free(bs);
return NULL;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index aae187a7f94a..bb43ce081d6e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -27,6 +27,7 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -324,31 +325,10 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *bd_inode = file->f_mapping->host;
- loff_t size;
loff_t retval;
mutex_lock(&bd_inode->i_mutex);
- size = i_size_read(bd_inode);
-
- retval = -EINVAL;
- switch (whence) {
- case SEEK_END:
- offset += size;
- break;
- case SEEK_CUR:
- offset += file->f_pos;
- case SEEK_SET:
- break;
- default:
- goto out;
- }
- if (offset >= 0 && offset <= size) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- }
- retval = offset;
- }
-out:
+ retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
mutex_unlock(&bd_inode->i_mutex);
return retval;
}
@@ -617,11 +597,9 @@ void bd_forget(struct inode *inode)
struct block_device *bdev = NULL;
spin_lock(&bdev_lock);
- if (inode->i_bdev) {
- if (!sb_is_blkdev_sb(inode->i_sb))
- bdev = inode->i_bdev;
- __bd_forget(inode);
- }
+ if (!sb_is_blkdev_sb(inode->i_sb))
+ bdev = inode->i_bdev;
+ __bd_forget(inode);
spin_unlock(&bdev_lock);
if (bdev)
@@ -1047,7 +1025,7 @@ void bd_set_size(struct block_device *bdev, loff_t size)
}
EXPORT_SYMBOL(bd_set_size);
-static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
+static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
/*
* bd_mutex locking:
@@ -1402,9 +1380,8 @@ static int blkdev_open(struct inode * inode, struct file * filp)
return blkdev_get(bdev, filp->f_mode, filp);
}
-static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
+static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
{
- int ret = 0;
struct gendisk *disk = bdev->bd_disk;
struct block_device *victim = NULL;
@@ -1424,7 +1401,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
- ret = disk->fops->release(disk, mode);
+ disk->fops->release(disk, mode);
}
if (!bdev->bd_openers) {
struct module *owner = disk->fops->owner;
@@ -1443,10 +1420,9 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
bdput(bdev);
if (victim)
__blkdev_put(victim, mode, 1);
- return ret;
}
-int blkdev_put(struct block_device *bdev, fmode_t mode)
+void blkdev_put(struct block_device *bdev, fmode_t mode)
{
mutex_lock(&bdev->bd_mutex);
@@ -1490,15 +1466,15 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
mutex_unlock(&bdev->bd_mutex);
- return __blkdev_put(bdev, mode, 0);
+ __blkdev_put(bdev, mode, 0);
}
EXPORT_SYMBOL(blkdev_put);
static int blkdev_close(struct inode * inode, struct file * filp)
{
struct block_device *bdev = I_BDEV(filp->f_mapping->host);
-
- return blkdev_put(bdev, filp->f_mode);
+ blkdev_put(bdev, filp->f_mode);
+ return 0;
}
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
@@ -1559,7 +1535,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
return 0;
size -= pos;
- if (size < INT_MAX)
+ if (size < iocb->ki_left)
nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
return generic_file_aio_read(iocb, iov, nr_segs, pos);
}
@@ -1586,6 +1562,7 @@ static const struct address_space_operations def_blk_aops = {
.writepages = generic_writepages,
.releasepage = blkdev_releasepage,
.direct_IO = blkdev_direct_IO,
+ .is_dirty_writeback = buffer_check_dirty_writeback,
};
const struct file_operations def_blk_fops = {
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 9a8622a5b867..2b3b83296977 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,5 +1,5 @@
config BTRFS_FS
- tristate "Btrfs filesystem Unstable disk format"
+ tristate "Btrfs filesystem support"
select LIBCRC32C
select ZLIB_INFLATE
select ZLIB_DEFLATE
@@ -52,3 +52,23 @@ config BTRFS_FS_CHECK_INTEGRITY
In most cases, unless you are a btrfs developer who needs
to verify the integrity of (super)-block write requests
during the run of a regression test, say N
+
+config BTRFS_FS_RUN_SANITY_TESTS
+ bool "Btrfs will run sanity tests upon loading"
+ depends on BTRFS_FS
+ help
+ This will run some basic sanity tests on the free space cache
+ code to make sure it is acting as it should. These are mostly
+ regression tests and are only really interesting to btrfs developers.
+
+ If unsure, say N.
+
+config BTRFS_DEBUG
+ bool "Btrfs debugging support"
+ depends on BTRFS_FS
+ help
+ Enable run-time debugging support for the btrfs filesystem. This may
+ enable additional and expensive checks with negative impact on
+ performance, or export extra information via sysfs.
+
+ If unsure, say N.
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index bd605c87adfd..eaf133384a8f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -255,13 +255,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
* to a logical address
*/
static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
- int search_commit_root,
- u64 time_seq,
- struct __prelim_ref *ref,
- struct ulist *parents,
- const u64 *extent_item_pos)
+ struct btrfs_path *path, u64 time_seq,
+ struct __prelim_ref *ref,
+ struct ulist *parents,
+ const u64 *extent_item_pos)
{
- struct btrfs_path *path;
struct btrfs_root *root;
struct btrfs_key root_key;
struct extent_buffer *eb;
@@ -269,11 +267,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
int root_level;
int level = ref->level;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->search_commit_root = !!search_commit_root;
-
root_key.objectid = ref->root_id;
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = (u64)-1;
@@ -314,7 +307,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
time_seq, ref->wanted_disk_byte,
extent_item_pos);
out:
- btrfs_free_path(path);
+ path->lowest_level = 0;
+ btrfs_release_path(path);
return ret;
}
@@ -322,7 +316,7 @@ out:
* resolve all indirect backrefs from the list
*/
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
- int search_commit_root, u64 time_seq,
+ struct btrfs_path *path, u64 time_seq,
struct list_head *head,
const u64 *extent_item_pos)
{
@@ -349,9 +343,10 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
continue;
if (ref->count == 0)
continue;
- err = __resolve_indirect_ref(fs_info, search_commit_root,
- time_seq, ref, parents,
- extent_item_pos);
+ err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
+ parents, extent_item_pos);
+ if (err == -ENOMEM)
+ goto out;
if (err)
continue;
@@ -367,7 +362,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
if (!new_ref) {
ret = -ENOMEM;
- break;
+ goto out;
}
memcpy(new_ref, ref, sizeof(*ref));
new_ref->parent = node->val;
@@ -377,7 +372,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
}
ulist_reinit(parents);
}
-
+out:
ulist_free(parents);
return ret;
}
@@ -421,7 +416,10 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
BUG_ON(!ref->wanted_disk_byte);
eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
fs_info->tree_root->leafsize, 0);
- BUG_ON(!eb);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ return -EIO;
+ }
btrfs_tree_read_lock(eb);
if (btrfs_header_level(eb) == 0)
btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
@@ -443,7 +441,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
* having a parent).
* mode = 2: merge identical parents
*/
-static int __merge_refs(struct list_head *head, int mode)
+static void __merge_refs(struct list_head *head, int mode)
{
struct list_head *pos1;
@@ -489,7 +487,6 @@ static int __merge_refs(struct list_head *head, int mode)
}
}
- return 0;
}
/*
@@ -582,7 +579,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
default:
WARN_ON(1);
}
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
return 0;
@@ -599,6 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
int slot;
struct extent_buffer *leaf;
struct btrfs_key key;
+ struct btrfs_key found_key;
unsigned long ptr;
unsigned long end;
struct btrfs_extent_item *ei;
@@ -616,17 +615,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
ptr = (unsigned long)(ei + 1);
end = (unsigned long)ei + item_size;
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
struct btrfs_tree_block_info *info;
info = (struct btrfs_tree_block_info *)ptr;
*info_level = btrfs_tree_block_level(leaf, info);
ptr += sizeof(struct btrfs_tree_block_info);
BUG_ON(ptr > end);
+ } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+ *info_level = found_key.offset;
} else {
BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
}
@@ -680,7 +683,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
default:
WARN_ON(1);
}
- BUG_ON(ret);
+ if (ret)
+ return ret;
ptr += btrfs_extent_inline_ref_size(type);
}
@@ -762,7 +766,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
default:
WARN_ON(1);
}
- BUG_ON(ret);
+ if (ret)
+ return ret;
+
}
return ret;
@@ -787,7 +793,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int info_level = 0;
int ret;
- int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
struct list_head prefs_delayed;
struct list_head prefs;
struct __prelim_ref *ref;
@@ -796,13 +801,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&prefs_delayed);
key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = (u64)-1;
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->search_commit_root = !!search_commit_root;
+ if (!trans)
+ path->search_commit_root = 1;
/*
* grab both a lock on the path and a lock on the delayed ref head.
@@ -817,7 +826,7 @@ again:
goto out;
BUG_ON(ret == 0);
- if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) {
+ if (trans) {
/*
* look if there are updates for this ref queued and lock the
* head
@@ -861,7 +870,8 @@ again:
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid == bytenr &&
- key.type == BTRFS_EXTENT_ITEM_KEY) {
+ (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = __add_inline_refs(fs_info, path, bytenr,
&info_level, &prefs);
if (ret)
@@ -880,18 +890,14 @@ again:
if (ret)
goto out;
- ret = __merge_refs(&prefs, 1);
- if (ret)
- goto out;
+ __merge_refs(&prefs, 1);
- ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
- &prefs, extent_item_pos);
+ ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+ extent_item_pos);
if (ret)
goto out;
- ret = __merge_refs(&prefs, 2);
- if (ret)
- goto out;
+ __merge_refs(&prefs, 2);
while (!list_empty(&prefs)) {
ref = list_first_entry(&prefs, struct __prelim_ref, list);
@@ -900,7 +906,8 @@ again:
if (ref->count && ref->root_id && ref->parent == 0) {
/* no parent == root of tree */
ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
- BUG_ON(ret < 0);
+ if (ret < 0)
+ goto out;
}
if (ref->count && ref->parent) {
struct extent_inode_elem *eie = NULL;
@@ -911,7 +918,11 @@ again:
info_level);
eb = read_tree_block(fs_info->extent_root,
ref->parent, bsz, 0);
- BUG_ON(!eb);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, &eie);
ref->inode_list = eie;
@@ -920,6 +931,8 @@ again:
ret = ulist_add_merge(refs, ref->parent,
(uintptr_t)ref->inode_list,
(u64 *)&eie, GFP_NOFS);
+ if (ret < 0)
+ goto out;
if (!ret && extent_item_pos) {
/*
* we've recorded that parent, so we must extend
@@ -930,7 +943,6 @@ again:
eie = eie->next;
eie->next = ref->inode_list;
}
- BUG_ON(ret < 0);
}
kfree(ref);
}
@@ -1180,6 +1192,20 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
return ret;
}
+/*
+ * this iterates to turn a name (from iref/extref) into a full filesystem path.
+ * Elements of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
struct extent_buffer *eb_in, u64 parent,
@@ -1249,32 +1275,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
}
/*
- * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
- * of the path are separated by '/' and the path is guaranteed to be
- * 0-terminated. the path is only given within the current file system.
- * Therefore, it never starts with a '/'. the caller is responsible to provide
- * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
- * the start point of the resulting string is returned. this pointer is within
- * dest, normally.
- * in case the path buffer would overflow, the pointer is decremented further
- * as if output was written to the buffer, though no more output is actually
- * generated. that way, the caller can determine how much space would be
- * required for the path to fit into the buffer. in that case, the returned
- * value will be smaller than dest. callers must check this!
- */
-char *btrfs_iref_to_path(struct btrfs_root *fs_root,
- struct btrfs_path *path,
- struct btrfs_inode_ref *iref,
- struct extent_buffer *eb_in, u64 parent,
- char *dest, u32 size)
-{
- return btrfs_ref_to_path(fs_root, path,
- btrfs_inode_ref_name_len(eb_in, iref),
- (unsigned long)(iref + 1),
- eb_in, parent, dest, size);
-}
-
-/*
* this makes the path point to (logical EXTENT_ITEM *)
* returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
* tree blocks and <0 on error.
@@ -1285,12 +1285,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
{
int ret;
u64 flags;
+ u64 size = 0;
u32 item_size;
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct btrfs_key key;
- key.type = BTRFS_EXTENT_ITEM_KEY;
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
key.objectid = logical;
key.offset = (u64)-1;
@@ -1303,9 +1307,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
return ret;
btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
- if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
+ if (found_key->type == BTRFS_METADATA_ITEM_KEY)
+ size = fs_info->extent_root->leafsize;
+ else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
+ size = found_key->offset;
+
+ if ((found_key->type != BTRFS_EXTENT_ITEM_KEY &&
+ found_key->type != BTRFS_METADATA_ITEM_KEY) ||
found_key->objectid > logical ||
- found_key->objectid + found_key->offset <= logical) {
+ found_key->objectid + size <= logical) {
pr_debug("logical %llu is not within any extent\n",
(unsigned long long)logical);
return -ENOENT;
@@ -1461,9 +1471,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
iterate_extent_inodes_t *iterate, void *ctx)
{
int ret;
- struct list_head data_refs = LIST_HEAD_INIT(data_refs);
- struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct ulist *refs = NULL;
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
@@ -1475,9 +1483,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
pr_debug("resolving all inodes for extent %llu\n",
extent_item_objectid);
- if (search_commit_root) {
- trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
- } else {
+ if (!search_commit_root) {
trans = btrfs_join_transaction(fs_info->extent_root);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -1508,11 +1514,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
iterate, ctx);
}
ulist_free(roots);
- roots = NULL;
}
free_leaf_list(refs);
- ulist_free(roots);
out:
if (!search_commit_root) {
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 310a7f6d09b1..8f2e76702932 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -23,8 +23,6 @@
#include "ulist.h"
#include "extent_io.h"
-#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
-
struct inode_fs_paths {
struct btrfs_path *btrfs_path;
struct btrfs_root *fs_root;
@@ -59,9 +57,6 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **roots);
-char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
- struct btrfs_inode_ref *iref, struct extent_buffer *eb,
- u64 parent, char *dest, u32 size);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
struct extent_buffer *eb_in, u64 parent,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9b97d4960e6..08b286b2a2c5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -93,7 +93,7 @@ struct btrfs_inode {
unsigned long runtime_flags;
- /* Keep track of who's O_SYNC/fsycing currently */
+ /* Keep track of who's O_SYNC/fsyncing currently */
atomic_t sync_writers;
/* full 64 bit generation number, struct vfs_inode doesn't have a big
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 18af6f48781a..1431a6965017 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1700,7 +1700,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
unsigned int j;
DECLARE_COMPLETION_ONSTACK(complete);
- bio = bio_alloc(GFP_NOFS, num_pages - i);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
if (!bio) {
printk(KERN_INFO
"btrfsic: bio_alloc() for %u pages failed!\n",
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 15b94089abc4..b189bd1e7a3e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -82,6 +82,10 @@ struct compressed_bio {
u32 sums;
};
+static int btrfs_decompress_biovec(int type, struct page **pages_in,
+ u64 disk_start, struct bio_vec *bvec,
+ int vcnt, size_t srclen);
+
static inline int compressed_bio_size(struct btrfs_root *root,
unsigned long disk_size)
{
@@ -106,7 +110,6 @@ static int check_compressed_csum(struct inode *inode,
u64 disk_start)
{
int ret;
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct page *page;
unsigned long i;
char *kaddr;
@@ -121,7 +124,7 @@ static int check_compressed_csum(struct inode *inode,
csum = ~(u32)0;
kaddr = kmap_atomic(page);
- csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
+ csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE);
btrfs_csum_final(csum, (char *)&csum);
kunmap_atomic(kaddr);
@@ -739,7 +742,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
-struct btrfs_compress_op *btrfs_compress_op[] = {
+static struct btrfs_compress_op *btrfs_compress_op[] = {
&btrfs_zlib_compress,
&btrfs_lzo_compress,
};
@@ -910,8 +913,9 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
* be contiguous. They all correspond to the range of bytes covered by
* the compressed extent.
*/
-int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
- struct bio_vec *bvec, int vcnt, size_t srclen)
+static int btrfs_decompress_biovec(int type, struct page **pages_in,
+ u64 disk_start, struct bio_vec *bvec,
+ int vcnt, size_t srclen)
{
struct list_head *workspace;
int ret;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 9afb0a62ae82..0c803b4fbf93 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -30,8 +30,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
unsigned long *total_in,
unsigned long *total_out,
unsigned long max_out);
-int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
- struct bio_vec *bvec, int vcnt, size_t srclen);
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ca9d8f1a3bb6..5bf4c39e2ad6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -37,16 +37,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
-static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot);
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+ int level, int slot);
static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb);
-struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
- u32 blocksize, u64 parent_transid,
- u64 time_seq);
-struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize,
- u64 time_seq);
+static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
struct btrfs_path *btrfs_alloc_path(void)
{
@@ -208,7 +203,7 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
* tree until you end up with a lock on the root. A locked buffer
* is returned, with a reference held.
*/
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
@@ -361,6 +356,44 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
}
/*
+ * Increment the upper half of tree_mod_seq, set lower half zero.
+ *
+ * Must be called with fs_info->tree_mod_seq_lock held.
+ */
+static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
+{
+ u64 seq = atomic64_read(&fs_info->tree_mod_seq);
+ seq &= 0xffffffff00000000ull;
+ seq += 1ull << 32;
+ atomic64_set(&fs_info->tree_mod_seq, seq);
+ return seq;
+}
+
+/*
+ * Increment the lower half of tree_mod_seq.
+ *
+ * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
+ * are generated should not technically require a spin lock here. (Rationale:
+ * if the minor is incremented while a major increment is between its
+ * atomic64_read and atomic64_set calls, no sequence number is duplicated; the
+ * minor increment just returns a unique sequence number as usual.) We have
+ * decided to leave that requirement in here and rethink it once we notice it
+ * really imposes a problem on some workload.
+ */
+static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
+{
+ return atomic64_inc_return(&fs_info->tree_mod_seq);
+}
+
+/*
+ * return the last minor in the previous major tree_mod_seq number
+ */
+u64 btrfs_tree_mod_seq_prev(u64 seq)
+{
+ return (seq & 0xffffffff00000000ull) - 1ull;
+}
+
+/*
* This adds a new blocker to the tree mod log's blocker list if the @elem
* passed does not already have a sequence number set. So when a caller expects
* to record tree modifications, it should ensure to set elem->seq to zero
@@ -376,10 +409,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
tree_mod_log_write_lock(fs_info);
spin_lock(&fs_info->tree_mod_seq_lock);
if (!elem->seq) {
- elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+ elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
}
- seq = btrfs_inc_tree_mod_seq(fs_info);
+ seq = btrfs_inc_tree_mod_seq_minor(fs_info);
spin_unlock(&fs_info->tree_mod_seq_lock);
tree_mod_log_write_unlock(fs_info);
@@ -524,7 +557,10 @@ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
if (!tm)
return -ENOMEM;
- tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+
return tm->seq;
}
@@ -643,7 +679,8 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
struct extent_buffer *old_root,
- struct extent_buffer *new_root, gfp_t flags)
+ struct extent_buffer *new_root, gfp_t flags,
+ int log_removal)
{
struct tree_mod_elem *tm;
int ret;
@@ -651,7 +688,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
if (tree_mod_dont_log(fs_info, NULL))
return 0;
- __tree_mod_log_free_eb(fs_info, old_root);
+ if (log_removal)
+ __tree_mod_log_free_eb(fs_info, old_root);
ret = tree_mod_alloc(fs_info, flags, &tm);
if (ret < 0)
@@ -738,7 +776,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
static noinline void
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
struct extent_buffer *src, unsigned long dst_offset,
- unsigned long src_offset, int nr_items, int log_removal)
+ unsigned long src_offset, int nr_items)
{
int ret;
int i;
@@ -752,12 +790,10 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
}
for (i = 0; i < nr_items; i++) {
- if (log_removal) {
- ret = tree_mod_log_insert_key_locked(fs_info, src,
- i + src_offset,
- MOD_LOG_KEY_REMOVE);
- BUG_ON(ret < 0);
- }
+ ret = tree_mod_log_insert_key_locked(fs_info, src,
+ i + src_offset,
+ MOD_LOG_KEY_REMOVE);
+ BUG_ON(ret < 0);
ret = tree_mod_log_insert_key_locked(fs_info, dst,
i + dst_offset,
MOD_LOG_KEY_ADD);
@@ -802,11 +838,12 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
- struct extent_buffer *new_root_node)
+ struct extent_buffer *new_root_node,
+ int log_removal)
{
int ret;
ret = tree_mod_log_insert_root(root->fs_info, root->node,
- new_root_node, GFP_NOFS);
+ new_root_node, GFP_NOFS, log_removal);
BUG_ON(ret < 0);
}
@@ -867,7 +904,8 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (btrfs_block_can_be_shared(root, buf)) {
ret = btrfs_lookup_extent_info(trans, root, buf->start,
- buf->len, &refs, &flags);
+ btrfs_header_level(buf), 1,
+ &refs, &flags);
if (ret)
return ret;
if (refs == 0) {
@@ -913,10 +951,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
}
if (new_flags != 0) {
+ int level = btrfs_header_level(buf);
+
ret = btrfs_set_disk_extent_flags(trans, root,
buf->start,
buf->len,
- new_flags, 0);
+ new_flags, level, 0);
if (ret)
return ret;
}
@@ -1028,7 +1068,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = 0;
extent_buffer_get(cow);
- tree_mod_log_set_root_pointer(root, cow);
+ tree_mod_log_set_root_pointer(root, cow, 1);
rcu_assign_pointer(root->node, cow);
btrfs_free_tree_block(trans, root, buf, parent_start,
@@ -1049,7 +1089,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
- tree_mod_log_free_eb(root->fs_info, buf);
+ if (last_ref)
+ tree_mod_log_free_eb(root->fs_info, buf);
btrfs_free_tree_block(trans, root, buf, parent_start,
last_ref);
}
@@ -1067,11 +1108,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
*/
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
- struct btrfs_root *root, u64 time_seq)
+ struct extent_buffer *eb_root, u64 time_seq)
{
struct tree_mod_elem *tm;
struct tree_mod_elem *found = NULL;
- u64 root_logical = root->node->start;
+ u64 root_logical = eb_root->start;
int looped = 0;
if (!time_seq)
@@ -1105,7 +1146,6 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
found = tm;
root_logical = tm->old_root.logical;
- BUG_ON(root_logical == root->node->start);
looped = 1;
}
@@ -1122,8 +1162,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
* time_seq).
*/
static void
-__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
- struct tree_mod_elem *first_tm)
+__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+ u64 time_seq, struct tree_mod_elem *first_tm)
{
u32 n;
struct rb_node *next;
@@ -1133,6 +1173,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
unsigned long p_size = sizeof(struct btrfs_key_ptr);
n = btrfs_header_nritems(eb);
+ tree_mod_log_read_lock(fs_info);
while (tm && tm->seq >= time_seq) {
/*
* all the operations are recorded with the operator used for
@@ -1187,9 +1228,17 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
if (tm->index != first_tm->index)
break;
}
+ tree_mod_log_read_unlock(fs_info);
btrfs_set_header_nritems(eb, n);
}
+/*
+ * Called with eb read locked. If the buffer cannot be rewound, the same buffer
+ * is returned. If rewind operations happen, a fresh buffer is returned. The
+ * returned buffer is always read-locked. If the returned buffer is not the
+ * input buffer, the lock on the input buffer is released and the input buffer
+ * is freed (its refcount is decremented).
+ */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
u64 time_seq)
@@ -1223,9 +1272,12 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
}
extent_buffer_get(eb_rewin);
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
- __tree_mod_log_rewind(eb_rewin, time_seq, tm);
+ extent_buffer_get(eb_rewin);
+ btrfs_tree_read_lock(eb_rewin);
+ __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
@@ -1243,33 +1295,35 @@ static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
struct tree_mod_elem *tm;
- struct extent_buffer *eb;
+ struct extent_buffer *eb = NULL;
+ struct extent_buffer *eb_root;
struct extent_buffer *old;
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
u64 logical;
u32 blocksize;
- eb = btrfs_read_lock_root_node(root);
- tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+ eb_root = btrfs_read_lock_root_node(root);
+ tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
if (!tm)
- return root->node;
+ return eb_root;
if (tm->op == MOD_LOG_ROOT_REPLACE) {
old_root = &tm->old_root;
old_generation = tm->generation;
logical = old_root->logical;
} else {
- logical = root->node->start;
+ logical = eb_root->start;
}
tm = tree_mod_log_search(root->fs_info, logical, time_seq);
if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
- btrfs_tree_read_unlock(root->node);
- free_extent_buffer(root->node);
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
blocksize = btrfs_level_size(root, old_root->level);
old = read_tree_block(root, logical, blocksize, 0);
- if (!old) {
+ if (!old || !extent_buffer_uptodate(old)) {
+ free_extent_buffer(old);
pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
logical);
WARN_ON(1);
@@ -1278,13 +1332,13 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(old);
}
} else if (old_root) {
- btrfs_tree_read_unlock(root->node);
- free_extent_buffer(root->node);
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(logical, root->nodesize);
} else {
- eb = btrfs_clone_extent_buffer(root->node);
- btrfs_tree_read_unlock(root->node);
- free_extent_buffer(root->node);
+ eb = btrfs_clone_extent_buffer(eb_root);
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
}
if (!eb)
@@ -1294,12 +1348,12 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
- btrfs_set_header_owner(eb, root->root_key.objectid);
+ btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
btrfs_set_header_level(eb, old_root->level);
btrfs_set_header_generation(eb, old_generation);
}
if (tm)
- __tree_mod_log_rewind(eb, time_seq, tm);
+ __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
else
WARN_ON(btrfs_header_level(eb) != 0);
WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
@@ -1311,15 +1365,15 @@ int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
struct tree_mod_elem *tm;
int level;
+ struct extent_buffer *eb_root = btrfs_root_node(root);
- tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+ tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
level = tm->old_root.level;
} else {
- rcu_read_lock();
- level = btrfs_header_level(root->node);
- rcu_read_unlock();
+ level = btrfs_header_level(eb_root);
}
+ free_extent_buffer(eb_root);
return level;
}
@@ -1514,8 +1568,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (!cur) {
cur = read_tree_block(root, blocknr,
blocksize, gen);
- if (!cur)
+ if (!cur || !extent_buffer_uptodate(cur)) {
+ free_extent_buffer(cur);
return -EIO;
+ }
} else if (!uptodate) {
err = btrfs_read_buffer(cur, gen);
if (err) {
@@ -1680,6 +1736,8 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
struct extent_buffer *parent, int slot)
{
int level = btrfs_header_level(parent);
+ struct extent_buffer *eb;
+
if (slot < 0)
return NULL;
if (slot >= btrfs_header_nritems(parent))
@@ -1687,9 +1745,15 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
BUG_ON(level == 0);
- return read_tree_block(root, btrfs_node_blockptr(parent, slot),
- btrfs_level_size(root, level - 1),
- btrfs_node_ptr_generation(parent, slot));
+ eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
+ btrfs_level_size(root, level - 1),
+ btrfs_node_ptr_generation(parent, slot));
+ if (eb && !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ eb = NULL;
+ }
+
+ return eb;
}
/*
@@ -1754,7 +1818,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
- tree_mod_log_set_root_pointer(root, child);
+ tree_mod_log_set_root_pointer(root, child, 1);
rcu_assign_pointer(root->node, child);
add_root_to_dirty_list(root);
@@ -1818,7 +1882,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(right) == 0) {
clean_tree_block(trans, root, right);
btrfs_tree_unlock(right);
- del_ptr(trans, root, path, level + 1, pslot + 1);
+ del_ptr(root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
btrfs_free_tree_block(trans, root, right, 0, 1);
free_extent_buffer_stale(right);
@@ -1862,7 +1926,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(mid) == 0) {
clean_tree_block(trans, root, mid);
btrfs_tree_unlock(mid);
- del_ptr(trans, root, path, level + 1, pslot);
+ del_ptr(root, path, level + 1, pslot);
root_sub_used(root, mid->len);
btrfs_free_tree_block(trans, root, mid, 0, 1);
free_extent_buffer_stale(mid);
@@ -2117,12 +2181,8 @@ static void reada_for_search(struct btrfs_root *root,
}
}
-/*
- * returns -EAGAIN if it had to drop the path, or zero if everything was in
- * cache
- */
-static noinline int reada_for_balance(struct btrfs_root *root,
- struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_root *root,
+ struct btrfs_path *path, int level)
{
int slot;
int nritems;
@@ -2131,12 +2191,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
u64 gen;
u64 block1 = 0;
u64 block2 = 0;
- int ret = 0;
int blocksize;
parent = path->nodes[level + 1];
if (!parent)
- return 0;
+ return;
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
@@ -2163,28 +2222,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
block2 = 0;
free_extent_buffer(eb);
}
- if (block1 || block2) {
- ret = -EAGAIN;
-
- /* release the whole path */
- btrfs_release_path(path);
-
- /* read the blocks */
- if (block1)
- readahead_tree_block(root, block1, blocksize, 0);
- if (block2)
- readahead_tree_block(root, block2, blocksize, 0);
- if (block1) {
- eb = read_tree_block(root, block1, blocksize, 0);
- free_extent_buffer(eb);
- }
- if (block2) {
- eb = read_tree_block(root, block2, blocksize, 0);
- free_extent_buffer(eb);
- }
- }
- return ret;
+ if (block1)
+ readahead_tree_block(root, block1, blocksize, 0);
+ if (block2)
+ readahead_tree_block(root, block2, blocksize, 0);
}
@@ -2210,9 +2252,6 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
int no_skips = 0;
struct extent_buffer *t;
- if (path->really_keep_locks)
- return;
-
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
if (!path->nodes[i])
break;
@@ -2260,7 +2299,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
int i;
- if (path->keep_locks || path->really_keep_locks)
+ if (path->keep_locks)
return;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2301,35 +2340,28 @@ read_block_for_search(struct btrfs_trans_handle *trans,
tmp = btrfs_find_tree_block(root, blocknr, blocksize);
if (tmp) {
/* first we do an atomic uptodate check */
- if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
- if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
- /*
- * we found an up to date block without
- * sleeping, return
- * right away
- */
- *eb_ret = tmp;
- return 0;
- }
- /* the pages were up to date, but we failed
- * the generation number check. Do a full
- * read for the generation number that is correct.
- * We must do this without dropping locks so
- * we can trust our generation number
- */
- free_extent_buffer(tmp);
- btrfs_set_path_blocking(p);
+ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+ *eb_ret = tmp;
+ return 0;
+ }
- /* now we're allowed to do a blocking uptodate check */
- tmp = read_tree_block(root, blocknr, blocksize, gen);
- if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
- *eb_ret = tmp;
- return 0;
- }
- free_extent_buffer(tmp);
- btrfs_release_path(p);
- return -EIO;
+ /* the pages were up to date, but we failed
+ * the generation number check. Do a full
+ * read for the generation number that is correct.
+ * We must do this without dropping locks so
+ * we can trust our generation number
+ */
+ btrfs_set_path_blocking(p);
+
+ /* now we're allowed to do a blocking uptodate check */
+ ret = btrfs_read_buffer(tmp, gen);
+ if (!ret) {
+ *eb_ret = tmp;
+ return 0;
}
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EIO;
}
/*
@@ -2390,11 +2422,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
goto again;
}
- sret = reada_for_balance(root, p, level);
- if (sret)
- goto again;
-
btrfs_set_path_blocking(p);
+ reada_for_balance(root, p, level);
sret = split_node(trans, root, p, level);
btrfs_clear_path_blocking(p, NULL, 0);
@@ -2414,11 +2443,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
goto again;
}
- sret = reada_for_balance(root, p, level);
- if (sret)
- goto again;
-
btrfs_set_path_blocking(p);
+ reada_for_balance(root, p, level);
sret = balance_level(trans, root, p, level);
btrfs_clear_path_blocking(p, NULL, 0);
@@ -2493,7 +2519,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
if (!cow)
write_lock_level = -1;
- if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
+ if (cow && (p->keep_locks || p->lowest_level))
write_lock_level = BTRFS_MAX_LEVEL;
min_write_lock_level = write_lock_level;
@@ -2795,15 +2821,9 @@ again:
btrfs_clear_path_blocking(p, b,
BTRFS_READ_LOCK);
}
+ b = tree_mod_log_rewind(root->fs_info, b, time_seq);
p->locks[level] = BTRFS_READ_LOCK;
p->nodes[level] = b;
- b = tree_mod_log_rewind(root->fs_info, b, time_seq);
- if (b != p->nodes[level]) {
- btrfs_tree_unlock_rw(p->nodes[level],
- p->locks[level]);
- p->locks[level] = 0;
- p->nodes[level] = b;
- }
} else {
p->slots[level] = slot;
unlock_up(p, level, lowest_unlock, 0, NULL);
@@ -2902,8 +2922,7 @@ again:
* higher levels
*
*/
-static void fixup_low_keys(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
+static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_disk_key *key, int level)
{
int i;
@@ -2928,8 +2947,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
* This function isn't completely safe. It's the caller's responsibility
* that the new key won't break the order
*/
-void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *new_key)
{
struct btrfs_disk_key disk_key;
@@ -2951,7 +2969,7 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
btrfs_set_item_key(eb, &disk_key, slot);
btrfs_mark_buffer_dirty(eb);
if (slot == 0)
- fixup_low_keys(trans, root, path, &disk_key, 1);
+ fixup_low_keys(root, path, &disk_key, 1);
}
/*
@@ -2998,7 +3016,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
push_items = min(src_nritems - 8, push_items);
tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
- push_items, 1);
+ push_items);
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(dst_nritems),
btrfs_node_key_ptr_offset(0),
@@ -3069,7 +3087,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
sizeof(struct btrfs_key_ptr));
tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
- src_nritems - push_items, push_items, 1);
+ src_nritems - push_items, push_items);
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -3144,7 +3162,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
old = root->node;
- tree_mod_log_set_root_pointer(root, c);
+ tree_mod_log_set_root_pointer(root, c, 0);
rcu_assign_pointer(root->node, c);
/* the super has an extra ref to root->node */
@@ -3221,18 +3239,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
int mid;
int ret;
u32 c_nritems;
- int tree_mod_log_removal = 1;
c = path->nodes[level];
WARN_ON(btrfs_header_generation(c) != trans->transid);
if (c == root->node) {
- /* trying to split the root, lets make a new one */
- ret = insert_new_root(trans, root, path, level + 1);
/*
- * removal of root nodes has been logged by
- * tree_mod_log_set_root_pointer due to locking
+ * trying to split the root, lets make a new one
+ *
+ * tree mod log: We don't log removal of the old root in
+ * insert_new_root, because that root buffer will be kept as a
+ * normal node. We are going to log removal of half of the
+ * elements below with tree_mod_log_eb_copy. We're holding a
+ * tree lock on the buffer, which is why we cannot race with
+ * other tree_mod_log users.
*/
- tree_mod_log_removal = 0;
+ ret = insert_new_root(trans, root, path, level + 1);
if (ret)
return ret;
} else {
@@ -3270,8 +3291,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
(unsigned long)btrfs_header_chunk_tree_uuid(split),
BTRFS_UUID_SIZE);
- tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid,
- tree_mod_log_removal);
+ tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
copy_extent_buffer(split, c,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(mid),
@@ -3687,7 +3707,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
clean_tree_block(trans, root, right);
btrfs_item_key(right, &disk_key, 0);
- fixup_low_keys(trans, root, path, &disk_key, 1);
+ fixup_low_keys(root, path, &disk_key, 1);
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
@@ -3934,7 +3954,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
return -EOVERFLOW;
/* first try to make some room by pushing left and right */
- if (data_size) {
+ if (data_size && path->nodes[1]) {
wret = push_leaf_right(trans, root, path, data_size,
data_size, 0, 0);
if (wret < 0)
@@ -4047,8 +4067,7 @@ again:
path->nodes[0] = right;
path->slots[0] = 0;
if (path->slots[1] == 0)
- fixup_low_keys(trans, root, path,
- &disk_key, 1);
+ fixup_low_keys(root, path, &disk_key, 1);
}
btrfs_mark_buffer_dirty(right);
return ret;
@@ -4264,7 +4283,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
return ret;
path->slots[0]++;
- setup_items_for_insert(trans, root, path, new_key, &item_size,
+ setup_items_for_insert(root, path, new_key, &item_size,
item_size, item_size +
sizeof(struct btrfs_item), 1);
leaf = path->nodes[0];
@@ -4281,9 +4300,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
* off the end of the item or if we shift the item to chop bytes off
* the front.
*/
-void btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
+void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
u32 new_size, int from_end)
{
int slot;
@@ -4367,7 +4384,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
btrfs_set_item_key(leaf, &disk_key, slot);
if (slot == 0)
- fixup_low_keys(trans, root, path, &disk_key, 1);
+ fixup_low_keys(root, path, &disk_key, 1);
}
item = btrfs_item_nr(leaf, slot);
@@ -4381,10 +4398,9 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
}
/*
- * make the item pointed to by the path bigger, data_size is the new size.
+ * make the item pointed to by the path bigger, data_size is the added size.
*/
-void btrfs_extend_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
u32 data_size)
{
int slot;
@@ -4454,8 +4470,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
* to save stack depth by doing the bulk of the work in a function
* that doesn't call btrfs_search_slot
*/
-void setup_items_for_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
+void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *cpu_key, u32 *data_size,
u32 total_data, u32 total_size, int nr)
{
@@ -4531,7 +4546,7 @@ void setup_items_for_insert(struct btrfs_trans_handle *trans,
if (slot == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- fixup_low_keys(trans, root, path, &disk_key, 1);
+ fixup_low_keys(root, path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(leaf);
@@ -4571,7 +4586,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(trans, root, path, cpu_key, data_size,
+ setup_items_for_insert(root, path, cpu_key, data_size,
total_data, total_size, nr);
return 0;
}
@@ -4609,8 +4624,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
* the tree should have been previously balanced so the deletion does not
* empty a node.
*/
-static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot)
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+ int level, int slot)
{
struct extent_buffer *parent = path->nodes[level];
u32 nritems;
@@ -4642,7 +4657,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_node_key(parent, &disk_key, 0);
- fixup_low_keys(trans, root, path, &disk_key, level + 1);
+ fixup_low_keys(root, path, &disk_key, level + 1);
}
btrfs_mark_buffer_dirty(parent);
}
@@ -4663,7 +4678,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf)
{
WARN_ON(btrfs_header_generation(leaf) != trans->transid);
- del_ptr(trans, root, path, 1, path->slots[1]);
+ del_ptr(root, path, 1, path->slots[1]);
/*
* btrfs_free_extent is expensive, we want to make sure we
@@ -4744,7 +4759,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_item_key(leaf, &disk_key, 0);
- fixup_low_keys(trans, root, path, &disk_key, 1);
+ fixup_low_keys(root, path, &disk_key, 1);
}
/* delete the leaf if it is mostly empty */
@@ -5464,139 +5479,6 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
return btrfs_next_old_leaf(root, path, 0);
}
-/* Release the path up to but not including the given level */
-static void btrfs_release_level(struct btrfs_path *path, int level)
-{
- int i;
-
- for (i = 0; i < level; i++) {
- path->slots[i] = 0;
- if (!path->nodes[i])
- continue;
- if (path->locks[i]) {
- btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
- path->locks[i] = 0;
- }
- free_extent_buffer(path->nodes[i]);
- path->nodes[i] = NULL;
- }
-}
-
-/*
- * This function assumes 2 things
- *
- * 1) You are using path->keep_locks
- * 2) You are not inserting items.
- *
- * If either of these are not true do not use this function. If you need a next
- * leaf with either of these not being true then this function can be easily
- * adapted to do that, but at the moment these are the limitations.
- */
-int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- int del)
-{
- struct extent_buffer *b;
- struct btrfs_key key;
- u32 nritems;
- int level = 1;
- int slot;
- int ret = 1;
- int write_lock_level = BTRFS_MAX_LEVEL;
- int ins_len = del ? -1 : 0;
-
- WARN_ON(!(path->keep_locks || path->really_keep_locks));
-
- nritems = btrfs_header_nritems(path->nodes[0]);
- btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
-
- while (path->nodes[level]) {
- nritems = btrfs_header_nritems(path->nodes[level]);
- if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
-search:
- btrfs_release_path(path);
- ret = btrfs_search_slot(trans, root, &key, path,
- ins_len, 1);
- if (ret < 0)
- goto out;
- level = 1;
- continue;
- }
-
- if (path->slots[level] >= nritems - 1) {
- level++;
- continue;
- }
-
- btrfs_release_level(path, level);
- break;
- }
-
- if (!path->nodes[level]) {
- ret = 1;
- goto out;
- }
-
- path->slots[level]++;
- b = path->nodes[level];
-
- while (b) {
- level = btrfs_header_level(b);
-
- if (!should_cow_block(trans, root, b))
- goto cow_done;
-
- btrfs_set_path_blocking(path);
- ret = btrfs_cow_block(trans, root, b,
- path->nodes[level + 1],
- path->slots[level + 1], &b);
- if (ret)
- goto out;
-cow_done:
- path->nodes[level] = b;
- btrfs_clear_path_blocking(path, NULL, 0);
- if (level != 0) {
- ret = setup_nodes_for_search(trans, root, path, b,
- level, ins_len,
- &write_lock_level);
- if (ret == -EAGAIN)
- goto search;
- if (ret)
- goto out;
-
- b = path->nodes[level];
- slot = path->slots[level];
-
- ret = read_block_for_search(trans, root, path,
- &b, level, slot, &key, 0);
- if (ret == -EAGAIN)
- goto search;
- if (ret)
- goto out;
- level = btrfs_header_level(b);
- if (!btrfs_try_tree_write_lock(b)) {
- btrfs_set_path_blocking(path);
- btrfs_tree_lock(b);
- btrfs_clear_path_blocking(path, b,
- BTRFS_WRITE_LOCK);
- }
- path->locks[level] = BTRFS_WRITE_LOCK;
- path->nodes[level] = b;
- path->slots[level] = 0;
- } else {
- path->slots[level] = 0;
- ret = 0;
- break;
- }
- }
-
-out:
- if (ret)
- btrfs_release_path(path);
-
- return ret;
-}
-
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq)
{
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0d82922179db..e795bf135e80 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -88,12 +88,12 @@ struct btrfs_ordered_sum;
/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL
-/* for storing balance parameters in the root tree */
-#define BTRFS_BALANCE_OBJECTID -4ULL
-
/* holds quota configuration and tracking */
#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
/* orhpan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL
@@ -340,6 +340,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
*/
#define BTRFS_FS_STATE_ERROR 0
#define BTRFS_FS_STATE_REMOUNTING 1
+#define BTRFS_FS_STATE_TRANS_ABORTED 2
/* Super block flags */
/* Errors detected */
@@ -508,6 +509,7 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
+#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
@@ -518,7 +520,8 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
BTRFS_FEATURE_INCOMPAT_RAID56 | \
- BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
/*
* A leaf is full of items. offset and size tell us where to find
@@ -583,7 +586,6 @@ struct btrfs_path {
unsigned int skip_locking:1;
unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
- unsigned int really_keep_locks:1;
};
/*
@@ -959,8 +961,8 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
-#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7)
-#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8)
+#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
enum btrfs_raid_types {
@@ -1019,9 +1021,9 @@ struct btrfs_block_group_item {
*/
#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0)
/*
- * SCANNING is set during the initialization phase
+ * RESCAN is set during the initialization phase
*/
-#define BTRFS_QGROUP_STATUS_FLAG_SCANNING (1ULL << 1)
+#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1)
/*
* Some qgroup entries are known to be out of date,
* either because the configuration has changed in a way that
@@ -1050,7 +1052,7 @@ struct btrfs_qgroup_status_item {
* only used during scanning to record the progress
* of the scan. It contains a logical address
*/
- __le64 scan;
+ __le64 rescan;
} __attribute__ ((__packed__));
struct btrfs_qgroup_info_item {
@@ -1100,6 +1102,18 @@ struct btrfs_space_info {
account */
/*
+ * bytes_pinned is kept in line with what is actually pinned, as in
+ * we've called update_block_group and dropped the bytes_used counter
+ * and increased the bytes_pinned counter. However this means that
+ * bytes_pinned does not reflect the bytes that will be pinned once the
+ * delayed refs are flushed, so this counter is incremented every time we
+ * call btrfs_free_extent, making it a realtime count of what will be
+ * freed once the transaction is committed. It is zeroed every time the
+ * transaction commits.
+ */
+ struct percpu_counter total_bytes_pinned;
+
+ /*
* we bump reservation progress every time we decrement
* bytes_reserved. This way people waiting for reservations
* know something good has happened and they can check
@@ -1360,6 +1374,17 @@ struct btrfs_fs_info {
wait_queue_head_t transaction_blocked_wait;
wait_queue_head_t async_submit_wait;
+ /*
+ * Used to protect the incompat_flags, compat_flags, compat_ro_flags
+ * when they are updated.
+ *
+ * Because the flags are never cleared once set, we needn't use
+ * the lock on the read side.
+ *
+ * We also needn't use the lock when we mount the fs, because
+ * there is no other task which will update the flag.
+ */
+ spinlock_t super_lock;
struct btrfs_super_block *super_copy;
struct btrfs_super_block *super_for_commit;
struct block_device *__bdev;
@@ -1409,7 +1434,7 @@ struct btrfs_fs_info {
/* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock;
- atomic_t tree_mod_seq;
+ atomic64_t tree_mod_seq;
struct list_head tree_mod_seq_list;
struct seq_list tree_mod_seq_elem;
@@ -1424,25 +1449,22 @@ struct btrfs_fs_info {
atomic_t open_ioctl_trans;
/*
- * this is used by the balancing code to wait for all the pending
- * ordered extents
+ * this is used to protect the following list -- ordered_roots.
*/
- spinlock_t ordered_extent_lock;
+ spinlock_t ordered_root_lock;
/*
- * all of the data=ordered extents pending writeback
+ * all fs/file tree roots in which there are data=ordered extents
+ * pending writeback are added into this list.
+ *
* these can span multiple transactions and basically include
* every dirty data page that isn't from nodatacow
*/
- struct list_head ordered_extents;
+ struct list_head ordered_roots;
- spinlock_t delalloc_lock;
- /*
- * all of the inodes that have delalloc bytes. It is possible for
- * this list to be empty even when there is still dirty data=ordered
- * extents waiting to finish IO.
- */
- struct list_head delalloc_inodes;
+ spinlock_t delalloc_root_lock;
+ /* all fs/file tree roots that have delalloc inodes. */
+ struct list_head delalloc_roots;
/*
* there is a pool of worker threads for checksumming during writes
@@ -1485,8 +1507,6 @@ struct btrfs_fs_info {
int do_barriers;
int closing;
int log_root_recovering;
- int enospc_unlink;
- int trans_no_join;
u64 total_pinned;
@@ -1581,12 +1601,28 @@ struct btrfs_fs_info {
struct rb_root qgroup_tree;
spinlock_t qgroup_lock;
+ /*
+ * used to avoid frequently calling ulist_alloc()/ulist_free()
+ * when doing qgroup accounting, it must be protected by qgroup_lock.
+ */
+ struct ulist *qgroup_ulist;
+
+ /* protect user change for quota operations */
+ struct mutex qgroup_ioctl_lock;
+
/* list of dirty qgroups to be written at next commit */
struct list_head dirty_qgroups;
/* used by btrfs_qgroup_record_ref for an efficient tree traversal */
u64 qgroup_seq;
+ /* qgroup rescan items */
+ struct mutex qgroup_rescan_lock; /* protects the progress item */
+ struct btrfs_key qgroup_rescan_progress;
+ struct btrfs_workers qgroup_rescan_workers;
+ struct completion qgroup_rescan_completion;
+ struct btrfs_work qgroup_rescan_work;
+
/* filesystem state */
unsigned long fs_state;
@@ -1718,6 +1754,31 @@ struct btrfs_root {
int force_cow;
spinlock_t root_item_lock;
+ atomic_t refs;
+
+ spinlock_t delalloc_lock;
+ /*
+ * all of the inodes that have delalloc bytes. It is possible for
+ * this list to be empty even when there is still dirty data=ordered
+ * extents waiting to finish IO.
+ */
+ struct list_head delalloc_inodes;
+ struct list_head delalloc_root;
+ u64 nr_delalloc_inodes;
+ /*
+ * this is used by the balancing code to wait for all the pending
+ * ordered extents
+ */
+ spinlock_t ordered_extent_lock;
+
+ /*
+ * all of the data=ordered extents pending writeback
+ * these can span multiple transactions and basically include
+ * every dirty data page that isn't from nodatacow
+ */
+ struct list_head ordered_extents;
+ struct list_head ordered_root;
+ u64 nr_ordered_extents;
};
struct btrfs_ioctl_defrag_range_args {
@@ -1808,6 +1869,12 @@ struct btrfs_ioctl_defrag_range_args {
*/
#define BTRFS_EXTENT_ITEM_KEY 168
+/*
+ * The same as BTRFS_EXTENT_ITEM_KEY, except that for metadata we already know
+ * the length, so we save the level in key->offset instead of the length.
+ */
+#define BTRFS_METADATA_ITEM_KEY 169
+
#define BTRFS_TREE_BLOCK_REF_KEY 176
#define BTRFS_EXTENT_DATA_REF_KEY 178
@@ -2766,8 +2833,10 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
{
- int t = btrfs_super_csum_type(s);
- BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
+ u16 t = btrfs_super_csum_type(s);
+ /*
+ * csum type is validated at mount time
+ */
return btrfs_csum_sizes[t];
}
@@ -2864,8 +2933,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
version, 64);
BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
flags, 64);
-BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item,
- scan, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
+ rescan, 64);
/* btrfs_qgroup_info_item */
BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
@@ -2999,17 +3068,21 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
num_items;
}
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *refs, u64 *flags);
+ u64 offset, int metadata, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct btrfs_root *root,
+ struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr);
@@ -3017,8 +3090,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
struct btrfs_fs_info *info,
u64 bytenr);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
-u64 btrfs_find_block_group(struct btrfs_root *root,
- u64 search_start, u64 search_hint, int owner);
struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u32 blocksize,
u64 parent, u64 root_objectid,
@@ -3028,10 +3099,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
u64 parent, int last_ref);
-struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u32 blocksize,
- int level);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner,
@@ -3044,7 +3111,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, u64 data);
+ struct btrfs_key *ins, int is_data);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref, int for_cow);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -3052,7 +3119,7 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
- int is_data);
+ int level, int is_data);
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
@@ -3084,7 +3151,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -3133,6 +3199,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor);
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
@@ -3161,8 +3230,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
-void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
@@ -3198,12 +3266,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct extent_buffer **cow_ret, u64 new_root_objectid);
int btrfs_block_can_be_shared(struct btrfs_root *root,
struct extent_buffer *buf);
-void btrfs_extend_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
u32 data_size);
-void btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
+void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -3243,8 +3308,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
-void setup_items_for_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
+void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *cpu_key, u32 *data_size,
u32 total_data, u32 total_size, int nr);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
@@ -3264,9 +3328,6 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
}
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
-int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- int del);
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq);
static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3281,7 +3342,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
{
return btrfs_next_old_item(root, p, 0);
}
-int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
@@ -3298,6 +3358,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
smp_mb();
return fs_info->closing;
}
+
+/*
+ * If the fs is remounted R/O or is being unmounted, the cleaner needn't do
+ * anything except sleep. This function is used to check the status of
+ * the fs.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
+{
+ return (root->fs_info->sb->s_flags & MS_RDONLY ||
+ btrfs_fs_closing(root->fs_info));
+}
+
static inline void free_fs_info(struct btrfs_fs_info *fs_info)
{
kfree(fs_info->balance_ctl);
@@ -3318,10 +3390,7 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem);
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem);
-static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
-{
- return atomic_inc_return(&fs_info->tree_mod_seq);
-}
+u64 btrfs_tree_mod_seq_prev(u64 seq);
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
/* root-item.c */
@@ -3345,12 +3414,11 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_key *key,
struct btrfs_root_item *item);
-void btrfs_read_root_item(struct btrfs_root *root,
- struct extent_buffer *eb, int slot,
- struct btrfs_root_item *item);
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
- btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+void btrfs_read_root_item(struct extent_buffer *eb, int slot,
+ struct btrfs_root_item *item);
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key);
int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
void btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node);
@@ -3380,9 +3448,6 @@ struct btrfs_dir_item *
btrfs_search_dir_index_item(struct btrfs_root *root,
struct btrfs_path *path, u64 dirid,
const char *name, int name_len);
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len);
int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -3460,16 +3525,11 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
u64 bytenr, int mod);
-u64 btrfs_file_extent_length(struct btrfs_path *path);
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u64 file_start, int contig);
-struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 bytenr, int cow);
int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
u64 isize);
@@ -3492,6 +3552,10 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
size_t pg_offset, u64 start, u64 len,
int create);
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes);
/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -3529,10 +3593,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+ int delay_iput);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
-int btrfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root, u64 new_dirid);
int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
@@ -3542,7 +3606,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-int btrfs_dirty_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
@@ -3560,7 +3623,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
-int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -3611,7 +3673,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
-void btrfs_drop_pages(struct page **pages, size_t num_pages);
int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
struct page **pages, size_t num_pages,
loff_t pos, size_t write_bytes,
@@ -3634,14 +3695,31 @@ int btrfs_sync_fs(struct super_block *sb, int wait);
#ifdef CONFIG_PRINTK
__printf(2, 3)
-void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...);
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
#else
static inline __printf(2, 3)
-void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
}
#endif
+#define btrfs_emerg(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+#define btrfs_debug(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+
__printf(5, 6)
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
@@ -3663,11 +3741,28 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
disk_super = fs_info->super_copy;
features = btrfs_super_incompat_flags(disk_super);
if (!(features & flag)) {
- features |= flag;
- btrfs_set_super_incompat_flags(disk_super, features);
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ printk(KERN_INFO "btrfs: setting %llu feature flag\n",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
}
}
+#define btrfs_fs_incompat(fs_info, opt) \
+ __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ disk_super = fs_info->super_copy;
+ return !!(btrfs_super_incompat_flags(disk_super) & flag);
+}
+
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact line number is reported.
@@ -3753,7 +3848,6 @@ void btrfs_scrub_continue_super(struct btrfs_root *root);
int btrfs_scrub_cancel(struct btrfs_fs_info *info);
int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
struct btrfs_device *dev);
-int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
@@ -3784,7 +3878,9 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_quota_disable(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-int btrfs_quota_rescan(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 src, u64 dst);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 14fce27b4780..375510913fe7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -202,7 +202,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
spin_unlock(&root->lock);
}
-struct btrfs_delayed_node *btrfs_first_delayed_node(
+static struct btrfs_delayed_node *btrfs_first_delayed_node(
struct btrfs_delayed_root *delayed_root)
{
struct list_head *p;
@@ -221,7 +221,7 @@ out:
return node;
}
-struct btrfs_delayed_node *btrfs_next_delayed_node(
+static struct btrfs_delayed_node *btrfs_next_delayed_node(
struct btrfs_delayed_node *node)
{
struct btrfs_delayed_root *delayed_root;
@@ -282,7 +282,7 @@ static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
__btrfs_release_delayed_node(node, 0);
}
-struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
+static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
struct btrfs_delayed_root *delayed_root)
{
struct list_head *p;
@@ -308,7 +308,7 @@ static inline void btrfs_release_prepared_delayed_node(
__btrfs_release_delayed_node(node, 1);
}
-struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
{
struct btrfs_delayed_item *item;
item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
@@ -383,7 +383,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
return NULL;
}
-struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
+static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
struct btrfs_delayed_node *delayed_node,
struct btrfs_key *key)
{
@@ -394,45 +394,6 @@ struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
return item;
}
-struct btrfs_delayed_item *__btrfs_lookup_delayed_deletion_item(
- struct btrfs_delayed_node *delayed_node,
- struct btrfs_key *key)
-{
- struct btrfs_delayed_item *item;
-
- item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
- NULL, NULL);
- return item;
-}
-
-struct btrfs_delayed_item *__btrfs_search_delayed_insertion_item(
- struct btrfs_delayed_node *delayed_node,
- struct btrfs_key *key)
-{
- struct btrfs_delayed_item *item, *next;
-
- item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
- NULL, &next);
- if (!item)
- item = next;
-
- return item;
-}
-
-struct btrfs_delayed_item *__btrfs_search_delayed_deletion_item(
- struct btrfs_delayed_node *delayed_node,
- struct btrfs_key *key)
-{
- struct btrfs_delayed_item *item, *next;
-
- item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
- NULL, &next);
- if (!item)
- item = next;
-
- return item;
-}
-
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
struct btrfs_delayed_item *ins,
int action)
@@ -535,7 +496,7 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
}
}
-struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
+static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
struct btrfs_delayed_node *delayed_node)
{
struct rb_node *p;
@@ -548,7 +509,7 @@ struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
return item;
}
-struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
+static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
struct btrfs_delayed_node *delayed_node)
{
struct rb_node *p;
@@ -561,7 +522,7 @@ struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
return item;
}
-struct btrfs_delayed_item *__btrfs_next_delayed_item(
+static struct btrfs_delayed_item *__btrfs_next_delayed_item(
struct btrfs_delayed_item *item)
{
struct rb_node *p;
@@ -574,20 +535,6 @@ struct btrfs_delayed_item *__btrfs_next_delayed_item(
return next;
}
-static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
- u64 root_id)
-{
- struct btrfs_key root_key;
-
- if (root->objectid == root_id)
- return root;
-
- root_key.objectid = root_id;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = (u64)-1;
- return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
-}
-
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_delayed_item *item)
@@ -766,10 +713,9 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
* This helper will insert some continuous items into the same leaf according
* to the free space of the leaf.
*/
-static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_delayed_item *item)
+static int btrfs_batch_insert_items(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *item)
{
struct btrfs_delayed_item *curr, *next;
int free_space;
@@ -848,7 +794,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
btrfs_clear_path_blocking(path, NULL, 0);
/* insert the keys of the items */
- setup_items_for_insert(trans, root, path, keys, data_size,
+ setup_items_for_insert(root, path, keys, data_size,
total_data_size, total_size, nitems);
/* insert the dir index items */
@@ -932,7 +878,7 @@ do_again:
if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
/* insert the continuous items into the same leaf */
path->slots[0]++;
- btrfs_batch_insert_items(trans, root, path, curr);
+ btrfs_batch_insert_items(root, path, curr);
}
btrfs_release_delayed_item(prev);
btrfs_mark_buffer_dirty(path->nodes[0]);
@@ -1721,8 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
* btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
*
*/
-int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
- filldir_t filldir,
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
struct list_head *ins_list)
{
struct btrfs_dir_item *di;
@@ -1744,13 +1689,13 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
list_del(&curr->readdir_list);
- if (curr->key.offset < filp->f_pos) {
+ if (curr->key.offset < ctx->pos) {
if (atomic_dec_and_test(&curr->refs))
kfree(curr);
continue;
}
- filp->f_pos = curr->key.offset;
+ ctx->pos = curr->key.offset;
di = (struct btrfs_dir_item *)curr->data;
name = (char *)(di + 1);
@@ -1759,7 +1704,7 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
d_type = btrfs_filetype_table[di->type];
btrfs_disk_key_to_cpu(&location, &di->location);
- over = filldir(dirent, name, name_len, curr->key.offset,
+ over = !dir_emit(ctx, name, name_len,
location.objectid, d_type);
if (atomic_dec_and_test(&curr->refs))
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 1d5c5f7abe3e..a4b38f934d14 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -139,8 +139,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
struct list_head *del_list);
int btrfs_should_delete_dir_index(struct list_head *del_list,
u64 index);
-int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
- filldir_t filldir,
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
struct list_head *ins_list);
/* for init */
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index b7a0641ead77..c219463fb1fd 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -40,16 +40,19 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep;
* compare two delayed tree backrefs with same bytenr and type
*/
static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
- struct btrfs_delayed_tree_ref *ref1)
+ struct btrfs_delayed_tree_ref *ref1, int type)
{
- if (ref1->root < ref2->root)
- return -1;
- if (ref1->root > ref2->root)
- return 1;
- if (ref1->parent < ref2->parent)
- return -1;
- if (ref1->parent > ref2->parent)
- return 1;
+ if (type == BTRFS_TREE_BLOCK_REF_KEY) {
+ if (ref1->root < ref2->root)
+ return -1;
+ if (ref1->root > ref2->root)
+ return 1;
+ } else {
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
+ }
return 0;
}
@@ -113,7 +116,8 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
- btrfs_delayed_node_to_tree_ref(ref1));
+ btrfs_delayed_node_to_tree_ref(ref1),
+ ref1->type);
} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
@@ -357,8 +361,10 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
elem = list_first_entry(&fs_info->tree_mod_seq_list,
struct seq_list, list);
if (seq >= elem->seq) {
- pr_debug("holding back delayed_ref %llu, lowest is "
- "%llu (%p)\n", seq, elem->seq, delayed_refs);
+ pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n",
+ (u32)(seq >> 32), (u32)seq,
+ (u32)(elem->seq >> 32), (u32)elem->seq,
+ delayed_refs);
ret = 1;
}
}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f75fcaf79aeb..70b962cc177d 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -60,6 +60,7 @@ struct btrfs_delayed_ref_node {
struct btrfs_delayed_extent_op {
struct btrfs_disk_key key;
u64 flags_to_set;
+ int level;
unsigned int update_key:1;
unsigned int update_flags:1;
unsigned int is_data:1;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7ba7b3900cb8..4253ad580e39 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -313,6 +313,11 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
+ if (btrfs_fs_incompat(fs_info, RAID56)) {
+ pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n");
+ return -EINVAL;
+ }
+
switch (args->start.cont_reading_from_srcdev_mode) {
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
@@ -395,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
btrfs_dev_replace_unlock(dev_replace);
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
/* force writing the updated state information to disk */
trans = btrfs_start_transaction(root, 0);
@@ -465,12 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_inodes(root, 0);
+ ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 502c2158167c..79e594e341c7 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,6 +21,10 @@
#include "hash.h"
#include "transaction.h"
+static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len);
+
/*
* insert a name into a directory, doing overflow properly if there is a hash
* collision. data_size indicates how big the item inserted should be. On
@@ -49,7 +53,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
di = btrfs_match_dir_item_name(root, path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
- btrfs_extend_item(trans, root, path, data_size);
+ btrfs_extend_item(root, path, data_size);
} else if (ret < 0)
return ERR_PTR(ret);
WARN_ON(ret > 0);
@@ -379,7 +383,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
* this walks through all the entries in a dir item and finds one
* for a specific name.
*/
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
struct btrfs_path *path,
const char *name, int name_len)
{
@@ -442,8 +446,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_len - (ptr + sub_item_len - start));
- btrfs_truncate_item(trans, root, path,
- item_len - sub_item_len, 1);
+ btrfs_truncate_item(root, path, item_len - sub_item_len, 1);
}
return ret;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6d19a0a554aa..6b092a1c4e37 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -30,6 +30,7 @@
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
+#include <linux/uuid.h>
#include <asm/unaligned.h>
#include "compat.h"
#include "ctree.h"
@@ -69,6 +70,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
+static void btrfs_error_commit_super(struct btrfs_root *root);
/*
* end_io_wq structs are used to do processing in task context when an IO is
@@ -149,7 +152,7 @@ static struct btrfs_lockdep_keyset {
{ .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
{ .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
{ .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
- { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" },
+ { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" },
{ .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
{ .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
@@ -222,7 +225,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
+ ret = add_extent_mapping(em_tree, em, 0);
if (ret == -EEXIST) {
free_extent_map(em);
em = lookup_extent_mapping(em_tree, start, len);
@@ -238,7 +241,7 @@ out:
return em;
}
-u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
+u32 btrfs_csum_data(char *data, u32 seed, size_t len)
{
return crc32c(seed, data, len);
}
@@ -274,7 +277,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
if (err)
return 1;
cur_len = min(len, map_len - (offset - map_start));
- crc = btrfs_csum_data(root, kaddr + offset - map_start,
+ crc = btrfs_csum_data(kaddr + offset - map_start,
crc, cur_len);
len -= cur_len;
offset += cur_len;
@@ -354,6 +357,49 @@ out:
}
/*
+ * Return 0 if the superblock checksum type matches the checksum value of that
+ * algorithm. Pass the raw disk superblock data.
+ */
+static int btrfs_check_super_csum(char *raw_disk_sb)
+{
+ struct btrfs_super_block *disk_sb =
+ (struct btrfs_super_block *)raw_disk_sb;
+ u16 csum_type = btrfs_super_csum_type(disk_sb);
+ int ret = 0;
+
+ if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
+ u32 crc = ~(u32)0;
+ const int csum_size = sizeof(crc);
+ char result[csum_size];
+
+ /*
+ * The super_block structure does not span the whole
+ * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
+ * is filled with zeros and is included in the checkum.
+ */
+ crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
+ crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+ btrfs_csum_final(crc, result);
+
+ if (memcmp(raw_disk_sb, result, csum_size))
+ ret = 1;
+
+ if (ret && btrfs_super_generation(disk_sb) < 10) {
+ printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n");
+ ret = 0;
+ }
+ }
+
+ if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
+ printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n",
+ csum_type);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+/*
* helper to read a given tree block, doing retries as required when
* the checksums don't match and we have alternate mirrors to try.
*/
@@ -530,41 +576,6 @@ static noinline int check_leaf(struct btrfs_root *root,
return 0;
}
-struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree,
- struct page *page, int max_walk)
-{
- struct extent_buffer *eb;
- u64 start = page_offset(page);
- u64 target = start;
- u64 min_start;
-
- if (start < max_walk)
- min_start = 0;
- else
- min_start = start - max_walk;
-
- while (start >= min_start) {
- eb = find_extent_buffer(tree, start, 0);
- if (eb) {
- /*
- * we found an extent buffer and it contains our page
- * horray!
- */
- if (eb->start <= target &&
- eb->start + eb->len > target)
- return eb;
-
- /* we found an extent buffer that wasn't for us */
- free_extent_buffer(eb);
- return NULL;
- }
- if (start == 0)
- break;
- start -= PAGE_CACHE_SIZE;
- }
- return NULL;
-}
-
static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state, int mirror)
{
@@ -613,6 +624,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
goto err;
}
found_level = btrfs_header_level(eb);
+ if (found_level >= BTRFS_MAX_LEVEL) {
+ btrfs_info(root->fs_info, "bad tree block level %d\n",
+ (int)btrfs_header_level(eb));
+ ret = -EIO;
+ goto err;
+ }
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
eb, found_level);
@@ -636,10 +653,9 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
if (!ret)
set_extent_buffer_uptodate(eb);
err:
- if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
- clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+ if (reads_done &&
+ test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
btree_readahead_hook(root, eb, eb->start, ret);
- }
if (ret) {
/*
@@ -993,17 +1009,12 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
if (PageWriteback(page) || PageDirty(page))
return 0;
- /*
- * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
- * slab allocation from alloc_extent_state down the callchain where
- * it'd hit a BUG_ON as those flags are not allowed.
- */
- gfp_flags &= ~GFP_SLAB_BUG_MASK;
- return try_release_extent_buffer(page, gfp_flags);
+ return try_release_extent_buffer(page);
}
-static void btree_invalidatepage(struct page *page, unsigned long offset)
+static void btree_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1181,6 +1192,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->objectid = objectid;
root->last_trans = 0;
root->highest_objectid = 0;
+ root->nr_delalloc_inodes = 0;
+ root->nr_ordered_extents = 0;
root->name = NULL;
root->inode_tree = RB_ROOT;
INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
@@ -1189,10 +1202,16 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->root_list);
+ INIT_LIST_HEAD(&root->delalloc_inodes);
+ INIT_LIST_HEAD(&root->delalloc_root);
+ INIT_LIST_HEAD(&root->ordered_extents);
+ INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->logged_list[0]);
INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
+ spin_lock_init(&root->delalloc_lock);
+ spin_lock_init(&root->ordered_extent_lock);
spin_lock_init(&root->accounting_lock);
spin_lock_init(&root->log_extents_lock[0]);
spin_lock_init(&root->log_extents_lock[1]);
@@ -1206,6 +1225,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
atomic_set(&root->orphan_inodes, 0);
+ atomic_set(&root->refs, 1);
root->log_transid = 0;
root->last_log_commit = 0;
extent_io_tree_init(&root->dirty_log_pages,
@@ -1224,39 +1244,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
spin_lock_init(&root->root_item_lock);
}
-static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
- struct btrfs_fs_info *fs_info,
- u64 objectid,
- struct btrfs_root *root)
-{
- int ret;
- u32 blocksize;
- u64 generation;
-
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, objectid);
- ret = btrfs_find_last_root(tree_root, objectid,
- &root->root_item, &root->root_key);
- if (ret > 0)
- return -ENOENT;
- else if (ret < 0)
- return ret;
-
- generation = btrfs_root_generation(&root->root_item);
- blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
- root->commit_root = NULL;
- root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
- blocksize, generation);
- if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
- free_extent_buffer(root->node);
- root->node = NULL;
- return -EIO;
- }
- root->commit_root = btrfs_root_node(root);
- return 0;
-}
-
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
@@ -1275,6 +1262,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret = 0;
u64 bytenr;
+ uuid_le uuid;
root = btrfs_alloc_root(fs_info);
if (!root)
@@ -1324,6 +1312,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_set_root_used(&root->root_item, leaf->len);
btrfs_set_root_last_snapshot(&root->root_item, 0);
btrfs_set_root_dirid(&root->root_item, 0);
+ uuid_le_gen(&uuid);
+ memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
root->root_item.drop_level = 0;
key.objectid = objectid;
@@ -1438,63 +1428,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
return 0;
}
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
- struct btrfs_key *location)
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_path *path;
- struct extent_buffer *l;
u64 generation;
u32 blocksize;
- int ret = 0;
- int slot;
+ int ret;
- root = btrfs_alloc_root(fs_info);
- if (!root)
+ path = btrfs_alloc_path();
+ if (!path)
return ERR_PTR(-ENOMEM);
- if (location->offset == (u64)-1) {
- ret = find_and_setup_root(tree_root, fs_info,
- location->objectid, root);
- if (ret) {
- kfree(root);
- return ERR_PTR(ret);
- }
- goto out;
+
+ root = btrfs_alloc_root(fs_info);
+ if (!root) {
+ ret = -ENOMEM;
+ goto alloc_fail;
}
__setup_root(tree_root->nodesize, tree_root->leafsize,
tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, location->objectid);
+ root, fs_info, key->objectid);
- path = btrfs_alloc_path();
- if (!path) {
- kfree(root);
- return ERR_PTR(-ENOMEM);
- }
- ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
- if (ret == 0) {
- l = path->nodes[0];
- slot = path->slots[0];
- btrfs_read_root_item(tree_root, l, slot, &root->root_item);
- memcpy(&root->root_key, location, sizeof(*location));
- }
- btrfs_free_path(path);
+ ret = btrfs_find_root(tree_root, key, path,
+ &root->root_item, &root->root_key);
if (ret) {
- kfree(root);
if (ret > 0)
ret = -ENOENT;
- return ERR_PTR(ret);
+ goto find_fail;
}
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
+ if (!root->node) {
+ ret = -ENOMEM;
+ goto find_fail;
+ } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ ret = -EIO;
+ goto read_fail;
+ }
root->commit_root = btrfs_root_node(root);
- BUG_ON(!root->node); /* -ENOMEM */
out:
- if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+ btrfs_free_path(path);
+ return root;
+
+read_fail:
+ free_extent_buffer(root->node);
+find_fail:
+ kfree(root);
+alloc_fail:
+ root = ERR_PTR(ret);
+ goto out;
+}
+
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+ struct btrfs_key *location)
+{
+ struct btrfs_root *root;
+
+ root = btrfs_read_tree_root(tree_root, location);
+ if (IS_ERR(root))
+ return root;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
root->ref_cows = 1;
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1502,6 +1502,66 @@ out:
return root;
}
+int btrfs_init_fs_root(struct btrfs_root *root)
+{
+ int ret;
+
+ root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+ root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+ GFP_NOFS);
+ if (!root->free_ino_pinned || !root->free_ino_ctl) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ btrfs_init_free_ino_ctl(root);
+ mutex_init(&root->fs_commit_mutex);
+ spin_lock_init(&root->cache_lock);
+ init_waitqueue_head(&root->cache_wait);
+
+ ret = get_anon_bdev(&root->anon_dev);
+ if (ret)
+ goto fail;
+ return 0;
+fail:
+ kfree(root->free_ino_ctl);
+ kfree(root->free_ino_pinned);
+ return ret;
+}
+
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id)
+{
+ struct btrfs_root *root;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)root_id);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ return root;
+}
+
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
+{
+ int ret;
+
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ return ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_insert(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ root);
+ if (ret == 0)
+ root->in_radix = 1;
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ radix_tree_preload_end();
+
+ return ret;
+}
+
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
struct btrfs_key *location)
{
@@ -1522,58 +1582,30 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
return fs_info->quota_root ? fs_info->quota_root :
ERR_PTR(-ENOENT);
again:
- spin_lock(&fs_info->fs_roots_radix_lock);
- root = radix_tree_lookup(&fs_info->fs_roots_radix,
- (unsigned long)location->objectid);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ root = btrfs_lookup_fs_root(fs_info, location->objectid);
if (root)
return root;
- root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
+ root = btrfs_read_fs_root(fs_info->tree_root, location);
if (IS_ERR(root))
return root;
- root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
- root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
- GFP_NOFS);
- if (!root->free_ino_pinned || !root->free_ino_ctl) {
- ret = -ENOMEM;
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ ret = -ENOENT;
goto fail;
}
- btrfs_init_free_ino_ctl(root);
- mutex_init(&root->fs_commit_mutex);
- spin_lock_init(&root->cache_lock);
- init_waitqueue_head(&root->cache_wait);
-
- ret = get_anon_bdev(&root->anon_dev);
+ ret = btrfs_init_fs_root(root);
if (ret)
goto fail;
- if (btrfs_root_refs(&root->root_item) == 0) {
- ret = -ENOENT;
- goto fail;
- }
-
ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
if (ret < 0)
goto fail;
if (ret == 0)
root->orphan_item_inserted = 1;
- ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
- if (ret)
- goto fail;
-
- spin_lock(&fs_info->fs_roots_radix_lock);
- ret = radix_tree_insert(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- root);
- if (ret == 0)
- root->in_radix = 1;
-
- spin_unlock(&fs_info->fs_roots_radix_lock);
- radix_tree_preload_end();
+ ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
if (ret == -EEXIST) {
free_fs_root(root);
@@ -1581,10 +1613,6 @@ again:
}
goto fail;
}
-
- ret = btrfs_find_dead_roots(fs_info->tree_root,
- root->root_key.objectid);
- WARN_ON(ret);
return root;
fail:
free_fs_root(root);
@@ -1656,17 +1684,38 @@ static void end_workqueue_fn(struct btrfs_work *work)
static int cleaner_kthread(void *arg)
{
struct btrfs_root *root = arg;
+ int again;
do {
- if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
- mutex_trylock(&root->fs_info->cleaner_mutex)) {
- btrfs_run_delayed_iputs(root);
- btrfs_clean_old_snapshots(root);
+ again = 0;
+
+ /* Make the cleaner go to sleep early. */
+ if (btrfs_need_cleaner_sleep(root))
+ goto sleep;
+
+ if (!mutex_trylock(&root->fs_info->cleaner_mutex))
+ goto sleep;
+
+ /*
+ * Re-check under the mutex: the status of the fs may have
+ * changed between the check above and the trylock.
+ */
+ if (btrfs_need_cleaner_sleep(root)) {
mutex_unlock(&root->fs_info->cleaner_mutex);
- btrfs_run_defrag_inodes(root->fs_info);
+ goto sleep;
}
- if (!try_to_freeze()) {
+ btrfs_run_delayed_iputs(root);
+ again = btrfs_clean_one_deleted_snapshot(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ /*
+ * The defragger has dealt with the R/O remount and umount,
+ * so nothing special needs to be done here.
+ */
+ btrfs_run_defrag_inodes(root->fs_info);
+sleep:
+ if (!try_to_freeze() && !again) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
schedule();
@@ -1699,7 +1748,7 @@ static int transaction_kthread(void *arg)
}
now = get_seconds();
- if (!cur->blocked &&
+ if (cur->state < TRANS_STATE_BLOCKED &&
(now < cur->start_time || now - cur->start_time < 30)) {
spin_unlock(&root->fs_info->trans_lock);
delay = HZ * 5;
@@ -1935,35 +1984,60 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
return 0;
}
+/* helper to cleanup workers */
+static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
+{
+ btrfs_stop_workers(&fs_info->generic_worker);
+ btrfs_stop_workers(&fs_info->fixup_workers);
+ btrfs_stop_workers(&fs_info->delalloc_workers);
+ btrfs_stop_workers(&fs_info->workers);
+ btrfs_stop_workers(&fs_info->endio_workers);
+ btrfs_stop_workers(&fs_info->endio_meta_workers);
+ btrfs_stop_workers(&fs_info->endio_raid56_workers);
+ btrfs_stop_workers(&fs_info->rmw_workers);
+ btrfs_stop_workers(&fs_info->endio_meta_write_workers);
+ btrfs_stop_workers(&fs_info->endio_write_workers);
+ btrfs_stop_workers(&fs_info->endio_freespace_worker);
+ btrfs_stop_workers(&fs_info->submit_workers);
+ btrfs_stop_workers(&fs_info->delayed_workers);
+ btrfs_stop_workers(&fs_info->caching_workers);
+ btrfs_stop_workers(&fs_info->readahead_workers);
+ btrfs_stop_workers(&fs_info->flush_workers);
+ btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
+}
+
/* helper to cleanup tree roots */
static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
{
free_extent_buffer(info->tree_root->node);
free_extent_buffer(info->tree_root->commit_root);
- free_extent_buffer(info->dev_root->node);
- free_extent_buffer(info->dev_root->commit_root);
- free_extent_buffer(info->extent_root->node);
- free_extent_buffer(info->extent_root->commit_root);
- free_extent_buffer(info->csum_root->node);
- free_extent_buffer(info->csum_root->commit_root);
- if (info->quota_root) {
- free_extent_buffer(info->quota_root->node);
- free_extent_buffer(info->quota_root->commit_root);
- }
-
info->tree_root->node = NULL;
info->tree_root->commit_root = NULL;
- info->dev_root->node = NULL;
- info->dev_root->commit_root = NULL;
- info->extent_root->node = NULL;
- info->extent_root->commit_root = NULL;
- info->csum_root->node = NULL;
- info->csum_root->commit_root = NULL;
+
+ if (info->dev_root) {
+ free_extent_buffer(info->dev_root->node);
+ free_extent_buffer(info->dev_root->commit_root);
+ info->dev_root->node = NULL;
+ info->dev_root->commit_root = NULL;
+ }
+ if (info->extent_root) {
+ free_extent_buffer(info->extent_root->node);
+ free_extent_buffer(info->extent_root->commit_root);
+ info->extent_root->node = NULL;
+ info->extent_root->commit_root = NULL;
+ }
+ if (info->csum_root) {
+ free_extent_buffer(info->csum_root->node);
+ free_extent_buffer(info->csum_root->commit_root);
+ info->csum_root->node = NULL;
+ info->csum_root->commit_root = NULL;
+ }
if (info->quota_root) {
+ free_extent_buffer(info->quota_root->node);
+ free_extent_buffer(info->quota_root->commit_root);
info->quota_root->node = NULL;
info->quota_root->commit_root = NULL;
}
-
if (chunk_root) {
free_extent_buffer(info->chunk_root->node);
free_extent_buffer(info->chunk_root->commit_root);
@@ -1972,6 +2046,36 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
}
}
+static void del_fs_roots(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ struct btrfs_root *gang[8];
+ int i;
+
+ while (!list_empty(&fs_info->dead_roots)) {
+ gang[0] = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&gang[0]->root_list);
+
+ if (gang[0]->in_radix) {
+ btrfs_drop_and_free_fs_root(fs_info, gang[0]);
+ } else {
+ free_extent_buffer(gang[0]->node);
+ free_extent_buffer(gang[0]->commit_root);
+ btrfs_put_fs_root(gang[0]);
+ }
+ }
+
+ while (1) {
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang));
+ if (!ret)
+ break;
+ for (i = 0; i < ret; i++)
+ btrfs_drop_and_free_fs_root(fs_info, gang[i]);
+ }
+}
int open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
@@ -2001,14 +2105,8 @@ int open_ctree(struct super_block *sb,
int backup_index = 0;
tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
- extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
- csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
- dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
- quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
-
- if (!tree_root || !extent_root || !csum_root ||
- !chunk_root || !dev_root || !quota_root) {
+ if (!tree_root || !chunk_root) {
err = -ENOMEM;
goto fail;
}
@@ -2051,15 +2149,16 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
- INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+ INIT_LIST_HEAD(&fs_info->delalloc_roots);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
- spin_lock_init(&fs_info->delalloc_lock);
+ spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->free_chunk_lock);
spin_lock_init(&fs_info->tree_mod_seq_lock);
+ spin_lock_init(&fs_info->super_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
seqlock_init(&fs_info->profiles_lock);
@@ -2083,12 +2182,11 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->defrag_running, 0);
- atomic_set(&fs_info->tree_mod_seq, 0);
+ atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
- fs_info->trans_no_join = 0;
fs_info->free_chunk_space = 0;
fs_info->tree_mod_log = RB_ROOT;
@@ -2099,8 +2197,8 @@ int open_ctree(struct super_block *sb,
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
- INIT_LIST_HEAD(&fs_info->ordered_extents);
- spin_lock_init(&fs_info->ordered_extent_lock);
+ INIT_LIST_HEAD(&fs_info->ordered_roots);
+ spin_lock_init(&fs_info->ordered_root_lock);
fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
GFP_NOFS);
if (!fs_info->delayed_root) {
@@ -2187,11 +2285,14 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->dev_replace.lock);
spin_lock_init(&fs_info->qgroup_lock);
+ mutex_init(&fs_info->qgroup_ioctl_lock);
fs_info->qgroup_tree = RB_ROOT;
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
fs_info->qgroup_seq = 1;
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
+ fs_info->qgroup_ulist = NULL;
+ mutex_init(&fs_info->qgroup_rescan_lock);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -2211,12 +2312,31 @@ int open_ctree(struct super_block *sb,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
invalidate_bdev(fs_devices->latest_bdev);
+
+ /*
+ * Read super block and check the signature bytes only
+ */
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
if (!bh) {
err = -EINVAL;
goto fail_alloc;
}
+ /*
+ * We want to check the superblock checksum; the checksum type is stored
+ * inside. Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
+ */
+ if (btrfs_check_super_csum(bh->b_data)) {
+ printk(KERN_ERR "btrfs: superblock checksum mismatch\n");
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
+ /*
+ * super_copy is zeroed at allocation time and we never touch the
+ * following bytes up to INFO_SIZE; the checksum is calculated from
+ * the whole block of INFO_SIZE.
+ */
memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
memcpy(fs_info->super_for_commit, fs_info->super_copy,
sizeof(*fs_info->super_for_commit));
@@ -2224,6 +2344,13 @@ int open_ctree(struct super_block *sb,
memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
+ ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+ if (ret) {
+ printk(KERN_ERR "btrfs: superblock contains fatal errors\n");
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
goto fail_alloc;
@@ -2232,13 +2359,6 @@ int open_ctree(struct super_block *sb,
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
- ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
- if (ret) {
- printk(KERN_ERR "btrfs: superblock contains fatal errors\n");
- err = ret;
- goto fail_alloc;
- }
-
/*
* run through our array of backup supers and setup
* our ring pointer to the oldest one
@@ -2290,6 +2410,9 @@ int open_ctree(struct super_block *sb,
if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+ if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
+ printk(KERN_ERR "btrfs: has skinny extents\n");
+
/*
* flag our filesystem as having big metadata blocks if
* they are bigger than the page size
@@ -2319,6 +2442,10 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
+ /*
+ * No lock is needed here because no other task will update the
+ * flag.
+ */
btrfs_set_super_incompat_flags(disk_super, features);
features = btrfs_super_compat_ro_flags(disk_super) &
@@ -2394,6 +2521,8 @@ int open_ctree(struct super_block *sb,
btrfs_init_workers(&fs_info->readahead_workers, "readahead",
fs_info->thread_pool_size,
&fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
+ &fs_info->generic_worker);
/*
* endios are largely parallel and should have a very
@@ -2428,6 +2557,7 @@ int open_ctree(struct super_block *sb,
ret |= btrfs_start_workers(&fs_info->caching_workers);
ret |= btrfs_start_workers(&fs_info->readahead_workers);
ret |= btrfs_start_workers(&fs_info->flush_workers);
+ ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers);
if (ret) {
err = -ENOMEM;
goto fail_sb_buffer;
@@ -2475,8 +2605,8 @@ int open_ctree(struct super_block *sb,
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
blocksize, generation);
- BUG_ON(!chunk_root->node); /* -ENOMEM */
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+ if (!chunk_root->node ||
+ !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
sb->s_id);
goto fail_tree_roots;
@@ -2526,33 +2656,44 @@ retry_root_backup:
btrfs_set_root_node(&tree_root->root_item, tree_root->node);
tree_root->commit_root = btrfs_root_node(tree_root);
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_EXTENT_TREE_OBJECTID, extent_root);
- if (ret)
+ location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = 0;
+
+ extent_root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(extent_root)) {
+ ret = PTR_ERR(extent_root);
goto recovery_tree_root;
+ }
extent_root->track_dirty = 1;
+ fs_info->extent_root = extent_root;
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_DEV_TREE_OBJECTID, dev_root);
- if (ret)
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
+ dev_root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(dev_root)) {
+ ret = PTR_ERR(dev_root);
goto recovery_tree_root;
+ }
dev_root->track_dirty = 1;
+ fs_info->dev_root = dev_root;
+ btrfs_init_devices_late(fs_info);
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_CSUM_TREE_OBJECTID, csum_root);
- if (ret)
+ location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+ csum_root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(csum_root)) {
+ ret = PTR_ERR(csum_root);
goto recovery_tree_root;
+ }
csum_root->track_dirty = 1;
+ fs_info->csum_root = csum_root;
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_QUOTA_TREE_OBJECTID, quota_root);
- if (ret) {
- kfree(quota_root);
- quota_root = fs_info->quota_root = NULL;
- } else {
+ location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+ quota_root = btrfs_read_tree_root(tree_root, &location);
+ if (!IS_ERR(quota_root)) {
quota_root->track_dirty = 1;
fs_info->quota_enabled = 1;
fs_info->pending_quota_state = 1;
+ fs_info->quota_root = quota_root;
}
fs_info->generation = generation;
@@ -2661,6 +2802,13 @@ retry_root_backup:
log_tree_root->node = read_tree_block(tree_root, bytenr,
blocksize,
generation + 1);
+ if (!log_tree_root->node ||
+ !extent_buffer_uptodate(log_tree_root->node)) {
+ printk(KERN_ERR "btrfs: failed to read log tree\n");
+ free_extent_buffer(log_tree_root->node);
+ kfree(log_tree_root);
+ goto fail_trans_kthread;
+ }
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
@@ -2698,11 +2846,9 @@ retry_root_backup:
location.objectid = BTRFS_FS_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = (u64)-1;
+ location.offset = 0;
fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
- if (!fs_info->fs_root)
- goto fail_qgroup;
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
goto fail_qgroup;
@@ -2734,12 +2880,16 @@ retry_root_backup:
return ret;
}
+ btrfs_qgroup_rescan_resume(fs_info);
+
return 0;
fail_qgroup:
btrfs_free_qgroup_config(fs_info);
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
+ btrfs_cleanup_transaction(fs_info->tree_root);
+ del_fs_roots(fs_info);
fail_cleaner:
kthread_stop(fs_info->cleaner_kthread);
@@ -2750,6 +2900,7 @@ fail_cleaner:
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
fail_block_groups:
+ btrfs_put_block_group_cache(fs_info);
btrfs_free_block_groups(fs_info);
fail_tree_roots:
@@ -2757,22 +2908,7 @@ fail_tree_roots:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
fail_sb_buffer:
- btrfs_stop_workers(&fs_info->generic_worker);
- btrfs_stop_workers(&fs_info->readahead_workers);
- btrfs_stop_workers(&fs_info->fixup_workers);
- btrfs_stop_workers(&fs_info->delalloc_workers);
- btrfs_stop_workers(&fs_info->workers);
- btrfs_stop_workers(&fs_info->endio_workers);
- btrfs_stop_workers(&fs_info->endio_meta_workers);
- btrfs_stop_workers(&fs_info->endio_raid56_workers);
- btrfs_stop_workers(&fs_info->rmw_workers);
- btrfs_stop_workers(&fs_info->endio_meta_write_workers);
- btrfs_stop_workers(&fs_info->endio_write_workers);
- btrfs_stop_workers(&fs_info->endio_freespace_worker);
- btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->delayed_workers);
- btrfs_stop_workers(&fs_info->caching_workers);
- btrfs_stop_workers(&fs_info->flush_workers);
+ btrfs_stop_all_workers(fs_info);
fail_alloc:
fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2904,7 +3040,10 @@ static int write_dev_supers(struct btrfs_device *device,
if (wait) {
bh = __find_get_block(device->bdev, bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
- BUG_ON(!bh);
+ if (!bh) {
+ errors++;
+ continue;
+ }
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
errors++;
@@ -2919,7 +3058,7 @@ static int write_dev_supers(struct btrfs_device *device,
btrfs_set_super_bytenr(sb, bytenr);
crc = ~(u32)0;
- crc = btrfs_csum_data(NULL, (char *)sb +
+ crc = btrfs_csum_data((char *)sb +
BTRFS_CSUM_SIZE, crc,
BTRFS_SUPER_INFO_SIZE -
BTRFS_CSUM_SIZE);
@@ -2931,6 +3070,13 @@ static int write_dev_supers(struct btrfs_device *device,
*/
bh = __getblk(device->bdev, bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
+ if (!bh) {
+ printk(KERN_ERR "btrfs: couldn't get super "
+ "buffer head for bytenr %Lu\n", bytenr);
+ errors++;
+ continue;
+ }
+
memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
/* one reference for submit_bh */
@@ -3013,7 +3159,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
* caller
*/
device->flush_bio = NULL;
- bio = bio_alloc(GFP_NOFS, 0);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
if (!bio)
return -ENOMEM;
@@ -3141,7 +3287,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
BTRFS_BLOCK_GROUP_RAID10)) {
num_tolerated_disk_barrier_failures = 1;
} else if (flags &
- BTRFS_BLOCK_GROUP_RAID5) {
+ BTRFS_BLOCK_GROUP_RAID6) {
num_tolerated_disk_barrier_failures = 2;
}
}
@@ -3153,7 +3299,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
return num_tolerated_disk_barrier_failures;
}
-int write_all_supers(struct btrfs_root *root, int max_mirrors)
+static int write_all_supers(struct btrfs_root *root, int max_mirrors)
{
struct list_head *head;
struct btrfs_device *dev;
@@ -3249,7 +3395,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
return ret;
}
-void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+/* Drop a fs root from the radix tree and free it. */
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
{
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
@@ -3280,38 +3428,12 @@ static void free_fs_root(struct btrfs_root *root)
kfree(root->free_ino_ctl);
kfree(root->free_ino_pinned);
kfree(root->name);
- kfree(root);
+ btrfs_put_fs_root(root);
}
-static void del_fs_roots(struct btrfs_fs_info *fs_info)
+void btrfs_free_fs_root(struct btrfs_root *root)
{
- int ret;
- struct btrfs_root *gang[8];
- int i;
-
- while (!list_empty(&fs_info->dead_roots)) {
- gang[0] = list_entry(fs_info->dead_roots.next,
- struct btrfs_root, root_list);
- list_del(&gang[0]->root_list);
-
- if (gang[0]->in_radix) {
- btrfs_free_fs_root(fs_info, gang[0]);
- } else {
- free_extent_buffer(gang[0]->node);
- free_extent_buffer(gang[0]->commit_root);
- kfree(gang[0]);
- }
- }
-
- while (1) {
- ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
- (void **)gang, 0,
- ARRAY_SIZE(gang));
- if (!ret)
- break;
- for (i = 0; i < ret; i++)
- btrfs_free_fs_root(fs_info, gang[i]);
- }
+ free_fs_root(root);
}
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3349,8 +3471,8 @@ int btrfs_commit_super(struct btrfs_root *root)
mutex_lock(&root->fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(root);
- btrfs_clean_old_snapshots(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
+ wake_up_process(root->fs_info->cleaner_kthread);
/* wait until ongoing cleanup work done */
down_write(&root->fs_info->cleanup_work_sem);
@@ -3426,43 +3548,15 @@ int close_ctree(struct btrfs_root *root)
percpu_counter_sum(&fs_info->delalloc_bytes));
}
- free_extent_buffer(fs_info->extent_root->node);
- free_extent_buffer(fs_info->extent_root->commit_root);
- free_extent_buffer(fs_info->tree_root->node);
- free_extent_buffer(fs_info->tree_root->commit_root);
- free_extent_buffer(fs_info->chunk_root->node);
- free_extent_buffer(fs_info->chunk_root->commit_root);
- free_extent_buffer(fs_info->dev_root->node);
- free_extent_buffer(fs_info->dev_root->commit_root);
- free_extent_buffer(fs_info->csum_root->node);
- free_extent_buffer(fs_info->csum_root->commit_root);
- if (fs_info->quota_root) {
- free_extent_buffer(fs_info->quota_root->node);
- free_extent_buffer(fs_info->quota_root->commit_root);
- }
-
btrfs_free_block_groups(fs_info);
+ btrfs_stop_all_workers(fs_info);
+
del_fs_roots(fs_info);
- iput(fs_info->btree_inode);
+ free_root_pointers(fs_info, 1);
- btrfs_stop_workers(&fs_info->generic_worker);
- btrfs_stop_workers(&fs_info->fixup_workers);
- btrfs_stop_workers(&fs_info->delalloc_workers);
- btrfs_stop_workers(&fs_info->workers);
- btrfs_stop_workers(&fs_info->endio_workers);
- btrfs_stop_workers(&fs_info->endio_meta_workers);
- btrfs_stop_workers(&fs_info->endio_raid56_workers);
- btrfs_stop_workers(&fs_info->rmw_workers);
- btrfs_stop_workers(&fs_info->endio_meta_write_workers);
- btrfs_stop_workers(&fs_info->endio_write_workers);
- btrfs_stop_workers(&fs_info->endio_freespace_worker);
- btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->delayed_workers);
- btrfs_stop_workers(&fs_info->caching_workers);
- btrfs_stop_workers(&fs_info->readahead_workers);
- btrfs_stop_workers(&fs_info->flush_workers);
+ iput(fs_info->btree_inode);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3567,18 +3661,13 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
- if (btrfs_super_csum_type(fs_info->super_copy) >= ARRAY_SIZE(btrfs_csum_sizes)) {
- printk(KERN_ERR "btrfs: unsupported checksum algorithm\n");
- return -EINVAL;
- }
-
- if (read_only)
- return 0;
-
+ /*
+ * Placeholder for checks
+ */
return 0;
}
-void btrfs_error_commit_super(struct btrfs_root *root)
+static void btrfs_error_commit_super(struct btrfs_root *root)
{
mutex_lock(&root->fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(root);
@@ -3600,7 +3689,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
INIT_LIST_HEAD(&splice);
mutex_lock(&root->fs_info->ordered_operations_mutex);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
list_splice_init(&t->ordered_operations, &splice);
while (!list_empty(&splice)) {
@@ -3608,11 +3697,14 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
ordered_operations);
list_del_init(&btrfs_inode->ordered_operations);
+ spin_unlock(&root->fs_info->ordered_root_lock);
btrfs_invalidate_inodes(btrfs_inode->root);
+
+ spin_lock(&root->fs_info->ordered_root_lock);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
mutex_unlock(&root->fs_info->ordered_operations_mutex);
}
@@ -3620,15 +3712,36 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct btrfs_ordered_extent *ordered;
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
/*
* This will just short circuit the ordered completion stuff which will
* make sure the ordered extent gets properly cleaned up.
*/
- list_for_each_entry(ordered, &root->fs_info->ordered_extents,
+ list_for_each_entry(ordered, &root->ordered_extents,
root_extent_list)
set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->ordered_extent_lock);
+}
+
+static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->ordered_root_lock);
+ list_splice_init(&fs_info->ordered_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ ordered_root);
+ list_del_init(&root->ordered_root);
+
+ btrfs_destroy_ordered_extents(root);
+
+ cond_resched_lock(&fs_info->ordered_root_lock);
+ }
+ spin_unlock(&fs_info->ordered_root_lock);
}
int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -3650,6 +3763,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
while ((node = rb_first(&delayed_refs->root)) != NULL) {
struct btrfs_delayed_ref_head *head = NULL;
+ bool pin_bytes = false;
ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
atomic_set(&ref->refs, 1);
@@ -3669,6 +3783,8 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
continue;
}
+ if (head->must_insert_reserved)
+ pin_bytes = true;
btrfs_free_delayed_extent_op(head->extent_op);
delayed_refs->num_heads--;
if (list_empty(&head->cluster))
@@ -3679,9 +3795,13 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
- if (head)
- mutex_unlock(&head->mutex);
spin_unlock(&delayed_refs->lock);
+ if (head) {
+ if (pin_bytes)
+ btrfs_pin_extent(root, ref->bytenr,
+ ref->num_bytes, 1);
+ mutex_unlock(&head->mutex);
+ }
btrfs_put_delayed_ref(ref);
cond_resched();
@@ -3718,21 +3838,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
INIT_LIST_HEAD(&splice);
- spin_lock(&root->fs_info->delalloc_lock);
- list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+ spin_lock(&root->delalloc_lock);
+ list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
- btrfs_inode = list_entry(splice.next, struct btrfs_inode,
- delalloc_inodes);
+ btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
+ delalloc_inodes);
list_del_init(&btrfs_inode->delalloc_inodes);
clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&btrfs_inode->runtime_flags);
+ spin_unlock(&root->delalloc_lock);
btrfs_invalidate_inodes(btrfs_inode->root);
+
+ spin_lock(&root->delalloc_lock);
}
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_init(&fs_info->delalloc_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ delalloc_root);
+ list_del_init(&root->delalloc_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ btrfs_destroy_delalloc_inodes(root);
+ btrfs_put_fs_root(root);
+
+ spin_lock(&fs_info->delalloc_root_lock);
+ }
+ spin_unlock(&fs_info->delalloc_root_lock);
}
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3740,13 +3888,9 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
int mark)
{
int ret;
- struct page *page;
- struct inode *btree_inode = root->fs_info->btree_inode;
struct extent_buffer *eb;
u64 start = 0;
u64 end;
- u64 offset;
- unsigned long index;
while (1) {
ret = find_first_extent_bit(dirty_pages, start, &start, &end,
@@ -3756,36 +3900,17 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
while (start <= end) {
- index = start >> PAGE_CACHE_SHIFT;
- start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
- page = find_get_page(btree_inode->i_mapping, index);
- if (!page)
+ eb = btrfs_find_tree_block(root, start,
+ root->leafsize);
+ start += root->leafsize;
+ if (!eb)
continue;
- offset = page_offset(page);
-
- spin_lock(&dirty_pages->buffer_lock);
- eb = radix_tree_lookup(
- &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
- offset >> PAGE_CACHE_SHIFT);
- spin_unlock(&dirty_pages->buffer_lock);
- if (eb)
- ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
- &eb->bflags);
- if (PageWriteback(page))
- end_page_writeback(page);
-
- lock_page(page);
- if (PageDirty(page)) {
- clear_page_dirty_for_io(page);
- spin_lock_irq(&page->mapping->tree_lock);
- radix_tree_tag_clear(&page->mapping->page_tree,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- spin_unlock_irq(&page->mapping->tree_lock);
- }
+ wait_on_extent_buffer_writeback(eb);
- unlock_page(page);
- page_cache_release(page);
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+ &eb->bflags))
+ clear_extent_buffer_dirty(eb);
+ free_extent_buffer_stale(eb);
}
}
@@ -3839,19 +3964,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
cur_trans->dirty_pages.dirty_bytes);
- /* FIXME: cleanup wait for commit */
- cur_trans->in_commit = 1;
- cur_trans->blocked = 1;
+ cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&root->fs_info->transaction_blocked_wait);
btrfs_evict_pending_snapshots(cur_trans);
- cur_trans->blocked = 0;
+ cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&root->fs_info->transaction_wait);
- cur_trans->commit_done = 1;
- wake_up(&cur_trans->commit_wait);
-
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
@@ -3860,13 +3980,16 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
btrfs_destroy_pinned_extent(root,
root->fs_info->pinned_extents);
+ cur_trans->state =TRANS_STATE_COMPLETED;
+ wake_up(&cur_trans->commit_wait);
+
/*
memset(cur_trans, 0, sizeof(*cur_trans));
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
*/
}
-int btrfs_cleanup_transaction(struct btrfs_root *root)
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
{
struct btrfs_transaction *t;
LIST_HEAD(list);
@@ -3875,7 +3998,7 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
spin_lock(&root->fs_info->trans_lock);
list_splice_init(&root->fs_info->trans_list, &list);
- root->fs_info->trans_no_join = 1;
+ root->fs_info->running_transaction = NULL;
spin_unlock(&root->fs_info->trans_lock);
while (!list_empty(&list)) {
@@ -3883,41 +4006,31 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
btrfs_destroy_ordered_operations(t, root);
- btrfs_destroy_ordered_extents(root);
+ btrfs_destroy_all_ordered_extents(root->fs_info);
btrfs_destroy_delayed_refs(t, root);
- btrfs_block_rsv_release(root,
- &root->fs_info->trans_block_rsv,
- t->dirty_pages.dirty_bytes);
-
- /* FIXME: cleanup wait for commit */
- t->in_commit = 1;
- t->blocked = 1;
+ /*
+ * FIXME: cleanup wait for commit
+ * We needn't acquire the lock here: this runs during umount,
+ * so no other task will change it.
+ */
+ t->state = TRANS_STATE_COMMIT_START;
smp_mb();
if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
wake_up(&root->fs_info->transaction_blocked_wait);
btrfs_evict_pending_snapshots(t);
- t->blocked = 0;
+ t->state = TRANS_STATE_UNBLOCKED;
smp_mb();
if (waitqueue_active(&root->fs_info->transaction_wait))
wake_up(&root->fs_info->transaction_wait);
- t->commit_done = 1;
- smp_mb();
- if (waitqueue_active(&t->commit_wait))
- wake_up(&t->commit_wait);
-
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
- btrfs_destroy_delalloc_inodes(root);
-
- spin_lock(&root->fs_info->trans_lock);
- root->fs_info->running_transaction = NULL;
- spin_unlock(&root->fs_info->trans_lock);
+ btrfs_destroy_all_delalloc_inodes(root->fs_info);
btrfs_destroy_marked_extents(root, &t->dirty_pages,
EXTENT_DIRTY);
@@ -3925,15 +4038,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
btrfs_destroy_pinned_extent(root,
root->fs_info->pinned_extents);
+ t->state = TRANS_STATE_COMPLETED;
+ smp_mb();
+ if (waitqueue_active(&t->commit_wait))
+ wake_up(&t->commit_wait);
+
atomic_set(&t->use_count, 0);
list_del_init(&t->list);
memset(t, 0, sizeof(*t));
kmem_cache_free(btrfs_transaction_cachep, t);
}
- spin_lock(&root->fs_info->trans_lock);
- root->fs_info->trans_no_join = 0;
- spin_unlock(&root->fs_info->trans_lock);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
return 0;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 034d7dc552b2..b71acd6e1e5b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -61,23 +61,48 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_commit_super(struct btrfs_root *root);
-void btrfs_error_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
- struct btrfs_key *location);
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+ struct btrfs_key *location);
+int btrfs_init_fs_root(struct btrfs_root *root);
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
struct btrfs_key *location);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_root *root);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
-void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
+void btrfs_free_fs_root(struct btrfs_root *root);
+
+/*
+ * This function grabs a reference on the root so that it is not freed
+ * while we access it. But it doesn't ensure that the tree is not dropped.
+ *
+ * If you want to ensure the whole tree is safe, you should use
+ * fs_info->subvol_srcu
+ */
+static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
+{
+ if (atomic_inc_not_zero(&root->refs))
+ return root;
+ return NULL;
+}
+
+static inline void btrfs_put_fs_root(struct btrfs_root *root)
+{
+ if (atomic_dec_and_test(&root->refs))
+ kfree(root);
+}
+
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
-u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
+u32 btrfs_csum_data(char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, char *result);
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata);
@@ -93,10 +118,8 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_cleanup_transaction(struct btrfs_root *root);
void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
struct btrfs_root *root);
-void btrfs_abort_devices(struct btrfs_root *root);
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
u64 objectid);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 81ee29eeb7ca..4b8691607373 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
goto fail;
}
- if (btrfs_root_refs(&root->root_item) == 0) {
- err = -ENOENT;
- goto fail;
- }
-
key.objectid = objectid;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d551231caba..0236de711989 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -24,6 +24,7 @@
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
+#include <linux/percpu_counter.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
@@ -105,6 +106,8 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 num_bytes, int reserve);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int reserved);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -270,9 +273,27 @@ static int exclude_super_stripes(struct btrfs_root *root,
return ret;
while (nr--) {
- cache->bytes_super += stripe_len;
- ret = add_excluded_extent(root, logical[nr],
- stripe_len);
+ u64 start, len;
+
+ if (logical[nr] > cache->key.objectid +
+ cache->key.offset)
+ continue;
+
+ if (logical[nr] + stripe_len <= cache->key.objectid)
+ continue;
+
+ start = logical[nr];
+ if (start < cache->key.objectid) {
+ start = cache->key.objectid;
+ len = (logical[nr] + stripe_len) - start;
+ } else {
+ len = min_t(u64, stripe_len,
+ cache->key.objectid +
+ cache->key.offset - start);
+ }
+
+ cache->bytes_super += len;
+ ret = add_excluded_extent(root, start, len);
if (ret) {
kfree(logical);
return ret;
@@ -419,8 +440,7 @@ again:
if (ret)
break;
- if (need_resched() ||
- btrfs_next_leaf(extent_root, path)) {
+ if (need_resched()) {
caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->extent_commit_sem);
@@ -428,6 +448,12 @@ again:
cond_resched();
goto again;
}
+
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ break;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
continue;
@@ -442,11 +468,16 @@ again:
block_group->key.offset)
break;
- if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+ if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY) {
total_found += add_new_free_space(block_group,
fs_info, last,
key.objectid);
- last = key.objectid + key.offset;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ last = key.objectid +
+ fs_info->tree_root->leafsize;
+ else
+ last = key.objectid + key.offset;
if (total_found > (1024 * 1024 * 2)) {
total_found = 0;
@@ -656,55 +687,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
rcu_read_unlock();
}
-u64 btrfs_find_block_group(struct btrfs_root *root,
- u64 search_start, u64 search_hint, int owner)
-{
- struct btrfs_block_group_cache *cache;
- u64 used;
- u64 last = max(search_hint, search_start);
- u64 group_start = 0;
- int full_search = 0;
- int factor = 9;
- int wrapped = 0;
-again:
- while (1) {
- cache = btrfs_lookup_first_block_group(root->fs_info, last);
- if (!cache)
- break;
-
- spin_lock(&cache->lock);
- last = cache->key.objectid + cache->key.offset;
- used = btrfs_block_group_used(&cache->item);
-
- if ((full_search || !cache->ro) &&
- block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
- if (used + cache->pinned + cache->reserved <
- div_factor(cache->key.offset, factor)) {
- group_start = cache->key.objectid;
- spin_unlock(&cache->lock);
- btrfs_put_block_group(cache);
- goto found;
- }
- }
- spin_unlock(&cache->lock);
- btrfs_put_block_group(cache);
- cond_resched();
- }
- if (!wrapped) {
- last = search_start;
- wrapped = 1;
- goto again;
- }
- if (!full_search && factor < 10) {
- last = search_start;
- full_search = 1;
- factor = 10;
- goto again;
- }
-found:
- return group_start;
-}
-
/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
@@ -718,15 +700,21 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
key.objectid = start;
key.offset = len;
- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+ key.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
0, 0);
+ if (ret > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == start &&
+ key.type == BTRFS_METADATA_ITEM_KEY)
+ ret = 0;
+ }
btrfs_free_path(path);
return ret;
}
/*
- * helper function to lookup reference count and flags of extent.
+ * helper function to lookup reference count and flags of a tree block.
*
* the head node for delayed ref is used to store the sum of all the
* reference count modifications queued up in the rbtree. the head
@@ -736,7 +724,7 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
*/
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *refs, u64 *flags)
+ u64 offset, int metadata, u64 *refs, u64 *flags)
{
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -749,13 +737,29 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
u64 extent_flags;
int ret;
+ /*
+ * If we don't have skinny metadata, don't bother doing anything
+ * different
+ */
+ if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
+ offset = root->leafsize;
+ metadata = 0;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = num_bytes;
+ if (metadata) {
+ key.objectid = bytenr;
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = offset;
+ } else {
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = offset;
+ }
+
if (!trans) {
path->skip_locking = 1;
path->search_commit_root = 1;
@@ -766,6 +770,13 @@ again:
if (ret < 0)
goto out_free;
+ if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = root->leafsize;
+ btrfs_release_path(path);
+ goto again;
+ }
+
if (ret == 0) {
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -1001,7 +1012,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
return ret;
BUG_ON(ret); /* Corruption */
- btrfs_extend_item(trans, root, path, new_size);
+ btrfs_extend_item(root, path, new_size);
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1453,6 +1464,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
int want;
int ret;
int err = 0;
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -1464,11 +1477,46 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
path->keep_locks = 1;
} else
extra_size = -1;
+
+ /*
+ * Owner is our parent level, so we can just add one to get the level
+ * for the block we are interested in.
+ */
+ if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = owner;
+ }
+
+again:
ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
if (ret < 0) {
err = ret;
goto out;
}
+
+ /*
+ * We may be a newly converted file system which still has the old fat
+ * extent entries for metadata, so try and see if we have one of those.
+ */
+ if (ret > 0 && skinny_metadata) {
+ skinny_metadata = false;
+ if (path->slots[0]) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == num_bytes)
+ ret = 0;
+ }
+ if (ret) {
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
+
if (ret && !insert) {
err = -ENOENT;
goto out;
@@ -1504,11 +1552,9 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ei + 1);
end = (unsigned long)ei + item_size;
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
ptr += sizeof(struct btrfs_tree_block_info);
BUG_ON(ptr > end);
- } else {
- BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
}
err = -ENOENT;
@@ -1590,8 +1636,7 @@ out:
* helper to add new inline back ref
*/
static noinline_for_stack
-void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+void setup_inline_extent_backref(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
u64 parent, u64 root_objectid,
@@ -1614,7 +1659,7 @@ void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
type = extent_ref_type(parent, owner);
size = btrfs_extent_inline_ref_size(type);
- btrfs_extend_item(trans, root, path, size);
+ btrfs_extend_item(root, path, size);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
@@ -1683,8 +1728,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
* helper to update/remove inline back ref
*/
static noinline_for_stack
-void update_inline_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+void update_inline_extent_backref(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_mod,
@@ -1740,7 +1784,7 @@ void update_inline_extent_backref(struct btrfs_trans_handle *trans,
memmove_extent_buffer(leaf, ptr, ptr + size,
end - ptr - size);
item_size -= size;
- btrfs_truncate_item(trans, root, path, item_size, 1);
+ btrfs_truncate_item(root, path, item_size, 1);
}
btrfs_mark_buffer_dirty(leaf);
}
@@ -1762,10 +1806,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
root_objectid, owner, offset, 1);
if (ret == 0) {
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
- update_inline_extent_backref(trans, root, path, iref,
+ update_inline_extent_backref(root, path, iref,
refs_to_add, extent_op);
} else if (ret == -ENOENT) {
- setup_inline_extent_backref(trans, root, path, iref, parent,
+ setup_inline_extent_backref(root, path, iref, parent,
root_objectid, owner, offset,
refs_to_add, extent_op);
ret = 0;
@@ -1802,7 +1846,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
BUG_ON(!is_data && refs_to_drop != 1);
if (iref) {
- update_inline_extent_backref(trans, root, path, iref,
+ update_inline_extent_backref(root, path, iref,
-refs_to_drop, NULL);
} else if (is_data) {
ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
@@ -1973,10 +2017,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
ref_root = ref->root;
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
- if (extent_op) {
- BUG_ON(extent_op->update_key);
+ if (extent_op)
flags |= extent_op->flags_to_set;
- }
ret = alloc_reserved_file_extent(trans, root,
parent, ref_root, flags,
ref->objectid, ref->offset,
@@ -2029,18 +2071,29 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
u32 item_size;
int ret;
int err = 0;
+ int metadata = !extent_op->is_data;
if (trans->aborted)
return 0;
+ if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+ metadata = 0;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = node->bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = node->num_bytes;
+ if (metadata) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = extent_op->level;
+ } else {
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = node->num_bytes;
+ }
+
+again:
path->reada = 1;
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
@@ -2050,6 +2103,14 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
goto out;
}
if (ret > 0) {
+ if (metadata) {
+ btrfs_release_path(path);
+ metadata = 0;
+
+ key.offset = node->num_bytes;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ goto again;
+ }
err = -EIO;
goto out;
}
@@ -2089,10 +2150,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_key ins;
u64 parent = 0;
u64 ref_root = 0;
-
- ins.objectid = node->bytenr;
- ins.offset = node->num_bytes;
- ins.type = BTRFS_EXTENT_ITEM_KEY;
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
ref = btrfs_delayed_node_to_tree_ref(node);
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
@@ -2100,10 +2159,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
else
ref_root = ref->root;
+ ins.objectid = node->bytenr;
+ if (skinny_metadata) {
+ ins.offset = ref->level;
+ ins.type = BTRFS_METADATA_ITEM_KEY;
+ } else {
+ ins.offset = node->num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ }
+
BUG_ON(node->ref_mod != 1);
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
- BUG_ON(!extent_op || !extent_op->update_flags ||
- !extent_op->update_key);
+ BUG_ON(!extent_op || !extent_op->update_flags);
ret = alloc_reserved_tree_block(trans, root,
parent, ref_root,
extent_op->flags_to_set,
@@ -2307,9 +2374,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
- printk(KERN_DEBUG
- "btrfs: run_delayed_extent_op "
- "returned %d\n", ret);
+ btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
spin_lock(&delayed_refs->lock);
btrfs_delayed_ref_unlock(locked_ref);
return ret;
@@ -2348,8 +2413,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_delayed_ref_unlock(locked_ref);
btrfs_put_delayed_ref(ref);
- printk(KERN_DEBUG
- "btrfs: run_one_delayed_ref returned %d\n", ret);
+ btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
spin_lock(&delayed_refs->lock);
return ret;
}
@@ -2426,9 +2490,11 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
if (list_empty(&trans->qgroup_ref_list) !=
!trans->delayed_ref_elem.seq) {
/* list without seq or seq without list */
- printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
+ btrfs_err(fs_info,
+ "qgroup accounting update error, list is%s empty, seq is %#x.%x",
list_empty(&trans->qgroup_ref_list) ? "" : " not",
- trans->delayed_ref_elem.seq);
+ (u32)(trans->delayed_ref_elem.seq >> 32),
+ (u32)trans->delayed_ref_elem.seq);
BUG();
}
@@ -2461,6 +2527,51 @@ static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
return 0;
}
+/*
+ * Estimate how many leaves are needed to hold the extent items for @heads
+ * delayed ref heads.
+ *
+ * Each head is costed as one btrfs_extent_item plus one
+ * btrfs_extent_inline_ref, plus one btrfs_tree_block_info when the
+ * SKINNY_METADATA incompat bit is not set.  The byte total is divided by
+ * BTRFS_LEAF_DATA_SIZE(root) with div64_u64(), so a trailing partial leaf
+ * is not counted and small head counts yield 0.
+ */
+static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
+{
+ u64 num_bytes;
+
+ num_bytes = heads * (sizeof(struct btrfs_extent_item) +
+ sizeof(struct btrfs_extent_inline_ref));
+ if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+ num_bytes += heads * sizeof(struct btrfs_tree_block_info);
+
+ /*
+ * We don't ever fill up leaves all the way so multiply by 2 just to be
+ * closer to what we're really going to want to use.
+ *
+ * NOTE(review): nothing is multiplied by 2 in this function; the only
+ * doubling visible nearby is "num_bytes <<= 1" in
+ * btrfs_should_throttle_delayed_refs().  Confirm whether the doubling
+ * was meant to happen here as well.
+ */
+ return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
+
+/*
+ * Decide whether the caller should stop and process delayed refs.
+ *
+ * The number of ready delayed ref heads in @trans's transaction is turned
+ * into a space estimate: btrfs_calc_trans_metadata_size(root, 1), plus one
+ * leafsize for every estimated leaf beyond the first, then doubled.  The
+ * estimate is doubled once more when the global reserve's space_info is
+ * marked full.
+ *
+ * Returns 1 if the global block reserve currently holds no more than that
+ * estimate, 0 otherwise.
+ */
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_rsv *global_rsv;
+ u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
+ u64 num_bytes;
+ int ret = 0;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ /* From here on num_heads holds a leaf count, not a head count. */
+ num_heads = heads_to_leaves(root, num_heads);
+ if (num_heads > 1)
+ num_bytes += (num_heads - 1) * root->leafsize;
+ num_bytes <<= 1;
+ global_rsv = &root->fs_info->global_block_rsv;
+
+ /*
+ * If we can't allocate any more chunks lets make sure we have _lots_ of
+ * wiggle room since running delayed refs can create more delayed refs.
+ */
+ if (global_rsv->space_info->full)
+ num_bytes <<= 1;
+
+ /* Only the read of ->reserved is done under the reserve's lock. */
+ spin_lock(&global_rsv->lock);
+ if (global_rsv->reserved <= num_bytes)
+ ret = 1;
+ spin_unlock(&global_rsv->lock);
+ return ret;
+}
+
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
@@ -2508,7 +2619,8 @@ progress:
old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
if (old) {
DEFINE_WAIT(__wait);
- if (delayed_refs->num_entries < 16348)
+ if (delayed_refs->flushing ||
+ !btrfs_should_throttle_delayed_refs(trans, root))
return 0;
prepare_to_wait(&delayed_refs->wait, &__wait,
@@ -2543,7 +2655,7 @@ again:
while (1) {
if (!(run_all || run_most) &&
- delayed_refs->num_heads_ready < 64)
+ !btrfs_should_throttle_delayed_refs(trans, root))
break;
/*
@@ -2564,6 +2676,7 @@ again:
spin_unlock(&delayed_refs->lock);
btrfs_abort_transaction(trans, root, ret);
atomic_dec(&delayed_refs->procs_running_refs);
+ wake_up(&delayed_refs->wait);
return ret;
}
@@ -2650,7 +2763,7 @@ out:
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
- int is_data)
+ int level, int is_data)
{
struct btrfs_delayed_extent_op *extent_op;
int ret;
@@ -2663,6 +2776,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->update_flags = 1;
extent_op->update_key = 0;
extent_op->is_data = is_data ? 1 : 0;
+ extent_op->level = level;
ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
num_bytes, extent_op);
@@ -3040,6 +3154,11 @@ again:
WARN_ON(ret);
if (i_size_read(inode) > 0) {
+ ret = btrfs_check_trunc_cache_free_space(root,
+ &root->fs_info->global_block_rsv);
+ if (ret)
+ goto out_put;
+
ret = btrfs_truncate_free_space_cache(root, trans, path,
inode);
if (ret)
@@ -3239,6 +3358,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
struct btrfs_space_info *found;
int i;
int factor;
+ int ret;
if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -3262,6 +3382,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
if (!found)
return -ENOMEM;
+ ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+ if (ret) {
+ kfree(found);
+ return ret;
+ }
+
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&found->block_groups[i]);
init_rwsem(&found->groups_sem);
@@ -3337,7 +3463,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
* progress (either running or paused) picks the target profile (if it's
* already available), otherwise falls back to plain reducing.
*/
-u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
+static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
/*
* we add in the count of missing devices because we want
@@ -3494,10 +3620,11 @@ alloc:
}
/*
- * If we have less pinned bytes than we want to allocate then
- * don't bother committing the transaction, it won't help us.
+ * If we don't have enough pinned space to deal with this
+ * allocation don't bother committing the transaction.
*/
- if (data_sinfo->bytes_pinned < bytes)
+ if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
+ bytes) < 0)
committed = 1;
spin_unlock(&data_sinfo->lock);
@@ -3506,6 +3633,7 @@ commit_trans:
if (!committed &&
!atomic_read(&root->fs_info->open_ioctl_trans)) {
committed = 1;
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -3538,6 +3666,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
+ WARN_ON(data_sinfo->bytes_may_use < bytes);
data_sinfo->bytes_may_use -= bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
data_sinfo->flags, bytes, 0);
@@ -3557,6 +3686,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
rcu_read_unlock();
}
+/*
+ * Return twice the size of the @global block reserve.
+ *
+ * This helper does not take global->lock itself; callers that need a
+ * stable ->size must hold the lock around the call.
+ */
+static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
+{
+ return (global->size << 1);
+}
+
static int should_alloc_chunk(struct btrfs_root *root,
struct btrfs_space_info *sinfo, int force)
{
@@ -3574,7 +3708,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
* global_rsv, it doesn't change except when the transaction commits.
*/
if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
- num_allocated += global_rsv->size;
+ num_allocated += calc_global_rsv_need_space(global_rsv);
/*
* in limited mode, we want to have some free space up to
@@ -3627,8 +3761,8 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
thresh = get_system_chunk_thresh(root, type);
if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
- printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
- left, thresh, type);
+ btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
+ left, thresh, type);
dump_space_info(info, 0, 0);
}
@@ -3746,7 +3880,7 @@ static int can_overcommit(struct btrfs_root *root,
{
struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
u64 profile = btrfs_get_alloc_profile(root, 0);
- u64 rsv_size = 0;
+ u64 space_size;
u64 avail;
u64 used;
u64 to_add;
@@ -3754,18 +3888,16 @@ static int can_overcommit(struct btrfs_root *root,
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly;
- spin_lock(&global_rsv->lock);
- rsv_size = global_rsv->size;
- spin_unlock(&global_rsv->lock);
-
/*
* We only want to allow over committing if we have lots of actual space
* free, but if we don't have enough space to handle the global reserve
* space then we could end up having a real enospc problem when trying
* to allocate a chunk or some other such important allocation.
*/
- rsv_size <<= 1;
- if (used + rsv_size >= space_info->total_bytes)
+ spin_lock(&global_rsv->lock);
+ space_size = calc_global_rsv_need_space(global_rsv);
+ spin_unlock(&global_rsv->lock);
+ if (used + space_size >= space_info->total_bytes)
return 0;
used += space_info->bytes_may_use;
@@ -3808,16 +3940,15 @@ static int can_overcommit(struct btrfs_root *root,
return 0;
}
-void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
- unsigned long nr_pages)
+static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
+ unsigned long nr_pages)
{
struct super_block *sb = root->fs_info->sb;
- int started;
- /* If we can not start writeback, just sync all the delalloc file. */
- started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
- WB_REASON_FS_FREE_SPACE);
- if (!started) {
+ if (down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
+ up_read(&sb->s_umount);
+ } else {
/*
* We needn't worry the filesystem going from r/w to r/o though
* we don't acquire ->s_umount mutex, because the filesystem
@@ -3825,8 +3956,9 @@ void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
* the filesystem is readonly(all dirty pages are written to
* the disk).
*/
- btrfs_start_delalloc_inodes(root, 0);
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+ if (!current->journal_info)
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
}
}
@@ -3856,7 +3988,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
if (delalloc_bytes == 0) {
if (trans)
return;
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
return;
}
@@ -3884,7 +4016,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
loops++;
if (wait_ordered && !trans) {
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
} else {
time_left = schedule_timeout_killable(1);
if (time_left)
@@ -3922,7 +4054,8 @@ static int may_commit_transaction(struct btrfs_root *root,
/* See if there is enough pinned space to make this reservation */
spin_lock(&space_info->lock);
- if (space_info->bytes_pinned >= bytes) {
+ if (percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes) >= 0) {
spin_unlock(&space_info->lock);
goto commit;
}
@@ -3937,7 +4070,8 @@ static int may_commit_transaction(struct btrfs_root *root,
spin_lock(&space_info->lock);
spin_lock(&delayed_rsv->lock);
- if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
+ if (percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes - delayed_rsv->size) >= 0) {
spin_unlock(&delayed_rsv->lock);
spin_unlock(&space_info->lock);
return -ENOSPC;
@@ -4222,6 +4356,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
spin_unlock(&block_rsv->lock);
}
+/*
+ * Move @num_bytes of reservation from the global block reserve to @dest,
+ * but only if the global reserve keeps at least
+ * div_factor(global_rsv->size, @min_factor) bytes reserved afterwards.
+ *
+ * NOTE(review): div_factor() is not defined in this hunk; presumably
+ * @min_factor is expressed in tenths of the reserve size - confirm.
+ *
+ * Returns 0 on success.  Returns -ENOSPC when @dest does not share the
+ * global reserve's space_info or when the global reserve does not hold
+ * enough bytes; nothing is moved in either case.
+ */
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 min_bytes;
+
+ if (global_rsv->space_info != dest->space_info)
+ return -ENOSPC;
+
+ spin_lock(&global_rsv->lock);
+ min_bytes = div_factor(global_rsv->size, min_factor);
+ if (global_rsv->reserved < min_bytes + num_bytes) {
+ spin_unlock(&global_rsv->lock);
+ return -ENOSPC;
+ }
+ global_rsv->reserved -= num_bytes;
+ /* The reserve is no longer fully funded once it drops below ->size. */
+ if (global_rsv->reserved < global_rsv->size)
+ global_rsv->full = 0;
+ spin_unlock(&global_rsv->lock);
+
+ /* @dest is credited only after the global reserve's lock is dropped. */
+ block_rsv_add_bytes(dest, num_bytes, 1);
+ return 0;
+}
+
static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes)
@@ -4489,6 +4648,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+ if (fs_info->quota_root)
+ fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
update_global_block_rsv(fs_info);
@@ -4953,14 +5114,14 @@ static int update_block_group(struct btrfs_root *root,
int factor;
/* block accounting for super block */
- spin_lock(&info->delalloc_lock);
+ spin_lock(&info->delalloc_root_lock);
old_val = btrfs_super_bytes_used(info->super_copy);
if (alloc)
old_val += num_bytes;
else
old_val -= num_bytes;
btrfs_set_super_bytes_used(info->super_copy, old_val);
- spin_unlock(&info->delalloc_lock);
+ spin_unlock(&info->delalloc_root_lock);
while (total) {
cache = btrfs_lookup_block_group(info, bytenr);
@@ -5090,9 +5251,11 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
u64 bytenr, u64 num_bytes)
{
struct btrfs_block_group_cache *cache;
+ int ret;
cache = btrfs_lookup_block_group(root->fs_info, bytenr);
- BUG_ON(!cache); /* Logic error */
+ if (!cache)
+ return -EINVAL;
/*
* pull in the free space cache (if any) so that our pin
@@ -5105,8 +5268,82 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
pin_down_extent(root, cache, bytenr, num_bytes, 0);
/* remove us from the free space cache (if we're there at all) */
- btrfs_remove_free_space(cache, bytenr, num_bytes);
+ ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
btrfs_put_block_group(cache);
+ return ret;
+}
+
+/*
+ * Keep the allocator away from [start, start + num_bytes).
+ *
+ * Caching of the owning block group is kicked off with
+ * cache_block_group().  If there is no caching control the group must
+ * already be fully cached and the range is simply removed from the free
+ * space cache.  Otherwise, under caching_ctl->mutex, the part of the range
+ * below caching_ctl->progress is removed from the free space cache and the
+ * part at or above it is registered with add_excluded_extent().
+ *
+ * Returns -EINVAL if no block group contains @start, otherwise the result
+ * of the last btrfs_remove_free_space()/add_excluded_extent() call made.
+ */
+static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
+{
+ int ret;
+ u64 cached_len;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_caching_control *caching_ctl;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, start);
+ if (!block_group)
+ return -EINVAL;
+
+ cache_block_group(block_group, 0);
+ caching_ctl = get_caching_control(block_group);
+
+ if (!caching_ctl) {
+ /* Logic error */
+ BUG_ON(!block_group_cache_done(block_group));
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+ } else {
+ mutex_lock(&caching_ctl->mutex);
+
+ if (start >= caching_ctl->progress) {
+ ret = add_excluded_extent(root, start, num_bytes);
+ } else if (start + num_bytes <= caching_ctl->progress) {
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ } else {
+ /*
+ * The range straddles ->progress.  Keep the length of
+ * the already-cached front part in its own variable:
+ * overwriting num_bytes with it made the tail length
+ * computed below always come out as zero, so the part
+ * past ->progress was never excluded.
+ */
+ cached_len = caching_ctl->progress - start;
+ ret = btrfs_remove_free_space(block_group,
+ start, cached_len);
+ if (ret)
+ goto out_lock;
+
+ num_bytes -= cached_len;
+ start = caching_ctl->progress;
+ ret = add_excluded_extent(root, start, num_bytes);
+ }
+out_lock:
+ mutex_unlock(&caching_ctl->mutex);
+ put_caching_control(caching_ctl);
+ }
+ btrfs_put_block_group(block_group);
+ return ret;
+}
+
+/*
+ * Exclude every on-disk data extent referenced from leaf @eb of a log tree.
+ *
+ * Nothing is done unless the MIXED_GROUPS incompat bit is set.  For each
+ * BTRFS_EXTENT_DATA_KEY item that is not an inline extent and has a
+ * non-zero disk bytenr, the range (disk_bytenr, disk_num_bytes) is handed
+ * to __exclude_logged_extent().
+ *
+ * Returns 0 on success, or the first error reported by
+ * __exclude_logged_extent(); the walk stops at that item.
+ */
+int btrfs_exclude_logged_extents(struct btrfs_root *log,
+ struct extent_buffer *eb)
+{
+ struct btrfs_file_extent_item *item;
+ struct btrfs_key key;
+ int found_type;
+ int ret;
+ int i;
+
+ if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
+ return 0;
+
+ for (i = 0; i < btrfs_header_nritems(eb); i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(eb, item);
+ if (found_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+ continue;
+ /* key is reused as scratch space for the extent's disk range. */
+ key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+ key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+ /* Do not silently drop a failed exclusion. */
+ ret = __exclude_logged_extent(log, key.objectid, key.offset);
+ if (ret)
+ return ret;
+ }
+
return 0;
}
@@ -5172,6 +5409,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_caching_control *next;
struct btrfs_caching_control *caching_ctl;
struct btrfs_block_group_cache *cache;
+ struct btrfs_space_info *space_info;
down_write(&fs_info->extent_commit_sem);
@@ -5194,6 +5432,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
up_write(&fs_info->extent_commit_sem);
+ list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
+ percpu_counter_set(&space_info->total_bytes_pinned, 0);
+
update_global_block_rsv(fs_info);
}
@@ -5291,6 +5532,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
return 0;
}
+/*
+ * Adjust the total_bytes_pinned counter of the space_info that an extent
+ * belongs to.
+ *
+ * The space_info is chosen from @owner and @root_objectid: an @owner below
+ * BTRFS_FIRST_FREE_OBJECTID selects SYSTEM when @root_objectid is the chunk
+ * tree and METADATA otherwise; any other @owner selects DATA.
+ *
+ * NOTE(review): @num_bytes is a u64, yet __btrfs_free_extent() passes
+ * "-num_bytes" to subtract from the counter.  This presumably relies on
+ * percpu_counter_add() taking a signed amount - confirm.
+ */
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
+ u64 owner, u64 root_objectid)
+{
+ struct btrfs_space_info *space_info;
+ u64 flags;
+
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ } else {
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ }
+
+ space_info = __find_space_info(fs_info, flags);
+ BUG_ON(!space_info); /* Logic bug */
+ percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
+}
+
+
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -5312,6 +5574,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int num_to_del = 1;
u32 item_size;
u64 refs;
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
path = btrfs_alloc_path();
if (!path)
@@ -5323,6 +5587,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
BUG_ON(!is_data && refs_to_drop != 1);
+ if (is_data)
+ skinny_metadata = 0;
+
ret = lookup_extent_backref(trans, extent_root, path, &iref,
bytenr, num_bytes, parent,
root_objectid, owner_objectid,
@@ -5339,6 +5606,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
found_extent = 1;
break;
}
+ if (key.type == BTRFS_METADATA_ITEM_KEY &&
+ key.offset == owner_objectid) {
+ found_extent = 1;
+ break;
+ }
if (path->slots[0] - extent_slot > 5)
break;
extent_slot--;
@@ -5364,12 +5636,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
+ if (!is_data && skinny_metadata) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = owner_objectid;
+ }
+
ret = btrfs_search_slot(trans, extent_root,
&key, path, -1, 1);
+ if (ret > 0 && skinny_metadata && path->slots[0]) {
+ /*
+ * Couldn't find our skinny metadata item,
+ * see if we have ye olde extent item.
+ */
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == num_bytes)
+ ret = 0;
+ }
+
+ if (ret > 0 && skinny_metadata) {
+ skinny_metadata = false;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, extent_root,
+ &key, path, -1, 1);
+ }
+
if (ret) {
- printk(KERN_ERR "umm, got %d back from search"
- ", was looking for %llu\n", ret,
- (unsigned long long)bytenr);
+ btrfs_err(info, "umm, got %d back from search, was looking for %llu",
+ ret, (unsigned long long)bytenr);
if (ret > 0)
btrfs_print_leaf(extent_root,
path->nodes[0]);
@@ -5383,13 +5682,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
} else if (ret == -ENOENT) {
btrfs_print_leaf(extent_root, path->nodes[0]);
WARN_ON(1);
- printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
- "parent %llu root %llu owner %llu offset %llu\n",
- (unsigned long long)bytenr,
- (unsigned long long)parent,
- (unsigned long long)root_objectid,
- (unsigned long long)owner_objectid,
- (unsigned long long)owner_offset);
+ btrfs_err(info,
+ "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
+ (unsigned long long)bytenr,
+ (unsigned long long)parent,
+ (unsigned long long)root_objectid,
+ (unsigned long long)owner_objectid,
+ (unsigned long long)owner_offset);
} else {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5417,9 +5716,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, extent_root, &key, path,
-1, 1);
if (ret) {
- printk(KERN_ERR "umm, got %d back from search"
- ", was looking for %llu\n", ret,
- (unsigned long long)bytenr);
+ btrfs_err(info, "umm, got %d back from search, was looking for %llu",
+ ret, (unsigned long long)bytenr);
btrfs_print_leaf(extent_root, path->nodes[0]);
}
if (ret < 0) {
@@ -5435,7 +5733,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(leaf, extent_slot,
struct btrfs_extent_item);
- if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+ if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
+ key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
bi = (struct btrfs_tree_block_info *)(ei + 1);
@@ -5443,7 +5742,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
refs = btrfs_extent_refs(leaf, ei);
- BUG_ON(refs < refs_to_drop);
+ if (refs < refs_to_drop) {
+ btrfs_err(info, "trying to drop %d refs but we only have %Lu "
+ "for bytenr %Lu\n", refs_to_drop, refs, bytenr);
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
refs -= refs_to_drop;
if (refs > 0) {
@@ -5468,6 +5773,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
}
+ add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
+ root_objectid);
} else {
if (found_extent) {
BUG_ON(is_data && refs_to_drop !=
@@ -5591,6 +5898,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
u64 parent, int last_ref)
{
struct btrfs_block_group_cache *cache = NULL;
+ int pin = 1;
int ret;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -5623,8 +5931,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_add_free_space(cache, buf->start, buf->len);
btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+ pin = 0;
}
out:
+ if (pin)
+ add_pinned_bytes(root->fs_info, buf->len,
+ btrfs_header_level(buf),
+ root->root_key.objectid);
+
/*
* Deleting the buffer, clear the corrupt flag since it doesn't matter
* anymore.
@@ -5641,6 +5955,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
+ add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
+
/*
* tree log blocks never actually go into the extent allocation
* tree, just update pinning info and exit early.
@@ -5758,7 +6074,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *orig_root,
u64 num_bytes, u64 empty_size,
u64 hint_byte, struct btrfs_key *ins,
- u64 data)
+ u64 flags)
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -5769,8 +6085,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
int empty_cluster = 2 * 1024 * 1024;
struct btrfs_space_info *space_info;
int loop = 0;
- int index = __get_raid_index(data);
- int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
+ int index = __get_raid_index(flags);
+ int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
bool found_uncached_bg = false;
bool failed_cluster_refill = false;
@@ -5783,11 +6099,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(orig_root, num_bytes, empty_size, data);
+ trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
- space_info = __find_space_info(root->fs_info, data);
+ space_info = __find_space_info(root->fs_info, flags);
if (!space_info) {
- printk(KERN_ERR "No space info for %llu\n", data);
+ btrfs_err(root->fs_info, "No space info for %llu", flags);
return -ENOSPC;
}
@@ -5798,13 +6114,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
if (btrfs_mixed_space_info(space_info))
use_cluster = false;
- if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
+ if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
last_ptr = &root->fs_info->meta_alloc_cluster;
if (!btrfs_test_opt(root, SSD))
empty_cluster = 64 * 1024;
}
- if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
+ if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
btrfs_test_opt(root, SSD)) {
last_ptr = &root->fs_info->data_alloc_cluster;
}
@@ -5833,7 +6149,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
* However if we are re-searching with an ideal block group
* picked out then we don't care that the block group is cached.
*/
- if (block_group && block_group_bits(block_group, data) &&
+ if (block_group && block_group_bits(block_group, flags) &&
block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
@@ -5871,7 +6187,7 @@ search:
* raid types, but we want to make sure we only allocate
* for the proper type.
*/
- if (!block_group_bits(block_group, data)) {
+ if (!block_group_bits(block_group, flags)) {
u64 extra = BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID5 |
@@ -5883,7 +6199,7 @@ search:
* doesn't provide them, bail. This does allow us to
* fill raid0 from raid1.
*/
- if ((data & extra) && !(block_group->flags & extra))
+ if ((flags & extra) && !(block_group->flags & extra))
goto loop;
}
@@ -5914,7 +6230,7 @@ have_block_group:
if (used_block_group != block_group &&
(!used_block_group ||
used_block_group->ro ||
- !block_group_bits(used_block_group, data))) {
+ !block_group_bits(used_block_group, flags))) {
used_block_group = block_group;
goto refill_cluster;
}
@@ -6110,7 +6426,7 @@ loop:
index = 0;
loop++;
if (loop == LOOP_ALLOC_CHUNK) {
- ret = do_chunk_alloc(trans, root, data,
+ ret = do_chunk_alloc(trans, root, flags,
CHUNK_ALLOC_FORCE);
/*
* Do not bail out on ENOSPC since we
@@ -6188,16 +6504,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, u64 data)
+ struct btrfs_key *ins, int is_data)
{
bool final_tried = false;
+ u64 flags;
int ret;
- data = btrfs_get_alloc_profile(root, data);
+ flags = btrfs_get_alloc_profile(root, is_data);
again:
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(trans, root, num_bytes, empty_size,
- hint_byte, ins, data);
+ hint_byte, ins, flags);
if (ret == -ENOSPC) {
if (!final_tried) {
@@ -6210,10 +6527,10 @@ again:
} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
struct btrfs_space_info *sinfo;
- sinfo = __find_space_info(root->fs_info, data);
- printk(KERN_ERR "btrfs allocation failed flags %llu, "
- "wanted %llu\n", (unsigned long long)data,
- (unsigned long long)num_bytes);
+ sinfo = __find_space_info(root->fs_info, flags);
+ btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
+ (unsigned long long)flags,
+ (unsigned long long)num_bytes);
if (sinfo)
dump_space_info(sinfo, num_bytes, 1);
}
@@ -6232,8 +6549,8 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
cache = btrfs_lookup_block_group(root->fs_info, start);
if (!cache) {
- printk(KERN_ERR "Unable to find block group for %llu\n",
- (unsigned long long)start);
+ btrfs_err(root->fs_info, "Unable to find block group for %llu",
+ (unsigned long long)start);
return -ENOSPC;
}
@@ -6328,9 +6645,9 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
ret = update_block_group(root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
- printk(KERN_ERR "btrfs update block group failed for %llu "
- "%llu\n", (unsigned long long)ins->objectid,
- (unsigned long long)ins->offset);
+ btrfs_err(fs_info, "update block group failed for %llu %llu",
+ (unsigned long long)ins->objectid,
+ (unsigned long long)ins->offset);
BUG();
}
return ret;
@@ -6349,7 +6666,12 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
struct extent_buffer *leaf;
- u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
+ u32 size = sizeof(*extent_item) + sizeof(*iref);
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
+
+ if (!skinny_metadata)
+ size += sizeof(*block_info);
path = btrfs_alloc_path();
if (!path)
@@ -6370,12 +6692,16 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_set_extent_generation(leaf, extent_item, trans->transid);
btrfs_set_extent_flags(leaf, extent_item,
flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
- block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
- btrfs_set_tree_block_key(leaf, block_info, key);
- btrfs_set_tree_block_level(leaf, block_info, level);
+ if (skinny_metadata) {
+ iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ } else {
+ block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
+ btrfs_set_tree_block_key(leaf, block_info, key);
+ btrfs_set_tree_block_level(leaf, block_info, level);
+ iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+ }
- iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
if (parent > 0) {
BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
btrfs_set_extent_inline_ref_type(leaf, iref,
@@ -6390,11 +6716,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = update_block_group(root, ins->objectid, ins->offset, 1);
+ ret = update_block_group(root, ins->objectid, root->leafsize, 1);
if (ret) { /* -ENOENT, logic error */
- printk(KERN_ERR "btrfs update block group failed for %llu "
- "%llu\n", (unsigned long long)ins->objectid,
- (unsigned long long)ins->offset);
+ btrfs_err(fs_info, "update block group failed for %llu %llu",
+ (unsigned long long)ins->objectid,
+ (unsigned long long)ins->offset);
BUG();
}
return ret;
@@ -6428,58 +6754,33 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_block_group_cache *block_group;
- struct btrfs_caching_control *caching_ctl;
- u64 start = ins->objectid;
- u64 num_bytes = ins->offset;
-
- block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
- cache_block_group(block_group, 0);
- caching_ctl = get_caching_control(block_group);
-
- if (!caching_ctl) {
- BUG_ON(!block_group_cache_done(block_group));
- ret = btrfs_remove_free_space(block_group, start, num_bytes);
- BUG_ON(ret); /* -ENOMEM */
- } else {
- mutex_lock(&caching_ctl->mutex);
-
- if (start >= caching_ctl->progress) {
- ret = add_excluded_extent(root, start, num_bytes);
- BUG_ON(ret); /* -ENOMEM */
- } else if (start + num_bytes <= caching_ctl->progress) {
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- BUG_ON(ret); /* -ENOMEM */
- } else {
- num_bytes = caching_ctl->progress - start;
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- BUG_ON(ret); /* -ENOMEM */
-
- start = caching_ctl->progress;
- num_bytes = ins->objectid + ins->offset -
- caching_ctl->progress;
- ret = add_excluded_extent(root, start, num_bytes);
- BUG_ON(ret); /* -ENOMEM */
- }
- mutex_unlock(&caching_ctl->mutex);
- put_caching_control(caching_ctl);
+ /*
+ * Mixed block groups will exclude before processing the log so we only
+ * need to do the exclude dance if this fs isn't mixed.
+ */
+ if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
+ ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
+ if (ret)
+ return ret;
}
+ block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+ if (!block_group)
+ return -EINVAL;
+
ret = btrfs_update_reserved_bytes(block_group, ins->offset,
RESERVE_ALLOC_NO_ACCOUNT);
BUG_ON(ret); /* logic error */
- btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
+ btrfs_put_block_group(block_group);
return ret;
}
-struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u32 blocksize,
- int level)
+static struct extent_buffer *
+btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ u64 bytenr, u32 blocksize, int level)
{
struct extent_buffer *buf;
@@ -6522,51 +6823,51 @@ use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
int ret;
+ bool global_updated = false;
block_rsv = get_block_rsv(trans, root);
- if (block_rsv->size == 0) {
- ret = reserve_metadata_bytes(root, block_rsv, blocksize,
- BTRFS_RESERVE_NO_FLUSH);
- /*
- * If we couldn't reserve metadata bytes try and use some from
- * the global reserve.
- */
- if (ret && block_rsv != global_rsv) {
- ret = block_rsv_use_bytes(global_rsv, blocksize);
- if (!ret)
- return global_rsv;
- return ERR_PTR(ret);
- } else if (ret) {
- return ERR_PTR(ret);
- }
+ if (unlikely(block_rsv->size == 0))
+ goto try_reserve;
+again:
+ ret = block_rsv_use_bytes(block_rsv, blocksize);
+ if (!ret)
return block_rsv;
+
+ if (block_rsv->failfast)
+ return ERR_PTR(ret);
+
+ if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
+ global_updated = true;
+ update_global_block_rsv(root->fs_info);
+ goto again;
}
- ret = block_rsv_use_bytes(block_rsv, blocksize);
+ if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ static DEFINE_RATELIMIT_STATE(_rs,
+ DEFAULT_RATELIMIT_INTERVAL * 10,
+ /*DEFAULT_RATELIMIT_BURST*/ 1);
+ if (__ratelimit(&_rs))
+ WARN(1, KERN_DEBUG
+ "btrfs: block rsv returned %d\n", ret);
+ }
+try_reserve:
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
- if (ret && !block_rsv->failfast) {
- if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
- static DEFINE_RATELIMIT_STATE(_rs,
- DEFAULT_RATELIMIT_INTERVAL * 10,
- /*DEFAULT_RATELIMIT_BURST*/ 1);
- if (__ratelimit(&_rs))
- WARN(1, KERN_DEBUG
- "btrfs: block rsv returned %d\n", ret);
- }
- ret = reserve_metadata_bytes(root, block_rsv, blocksize,
- BTRFS_RESERVE_NO_FLUSH);
- if (!ret) {
- return block_rsv;
- } else if (ret && block_rsv != global_rsv) {
- ret = block_rsv_use_bytes(global_rsv, blocksize);
- if (!ret)
- return global_rsv;
- }
+ /*
+ * If we couldn't reserve metadata bytes try and use some from
+ * the global reserve if its space type is the same as the global
+ * reservation.
+ */
+ if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
+ block_rsv->space_info == global_rsv->space_info) {
+ ret = block_rsv_use_bytes(global_rsv, blocksize);
+ if (!ret)
+ return global_rsv;
}
-
- return ERR_PTR(-ENOSPC);
+ return ERR_PTR(ret);
}
static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
@@ -6594,7 +6895,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf;
u64 flags = 0;
int ret;
-
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
block_rsv = use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
@@ -6627,9 +6929,13 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
else
memset(&extent_op->key, 0, sizeof(extent_op->key));
extent_op->flags_to_set = flags;
- extent_op->update_key = 1;
+ if (skinny_metadata)
+ extent_op->update_key = 0;
+ else
+ extent_op->update_key = 1;
extent_op->update_flags = 1;
extent_op->is_data = 0;
+ extent_op->level = level;
ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
ins.objectid,
@@ -6704,8 +7010,9 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
/* We don't lock the tree block, it's OK to be racy here */
- ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
- &refs, &flags);
+ ret = btrfs_lookup_extent_info(trans, root, bytenr,
+ wc->level - 1, 1, &refs,
+ &flags);
/* We don't care about errors in readahead. */
if (ret < 0)
continue;
@@ -6772,7 +7079,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
(wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
BUG_ON(!path->locks[level]);
ret = btrfs_lookup_extent_info(trans, root,
- eb->start, eb->len,
+ eb->start, level, 1,
&wc->refs[level],
&wc->flags[level]);
BUG_ON(ret == -ENOMEM);
@@ -6800,7 +7107,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
- eb->len, flag, 0);
+ eb->len, flag,
+ btrfs_header_level(eb), 0);
BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
}
@@ -6870,7 +7178,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
- ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+ ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
&wc->refs[level - 1],
&wc->flags[level - 1]);
if (ret < 0) {
@@ -6878,7 +7186,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
return ret;
}
- BUG_ON(wc->refs[level - 1] == 0);
+ if (unlikely(wc->refs[level - 1] == 0)) {
+ btrfs_err(root->fs_info, "Missing references.");
+ BUG();
+ }
*lookup_info = 0;
if (wc->stage == DROP_REFERENCE) {
@@ -6917,8 +7228,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
next = read_tree_block(root, bytenr, blocksize, generation);
- if (!next)
+ if (!next || !extent_buffer_uptodate(next)) {
+ free_extent_buffer(next);
return -EIO;
+ }
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
}
@@ -7001,7 +7314,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, root,
- eb->start, eb->len,
+ eb->start, level, 1,
&wc->refs[level],
&wc->flags[level]);
if (ret < 0) {
@@ -7137,6 +7450,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* reference count by one. if update_ref is true, this function
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
+ *
+ * If called with for_reloc == 0, may exit early with -EAGAIN
*/
int btrfs_drop_snapshot(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, int update_ref,
@@ -7211,8 +7526,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
ret = btrfs_lookup_extent_info(trans, root,
path->nodes[level]->start,
- path->nodes[level]->len,
- &wc->refs[level],
+ level, 1, &wc->refs[level],
&wc->flags[level]);
if (ret < 0) {
err = ret;
@@ -7238,6 +7552,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
+ if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
+ pr_debug("btrfs: drop snapshot early exit\n");
+ err = -EAGAIN;
+ goto out_end_trans;
+ }
+
ret = walk_down_tree(trans, root, path, wc);
if (ret < 0) {
err = ret;
@@ -7295,8 +7615,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
- ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
- NULL, NULL);
+ ret = btrfs_find_root(tree_root, &root->root_key, path,
+ NULL, NULL);
if (ret < 0) {
btrfs_abort_transaction(trans, tree_root, ret);
err = ret;
@@ -7313,11 +7633,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
if (root->in_radix) {
- btrfs_free_fs_root(tree_root->fs_info, root);
+ btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
} else {
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
- kfree(root);
+ btrfs_put_fs_root(root);
}
out_end_trans:
btrfs_end_transaction_throttle(trans, tree_root);
@@ -7630,6 +7950,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
struct btrfs_space_info *space_info;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_device *device;
+ struct btrfs_trans_handle *trans;
u64 min_free;
u64 dev_min = 1;
u64 dev_nr = 0;
@@ -7716,6 +8037,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
do_div(min_free, dev_min);
}
+ /* We need to do this so that we can look at pending chunks */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
mutex_lock(&root->fs_info->chunk_mutex);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
u64 dev_offset;
@@ -7726,7 +8054,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
*/
if (device->total_bytes > device->bytes_used + min_free &&
!device->is_tgtdev_for_dev_replace) {
- ret = find_free_dev_extent(device, min_free,
+ ret = find_free_dev_extent(trans, device, min_free,
&dev_offset, NULL);
if (!ret)
dev_nr++;
@@ -7738,6 +8066,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
}
}
mutex_unlock(&root->fs_info->chunk_mutex);
+ btrfs_end_transaction(trans, root);
out:
btrfs_put_block_group(block_group);
return ret;
@@ -7880,6 +8209,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
dump_space_info(space_info, 0, 0);
}
}
+ percpu_counter_destroy(&space_info->total_bytes_pinned);
list_del(&space_info->list);
kfree(space_info);
}
@@ -8020,10 +8350,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
free_excluded_extents(root, cache);
}
+ ret = btrfs_add_block_group_cache(root->fs_info, cache);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ goto error;
+ }
+
ret = update_space_info(info, cache->flags, found_key.offset,
btrfs_block_group_used(&cache->item),
&space_info);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ spin_lock(&info->block_group_cache_lock);
+ rb_erase(&cache->cache_node,
+ &info->block_group_cache_tree);
+ spin_unlock(&info->block_group_cache_lock);
+ btrfs_put_block_group(cache);
+ goto error;
+ }
+
cache->space_info = space_info;
spin_lock(&cache->space_info->lock);
cache->space_info->bytes_readonly += cache->bytes_super;
@@ -8031,9 +8377,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
__link_block_group(space_info, cache);
- ret = btrfs_add_block_group_cache(root->fs_info, cache);
- BUG_ON(ret); /* Logic error */
-
set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid))
set_block_group_ro(cache, 1);
@@ -8089,6 +8432,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
sizeof(item));
if (ret)
btrfs_abort_transaction(trans, extent_root, ret);
+ ret = btrfs_finish_chunk_alloc(trans, extent_root,
+ key.objectid, key.offset);
+ if (ret)
+ btrfs_abort_transaction(trans, extent_root, ret);
}
}
@@ -8156,9 +8503,24 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, cache);
+ ret = btrfs_add_block_group_cache(root->fs_info, cache);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ rb_erase(&cache->cache_node,
+ &root->fs_info->block_group_cache_tree);
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
update_global_block_rsv(root->fs_info);
spin_lock(&cache->space_info->lock);
@@ -8167,9 +8529,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
__link_block_group(cache->space_info, cache);
- ret = btrfs_add_block_group_cache(root->fs_info, cache);
- BUG_ON(ret); /* Logic error */
-
list_add_tail(&cache->new_bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
@@ -8414,8 +8773,15 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
if (end - start >= range->minlen) {
if (!block_group_cache_done(cache)) {
ret = cache_block_group(cache, 0);
- if (!ret)
- wait_block_group_cache_done(cache);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+ ret = wait_block_group_cache_done(cache);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
}
ret = btrfs_trim_block_group(cache,
&group_trimmed,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cdee391fc7bf..583d98bd065e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -23,13 +23,83 @@
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
+static struct bio_set *btrfs_bioset;
+#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);
-#define LEAK_DEBUG 0
-#if LEAK_DEBUG
static DEFINE_SPINLOCK(leak_lock);
+
+static inline
+void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_add(new, head);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline
+void btrfs_leak_debug_del(struct list_head *entry)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(entry);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline
+void btrfs_leak_debug_check(void)
+{
+ struct extent_state *state;
+ struct extent_buffer *eb;
+
+ while (!list_empty(&states)) {
+ state = list_entry(states.next, struct extent_state, leak_list);
+ printk(KERN_ERR "btrfs state leak: start %llu end %llu "
+ "state %lu in tree %p refs %d\n",
+ (unsigned long long)state->start,
+ (unsigned long long)state->end,
+ state->state, state->tree, atomic_read(&state->refs));
+ list_del(&state->leak_list);
+ kmem_cache_free(extent_state_cache, state);
+ }
+
+ while (!list_empty(&buffers)) {
+ eb = list_entry(buffers.next, struct extent_buffer, leak_list);
+ printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
+ "refs %d\n", (unsigned long long)eb->start,
+ eb->len, atomic_read(&eb->refs));
+ list_del(&eb->leak_list);
+ kmem_cache_free(extent_buffer_cache, eb);
+ }
+}
+
+#define btrfs_debug_check_extent_io_range(inode, start, end) \
+ __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+ struct inode *inode, u64 start, u64 end)
+{
+ u64 isize = i_size_read(inode);
+
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ printk_ratelimited(KERN_DEBUG
+ "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+ caller,
+ (unsigned long long)btrfs_ino(inode),
+ (unsigned long long)isize,
+ (unsigned long long)start,
+ (unsigned long long)end);
+ }
+}
+#else
+#define btrfs_leak_debug_add(new, head) do {} while (0)
+#define btrfs_leak_debug_del(entry) do {} while (0)
+#define btrfs_leak_debug_check() do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
#define BUFFER_LRU_MAX 64
@@ -75,38 +145,26 @@ int __init extent_io_init(void)
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
goto free_state_cache;
+
+ btrfs_bioset = bioset_create(BIO_POOL_SIZE,
+ offsetof(struct btrfs_io_bio, bio));
+ if (!btrfs_bioset)
+ goto free_buffer_cache;
return 0;
+free_buffer_cache:
+ kmem_cache_destroy(extent_buffer_cache);
+ extent_buffer_cache = NULL;
+
free_state_cache:
kmem_cache_destroy(extent_state_cache);
+ extent_state_cache = NULL;
return -ENOMEM;
}
void extent_io_exit(void)
{
- struct extent_state *state;
- struct extent_buffer *eb;
-
- while (!list_empty(&states)) {
- state = list_entry(states.next, struct extent_state, leak_list);
- printk(KERN_ERR "btrfs state leak: start %llu end %llu "
- "state %lu in tree %p refs %d\n",
- (unsigned long long)state->start,
- (unsigned long long)state->end,
- state->state, state->tree, atomic_read(&state->refs));
- list_del(&state->leak_list);
- kmem_cache_free(extent_state_cache, state);
-
- }
-
- while (!list_empty(&buffers)) {
- eb = list_entry(buffers.next, struct extent_buffer, leak_list);
- printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
- "refs %d\n", (unsigned long long)eb->start,
- eb->len, atomic_read(&eb->refs));
- list_del(&eb->leak_list);
- kmem_cache_free(extent_buffer_cache, eb);
- }
+ btrfs_leak_debug_check();
/*
* Make sure all delayed rcu free are flushed before we
@@ -117,6 +175,8 @@ void extent_io_exit(void)
kmem_cache_destroy(extent_state_cache);
if (extent_buffer_cache)
kmem_cache_destroy(extent_buffer_cache);
+ if (btrfs_bioset)
+ bioset_free(btrfs_bioset);
}
void extent_io_tree_init(struct extent_io_tree *tree,
@@ -134,9 +194,6 @@ void extent_io_tree_init(struct extent_io_tree *tree,
static struct extent_state *alloc_extent_state(gfp_t mask)
{
struct extent_state *state;
-#if LEAK_DEBUG
- unsigned long flags;
-#endif
state = kmem_cache_alloc(extent_state_cache, mask);
if (!state)
@@ -144,11 +201,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
state->state = 0;
state->private = 0;
state->tree = NULL;
-#if LEAK_DEBUG
- spin_lock_irqsave(&leak_lock, flags);
- list_add(&state->leak_list, &states);
- spin_unlock_irqrestore(&leak_lock, flags);
-#endif
+ btrfs_leak_debug_add(&state->leak_list, &states);
atomic_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
trace_alloc_extent_state(state, mask, _RET_IP_);
@@ -160,15 +213,8 @@ void free_extent_state(struct extent_state *state)
if (!state)
return;
if (atomic_dec_and_test(&state->refs)) {
-#if LEAK_DEBUG
- unsigned long flags;
-#endif
WARN_ON(state->tree);
-#if LEAK_DEBUG
- spin_lock_irqsave(&leak_lock, flags);
- list_del(&state->leak_list);
- spin_unlock_irqrestore(&leak_lock, flags);
-#endif
+ btrfs_leak_debug_del(&state->leak_list);
trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
}
@@ -308,21 +354,21 @@ static void merge_state(struct extent_io_tree *tree,
}
static void set_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, int *bits)
+ struct extent_state *state, unsigned long *bits)
{
if (tree->ops && tree->ops->set_bit_hook)
tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}
static void clear_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, int *bits)
+ struct extent_state *state, unsigned long *bits)
{
if (tree->ops && tree->ops->clear_bit_hook)
tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, int *bits);
+ struct extent_state *state, unsigned long *bits);
/*
* insert an extent_state struct into the tree. 'bits' are set on the
@@ -336,7 +382,7 @@ static void set_state_bits(struct extent_io_tree *tree,
*/
static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
- int *bits)
+ unsigned long *bits)
{
struct rb_node *node;
@@ -424,10 +470,10 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- int *bits, int wake)
+ unsigned long *bits, int wake)
{
struct extent_state *next;
- int bits_to_clear = *bits & ~EXTENT_CTLBITS;
+ unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
@@ -463,7 +509,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
return prealloc;
}
-void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
btrfs_panic(tree_fs_info(tree), err, "Locking error: "
"Extent tree was modified by another "
@@ -483,7 +529,7 @@ void extent_io_tree_panic(struct extent_io_tree *tree, int err)
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int wake, int delete,
+ unsigned long bits, int wake, int delete,
struct extent_state **cached_state,
gfp_t mask)
{
@@ -495,6 +541,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int err;
int clear = 0;
+ btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
+ if (bits & EXTENT_DELALLOC)
+ bits |= EXTENT_NORESERVE;
+
if (delete)
bits |= ~EXTENT_CTLBITS;
bits |= EXTENT_FIRST_DELALLOC;
@@ -644,11 +695,14 @@ static void wait_on_state(struct extent_io_tree *tree,
* The range [start, end] is inclusive.
* The tree lock is taken by this function
*/
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned long bits)
{
struct extent_state *state;
struct rb_node *node;
+ btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
spin_lock(&tree->lock);
again:
while (1) {
@@ -685,9 +739,9 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- int *bits)
+ unsigned long *bits)
{
- int bits_to_set = *bits & ~EXTENT_CTLBITS;
+ unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS;
set_state_cb(tree, state, bits);
if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
@@ -730,8 +784,9 @@ static void uncache_state(struct extent_state **cached_ptr)
static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask)
+ unsigned long bits, unsigned long exclusive_bits,
+ u64 *failed_start, struct extent_state **cached_state,
+ gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -740,6 +795,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u64 last_start;
u64 last_end;
+ btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && (mask & __GFP_WAIT)) {
@@ -923,9 +980,9 @@ search_again:
goto again;
}
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
- u64 *failed_start, struct extent_state **cached_state,
- gfp_t mask)
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned long bits, u64 * failed_start,
+ struct extent_state **cached_state, gfp_t mask)
{
return __set_extent_bit(tree, start, end, bits, 0, failed_start,
cached_state, mask);
@@ -950,7 +1007,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
* boundary bits like LOCK.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int clear_bits,
+ unsigned long bits, unsigned long clear_bits,
struct extent_state **cached_state, gfp_t mask)
{
struct extent_state *state;
@@ -960,6 +1017,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u64 last_start;
u64 last_end;
+ btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
@@ -1143,14 +1202,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
}
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, gfp_t mask)
+ unsigned long bits, gfp_t mask)
{
return set_extent_bit(tree, start, end, bits, NULL,
NULL, mask);
}
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, gfp_t mask)
+ unsigned long bits, gfp_t mask)
{
return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}
@@ -1189,7 +1248,7 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
cached_state, mask);
}
@@ -1205,7 +1264,7 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
* us if waiting is desired.
*/
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, struct extent_state **cached_state)
+ unsigned long bits, struct extent_state **cached_state)
{
int err;
u64 failed_start;
@@ -1313,8 +1372,9 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
* return it. tree->lock must be held. NULL will be returned if
* nothing was found after 'start'
*/
-struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
- u64 start, int bits)
+static struct extent_state *
+find_first_extent_bit_state(struct extent_io_tree *tree,
+ u64 start, unsigned long bits)
{
struct rb_node *node;
struct extent_state *state;
@@ -1348,7 +1408,7 @@ out:
* If nothing was found, 1 is returned. If something was found, 0 is returned.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, int bits,
+ u64 *start_ret, u64 *end_ret, unsigned long bits,
struct extent_state **cached_state)
{
struct extent_state *state;
@@ -1638,7 +1698,7 @@ int extent_clear_unlock_delalloc(struct inode *inode,
unsigned long end_index = end >> PAGE_CACHE_SHIFT;
unsigned long nr_pages = end_index - index + 1;
int i;
- int clear_bits = 0;
+ unsigned long clear_bits = 0;
if (op & EXTENT_CLEAR_UNLOCK)
clear_bits |= EXTENT_LOCKED;
@@ -1777,6 +1837,64 @@ out:
return ret;
}
+void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[],
+ int count)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ BUG_ON(!node);
+
+ state = rb_entry(node, struct extent_state, rb_node);
+ BUG_ON(state->start != start);
+
+ while (count) {
+ state->private = *csums++;
+ count--;
+ state = next_state(state);
+ }
+ spin_unlock(&tree->lock);
+}
+
+static inline u64 __btrfs_get_bio_offset(struct bio *bio, int bio_index)
+{
+ struct bio_vec *bvec = bio->bi_io_vec + bio_index;
+
+ return page_offset(bvec->bv_page) + bvec->bv_offset;
+}
+
+void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, int bio_index,
+ u32 csums[], int count)
+{
+ struct rb_node *node;
+ struct extent_state *state = NULL;
+ u64 start;
+
+ spin_lock(&tree->lock);
+ do {
+ start = __btrfs_get_bio_offset(bio, bio_index);
+ if (state == NULL || state->start != start) {
+ node = tree_search(tree, start);
+ BUG_ON(!node);
+
+ state = rb_entry(node, struct extent_state, rb_node);
+ BUG_ON(state->start != start);
+ }
+ state->private = *csums++;
+ count--;
+ bio_index++;
+
+ state = next_state(state);
+ } while (count);
+ spin_unlock(&tree->lock);
+}
+
int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
{
struct rb_node *node;
@@ -1811,7 +1929,7 @@ out:
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int filled, struct extent_state *cached)
+ unsigned long bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
@@ -1873,28 +1991,6 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
}
/*
- * helper function to unlock a page if all the extents in the tree
- * for that page are unlocked
- */
-static void check_page_locked(struct extent_io_tree *tree, struct page *page)
-{
- u64 start = page_offset(page);
- u64 end = start + PAGE_CACHE_SIZE - 1;
- if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
- unlock_page(page);
-}
-
-/*
- * helper function to end page writeback if all the extents
- * in the tree for that page are done with writeback
- */
-static void check_page_writeback(struct extent_io_tree *tree,
- struct page *page)
-{
- end_page_writeback(page);
-}
-
-/*
* When IO fails, either with EIO or csum verification fails, we
* try other mirrors that might have a good copy of the data. This
* io_failure_record is used to record state as we go through all the
@@ -1971,7 +2067,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
return 0;
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
bio->bi_private = &compl;
@@ -2261,7 +2357,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
return -EIO;
}
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
if (!bio) {
free_io_failure(inode, failrec, 0);
return -EIO;
@@ -2323,19 +2419,24 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
struct extent_io_tree *tree;
u64 start;
u64 end;
- int whole_page;
do {
struct page *page = bvec->bv_page;
tree = &BTRFS_I(page->mapping->host)->io_tree;
- start = page_offset(page) + bvec->bv_offset;
- end = start + bvec->bv_len - 1;
+ /* We always issue full-page writes, but if some block
+ * in a page fails to write, blk_update_request() will
+ * advance bv_offset and adjust bv_len to compensate.
+ * Print a warning for nonzero offsets, and an error
+ * if they don't add up to a full page. */
+ if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE)
+ printk("%s page write in btrfs with offset %u and length %u\n",
+ bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE
+ ? KERN_ERR "partial" : KERN_INFO "incomplete",
+ bvec->bv_offset, bvec->bv_len);
- if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
- whole_page = 1;
- else
- whole_page = 0;
+ start = page_offset(page);
+ end = start + bvec->bv_offset + bvec->bv_len - 1;
if (--bvec >= bio->bi_io_vec)
prefetchw(&bvec->bv_page->flags);
@@ -2343,10 +2444,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
if (end_extent_writepage(page, err, start, end))
continue;
- if (whole_page)
- end_page_writeback(page);
- else
- check_page_writeback(tree, page);
+ end_page_writeback(page);
} while (bvec >= bio->bi_io_vec);
bio_put(bio);
@@ -2371,7 +2469,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct extent_io_tree *tree;
u64 start;
u64 end;
- int whole_page;
int mirror;
int ret;
@@ -2382,19 +2479,27 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct page *page = bvec->bv_page;
struct extent_state *cached = NULL;
struct extent_state *state;
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct inode *inode = page->mapping->host;
pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
- "mirror=%ld\n", (u64)bio->bi_sector, err,
- (long int)bio->bi_bdev);
- tree = &BTRFS_I(page->mapping->host)->io_tree;
-
- start = page_offset(page) + bvec->bv_offset;
- end = start + bvec->bv_len - 1;
-
- if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
- whole_page = 1;
- else
- whole_page = 0;
+ "mirror=%lu\n", (u64)bio->bi_sector, err,
+ io_bio->mirror_num);
+ tree = &BTRFS_I(inode)->io_tree;
+
+ /* We always issue full-page reads, but if some block
+ * in a page fails to read, blk_update_request() will
+ * advance bv_offset and adjust bv_len to compensate.
+ * Print a warning for nonzero offsets, and an error
+ * if they don't add up to a full page. */
+ if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE)
+ printk("%s page read in btrfs with offset %u and length %u\n",
+ bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE
+ ? KERN_ERR "partial" : KERN_INFO "incomplete",
+ bvec->bv_offset, bvec->bv_len);
+
+ start = page_offset(page);
+ end = start + bvec->bv_offset + bvec->bv_len - 1;
if (++bvec <= bvec_end)
prefetchw(&bvec->bv_page->flags);
@@ -2410,7 +2515,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
}
spin_unlock(&tree->lock);
- mirror = (int)(unsigned long)bio->bi_bdev;
+ mirror = io_bio->mirror_num;
if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
ret = tree->ops->readpage_end_io_hook(page, start, end,
state, mirror);
@@ -2453,39 +2558,43 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
}
unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
- if (whole_page) {
- if (uptodate) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
+ if (uptodate) {
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ unsigned offset;
+
+ /* Zero out the end if this page straddles i_size */
+ offset = i_size & (PAGE_CACHE_SIZE-1);
+ if (page->index == end_index && offset)
+ zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
} else {
- if (uptodate) {
- check_page_uptodate(tree, page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- check_page_locked(tree, page);
+ ClearPageUptodate(page);
+ SetPageError(page);
}
+ unlock_page(page);
} while (bvec <= bvec_end);
bio_put(bio);
}
+/*
+ * this allocates from the btrfs_bioset. We're returning a bio right now
+ * but you can call btrfs_io_bio for the appropriate container_of magic
+ */
struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
gfp_t gfp_flags)
{
struct bio *bio;
- bio = bio_alloc(gfp_flags, nr_vecs);
+ bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
if (bio == NULL && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(gfp_flags, nr_vecs);
+ while (!bio && (nr_vecs /= 2)) {
+ bio = bio_alloc_bioset(gfp_flags,
+ nr_vecs, btrfs_bioset);
+ }
}
if (bio) {
@@ -2496,6 +2605,19 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
return bio;
}
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
+{
+ return bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+}
+
+
+/* this also allocates from the btrfs_bioset */
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+ return bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
+}
+
+
static int __must_check submit_one_bio(int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
@@ -2560,8 +2682,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
if (old_compressed)
contig = bio->bi_sector == sector;
else
- contig = bio->bi_sector + (bio->bi_size >> 9) ==
- sector;
+ contig = bio_end_sector(bio) == sector;
if (prev_bio_flags != bio_flags || !contig ||
merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
@@ -2596,7 +2717,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
return ret;
}
-void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+static void attach_extent_buffer_page(struct extent_buffer *eb,
+ struct page *page)
{
if (!PagePrivate(page)) {
SetPagePrivate(page);
@@ -2626,7 +2748,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
struct page *page,
get_extent_t *get_extent,
struct bio **bio, int mirror_num,
- unsigned long *bio_flags)
+ unsigned long *bio_flags, int rw)
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
@@ -2772,7 +2894,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
}
pnr -= page->index;
- ret = submit_extent_page(READ, tree, page,
+ ret = submit_extent_page(rw, tree, page,
sector, disk_io_size, pg_offset,
bdev, bio, pnr,
end_bio_extent_readpage, mirror_num,
@@ -2805,7 +2927,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
int ret;
ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
- &bio_flags);
+ &bio_flags, READ);
if (bio)
ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
return ret;
@@ -2874,7 +2996,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
unlock_page(page);
return 0;
}
@@ -3104,7 +3226,7 @@ static int eb_wait(void *word)
return 0;
}
-static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
{
wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
TASK_UNINTERRUPTIBLE);
@@ -3229,7 +3351,7 @@ static int write_one_eb(struct extent_buffer *eb,
u64 offset = eb->start;
unsigned long i, num_pages;
unsigned long bio_flags = 0;
- int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
+ int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
int ret = 0;
clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
@@ -3666,14 +3788,14 @@ int extent_readpages(struct extent_io_tree *tree,
continue;
for (i = 0; i < nr; i++) {
__extent_read_full_page(tree, pagepool[i], get_extent,
- &bio, 0, &bio_flags);
+ &bio, 0, &bio_flags, READ);
page_cache_release(pagepool[i]);
}
nr = 0;
}
for (i = 0; i < nr; i++) {
__extent_read_full_page(tree, pagepool[i], get_extent,
- &bio, 0, &bio_flags);
+ &bio, 0, &bio_flags, READ);
page_cache_release(pagepool[i]);
}
@@ -3714,9 +3836,9 @@ int extent_invalidatepage(struct extent_io_tree *tree,
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
-int try_release_extent_state(struct extent_map_tree *map,
- struct extent_io_tree *tree, struct page *page,
- gfp_t mask)
+static int try_release_extent_state(struct extent_map_tree *map,
+ struct extent_io_tree *tree,
+ struct page *page, gfp_t mask)
{
u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
@@ -3913,7 +4035,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent,
@@ -4000,19 +4122,14 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
out_free:
free_extent_map(em);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
&cached_state, GFP_NOFS);
return ret;
}
static void __free_extent_buffer(struct extent_buffer *eb)
{
-#if LEAK_DEBUG
- unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
- list_del(&eb->leak_list);
- spin_unlock_irqrestore(&leak_lock, flags);
-#endif
+ btrfs_leak_debug_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
@@ -4022,9 +4139,6 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
gfp_t mask)
{
struct extent_buffer *eb = NULL;
-#if LEAK_DEBUG
- unsigned long flags;
-#endif
eb = kmem_cache_zalloc(extent_buffer_cache, mask);
if (eb == NULL)
@@ -4044,11 +4158,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
-#if LEAK_DEBUG
- spin_lock_irqsave(&leak_lock, flags);
- list_add(&eb->leak_list, &buffers);
- spin_unlock_irqrestore(&leak_lock, flags);
-#endif
+ btrfs_leak_debug_add(&eb->leak_list, &buffers);
+
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
atomic_set(&eb->io_pages, 0);
@@ -4386,7 +4497,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
}
/* Expects to have eb->refs_lock already held */
-static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
+static int release_extent_buffer(struct extent_buffer *eb)
{
WARN_ON(atomic_read(&eb->refs) == 0);
if (atomic_dec_and_test(&eb->refs)) {
@@ -4444,7 +4555,7 @@ void free_extent_buffer(struct extent_buffer *eb)
* I know this is terrible, but it's temporary until we stop tracking
* the uptodate bits and such for the extent buffers.
*/
- release_extent_buffer(eb, GFP_ATOMIC);
+ release_extent_buffer(eb);
}
void free_extent_buffer_stale(struct extent_buffer *eb)
@@ -4458,7 +4569,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
atomic_dec(&eb->refs);
- release_extent_buffer(eb, GFP_NOFS);
+ release_extent_buffer(eb);
}
void clear_extent_buffer_dirty(struct extent_buffer *eb)
@@ -4510,17 +4621,6 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
return was_dirty;
}
-static int range_straddles_pages(u64 start, u64 len)
-{
- if (len < PAGE_CACHE_SIZE)
- return 1;
- if (start & (PAGE_CACHE_SIZE - 1))
- return 1;
- if ((start + len) & (PAGE_CACHE_SIZE - 1))
- return 1;
- return 0;
-}
-
int clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
@@ -4552,37 +4652,6 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
return 0;
}
-int extent_range_uptodate(struct extent_io_tree *tree,
- u64 start, u64 end)
-{
- struct page *page;
- int ret;
- int pg_uptodate = 1;
- int uptodate;
- unsigned long index;
-
- if (range_straddles_pages(start, end - start + 1)) {
- ret = test_range_bit(tree, start, end,
- EXTENT_UPTODATE, 1, NULL);
- if (ret)
- return 1;
- }
- while (start <= end) {
- index = start >> PAGE_CACHE_SHIFT;
- page = find_get_page(tree->mapping, index);
- if (!page)
- return 1;
- uptodate = PageUptodate(page);
- page_cache_release(page);
- if (!uptodate) {
- pg_uptodate = 0;
- break;
- }
- start += PAGE_CACHE_SIZE;
- }
- return pg_uptodate;
-}
-
int extent_buffer_uptodate(struct extent_buffer *eb)
{
return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -4645,7 +4714,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
ClearPageError(page);
err = __extent_read_full_page(tree, page,
get_extent, &bio,
- mirror_num, &bio_flags);
+ mirror_num, &bio_flags,
+ READ | REQ_META);
if (err)
ret = err;
} else {
@@ -4654,7 +4724,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
}
if (bio) {
- err = submit_one_bio(READ, bio, mirror_num, bio_flags);
+ err = submit_one_bio(READ | REQ_META, bio, mirror_num,
+ bio_flags);
if (err)
return err;
}
@@ -5018,7 +5089,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
}
}
-int try_release_extent_buffer(struct page *page, gfp_t mask)
+int try_release_extent_buffer(struct page *page)
{
struct extent_buffer *eb;
@@ -5048,9 +5119,6 @@ int try_release_extent_buffer(struct page *page, gfp_t mask)
}
spin_unlock(&page->mapping->private_lock);
- if ((mask & GFP_NOFS) == GFP_NOFS)
- mask = GFP_NOFS;
-
/*
* If tree ref isn't set then we know the ref on this eb is a real ref,
* so just return, this page will likely be freed soon anyway.
@@ -5060,5 +5128,5 @@ int try_release_extent_buffer(struct page *page, gfp_t mask)
return 0;
}
- return release_extent_buffer(eb, mask);
+ return release_extent_buffer(eb);
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 258c92156857..3b8c4e26e1da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -19,6 +19,7 @@
#define EXTENT_FIRST_DELALLOC (1 << 12)
#define EXTENT_NEED_WAIT (1 << 13)
#define EXTENT_DAMAGED (1 << 14)
+#define EXTENT_NORESERVE (1 << 15)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
@@ -81,9 +82,9 @@ struct extent_io_ops {
int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate);
void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
- int *bits);
+ unsigned long *bits);
void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
- int *bits);
+ unsigned long *bits);
void (*merge_extent_hook)(struct inode *inode,
struct extent_state *new,
struct extent_state *other);
@@ -116,7 +117,9 @@ struct extent_state {
/* for use by the FS */
u64 private;
+#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
+#endif
};
#define INLINE_EXTENT_BUFFER_PAGES 16
@@ -132,7 +135,6 @@ struct extent_buffer {
atomic_t refs;
atomic_t io_pages;
int read_mirror;
- struct list_head leak_list;
struct rcu_head rcu_head;
pid_t lock_owner;
@@ -159,6 +161,9 @@ struct extent_buffer {
wait_queue_head_t read_lock_wq;
wait_queue_head_t lock_wq;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+#ifdef CONFIG_BTRFS_DEBUG
+ struct list_head leak_list;
+#endif
};
static inline void extent_set_compress_type(unsigned long *bio_flags,
@@ -185,13 +190,10 @@ void extent_io_tree_init(struct extent_io_tree *tree,
int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
-int try_release_extent_buffer(struct page *page, gfp_t mask);
-int try_release_extent_state(struct extent_map_tree *map,
- struct extent_io_tree *tree, struct page *page,
- gfp_t mask);
+int try_release_extent_buffer(struct page *page);
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, struct extent_state **cached);
+ unsigned long bits, struct extent_state **cached);
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached, gfp_t mask);
@@ -207,16 +209,17 @@ u64 count_range_bits(struct extent_io_tree *tree,
void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int filled, struct extent_state *cached_state);
+ unsigned long bits, int filled,
+ struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, gfp_t mask);
+ unsigned long bits, gfp_t mask);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int wake, int delete, struct extent_state **cached,
- gfp_t mask);
+ unsigned long bits, int wake, int delete,
+ struct extent_state **cached, gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, gfp_t mask);
+ unsigned long bits, gfp_t mask);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, u64 *failed_start,
+ unsigned long bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
@@ -229,17 +232,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int clear_bits,
+ unsigned long bits, unsigned long clear_bits,
struct extent_state **cached_state, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, int bits,
+ u64 *start_ret, u64 *end_ret, unsigned long bits,
struct extent_state **cached_state);
-struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
- u64 start, int bits);
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset);
int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
@@ -261,6 +262,10 @@ int extent_readpages(struct extent_io_tree *tree,
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len, get_extent_t *get_extent);
int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
+void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[],
+ int count);
+void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio,
+ int bvec_index, u32 csums[], int count);
int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
void set_page_extent_mapped(struct page *page);
@@ -278,6 +283,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num);
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
static inline unsigned long num_extent_pages(u64 start, u64 len)
{
@@ -313,7 +319,6 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len);
void memset_extent_buffer(struct extent_buffer *eb, char c,
unsigned long start, unsigned long len);
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
void clear_extent_buffer_dirty(struct extent_buffer *eb);
int set_extent_buffer_dirty(struct extent_buffer *eb);
int set_extent_buffer_uptodate(struct extent_buffer *eb);
@@ -323,8 +328,6 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
unsigned long min_len, char **map,
unsigned long *map_start,
unsigned long *map_len);
-int extent_range_uptodate(struct extent_io_tree *tree,
- u64 start, u64 end);
int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
int extent_clear_unlock_delalloc(struct inode *inode,
@@ -334,6 +337,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
gfp_t gfp_flags);
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs);
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
struct btrfs_fs_info;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2834ca5768ea..a4a7a1a8da95 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -174,6 +174,14 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
test_bit(EXTENT_FLAG_LOGGING, &next->flags))
return 0;
+ /*
+ * We don't want to merge stuff that hasn't been written to the log yet
+ * since it may not reflect exactly what is on disk, and that would be
+ * bad.
+ */
+ if (!list_empty(&prev->list) || !list_empty(&next->list))
+ return 0;
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
prev->bdev == next->bdev &&
@@ -209,9 +217,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
- list_move(&em->list, &tree->modified_extents);
- list_del_init(&merge->list);
rb_erase(&merge->rb_node, &tree->map);
free_extent_map(merge);
}
@@ -227,7 +233,6 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
merge->in_tree = 0;
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
- list_del_init(&merge->list);
free_extent_map(merge);
}
}
@@ -302,7 +307,7 @@ void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
* reference dropped if the merge attempt was successful.
*/
int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em)
+ struct extent_map *em, int modified)
{
int ret = 0;
struct rb_node *rb;
@@ -324,7 +329,10 @@ int add_extent_mapping(struct extent_map_tree *tree,
em->mod_start = em->start;
em->mod_len = em->len;
- try_merge_map(tree, em);
+ if (modified)
+ list_move(&em->list, &tree->modified_extents);
+ else
+ try_merge_map(tree, em);
out:
return ret;
}
@@ -337,8 +345,9 @@ static u64 range_end(u64 start, u64 len)
return start + len;
}
-struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len, int strict)
+static struct extent_map *
+__lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len, int strict)
{
struct extent_map *em;
struct rb_node *rb_node;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index c6598c89cff8..61adc44b7805 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,6 +26,7 @@ struct extent_map {
u64 mod_len;
u64 orig_start;
u64 orig_block_len;
+ u64 ram_bytes;
u64 block_start;
u64 block_len;
u64 generation;
@@ -61,7 +62,7 @@ void extent_map_tree_init(struct extent_map_tree *tree);
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em);
+ struct extent_map *em, int modified);
int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *alloc_extent_map(void);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c4628a201cb3..a7bfc9541803 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,8 +34,7 @@
#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
sizeof(struct btrfs_ordered_sum)) / \
- sizeof(struct btrfs_sector_sum) * \
- (r)->sectorsize - (r)->sectorsize)
+ sizeof(u32) * (r)->sectorsize)
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -83,10 +82,11 @@ out:
return ret;
}
-struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 bytenr, int cow)
+static struct btrfs_csum_item *
+btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, int cow)
{
int ret;
struct btrfs_key file_key;
@@ -152,32 +152,12 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
return ret;
}
-u64 btrfs_file_extent_length(struct btrfs_path *path)
-{
- int extent_type;
- struct btrfs_file_extent_item *fi;
- u64 len;
-
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- extent_type = btrfs_file_extent_type(path->nodes[0], fi);
-
- if (extent_type == BTRFS_FILE_EXTENT_REG ||
- extent_type == BTRFS_FILE_EXTENT_PREALLOC)
- len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
- else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
- len = btrfs_file_extent_inline_len(path->nodes[0], fi);
- else
- BUG();
-
- return len;
-}
-
static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
struct inode *inode, struct bio *bio,
u64 logical_offset, u32 *dst, int dio)
{
- u32 sum;
+ u32 sum[16];
+ int len;
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
u64 offset = 0;
@@ -186,7 +166,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
u64 disk_bytenr;
u32 diff;
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
- int ret;
+ int count;
struct btrfs_path *path;
struct btrfs_csum_item *item = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -214,10 +194,12 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
if (dio)
offset = logical_offset;
while (bio_index < bio->bi_vcnt) {
+ len = min_t(int, ARRAY_SIZE(sum), bio->bi_vcnt - bio_index);
if (!dio)
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
- ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
- if (ret == 0)
+ count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum,
+ len);
+ if (count)
goto found;
if (!item || disk_bytenr < item_start_offset ||
@@ -230,10 +212,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
path, disk_bytenr, 0);
if (IS_ERR(item)) {
- ret = PTR_ERR(item);
- if (ret == -ENOENT || ret == -EFBIG)
- ret = 0;
- sum = 0;
+ count = 1;
+ sum[0] = 0;
if (BTRFS_I(inode)->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
set_extent_bits(io_tree, offset,
@@ -269,19 +249,29 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
diff = disk_bytenr - item_start_offset;
diff = diff / root->sectorsize;
diff = diff * csum_size;
-
- read_extent_buffer(path->nodes[0], &sum,
+ count = min_t(int, len, (item_last_offset - disk_bytenr) >>
+ inode->i_sb->s_blocksize_bits);
+ read_extent_buffer(path->nodes[0], sum,
((unsigned long)item) + diff,
- csum_size);
+ csum_size * count);
found:
- if (dst)
- *dst++ = sum;
- else
- set_state_private(io_tree, offset, sum);
- disk_bytenr += bvec->bv_len;
- offset += bvec->bv_len;
- bio_index++;
- bvec++;
+ if (dst) {
+ memcpy(dst, sum, count * csum_size);
+ dst += count;
+ } else {
+ if (dio)
+ extent_cache_csums_dio(io_tree, offset, sum,
+ count);
+ else
+ extent_cache_csums(io_tree, bio, bio_index, sum,
+ count);
+ }
+ while (count--) {
+ disk_bytenr += bvec->bv_len;
+ offset += bvec->bv_len;
+ bio_index++;
+ bvec++;
+ }
}
btrfs_free_path(path);
return 0;
@@ -306,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
struct btrfs_csum_item *item;
LIST_HEAD(tmplist);
unsigned long offset;
@@ -358,11 +347,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
- key.type != BTRFS_EXTENT_CSUM_KEY)
- break;
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.offset > end)
+ key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset > end)
break;
if (key.offset > start)
@@ -380,34 +366,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_csum_item);
while (start < csum_end) {
size = min_t(size_t, csum_end - start,
- MAX_ORDERED_SUM_BYTES(root));
+ MAX_ORDERED_SUM_BYTES(root));
sums = kzalloc(btrfs_ordered_sum_size(root, size),
- GFP_NOFS);
+ GFP_NOFS);
if (!sums) {
ret = -ENOMEM;
goto fail;
}
- sector_sum = sums->sums;
sums->bytenr = start;
- sums->len = size;
+ sums->len = (int)size;
offset = (start - key.offset) >>
root->fs_info->sb->s_blocksize_bits;
offset *= csum_size;
+ size >>= root->fs_info->sb->s_blocksize_bits;
- while (size > 0) {
- read_extent_buffer(path->nodes[0],
- &sector_sum->sum,
- ((unsigned long)item) +
- offset, csum_size);
- sector_sum->bytenr = start;
-
- size -= root->sectorsize;
- start += root->sectorsize;
- offset += csum_size;
- sector_sum++;
- }
+ read_extent_buffer(path->nodes[0],
+ sums->sums,
+ ((unsigned long)item) + offset,
+ csum_size * size);
+
+ start += root->sectorsize * size;
list_add_tail(&sums->list, &tmplist);
}
path->slots[0]++;
@@ -429,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u64 file_start, int contig)
{
struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
struct btrfs_ordered_extent *ordered;
char *data;
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
+ int index;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
u64 offset;
- u64 disk_bytenr;
WARN_ON(bio->bi_vcnt <= 0);
sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
if (!sums)
return -ENOMEM;
- sector_sum = sums->sums;
- disk_bytenr = (u64)bio->bi_sector << 9;
sums->len = bio->bi_size;
INIT_LIST_HEAD(&sums->list);
@@ -456,7 +433,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
ordered = btrfs_lookup_ordered_extent(inode, offset);
BUG_ON(!ordered); /* Logic error */
- sums->bytenr = ordered->start;
+ sums->bytenr = (u64)bio->bi_sector << 9;
+ index = 0;
while (bio_index < bio->bi_vcnt) {
if (!contig)
@@ -475,29 +453,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
GFP_NOFS);
BUG_ON(!sums); /* -ENOMEM */
- sector_sum = sums->sums;
sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode, offset);
BUG_ON(!ordered); /* Logic error */
- sums->bytenr = ordered->start;
+ sums->bytenr = ((u64)bio->bi_sector << 9) +
+ total_bytes;
+ index = 0;
}
data = kmap_atomic(bvec->bv_page);
- sector_sum->sum = ~(u32)0;
- sector_sum->sum = btrfs_csum_data(root,
- data + bvec->bv_offset,
- sector_sum->sum,
- bvec->bv_len);
+ sums->sums[index] = ~(u32)0;
+ sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
+ sums->sums[index],
+ bvec->bv_len);
kunmap_atomic(data);
- btrfs_csum_final(sector_sum->sum,
- (char *)&sector_sum->sum);
- sector_sum->bytenr = disk_bytenr;
+ btrfs_csum_final(sums->sums[index],
+ (char *)(sums->sums + index));
- sector_sum++;
bio_index++;
+ index++;
total_bytes += bvec->bv_len;
this_sum_bytes += bvec->bv_len;
- disk_bytenr += bvec->bv_len;
offset += bvec->bv_len;
bvec++;
}
@@ -518,8 +494,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
* This calls btrfs_truncate_item with the correct args based on the
* overlap, and fixes up the key as required.
*/
-static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static noinline void truncate_one_csum(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *key,
u64 bytenr, u64 len)
@@ -544,7 +519,7 @@ static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
*/
u32 new_size = (bytenr - key->offset) >> blocksize_bits;
new_size *= csum_size;
- btrfs_truncate_item(trans, root, path, new_size, 1);
+ btrfs_truncate_item(root, path, new_size, 1);
} else if (key->offset >= bytenr && csum_end > end_byte &&
end_byte > key->offset) {
/*
@@ -556,10 +531,10 @@ static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
u32 new_size = (csum_end - end_byte) >> blocksize_bits;
new_size *= csum_size;
- btrfs_truncate_item(trans, root, path, new_size, 0);
+ btrfs_truncate_item(root, path, new_size, 0);
key->offset = end_byte;
- btrfs_set_item_key_safe(trans, root, path, key);
+ btrfs_set_item_key_safe(root, path, key);
} else {
BUG();
}
@@ -674,7 +649,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
key.offset = end_byte - 1;
} else {
- truncate_one_csum(trans, root, path, &key, bytenr, len);
+ truncate_one_csum(root, path, &key, bytenr, len);
if (key.offset < bytenr)
break;
}
@@ -686,62 +661,46 @@ out:
return ret;
}
-static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
- struct btrfs_sector_sum *sector_sum,
- u64 total_bytes, u64 sectorsize)
-{
- u64 tmp = sectorsize;
- u64 next_sector = sector_sum->bytenr;
- struct btrfs_sector_sum *next = sector_sum + 1;
-
- while ((tmp + total_bytes) < sums->len) {
- if (next_sector + sectorsize != next->bytenr)
- break;
- tmp += sectorsize;
- next_sector = next->bytenr;
- next++;
- }
- return tmp;
-}
-
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
{
- u64 bytenr;
- int ret;
struct btrfs_key file_key;
struct btrfs_key found_key;
- u64 next_offset;
- u64 total_bytes = 0;
- int found_next;
struct btrfs_path *path;
struct btrfs_csum_item *item;
struct btrfs_csum_item *item_end;
struct extent_buffer *leaf = NULL;
+ u64 next_offset;
+ u64 total_bytes = 0;
u64 csum_offset;
- struct btrfs_sector_sum *sector_sum;
+ u64 bytenr;
u32 nritems;
u32 ins_size;
+ int index = 0;
+ int found_next;
+ int ret;
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-
- sector_sum = sums->sums;
again:
next_offset = (u64)-1;
found_next = 0;
+ bytenr = sums->bytenr + total_bytes;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- file_key.offset = sector_sum->bytenr;
- bytenr = sector_sum->bytenr;
+ file_key.offset = bytenr;
btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
- item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
+ item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
if (!IS_ERR(item)) {
- leaf = path->nodes[0];
ret = 0;
+ leaf = path->nodes[0];
+ item_end = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_csum_item);
+ item_end = (struct btrfs_csum_item *)((char *)item_end +
+ btrfs_item_size_nr(leaf, path->slots[0]));
goto found;
}
ret = PTR_ERR(item);
@@ -821,8 +780,7 @@ again:
free_space = btrfs_leaf_free_space(root, leaf) -
sizeof(struct btrfs_item) - csum_size;
- tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
- root->sectorsize);
+ tmp = sums->len - total_bytes;
tmp >>= root->fs_info->sb->s_blocksize_bits;
WARN_ON(tmp < 1);
@@ -835,7 +793,8 @@ again:
diff /= csum_size;
diff *= csum_size;
- btrfs_extend_item(trans, root, path, diff);
+ btrfs_extend_item(root, path, diff);
+ ret = 0;
goto csum;
}
@@ -845,8 +804,7 @@ insert:
if (found_next) {
u64 tmp;
- tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
- root->sectorsize);
+ tmp = sums->len - total_bytes;
tmp >>= root->fs_info->sb->s_blocksize_bits;
tmp = min(tmp, (next_offset - file_key.offset) >>
root->fs_info->sb->s_blocksize_bits);
@@ -867,31 +825,25 @@ insert:
WARN_ON(1);
goto fail_unlock;
}
-csum:
leaf = path->nodes[0];
+csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
- ret = 0;
+ item_end = (struct btrfs_csum_item *)((unsigned char *)item +
+ btrfs_item_size_nr(leaf, path->slots[0]));
item = (struct btrfs_csum_item *)((unsigned char *)item +
csum_offset * csum_size);
found:
- item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
- item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
- btrfs_item_size_nr(leaf, path->slots[0]));
-next_sector:
-
- write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size);
-
- total_bytes += root->sectorsize;
- sector_sum++;
- if (total_bytes < sums->len) {
- item = (struct btrfs_csum_item *)((char *)item +
- csum_size);
- if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
- sector_sum->bytenr) {
- bytenr = sector_sum->bytenr;
- goto next_sector;
- }
- }
+ ins_size = (u32)(sums->len - total_bytes) >>
+ root->fs_info->sb->s_blocksize_bits;
+ ins_size *= csum_size;
+ ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
+ ins_size);
+ write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
+ ins_size);
+
+ ins_size /= csum_size;
+ total_bytes += ins_size * root->sectorsize;
+ index += ins_size;
btrfs_mark_buffer_dirty(path->nodes[0]);
if (total_bytes < sums->len) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ade03e6f7bd2..a005fe2c072a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
+#include <linux/aio.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
@@ -192,8 +193,8 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
* the same inode in the tree, we will merge them together (by
* __btrfs_add_inode_defrag()) and free the one that we want to requeue.
*/
-void btrfs_requeue_inode_defrag(struct inode *inode,
- struct inode_defrag *defrag)
+static void btrfs_requeue_inode_defrag(struct inode *inode,
+ struct inode_defrag *defrag)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
@@ -308,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
ret = PTR_ERR(inode_root);
goto cleanup;
}
- if (btrfs_root_refs(&inode_root->root_item) == 0) {
- ret = -ENOENT;
- goto cleanup;
- }
key.objectid = defrag->ino;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -473,7 +470,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
/*
* unlocks pages after btrfs_file_write is done with them
*/
-void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
size_t i;
for (i = 0; i < num_pages; i++) {
@@ -497,9 +494,9 @@ void btrfs_drop_pages(struct page **pages, size_t num_pages)
* doing real data extents, marking pages dirty and delalloc as required.
*/
int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
- struct page **pages, size_t num_pages,
- loff_t pos, size_t write_bytes,
- struct extent_state **cached)
+ struct page **pages, size_t num_pages,
+ loff_t pos, size_t write_bytes,
+ struct extent_state **cached)
{
int err = 0;
int i;
@@ -552,6 +549,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int testend = 1;
unsigned long flags;
int compressed = 0;
+ bool modified;
WARN_ON(end < start);
if (end == (u64)-1) {
@@ -561,6 +559,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
while (1) {
int no_splits = 0;
+ modified = false;
if (!split)
split = alloc_extent_map();
if (!split2)
@@ -592,6 +591,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ modified = !list_empty(&em->list);
remove_extent_mapping(em_tree, em);
if (no_splits)
goto next;
@@ -607,15 +607,15 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->block_len = em->block_len;
else
split->block_len = split->len;
+ split->ram_bytes = em->ram_bytes;
split->orig_block_len = max(split->block_len,
em->orig_block_len);
split->generation = gen;
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
- ret = add_extent_mapping(em_tree, split);
+ ret = add_extent_mapping(em_tree, split, modified);
BUG_ON(ret); /* Logic error */
- list_move(&split->list, &em_tree->modified_extents);
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -632,6 +632,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->generation = gen;
split->orig_block_len = max(em->block_len,
em->orig_block_len);
+ split->ram_bytes = em->ram_bytes;
if (compressed) {
split->block_len = em->block_len;
@@ -643,9 +644,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->orig_start = em->orig_start;
}
- ret = add_extent_mapping(em_tree, split);
+ ret = add_extent_mapping(em_tree, split, modified);
BUG_ON(ret); /* Logic error */
- list_move(&split->list, &em_tree->modified_extents);
free_extent_map(split);
split = NULL;
}
@@ -821,7 +821,7 @@ next_slot:
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = end;
- btrfs_set_item_key_safe(trans, root, path, &new_key);
+ btrfs_set_item_key_safe(root, path, &new_key);
extent_offset += end - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
@@ -1037,7 +1037,7 @@ again:
ino, bytenr, orig_offset,
&other_start, &other_end)) {
new_key.offset = end;
- btrfs_set_item_key_safe(trans, root, path, &new_key);
+ btrfs_set_item_key_safe(root, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi,
@@ -1071,7 +1071,7 @@ again:
trans->transid);
path->slots[0]++;
new_key.offset = start;
- btrfs_set_item_key_safe(trans, root, path, &new_key);
+ btrfs_set_item_key_safe(root, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -1313,6 +1313,56 @@ fail:
}
+static noinline int check_can_nocow(struct inode *inode, loff_t pos,
+ size_t *write_bytes)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ordered_extent *ordered;
+ u64 lockstart, lockend;
+ u64 num_bytes;
+ int ret;
+
+ lockstart = round_down(pos, root->sectorsize);
+ lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+
+ while (1) {
+ lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (!ordered) {
+ break;
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ return PTR_ERR(trans);
+ }
+
+ num_bytes = lockend - lockstart + 1;
+ ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
+ NULL);
+ btrfs_end_transaction(trans, root);
+ if (ret <= 0) {
+ ret = 0;
+ } else {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+ NULL, GFP_NOFS);
+ *write_bytes = min_t(size_t, *write_bytes, num_bytes);
+ }
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+
+ return ret;
+}
+
static noinline ssize_t __btrfs_buffered_write(struct file *file,
struct iov_iter *i,
loff_t pos)
@@ -1320,10 +1370,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
+ u64 release_bytes = 0;
unsigned long first_index;
size_t num_written = 0;
int nrptrs;
int ret = 0;
+ bool only_release_metadata = false;
bool force_page_uptodate = false;
nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
@@ -1344,6 +1396,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
offset);
size_t num_pages = (write_bytes + offset +
PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
@@ -1358,11 +1411,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
break;
}
- ret = btrfs_delalloc_reserve_space(inode,
- num_pages << PAGE_CACHE_SHIFT);
+ reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ ret = btrfs_check_data_free_space(inode, reserve_bytes);
+ if (ret == -ENOSPC &&
+ (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC))) {
+ ret = check_can_nocow(inode, pos, &write_bytes);
+ if (ret > 0) {
+ only_release_metadata = true;
+ /*
+ * our prealloc extent may be smaller than
+ * write_bytes, so scale down.
+ */
+ num_pages = (write_bytes + offset +
+ PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ ret = 0;
+ } else {
+ ret = -ENOSPC;
+ }
+ }
+
if (ret)
break;
+ ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
+ if (ret) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(inode,
+ reserve_bytes);
+ break;
+ }
+
+ release_bytes = reserve_bytes;
+
/*
* This is going to setup the pages array with the number of
* pages we want, so we don't really need to worry about the
@@ -1371,11 +1454,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
ret = prepare_pages(root, file, pages, num_pages,
pos, first_index, write_bytes,
force_page_uptodate);
- if (ret) {
- btrfs_delalloc_release_space(inode,
- num_pages << PAGE_CACHE_SHIFT);
+ if (ret)
break;
- }
copied = btrfs_copy_from_user(pos, num_pages,
write_bytes, pages, i);
@@ -1405,30 +1485,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
* managed to copy.
*/
if (num_pages > dirty_pages) {
+ release_bytes = (num_pages - dirty_pages) <<
+ PAGE_CACHE_SHIFT;
if (copied > 0) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
}
- btrfs_delalloc_release_space(inode,
- (num_pages - dirty_pages) <<
- PAGE_CACHE_SHIFT);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode,
+ release_bytes);
+ else
+ btrfs_delalloc_release_space(inode,
+ release_bytes);
}
+ release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
if (copied > 0) {
ret = btrfs_dirty_pages(root, inode, pages,
dirty_pages, pos, copied,
NULL);
if (ret) {
- btrfs_delalloc_release_space(inode,
- dirty_pages << PAGE_CACHE_SHIFT);
btrfs_drop_pages(pages, num_pages);
break;
}
}
+ release_bytes = 0;
btrfs_drop_pages(pages, num_pages);
+ if (only_release_metadata && copied > 0) {
+ u64 lockstart = round_down(pos, root->sectorsize);
+ u64 lockend = lockstart +
+ (dirty_pages << PAGE_CACHE_SHIFT) - 1;
+
+ set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_NORESERVE, NULL,
+ NULL, GFP_NOFS);
+ only_release_metadata = false;
+ }
+
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1441,6 +1537,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
kfree(pages);
+ if (release_bytes) {
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, release_bytes);
+ else
+ btrfs_delalloc_release_space(inode, release_bytes);
+ }
+
return num_written ? num_written : ret;
}
@@ -1514,8 +1617,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
size_t count, ocount;
bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
- sb_start_write(inode->i_sb);
-
mutex_lock(&inode->i_mutex);
err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
@@ -1617,7 +1718,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
if (sync)
atomic_dec(&BTRFS_I(inode)->sync_writers);
out:
- sb_end_write(inode->i_sb);
current->backing_dev_info = NULL;
return num_written ? num_written : err;
}
@@ -1885,7 +1985,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
path->slots[0]++;
key.offset = offset;
- btrfs_set_item_key_safe(trans, root, path, &key);
+ btrfs_set_item_key_safe(root, path, &key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
@@ -1915,6 +2015,7 @@ out:
} else {
hole_em->start = offset;
hole_em->len = end - offset;
+ hole_em->ram_bytes = hole_em->len;
hole_em->orig_start = offset;
hole_em->block_start = EXTENT_MAP_HOLE;
@@ -1927,10 +2028,7 @@ out:
do {
btrfs_drop_extent_cache(inode, offset, end - 1, 0);
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, hole_em);
- if (!ret)
- list_move(&hole_em->list,
- &em_tree->modified_extents);
+ ret = add_extent_mapping(em_tree, hole_em, 1);
write_unlock(&em_tree->lock);
} while (ret == -EEXIST);
free_extent_map(hole_em);
@@ -2176,12 +2274,6 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out_reserve_fail;
}
- /*
- * wait for ordered IO before we have any locks. We'll loop again
- * below with the locks held.
- */
- btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
-
mutex_lock(&inode->i_mutex);
ret = inode_newsize_ok(inode, alloc_end);
if (ret)
@@ -2192,8 +2284,23 @@ static long btrfs_fallocate(struct file *file, int mode,
alloc_start);
if (ret)
goto out;
+ } else {
+ /*
+ * If we are fallocating from the end of the file onward we
+ * need to zero out the end of the page if i_size lands in the
+ * middle of a page.
+ */
+ ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+ if (ret)
+ goto out;
}
+ /*
+ * wait for ordered IO before we have any locks. We'll loop again
+ * below with the locks held.
+ */
+ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
@@ -2426,20 +2533,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
}
}
- if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
- offset = -EINVAL;
- goto out;
- }
- if (offset > inode->i_sb->s_maxbytes) {
- offset = -EINVAL;
- goto out;
- }
-
- /* Special lock needed here? */
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
mutex_unlock(&inode->i_mutex);
return offset;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 1f84fc09c1a8..b21a3cd667d8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -104,7 +104,8 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
spin_lock(&block_group->lock);
if (!((BTRFS_I(inode)->flags & flags) == flags)) {
- printk(KERN_INFO "Old style space inode found, converting.\n");
+ btrfs_info(root->fs_info,
+ "Old style space inode found, converting.");
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
BTRFS_INODE_NODATACOW;
block_group->disk_cache_state = BTRFS_DC_CLEAR;
@@ -119,9 +120,10 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
return inode;
}
-int __create_free_space_inode(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path, u64 ino, u64 offset)
+static int __create_free_space_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ u64 ino, u64 offset)
{
struct btrfs_key key;
struct btrfs_disk_key disk_key;
@@ -195,30 +197,32 @@ int create_free_space_inode(struct btrfs_root *root,
block_group->key.objectid);
}
-int btrfs_truncate_free_space_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct inode *inode)
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv)
{
- struct btrfs_block_rsv *rsv;
u64 needed_bytes;
- loff_t oldsize;
- int ret = 0;
-
- rsv = trans->block_rsv;
- trans->block_rsv = &root->fs_info->global_block_rsv;
+ int ret;
/* 1 for slack space, 1 for updating the inode */
needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
btrfs_calc_trans_metadata_size(root, 1);
- spin_lock(&trans->block_rsv->lock);
- if (trans->block_rsv->reserved < needed_bytes) {
- spin_unlock(&trans->block_rsv->lock);
- trans->block_rsv = rsv;
- return -ENOSPC;
- }
- spin_unlock(&trans->block_rsv->lock);
+ spin_lock(&rsv->lock);
+ if (rsv->reserved < needed_bytes)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+ spin_unlock(&rsv->lock);
+ return ret;
+}
+
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct inode *inode)
+{
+ loff_t oldsize;
+ int ret = 0;
oldsize = i_size_read(inode);
btrfs_i_size_write(inode, 0);
@@ -230,9 +234,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
*/
ret = btrfs_truncate_inode_items(trans, root, inode,
0, BTRFS_EXTENT_DATA_KEY);
-
if (ret) {
- trans->block_rsv = rsv;
btrfs_abort_transaction(trans, root, ret);
return ret;
}
@@ -240,7 +242,6 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
ret = btrfs_update_inode(trans, root, inode);
if (ret)
btrfs_abort_transaction(trans, root, ret);
- trans->block_rsv = rsv;
return ret;
}
@@ -431,7 +432,7 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
- crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
+ crc = btrfs_csum_data(io_ctl->orig + offset, crc,
PAGE_CACHE_SIZE - offset);
btrfs_csum_final(crc, (char *)&crc);
io_ctl_unmap_page(io_ctl);
@@ -461,7 +462,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
kunmap(io_ctl->pages[0]);
io_ctl_map_page(io_ctl, 0);
- crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
+ crc = btrfs_csum_data(io_ctl->orig + offset, crc,
PAGE_CACHE_SIZE - offset);
btrfs_csum_final(crc, (char *)&crc);
if (val != crc) {
@@ -624,9 +625,9 @@ next:
spin_unlock(&ctl->tree_lock);
}
-int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
- struct btrfs_free_space_ctl *ctl,
- struct btrfs_path *path, u64 offset)
+static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_path *path, u64 offset)
{
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
@@ -669,10 +670,11 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
btrfs_release_path(path);
if (BTRFS_I(inode)->generation != generation) {
- printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
- " not match free space cache generation (%llu)\n",
- (unsigned long long)BTRFS_I(inode)->generation,
- (unsigned long long)generation);
+ btrfs_err(root->fs_info,
+ "free space inode generation (%llu) "
+ "did not match free space cache generation (%llu)",
+ (unsigned long long)BTRFS_I(inode)->generation,
+ (unsigned long long)generation);
return 0;
}
@@ -721,8 +723,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
ret = link_free_space(ctl, e);
spin_unlock(&ctl->tree_lock);
if (ret) {
- printk(KERN_ERR "Duplicate entries in "
- "free space cache, dumping\n");
+ btrfs_err(root->fs_info,
+ "Duplicate entries in free space cache, dumping");
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
@@ -741,8 +743,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
ctl->op->recalc_thresholds(ctl);
spin_unlock(&ctl->tree_lock);
if (ret) {
- printk(KERN_ERR "Duplicate entries in "
- "free space cache, dumping\n");
+ btrfs_err(root->fs_info,
+ "Duplicate entries in free space cache, dumping");
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
@@ -833,8 +835,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
if (!matched) {
__btrfs_remove_free_space_cache(ctl);
- printk(KERN_ERR "block group %llu has an wrong amount of free "
- "space\n", block_group->key.objectid);
+ btrfs_err(fs_info, "block group %llu has wrong amount of free space",
+ block_group->key.objectid);
ret = -1;
}
out:
@@ -845,8 +847,8 @@ out:
spin_unlock(&block_group->lock);
ret = 0;
- printk(KERN_ERR "btrfs: failed to load free space cache "
- "for block group %llu\n", block_group->key.objectid);
+ btrfs_err(fs_info, "failed to load free space cache for block group %llu",
+ block_group->key.objectid);
}
iput(inode);
@@ -866,11 +868,11 @@ out:
* on mount. This will return 0 if it was successfull in writing the cache out,
* and -1 if it was not.
*/
-int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
- struct btrfs_free_space_ctl *ctl,
- struct btrfs_block_group_cache *block_group,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path, u64 offset)
+static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 offset)
{
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
@@ -917,10 +919,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
/* Make sure we can fit our crcs into the first page */
if (io_ctl.check_crcs &&
- (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
- WARN_ON(1);
+ (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
goto out_nospc;
- }
io_ctl_set_generation(&io_ctl, trans->transid);
@@ -1104,8 +1104,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
spin_unlock(&block_group->lock);
ret = 0;
#ifdef DEBUG
- printk(KERN_ERR "btrfs: failed to write free space cache "
- "for block group %llu\n", block_group->key.objectid);
+ btrfs_err(root->fs_info,
+ "failed to write free space cache for block group %llu",
+ block_group->key.objectid);
#endif
}
@@ -1564,7 +1565,8 @@ again:
search_bytes = ctl->unit;
search_bytes = min(search_bytes, end - search_start + 1);
ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
- BUG_ON(ret < 0 || search_start != *offset);
+ if (ret < 0 || search_start != *offset)
+ return -EINVAL;
/* We may have found more bits than what we need */
search_bytes = min(search_bytes, *bytes);
@@ -1970,7 +1972,6 @@ again:
re_search = true;
goto again;
}
- BUG_ON(ret); /* logic error */
out_lock:
spin_unlock(&ctl->tree_lock);
out:
@@ -2064,7 +2065,8 @@ out:
return 0;
}
-void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
+static void __btrfs_remove_free_space_cache_locked(
+ struct btrfs_free_space_ctl *ctl)
{
struct btrfs_free_space *info;
struct rb_node *node;
@@ -2931,8 +2933,9 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
ret = __load_free_space_cache(root, inode, ctl, path, 0);
if (ret < 0)
- printk(KERN_ERR "btrfs: failed to load free ino cache for "
- "root %llu\n", root->root_key.objectid);
+ btrfs_err(fs_info,
+ "failed to load free ino cache for root %llu",
+ root->root_key.objectid);
out_put:
iput(inode);
out:
@@ -2959,11 +2962,534 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
if (ret) {
btrfs_delalloc_release_metadata(inode, inode->i_size);
#ifdef DEBUG
- printk(KERN_ERR "btrfs: failed to write free ino cache "
- "for root %llu\n", root->root_key.objectid);
+ btrfs_err(root->fs_info,
+ "failed to write free ino cache for root %llu",
+ root->root_key.objectid);
#endif
}
iput(inode);
return ret;
}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/*
+ * Allocate and initialise a stand-alone block group for the free space cache
+ * self tests.  The key describes a 1G group at objectid 0 with a 4K
+ * sectorsize; the lock, the list heads and the free space ctl are set up
+ * before it is handed back.
+ *
+ * Returns the new block group, or NULL if either allocation fails.
+ */
+static struct btrfs_block_group_cache *init_test_block_group(void)
+{
+	struct btrfs_block_group_cache *bg;
+	struct btrfs_free_space_ctl *fs_ctl;
+
+	bg = kzalloc(sizeof(*bg), GFP_NOFS);
+	if (!bg)
+		goto fail;
+
+	fs_ctl = kzalloc(sizeof(*fs_ctl), GFP_NOFS);
+	if (!fs_ctl)
+		goto fail_free_bg;
+	bg->free_space_ctl = fs_ctl;
+
+	bg->key.objectid = 0;
+	bg->key.offset = 1024 * 1024 * 1024;
+	bg->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+	bg->sectorsize = 4096;
+
+	spin_lock_init(&bg->lock);
+	INIT_LIST_HEAD(&bg->list);
+	INIT_LIST_HEAD(&bg->cluster_list);
+	INIT_LIST_HEAD(&bg->new_bg_list);
+
+	btrfs_init_free_space_ctl(bg);
+
+	return bg;
+
+fail_free_bg:
+	kfree(bg);
+fail:
+	return NULL;
+}
+
+/*
+ * Checks to see if the given range is in the free space cache.  This is really
+ * just used to check the absence of space, so if there is free space in the
+ * range at all we will return 1.
+ *
+ * @cache:	block group whose free_space_ctl is searched
+ * @offset:	start of the range to look for
+ * @bytes:	length of the range
+ *
+ * ctl->tree_lock is taken for the whole lookup and dropped before returning.
+ *
+ * Returns 1 if free space was found, 0 otherwise.
+ */
+static int check_exists(struct btrfs_block_group_cache *cache, u64 offset,
+			u64 bytes)
+{
+	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+	struct btrfs_free_space *info;
+	int ret = 0;
+
+	spin_lock(&ctl->tree_lock);
+	/* Try an entry at @offset first, then the bitmap that would cover it. */
+	info = tree_search_offset(ctl, offset, 0, 0);
+	if (!info) {
+		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+					  1, 0);
+		if (!info)
+			goto out;
+	}
+
+have_info:
+	if (info->bitmap) {
+		u64 bit_off, bit_bytes;
+		struct rb_node *n;
+		struct btrfs_free_space *tmp;
+
+		bit_off = offset;
+		bit_bytes = ctl->unit;
+		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
+		if (!ret) {
+			if (bit_off == offset) {
+				ret = 1;
+				goto out;
+			} else if (bit_off > offset &&
+				   offset + bytes > bit_off) {
+				ret = 1;
+				goto out;
+			}
+		}
+
+		/*
+		 * Nothing usable in this bitmap, look at the neighbouring
+		 * entries.  Each walk must advance from the entry it has just
+		 * examined (tmp): info does not change inside the loop, so
+		 * stepping from info would return the same node again and
+		 * spin forever with tree_lock held.
+		 */
+		n = rb_prev(&info->offset_index);
+		while (n) {
+			tmp = rb_entry(n, struct btrfs_free_space,
+				       offset_index);
+			if (tmp->offset + tmp->bytes < offset)
+				break;
+			if (offset + bytes < tmp->offset) {
+				n = rb_prev(&tmp->offset_index);
+				continue;
+			}
+			info = tmp;
+			goto have_info;
+		}
+
+		n = rb_next(&info->offset_index);
+		while (n) {
+			tmp = rb_entry(n, struct btrfs_free_space,
+				       offset_index);
+			if (offset + bytes < tmp->offset)
+				break;
+			if (tmp->offset + tmp->bytes < offset) {
+				n = rb_next(&tmp->offset_index);
+				continue;
+			}
+			info = tmp;
+			goto have_info;
+		}
+
+		goto out;
+	}
+
+	/* Extent entry: a hit if @offset is its start or lies inside it. */
+	if (info->offset == offset) {
+		ret = 1;
+		goto out;
+	}
+
+	if (offset > info->offset && offset < info->offset + info->bytes)
+		ret = 1;
+out:
+	spin_unlock(&ctl->tree_lock);
+	return ret;
+}
+
+/*
+ * Use this if you need to make a bitmap or extent entry specifically, it
+ * doesn't do any of the merging that add_free_space does, this acts a lot like
+ * how the free space cache loading stuff works, so you can get really weird
+ * configurations.
+ *
+ * @cache:	block group whose free space ctl receives the entry
+ * @offset:	start of the free range
+ * @bytes:	length of the free range
+ * @bitmap:	true to record the range in bitmap entries, false to link a
+ *		single extent entry
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails, or the error from
+ * link_free_space() for an extent entry.
+ */
+static int add_free_space_entry(struct btrfs_block_group_cache *cache,
+				u64 offset, u64 bytes, bool bitmap)
+{
+	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+	struct btrfs_free_space *info = NULL, *bitmap_info;
+	void *map = NULL;
+	u64 bytes_added;
+	int ret;
+
+again:
+	if (!info) {
+		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
+		if (!info)
+			return -ENOMEM;
+	}
+
+	if (!bitmap) {
+		spin_lock(&ctl->tree_lock);
+		info->offset = offset;
+		info->bytes = bytes;
+		ret = link_free_space(ctl, info);
+		spin_unlock(&ctl->tree_lock);
+		if (ret)
+			kmem_cache_free(btrfs_free_space_cachep, info);
+		return ret;
+	}
+
+	if (!map) {
+		map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+		if (!map) {
+			kmem_cache_free(btrfs_free_space_cachep, info);
+			return -ENOMEM;
+		}
+	}
+
+	spin_lock(&ctl->tree_lock);
+	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+					 1, 0);
+	if (!bitmap_info) {
+		info->bitmap = map;
+		map = NULL;
+		add_new_bitmap(ctl, info, offset);
+		bitmap_info = info;
+		/*
+		 * The entry now belongs to the ctl.  Forget it so that another
+		 * pass allocates a fresh one instead of overwriting the bitmap
+		 * pointer of, and re-linking, an entry that is already linked.
+		 */
+		info = NULL;
+	}
+
+	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+	bytes -= bytes_added;
+	offset += bytes_added;
+	spin_unlock(&ctl->tree_lock);
+
+	/* The range did not fit in this bitmap, carry on with the rest. */
+	if (bytes)
+		goto again;
+
+	/* An existing bitmap was reused, so these were never handed over. */
+	if (info)
+		kmem_cache_free(btrfs_free_space_cachep, info);
+	kfree(map);
+	return 0;
+}
+
+#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__)
+
+/*
+ * This test just does basic sanity checking, making sure we can add an extent
+ * entry and remove space from either end and the middle, and make sure we can
+ * remove space that covers adjacent extent entries.
+ *
+ * Returns 0 on success.  On failure it returns the error from the failing
+ * btrfs_add_free_space()/btrfs_remove_free_space() call, or -1 when
+ * check_exists() still finds space that should be gone.  The failure paths
+ * return immediately, without emptying the cache.
+ */
+static int test_extents(struct btrfs_block_group_cache *cache)
+{
+ int ret = 0;
+
+ test_msg("Running extent only tests\n");
+
+ /* First just make sure we can remove an entire entry (4M at offset 0) */
+ ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error adding initial extents %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error removing extent %d\n", ret);
+ return ret;
+ }
+
+ if (check_exists(cache, 0, 4 * 1024 * 1024)) { /* nothing may be left */
+ test_msg("Full remove left some lingering space\n");
+ return -1;
+ }
+
+ /*
+ * Ok edge and middle cases now: add the 4M extent again, then remove its
+ * last 1M, its first 1M and a 4K piece starting at 2M.
+ */
+ ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error adding half extent %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error removing tail end %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error removing front end %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
+ if (ret) {
+ test_msg("Error removing middle piece %d\n", ret);
+ return ret;
+ }
+
+ if (check_exists(cache, 0, 1 * 1024 * 1024)) { /* removed front */
+ test_msg("Still have space at the front\n");
+ return -1;
+ }
+
+ if (check_exists(cache, 2 * 1024 * 1024, 4096)) { /* removed middle piece */
+ test_msg("Still have space in the middle\n");
+ return -1;
+ }
+
+ if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { /* removed tail */
+ test_msg("Still have space at the end\n");
+ return -1;
+ }
+
+ /* Cleanup: drop the remaining entries before returning */
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ return 0;
+}
+
+static int test_bitmaps(struct btrfs_block_group_cache *cache)
+{ /*
+ * Bitmap only sanity test: space is added with add_free_space_entry() in
+ * bitmap mode and removed with btrfs_remove_free_space().  Returns 0 on
+ * success, the error from a failing add/remove call, or -1 when
+ * check_exists() still finds space that should be gone.  The failure paths
+ * return immediately, without emptying the cache.
+ */
+ u64 next_bitmap_offset; /* start of the bitmap following the one at offset 0 */
+ int ret;
+
+ test_msg("Running bitmap only tests\n");
+
+ ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); /* 4M at offset 0, as bitmap */
+ if (ret) {
+ test_msg("Couldn't create a bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); /* remove all of it again */
+ if (ret) {
+ test_msg("Error removing bitmap full range %d\n", ret);
+ return ret;
+ }
+
+ if (check_exists(cache, 0, 4 * 1024 * 1024)) {
+ test_msg("Left some space in bitmap\n");
+ return -1;
+ }
+
+ ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); /* same range once more */
+ if (ret) {
+ test_msg("Couldn't add to our bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); /* 2M from the middle; only the return value is checked */
+ if (ret) {
+ test_msg("Couldn't remove middle chunk %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * The first bitmap we have starts at offset 0 so the next one is just
+ * at the end of the first bitmap.
+ * NOTE(review): the 4096 presumably matches the sectorsize set up in
+ * init_test_block_group() - confirm it is the ctl's unit.
+ */
+ next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+
+ /* Test a bit straddling two bitmaps: 4M starting 2M before the boundary */
+ ret = add_free_space_entry(cache, next_bitmap_offset -
+ (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add space that straddles two bitmaps %d\n",
+ ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, next_bitmap_offset -
+ (1 * 1024 * 1024), 2 * 1024 * 1024); /* 1M on either side of the boundary */
+ if (ret) {
+ test_msg("Couldn't remove overlapping space %d\n", ret);
+ return ret;
+ }
+
+ if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
+ 2 * 1024 * 1024)) {
+ test_msg("Left some space when removing overlapping\n");
+ return -1;
+ }
+
+ __btrfs_remove_free_space_cache(cache->free_space_ctl); /* drop whatever is left */
+
+ return 0;
+}
+
+/*
+ * Mixed bitmap and extent tests: entries of both kinds are created with
+ * add_free_space_entry() in layouts that overlap or sit next to each other,
+ * space is removed with btrfs_remove_free_space(), and check_exists() verifies
+ * that nothing is left in the removed range.
+ *
+ * Returns 0 on success, the error from a failing add/remove call, or -1 when
+ * check_exists() still finds space that should be gone.  The failure paths
+ * return immediately, without emptying the cache.
+ */
+static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
+{
+	/* Start of the bitmap that follows the one at offset 0. */
+	u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+	int ret;
+
+	test_msg("Running bitmap and extent tests\n");
+
+	/*
+	 * First let's do something simple, an extent at the same offset as the
+	 * bitmap, but the free space completely in the extent and then
+	 * completely in the bitmap.
+	 */
+	ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't create bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove extent entry %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, 0, 1 * 1024 * 1024)) {
+		test_msg("Left remnants after our remove\n");
+		return -1;
+	}
+
+	/* Now to add back the extent entry and remove from the bitmap */
+	ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't re-add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove from bitmap %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
+		test_msg("Left remnants in the bitmap\n");
+		return -1;
+	}
+
+	/*
+	 * Ok so a little more evil, extent entry and bitmap at the same offset,
+	 * removing an overlapping chunk.
+	 */
+	ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add to a bitmap %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove overlapping space %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
+		test_msg("Left over pieces after removing overlapping\n");
+		return -1;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	/* Now with the extent entry offset into the bitmap */
+	ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add space to the bitmap %d\n", ret);
+		return ret;
+	}
+
+	ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent to the cache %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Problem removing overlapping space %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
+		/* Terminated with a newline like every other test message. */
+		test_msg("Left something behind when removing space\n");
+		return -1;
+	}
+
+	/*
+	 * This has blown up in the past, the extent entry starts before the
+	 * bitmap entry, but we're trying to remove an offset that falls
+	 * completely within the bitmap range and is in both the extent entry
+	 * and the bitmap entry, looks like this
+	 *
+	 *   [ extent ]
+	 *      [ bitmap ]
+	 *         [ del ]
+	 */
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
+				   4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add bitmap %d\n", ret);
+		return ret;
+	}
+
+	ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
+				   5 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
+				      5 * 1024 * 1024);
+	if (ret) {
+		test_msg("Failed to free our space %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
+			 5 * 1024 * 1024)) {
+		test_msg("Left stuff over\n");
+		return -1;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	/*
+	 * This blew up before, we have part of the free space in a bitmap and
+	 * then the entirety of the rest of the space in an extent.  This used
+	 * to return -EAGAIN back from btrfs_remove_free_space, make sure this
+	 * doesn't happen.
+	 */
+	ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing bitmap and extent overlapping %d\n", ret);
+		return ret;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	return 0;
+}
+
+/*
+ * Entry point of the free space cache self tests.  Sets up a scratch block
+ * group, runs the extent, bitmap and mixed stages in that order, stopping at
+ * the first stage that reports a failure, then tears the block group down.
+ * Progress and failures are only reported through test_msg().
+ */
+void btrfs_test_free_space_cache(void)
+{
+	struct btrfs_block_group_cache *bg;
+
+	test_msg("Running btrfs free space cache tests\n");
+
+	bg = init_test_block_group();
+	if (!bg) {
+		test_msg("Couldn't run the tests\n");
+		return;
+	}
+
+	/* A non-zero return from one stage skips the stages after it. */
+	if (!test_extents(bg) && !test_bitmaps(bg))
+		test_bitmaps_and_extents(bg);
+
+	__btrfs_remove_free_space_cache(bg->free_space_ctl);
+	kfree(bg->free_space_ctl);
+	kfree(bg);
+	test_msg("Free space cache tests finished\n");
+}
+#undef test_msg
+#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+void btrfs_test_free_space_cache(void) {} /* no-op stub used when CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set */
+#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 8f2613f779ed..894116b71304 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -54,6 +54,8 @@ int create_free_space_inode(struct btrfs_root *root,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv);
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
struct btrfs_path *path,
@@ -110,4 +112,7 @@ int btrfs_return_cluster_to_free_space(
struct btrfs_free_cluster *cluster);
int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen);
+
+void btrfs_test_free_space_cache(void);
+
#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 48b8fda93132..e0b7034d6343 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -183,10 +183,11 @@ int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
return -ENOENT;
}
-int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 *index)
+static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid,
+ u64 *index)
{
struct btrfs_path *path;
struct btrfs_key key;
@@ -246,7 +247,7 @@ int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
memmove_extent_buffer(leaf, ptr, ptr + del_len,
item_size - (ptr + del_len - item_start));
- btrfs_truncate_item(trans, root, path, item_size - del_len, 1);
+ btrfs_truncate_item(root, path, item_size - del_len, 1);
out:
btrfs_free_path(path);
@@ -309,7 +310,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
- btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1);
+ btrfs_truncate_item(root, path, item_size - sub_item_len, 1);
out:
btrfs_free_path(path);
@@ -361,7 +362,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
name, name_len, NULL))
goto out;
- btrfs_extend_item(trans, root, path, ins_len);
+ btrfs_extend_item(root, path, ins_len);
ret = 0;
}
if (ret < 0)
@@ -417,7 +418,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
goto out;
old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- btrfs_extend_item(trans, root, path, ins_len);
+ btrfs_extend_item(root, path, ins_len);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d26f67a59e36..2c66ddbbe670 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -429,11 +429,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
num_bytes = trans->bytes_reserved;
/*
* 1 item for inode item insertion if need
- * 3 items for inode item update (in the worst case)
+ * 4 items for inode item update (in the worst case)
+ * 1 item for slack space if we need to do truncation
* 1 item for free space object
* 3 items for pre-allocation
*/
- trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
+ trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10);
ret = btrfs_block_rsv_add(root, trans->block_rsv,
trans->bytes_reserved,
BTRFS_RESERVE_NO_FLUSH);
@@ -468,7 +469,8 @@ again:
if (i_size_read(inode) > 0) {
ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ if (ret != -ENOSPC)
+ btrfs_abort_transaction(trans, root, ret);
goto out_put;
}
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 09c58a35b429..6d1b93c8aafb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,6 +32,7 @@
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
+#include <linux/aio.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
@@ -41,6 +42,7 @@
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
+#include <linux/posix_acl_xattr.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
@@ -56,6 +58,7 @@
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
+#include "hash.h"
struct btrfs_iget_args {
u64 ino;
@@ -100,7 +103,10 @@ static noinline int cow_file_range(struct inode *inode,
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
u64 len, u64 orig_start,
u64 block_start, u64 block_len,
- u64 orig_block_len, int type);
+ u64 orig_block_len, u64 ram_bytes,
+ int type);
+
+static int btrfs_dirty_inode(struct inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
@@ -697,8 +703,12 @@ retry:
async_extent->nr_pages = 0;
async_extent->pages = NULL;
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
+ unlock_extent(io_tree, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1);
goto retry;
+ }
goto out_free;
}
@@ -711,8 +721,10 @@ retry:
async_extent->ram_size - 1, 0);
em = alloc_extent_map();
- if (!em)
+ if (!em) {
+ ret = -ENOMEM;
goto out_free_reserve;
+ }
em->start = async_extent->start;
em->len = async_extent->ram_size;
em->orig_start = em->start;
@@ -722,6 +734,7 @@ retry:
em->block_start = ins.objectid;
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
+ em->ram_bytes = async_extent->ram_size;
em->bdev = root->fs_info->fs_devices->latest_bdev;
em->compress_type = async_extent->compress_type;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -730,10 +743,7 @@ retry:
while (1) {
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
- if (!ret)
- list_move(&em->list,
- &em_tree->modified_extents);
+ ret = add_extent_mapping(em_tree, em, 1);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -921,7 +931,10 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
}
em = alloc_extent_map();
- BUG_ON(!em); /* -ENOMEM */
+ if (!em) {
+ ret = -ENOMEM;
+ goto out_reserve;
+ }
em->start = start;
em->orig_start = em->start;
ram_size = ins.offset;
@@ -932,16 +945,14 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
em->block_start = ins.objectid;
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
+ em->ram_bytes = ram_size;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
em->generation = -1;
while (1) {
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
- if (!ret)
- list_move(&em->list,
- &em_tree->modified_extents);
+ ret = add_extent_mapping(em_tree, em, 1);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -950,11 +961,14 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
btrfs_drop_extent_cache(inode, start,
start + ram_size - 1, 0);
}
+ if (ret)
+ goto out_reserve;
cur_alloc_size = ins.offset;
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
ram_size, cur_alloc_size, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ goto out_reserve;
if (root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
@@ -962,7 +976,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
cur_alloc_size);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
- goto out_unlock;
+ goto out_reserve;
}
}
@@ -991,6 +1005,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
out:
return ret;
+out_reserve:
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
out_unlock:
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
@@ -1194,6 +1210,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 disk_bytenr;
u64 num_bytes;
u64 disk_num_bytes;
+ u64 ram_bytes;
int extent_type;
int ret, err;
int type;
@@ -1290,6 +1307,7 @@ next_slot:
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
@@ -1373,6 +1391,7 @@ out_check:
em->block_len = num_bytes;
em->block_start = disk_bytenr;
em->orig_block_len = disk_num_bytes;
+ em->ram_bytes = ram_bytes;
em->bdev = root->fs_info->fs_devices->latest_bdev;
em->mod_start = em->start;
em->mod_len = em->len;
@@ -1381,10 +1400,7 @@ out_check:
em->generation = -1;
while (1) {
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
- if (!ret)
- list_move(&em->list,
- &em_tree->modified_extents);
+ ret = add_extent_mapping(em_tree, em, 1);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -1519,13 +1535,53 @@ static void btrfs_merge_extent_hook(struct inode *inode,
spin_unlock(&BTRFS_I(inode)->lock);
}
+/*
+ * Put @inode on @root's list of inodes with pending delalloc, unless it is
+ * already there, and set BTRFS_INODE_IN_DELALLOC_LIST on it.  When this is the
+ * first such inode of the root, the root itself is added to the fs-wide
+ * delalloc_roots list.
+ *
+ * Takes root->delalloc_lock, and fs_info->delalloc_root_lock nested inside it.
+ */
+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+				      struct inode *inode)
+{
+	struct btrfs_inode *binode = BTRFS_I(inode);
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	spin_lock(&root->delalloc_lock);
+
+	/* Already queued, nothing to do. */
+	if (!list_empty(&binode->delalloc_inodes))
+		goto unlock;
+
+	list_add_tail(&binode->delalloc_inodes, &root->delalloc_inodes);
+	set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &binode->runtime_flags);
+	root->nr_delalloc_inodes++;
+
+	/* Only the first inode of a root publishes the root. */
+	if (root->nr_delalloc_inodes != 1)
+		goto unlock;
+
+	spin_lock(&fs_info->delalloc_root_lock);
+	BUG_ON(!list_empty(&root->delalloc_root));
+	list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
+	spin_unlock(&fs_info->delalloc_root_lock);
+unlock:
+	spin_unlock(&root->delalloc_lock);
+}
+
+/*
+ * Take @inode off @root's list of inodes with pending delalloc, if it is on
+ * it, and clear BTRFS_INODE_IN_DELALLOC_LIST.  When the root's count of such
+ * inodes drops to zero, the root is removed from the fs-wide delalloc_roots
+ * list.
+ *
+ * Takes root->delalloc_lock, and fs_info->delalloc_root_lock nested inside it.
+ */
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+				     struct inode *inode)
+{
+	struct btrfs_inode *binode = BTRFS_I(inode);
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	spin_lock(&root->delalloc_lock);
+
+	/* Not queued, nothing to undo. */
+	if (list_empty(&binode->delalloc_inodes))
+		goto unlock;
+
+	list_del_init(&binode->delalloc_inodes);
+	clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, &binode->runtime_flags);
+	root->nr_delalloc_inodes--;
+
+	/* Other inodes of this root are still queued, keep the root listed. */
+	if (root->nr_delalloc_inodes)
+		goto unlock;
+
+	spin_lock(&fs_info->delalloc_root_lock);
+	BUG_ON(list_empty(&root->delalloc_root));
+	list_del_init(&root->delalloc_root);
+	spin_unlock(&fs_info->delalloc_root_lock);
+unlock:
+	spin_unlock(&root->delalloc_lock);
+}
+
/*
* extent_io.c set_bit_hook, used to track delayed allocation
* bytes in this file, and to maintain the list of inodes that
* have pending delalloc work to be done.
*/
static void btrfs_set_bit_hook(struct inode *inode,
- struct extent_state *state, int *bits)
+ struct extent_state *state, unsigned long *bits)
{
/*
@@ -1551,16 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags)) {
- spin_lock(&root->fs_info->delalloc_lock);
- if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
- &root->fs_info->delalloc_inodes);
- set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags);
- }
- spin_unlock(&root->fs_info->delalloc_lock);
- }
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_add_delalloc_inodes(root, inode);
spin_unlock(&BTRFS_I(inode)->lock);
}
}
@@ -1569,7 +1617,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
* extent_io.c clear_bit_hook, see set_bit_hook for why
*/
static void btrfs_clear_bit_hook(struct inode *inode,
- struct extent_state *state, int *bits)
+ struct extent_state *state,
+ unsigned long *bits)
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
@@ -1593,7 +1642,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
btrfs_delalloc_release_metadata(inode, len);
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
- && do_list)
+ && do_list && !(state->state & EXTENT_NORESERVE))
btrfs_free_reserved_data_space(inode, len);
__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
@@ -1602,15 +1651,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
BTRFS_I(inode)->delalloc_bytes -= len;
if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags)) {
- spin_lock(&root->fs_info->delalloc_lock);
- if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_del_init(&BTRFS_I(inode)->delalloc_inodes);
- clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags);
- }
- spin_unlock(&root->fs_info->delalloc_lock);
- }
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_del_delalloc_inode(root, inode);
spin_unlock(&BTRFS_I(inode)->lock);
}
}
@@ -2252,11 +2294,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
return 0;
return PTR_ERR(root);
}
- if (btrfs_root_refs(&root->root_item) == 0) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
- /* parse ENOENT to 0 */
- return 0;
- }
/* step 2: get inode */
key.objectid = backref->inum;
@@ -2793,6 +2830,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
int ret;
struct btrfs_root *root = BTRFS_I(inode)->root;
u32 csum = ~(u32)0;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
if (PageChecked(page)) {
ClearPageChecked(page);
@@ -2819,7 +2858,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
if (ret)
goto zeroit;
- csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
+ csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
btrfs_csum_final(csum, (char *)&csum);
if (csum != private)
goto zeroit;
@@ -2829,11 +2868,11 @@ good:
return 0;
zeroit:
- printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u "
- "private %llu\n",
- (unsigned long long)btrfs_ino(page->mapping->host),
- (unsigned long long)start, csum,
- (unsigned long long)private);
+ if (__ratelimit(&_rs))
+ btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u private %llu",
+ (unsigned long long)btrfs_ino(page->mapping->host),
+ (unsigned long long)start, csum,
+ (unsigned long long)private);
memset(kaddr + offset, 1, end - start + 1);
flush_dcache_page(page);
kunmap_atomic(kaddr);
@@ -3019,7 +3058,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
* We have done the truncate/delete so we can go ahead and remove the orphan
* item for this particular inode.
*/
-int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
+static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
+ struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int delete_item = 0;
@@ -3114,8 +3154,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
*/
if (found_key.offset == last_objectid) {
- printk(KERN_ERR "btrfs: Error removing orphan entry, "
- "stopping orphan cleanup\n");
+ btrfs_err(root->fs_info,
+ "Error removing orphan entry, stopping orphan cleanup");
ret = -EINVAL;
goto out;
}
@@ -3172,8 +3212,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
ret = PTR_ERR(trans);
goto out;
}
- printk(KERN_ERR "auto deleting %Lu\n",
- found_key.objectid);
+ btrfs_debug(root->fs_info, "auto deleting %Lu",
+ found_key.objectid);
ret = btrfs_del_orphan_item(trans, root,
found_key.objectid);
BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
@@ -3201,13 +3241,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* 1 for the orphan item deletion. */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
+ iput(inode);
ret = PTR_ERR(trans);
goto out;
}
ret = btrfs_orphan_add(trans, inode);
btrfs_end_transaction(trans, root);
- if (ret)
+ if (ret) {
+ iput(inode);
goto out;
+ }
ret = btrfs_truncate(inode);
if (ret)
@@ -3237,13 +3280,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
}
if (nr_unlink)
- printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
+ btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
if (nr_truncate)
- printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
+ btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
out:
if (ret)
- printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
+ btrfs_crit(root->fs_info,
+ "could not do orphan cleanup %d", ret);
btrfs_free_path(path);
return ret;
}
@@ -3259,8 +3303,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
{
u32 nritems = btrfs_header_nritems(leaf);
struct btrfs_key found_key;
+ static u64 xattr_access = 0;
+ static u64 xattr_default = 0;
int scanned = 0;
+ if (!xattr_access) {
+ xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
+ strlen(POSIX_ACL_XATTR_ACCESS));
+ xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
+ strlen(POSIX_ACL_XATTR_DEFAULT));
+ }
+
slot++;
while (slot < nritems) {
btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -3270,8 +3323,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
return 0;
/* we found an xattr, assume we've got an acl */
- if (found_key.type == BTRFS_XATTR_ITEM_KEY)
- return 1;
+ if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (found_key.offset == xattr_access ||
+ found_key.offset == xattr_default)
+ return 1;
+ }
/*
* we found a key greater than an xattr key, there can't
@@ -3591,9 +3647,10 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
dir_ino, &index);
if (ret) {
- printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
- "inode %llu parent %llu\n", name_len, name,
- (unsigned long long)ino, (unsigned long long)dir_ino);
+ btrfs_info(root->fs_info,
+ "failed to delete reference to %.*s, inode %llu parent %llu",
+ name_len, name,
+ (unsigned long long)ino, (unsigned long long)dir_ino);
btrfs_abort_transaction(trans, root, ret);
goto err;
}
@@ -3615,6 +3672,8 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
dir, index);
if (ret == -ENOENT)
ret = 0;
+ else if (ret)
+ btrfs_abort_transaction(trans, root, ret);
err:
btrfs_free_path(path);
if (ret)
@@ -3642,53 +3701,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
}
return ret;
}
-
-
-/* helper to check if there is any shared block in the path */
-static int check_path_shared(struct btrfs_root *root,
- struct btrfs_path *path)
-{
- struct extent_buffer *eb;
- int level;
- u64 refs = 1;
-
- for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
- int ret;
-
- if (!path->nodes[level])
- break;
- eb = path->nodes[level];
- if (!btrfs_block_can_be_shared(root, eb))
- continue;
- ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
- &refs, NULL);
- if (refs > 1)
- return 1;
- }
- return 0;
-}
/*
* helper to start transaction for unlink and rmdir.
*
- * unlink and rmdir are special in btrfs, they do not always free space.
- * so in enospc case, we should make sure they will free space before
- * allowing them to use the global metadata reservation.
+ * unlink and rmdir are special in btrfs, they do not always free space, so
+ * if we cannot make our reservations the normal way try and see if there is
+ * plenty of slack room in the global reserve to migrate, otherwise we cannot
+ * allow the unlink to occur.
*/
-static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
- struct dentry *dentry)
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct btrfs_path *path;
- struct btrfs_dir_item *di;
- struct inode *inode = dentry->d_inode;
- u64 index;
- int check_link = 1;
- int err = -ENOSPC;
int ret;
- u64 ino = btrfs_ino(inode);
- u64 dir_ino = btrfs_ino(dir);
/*
* 1 for the possible orphan item
@@ -3701,158 +3727,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
return trans;
- if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
- return ERR_PTR(-ENOSPC);
-
- /* check if there is someone else holds reference */
- if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
- return ERR_PTR(-ENOSPC);
-
- if (atomic_read(&inode->i_count) > 2)
- return ERR_PTR(-ENOSPC);
-
- if (xchg(&root->fs_info->enospc_unlink, 1))
- return ERR_PTR(-ENOSPC);
-
- path = btrfs_alloc_path();
- if (!path) {
- root->fs_info->enospc_unlink = 0;
- return ERR_PTR(-ENOMEM);
- }
-
- /* 1 for the orphan item */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- btrfs_free_path(path);
- root->fs_info->enospc_unlink = 0;
- return trans;
- }
-
- path->skip_locking = 1;
- path->search_commit_root = 1;
-
- ret = btrfs_lookup_inode(trans, root, path,
- &BTRFS_I(dir)->location, 0);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- if (ret == 0) {
- if (check_path_shared(root, path))
- goto out;
- } else {
- check_link = 0;
- }
- btrfs_release_path(path);
-
- ret = btrfs_lookup_inode(trans, root, path,
- &BTRFS_I(inode)->location, 0);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- if (ret == 0) {
- if (check_path_shared(root, path))
- goto out;
- } else {
- check_link = 0;
- }
- btrfs_release_path(path);
+ if (PTR_ERR(trans) == -ENOSPC) {
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
- if (ret == 0 && S_ISREG(inode->i_mode)) {
- ret = btrfs_lookup_file_extent(trans, root, path,
- ino, (u64)-1, 0);
- if (ret < 0) {
- err = ret;
- goto out;
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return trans;
+ ret = btrfs_cond_migrate_bytes(root->fs_info,
+ &root->fs_info->trans_block_rsv,
+ num_bytes, 5);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ERR_PTR(ret);
}
- BUG_ON(ret == 0); /* Corruption */
- if (check_path_shared(root, path))
- goto out;
- btrfs_release_path(path);
- }
-
- if (!check_link) {
- err = 0;
- goto out;
- }
-
- di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
- dentry->d_name.name, dentry->d_name.len, 0);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto out;
- }
- if (di) {
- if (check_path_shared(root, path))
- goto out;
- } else {
- err = 0;
- goto out;
- }
- btrfs_release_path(path);
-
- ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
- dentry->d_name.len, ino, dir_ino, 0,
- &index);
- if (ret) {
- err = ret;
- goto out;
- }
-
- if (check_path_shared(root, path))
- goto out;
-
- btrfs_release_path(path);
-
- /*
- * This is a commit root search, if we can lookup inode item and other
- * relative items in the commit root, it means the transaction of
- * dir/file creation has been committed, and the dir index item that we
- * delay to insert has also been inserted into the commit root. So
- * we needn't worry about the delayed insertion of the dir index item
- * here.
- */
- di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
- dentry->d_name.name, dentry->d_name.len, 0);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto out;
- }
- BUG_ON(ret == -ENOENT);
- if (check_path_shared(root, path))
- goto out;
-
- err = 0;
-out:
- btrfs_free_path(path);
- /* Migrate the orphan reservation over */
- if (!err)
- err = btrfs_block_rsv_migrate(trans->block_rsv,
- &root->fs_info->global_block_rsv,
- trans->bytes_reserved);
-
- if (err) {
- btrfs_end_transaction(trans, root);
- root->fs_info->enospc_unlink = 0;
- return ERR_PTR(err);
- }
-
- trans->block_rsv = &root->fs_info->global_block_rsv;
- return trans;
-}
-
-static void __unlink_end_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
- btrfs_block_rsv_release(root, trans->block_rsv,
- trans->bytes_reserved);
trans->block_rsv = &root->fs_info->trans_block_rsv;
- BUG_ON(!root->fs_info->enospc_unlink);
- root->fs_info->enospc_unlink = 0;
+ trans->bytes_reserved = num_bytes;
}
- btrfs_end_transaction(trans, root);
+ return trans;
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3862,7 +3753,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int ret;
- trans = __unlink_start_trans(dir, dentry);
+ trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -3880,7 +3771,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
}
out:
- __unlink_end_trans(trans, root);
+ btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
return ret;
}
@@ -3977,7 +3868,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
return -EPERM;
- trans = __unlink_start_trans(dir, dentry);
+ trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -3999,7 +3890,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (!err)
btrfs_i_size_write(inode, 0);
out:
- __unlink_end_trans(trans, root);
+ btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
return err;
@@ -4175,8 +4066,7 @@ search_again:
}
size =
btrfs_file_extent_calc_inline_size(size);
- btrfs_truncate_item(trans, root, path,
- size, 1);
+ btrfs_truncate_item(root, path, size, 1);
} else if (root->ref_cows) {
inode_sub_bytes(inode, item_end + 1 -
found_key.offset);
@@ -4378,6 +4268,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
u64 hole_size;
int err = 0;
+ /*
+ * If our size started in the middle of a page we need to zero out the
+ * rest of the page before we expand the i_size, otherwise we could
+ * expose stale data.
+ */
+ err = btrfs_truncate_page(inode, oldsize, 0, 0);
+ if (err)
+ return err;
+
if (size <= hole_start)
return 0;
@@ -4450,16 +4349,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
+ hole_em->ram_bytes = hole_size;
hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
while (1) {
write_lock(&em_tree->lock);
- err = add_extent_mapping(em_tree, hole_em);
- if (!err)
- list_move(&hole_em->list,
- &em_tree->modified_extents);
+ err = add_extent_mapping(em_tree, hole_em, 1);
write_unlock(&em_tree->lock);
if (err != -EEXIST)
break;
@@ -4670,8 +4567,9 @@ void btrfs_evict_inode(struct inode *inode)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
if (ret) {
- printk(KERN_WARNING "Could not get space for a "
- "delete, will truncate on mount %d\n", ret);
+ btrfs_warn(root->fs_info,
+ "Could not get space for a delete, will truncate on mount %d",
+ ret);
btrfs_orphan_del(NULL, inode);
btrfs_free_block_rsv(root, rsv);
goto no_delete;
@@ -4712,6 +4610,7 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
no_delete:
+ btrfs_remove_delayed_node(inode);
clear_inode(inode);
return;
}
@@ -4805,11 +4704,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
goto out;
}
- if (btrfs_root_refs(&new_root->root_item) == 0) {
- err = -ENOENT;
- goto out;
- }
-
*sub_root = new_root;
location->objectid = btrfs_root_dirid(&new_root->root_item);
location->type = BTRFS_INODE_ITEM_KEY;
@@ -4827,14 +4721,13 @@ static void inode_tree_add(struct inode *inode)
struct rb_node **p;
struct rb_node *parent;
u64 ino = btrfs_ino(inode);
-again:
- p = &root->inode_tree.rb_node;
- parent = NULL;
if (inode_unhashed(inode))
return;
-
+again:
+ parent = NULL;
spin_lock(&root->inode_lock);
+ p = &root->inode_tree.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct btrfs_inode, rb_node);
@@ -5076,8 +4969,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (!(inode->i_sb->s_flags & MS_RDONLY))
ret = btrfs_orphan_cleanup(sub_root);
up_read(&root->fs_info->cleanup_work_sem);
- if (ret)
+ if (ret) {
+ iput(inode);
inode = ERR_PTR(ret);
+ }
}
return inode;
@@ -5121,10 +5016,9 @@ unsigned char btrfs_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
-static int btrfs_real_readdir(struct file *filp, void *dirent,
- filldir_t filldir)
+static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_item *item;
struct btrfs_dir_item *di;
@@ -5145,29 +5039,15 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
char tmp_name[32];
char *name_ptr;
int name_len;
- int is_curr = 0; /* filp->f_pos points to the current index? */
+ int is_curr = 0; /* ctx->pos points to the current index? */
/* FIXME, use a real flag for deciding about the key type */
if (root->fs_info->tree_root == root)
key_type = BTRFS_DIR_ITEM_KEY;
- /* special case for "." */
- if (filp->f_pos == 0) {
- over = filldir(dirent, ".", 1,
- filp->f_pos, btrfs_ino(inode), DT_DIR);
- if (over)
- return 0;
- filp->f_pos = 1;
- }
- /* special case for .., just use the back ref */
- if (filp->f_pos == 1) {
- u64 pino = parent_ino(filp->f_path.dentry);
- over = filldir(dirent, "..", 2,
- filp->f_pos, pino, DT_DIR);
- if (over)
- return 0;
- filp->f_pos = 2;
- }
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -5181,7 +5061,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
}
btrfs_set_key_type(&key, key_type);
- key.offset = filp->f_pos;
+ key.offset = ctx->pos;
key.objectid = btrfs_ino(inode);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -5207,14 +5087,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
break;
if (btrfs_key_type(&found_key) != key_type)
break;
- if (found_key.offset < filp->f_pos)
+ if (found_key.offset < ctx->pos)
goto next;
if (key_type == BTRFS_DIR_INDEX_KEY &&
btrfs_should_delete_dir_index(&del_list,
found_key.offset))
goto next;
- filp->f_pos = found_key.offset;
+ ctx->pos = found_key.offset;
is_curr = 1;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
@@ -5258,9 +5138,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
over = 0;
goto skip;
}
- over = filldir(dirent, name_ptr, name_len,
- found_key.offset, location.objectid,
- d_type);
+ over = !dir_emit(ctx, name_ptr, name_len,
+ location.objectid, d_type);
skip:
if (name_ptr != tmp_name)
@@ -5279,9 +5158,8 @@ next:
if (key_type == BTRFS_DIR_INDEX_KEY) {
if (is_curr)
- filp->f_pos++;
- ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
- &ins_list);
+ ctx->pos++;
+ ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
if (ret)
goto nopos;
}
@@ -5292,9 +5170,9 @@ next:
* 32-bit glibc will use getdents64, but then strtol -
* so the last number we can serve is this.
*/
- filp->f_pos = 0x7fffffff;
+ ctx->pos = 0x7fffffff;
else
- filp->f_pos++;
+ ctx->pos++;
nopos:
ret = 0;
err:
@@ -5335,7 +5213,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
* FIXME, needs more benchmarking...there are no reasons other than performance
* to keep or drop this code.
*/
-int btrfs_dirty_inode(struct inode *inode)
+static int btrfs_dirty_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
@@ -5977,7 +5855,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
em->block_start += start_diff;
em->block_len -= start_diff;
}
- return add_extent_mapping(em_tree, em);
+ return add_extent_mapping(em_tree, em, 0);
}
static noinline int uncompress_inline(struct btrfs_path *path,
@@ -6151,6 +6029,7 @@ again:
goto not_found_em;
}
+ em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
em->start = extent_start;
@@ -6259,18 +6138,18 @@ not_found_em:
insert:
btrfs_release_path(path);
if (em->start > start || extent_map_end(em) <= start) {
- printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
- "[%llu %llu]\n", (unsigned long long)em->start,
- (unsigned long long)em->len,
- (unsigned long long)start,
- (unsigned long long)len);
+ btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
+ (unsigned long long)em->start,
+ (unsigned long long)em->len,
+ (unsigned long long)start,
+ (unsigned long long)len);
err = -EIO;
goto out;
}
err = 0;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
+ ret = add_extent_mapping(em_tree, em, 0);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
* an overlapping map exists in the tree
@@ -6482,7 +6361,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
}
em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
- ins.offset, ins.offset, 0);
+ ins.offset, ins.offset, ins.offset, 0);
if (IS_ERR(em))
goto out;
@@ -6501,8 +6380,10 @@ out:
* returns 1 when the nocow is safe, < 0 on error, 0 if the
* block must be cow'd
*/
-static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
- struct inode *inode, u64 offset, u64 len)
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes)
{
struct btrfs_path *path;
int ret;
@@ -6516,7 +6397,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
u64 num_bytes;
int slot;
int found_type;
-
+ bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -6556,15 +6437,29 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
/* not a regular extent, must cow */
goto out;
}
+
+ if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+ goto out;
+
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- backref_offset = btrfs_file_extent_offset(leaf, fi);
+ if (disk_bytenr == 0)
+ goto out;
- extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- if (extent_end < offset + len) {
- /* extent doesn't include our full range, must cow */
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
goto out;
+
+ backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+ if (orig_start) {
+ *orig_start = key.offset - backref_offset;
+ *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
}
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+
if (btrfs_extent_readonly(root, disk_bytenr))
goto out;
@@ -6584,13 +6479,14 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
*/
disk_bytenr += backref_offset;
disk_bytenr += offset - key.offset;
- num_bytes = min(offset + len, extent_end) - offset;
+ num_bytes = min(offset + *len, extent_end) - offset;
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
goto out;
/*
* all of the above have passed, it is safe to overwrite this extent
* without cow
*/
+ *len = num_bytes;
ret = 1;
out:
btrfs_free_path(path);
@@ -6661,7 +6557,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
u64 len, u64 orig_start,
u64 block_start, u64 block_len,
- u64 orig_block_len, int type)
+ u64 orig_block_len, u64 ram_bytes,
+ int type)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
@@ -6682,6 +6579,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
em->block_start = block_start;
em->bdev = root->fs_info->fs_devices->latest_bdev;
em->orig_block_len = orig_block_len;
+ em->ram_bytes = ram_bytes;
em->generation = -1;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
if (type == BTRFS_ORDERED_PREALLOC)
@@ -6691,10 +6589,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
btrfs_drop_extent_cache(inode, em->start,
em->start + em->len - 1, 0);
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
- if (!ret)
- list_move(&em->list,
- &em_tree->modified_extents);
+ ret = add_extent_mapping(em_tree, em, 1);
write_unlock(&em_tree->lock);
} while (ret == -EEXIST);
@@ -6789,7 +6684,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
em->block_start != EXTENT_MAP_HOLE)) {
int type;
int ret;
- u64 block_start;
+ u64 block_start, orig_start, orig_block_len, ram_bytes;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
type = BTRFS_ORDERED_PREALLOC;
@@ -6807,16 +6702,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (IS_ERR(trans))
goto must_cow;
- if (can_nocow_odirect(trans, inode, start, len) == 1) {
- u64 orig_start = em->orig_start;
- u64 orig_block_len = em->orig_block_len;
-
+ if (can_nocow_extent(trans, inode, start, &len, &orig_start,
+ &orig_block_len, &ram_bytes) == 1) {
if (type == BTRFS_ORDERED_PREALLOC) {
free_extent_map(em);
em = create_pinned_em(inode, start, len,
orig_start,
block_start, len,
- orig_block_len, type);
+ orig_block_len,
+ ram_bytes, type);
if (IS_ERR(em)) {
btrfs_end_transaction(trans, root);
goto unlock_err;
@@ -6910,7 +6804,11 @@ struct btrfs_dio_private {
/* IO errors */
int errors;
+ /* orig_bio is our btrfs_io_bio */
struct bio *orig_bio;
+
+ /* dio_bio came from fs/direct-io.c */
+ struct bio *dio_bio;
};
static void btrfs_endio_direct_read(struct bio *bio, int err)
@@ -6920,6 +6818,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
struct bio_vec *bvec = bio->bi_io_vec;
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct bio *dio_bio;
u64 start;
start = dip->logical_offset;
@@ -6936,7 +6835,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
goto failed;
local_irq_save(flags);
kaddr = kmap_atomic(page);
- csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+ csum = btrfs_csum_data(kaddr + bvec->bv_offset,
csum, bvec->bv_len);
btrfs_csum_final(csum, (char *)&csum);
kunmap_atomic(kaddr);
@@ -6945,11 +6844,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
flush_dcache_page(bvec->bv_page);
if (csum != private) {
failed:
- printk(KERN_ERR "btrfs csum failed ino %llu off"
- " %llu csum %u private %u\n",
- (unsigned long long)btrfs_ino(inode),
- (unsigned long long)start,
- csum, (unsigned)private);
+ btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u private %u",
+ (unsigned long long)btrfs_ino(inode),
+ (unsigned long long)start,
+ csum, (unsigned)private);
err = -EIO;
}
}
@@ -6960,14 +6858,15 @@ failed:
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
dip->logical_offset + dip->bytes - 1);
- bio->bi_private = dip->private;
+ dio_bio = dip->dio_bio;
kfree(dip);
/* If we had a csum failure make sure to clear the uptodate flag */
if (err)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- dio_end_io(bio, err);
+ clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+ dio_end_io(dio_bio, err);
+ bio_put(bio);
}
static void btrfs_endio_direct_write(struct bio *bio, int err)
@@ -6978,6 +6877,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
struct btrfs_ordered_extent *ordered = NULL;
u64 ordered_offset = dip->logical_offset;
u64 ordered_bytes = dip->bytes;
+ struct bio *dio_bio;
int ret;
if (err)
@@ -7005,14 +6905,15 @@ out_test:
goto again;
}
out_done:
- bio->bi_private = dip->private;
+ dio_bio = dip->dio_bio;
kfree(dip);
/* If we had an error make sure to clear the uptodate flag */
if (err)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- dio_end_io(bio, err);
+ clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+ dio_end_io(dio_bio, err);
+ bio_put(bio);
}
static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
@@ -7048,10 +6949,10 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
if (!atomic_dec_and_test(&dip->pending_bios))
goto out;
- if (dip->errors)
+ if (dip->errors) {
bio_io_error(dip->orig_bio);
- else {
- set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
+ } else {
+ set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
bio_endio(dip->orig_bio, 0);
}
out:
@@ -7226,48 +7127,54 @@ out_err:
return 0;
}
-static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
- loff_t file_offset)
+static void btrfs_submit_direct(int rw, struct bio *dio_bio,
+ struct inode *inode, loff_t file_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_dio_private *dip;
- struct bio_vec *bvec = bio->bi_io_vec;
+ struct bio *io_bio;
int skip_sum;
int write = rw & REQ_WRITE;
int ret = 0;
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
+
+ if (!io_bio) {
+ ret = -ENOMEM;
+ goto free_ordered;
+ }
+
dip = kmalloc(sizeof(*dip), GFP_NOFS);
if (!dip) {
ret = -ENOMEM;
- goto free_ordered;
+ goto free_io_bio;
}
- dip->private = bio->bi_private;
+ dip->private = dio_bio->bi_private;
dip->inode = inode;
dip->logical_offset = file_offset;
-
- dip->bytes = 0;
- do {
- dip->bytes += bvec->bv_len;
- bvec++;
- } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
-
- dip->disk_bytenr = (u64)bio->bi_sector << 9;
- bio->bi_private = dip;
+ dip->bytes = dio_bio->bi_size;
+ dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
+ io_bio->bi_private = dip;
dip->errors = 0;
- dip->orig_bio = bio;
+ dip->orig_bio = io_bio;
+ dip->dio_bio = dio_bio;
atomic_set(&dip->pending_bios, 0);
if (write)
- bio->bi_end_io = btrfs_endio_direct_write;
+ io_bio->bi_end_io = btrfs_endio_direct_write;
else
- bio->bi_end_io = btrfs_endio_direct_read;
+ io_bio->bi_end_io = btrfs_endio_direct_read;
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
if (!ret)
return;
+
+free_io_bio:
+ bio_put(io_bio);
+
free_ordered:
/*
* If this is a write, we need to clean up the reserved space and kill
@@ -7283,7 +7190,7 @@ free_ordered:
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
- bio_endio(bio, ret);
+ bio_endio(dio_bio, ret);
}
static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
@@ -7347,8 +7254,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
atomic_inc(&inode->i_dio_count);
smp_mb__after_atomic_inc();
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which isn't
+ * enough if we've written compressed pages to this area, so we need to
+ * call btrfs_wait_ordered_range to make absolutely sure that any
+ * outstanding dirty pages are on disk.
+ */
+ count = iov_length(iov, nr_segs);
+ btrfs_wait_ordered_range(inode, offset, count);
+
if (rw & WRITE) {
- count = iov_length(iov, nr_segs);
/*
* If the write DIO is beyond the EOF, we need update
* the isize, but it is protected by i_mutex. So we can
@@ -7425,8 +7340,8 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
}
-int btrfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int btrfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
struct extent_io_tree *tree;
@@ -7467,7 +7382,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
}
-static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+static void btrfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct extent_io_tree *tree;
@@ -7667,16 +7583,12 @@ static int btrfs_truncate(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
- int ret;
+ int ret = 0;
int err = 0;
struct btrfs_trans_handle *trans;
u64 mask = root->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
- ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
- if (ret)
- return ret;
-
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
@@ -7934,15 +7846,15 @@ void btrfs_destroy_inode(struct inode *inode)
*/
smp_mb();
if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
list_del_init(&BTRFS_I(inode)->ordered_operations);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
}
if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags)) {
- printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
- (unsigned long long)btrfs_ino(inode));
+ btrfs_info(root->fs_info, "inode %llu still on the orphan list",
+ (unsigned long long)btrfs_ino(inode));
atomic_dec(&root->orphan_inodes);
}
@@ -7951,10 +7863,9 @@ void btrfs_destroy_inode(struct inode *inode)
if (!ordered)
break;
else {
- printk(KERN_ERR "btrfs found ordered "
- "extent %llu %llu on inode cleanup\n",
- (unsigned long long)ordered->file_offset,
- (unsigned long long)ordered->len);
+ btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
+ (unsigned long long)ordered->file_offset,
+ (unsigned long long)ordered->len);
btrfs_remove_ordered_extent(inode, ordered);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
@@ -7963,7 +7874,6 @@ void btrfs_destroy_inode(struct inode *inode)
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
- btrfs_remove_delayed_node(inode);
call_rcu(&inode->i_rcu, btrfs_i_callback);
}
@@ -7971,6 +7881,9 @@ int btrfs_drop_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ if (root == NULL)
+ return 1;
+
/* the snap/subvol tree is on deleting */
if (btrfs_root_refs(&root->root_item) == 0 &&
root != root->fs_info->tree_root)
@@ -8305,7 +8218,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -8314,30 +8227,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
struct list_head splice;
int ret = 0;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
-
INIT_LIST_HEAD(&works);
INIT_LIST_HEAD(&splice);
- spin_lock(&root->fs_info->delalloc_lock);
- list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+ spin_lock(&root->delalloc_lock);
+ list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
binode = list_entry(splice.next, struct btrfs_inode,
delalloc_inodes);
- list_del_init(&binode->delalloc_inodes);
-
+ list_move_tail(&binode->delalloc_inodes,
+ &root->delalloc_inodes);
inode = igrab(&binode->vfs_inode);
if (!inode) {
- clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &binode->runtime_flags);
+ cond_resched_lock(&root->delalloc_lock);
continue;
}
-
- list_add_tail(&binode->delalloc_inodes,
- &root->fs_info->delalloc_inodes);
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&root->delalloc_lock);
work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
if (unlikely(!work)) {
@@ -8349,16 +8255,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
&work->work);
cond_resched();
- spin_lock(&root->fs_info->delalloc_lock);
+ spin_lock(&root->delalloc_lock);
}
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&root->delalloc_lock);
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
btrfs_wait_and_free_delalloc_work(work);
}
+ return 0;
+out:
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
+
+ if (!list_empty_careful(&splice)) {
+ spin_lock(&root->delalloc_lock);
+ list_splice_tail(&splice, &root->delalloc_inodes);
+ spin_unlock(&root->delalloc_lock);
+ }
+ return ret;
+}
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+ int ret;
+
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
- /* the filemap_flush will queue IO into the worker threads, but
+ ret = __start_delalloc_inodes(root, delay_iput);
+ /*
+ * the filemap_flush will queue IO into the worker threads, but
* we have to make sure the IO is actually started and that
* ordered extents get created before we return
*/
@@ -8370,17 +8299,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
atomic_read(&root->fs_info->async_delalloc_pages) == 0));
}
atomic_dec(&root->fs_info->async_submit_draining);
- return 0;
-out:
- list_for_each_entry_safe(work, next, &works, list) {
- list_del_init(&work->list);
- btrfs_wait_and_free_delalloc_work(work);
+ return ret;
+}
+
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+ int delay_iput)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+ int ret;
+
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_init(&fs_info->delalloc_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ delalloc_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ list_move_tail(&root->delalloc_root,
+ &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ ret = __start_delalloc_inodes(root, delay_iput);
+ btrfs_put_fs_root(root);
+ if (ret)
+ goto out;
+
+ spin_lock(&fs_info->delalloc_root_lock);
}
+ spin_unlock(&fs_info->delalloc_root_lock);
+ atomic_inc(&fs_info->async_submit_draining);
+ while (atomic_read(&fs_info->nr_async_submits) ||
+ atomic_read(&fs_info->async_delalloc_pages)) {
+ wait_event(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_submits) == 0 &&
+ atomic_read(&fs_info->async_delalloc_pages) == 0));
+ }
+ atomic_dec(&fs_info->async_submit_draining);
+ return 0;
+out:
if (!list_empty_careful(&splice)) {
- spin_lock(&root->fs_info->delalloc_lock);
- list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_tail(&splice, &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
}
return ret;
}
@@ -8571,16 +8538,14 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->block_start = ins.objectid;
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
+ em->ram_bytes = ins.offset;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
em->generation = trans->transid;
while (1) {
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
- if (!ret)
- list_move(&em->list,
- &em_tree->modified_extents);
+ ret = add_extent_mapping(em_tree, em, 1);
write_unlock(&em_tree->lock);
if (ret != -EEXIST)
break;
@@ -8689,7 +8654,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
static const struct file_operations btrfs_dir_file_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = btrfs_real_readdir,
+ .iterate = btrfs_real_readdir,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2c02310ff2d9..238a05545ee2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!root->ref_cows)
return -EINVAL;
+ ret = btrfs_start_delalloc_inodes(root, 0);
+ if (ret)
+ return ret;
+
+ btrfs_wait_ordered_extents(root, 0);
+
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
if (!pending_snapshot)
return -ENOMEM;
@@ -723,7 +729,9 @@ static noinline int btrfs_mksubvol(struct path *parent,
struct dentry *dentry;
int error;
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ if (error == -EINTR)
+ return error;
dentry = lookup_one_len(name, parent->dentry, namelen);
error = PTR_ERR(dentry);
@@ -1152,8 +1160,11 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
u64 new_align = ~((u64)128 * 1024 - 1);
struct page **pages = NULL;
- if (extent_thresh == 0)
- extent_thresh = 256 * 1024;
+ if (isize == 0)
+ return 0;
+
+ if (range->start >= isize)
+ return -EINVAL;
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
if (range->compress_type > BTRFS_COMPRESS_TYPES)
@@ -1162,8 +1173,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
compress_type = range->compress_type;
}
- if (isize == 0)
- return 0;
+ if (extent_thresh == 0)
+ extent_thresh = 256 * 1024;
/*
* if we were not given a file, allocate a readahead
@@ -1796,7 +1807,11 @@ static noinline int copy_to_sk(struct btrfs_root *root,
item_off = btrfs_item_ptr_offset(leaf, i);
item_len = btrfs_item_size_nr(leaf, i);
- if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+ btrfs_item_key_to_cpu(leaf, key, i);
+ if (!key_in_sk(key, sk))
+ continue;
+
+ if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
item_len = 0;
if (sizeof(sh) + item_len + *sk_offset >
@@ -1805,10 +1820,6 @@ static noinline int copy_to_sk(struct btrfs_root *root,
goto overflow;
}
- btrfs_item_key_to_cpu(leaf, key, i);
- if (!key_in_sk(key, sk))
- continue;
-
sh.objectid = key->objectid;
sh.offset = key->offset;
sh.type = key->type;
@@ -2086,7 +2097,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (err)
goto out;
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ if (err == -EINTR)
+ goto out;
dentry = lookup_one_len(vol_args->name, parent, namelen);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
@@ -2347,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
- 1)) {
- pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
- mnt_drop_write_file(file);
- return -EINVAL;
- }
-
- mutex_lock(&root->fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -2362,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
}
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- ret = btrfs_rm_device(root, vol_args->name);
- kfree(vol_args);
-out:
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out;
+ }
+
+ mutex_lock(&root->fs_info->volume_mutex);
+ ret = btrfs_rm_device(root, vol_args->name);
mutex_unlock(&root->fs_info->volume_mutex);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+
+out:
+ kfree(vol_args);
mnt_drop_write_file(file);
return ret;
}
@@ -2425,7 +2438,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
mutex_lock(&fs_devices->device_list_mutex);
dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
- mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
ret = -ENODEV;
@@ -2449,6 +2461,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
}
out:
+ mutex_unlock(&fs_devices->device_list_mutex);
if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
ret = -EFAULT;
@@ -2473,6 +2486,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
int ret;
u64 len = olen;
u64 bs = root->fs_info->sb->s_blocksize;
+ int same_inode = 0;
/*
* TODO:
@@ -2509,7 +2523,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
ret = -EINVAL;
if (src == inode)
- goto out_fput;
+ same_inode = 1;
/* the src must be open for reading */
if (!(src_file.file->f_mode & FMODE_READ))
@@ -2540,12 +2554,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
}
path->reada = 2;
- if (inode < src) {
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+ if (!same_inode) {
+ if (inode < src) {
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+ } else {
+ mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ }
} else {
- mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ mutex_lock(&src->i_mutex);
}
/* determine range to clone */
@@ -2563,6 +2581,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
!IS_ALIGNED(destoff, bs))
goto out_unlock;
+ /* verify if ranges are overlapped within the same file */
+ if (same_inode) {
+ if (destoff + len > off && destoff < off + len)
+ goto out_unlock;
+ }
+
if (destoff > inode->i_size) {
ret = btrfs_cont_expand(inode, inode->i_size, destoff);
if (ret)
@@ -2839,7 +2863,8 @@ out:
unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
out_unlock:
mutex_unlock(&src->i_mutex);
- mutex_unlock(&inode->i_mutex);
+ if (!same_inode)
+ mutex_unlock(&inode->i_mutex);
vfree(buf);
btrfs_free_path(path);
out_fput:
@@ -2944,11 +2969,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
goto out;
}
- if (btrfs_root_refs(&new_root->root_item) == 0) {
- ret = -ENOENT;
- goto out;
- }
-
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -3003,7 +3023,7 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
}
}
-long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
{
struct btrfs_ioctl_space_args space_args;
struct btrfs_ioctl_space_info space;
@@ -3693,12 +3713,11 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
goto drop_write;
}
- if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
- trans = btrfs_start_transaction(root, 2);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out;
- }
+ down_write(&root->fs_info->subvol_sem);
+ trans = btrfs_start_transaction(root->fs_info->tree_root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
}
switch (sa->cmd) {
@@ -3708,24 +3727,17 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
case BTRFS_QUOTA_CTL_DISABLE:
ret = btrfs_quota_disable(trans, root->fs_info);
break;
- case BTRFS_QUOTA_CTL_RESCAN:
- ret = btrfs_quota_rescan(root->fs_info);
- break;
default:
ret = -EINVAL;
break;
}
- if (copy_to_user(arg, sa, sizeof(*sa)))
- ret = -EFAULT;
-
- if (trans) {
- err = btrfs_commit_transaction(trans, root);
- if (err && !ret)
- ret = err;
- }
+ err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
+ if (err && !ret)
+ ret = err;
out:
kfree(sa);
+ up_write(&root->fs_info->subvol_sem);
drop_write:
mnt_drop_write_file(file);
return ret;
@@ -3877,6 +3889,74 @@ drop_write:
return ret;
}
+/*
+ * Ioctl handler for BTRFS_IOC_QUOTA_RESCAN.
+ *
+ * Requires CAP_SYS_ADMIN and write access to the mount behind @file.
+ * @arg is a user pointer to a struct btrfs_ioctl_quota_rescan_args; a
+ * non-zero ->flags is rejected with -EINVAL.  Otherwise the return value
+ * of btrfs_qgroup_rescan() is handed back to the caller.
+ */
+static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
+{
+	struct btrfs_ioctl_quota_rescan_args *args;
+	struct btrfs_root *root;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	err = mnt_want_write_file(file);
+	if (err)
+		return err;
+
+	args = memdup_user(arg, sizeof(*args));
+	if (IS_ERR(args)) {
+		/* nothing was allocated, only the write access is dropped */
+		err = PTR_ERR(args);
+	} else {
+		if (args->flags) {
+			err = -EINVAL;
+		} else {
+			root = BTRFS_I(file_inode(file))->root;
+			err = btrfs_qgroup_rescan(root->fs_info);
+		}
+		kfree(args);
+	}
+
+	mnt_drop_write_file(file);
+	return err;
+}
+
+/*
+ * Ioctl handler for BTRFS_IOC_QUOTA_RESCAN_STATUS.
+ *
+ * Requires CAP_SYS_ADMIN.  A zeroed struct btrfs_ioctl_quota_rescan_args
+ * is copied to @arg; when BTRFS_QGROUP_STATUS_FLAG_RESCAN is set in
+ * fs_info->qgroup_flags, ->flags is set to 1 and ->progress to the
+ * objectid of fs_info->qgroup_rescan_progress.
+ *
+ * Returns 0, -EPERM, -ENOMEM or -EFAULT.
+ */
+static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_ioctl_quota_rescan_args *status;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	status = kzalloc(sizeof(*status), GFP_NOFS);
+	if (!status)
+		return -ENOMEM;
+
+	if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+		status->progress =
+			root->fs_info->qgroup_rescan_progress.objectid;
+		status->flags = 1;
+	}
+
+	ret = copy_to_user(arg, status, sizeof(*status)) ? -EFAULT : 0;
+
+	kfree(status);
+	return ret;
+}
+
+/*
+ * Ioctl handler for BTRFS_IOC_QUOTA_RESCAN_WAIT.
+ *
+ * Requires CAP_SYS_ADMIN; @arg is not used.  Returns -EPERM or whatever
+ * btrfs_qgroup_wait_for_completion() returns for the filesystem that
+ * @file lives on.
+ */
+static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+{
+	/* file_inode(), as in the other quota rescan handlers of this file */
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btrfs_qgroup_wait_for_completion(root->fs_info);
+}
+
static long btrfs_ioctl_set_received_subvol(struct file *file,
void __user *arg)
{
@@ -3960,7 +4040,7 @@ out:
static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
const char *label = root->fs_info->super_copy->label;
size_t len = strnlen(label, BTRFS_LABEL_SIZE);
int ret;
@@ -3979,7 +4059,7 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_super_block *super_block = root->fs_info->super_copy;
struct btrfs_trans_handle *trans;
char label[BTRFS_LABEL_SIZE];
@@ -4115,6 +4195,12 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_qgroup_create(file, argp);
case BTRFS_IOC_QGROUP_LIMIT:
return btrfs_ioctl_qgroup_limit(file, argp);
+ case BTRFS_IOC_QUOTA_RESCAN:
+ return btrfs_ioctl_quota_rescan(file, argp);
+ case BTRFS_IOC_QUOTA_RESCAN_STATUS:
+ return btrfs_ioctl_quota_rescan_status(file, argp);
+ case BTRFS_IOC_QUOTA_RESCAN_WAIT:
+ return btrfs_ioctl_quota_rescan_wait(file, argp);
case BTRFS_IOC_DEV_REPLACE:
return btrfs_ioctl_dev_replace(root, argp);
case BTRFS_IOC_GET_FSLABEL:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index e95df435d897..01277b8f2373 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -24,7 +24,7 @@
#include "extent_io.h"
#include "locking.h"
-void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
/*
* if we currently have a spinning reader or writer lock
@@ -264,7 +264,7 @@ void btrfs_assert_tree_locked(struct extent_buffer *eb)
BUG_ON(!atomic_read(&eb->write_locks));
}
-void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
{
BUG_ON(!atomic_read(&eb->read_locks));
}
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 743b86fa4fcb..f93151a98886 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -31,8 +31,8 @@
struct workspace {
void *mem;
- void *buf; /* where compressed data goes */
- void *cbuf; /* where decompressed data goes */
+ void *buf; /* where decompressed data goes */
+ void *cbuf; /* where compressed data goes */
struct list_head list;
};
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 005c45db699e..81369827e514 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -24,6 +24,7 @@
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
+#include "disk-io.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int dio, int compress_type)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
@@ -227,10 +229,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
ordered_data_tree_panic(inode, -EEXIST, file_offset);
spin_unlock_irq(&tree->lock);
- spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
- &BTRFS_I(inode)->root->fs_info->ordered_extents);
- spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ &root->ordered_extents);
+ root->nr_ordered_extents++;
+ if (root->nr_ordered_extents == 1) {
+ spin_lock(&root->fs_info->ordered_root_lock);
+ BUG_ON(!list_empty(&root->ordered_root));
+ list_add_tail(&root->ordered_root,
+ &root->fs_info->ordered_roots);
+ spin_unlock(&root->fs_info->ordered_root_lock);
+ }
+ spin_unlock(&root->ordered_extent_lock);
return 0;
}
@@ -516,8 +526,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
spin_unlock_irq(&tree->lock);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
+ root->nr_ordered_extents--;
trace_btrfs_ordered_extent_remove(inode, entry);
@@ -530,7 +541,14 @@ void btrfs_remove_ordered_extent(struct inode *inode,
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
list_del_init(&BTRFS_I(inode)->ordered_operations);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ if (!root->nr_ordered_extents) {
+ spin_lock(&root->fs_info->ordered_root_lock);
+ BUG_ON(list_empty(&root->ordered_root));
+ list_del_init(&root->ordered_root);
+ spin_unlock(&root->fs_info->ordered_root_lock);
+ }
+ spin_unlock(&root->ordered_extent_lock);
wake_up(&entry->wait);
}
@@ -550,7 +568,6 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
{
struct list_head splice, works;
- struct list_head *cur;
struct btrfs_ordered_extent *ordered, *next;
struct inode *inode;
@@ -558,35 +575,34 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
INIT_LIST_HEAD(&works);
mutex_lock(&root->fs_info->ordered_operations_mutex);
- spin_lock(&root->fs_info->ordered_extent_lock);
- list_splice_init(&root->fs_info->ordered_extents, &splice);
+ spin_lock(&root->ordered_extent_lock);
+ list_splice_init(&root->ordered_extents, &splice);
while (!list_empty(&splice)) {
- cur = splice.next;
- ordered = list_entry(cur, struct btrfs_ordered_extent,
- root_extent_list);
- list_del_init(&ordered->root_extent_list);
- atomic_inc(&ordered->refs);
-
+ ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
+ root_extent_list);
+ list_move_tail(&ordered->root_extent_list,
+ &root->ordered_extents);
/*
* the inode may be getting freed (in sys_unlink path).
*/
inode = igrab(ordered->inode);
+ if (!inode) {
+ cond_resched_lock(&root->ordered_extent_lock);
+ continue;
+ }
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ atomic_inc(&ordered->refs);
+ spin_unlock(&root->ordered_extent_lock);
- if (inode) {
- ordered->flush_work.func = btrfs_run_ordered_extent_work;
- list_add_tail(&ordered->work_list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &ordered->flush_work);
- } else {
- btrfs_put_ordered_extent(ordered);
- }
+ ordered->flush_work.func = btrfs_run_ordered_extent_work;
+ list_add_tail(&ordered->work_list, &works);
+ btrfs_queue_worker(&root->fs_info->flush_workers,
+ &ordered->flush_work);
cond_resched();
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->ordered_extent_lock);
list_for_each_entry_safe(ordered, next, &works, work_list) {
list_del_init(&ordered->work_list);
@@ -604,6 +620,33 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
mutex_unlock(&root->fs_info->ordered_operations_mutex);
}
+/*
+ * Call btrfs_wait_ordered_extents() for every root that is linked on
+ * fs_info->ordered_roots when this function starts.
+ *
+ * The list is spliced onto a private head under ordered_root_lock.  Each
+ * root is put back on fs_info->ordered_roots before the lock is dropped
+ * for the wait, and the wait is bracketed by btrfs_grab_fs_root() /
+ * btrfs_put_fs_root() (presumably pinning the root meanwhile).
+ * @delay_iput is passed through unchanged.
+ */
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
+				    int delay_iput)
+{
+	struct list_head pending;
+	struct btrfs_root *cur;
+
+	INIT_LIST_HEAD(&pending);
+
+	spin_lock(&fs_info->ordered_root_lock);
+	list_splice_init(&fs_info->ordered_roots, &pending);
+	for (;;) {
+		if (list_empty(&pending))
+			break;
+
+		cur = list_first_entry(&pending, struct btrfs_root,
+				       ordered_root);
+		cur = btrfs_grab_fs_root(cur);
+		BUG_ON(!cur);
+		list_move_tail(&cur->ordered_root, &fs_info->ordered_roots);
+		spin_unlock(&fs_info->ordered_root_lock);
+
+		/* may sleep, hence done without the spinlock */
+		btrfs_wait_ordered_extents(cur, delay_iput);
+		btrfs_put_fs_root(cur);
+
+		spin_lock(&fs_info->ordered_root_lock);
+	}
+	spin_unlock(&fs_info->ordered_root_lock);
+}
+
/*
* this is used during transaction commit to write all the inodes
* added to the ordered operation list. These files must be fully on
@@ -629,7 +672,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&works);
mutex_lock(&root->fs_info->ordered_operations_mutex);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
list_splice_init(&cur_trans->ordered_operations, &splice);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
@@ -648,17 +691,17 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
if (!wait)
list_add_tail(&BTRFS_I(inode)->ordered_operations,
&cur_trans->ordered_operations);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
work = btrfs_alloc_delalloc_work(inode, wait, 1);
if (!work) {
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations))
list_add_tail(&btrfs_inode->ordered_operations,
&splice);
list_splice_tail(&splice,
&cur_trans->ordered_operations);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
ret = -ENOMEM;
goto out;
}
@@ -667,9 +710,9 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
&work->work);
cond_resched();
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
@@ -986,39 +1029,42 @@ out:
* be reclaimed before their checksum is actually put into the btree
*/
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u32 *sum)
+ u32 *sum, int len)
{
struct btrfs_ordered_sum *ordered_sum;
- struct btrfs_sector_sum *sector_sums;
struct btrfs_ordered_extent *ordered;
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
unsigned long num_sectors;
unsigned long i;
u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
- int ret = 1;
+ int index = 0;
+ /*
+ * Copy up to @len checksums into @sum, starting with the one for
+ * @disk_bytenr, and return how many were copied (0 when no ordered
+ * extent is found at @offset).
+ */
ordered = btrfs_lookup_ordered_extent(inode, offset);
if (!ordered)
- return 1;
+ return 0;
spin_lock_irq(&tree->lock);
list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
- if (disk_bytenr >= ordered_sum->bytenr) {
- num_sectors = ordered_sum->len / sectorsize;
- sector_sums = ordered_sum->sums;
- for (i = 0; i < num_sectors; i++) {
- if (sector_sums[i].bytenr == disk_bytenr) {
- *sum = sector_sums[i].sum;
- ret = 0;
- goto out;
- }
- }
+ if (disk_bytenr >= ordered_sum->bytenr &&
+ disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
+ i = (disk_bytenr - ordered_sum->bytenr) >>
+ inode->i_sb->s_blocksize_bits;
+ num_sectors = ordered_sum->len >>
+ inode->i_sb->s_blocksize_bits;
+ num_sectors = min_t(int, len - index, num_sectors - i);
+ /*
+ * num_sectors counts u32 checksums, memcpy() wants bytes:
+ * scale it, or only a quarter of the sums gets copied.
+ */
+ memcpy(sum + index, ordered_sum->sums + i,
+ num_sectors * sizeof(u32));
+
+ index += (int)num_sectors;
+ if (index == len)
+ goto out;
+ disk_bytenr += num_sectors * sectorsize;
}
}
out:
spin_unlock_irq(&tree->lock);
btrfs_put_ordered_extent(ordered);
- return ret;
+ return index;
}
@@ -1049,12 +1095,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
if (last_mod < root->fs_info->last_trans_committed)
return;
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
list_add_tail(&BTRFS_I(inode)->ordered_operations,
&cur_trans->ordered_operations);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
}
int __init ordered_data_init(void)
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 8eadfe406cdd..68844d59ee6f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree {
struct rb_node *last;
};
-/*
- * these are used to collect checksums done just before bios submission.
- * They are attached via a list into the ordered extent, and
- * checksum items are inserted into the tree after all the blocks in
- * the ordered extent are on disk
- */
-struct btrfs_sector_sum {
- /* bytenr on disk */
- u64 bytenr;
- u32 sum;
-};
-
struct btrfs_ordered_sum {
/* bytenr is the start of this extent on disk */
u64 bytenr;
@@ -45,10 +33,10 @@ struct btrfs_ordered_sum {
/*
* this is the length in bytes covered by the sums array below.
*/
- unsigned long len;
+ int len;
struct list_head list;
- /* last field is a variable length array of btrfs_sector_sums */
- struct btrfs_sector_sum sums[];
+ /* last field is a variable length array of csums */
+ u32 sums[];
};
/*
@@ -149,11 +137,8 @@ struct btrfs_ordered_extent {
static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
unsigned long bytes)
{
- unsigned long num_sectors = (bytes + root->sectorsize - 1) /
- root->sectorsize;
- num_sectors++;
- return sizeof(struct btrfs_ordered_sum) +
- num_sectors * sizeof(struct btrfs_sector_sum);
+ int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
+ return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
}
static inline void
@@ -196,13 +181,16 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
u64 len);
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+ u32 *sum, int len);
int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int wait);
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
+ int delay_iput);
void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 920957ecb27e..dc0024f17c1f 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -176,7 +176,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
nr = btrfs_header_nritems(l);
- printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
+ btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d",
(unsigned long long)btrfs_header_bytenr(l), nr,
btrfs_leaf_free_space(root, l));
for (i = 0 ; i < nr ; i++) {
@@ -319,10 +319,9 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
btrfs_print_leaf(root, c);
return;
}
- printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
- (unsigned long long)btrfs_header_bytenr(c),
- level, nr,
- (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
+ btrfs_info(root->fs_info, "node %llu level %d total ptrs %d free spc %u",
+ (unsigned long long)btrfs_header_bytenr(c),
+ level, nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
for (i = 0; i < nr; i++) {
btrfs_node_key_to_cpu(c, &key, i);
printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index da75efe534d5..7faddfacc5bd 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -19,5 +19,5 @@
#ifndef __PRINT_TREE_
#define __PRINT_TREE_
void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
-void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b44124dd2370..1280eff8af56 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -31,13 +31,13 @@
#include "locking.h"
#include "ulist.h"
#include "backref.h"
+#include "extent_io.h"
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
* - reorganize keys
* - compressed
* - sync
- * - rescan
* - copy also limits on subvol creation
* - limit
* - caches fuer ulists
@@ -98,7 +98,12 @@ struct btrfs_qgroup_list {
struct btrfs_qgroup *member;
};
-/* must be called with qgroup_lock held */
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+ int init_flags);
+static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
+
+/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
u64 qgroupid)
{
@@ -247,10 +252,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
int slot;
int ret = 0;
u64 flags = 0;
+ u64 rescan_progress = 0;
if (!fs_info->quota_enabled)
return 0;
+ fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -298,7 +310,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
}
fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
ptr);
- /* FIXME read scan element */
+ rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
goto next1;
}
@@ -400,9 +412,18 @@ out:
if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
+ } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
+ ret >= 0) {
+ ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
}
btrfs_free_path(path);
+ if (ret < 0) {
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ }
+
return ret < 0 ? ret : 0;
}
@@ -420,8 +441,6 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
- WARN_ON(!list_empty(&qgroup->dirty));
-
while (!list_empty(&qgroup->groups)) {
list = list_first_entry(&qgroup->groups,
struct btrfs_qgroup_list,
@@ -441,6 +460,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
}
kfree(qgroup);
}
+ ulist_free(fs_info->qgroup_ulist);
}
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
@@ -721,7 +741,8 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
- /* XXX scan */
+ btrfs_set_qgroup_status_rescan(l, ptr,
+ fs_info->qgroup_rescan_progress.objectid);
btrfs_mark_buffer_dirty(l);
@@ -783,19 +804,27 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
struct btrfs_root *quota_root;
+ struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_path *path = NULL;
struct btrfs_qgroup_status_item *ptr;
struct extent_buffer *leaf;
struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_qgroup *qgroup = NULL;
int ret = 0;
+ int slot;
- spin_lock(&fs_info->qgroup_lock);
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (fs_info->quota_root) {
fs_info->pending_quota_state = 1;
- spin_unlock(&fs_info->qgroup_lock);
goto out;
}
- spin_unlock(&fs_info->qgroup_lock);
+
+ fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
/*
* initially create the quota tree
@@ -830,10 +859,57 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
- btrfs_set_qgroup_status_scan(leaf, ptr, 0);
+ btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
btrfs_mark_buffer_dirty(leaf);
+ key.objectid = 0;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = 0;
+
+ btrfs_release_path(path);
+ ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
+ if (ret > 0)
+ goto out_add_root;
+ if (ret < 0)
+ goto out_free_path;
+
+
+ while (1) {
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.type == BTRFS_ROOT_REF_KEY) {
+ ret = add_qgroup_item(trans, quota_root,
+ found_key.offset);
+ if (ret)
+ goto out_free_path;
+
+ qgroup = add_qgroup_rb(fs_info, found_key.offset);
+ if (IS_ERR(qgroup)) {
+ ret = PTR_ERR(qgroup);
+ goto out_free_path;
+ }
+ }
+ ret = btrfs_next_item(tree_root, path);
+ if (ret < 0)
+ goto out_free_path;
+ if (ret)
+ break;
+ }
+
+out_add_root:
+ btrfs_release_path(path);
+ ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
+ if (ret)
+ goto out_free_path;
+
+ qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
+ if (IS_ERR(qgroup)) {
+ ret = PTR_ERR(qgroup);
+ goto out_free_path;
+ }
spin_lock(&fs_info->qgroup_lock);
fs_info->quota_root = quota_root;
fs_info->pending_quota_state = 1;
@@ -847,6 +923,11 @@ out_free_root:
kfree(quota_root);
}
out:
+ if (ret) {
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+ }
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
@@ -857,11 +938,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
struct btrfs_root *quota_root;
int ret = 0;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_root)
+ goto out;
spin_lock(&fs_info->qgroup_lock);
- if (!fs_info->quota_root) {
- spin_unlock(&fs_info->qgroup_lock);
- return 0;
- }
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
quota_root = fs_info->quota_root;
@@ -869,8 +949,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
btrfs_free_qgroup_config(fs_info);
spin_unlock(&fs_info->qgroup_lock);
- if (!quota_root)
- return -EINVAL;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
ret = btrfs_clean_quota_tree(trans, quota_root);
if (ret)
@@ -891,39 +973,62 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
free_extent_buffer(quota_root->commit_root);
kfree(quota_root);
out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
-int btrfs_quota_rescan(struct btrfs_fs_info *fs_info)
+static void qgroup_dirty(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
{
- /* FIXME */
- return 0;
+ if (list_empty(&qgroup->dirty))
+ list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 src, u64 dst)
{
struct btrfs_root *quota_root;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup_list *list;
int ret = 0;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
- if (!quota_root)
- return -EINVAL;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+ member = find_qgroup_rb(fs_info, src);
+ parent = find_qgroup_rb(fs_info, dst);
+ if (!member || !parent) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* first check whether this qgroup relation already exists */
+ list_for_each_entry(list, &member->groups, next_group) {
+ if (list->group == parent) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
ret = add_qgroup_relation_item(trans, quota_root, src, dst);
if (ret)
- return ret;
+ goto out;
ret = add_qgroup_relation_item(trans, quota_root, dst, src);
if (ret) {
del_qgroup_relation_item(trans, quota_root, src, dst);
- return ret;
+ goto out;
}
spin_lock(&fs_info->qgroup_lock);
ret = add_relation_rb(quota_root->fs_info, src, dst);
spin_unlock(&fs_info->qgroup_lock);
-
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
@@ -931,13 +1036,34 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 src, u64 dst)
{
struct btrfs_root *quota_root;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup_list *list;
int ret = 0;
int err;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
- if (!quota_root)
- return -EINVAL;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+ member = find_qgroup_rb(fs_info, src);
+ parent = find_qgroup_rb(fs_info, dst);
+ if (!member || !parent) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* first check whether this qgroup relation exists */
+ list_for_each_entry(list, &member->groups, next_group) {
+ if (list->group == parent)
+ goto exist;
+ }
+ ret = -ENOENT;
+ goto out;
+exist:
ret = del_qgroup_relation_item(trans, quota_root, src, dst);
err = del_qgroup_relation_item(trans, quota_root, dst, src);
if (err && !ret)
@@ -945,9 +1071,9 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
spin_lock(&fs_info->qgroup_lock);
del_relation_rb(fs_info, src, dst);
-
spin_unlock(&fs_info->qgroup_lock);
-
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
@@ -958,11 +1084,21 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_qgroup *qgroup;
int ret = 0;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
- if (!quota_root)
- return -EINVAL;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (qgroup) {
+ ret = -EEXIST;
+ goto out;
+ }
ret = add_qgroup_item(trans, quota_root, qgroupid);
+ if (ret)
+ goto out;
spin_lock(&fs_info->qgroup_lock);
qgroup = add_qgroup_rb(fs_info, qgroupid);
@@ -970,7 +1106,8 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
if (IS_ERR(qgroup))
ret = PTR_ERR(qgroup);
-
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
@@ -981,27 +1118,32 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_qgroup *qgroup;
int ret = 0;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
- if (!quota_root)
- return -EINVAL;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
- /* check if there are no relations to this qgroup */
- spin_lock(&fs_info->qgroup_lock);
qgroup = find_qgroup_rb(fs_info, qgroupid);
- if (qgroup) {
- if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) {
- spin_unlock(&fs_info->qgroup_lock);
- return -EBUSY;
+ if (!qgroup) {
+ ret = -ENOENT;
+ goto out;
+ } else {
+ /* check if there are no relations to this qgroup */
+ if (!list_empty(&qgroup->groups) ||
+ !list_empty(&qgroup->members)) {
+ ret = -EBUSY;
+ goto out;
}
}
- spin_unlock(&fs_info->qgroup_lock);
-
ret = del_qgroup_item(trans, quota_root, qgroupid);
spin_lock(&fs_info->qgroup_lock);
del_qgroup_rb(quota_root->fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
-
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
@@ -1009,13 +1151,22 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 qgroupid,
struct btrfs_qgroup_limit *limit)
{
- struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
int ret = 0;
- if (!quota_root)
- return -EINVAL;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup) {
+ ret = -ENOENT;
+ goto out;
+ }
ret = update_qgroup_limit_item(trans, quota_root, qgroupid,
limit->flags, limit->max_rfer,
limit->max_excl, limit->rsv_rfer,
@@ -1027,31 +1178,17 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
}
spin_lock(&fs_info->qgroup_lock);
-
- qgroup = find_qgroup_rb(fs_info, qgroupid);
- if (!qgroup) {
- ret = -ENOENT;
- goto unlock;
- }
qgroup->lim_flags = limit->flags;
qgroup->max_rfer = limit->max_rfer;
qgroup->max_excl = limit->max_excl;
qgroup->rsv_rfer = limit->rsv_rfer;
qgroup->rsv_excl = limit->rsv_excl;
-
-unlock:
spin_unlock(&fs_info->qgroup_lock);
-
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
-static void qgroup_dirty(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup *qgroup)
-{
- if (list_empty(&qgroup->dirty))
- list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
-}
-
/*
* btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts
* the modification into a list that's later used by btrfs_end_transaction to
@@ -1075,6 +1212,144 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
return 0;
}
+static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,
+ struct ulist *roots, struct ulist *tmp,
+ u64 seq)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct ulist_node *tmp_unode;
+ struct ulist_iterator tmp_uiter;
+ struct btrfs_qgroup *qg;
+ int ret;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ qg = find_qgroup_rb(fs_info, unode->val);
+ if (!qg)
+ continue;
+
+ ulist_reinit(tmp);
+ /* XXX id not needed */
+ ret = ulist_add(tmp, qg->qgroupid,
+ (u64)(uintptr_t)qg, GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ULIST_ITER_INIT(&tmp_uiter);
+ while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
+ if (qg->refcnt < seq)
+ qg->refcnt = seq + 1;
+ else
+ ++qg->refcnt;
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ (u64)(uintptr_t)glist->group,
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info,
+ struct ulist *roots, struct ulist *tmp,
+ u64 seq, int sgn, u64 num_bytes,
+ struct btrfs_qgroup *qgroup)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+ int ret;
+
+ ulist_reinit(tmp);
+ ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+ if (qg->refcnt < seq) {
+ /* not visited by step 1 */
+ qg->rfer += sgn * num_bytes;
+ qg->rfer_cmpr += sgn * num_bytes;
+ if (roots->nnodes == 0) {
+ qg->excl += sgn * num_bytes;
+ qg->excl_cmpr += sgn * num_bytes;
+ }
+ qgroup_dirty(fs_info, qg);
+ }
+ WARN_ON(qg->tag >= seq);
+ qg->tag = seq;
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ (uintptr_t)glist->group, GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info,
+ struct ulist *roots, struct ulist *tmp,
+ u64 seq, int sgn, u64 num_bytes)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup *qg;
+ struct ulist_node *tmp_unode;
+ struct ulist_iterator tmp_uiter;
+ int ret;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ qg = find_qgroup_rb(fs_info, unode->val);
+ if (!qg)
+ continue;
+
+ ulist_reinit(tmp);
+ ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+
+ ULIST_ITER_INIT(&tmp_uiter);
+ while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
+ if (qg->tag == seq)
+ continue;
+
+ if (qg->refcnt - seq == roots->nnodes) {
+ qg->excl -= sgn * num_bytes;
+ qg->excl_cmpr -= sgn * num_bytes;
+ qgroup_dirty(fs_info, qg);
+ }
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ (uintptr_t)glist->group,
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
/*
* btrfs_qgroup_account_ref is called for every ref that is added to or deleted
* from the fs. First, all roots referencing the extent are searched, and
@@ -1090,10 +1365,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *quota_root;
u64 ref_root;
struct btrfs_qgroup *qgroup;
- struct ulist_node *unode;
struct ulist *roots = NULL;
- struct ulist *tmp = NULL;
- struct ulist_iterator uiter;
u64 seq;
int ret = 0;
int sgn;
@@ -1132,9 +1404,11 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
case BTRFS_ADD_DELAYED_REF:
case BTRFS_ADD_DELAYED_EXTENT:
sgn = 1;
+ seq = btrfs_tree_mod_seq_prev(node->seq);
break;
case BTRFS_DROP_DELAYED_REF:
sgn = -1;
+ seq = node->seq;
break;
case BTRFS_UPDATE_DELAYED_HEAD:
return 0;
@@ -1142,20 +1416,30 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
BUG();
}
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ return 0;
+ }
+ }
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
/*
* the delayed ref sequence number we pass depends on the direction of
- * the operation. for add operations, we pass (node->seq - 1) to skip
+ * the operation. for add operations, we pass
+ * btrfs_tree_mod_seq_prev(node->seq) to skip
* the delayed ref's current sequence number, because we need the state
* of the tree before the add operation. for delete operations, we pass
* (node->seq) to include the delayed ref's current sequence number,
* because we need the state of the tree after the delete operation.
*/
- ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,
- sgn > 0 ? node->seq - 1 : node->seq, &roots);
+ ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
if (ret < 0)
return ret;
spin_lock(&fs_info->qgroup_lock);
+
quota_root = fs_info->quota_root;
if (!quota_root)
goto unlock;
@@ -1167,116 +1451,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
/*
* step 1: for each old ref, visit all nodes once and inc refcnt
*/
- tmp = ulist_alloc(GFP_ATOMIC);
- if (!tmp) {
- ret = -ENOMEM;
- goto unlock;
- }
+ ulist_reinit(fs_info->qgroup_ulist);
seq = fs_info->qgroup_seq;
fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(roots, &uiter))) {
- struct ulist_node *tmp_unode;
- struct ulist_iterator tmp_uiter;
- struct btrfs_qgroup *qg;
-
- qg = find_qgroup_rb(fs_info, unode->val);
- if (!qg)
- continue;
-
- ulist_reinit(tmp);
- /* XXX id not needed */
- ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC);
- ULIST_ITER_INIT(&tmp_uiter);
- while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
- struct btrfs_qgroup_list *glist;
-
- qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
- if (qg->refcnt < seq)
- qg->refcnt = seq + 1;
- else
- ++qg->refcnt;
-
- list_for_each_entry(glist, &qg->groups, next_group) {
- ulist_add(tmp, glist->group->qgroupid,
- (u64)(uintptr_t)glist->group,
- GFP_ATOMIC);
- }
- }
- }
+ ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
+ seq);
+ if (ret)
+ goto unlock;
/*
* step 2: walk from the new root
*/
- ulist_reinit(tmp);
- ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- struct btrfs_qgroup *qg;
- struct btrfs_qgroup_list *glist;
-
- qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
- if (qg->refcnt < seq) {
- /* not visited by step 1 */
- qg->rfer += sgn * node->num_bytes;
- qg->rfer_cmpr += sgn * node->num_bytes;
- if (roots->nnodes == 0) {
- qg->excl += sgn * node->num_bytes;
- qg->excl_cmpr += sgn * node->num_bytes;
- }
- qgroup_dirty(fs_info, qg);
- }
- WARN_ON(qg->tag >= seq);
- qg->tag = seq;
-
- list_for_each_entry(glist, &qg->groups, next_group) {
- ulist_add(tmp, glist->group->qgroupid,
- (uintptr_t)glist->group, GFP_ATOMIC);
- }
- }
+ ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
+ seq, sgn, node->num_bytes, qgroup);
+ if (ret)
+ goto unlock;
/*
* step 3: walk again from old refs
*/
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(roots, &uiter))) {
- struct btrfs_qgroup *qg;
- struct ulist_node *tmp_unode;
- struct ulist_iterator tmp_uiter;
-
- qg = find_qgroup_rb(fs_info, unode->val);
- if (!qg)
- continue;
-
- ulist_reinit(tmp);
- ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
- ULIST_ITER_INIT(&tmp_uiter);
- while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
- struct btrfs_qgroup_list *glist;
-
- qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
- if (qg->tag == seq)
- continue;
-
- if (qg->refcnt - seq == roots->nnodes) {
- qg->excl -= sgn * node->num_bytes;
- qg->excl_cmpr -= sgn * node->num_bytes;
- qgroup_dirty(fs_info, qg);
- }
+ ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
+ seq, sgn, node->num_bytes);
+ if (ret)
+ goto unlock;
- list_for_each_entry(glist, &qg->groups, next_group) {
- ulist_add(tmp, glist->group->qgroupid,
- (uintptr_t)glist->group,
- GFP_ATOMIC);
- }
- }
- }
- ret = 0;
unlock:
spin_unlock(&fs_info->qgroup_lock);
ulist_free(roots);
- ulist_free(tmp);
return ret;
}
@@ -1289,10 +1491,14 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
{
struct btrfs_root *quota_root = fs_info->quota_root;
int ret = 0;
+ int start_rescan_worker = 0;
if (!quota_root)
goto out;
+ if (!fs_info->quota_enabled && fs_info->pending_quota_state)
+ start_rescan_worker = 1;
+
fs_info->quota_enabled = fs_info->pending_quota_state;
spin_lock(&fs_info->qgroup_lock);
@@ -1318,6 +1524,16 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
if (ret)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ if (!ret && start_rescan_worker) {
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (!ret) {
+ qgroup_rescan_zero_tracking(fs_info);
+ btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ }
+ ret = 0;
+ }
+
out:
return ret;
@@ -1338,12 +1554,30 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
u32 level_size = 0;
+ u64 nums;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_enabled)
- return 0;
+ goto out;
- if (!quota_root)
- return -EINVAL;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (inherit) {
+ i_qgroups = (u64 *)(inherit + 1);
+ nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
+ 2 * inherit->num_excl_copies;
+ for (i = 0; i < nums; ++i) {
+ srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
+ if (!srcgroup) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ++i_qgroups;
+ }
+ }
/*
* create a tracking group for the subvol itself
@@ -1470,6 +1704,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
unlock:
spin_unlock(&fs_info->qgroup_lock);
out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
@@ -1486,7 +1721,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
int ret = 0;
- struct ulist *ulist = NULL;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -1509,44 +1743,46 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
* in a first step, we check all affected qgroups if any limits would
* be exceeded
*/
- ulist = ulist_alloc(GFP_ATOMIC);
- if (!ulist) {
- ret = -ENOMEM;
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+ (uintptr_t)qgroup, GFP_ATOMIC);
+ if (ret < 0)
goto out;
- }
- ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(ulist, &uiter))) {
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
- qg->reserved + qg->rfer + num_bytes >
+ qg->reserved + (s64)qg->rfer + num_bytes >
qg->max_rfer) {
ret = -EDQUOT;
goto out;
}
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
- qg->reserved + qg->excl + num_bytes >
+ qg->reserved + (s64)qg->excl + num_bytes >
qg->max_excl) {
ret = -EDQUOT;
goto out;
}
list_for_each_entry(glist, &qg->groups, next_group) {
- ulist_add(ulist, glist->group->qgroupid,
- (uintptr_t)glist->group, GFP_ATOMIC);
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
+ (uintptr_t)glist->group, GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
}
}
-
+ ret = 0;
/*
* no limits exceeded, now record the reservation into all qgroups
*/
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(ulist, &uiter))) {
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
@@ -1556,8 +1792,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
out:
spin_unlock(&fs_info->qgroup_lock);
- ulist_free(ulist);
-
return ret;
}
@@ -1566,10 +1800,10 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct ulist *ulist = NULL;
struct ulist_node *unode;
struct ulist_iterator uiter;
u64 ref_root = root->root_key.objectid;
+ int ret = 0;
if (!is_fstree(ref_root))
return;
@@ -1587,14 +1821,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
if (!qgroup)
goto out;
- ulist = ulist_alloc(GFP_ATOMIC);
- if (!ulist) {
- btrfs_std_error(fs_info, -ENOMEM);
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+ (uintptr_t)qgroup, GFP_ATOMIC);
+ if (ret < 0)
goto out;
- }
- ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(ulist, &uiter))) {
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
@@ -1603,22 +1836,370 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
qg->reserved -= num_bytes;
list_for_each_entry(glist, &qg->groups, next_group) {
- ulist_add(ulist, glist->group->qgroupid,
- (uintptr_t)glist->group, GFP_ATOMIC);
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
+ (uintptr_t)glist->group, GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
}
}
out:
spin_unlock(&fs_info->qgroup_lock);
- ulist_free(ulist);
}
void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
{
if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
return;
- printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n",
+ pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n",
trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
- trans->delayed_ref_elem.seq);
+ (u32)(trans->delayed_ref_elem.seq >> 32),
+ (u32)trans->delayed_ref_elem.seq);
BUG();
}
+
+/*
+ * returns < 0 on error, 0 when more leaves are to be scanned.
+ * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
+ */
+static int
+qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+ struct btrfs_trans_handle *trans, struct ulist *tmp,
+ struct extent_buffer *scratch_leaf)
+{
+ struct btrfs_key found;
+ struct ulist *roots = NULL;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct seq_list tree_mod_seq_elem = {};
+ u64 seq;
+ int slot;
+ int ret;
+
+ path->leave_spinning = 1;
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ ret = btrfs_search_slot_for_read(fs_info->extent_root,
+ &fs_info->qgroup_rescan_progress,
+ path, 1, 0);
+
+ pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n",
+ (unsigned long long)fs_info->qgroup_rescan_progress.objectid,
+ fs_info->qgroup_rescan_progress.type,
+ (unsigned long long)fs_info->qgroup_rescan_progress.offset,
+ ret);
+
+ if (ret) {
+ /*
+ * The rescan is about to end, we will not be scanning any
+ * further blocks. We cannot unset the RESCAN flag here, because
+ * we want to commit the transaction if everything went well.
+ * To make the live accounting work in this phase, we set our
+ * scan progress pointer such that every real extent objectid
+ * will be smaller.
+ */
+ fs_info->qgroup_rescan_progress.objectid = (u64)-1;
+ btrfs_release_path(path);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ return ret;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found,
+ btrfs_header_nritems(path->nodes[0]) - 1);
+ fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
+
+ btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
+ btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
+ if (found.type != BTRFS_EXTENT_ITEM_KEY)
+ continue;
+ ret = btrfs_find_all_roots(trans, fs_info, found.objectid,
+ tree_mod_seq_elem.seq, &roots);
+ if (ret < 0)
+ goto out;
+ spin_lock(&fs_info->qgroup_lock);
+ seq = fs_info->qgroup_seq;
+ fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+
+ ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq);
+ if (ret) {
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(roots);
+ goto out;
+ }
+
+ /*
+ * step2 of btrfs_qgroup_account_ref works from a single root,
+ * we're doing all at once here.
+ */
+ ulist_reinit(tmp);
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ struct btrfs_qgroup *qg;
+
+ qg = find_qgroup_rb(fs_info, unode->val);
+ if (!qg)
+ continue;
+
+ ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg,
+ GFP_ATOMIC);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(roots);
+ goto out;
+ }
+ }
+
+ /* this loop is similar to step 2 of btrfs_qgroup_account_ref */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+
+ qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux;
+ qg->rfer += found.offset;
+ qg->rfer_cmpr += found.offset;
+ WARN_ON(qg->tag >= seq);
+ if (qg->refcnt - seq == roots->nnodes) {
+ qg->excl += found.offset;
+ qg->excl_cmpr += found.offset;
+ }
+ qgroup_dirty(fs_info, qg);
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ (uintptr_t)glist->group,
+ GFP_ATOMIC);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(roots);
+ goto out;
+ }
+ }
+ }
+
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(roots);
+ ret = 0;
+ }
+
+out:
+ btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+
+ return ret;
+}
+
+static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
+{
+ struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
+ qgroup_rescan_work);
+ struct btrfs_path *path;
+ struct btrfs_trans_handle *trans = NULL;
+ struct ulist *tmp = NULL;
+ struct extent_buffer *scratch_leaf = NULL;
+ int err = -ENOMEM;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out;
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ goto out;
+ scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
+ if (!scratch_leaf)
+ goto out;
+
+ err = 0;
+ while (!err) {
+ trans = btrfs_start_transaction(fs_info->fs_root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ break;
+ }
+ if (!fs_info->quota_enabled) {
+ err = -EINTR;
+ } else {
+ err = qgroup_rescan_leaf(fs_info, path, trans,
+ tmp, scratch_leaf);
+ }
+ if (err > 0)
+ btrfs_commit_transaction(trans, fs_info->fs_root);
+ else
+ btrfs_end_transaction(trans, fs_info->fs_root);
+ }
+
+out:
+ kfree(scratch_leaf);
+ ulist_free(tmp);
+ btrfs_free_path(path);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+
+ if (err == 2 &&
+ fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ } else if (err < 0) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (err >= 0) {
+ pr_info("btrfs: qgroup scan completed%s\n",
+ err == 2 ? " (inconsistency flag cleared)" : "");
+ } else {
+ pr_err("btrfs: qgroup scan failed with %d\n", err);
+ }
+
+ complete_all(&fs_info->qgroup_rescan_completion);
+}
+
+/*
+ * When init_flags is set, checks that no rescan is running and quota is on.
+ * Then resets the rescan progress key, the completion and the work item.
+ */
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+ int init_flags)
+{
+ int ret = 0;
+
+ if (!init_flags &&
+ (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
+ !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ spin_lock(&fs_info->qgroup_lock);
+
+ if (init_flags) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ ret = -EINPROGRESS;
+ else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+ ret = -EINVAL;
+
+ if (ret) {
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ goto err;
+ }
+
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ }
+
+ memset(&fs_info->qgroup_rescan_progress, 0,
+ sizeof(fs_info->qgroup_rescan_progress));
+ fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ init_completion(&fs_info->qgroup_rescan_completion);
+
+ memset(&fs_info->qgroup_rescan_work, 0,
+ sizeof(fs_info->qgroup_rescan_work));
+ fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
+
+ if (ret) {
+err:
+ pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void
+qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *n;
+ struct btrfs_qgroup *qgroup;
+
+ spin_lock(&fs_info->qgroup_lock);
+ /* clear all current qgroup tracking information */
+ for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
+ qgroup = rb_entry(n, struct btrfs_qgroup, node);
+ qgroup->rfer = 0;
+ qgroup->rfer_cmpr = 0;
+ qgroup->excl = 0;
+ qgroup->excl_cmpr = 0;
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+}
+
+int
+btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+ struct btrfs_trans_handle *trans;
+
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * We have set the rescan_progress to 0, which means no more
+ * delayed refs will be accounted by btrfs_qgroup_account_ref.
+ * However, btrfs_qgroup_account_ref may be right after its call
+ * to btrfs_find_all_roots, in which case it would still do the
+ * accounting.
+ * To solve this, we're committing the transaction, which will
+ * ensure we run all delayed refs and only after that, we are
+ * going to clear all tracking information for a clean start.
+ */
+
+ trans = btrfs_join_transaction(fs_info->fs_root);
+ if (IS_ERR(trans)) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, fs_info->fs_root);
+ if (ret) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ return ret;
+ }
+
+ qgroup_rescan_zero_tracking(fs_info);
+
+ btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+
+ return 0;
+}
+
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
+{
+ int running;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ spin_lock(&fs_info->qgroup_lock);
+ running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (running)
+ ret = wait_for_completion_interruptible(
+ &fs_info->qgroup_rescan_completion);
+
+ return ret;
+}
+
+/*
+ * this is only called from open_ctree where we're still single threaded, thus
+ * locking is omitted here.
+ */
+void
+btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9a79fb790adb..0525e1389f5b 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -410,7 +410,7 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
/*
* remove everything in the cache
*/
-void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
+static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
struct btrfs_stripe_hash_table *table;
unsigned long flags;
@@ -1010,12 +1010,12 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
* this will try to merge into existing bios if possible, and returns
* zero if all went well.
*/
-int rbio_add_io_page(struct btrfs_raid_bio *rbio,
- struct bio_list *bio_list,
- struct page *page,
- int stripe_nr,
- unsigned long page_index,
- unsigned long bio_max_len)
+static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list,
+ struct page *page,
+ int stripe_nr,
+ unsigned long page_index,
+ unsigned long bio_max_len)
{
struct bio *last = bio_list->tail;
u64 last_end = 0;
@@ -1050,7 +1050,7 @@ int rbio_add_io_page(struct btrfs_raid_bio *rbio,
}
/* put a new bio on the list */
- bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
if (!bio)
return -ENOMEM;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 96b93daa0bbb..1031b69252c5 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -955,10 +955,11 @@ int btrfs_reada_wait(void *handle)
while (atomic_read(&rc->elems)) {
wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
5 * HZ);
- dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
+ dump_devs(rc->root->fs_info,
+ atomic_read(&rc->elems) < 10 ? 1 : 0);
}
- dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
+ dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
kref_put(&rc->refcnt, reada_control_release);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b67171e6d688..12096496cc99 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -326,8 +326,7 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
return NULL;
}
-void backref_tree_panic(struct rb_node *rb_node, int errno,
- u64 bytenr)
+static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
{
struct btrfs_fs_info *fs_info = NULL;
@@ -619,10 +618,13 @@ static noinline_for_stack
int find_inline_backref(struct extent_buffer *leaf, int slot,
unsigned long *ptr, unsigned long *end)
{
+ struct btrfs_key key;
struct btrfs_extent_item *ei;
struct btrfs_tree_block_info *bi;
u32 item_size;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
item_size = btrfs_item_size_nr(leaf, slot);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
if (item_size < sizeof(*ei)) {
@@ -634,13 +636,18 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
WARN_ON(!(btrfs_extent_flags(leaf, ei) &
BTRFS_EXTENT_FLAG_TREE_BLOCK));
- if (item_size <= sizeof(*ei) + sizeof(*bi)) {
+ if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ item_size <= sizeof(*ei) + sizeof(*bi)) {
WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
return 1;
}
- bi = (struct btrfs_tree_block_info *)(ei + 1);
- *ptr = (unsigned long)(bi + 1);
+ if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ *ptr = (unsigned long)(bi + 1);
+ } else {
+ *ptr = (unsigned long)(ei + 1);
+ }
*end = (unsigned long)ei + item_size;
return 0;
}
@@ -708,7 +715,7 @@ again:
end = 0;
ptr = 0;
key.objectid = cur->bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.type = BTRFS_METADATA_ITEM_KEY;
key.offset = (u64)-1;
path1->search_commit_root = 1;
@@ -766,7 +773,8 @@ again:
break;
}
- if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+ if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY) {
ret = find_inline_backref(eb, path1->slots[0],
&ptr, &end);
if (ret)
@@ -1297,6 +1305,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
struct btrfs_root_item *root_item;
struct btrfs_key root_key;
+ u64 last_snap = 0;
int ret;
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1312,6 +1321,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
BTRFS_TREE_RELOC_OBJECTID);
BUG_ON(ret);
+ last_snap = btrfs_root_last_snapshot(&root->root_item);
btrfs_set_root_last_snapshot(&root->root_item,
trans->transid - 1);
} else {
@@ -1337,6 +1347,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
memset(&root_item->drop_progress, 0,
sizeof(struct btrfs_disk_key));
root_item->drop_level = 0;
+ /*
+ * abuse rtransid, it is safe because it is impossible to
+ * receive data into a relocation tree.
+ */
+ btrfs_set_root_rtransid(root_item, last_snap);
+ btrfs_set_root_otransid(root_item, trans->transid);
}
btrfs_tree_unlock(eb);
@@ -1347,8 +1363,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
BUG_ON(ret);
kfree(root_item);
- reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
- &root_key);
+ reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
BUG_ON(IS_ERR(reloc_root));
reloc_root->last_trans = trans->transid;
return reloc_root;
@@ -1762,7 +1777,11 @@ again:
eb = read_tree_block(dest, old_bytenr, blocksize,
old_ptr_gen);
- BUG_ON(!eb);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ ret = (!eb) ? -ENOMEM : -EIO;
+ free_extent_buffer(eb);
+ break;
+ }
btrfs_tree_lock(eb);
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, parent,
@@ -1915,6 +1934,10 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
bytenr = btrfs_node_blockptr(eb, path->slots[i]);
blocksize = btrfs_level_size(root, i - 1);
eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ return -EIO;
+ }
BUG_ON(btrfs_header_level(eb) != i - 1);
path->nodes[i - 1] = eb;
path->slots[i - 1] = 0;
@@ -2257,8 +2280,12 @@ void free_reloc_roots(struct list_head *list)
static noinline_for_stack
int merge_reloc_roots(struct reloc_control *rc)
{
+ struct btrfs_trans_handle *trans;
struct btrfs_root *root;
struct btrfs_root *reloc_root;
+ u64 last_snap;
+ u64 otransid;
+ u64 objectid;
LIST_HEAD(reloc_roots);
int found = 0;
int ret = 0;
@@ -2292,12 +2319,44 @@ again:
} else {
list_del_init(&reloc_root->root_list);
}
+
+ /*
+ * we keep the old last snapshot transid in rtransid when we
+ * created the relocation tree.
+ */
+ last_snap = btrfs_root_rtransid(&reloc_root->root_item);
+ otransid = btrfs_root_otransid(&reloc_root->root_item);
+ objectid = reloc_root->root_key.offset;
+
ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
if (ret < 0) {
if (list_empty(&reloc_root->root_list))
list_add_tail(&reloc_root->root_list,
&reloc_roots);
goto out;
+ } else if (!ret) {
+ /*
+ * recover the last snapshot transid so that
+ * space balance does not break NOCOW.
+ */
+ root = read_fs_root(rc->extent_root->fs_info,
+ objectid);
+ if (IS_ERR(root))
+ continue;
+
+ if (btrfs_root_refs(&root->root_item) == 0)
+ continue;
+
+ trans = btrfs_join_transaction(root);
+ BUG_ON(IS_ERR(trans));
+
+ /* Check if the fs/file tree was snapshotted or not. */
+ if (btrfs_root_last_snapshot(&root->root_item) ==
+ otransid - 1)
+ btrfs_set_root_last_snapshot(&root->root_item,
+ last_snap);
+
+ btrfs_end_transaction(trans, root);
}
}
@@ -2592,7 +2651,8 @@ static int do_relocation(struct btrfs_trans_handle *trans,
blocksize = btrfs_level_size(root, node->level);
generation = btrfs_node_ptr_generation(upper->eb, slot);
eb = read_tree_block(root, bytenr, blocksize, generation);
- if (!eb) {
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
err = -EIO;
goto next;
}
@@ -2753,7 +2813,10 @@ static int get_tree_block_key(struct reloc_control *rc,
BUG_ON(block->key_ready);
eb = read_tree_block(rc->extent_root, block->bytenr,
block->key.objectid, block->key.offset);
- BUG_ON(!eb);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ return -EIO;
+ }
WARN_ON(btrfs_header_level(eb) != block->level);
if (block->level == 0)
btrfs_item_key_to_cpu(eb, &block->key, 0);
@@ -2768,8 +2831,13 @@ static int reada_tree_block(struct reloc_control *rc,
struct tree_block *block)
{
BUG_ON(block->key_ready);
- readahead_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid, block->key.offset);
+ if (block->key.type == BTRFS_METADATA_ITEM_KEY)
+ readahead_tree_block(rc->extent_root, block->bytenr,
+ block->key.objectid,
+ rc->extent_root->leafsize);
+ else
+ readahead_tree_block(rc->extent_root, block->bytenr,
+ block->key.objectid, block->key.offset);
return 0;
}
@@ -2850,7 +2918,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
- goto out_path;
+ goto out_free_blocks;
}
rb_node = rb_first(blocks);
@@ -2864,8 +2932,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
rb_node = rb_first(blocks);
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
- if (!block->key_ready)
- get_tree_block_key(rc, block);
+ if (!block->key_ready) {
+ err = get_tree_block_key(rc, block);
+ if (err)
+ goto out_free_path;
+ }
rb_node = rb_next(rb_node);
}
@@ -2892,8 +2963,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
out:
err = finish_pending_nodes(trans, rc, path, err);
+out_free_path:
btrfs_free_path(path);
-out_path:
+out_free_blocks:
free_block_list(blocks);
return err;
}
@@ -2965,7 +3037,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
while (1) {
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
+ ret = add_extent_mapping(em_tree, em, 0);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -3176,12 +3248,17 @@ static int add_tree_block(struct reloc_control *rc,
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, path->slots[0]);
- if (item_size >= sizeof(*ei) + sizeof(*bi)) {
+ if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
+ item_size >= sizeof(*ei) + sizeof(*bi)) {
ei = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_extent_item);
- bi = (struct btrfs_tree_block_info *)(ei + 1);
+ if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ level = btrfs_tree_block_level(eb, bi);
+ } else {
+ level = (int)extent_key->offset;
+ }
generation = btrfs_extent_generation(eb, ei);
- level = btrfs_tree_block_level(eb, bi);
} else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
u64 ref_owner;
@@ -3210,7 +3287,7 @@ static int add_tree_block(struct reloc_control *rc,
return -ENOMEM;
block->bytenr = extent_key->objectid;
- block->key.objectid = extent_key->offset;
+ block->key.objectid = rc->extent_root->leafsize;
block->key.offset = generation;
block->level = level;
block->key_ready = 0;
@@ -3232,6 +3309,8 @@ static int __add_tree_block(struct reloc_control *rc,
struct btrfs_path *path;
struct btrfs_key key;
int ret;
+ bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
+ SKINNY_METADATA);
if (tree_block_processed(bytenr, blocksize, rc))
return 0;
@@ -3242,19 +3321,42 @@ static int __add_tree_block(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-
+again:
key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = blocksize;
+ if (skinny) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = (u64)-1;
+ } else {
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = blocksize;
+ }
path->search_commit_root = 1;
path->skip_locking = 1;
ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
+
+ if (ret > 0 && skinny) {
+ if (path->slots[0]) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ (key.type == BTRFS_METADATA_ITEM_KEY ||
+ (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == blocksize)))
+ ret = 0;
+ }
+
+ if (ret) {
+ skinny = false;
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
BUG_ON(ret);
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
ret = add_tree_block(rc, &key, path, blocks);
out:
btrfs_free_path(path);
@@ -3275,7 +3377,8 @@ static int block_use_full_backref(struct reloc_control *rc,
return 1;
ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
- eb->start, eb->len, NULL, &flags);
+ eb->start, btrfs_header_level(eb), 1,
+ NULL, &flags);
BUG_ON(ret);
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
@@ -3309,6 +3412,11 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
}
truncate:
+ ret = btrfs_check_trunc_cache_free_space(root,
+ &fs_info->global_block_rsv);
+ if (ret)
+ goto out;
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -3644,12 +3752,25 @@ next:
break;
}
- if (key.type != BTRFS_EXTENT_ITEM_KEY ||
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY) {
+ path->slots[0]++;
+ goto next;
+ }
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY &&
key.objectid + key.offset <= rc->search_start) {
path->slots[0]++;
goto next;
}
+ if (key.type == BTRFS_METADATA_ITEM_KEY &&
+ key.objectid + rc->extent_root->leafsize <=
+ rc->search_start) {
+ path->slots[0]++;
+ goto next;
+ }
+
ret = find_first_extent_bit(&rc->processed_blocks,
key.objectid, &start, &end,
EXTENT_DIRTY, NULL);
@@ -3658,7 +3779,11 @@ next:
btrfs_release_path(path);
rc->search_start = end + 1;
} else {
- rc->search_start = key.objectid + key.offset;
+ if (key.type == BTRFS_EXTENT_ITEM_KEY)
+ rc->search_start = key.objectid + key.offset;
+ else
+ rc->search_start = key.objectid +
+ rc->extent_root->leafsize;
memcpy(extent_key, &key, sizeof(key));
return 0;
}
@@ -4019,7 +4144,7 @@ out:
return inode;
}
-static struct reloc_control *alloc_reloc_control(void)
+static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
{
struct reloc_control *rc;
@@ -4030,7 +4155,8 @@ static struct reloc_control *alloc_reloc_control(void)
INIT_LIST_HEAD(&rc->reloc_roots);
backref_cache_init(&rc->backref_cache);
mapping_tree_init(&rc->reloc_root_tree);
- extent_io_tree_init(&rc->processed_blocks, NULL);
+ extent_io_tree_init(&rc->processed_blocks,
+ fs_info->btree_inode->i_mapping);
return rc;
}
@@ -4047,7 +4173,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
int rw = 0;
int err = 0;
- rc = alloc_reloc_control();
+ rc = alloc_reloc_control(fs_info);
if (!rc)
return -ENOMEM;
@@ -4096,19 +4222,16 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
(unsigned long long)rc->block_group->key.objectid,
(unsigned long long)rc->block_group->flags);
- ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+ ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
if (ret < 0) {
err = ret;
goto out;
}
- btrfs_wait_ordered_extents(fs_info->tree_root, 0);
+ btrfs_wait_all_ordered_extents(fs_info, 0);
while (1) {
mutex_lock(&fs_info->cleaner_mutex);
-
- btrfs_clean_old_snapshots(fs_info->tree_root);
ret = relocate_block_group(rc);
-
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
err = ret;
@@ -4216,7 +4339,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
key.type != BTRFS_ROOT_ITEM_KEY)
break;
- reloc_root = btrfs_read_fs_root_no_radix(root, &key);
+ reloc_root = btrfs_read_fs_root(root, &key);
if (IS_ERR(reloc_root)) {
err = PTR_ERR(reloc_root);
goto out;
@@ -4251,7 +4374,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
if (list_empty(&reloc_roots))
goto out;
- rc = alloc_reloc_control();
+ rc = alloc_reloc_control(root->fs_info);
if (!rc) {
err = -ENOMEM;
goto out;
@@ -4335,10 +4458,8 @@ out:
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
{
struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
struct btrfs_ordered_extent *ordered;
struct btrfs_root *root = BTRFS_I(inode)->root;
- size_t offset;
int ret;
u64 disk_bytenr;
LIST_HEAD(list);
@@ -4352,19 +4473,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
if (ret)
goto out;
+ disk_bytenr = ordered->start;
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
list_del_init(&sums->list);
- sector_sum = sums->sums;
- sums->bytenr = ordered->start;
-
- offset = 0;
- while (offset < sums->len) {
- sector_sum->bytenr += ordered->start - disk_bytenr;
- sector_sum++;
- offset += root->sectorsize;
- }
+ sums->bytenr = disk_bytenr;
+ disk_bytenr += sums->len;
btrfs_add_ordered_sum(inode, ordered, sums);
}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 668af537a3ea..ffb1036ef10d 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -29,9 +29,8 @@
* generation numbers as then we know the root was once mounted with an older
* kernel that was not aware of the root item structure change.
*/
-void btrfs_read_root_item(struct btrfs_root *root,
- struct extent_buffer *eb, int slot,
- struct btrfs_root_item *item)
+void btrfs_read_root_item(struct extent_buffer *eb, int slot,
+ struct btrfs_root_item *item)
{
uuid_le uuid;
int len;
@@ -65,52 +64,59 @@ void btrfs_read_root_item(struct btrfs_root *root,
}
/*
- * lookup the root with the highest offset for a given objectid. The key we do
- * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
- * on error.
+ * btrfs_find_root - lookup the root by the key.
+ * root: the root of the root tree
+ * search_key: the key to search
+ * path: the path we search
+ * root_item: the root item of the tree we look for
+ * root_key: the real key of the tree we look for
+ *
+ * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset
+ * of the search key, just lookup the root with the highest offset for a
+ * given objectid.
+ *
+ * If we find something return 0, otherwise > 0, < 0 on error.
*/
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
- struct btrfs_root_item *item, struct btrfs_key *key)
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key)
{
- struct btrfs_path *path;
- struct btrfs_key search_key;
struct btrfs_key found_key;
struct extent_buffer *l;
int ret;
int slot;
- search_key.objectid = objectid;
- search_key.type = BTRFS_ROOT_ITEM_KEY;
- search_key.offset = (u64)-1;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
- BUG_ON(ret == 0);
- if (path->slots[0] == 0) {
- ret = 1;
- goto out;
+ if (search_key->offset != -1ULL) { /* the search key is exact */
+ if (ret > 0)
+ goto out;
+ } else {
+ BUG_ON(ret == 0); /* Logical error */
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ ret = 0;
}
+
l = path->nodes[0];
- slot = path->slots[0] - 1;
+ slot = path->slots[0];
+
btrfs_item_key_to_cpu(l, &found_key, slot);
- if (found_key.objectid != objectid ||
+ if (found_key.objectid != search_key->objectid ||
found_key.type != BTRFS_ROOT_ITEM_KEY) {
ret = 1;
goto out;
}
- if (item)
- btrfs_read_root_item(root, l, slot, item);
- if (key)
- memcpy(key, &found_key, sizeof(found_key));
- ret = 0;
+ if (root_item)
+ btrfs_read_root_item(l, slot, root_item);
+ if (root_key)
+ memcpy(root_key, &found_key, sizeof(found_key));
out:
- btrfs_free_path(path);
+ btrfs_release_path(path);
return ret;
}
@@ -213,86 +219,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return btrfs_insert_item(trans, root, key, item, sizeof(*item));
}
-/*
- * at mount time we want to find all the old transaction snapshots that were in
- * the process of being deleted if we crashed. This is any root item with an
- * offset lower than the latest root. They need to be queued for deletion to
- * finish what was happening when we crashed.
- */
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
-{
- struct btrfs_root *dead_root;
- struct btrfs_root_item *ri;
- struct btrfs_key key;
- struct btrfs_key found_key;
- struct btrfs_path *path;
- int ret;
- u32 nritems;
- struct extent_buffer *leaf;
- int slot;
-
- key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
- key.offset = 0;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
-again:
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto err;
- while (1) {
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- slot = path->slots[0];
- if (slot >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- slot = path->slots[0];
- }
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
- goto next;
-
- if (key.objectid < objectid)
- goto next;
-
- if (key.objectid > objectid)
- break;
-
- ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
- if (btrfs_disk_root_refs(leaf, ri) != 0)
- goto next;
-
- memcpy(&found_key, &key, sizeof(key));
- key.offset++;
- btrfs_release_path(path);
- dead_root =
- btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
- &found_key);
- if (IS_ERR(dead_root)) {
- ret = PTR_ERR(dead_root);
- goto err;
- }
-
- ret = btrfs_add_dead_root(dead_root);
- if (ret)
- goto err;
- goto again;
-next:
- slot++;
- path->slots[0]++;
- }
- ret = 0;
-err:
- btrfs_free_path(path);
- return ret;
-}
-
int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
{
struct extent_buffer *leaf;
@@ -302,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
struct btrfs_root *root;
int err = 0;
int ret;
+ bool can_recover = true;
+
+ if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
+ can_recover = false;
path = btrfs_alloc_path();
if (!path)
@@ -341,20 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
root_key.objectid = key.offset;
key.offset++;
- root = btrfs_read_fs_root_no_name(tree_root->fs_info,
- &root_key);
- if (!IS_ERR(root))
+ root = btrfs_read_fs_root(tree_root, &root_key);
+ err = PTR_RET(root);
+ if (err && err != -ENOENT) {
+ break;
+ } else if (err == -ENOENT) {
+ struct btrfs_trans_handle *trans;
+
+ btrfs_release_path(path);
+
+ trans = btrfs_join_transaction(tree_root);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ btrfs_error(tree_root->fs_info, err,
+ "Failed to start trans to delete "
+ "orphan item");
+ break;
+ }
+ err = btrfs_del_orphan_item(trans, tree_root,
+ root_key.objectid);
+ btrfs_end_transaction(trans, tree_root);
+ if (err) {
+ btrfs_error(tree_root->fs_info, err,
+ "Failed to delete root orphan "
+ "item");
+ break;
+ }
continue;
+ }
- ret = PTR_ERR(root);
- if (ret != -ENOENT) {
- err = ret;
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ btrfs_add_dead_root(root);
+ continue;
+ }
+
+ err = btrfs_init_fs_root(root);
+ if (err) {
+ btrfs_free_fs_root(root);
break;
}
- ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
- if (ret) {
- err = ret;
+ root->orphan_item_inserted = 1;
+
+ err = btrfs_insert_fs_root(root->fs_info, root);
+ if (err) {
+ BUG_ON(err == -EEXIST);
+ btrfs_free_fs_root(root);
break;
}
}
@@ -369,8 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_path *path;
int ret;
- struct btrfs_root_item *ri;
- struct extent_buffer *leaf;
path = btrfs_alloc_path();
if (!path)
@@ -380,8 +340,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
goto out;
BUG_ON(ret != 0);
- leaf = path->nodes[0];
- ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
ret = btrfs_del_item(trans, root, path);
out:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 85e072b956d5..4ba2a69a60ad 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1296,7 +1296,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
}
WARN_ON(!page->page);
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
if (!bio) {
page->io_error = 1;
sblock->no_io_error_seen = 0;
@@ -1336,7 +1336,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
int page_num;
u8 calculated_csum[BTRFS_CSUM_SIZE];
u32 crc = ~(u32)0;
- struct btrfs_root *root = fs_info->extent_root;
void *mapped_buffer;
WARN_ON(!sblock->pagev[0]->page);
@@ -1365,12 +1364,11 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
for (page_num = 0;;) {
if (page_num == 0 && is_metadata)
- crc = btrfs_csum_data(root,
+ crc = btrfs_csum_data(
((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
else
- crc = btrfs_csum_data(root, mapped_buffer, crc,
- PAGE_SIZE);
+ crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
kunmap_atomic(mapped_buffer);
page_num++;
@@ -1433,7 +1431,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
return -EIO;
}
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
bio->bi_bdev = page_bad->dev->bdev;
@@ -1524,7 +1522,7 @@ again:
sbio->dev = wr_ctx->tgtdev;
bio = sbio->bio;
if (!bio) {
- bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
if (!bio) {
mutex_unlock(&wr_ctx->wr_lock);
return -ENOMEM;
@@ -1657,7 +1655,6 @@ static int scrub_checksum_data(struct scrub_block *sblock)
void *buffer;
u32 crc = ~(u32)0;
int fail = 0;
- struct btrfs_root *root = sctx->dev_root;
u64 len;
int index;
@@ -1674,7 +1671,7 @@ static int scrub_checksum_data(struct scrub_block *sblock)
for (;;) {
u64 l = min_t(u64, len, PAGE_SIZE);
- crc = btrfs_csum_data(root, buffer, crc, l);
+ crc = btrfs_csum_data(buffer, crc, l);
kunmap_atomic(buffer);
len -= l;
if (len == 0)
@@ -1744,7 +1741,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
for (;;) {
u64 l = min_t(u64, len, mapped_size);
- crc = btrfs_csum_data(root, p, crc, l);
+ crc = btrfs_csum_data(p, crc, l);
kunmap_atomic(mapped_buffer);
len -= l;
if (len == 0)
@@ -1805,7 +1802,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
for (;;) {
u64 l = min_t(u64, len, mapped_size);
- crc = btrfs_csum_data(root, p, crc, l);
+ crc = btrfs_csum_data(p, crc, l);
kunmap_atomic(mapped_buffer);
len -= l;
if (len == 0)
@@ -1933,7 +1930,7 @@ again:
sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
- bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
if (!bio)
return -ENOMEM;
sbio->bio = bio;
@@ -2129,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
u8 *csum)
{
struct btrfs_ordered_sum *sum = NULL;
- int ret = 0;
- unsigned long i;
+ unsigned long index;
unsigned long num_sectors;
while (!list_empty(&sctx->csum_list)) {
@@ -2149,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
if (!sum)
return 0;
+ index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
num_sectors = sum->len / sctx->sectorsize;
- for (i = 0; i < num_sectors; ++i) {
- if (sum->sums[i].bytenr == logical) {
- memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
- ret = 1;
- break;
- }
- }
- if (ret && i == num_sectors - 1) {
+ memcpy(csum, sum->sums + index, sctx->csum_size);
+ if (index == num_sectors - 1) {
list_del(&sum->list);
kfree(sum);
}
- return ret;
+ return 1;
}
/* scrub extent tries to collect up to 64 kB for each bio */
@@ -2236,12 +2227,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 flags;
int ret;
int slot;
- int i;
u64 nstripes;
struct extent_buffer *l;
struct btrfs_key key;
u64 physical;
u64 logical;
+ u64 logic_end;
u64 generation;
int mirror_num;
struct reada_control *reada1;
@@ -2255,6 +2246,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 extent_len;
struct btrfs_device *extent_dev;
int extent_mirror_num;
+ int stop_loop;
if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID6)) {
@@ -2315,8 +2307,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
key_start.type = BTRFS_EXTENT_ITEM_KEY;
key_start.offset = (u64)0;
key_end.objectid = base + offset + nstripes * increment;
- key_end.type = BTRFS_EXTENT_ITEM_KEY;
- key_end.offset = (u64)0;
+ key_end.type = BTRFS_METADATA_ITEM_KEY;
+ key_end.offset = (u64)-1;
reada1 = btrfs_reada_add(root, &key_start, &key_end);
key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -2354,8 +2346,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
logical = base + offset;
physical = map->stripes[num].physical;
+ logic_end = logical + increment * nstripes;
ret = 0;
- for (i = 0; i < nstripes; ++i) {
+ while (logical < logic_end) {
/*
* canceled?
*/
@@ -2391,19 +2384,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
wake_up(&fs_info->scrub_pause_wait);
}
- ret = btrfs_lookup_csums_range(csum_root, logical,
- logical + map->stripe_len - 1,
- &sctx->csum_list, 1);
- if (ret)
- goto out;
-
key.objectid = logical;
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = (u64)0;
+ key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
+
if (ret > 0) {
ret = btrfs_previous_item(root, path, 0,
BTRFS_EXTENT_ITEM_KEY);
@@ -2420,7 +2408,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
}
}
+ stop_loop = 0;
while (1) {
+ u64 bytes;
+
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
@@ -2430,19 +2421,30 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (ret < 0)
goto out;
+ stop_loop = 1;
break;
}
btrfs_item_key_to_cpu(l, &key, slot);
- if (key.objectid + key.offset <= logical)
- goto next;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ bytes = root->leafsize;
+ else
+ bytes = key.offset;
- if (key.objectid >= logical + map->stripe_len)
- break;
+ if (key.objectid + bytes <= logical)
+ goto next;
- if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY)
goto next;
+ if (key.objectid >= logical + map->stripe_len) {
+ /* out of this device extent */
+ if (key.objectid >= logic_end)
+ stop_loop = 1;
+ break;
+ }
+
extent = btrfs_item_ptr(l, slot,
struct btrfs_extent_item);
flags = btrfs_extent_flags(l, extent);
@@ -2458,22 +2460,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
goto next;
}
+again:
+ extent_logical = key.objectid;
+ extent_len = bytes;
+
/*
* trim extent to this stripe
*/
- if (key.objectid < logical) {
- key.offset -= logical - key.objectid;
- key.objectid = logical;
+ if (extent_logical < logical) {
+ extent_len -= logical - extent_logical;
+ extent_logical = logical;
}
- if (key.objectid + key.offset >
+ if (extent_logical + extent_len >
logical + map->stripe_len) {
- key.offset = logical + map->stripe_len -
- key.objectid;
+ extent_len = logical + map->stripe_len -
+ extent_logical;
}
- extent_logical = key.objectid;
- extent_physical = key.objectid - logical + physical;
- extent_len = key.offset;
+ extent_physical = extent_logical - logical + physical;
extent_dev = scrub_dev;
extent_mirror_num = mirror_num;
if (is_dev_replace)
@@ -2481,13 +2485,36 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
extent_len, &extent_physical,
&extent_dev,
&extent_mirror_num);
+
+ ret = btrfs_lookup_csums_range(csum_root, logical,
+ logical + map->stripe_len - 1,
+ &sctx->csum_list, 1);
+ if (ret)
+ goto out;
+
ret = scrub_extent(sctx, extent_logical, extent_len,
extent_physical, extent_dev, flags,
generation, extent_mirror_num,
- key.objectid - logical + physical);
+ extent_physical);
if (ret)
goto out;
+ scrub_free_csums(sctx);
+ if (extent_logical + extent_len <
+ key.objectid + bytes) {
+ logical += increment;
+ physical += map->stripe_len;
+
+ if (logical < key.objectid + bytes) {
+ cond_resched();
+ goto again;
+ }
+
+ if (logical >= logic_end) {
+ stop_loop = 1;
+ break;
+ }
+ }
next:
path->slots[0]++;
}
@@ -2495,8 +2522,14 @@ next:
logical += increment;
physical += map->stripe_len;
spin_lock(&sctx->stat_lock);
- sctx->stat.last_physical = physical;
+ if (stop_loop)
+ sctx->stat.last_physical = map->stripes[num].physical +
+ length;
+ else
+ sctx->stat.last_physical = physical;
spin_unlock(&sctx->stat_lock);
+ if (stop_loop)
+ break;
}
out:
/* push queued extents */
@@ -3005,28 +3038,6 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_device *dev;
- int ret;
-
- /*
- * we have to hold the device_list_mutex here so the device
- * does not go away in cancel_dev. FIXME: find a better solution
- */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info, devid, NULL, NULL);
- if (!dev) {
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- return -ENODEV;
- }
- ret = btrfs_scrub_cancel_dev(fs_info, dev);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-
- return ret;
-}
-
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress)
{
@@ -3188,16 +3199,18 @@ out:
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
{
- unsigned long index;
struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
- int ret = 0;
+ struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
struct btrfs_key key;
- struct inode *inode = NULL;
+ struct inode *inode;
+ struct page *page;
struct btrfs_root *local_root;
u64 physical_for_dev_replace;
u64 len;
- struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+ unsigned long index;
int srcu_index;
+ int ret;
+ int err;
key.objectid = root;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3211,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
return PTR_ERR(local_root);
}
+ if (btrfs_root_refs(&local_root->root_item) == 0) {
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+ return -ENOENT;
+ }
+
key.type = BTRFS_INODE_ITEM_KEY;
key.objectid = inum;
key.offset = 0;
@@ -3219,19 +3237,21 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
if (IS_ERR(inode))
return PTR_ERR(inode);
+ /* Avoid truncate/dio/punch hole.. */
+ mutex_lock(&inode->i_mutex);
+ inode_dio_wait(inode);
+
+ ret = 0;
physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
len = nocow_ctx->len;
while (len >= PAGE_CACHE_SIZE) {
- struct page *page = NULL;
- int ret_sub;
-
index = offset >> PAGE_CACHE_SHIFT;
-
+again:
page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
if (!page) {
pr_err("find_or_create_page() failed\n");
ret = -ENOMEM;
- goto next_page;
+ goto out;
}
if (PageUptodate(page)) {
@@ -3239,39 +3259,49 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
goto next_page;
} else {
ClearPageError(page);
- ret_sub = extent_read_full_page(&BTRFS_I(inode)->
+ err = extent_read_full_page(&BTRFS_I(inode)->
io_tree,
page, btrfs_get_extent,
nocow_ctx->mirror_num);
- if (ret_sub) {
- ret = ret_sub;
+ if (err) {
+ ret = err;
goto next_page;
}
- wait_on_page_locked(page);
+
+ lock_page(page);
+ /*
+ * If the page has been removed from the page cache,
+ * the data on it is meaningless: it may be stale,
+ * and the new data may have been written into a new
+ * page in the page cache.
+ */
+ if (page->mapping != inode->i_mapping) {
+ page_cache_release(page);
+ goto again;
+ }
if (!PageUptodate(page)) {
ret = -EIO;
goto next_page;
}
}
- ret_sub = write_page_nocow(nocow_ctx->sctx,
- physical_for_dev_replace, page);
- if (ret_sub) {
- ret = ret_sub;
- goto next_page;
- }
-
+ err = write_page_nocow(nocow_ctx->sctx,
+ physical_for_dev_replace, page);
+ if (err)
+ ret = err;
next_page:
- if (page) {
- unlock_page(page);
- put_page(page);
- }
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (ret)
+ break;
+
offset += PAGE_CACHE_SIZE;
physical_for_dev_replace += PAGE_CACHE_SIZE;
len -= PAGE_CACHE_SIZE;
}
-
- if (inode)
- iput(inode);
+out:
+ mutex_unlock(&inode->i_mutex);
+ iput(inode);
return ret;
}
@@ -3291,7 +3321,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
return -EIO;
}
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
if (!bio) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index c85e7c6b4598..d3f3b43cae0b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -158,7 +158,7 @@ static void fs_path_reset(struct fs_path *p)
}
}
-static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
+static struct fs_path *fs_path_alloc(void)
{
struct fs_path *p;
@@ -173,11 +173,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
return p;
}
-static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
+static struct fs_path *fs_path_alloc_reversed(void)
{
struct fs_path *p;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return NULL;
p->reversed = 1;
@@ -185,7 +185,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
return p;
}
-static void fs_path_free(struct send_ctx *sctx, struct fs_path *p)
+static void fs_path_free(struct fs_path *p)
{
if (!p)
return;
@@ -387,7 +387,7 @@ static struct btrfs_path *alloc_path_for_send(void)
return path;
}
-int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
+static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
{
int ret;
mm_segment_t old_fs;
@@ -753,8 +753,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
*
* path must point to the INODE_REF or INODE_EXTREF when called.
*/
-static int iterate_inode_ref(struct send_ctx *sctx,
- struct btrfs_root *root, struct btrfs_path *path,
+static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *found_key, int resolve,
iterate_inode_ref_t iterate, void *ctx)
{
@@ -777,13 +776,13 @@ static int iterate_inode_ref(struct send_ctx *sctx,
unsigned long elem_size;
unsigned long ptr;
- p = fs_path_alloc_reversed(sctx);
+ p = fs_path_alloc_reversed();
if (!p)
return -ENOMEM;
tmp_path = alloc_path_for_send();
if (!tmp_path) {
- fs_path_free(sctx, p);
+ fs_path_free(p);
return -ENOMEM;
}
@@ -858,7 +857,7 @@ static int iterate_inode_ref(struct send_ctx *sctx,
out:
btrfs_free_path(tmp_path);
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -874,8 +873,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
*
* path must point to the dir item when called.
*/
-static int iterate_dir_item(struct send_ctx *sctx,
- struct btrfs_root *root, struct btrfs_path *path,
+static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *found_key,
iterate_dir_item_t iterate, void *ctx)
{
@@ -990,7 +988,7 @@ static int __copy_first_ref(int num, u64 dir, int index,
* Retrieve the first path of an inode. If an inode has more then one
* ref/hardlink, this is ignored.
*/
-static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
+static int get_inode_path(struct btrfs_root *root,
u64 ino, struct fs_path *path)
{
int ret;
@@ -1022,8 +1020,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
goto out;
}
- ret = iterate_inode_ref(sctx, root, p, &found_key, 1,
- __copy_first_ref, path);
+ ret = iterate_inode_ref(root, p, &found_key, 1,
+ __copy_first_ref, path);
if (ret < 0)
goto out;
ret = 0;
@@ -1314,8 +1312,7 @@ out:
return ret;
}
-static int read_symlink(struct send_ctx *sctx,
- struct btrfs_root *root,
+static int read_symlink(struct btrfs_root *root,
u64 ino,
struct fs_path *dest)
{
@@ -1562,8 +1559,7 @@ out:
* Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
* generation of the parent dir and the name of the dir entry.
*/
-static int get_first_ref(struct send_ctx *sctx,
- struct btrfs_root *root, u64 ino,
+static int get_first_ref(struct btrfs_root *root, u64 ino,
u64 *dir, u64 *dir_gen, struct fs_path *name)
{
int ret;
@@ -1628,8 +1624,7 @@ out:
return ret;
}
-static int is_first_ref(struct send_ctx *sctx,
- struct btrfs_root *root,
+static int is_first_ref(struct btrfs_root *root,
u64 ino, u64 dir,
const char *name, int name_len)
{
@@ -1638,11 +1633,11 @@ static int is_first_ref(struct send_ctx *sctx,
u64 tmp_dir;
u64 tmp_dir_gen;
- tmp_name = fs_path_alloc(sctx);
+ tmp_name = fs_path_alloc();
if (!tmp_name)
return -ENOMEM;
- ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
+ ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
if (ret < 0)
goto out;
@@ -1654,7 +1649,7 @@ static int is_first_ref(struct send_ctx *sctx,
ret = !memcmp(tmp_name->start, name, name_len);
out:
- fs_path_free(sctx, tmp_name);
+ fs_path_free(tmp_name);
return ret;
}
@@ -1783,11 +1778,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
if (!sctx->parent_root)
goto out;
- name = fs_path_alloc(sctx);
+ name = fs_path_alloc();
if (!name)
return -ENOMEM;
- ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name);
+ ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
if (ret < 0)
goto out;
@@ -1795,7 +1790,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
name->start, fs_path_len(name));
out:
- fs_path_free(sctx, name);
+ fs_path_free(name);
return ret;
}
@@ -1979,11 +1974,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
* send_root or parent_root for ref lookup.
*/
if (ino < sctx->send_progress)
- ret = get_first_ref(sctx, sctx->send_root, ino,
- parent_ino, parent_gen, dest);
+ ret = get_first_ref(sctx->send_root, ino,
+ parent_ino, parent_gen, dest);
else
- ret = get_first_ref(sctx, sctx->parent_root, ino,
- parent_ino, parent_gen, dest);
+ ret = get_first_ref(sctx->parent_root, ino,
+ parent_ino, parent_gen, dest);
if (ret < 0)
goto out;
@@ -2070,7 +2065,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
u64 parent_gen = 0;
int stop = 0;
- name = fs_path_alloc(sctx);
+ name = fs_path_alloc();
if (!name) {
ret = -ENOMEM;
goto out;
@@ -2098,7 +2093,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
}
out:
- fs_path_free(sctx, name);
+ fs_path_free(name);
if (!ret)
fs_path_unreverse(dest);
return ret;
@@ -2263,7 +2258,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2281,7 +2276,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -2292,7 +2287,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2310,7 +2305,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -2321,7 +2316,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2340,7 +2335,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -2356,7 +2351,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
verbose_printk("btrfs: send_utimes %llu\n", ino);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2397,7 +2392,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
btrfs_free_path(path);
return ret;
}
@@ -2418,7 +2413,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
verbose_printk("btrfs: send_create_inode %llu\n", ino);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2459,7 +2454,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
if (S_ISLNK(mode)) {
fs_path_reset(p);
- ret = read_symlink(sctx, sctx->send_root, ino, p);
+ ret = read_symlink(sctx->send_root, ino, p);
if (ret < 0)
goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
@@ -2476,7 +2471,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -2615,13 +2610,13 @@ static int record_ref(struct list_head *head, u64 dir,
return 0;
}
-static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
+static void __free_recorded_refs(struct list_head *head)
{
struct recorded_ref *cur;
while (!list_empty(head)) {
cur = list_entry(head->next, struct recorded_ref, list);
- fs_path_free(sctx, cur->full_path);
+ fs_path_free(cur->full_path);
list_del(&cur->list);
kfree(cur);
}
@@ -2629,8 +2624,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
static void free_recorded_refs(struct send_ctx *sctx)
{
- __free_recorded_refs(sctx, &sctx->new_refs);
- __free_recorded_refs(sctx, &sctx->deleted_refs);
+ __free_recorded_refs(&sctx->new_refs);
+ __free_recorded_refs(&sctx->deleted_refs);
}
/*
@@ -2644,7 +2639,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
int ret;
struct fs_path *orphan;
- orphan = fs_path_alloc(sctx);
+ orphan = fs_path_alloc();
if (!orphan)
return -ENOMEM;
@@ -2655,7 +2650,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
ret = send_rename(sctx, path, orphan);
out:
- fs_path_free(sctx, orphan);
+ fs_path_free(orphan);
return ret;
}
@@ -2746,7 +2741,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
*/
BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
- valid_path = fs_path_alloc(sctx);
+ valid_path = fs_path_alloc();
if (!valid_path) {
ret = -ENOMEM;
goto out;
@@ -2843,9 +2838,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
- ret = is_first_ref(sctx, sctx->parent_root,
- ow_inode, cur->dir, cur->name,
- cur->name_len);
+ ret = is_first_ref(sctx->parent_root,
+ ow_inode, cur->dir, cur->name,
+ cur->name_len);
if (ret < 0)
goto out;
if (ret) {
@@ -3024,7 +3019,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
out:
free_recorded_refs(sctx);
ulist_free(check_dirs);
- fs_path_free(sctx, valid_path);
+ fs_path_free(valid_path);
return ret;
}
@@ -3037,7 +3032,7 @@ static int __record_new_ref(int num, u64 dir, int index,
struct fs_path *p;
u64 gen;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3057,7 +3052,7 @@ static int __record_new_ref(int num, u64 dir, int index,
out:
if (ret)
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3070,7 +3065,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
struct fs_path *p;
u64 gen;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3090,7 +3085,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
out:
if (ret)
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3098,8 +3093,8 @@ static int record_new_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, __record_new_ref, sctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, 0, __record_new_ref, sctx);
if (ret < 0)
goto out;
ret = 0;
@@ -3112,8 +3107,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, __record_deleted_ref, sctx);
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, 0, __record_deleted_ref, sctx);
if (ret < 0)
goto out;
ret = 0;
@@ -3142,8 +3137,7 @@ static int __find_iref(int num, u64 dir, int index,
return 0;
}
-static int find_iref(struct send_ctx *sctx,
- struct btrfs_root *root,
+static int find_iref(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *key,
u64 dir, struct fs_path *name)
@@ -3155,7 +3149,7 @@ static int find_iref(struct send_ctx *sctx,
ctx.name = name;
ctx.found_idx = -1;
- ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx);
+ ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
if (ret < 0)
return ret;
@@ -3172,7 +3166,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index,
int ret;
struct send_ctx *sctx = ctx;
- ret = find_iref(sctx, sctx->parent_root, sctx->right_path,
+ ret = find_iref(sctx->parent_root, sctx->right_path,
sctx->cmp_key, dir, name);
if (ret == -ENOENT)
ret = __record_new_ref(num, dir, index, name, sctx);
@@ -3189,7 +3183,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index,
int ret;
struct send_ctx *sctx = ctx;
- ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
+ ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
dir, name);
if (ret == -ENOENT)
ret = __record_deleted_ref(num, dir, index, name, sctx);
@@ -3203,11 +3197,11 @@ static int record_changed_ref(struct send_ctx *sctx)
{
int ret = 0;
- ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
sctx->cmp_key, 0, __record_changed_new_ref, sctx);
if (ret < 0)
goto out;
- ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
if (ret < 0)
goto out;
@@ -3266,8 +3260,7 @@ static int process_all_refs(struct send_ctx *sctx,
found_key.type != BTRFS_INODE_EXTREF_KEY))
break;
- ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
- sctx);
+ ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
btrfs_release_path(path);
if (ret < 0)
goto out;
@@ -3335,7 +3328,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
struct fs_path *p;
posix_acl_xattr_header dummy_acl;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3362,7 +3355,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3375,7 +3368,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
struct send_ctx *sctx = ctx;
struct fs_path *p;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3386,7 +3379,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
ret = send_remove_xattr(sctx, p, name, name_len);
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3394,8 +3387,8 @@ static int process_new_xattr(struct send_ctx *sctx)
{
int ret = 0;
- ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
- sctx->cmp_key, __process_new_xattr, sctx);
+ ret = iterate_dir_item(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, __process_new_xattr, sctx);
return ret;
}
@@ -3404,8 +3397,8 @@ static int process_deleted_xattr(struct send_ctx *sctx)
{
int ret;
- ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
- sctx->cmp_key, __process_deleted_xattr, sctx);
+ ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, __process_deleted_xattr, sctx);
return ret;
}
@@ -3429,17 +3422,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
strncmp(name, ctx->name, name_len) == 0) {
ctx->found_idx = num;
ctx->found_data_len = data_len;
- ctx->found_data = kmalloc(data_len, GFP_NOFS);
+ ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
if (!ctx->found_data)
return -ENOMEM;
- memcpy(ctx->found_data, data, data_len);
return 1;
}
return 0;
}
-static int find_xattr(struct send_ctx *sctx,
- struct btrfs_root *root,
+static int find_xattr(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *key,
const char *name, int name_len,
@@ -3454,7 +3445,7 @@ static int find_xattr(struct send_ctx *sctx,
ctx.found_data = NULL;
ctx.found_data_len = 0;
- ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx);
+ ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
if (ret < 0)
return ret;
@@ -3479,11 +3470,10 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
struct send_ctx *sctx = ctx;
char *found_data = NULL;
int found_data_len = 0;
- struct fs_path *p = NULL;
- ret = find_xattr(sctx, sctx->parent_root, sctx->right_path,
- sctx->cmp_key, name, name_len, &found_data,
- &found_data_len);
+ ret = find_xattr(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, name, name_len, &found_data,
+ &found_data_len);
if (ret == -ENOENT) {
ret = __process_new_xattr(num, di_key, name, name_len, data,
data_len, type, ctx);
@@ -3498,7 +3488,6 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
}
kfree(found_data);
- fs_path_free(sctx, p);
return ret;
}
@@ -3510,8 +3499,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
int ret;
struct send_ctx *sctx = ctx;
- ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
- name, name_len, NULL, NULL);
+ ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ name, name_len, NULL, NULL);
if (ret == -ENOENT)
ret = __process_deleted_xattr(num, di_key, name, name_len, data,
data_len, type, ctx);
@@ -3525,11 +3514,11 @@ static int process_changed_xattr(struct send_ctx *sctx)
{
int ret = 0;
- ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
+ ret = iterate_dir_item(sctx->send_root, sctx->left_path,
sctx->cmp_key, __process_changed_new_xattr, sctx);
if (ret < 0)
goto out;
- ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
+ ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
sctx->cmp_key, __process_changed_deleted_xattr, sctx);
out:
@@ -3574,8 +3563,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
goto out;
}
- ret = iterate_dir_item(sctx, root, path, &found_key,
- __process_new_xattr, sctx);
+ ret = iterate_dir_item(root, path, &found_key,
+ __process_new_xattr, sctx);
if (ret < 0)
goto out;
@@ -3600,7 +3589,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
int num_read = 0;
mm_segment_t old_fs;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3642,7 +3631,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
set_fs(old_fs);
if (ret < 0)
return ret;
@@ -3665,7 +3654,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
clone_root->root->objectid, clone_root->ino,
clone_root->offset);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3688,8 +3677,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
goto out;
ret = get_cur_path(sctx, clone_root->ino, gen, p);
} else {
- ret = get_inode_path(sctx, clone_root->root,
- clone_root->ino, p);
+ ret = get_inode_path(clone_root->root, clone_root->ino, p);
}
if (ret < 0)
goto out;
@@ -3706,7 +3694,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3719,7 +3707,7 @@ static int send_update_extent(struct send_ctx *sctx,
int ret = 0;
struct fs_path *p;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3739,7 +3727,7 @@ static int send_update_extent(struct send_ctx *sctx,
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -4529,9 +4517,11 @@ static int send_subvol(struct send_ctx *sctx)
{
int ret;
- ret = send_header(sctx);
- if (ret < 0)
- goto out;
+ if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
+ ret = send_header(sctx);
+ if (ret < 0)
+ goto out;
+ }
ret = send_subvol_begin(sctx);
if (ret < 0)
@@ -4579,6 +4569,41 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
send_root = BTRFS_I(file_inode(mnt_file))->root;
fs_info = send_root->fs_info;
+ /*
+ * This is done when we lookup the root, it should already be complete
+ * by the time we get here.
+ */
+ WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
+
+ /*
+ * If we just created this root we need to make sure that the orphan
+ * cleanup has been done and committed since we search the commit root,
+ * so check its commit root transid with our otransid and if they match
+ * commit the transaction to make sure everything is updated.
+ */
+ down_read(&send_root->fs_info->extent_commit_sem);
+ if (btrfs_header_generation(send_root->commit_root) ==
+ btrfs_root_otransid(&send_root->root_item)) {
+ struct btrfs_trans_handle *trans;
+
+ up_read(&send_root->fs_info->extent_commit_sem);
+
+ trans = btrfs_attach_transaction_barrier(send_root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ /* ENOENT means there's no transaction */
+ } else {
+ ret = btrfs_commit_transaction(trans, send_root);
+ if (ret)
+ goto out;
+ }
+ } else {
+ up_read(&send_root->fs_info->extent_commit_sem);
+ }
+
arg = memdup_user(arg_, sizeof(*arg));
if (IS_ERR(arg)) {
ret = PTR_ERR(arg);
@@ -4593,7 +4618,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
goto out;
}
- if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) {
+ if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
ret = -EINVAL;
goto out;
}
@@ -4612,8 +4637,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
sctx->flags = arg->flags;
sctx->send_filp = fget(arg->send_fd);
- if (IS_ERR(sctx->send_filp)) {
- ret = PTR_ERR(sctx->send_filp);
+ if (!sctx->send_filp) {
+ ret = -EBADF;
goto out;
}
@@ -4663,10 +4688,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (!clone_root) {
- ret = -EINVAL;
- goto out;
- }
if (IS_ERR(clone_root)) {
ret = PTR_ERR(clone_root);
goto out;
@@ -4682,8 +4703,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (!sctx->parent_root) {
- ret = -EINVAL;
+ if (IS_ERR(sctx->parent_root)) {
+ ret = PTR_ERR(sctx->parent_root);
goto out;
}
}
@@ -4704,12 +4725,14 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
if (ret < 0)
goto out;
- ret = begin_cmd(sctx, BTRFS_SEND_C_END);
- if (ret < 0)
- goto out;
- ret = send_cmd(sctx);
- if (ret < 0)
- goto out;
+ if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
+ ret = begin_cmd(sctx, BTRFS_SEND_C_END);
+ if (ret < 0)
+ goto out;
+ ret = send_cmd(sctx);
+ if (ret < 0)
+ goto out;
+ }
out:
kfree(arg);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 8bb18f7ccaa6..48d425aef05b 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -131,5 +131,4 @@ enum {
#ifdef __KERNEL__
long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
-int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f6b88595f858..8eb6191d86da 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,11 +51,11 @@
#include "print-tree.h"
#include "xattr.h"
#include "volumes.h"
-#include "version.h"
#include "export.h"
#include "compression.h"
#include "rcu-string.h"
#include "dev-replace.h"
+#include "free-space-cache.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -63,9 +63,9 @@
static const struct super_operations btrfs_super_ops;
static struct file_system_type btrfs_fs_type;
-static const char *btrfs_decode_error(int errno, char nbuf[16])
+static const char *btrfs_decode_error(int errno)
{
- char *errstr = NULL;
+ char *errstr = "unknown";
switch (errno) {
case -EIO:
@@ -80,18 +80,18 @@ static const char *btrfs_decode_error(int errno, char nbuf[16])
case -EEXIST:
errstr = "Object already exists";
break;
- default:
- if (nbuf) {
- if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
- errstr = nbuf;
- }
+ case -ENOSPC:
+ errstr = "No space left";
+ break;
+ case -ENOENT:
+ errstr = "No such entry";
break;
}
return errstr;
}
-static void __save_error_info(struct btrfs_fs_info *fs_info)
+static void save_error_info(struct btrfs_fs_info *fs_info)
{
/*
* today we only save the error info into ram. Long term we'll
@@ -100,11 +100,6 @@ static void __save_error_info(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
}
-static void save_error_info(struct btrfs_fs_info *fs_info)
-{
- __save_error_info(fs_info);
-}
-
/* btrfs handle error by forcing the filesystem readonly */
static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
{
@@ -115,7 +110,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
sb->s_flags |= MS_RDONLY;
- printk(KERN_INFO "btrfs is forced readonly\n");
+ btrfs_info(fs_info, "forced readonly");
/*
* Note that a running device replace operation is not
* canceled here although there is no way to update
@@ -126,7 +121,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
* mounted writeable again, the device replace
* operation continues.
*/
-// WARN_ON(1);
}
}
@@ -139,7 +133,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
- char nbuf[16];
const char *errstr;
/*
@@ -149,7 +142,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
return;
- errstr = btrfs_decode_error(errno, nbuf);
+ errstr = btrfs_decode_error(errno);
if (fmt) {
struct va_format vaf;
va_list args;
@@ -158,19 +151,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",
- sb->s_id, function, line, errstr, &vaf);
+ printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s (%pV)\n",
+ sb->s_id, function, line, errno, errstr, &vaf);
va_end(args);
} else {
- printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
- sb->s_id, function, line, errstr);
+ printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s\n",
+ sb->s_id, function, line, errno, errstr);
}
/* Don't go through full error handling during mount */
- if (sb->s_flags & MS_BORN) {
- save_error_info(fs_info);
+ save_error_info(fs_info);
+ if (sb->s_flags & MS_BORN)
btrfs_handle_error(fs_info);
- }
}
static const char * const logtypes[] = {
@@ -184,7 +176,7 @@ static const char * const logtypes[] = {
"debug",
};
-void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
char lvl[4];
@@ -208,7 +200,7 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
vaf.fmt = fmt;
vaf.va = &args;
- printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf);
+ printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
va_end(args);
}
@@ -252,21 +244,30 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
unsigned int line, int errno)
{
- WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n");
+ /*
+ * Report first abort since mount
+ */
+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
+ &root->fs_info->fs_state)) {
+ WARN(1, KERN_DEBUG "btrfs: Transaction aborted (error %d)\n",
+ errno);
+ }
trans->aborted = errno;
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
if (!trans->blocks_used) {
- char nbuf[16];
const char *errstr;
- errstr = btrfs_decode_error(errno, nbuf);
- btrfs_printk(root->fs_info,
- "%s:%d: Aborting unused transaction(%s).\n",
- function, line, errstr);
+ errstr = btrfs_decode_error(errno);
+ btrfs_warn(root->fs_info,
+ "%s:%d: Aborting unused transaction(%s).",
+ function, line, errstr);
return;
}
ACCESS_ONCE(trans->transaction->aborted) = errno;
+ /* Wake up anybody who may be waiting on this transaction */
+ wake_up(&root->fs_info->transaction_wait);
+ wake_up(&root->fs_info->transaction_blocked_wait);
__btrfs_std_error(root->fs_info, function, line, errno, NULL);
}
/*
@@ -276,7 +277,6 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
- char nbuf[16];
char *s_id = "<unknown>";
const char *errstr;
struct va_format vaf = { .fmt = fmt };
@@ -288,13 +288,13 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
va_start(args, fmt);
vaf.va = &args;
- errstr = btrfs_decode_error(errno, nbuf);
+ errstr = btrfs_decode_error(errno);
if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
- panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
- s_id, function, line, &vaf, errstr);
+ panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
+ s_id, function, line, &vaf, errno, errstr);
- printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
- s_id, function, line, &vaf, errstr);
+ printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
+ s_id, function, line, &vaf, errno, errstr);
va_end(args);
/* Caller calls BUG() */
}
@@ -650,7 +650,7 @@ out:
*/
static int btrfs_parse_early_options(const char *options, fmode_t flags,
void *holder, char **subvol_name, u64 *subvol_objectid,
- u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
+ struct btrfs_fs_devices **fs_devices)
{
substring_t args[MAX_OPT_ARGS];
char *device_name, *opts, *orig, *p;
@@ -693,16 +693,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
}
break;
case Opt_subvolrootid:
- intarg = 0;
- error = match_int(&args[0], &intarg);
- if (!error) {
- /* we want the original fs_tree */
- if (!intarg)
- *subvol_rootid =
- BTRFS_FS_TREE_OBJECTID;
- else
- *subvol_rootid = intarg;
- }
+ printk(KERN_WARNING
+ "btrfs: 'subvolrootid' mount option is deprecated and has no effect\n");
break;
case Opt_device:
device_name = match_strdup(&args[0]);
@@ -786,9 +778,6 @@ find_root:
if (IS_ERR(new_root))
return ERR_CAST(new_root);
- if (btrfs_root_refs(&new_root->root_item) == 0)
- return ERR_PTR(-ENOENT);
-
dir_id = btrfs_root_dirid(&new_root->root_item);
setup_root:
location.objectid = dir_id;
@@ -876,7 +865,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(fs_info, 1);
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -1080,7 +1069,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
fmode_t mode = FMODE_READ;
char *subvol_name = NULL;
u64 subvol_objectid = 0;
- u64 subvol_rootid = 0;
int error = 0;
if (!(flags & MS_RDONLY))
@@ -1088,7 +1076,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
error = btrfs_parse_early_options(data, mode, fs_type,
&subvol_name, &subvol_objectid,
- &subvol_rootid, &fs_devices);
+ &fs_devices);
if (error) {
kfree(subvol_name);
return ERR_PTR(error);
@@ -1202,11 +1190,14 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
new_pool_size);
}
-static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info,
- unsigned long old_opts, int flags)
+static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
{
set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
+ unsigned long old_opts, int flags)
+{
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
(!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
(flags & MS_RDONLY))) {
@@ -1247,7 +1238,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
unsigned int old_metadata_ratio = fs_info->metadata_ratio;
int ret;
- btrfs_remount_prepare(fs_info, old_opts, *flags);
+ btrfs_remount_prepare(fs_info);
ret = btrfs_parse_options(root, data);
if (ret) {
@@ -1255,6 +1246,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
}
+ btrfs_remount_begin(fs_info, old_opts, *flags);
btrfs_resize_thread_pool(fs_info,
fs_info->thread_pool_size, old_thread_pool_size);
@@ -1270,6 +1262,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
btrfs_dev_replace_suspend_for_unmount(fs_info);
btrfs_scrub_cancel(fs_info);
+ btrfs_pause_balance(fs_info);
ret = btrfs_commit_super(root);
if (ret)
@@ -1691,6 +1684,18 @@ static void btrfs_interface_exit(void)
printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
}
+static void btrfs_print_info(void)
+{
+ printk(KERN_INFO "Btrfs loaded"
+#ifdef CONFIG_BTRFS_DEBUG
+ ", debug=on"
+#endif
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ ", integrity-checker=on"
+#endif
+ "\n");
+}
+
static int __init init_btrfs_fs(void)
{
int err;
@@ -1739,7 +1744,9 @@ static int __init init_btrfs_fs(void)
btrfs_init_lockdep();
- printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
+ btrfs_print_info();
+ btrfs_test_free_space_cache();
+
return 0;
unregister_ioctl:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 50767bbaad6c..d58cce77fc6c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -34,12 +34,43 @@
#define BTRFS_ROOT_TRANS_TAG 0
-void put_transaction(struct btrfs_transaction *transaction)
+static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+ [TRANS_STATE_RUNNING] = 0U,
+ [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
+ __TRANS_START),
+ [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH),
+ [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN),
+ [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK),
+ [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK),
+};
+
+static void put_transaction(struct btrfs_transaction *transaction)
{
WARN_ON(atomic_read(&transaction->use_count) == 0);
if (atomic_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
WARN_ON(transaction->delayed_refs.root.rb_node);
+ while (!list_empty(&transaction->pending_chunks)) {
+ struct extent_map *em;
+
+ em = list_first_entry(&transaction->pending_chunks,
+ struct extent_map, list);
+ list_del_init(&em->list);
+ free_extent_map(em);
+ }
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
}
@@ -50,18 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root)
root->commit_root = btrfs_root_node(root);
}
-static inline int can_join_transaction(struct btrfs_transaction *trans,
- int type)
+static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
+ unsigned int type)
{
- return !(trans->in_commit &&
- type != TRANS_JOIN &&
- type != TRANS_JOIN_NOLOCK);
+ if (type & TRANS_EXTWRITERS)
+ atomic_inc(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ if (type & TRANS_EXTWRITERS)
+ atomic_dec(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_init(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
+}
+
+static inline int extwriter_counter_read(struct btrfs_transaction *trans)
+{
+ return atomic_read(&trans->num_extwriters);
}
/*
* either allocate a new transaction or hop into the existing one
*/
-static noinline int join_transaction(struct btrfs_root *root, int type)
+static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
{
struct btrfs_transaction *cur_trans;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -74,32 +122,19 @@ loop:
return -EROFS;
}
- if (fs_info->trans_no_join) {
- /*
- * If we are JOIN_NOLOCK we're already committing a current
- * transaction, we just need a handle to deal with something
- * when committing the transaction, such as inode cache and
- * space cache. It is a special case.
- */
- if (type != TRANS_JOIN_NOLOCK) {
- spin_unlock(&fs_info->trans_lock);
- return -EBUSY;
- }
- }
-
cur_trans = fs_info->running_transaction;
if (cur_trans) {
if (cur_trans->aborted) {
spin_unlock(&fs_info->trans_lock);
return cur_trans->aborted;
}
- if (!can_join_transaction(cur_trans, type)) {
+ if (btrfs_blocked_trans_types[cur_trans->state] & type) {
spin_unlock(&fs_info->trans_lock);
return -EBUSY;
}
atomic_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
- cur_trans->num_joined++;
+ extwriter_counter_inc(cur_trans, type);
spin_unlock(&fs_info->trans_lock);
return 0;
}
@@ -112,6 +147,12 @@ loop:
if (type == TRANS_ATTACH)
return -ENOENT;
+ /*
+ * JOIN_NOLOCK only happens during the transaction commit, so
+ * it is impossible that ->running_transaction is NULL
+ */
+ BUG_ON(type == TRANS_JOIN_NOLOCK);
+
cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
if (!cur_trans)
return -ENOMEM;
@@ -120,7 +161,7 @@ loop:
if (fs_info->running_transaction) {
/*
* someone started a transaction after we unlocked. Make sure
- * to redo the trans_no_join checks above
+ * to redo the checks above
*/
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
goto loop;
@@ -131,17 +172,15 @@ loop:
}
atomic_set(&cur_trans->num_writers, 1);
- cur_trans->num_joined = 0;
+ extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
init_waitqueue_head(&cur_trans->commit_wait);
- cur_trans->in_commit = 0;
- cur_trans->blocked = 0;
+ cur_trans->state = TRANS_STATE_RUNNING;
/*
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
atomic_set(&cur_trans->use_count, 2);
- cur_trans->commit_done = 0;
cur_trans->start_time = get_seconds();
cur_trans->delayed_refs.root = RB_ROOT;
@@ -162,9 +201,8 @@ loop:
if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
"creating a fresh transaction\n");
- atomic_set(&fs_info->tree_mod_seq, 0);
+ atomic64_set(&fs_info->tree_mod_seq, 0);
- spin_lock_init(&cur_trans->commit_lock);
spin_lock_init(&cur_trans->delayed_refs.lock);
atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
@@ -172,6 +210,7 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->ordered_operations);
+ INIT_LIST_HEAD(&cur_trans->pending_chunks);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -269,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
return 0;
}
+static inline int is_transaction_blocked(struct btrfs_transaction *trans)
+{
+ return (trans->state >= TRANS_STATE_BLOCKED &&
+ trans->state < TRANS_STATE_UNBLOCKED &&
+ !trans->aborted);
+}
+
/* wait for commit against the current transaction to become unblocked
* when this is done, it is safe to start a new transaction, but the current
* transaction might not be fully on disk.
@@ -279,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root)
spin_lock(&root->fs_info->trans_lock);
cur_trans = root->fs_info->running_transaction;
- if (cur_trans && cur_trans->blocked) {
+ if (cur_trans && is_transaction_blocked(cur_trans)) {
atomic_inc(&cur_trans->use_count);
spin_unlock(&root->fs_info->trans_lock);
wait_event(root->fs_info->transaction_wait,
- !cur_trans->blocked);
+ cur_trans->state >= TRANS_STATE_UNBLOCKED ||
+ cur_trans->aborted);
put_transaction(cur_trans);
} else {
spin_unlock(&root->fs_info->trans_lock);
@@ -307,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
}
static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, int type,
+start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
enum btrfs_reserve_flush_enum flush)
{
struct btrfs_trans_handle *h;
@@ -320,7 +367,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
return ERR_PTR(-EROFS);
if (current->journal_info) {
- WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
+ WARN_ON(type & TRANS_EXTWRITERS);
h = current->journal_info;
h->use_count++;
WARN_ON(h->use_count > 2);
@@ -366,7 +413,7 @@ again:
* If we are ATTACH, it means we just want to catch the current
* transaction and commit it, so we needn't do sb_start_intwrite().
*/
- if (type < TRANS_JOIN_NOLOCK)
+ if (type & __TRANS_FREEZABLE)
sb_start_intwrite(root->fs_info->sb);
if (may_wait_transaction(root, type))
@@ -408,7 +455,8 @@ again:
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
- if (cur_trans->blocked && may_wait_transaction(root, type)) {
+ if (cur_trans->state >= TRANS_STATE_BLOCKED &&
+ may_wait_transaction(root, type)) {
btrfs_commit_transaction(h, root);
goto again;
}
@@ -429,7 +477,7 @@ got_it:
return h;
join_fail:
- if (type < TRANS_JOIN_NOLOCK)
+ if (type & __TRANS_FREEZABLE)
sb_end_intwrite(root->fs_info->sb);
kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
@@ -490,7 +538,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
}
/*
- * btrfs_attach_transaction() - catch the running transaction
+ * btrfs_attach_transaction_barrier() - catch the running transaction
*
* It is similar to the above function, the differentia is this one
* will wait for all the inactive transactions until they fully
@@ -512,7 +560,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
static noinline void wait_for_commit(struct btrfs_root *root,
struct btrfs_transaction *commit)
{
- wait_event(commit->commit_wait, commit->commit_done);
+ wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
}
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -548,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
spin_lock(&root->fs_info->trans_lock);
list_for_each_entry_reverse(t, &root->fs_info->trans_list,
list) {
- if (t->in_commit) {
- if (t->commit_done)
+ if (t->state >= TRANS_STATE_COMMIT_START) {
+ if (t->state == TRANS_STATE_COMPLETED)
break;
cur_trans = t;
atomic_inc(&cur_trans->use_count);
@@ -576,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root)
static int should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- int ret;
+ if (root->fs_info->global_block_rsv.space_info->full &&
+ btrfs_should_throttle_delayed_refs(trans, root))
+ return 1;
- ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
- return ret ? 1 : 0;
+ return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
}
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
@@ -590,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
int err;
smp_mb();
- if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
+ if (cur_trans->state >= TRANS_STATE_BLOCKED ||
+ cur_trans->delayed_refs.flushing)
return 1;
updates = trans->delayed_ref_updates;
@@ -609,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *info = root->fs_info;
- int count = 0;
+ unsigned long cur = trans->delayed_ref_updates;
int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
@@ -638,17 +688,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
- while (count < 1) {
- unsigned long cur = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (btrfs_should_throttle_delayed_refs(trans, root)) {
+ cur = max_t(unsigned long, cur, 1);
trans->delayed_ref_updates = 0;
- if (cur &&
- trans->transaction->delayed_refs.num_heads_ready > 64) {
- trans->delayed_ref_updates = 0;
- btrfs_run_delayed_refs(trans, root, cur);
- } else {
- break;
- }
- count++;
+ btrfs_run_delayed_refs(trans, root, cur);
}
btrfs_trans_release_metadata(trans, root);
@@ -658,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_create_pending_block_groups(trans, root);
if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
- should_end_transaction(trans, root)) {
- trans->transaction->blocked = 1;
- smp_wmb();
+ should_end_transaction(trans, root) &&
+ ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
+ spin_lock(&info->trans_lock);
+ if (cur_trans->state == TRANS_STATE_RUNNING)
+ cur_trans->state = TRANS_STATE_BLOCKED;
+ spin_unlock(&info->trans_lock);
}
- if (lock && cur_trans->blocked && !cur_trans->in_commit) {
+ if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
if (throttle) {
/*
* We may race with somebody else here so end up having
@@ -677,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
}
}
- if (trans->type < TRANS_JOIN_NOLOCK)
+ if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(root->fs_info->sb);
WARN_ON(cur_trans != info->running_transaction);
WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
atomic_dec(&cur_trans->num_writers);
+ extwriter_counter_dec(cur_trans, trans->type);
smp_mb();
if (waitqueue_active(&cur_trans->writer_wait))
@@ -707,23 +755,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- int ret;
-
- ret = __btrfs_end_transaction(trans, root, 0);
- if (ret)
- return ret;
- return 0;
+ return __btrfs_end_transaction(trans, root, 0);
}
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- int ret;
-
- ret = __btrfs_end_transaction(trans, root, 1);
- if (ret)
- return ret;
- return 0;
+ return __btrfs_end_transaction(trans, root, 1);
}
int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
@@ -746,9 +784,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
- struct blk_plug plug;
- blk_start_plug(&plug);
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
mark, &cached_state)) {
convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -762,7 +798,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
}
if (err)
werr = err;
- blk_finish_plug(&plug);
return werr;
}
@@ -807,8 +842,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
{
int ret;
int ret2;
+ struct blk_plug plug;
+ blk_start_plug(&plug);
ret = btrfs_write_marked_extents(root, dirty_pages, mark);
+ blk_finish_plug(&plug);
ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
if (ret)
@@ -948,7 +986,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
int btrfs_add_dead_root(struct btrfs_root *root)
{
spin_lock(&root->fs_info->trans_lock);
- list_add(&root->root_list, &root->fs_info->dead_roots);
+ list_add_tail(&root->root_list, &root->fs_info->dead_roots);
spin_unlock(&root->fs_info->trans_lock);
return 0;
}
@@ -1179,13 +1217,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
memcpy(new_root_item->parent_uuid, root->root_item.uuid,
BTRFS_UUID_SIZE);
+ if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
+ memset(new_root_item->received_uuid, 0,
+ sizeof(new_root_item->received_uuid));
+ memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
+ memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
+ btrfs_set_root_stransid(new_root_item, 0);
+ btrfs_set_root_rtransid(new_root_item, 0);
+ }
new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec);
new_root_item->otime.nsec = cpu_to_le32(cur_time.tv_nsec);
btrfs_set_root_otransid(new_root_item, trans->transid);
- memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
- memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
- btrfs_set_root_stransid(new_root_item, 0);
- btrfs_set_root_rtransid(new_root_item, 0);
old = btrfs_lock_root_node(root);
ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
@@ -1324,20 +1366,26 @@ static void update_super_roots(struct btrfs_root *root)
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
+ struct btrfs_transaction *trans;
int ret = 0;
+
spin_lock(&info->trans_lock);
- if (info->running_transaction)
- ret = info->running_transaction->in_commit;
+ trans = info->running_transaction;
+ if (trans)
+ ret = (trans->state >= TRANS_STATE_COMMIT_START);
spin_unlock(&info->trans_lock);
return ret;
}
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
+ struct btrfs_transaction *trans;
int ret = 0;
+
spin_lock(&info->trans_lock);
- if (info->running_transaction)
- ret = info->running_transaction->blocked;
+ trans = info->running_transaction;
+ if (trans)
+ ret = is_transaction_blocked(trans);
spin_unlock(&info->trans_lock);
return ret;
}
@@ -1349,7 +1397,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
static void wait_current_trans_commit_start(struct btrfs_root *root,
struct btrfs_transaction *trans)
{
- wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
+ wait_event(root->fs_info->transaction_blocked_wait,
+ trans->state >= TRANS_STATE_COMMIT_START ||
+ trans->aborted);
}
/*
@@ -1360,7 +1410,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
struct btrfs_transaction *trans)
{
wait_event(root->fs_info->transaction_wait,
- trans->commit_done || (trans->in_commit && !trans->blocked));
+ trans->state >= TRANS_STATE_UNBLOCKED ||
+ trans->aborted);
}
/*
@@ -1456,26 +1507,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->trans_lock);
- if (list_empty(&cur_trans->list)) {
- spin_unlock(&root->fs_info->trans_lock);
- btrfs_end_transaction(trans, root);
- return;
- }
+ /*
+ * If the transaction is removed from the list, it means this
+ * transaction has been committed successfully, so it is impossible
+ * to call the cleanup function.
+ */
+ BUG_ON(list_empty(&cur_trans->list));
list_del_init(&cur_trans->list);
if (cur_trans == root->fs_info->running_transaction) {
- root->fs_info->trans_no_join = 1;
+ cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&root->fs_info->trans_lock);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
spin_lock(&root->fs_info->trans_lock);
- root->fs_info->running_transaction = NULL;
}
spin_unlock(&root->fs_info->trans_lock);
btrfs_cleanup_one_transaction(trans->transaction, root);
+ spin_lock(&root->fs_info->trans_lock);
+ if (cur_trans == root->fs_info->running_transaction)
+ root->fs_info->running_transaction = NULL;
+ spin_unlock(&root->fs_info->trans_lock);
+
put_transaction(cur_trans);
put_transaction(cur_trans);
@@ -1492,24 +1548,8 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
- int snap_pending = 0;
int ret;
- if (!flush_on_commit) {
- spin_lock(&root->fs_info->trans_lock);
- if (!list_empty(&trans->transaction->pending_snapshots))
- snap_pending = 1;
- spin_unlock(&root->fs_info->trans_lock);
- }
-
- if (flush_on_commit || snap_pending) {
- ret = btrfs_start_delalloc_inodes(root, 1);
- if (ret)
- return ret;
- btrfs_wait_ordered_extents(root, 1);
- }
-
ret = btrfs_run_delayed_items(trans, root);
if (ret)
return ret;
@@ -1533,23 +1573,25 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
return ret;
}
-/*
- * btrfs_transaction state sequence:
- * in_commit = 0, blocked = 0 (initial)
- * in_commit = 1, blocked = 1
- * blocked = 0
- * commit_done = 1
- */
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+ return btrfs_start_all_delalloc_inodes(fs_info, 1);
+ return 0;
+}
+
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+ btrfs_wait_all_ordered_extents(fs_info, 1);
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- unsigned long joined = 0;
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
- DEFINE_WAIT(wait);
int ret;
- int should_grow = 0;
- unsigned long now = get_seconds();
ret = btrfs_run_ordered_operations(trans, root, 0);
if (ret) {
@@ -1588,6 +1630,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* start sending their work down.
*/
cur_trans->delayed_refs.flushing = 1;
+ smp_wmb();
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
@@ -1598,9 +1641,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
- spin_lock(&cur_trans->commit_lock);
- if (cur_trans->in_commit) {
- spin_unlock(&cur_trans->commit_lock);
+ spin_lock(&root->fs_info->trans_lock);
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+ spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
ret = btrfs_end_transaction(trans, root);
@@ -1611,16 +1654,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
- trans->transaction->in_commit = 1;
- trans->transaction->blocked = 1;
- spin_unlock(&cur_trans->commit_lock);
+ cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&root->fs_info->transaction_blocked_wait);
- spin_lock(&root->fs_info->trans_lock);
if (cur_trans->list.prev != &root->fs_info->trans_list) {
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
- if (!prev_trans->commit_done) {
+ if (prev_trans->state != TRANS_STATE_COMPLETED) {
atomic_inc(&prev_trans->use_count);
spin_unlock(&root->fs_info->trans_lock);
@@ -1634,42 +1674,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
spin_unlock(&root->fs_info->trans_lock);
}
- if (!btrfs_test_opt(root, SSD) &&
- (now < cur_trans->start_time || now - cur_trans->start_time < 1))
- should_grow = 1;
-
- do {
- joined = cur_trans->num_joined;
-
- WARN_ON(cur_trans != trans->transaction);
+ extwriter_counter_dec(cur_trans, trans->type);
- ret = btrfs_flush_all_pending_stuffs(trans, root);
- if (ret)
- goto cleanup_transaction;
-
- prepare_to_wait(&cur_trans->writer_wait, &wait,
- TASK_UNINTERRUPTIBLE);
+ ret = btrfs_start_delalloc_flush(root->fs_info);
+ if (ret)
+ goto cleanup_transaction;
- if (atomic_read(&cur_trans->num_writers) > 1)
- schedule_timeout(MAX_SCHEDULE_TIMEOUT);
- else if (should_grow)
- schedule_timeout(1);
+ ret = btrfs_flush_all_pending_stuffs(trans, root);
+ if (ret)
+ goto cleanup_transaction;
- finish_wait(&cur_trans->writer_wait, &wait);
- } while (atomic_read(&cur_trans->num_writers) > 1 ||
- (should_grow && cur_trans->num_joined != joined));
+ wait_event(cur_trans->writer_wait,
+ extwriter_counter_read(cur_trans) == 0);
+ /* some pending stuffs might be added after the previous flush. */
ret = btrfs_flush_all_pending_stuffs(trans, root);
if (ret)
goto cleanup_transaction;
+ btrfs_wait_delalloc_flush(root->fs_info);
/*
* Ok now we need to make sure to block out any other joins while we
* commit the transaction. We could have started a join before setting
- * no_join so make sure to wait for num_writers to == 1 again.
+ * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
*/
spin_lock(&root->fs_info->trans_lock);
- root->fs_info->trans_no_join = 1;
+ cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&root->fs_info->trans_lock);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
@@ -1796,10 +1826,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
sizeof(*root->fs_info->super_copy));
- trans->transaction->blocked = 0;
spin_lock(&root->fs_info->trans_lock);
+ cur_trans->state = TRANS_STATE_UNBLOCKED;
root->fs_info->running_transaction = NULL;
- root->fs_info->trans_no_join = 0;
spin_unlock(&root->fs_info->trans_lock);
mutex_unlock(&root->fs_info->reloc_mutex);
@@ -1808,7 +1837,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_write_and_wait_transaction(trans, root);
if (ret) {
btrfs_error(root->fs_info, ret,
- "Error while writing out transaction.");
+ "Error while writing out transaction");
mutex_unlock(&root->fs_info->tree_log_mutex);
goto cleanup_transaction;
}
@@ -1827,10 +1856,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_finish_extent_commit(trans, root);
- cur_trans->commit_done = 1;
-
root->fs_info->last_trans_committed = cur_trans->transid;
-
+ /*
+ * We needn't acquire the lock here because there is no other task
+ * which can change it.
+ */
+ cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
spin_lock(&root->fs_info->trans_lock);
@@ -1840,7 +1871,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
put_transaction(cur_trans);
put_transaction(cur_trans);
- if (trans->type < TRANS_JOIN_NOLOCK)
+ if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(root->fs_info->sb);
trace_btrfs_transaction_commit(root);
@@ -1864,8 +1895,7 @@ cleanup_transaction:
btrfs_qgroup_free(root, trans->qgroup_reserved);
trans->qgroup_reserved = 0;
}
- btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
-// WARN_ON(1);
+ btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
if (current->journal_info == trans)
current->journal_info = NULL;
cleanup_transaction(trans, root, ret);
@@ -1874,31 +1904,44 @@ cleanup_transaction:
}
/*
- * interface function to delete all the snapshots we have scheduled for deletion
+ * return < 0 if error
+ * 0 if there are no more dead_roots at the time of call
+ * 1 there are more to be processed, call me again
+ *
+ * The return value indicates there are certainly more snapshots to delete, but
+ * if there comes a new one during processing, it may return 0. We don't mind,
+ * because btrfs_commit_super will poke cleaner thread and it will process it a
+ * few seconds later.
*/
-int btrfs_clean_old_snapshots(struct btrfs_root *root)
+int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
{
- LIST_HEAD(list);
+ int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&fs_info->trans_lock);
- list_splice_init(&fs_info->dead_roots, &list);
+ if (list_empty(&fs_info->dead_roots)) {
+ spin_unlock(&fs_info->trans_lock);
+ return 0;
+ }
+ root = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
+ list_del(&root->root_list);
spin_unlock(&fs_info->trans_lock);
- while (!list_empty(&list)) {
- int ret;
-
- root = list_entry(list.next, struct btrfs_root, root_list);
- list_del(&root->root_list);
+ pr_debug("btrfs: cleaner removing %llu\n",
+ (unsigned long long)root->objectid);
- btrfs_kill_all_delayed_nodes(root);
+ btrfs_kill_all_delayed_nodes(root);
- if (btrfs_header_backref_rev(root->node) <
- BTRFS_MIXED_BACKREF_REV)
- ret = btrfs_drop_snapshot(root, NULL, 0, 0);
- else
- ret =btrfs_drop_snapshot(root, NULL, 1, 0);
- BUG_ON(ret < 0);
- }
- return 0;
+ if (btrfs_header_backref_rev(root->node) <
+ BTRFS_MIXED_BACKREF_REV)
+ ret = btrfs_drop_snapshot(root, NULL, 0, 0);
+ else
+ ret = btrfs_drop_snapshot(root, NULL, 1, 0);
+ /*
+ * If we encounter a transaction abort during snapshot cleaning, we
+ * don't want to crash here
+ */
+ BUG_ON(ret < 0 && ret != -EAGAIN && ret != -EROFS);
+ return 1;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 3c8e0d25c8e4..005b0375d18c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -22,21 +22,33 @@
#include "delayed-ref.h"
#include "ctree.h"
+enum btrfs_trans_state {
+ TRANS_STATE_RUNNING = 0,
+ TRANS_STATE_BLOCKED = 1,
+ TRANS_STATE_COMMIT_START = 2,
+ TRANS_STATE_COMMIT_DOING = 3,
+ TRANS_STATE_UNBLOCKED = 4,
+ TRANS_STATE_COMPLETED = 5,
+ TRANS_STATE_MAX = 6,
+};
+
struct btrfs_transaction {
u64 transid;
/*
+ * total external writers(USERSPACE/START/ATTACH) in this
+ * transaction, it must be zero before the transaction is
+ * being committed
+ */
+ atomic_t num_extwriters;
+ /*
* total writers in this transaction, it must be zero before the
* transaction can end
*/
atomic_t num_writers;
atomic_t use_count;
- unsigned long num_joined;
-
- spinlock_t commit_lock;
- int in_commit;
- int commit_done;
- int blocked;
+ /* Be protected by fs_info->trans_lock when we want to change it. */
+ enum btrfs_trans_state state;
struct list_head list;
struct extent_io_tree dirty_pages;
unsigned long start_time;
@@ -44,17 +56,27 @@ struct btrfs_transaction {
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
struct list_head ordered_operations;
+ struct list_head pending_chunks;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
};
-enum btrfs_trans_type {
- TRANS_START,
- TRANS_JOIN,
- TRANS_USERSPACE,
- TRANS_JOIN_NOLOCK,
- TRANS_ATTACH,
-};
+#define __TRANS_FREEZABLE (1U << 0)
+
+#define __TRANS_USERSPACE (1U << 8)
+#define __TRANS_START (1U << 9)
+#define __TRANS_ATTACH (1U << 10)
+#define __TRANS_JOIN (1U << 11)
+#define __TRANS_JOIN_NOLOCK (1U << 12)
+
+#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
+#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
+#define TRANS_ATTACH (__TRANS_ATTACH)
+#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
+#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
+
+#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
+ __TRANS_ATTACH)
struct btrfs_trans_handle {
u64 transid;
@@ -70,7 +92,7 @@ struct btrfs_trans_handle {
short aborted;
short adding_csums;
bool allocating_chunk;
- enum btrfs_trans_type type;
+ unsigned int type;
/*
* this root is only needed to validate that the root passed to
* start_transaction is the same as the one passed to end_transaction.
@@ -123,7 +145,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
int btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
-int btrfs_clean_old_snapshots(struct btrfs_root *root);
+int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
@@ -146,5 +168,4 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
-void put_transaction(struct btrfs_transaction *transaction);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ef96381569a4..2c6791493637 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "transaction.h"
@@ -277,17 +278,31 @@ static int process_one_buffer(struct btrfs_root *log,
struct extent_buffer *eb,
struct walk_control *wc, u64 gen)
{
+ int ret = 0;
+
+ /*
+ * If this fs is mixed then we need to be able to process the leaves to
+ * pin down any logged extents, so we have to read the block.
+ */
+ if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
+ ret = btrfs_read_buffer(eb, gen);
+ if (ret)
+ return ret;
+ }
+
if (wc->pin)
- btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
- eb->start, eb->len);
+ ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
+ eb->start, eb->len);
- if (btrfs_buffer_uptodate(eb, gen, 0)) {
+ if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
+ if (wc->pin && btrfs_header_level(eb) == 0)
+ ret = btrfs_exclude_logged_extents(log, eb);
if (wc->write)
btrfs_write_tree_block(eb);
if (wc->wait)
btrfs_wait_tree_block_writeback(eb);
}
- return 0;
+ return ret;
}
/*
@@ -408,9 +423,9 @@ insert:
found_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
if (found_size > item_size)
- btrfs_truncate_item(trans, root, path, item_size, 1);
+ btrfs_truncate_item(root, path, item_size, 1);
else if (found_size < item_size)
- btrfs_extend_item(trans, root, path,
+ btrfs_extend_item(root, path,
item_size - found_size);
} else if (ret) {
return ret;
@@ -587,7 +602,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
/* drop any overlapping extents */
ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
- BUG_ON(ret);
+ if (ret)
+ goto out;
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -597,7 +613,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, key,
sizeof(*item));
- BUG_ON(ret);
+ if (ret)
+ goto out;
dest_offset = btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
copy_extent_buffer(path->nodes[0], eb, dest_offset,
@@ -623,7 +640,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ins.objectid, ins.offset,
0, root->root_key.objectid,
key->objectid, offset, 0);
- BUG_ON(ret);
+ if (ret)
+ goto out;
} else {
/*
* insert the extent pointer in the extent
@@ -632,7 +650,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_alloc_logged_file_extent(trans,
root, root->root_key.objectid,
key->objectid, offset, &ins);
- BUG_ON(ret);
+ if (ret)
+ goto out;
}
btrfs_release_path(path);
@@ -649,26 +668,30 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range(root->log_root,
csum_start, csum_end - 1,
&ordered_sums, 0);
- BUG_ON(ret);
+ if (ret)
+ goto out;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums;
sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
- ret = btrfs_csum_file_blocks(trans,
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans,
root->fs_info->csum_root,
sums);
- BUG_ON(ret);
list_del(&sums->list);
kfree(sums);
}
+ if (ret)
+ goto out;
} else {
btrfs_release_path(path);
}
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
/* inline extents are easy, we just overwrite them */
ret = overwrite_item(trans, root, path, eb, slot, key);
- BUG_ON(ret);
+ if (ret)
+ goto out;
}
inode_add_bytes(inode, nbytes);
@@ -713,20 +736,21 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
inode = read_one_inode(root, location.objectid);
if (!inode) {
- kfree(name);
- return -EIO;
+ ret = -EIO;
+ goto out;
}
ret = link_to_fixup_dir(trans, root, path, location.objectid);
- BUG_ON(ret);
+ if (ret)
+ goto out;
ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
- BUG_ON(ret);
+ if (ret)
+ goto out;
+ btrfs_run_delayed_items(trans, root);
+out:
kfree(name);
-
iput(inode);
-
- btrfs_run_delayed_items(trans, root);
return ret;
}
@@ -879,7 +903,8 @@ again:
victim_name_len = btrfs_inode_ref_name_len(leaf,
victim_ref);
victim_name = kmalloc(victim_name_len, GFP_NOFS);
- BUG_ON(!victim_name);
+ if (!victim_name)
+ return -ENOMEM;
read_extent_buffer(leaf, victim_name,
(unsigned long)(victim_ref + 1),
@@ -895,9 +920,10 @@ again:
ret = btrfs_unlink_inode(trans, root, dir,
inode, victim_name,
victim_name_len);
- BUG_ON(ret);
- btrfs_run_delayed_items(trans, root);
kfree(victim_name);
+ if (ret)
+ return ret;
+ btrfs_run_delayed_items(trans, root);
*search_done = 1;
goto again;
}
@@ -905,7 +931,6 @@ again:
ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
}
- BUG_ON(ret);
/*
* NOTE: we have searched root tree and checked the
@@ -939,6 +964,8 @@ again:
goto next;
victim_name = kmalloc(victim_name_len, GFP_NOFS);
+ if (!victim_name)
+ return -ENOMEM;
read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
victim_name_len);
@@ -965,14 +992,16 @@ again:
victim_name_len);
btrfs_run_delayed_items(trans, root);
}
- BUG_ON(ret);
iput(victim_parent);
kfree(victim_name);
+ if (ret)
+ return ret;
*search_done = 1;
goto again;
}
kfree(victim_name);
- BUG_ON(ret);
+ if (ret)
+ return ret;
next:
cur_offset += victim_name_len + sizeof(*extref);
}
@@ -985,7 +1014,8 @@ next:
ref_index, name, namelen, 0);
if (di && !IS_ERR(di)) {
ret = drop_one_dir_item(trans, root, path, dir, di);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
btrfs_release_path(path);
@@ -994,7 +1024,8 @@ next:
name, namelen, 0);
if (di && !IS_ERR(di)) {
ret = drop_one_dir_item(trans, root, path, dir, di);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
btrfs_release_path(path);
@@ -1139,15 +1170,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
parent_objectid,
ref_index, name, namelen,
&search_done);
- if (ret == 1)
+ if (ret == 1) {
+ ret = 0;
+ goto out;
+ }
+ if (ret)
goto out;
- BUG_ON(ret);
}
/* insert our name */
ret = btrfs_add_link(trans, dir, inode, name, namelen,
0, ref_index);
- BUG_ON(ret);
+ if (ret)
+ goto out;
btrfs_update_inode(trans, root, inode);
}
@@ -1162,13 +1197,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
/* finally write the back reference in the inode */
ret = overwrite_item(trans, root, path, eb, slot, key);
- BUG_ON(ret);
-
out:
btrfs_release_path(path);
iput(dir);
iput(inode);
- return 0;
+ return ret;
}
static int insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -1326,10 +1359,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (S_ISDIR(inode->i_mode)) {
ret = replay_dir_deletes(trans, root, NULL, path,
ino, 1);
- BUG_ON(ret);
+ if (ret)
+ goto out;
}
ret = insert_orphan_item(trans, root, ino);
- BUG_ON(ret);
}
out:
@@ -1374,9 +1407,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
return -EIO;
ret = fixup_inode_link_count(trans, root, inode);
- BUG_ON(ret);
-
iput(inode);
+ if (ret)
+ goto out;
/*
* fixup on a directory may create new entries,
@@ -1426,7 +1459,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
} else if (ret == -EEXIST) {
ret = 0;
} else {
- BUG();
+ BUG(); /* Logic Error */
}
iput(inode);
@@ -1495,7 +1528,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct inode *dir;
u8 log_type;
int exists;
- int ret;
+ int ret = 0;
dir = read_one_inode(root, key->objectid);
if (!dir)
@@ -1527,7 +1560,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
key->offset, name,
name_len, 1);
} else {
- BUG();
+ /* Corruption */
+ ret = -EINVAL;
+ goto out;
}
if (IS_ERR_OR_NULL(dst_di)) {
/* we need a sequence number to insert, so we only
@@ -1555,7 +1590,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
goto out;
ret = drop_one_dir_item(trans, root, path, dir, dst_di);
- BUG_ON(ret);
+ if (ret)
+ goto out;
if (key->type == BTRFS_DIR_INDEX_KEY)
goto insert;
@@ -1563,14 +1599,15 @@ out:
btrfs_release_path(path);
kfree(name);
iput(dir);
- return 0;
+ return ret;
insert:
btrfs_release_path(path);
ret = insert_one_name(trans, root, path, key->objectid, key->offset,
name, name_len, log_type, &log_key);
-
- BUG_ON(ret && ret != -ENOENT);
+ if (ret && ret != -ENOENT)
+ goto out;
+ ret = 0;
goto out;
}
@@ -1601,7 +1638,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
return -EIO;
name_len = btrfs_dir_name_len(eb, di);
ret = replay_one_name(trans, root, path, eb, di, key);
- BUG_ON(ret);
+ if (ret)
+ return ret;
ptr = (unsigned long)(di + 1);
ptr += name_len;
}
@@ -1762,16 +1800,21 @@ again:
ret = link_to_fixup_dir(trans, root,
path, location.objectid);
- BUG_ON(ret);
+ if (ret) {
+ kfree(name);
+ iput(inode);
+ goto out;
+ }
+
btrfs_inc_nlink(inode);
ret = btrfs_unlink_inode(trans, root, dir, inode,
name, name_len);
- BUG_ON(ret);
-
- btrfs_run_delayed_items(trans, root);
-
+ if (!ret)
+ btrfs_run_delayed_items(trans, root);
kfree(name);
iput(inode);
+ if (ret)
+ goto out;
/* there might still be more names under this key
* check and repeat if required
@@ -1875,7 +1918,8 @@ again:
ret = check_item_in_log(trans, root, log, path,
log_path, dir,
&found_key);
- BUG_ON(ret);
+ if (ret)
+ goto out;
if (found_key.offset == (u64)-1)
break;
dir_key.offset = found_key.offset + 1;
@@ -1952,11 +1996,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
if (S_ISDIR(mode)) {
ret = replay_dir_deletes(wc->trans,
root, log, path, key.objectid, 0);
- BUG_ON(ret);
+ if (ret)
+ break;
}
ret = overwrite_item(wc->trans, root, path,
eb, i, &key);
- BUG_ON(ret);
+ if (ret)
+ break;
/* for regular files, make sure corresponding
* orhpan item exist. extents past the new EOF
@@ -1965,12 +2011,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
if (S_ISREG(mode)) {
ret = insert_orphan_item(wc->trans, root,
key.objectid);
- BUG_ON(ret);
+ if (ret)
+ break;
}
ret = link_to_fixup_dir(wc->trans, root,
path, key.objectid);
- BUG_ON(ret);
+ if (ret)
+ break;
}
if (wc->stage < LOG_WALK_REPLAY_ALL)
continue;
@@ -1979,28 +2027,30 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
if (key.type == BTRFS_XATTR_ITEM_KEY) {
ret = overwrite_item(wc->trans, root, path,
eb, i, &key);
- BUG_ON(ret);
- } else if (key.type == BTRFS_INODE_REF_KEY) {
- ret = add_inode_ref(wc->trans, root, log, path,
- eb, i, &key);
- BUG_ON(ret && ret != -ENOENT);
- } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
+ if (ret)
+ break;
+ } else if (key.type == BTRFS_INODE_REF_KEY ||
+ key.type == BTRFS_INODE_EXTREF_KEY) {
ret = add_inode_ref(wc->trans, root, log, path,
eb, i, &key);
- BUG_ON(ret && ret != -ENOENT);
+ if (ret && ret != -ENOENT)
+ break;
+ ret = 0;
} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
ret = replay_one_extent(wc->trans, root, path,
eb, i, &key);
- BUG_ON(ret);
+ if (ret)
+ break;
} else if (key.type == BTRFS_DIR_ITEM_KEY ||
key.type == BTRFS_DIR_INDEX_KEY) {
ret = replay_one_dir_item(wc->trans, root, path,
eb, i, &key);
- BUG_ON(ret);
+ if (ret)
+ break;
}
}
btrfs_free_path(path);
- return 0;
+ return ret;
}
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
@@ -2045,8 +2095,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
if (*level == 1) {
ret = wc->process_func(root, next, wc, ptr_gen);
- if (ret)
+ if (ret) {
+ free_extent_buffer(next);
return ret;
+ }
path->slots[*level]++;
if (wc->free) {
@@ -2066,7 +2118,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_free_and_pin_reserved_extent(root,
bytenr, blocksize);
- BUG_ON(ret); /* -ENOMEM or logic errors */
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
}
free_extent_buffer(next);
continue;
@@ -2139,7 +2194,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
ret = btrfs_free_and_pin_reserved_extent(root,
path->nodes[*level]->start,
path->nodes[*level]->len);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
@@ -2161,7 +2217,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
int wret;
int level;
struct btrfs_path *path;
- int i;
int orig_level;
path = btrfs_alloc_path();
@@ -2213,17 +2268,12 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_free_and_pin_reserved_extent(log, next->start,
next->len);
- BUG_ON(ret); /* -ENOMEM or logic errors */
+ if (ret)
+ goto out;
}
}
out:
- for (i = 0; i <= orig_level; i++) {
- if (path->nodes[i]) {
- free_extent_buffer(path->nodes[i]);
- path->nodes[i] = NULL;
- }
- }
btrfs_free_path(path);
return ret;
}
@@ -2316,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
unsigned long log_transid = 0;
+ struct blk_plug plug;
mutex_lock(&root->log_mutex);
log_transid = root->log_transid;
@@ -2359,8 +2410,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* we start IO on all the marked extents here, but we don't actually
* wait for them until later.
*/
+ blk_start_plug(&plug);
ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
if (ret) {
+ blk_finish_plug(&plug);
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
@@ -2395,6 +2448,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (ret) {
+ blk_finish_plug(&plug);
if (ret != -ENOSPC) {
btrfs_abort_transaction(trans, root, ret);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2410,6 +2464,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
+ blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
wait_log_commit(trans, log_root_tree,
log_root_tree->log_transid);
@@ -2432,6 +2487,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* check the full commit flag again
*/
if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2439,9 +2495,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out_wake_log_root;
}
- ret = btrfs_write_and_wait_marked_extents(log_root_tree,
- &log_root_tree->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ ret = btrfs_write_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_DIRTY | EXTENT_NEW);
+ blk_finish_plug(&plug);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
@@ -2449,6 +2506,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out_wake_log_root;
}
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_wait_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_NEW | EXTENT_DIRTY);
btrfs_wait_logged_extents(log, log_transid);
btrfs_set_super_log_root(root->fs_info->super_for_commit,
@@ -2507,7 +2567,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
ret = walk_log_tree(trans, log, &wc);
- BUG_ON(ret);
+
+ /* I don't think this can happen but just in case */
+ if (ret)
+ btrfs_abort_transaction(trans, log, ret);
}
while (1) {
@@ -2615,7 +2678,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
bytes_del += name_len;
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
}
btrfs_release_path(path);
di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
@@ -2627,7 +2693,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
bytes_del += name_len;
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
}
/* update the directory size in the log to reflect the names
@@ -2966,7 +3035,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
while (1) {
ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Logic error */
if (ret < 0)
break;
@@ -3169,7 +3238,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
log->fs_info->csum_root,
ds + cs, ds + cs + cl - 1,
&ordered_sums, 0);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_release_path(dst_path);
+ kfree(ins_data);
+ return ret;
+ }
}
}
}
@@ -3209,115 +3282,6 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
return 0;
}
-static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- struct extent_map *em,
- struct btrfs_path *path)
-{
- struct btrfs_file_extent_item *fi;
- struct extent_buffer *leaf;
- struct btrfs_key key, new_key;
- struct btrfs_map_token token;
- u64 extent_end;
- u64 extent_offset = 0;
- int extent_type;
- int del_slot = 0;
- int del_nr = 0;
- int ret = 0;
-
- while (1) {
- btrfs_init_map_token(&token);
- leaf = path->nodes[0];
- path->slots[0]++;
- if (path->slots[0] >= btrfs_header_nritems(leaf)) {
- if (del_nr) {
- ret = btrfs_del_items(trans, root, path,
- del_slot, del_nr);
- if (ret)
- return ret;
- del_nr = 0;
- }
-
- ret = btrfs_next_leaf_write(trans, root, path, 1);
- if (ret < 0)
- return ret;
- if (ret > 0)
- return 0;
- leaf = path->nodes[0];
- }
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != btrfs_ino(inode) ||
- key.type != BTRFS_EXTENT_DATA_KEY ||
- key.offset >= em->start + em->len)
- break;
-
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
- if (extent_type == BTRFS_FILE_EXTENT_REG ||
- extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
- extent_offset = btrfs_token_file_extent_offset(leaf,
- fi, &token);
- extent_end = key.offset +
- btrfs_token_file_extent_num_bytes(leaf, fi,
- &token);
- } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- extent_end = key.offset +
- btrfs_file_extent_inline_len(leaf, fi);
- } else {
- BUG();
- }
-
- if (extent_end <= em->len + em->start) {
- if (!del_nr) {
- del_slot = path->slots[0];
- }
- del_nr++;
- continue;
- }
-
- /*
- * Ok so we'll ignore previous items if we log a new extent,
- * which can lead to overlapping extents, so if we have an
- * existing extent we want to adjust we _have_ to check the next
- * guy to make sure we even need this extent anymore, this keeps
- * us from panicing in set_item_key_safe.
- */
- if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
- struct btrfs_key tmp_key;
-
- btrfs_item_key_to_cpu(leaf, &tmp_key,
- path->slots[0] + 1);
- if (tmp_key.objectid == btrfs_ino(inode) &&
- tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
- tmp_key.offset <= em->start + em->len) {
- if (!del_nr)
- del_slot = path->slots[0];
- del_nr++;
- continue;
- }
- }
-
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
- memcpy(&new_key, &key, sizeof(new_key));
- new_key.offset = em->start + em->len;
- btrfs_set_item_key_safe(trans, root, path, &new_key);
- extent_offset += em->start + em->len - key.offset;
- btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
- &token);
- btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
- (em->start + em->len),
- &token);
- btrfs_mark_buffer_dirty(leaf);
- }
-
- if (del_nr)
- ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
-
- return ret;
-}
-
static int log_one_extent(struct btrfs_trans_handle *trans,
struct inode *inode, struct btrfs_root *root,
struct extent_map *em, struct btrfs_path *path)
@@ -3339,39 +3303,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
int index = log->log_transid % 2;
bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
-insert:
+ ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
+ em->start + em->len, NULL, 0);
+ if (ret)
+ return ret;
+
INIT_LIST_HEAD(&ordered_sums);
btrfs_init_map_token(&token);
key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = em->start;
- path->really_keep_locks = 1;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
- if (ret && ret != -EEXIST) {
- path->really_keep_locks = 0;
+ if (ret)
return ret;
- }
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- /*
- * If we are overwriting an inline extent with a real one then we need
- * to just delete the inline extent as it may not be large enough to
- * have the entire file_extent_item.
- */
- if (ret && btrfs_token_file_extent_type(leaf, fi, &token) ==
- BTRFS_FILE_EXTENT_INLINE) {
- ret = btrfs_del_item(trans, log, path);
- btrfs_release_path(path);
- if (ret) {
- path->really_keep_locks = 0;
- return ret;
- }
- goto insert;
- }
-
btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
&token);
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
@@ -3410,22 +3359,14 @@ insert:
em->start - em->orig_start,
&token);
btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
- btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
+ btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
&token);
btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
btrfs_mark_buffer_dirty(leaf);
- /*
- * Have to check the extent to the right of us to make sure it doesn't
- * fall in our current range. We're ok if the previous extent is in our
- * range since the recovery stuff will run us in key order and thus just
- * drop the part we overwrote.
- */
- ret = drop_adjacent_extents(trans, log, inode, em, path);
btrfs_release_path(path);
- path->really_keep_locks = 0;
if (ret) {
return ret;
}
@@ -3650,8 +3591,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
bool fast_search = false;
u64 ino = btrfs_ino(inode);
- log = root->log_root;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -3918,9 +3857,9 @@ out:
* only logging is done of any parent directories that are older than
* the last committed transaction
*/
-int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- struct dentry *parent, int exists_only)
+static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct dentry *parent, int exists_only)
{
int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
@@ -4095,8 +4034,7 @@ again:
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
- log = btrfs_read_fs_root_no_radix(log_root_tree,
- &found_key);
+ log = btrfs_read_fs_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
btrfs_error(fs_info, ret,
@@ -4111,6 +4049,9 @@ again:
wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
if (IS_ERR(wc.replay_dest)) {
ret = PTR_ERR(wc.replay_dest);
+ free_extent_buffer(log->node);
+ free_extent_buffer(log->commit_root);
+ kfree(log);
btrfs_error(fs_info, ret, "Couldn't read target root "
"for tree log recovery.");
goto error;
@@ -4119,12 +4060,10 @@ again:
wc.replay_dest->log_root = log;
btrfs_record_root_in_trans(trans, wc.replay_dest);
ret = walk_log_tree(trans, log, &wc);
- BUG_ON(ret);
- if (wc.stage == LOG_WALK_REPLAY_ALL) {
+ if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
path);
- BUG_ON(ret);
}
key.offset = found_key.offset - 1;
@@ -4133,6 +4072,9 @@ again:
free_extent_buffer(log->commit_root);
kfree(log);
+ if (ret)
+ goto error;
+
if (found_key.offset == 0)
break;
}
@@ -4153,17 +4095,20 @@ again:
btrfs_free_path(path);
+ /* step 4: commit the transaction, which also unpins the blocks */
+ ret = btrfs_commit_transaction(trans, fs_info->tree_root);
+ if (ret)
+ return ret;
+
free_extent_buffer(log_root_tree->node);
log_root_tree->log_root = NULL;
fs_info->log_root_recovering = 0;
-
- /* step 4: commit the transaction, which also unpins the blocks */
- btrfs_commit_transaction(trans, fs_info->tree_root);
-
kfree(log_root_tree);
- return 0;
+ return 0;
error:
+ if (wc.trans)
+ btrfs_end_transaction(wc.trans, fs_info->tree_root);
btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 862ac813f6b8..1d4ae0d15a70 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -40,9 +40,6 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
int btrfs_pin_log_trans(struct btrfs_root *root);
-int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- struct dentry *parent, int exists_only);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode,
int for_rename);
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index ddc61cad0080..b0a523b2c60e 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -53,6 +53,7 @@ void ulist_init(struct ulist *ulist)
ulist->nnodes = 0;
ulist->nodes = ulist->int_nodes;
ulist->nodes_alloced = ULIST_SIZE;
+ ulist->root = RB_ROOT;
}
EXPORT_SYMBOL(ulist_init);
@@ -72,6 +73,7 @@ void ulist_fini(struct ulist *ulist)
if (ulist->nodes_alloced > ULIST_SIZE)
kfree(ulist->nodes);
ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */
+ ulist->root = RB_ROOT;
}
EXPORT_SYMBOL(ulist_fini);
@@ -123,6 +125,45 @@ void ulist_free(struct ulist *ulist)
}
EXPORT_SYMBOL(ulist_free);
+static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
+{
+ struct rb_node *n = ulist->root.rb_node;
+ struct ulist_node *u = NULL;
+
+ while (n) {
+ u = rb_entry(n, struct ulist_node, rb_node);
+ if (u->val < val)
+ n = n->rb_right;
+ else if (u->val > val)
+ n = n->rb_left;
+ else
+ return u;
+ }
+ return NULL;
+}
+
+static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
+{
+ struct rb_node **p = &ulist->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct ulist_node *cur = NULL;
+
+ while (*p) {
+ parent = *p;
+ cur = rb_entry(parent, struct ulist_node, rb_node);
+
+ if (cur->val < ins->val)
+ p = &(*p)->rb_right;
+ else if (cur->val > ins->val)
+ p = &(*p)->rb_left;
+ else
+ return -EEXIST;
+ }
+ rb_link_node(&ins->rb_node, parent, p);
+ rb_insert_color(&ins->rb_node, &ulist->root);
+ return 0;
+}
+
/**
* ulist_add - add an element to the ulist
* @ulist: ulist to add the element to
@@ -151,20 +192,23 @@ int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
u64 *old_aux, gfp_t gfp_mask)
{
- int i;
-
- for (i = 0; i < ulist->nnodes; ++i) {
- if (ulist->nodes[i].val == val) {
- if (old_aux)
- *old_aux = ulist->nodes[i].aux;
- return 0;
- }
+ int ret = 0;
+ struct ulist_node *node = NULL;
+ node = ulist_rbtree_search(ulist, val);
+ if (node) {
+ if (old_aux)
+ *old_aux = node->aux;
+ return 0;
}
if (ulist->nnodes >= ulist->nodes_alloced) {
u64 new_alloced = ulist->nodes_alloced + 128;
struct ulist_node *new_nodes;
void *old = NULL;
+ int i;
+
+ for (i = 0; i < ulist->nnodes; i++)
+ rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
/*
* if nodes_alloced == ULIST_SIZE no memory has been allocated
@@ -184,9 +228,22 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
ulist->nodes = new_nodes;
ulist->nodes_alloced = new_alloced;
+
+ /*
+ * krealloc actually uses memcpy, which does not copy rb_node
+ * pointers, so we have to do it ourselves. Otherwise we may
+ * be bitten by crashes.
+ */
+ for (i = 0; i < ulist->nnodes; i++) {
+ ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
+ if (ret < 0)
+ return ret;
+ }
}
ulist->nodes[ulist->nnodes].val = val;
ulist->nodes[ulist->nnodes].aux = aux;
+ ret = ulist_rbtree_insert(ulist, &ulist->nodes[ulist->nnodes]);
+ BUG_ON(ret);
++ulist->nnodes;
return 1;
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 21a1963439c3..fb36731074b5 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -8,6 +8,9 @@
#ifndef __ULIST__
#define __ULIST__
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
/*
* ulist is a generic data structure to hold a collection of unique u64
* values. The only operations it supports is adding to the list and
@@ -34,6 +37,7 @@ struct ulist_iterator {
struct ulist_node {
u64 val; /* value to store */
u64 aux; /* auxiliary value saved along with the val */
+ struct rb_node rb_node; /* used to speed up search */
};
struct ulist {
@@ -54,6 +58,8 @@ struct ulist {
*/
struct ulist_node *nodes;
+ struct rb_root root;
+
/*
* inline storage space for the first ULIST_SIZE entries
*/
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
deleted file mode 100644
index 9bf3946d5ef2..000000000000
--- a/fs/btrfs/version.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __BTRFS_VERSION_H
-#define __BTRFS_VERSION_H
-#define BTRFS_BUILD_VERSION "Btrfs"
-#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2854c824ab64..78b871753cb6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -46,6 +46,7 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static DEFINE_MUTEX(uuid_mutex);
@@ -717,9 +718,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
if (!device->name)
continue;
- ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev, &bh);
- if (ret)
+ /* Just open everything we can; ignore failures here */
+ if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+ &bdev, &bh))
continue;
disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -981,6 +982,35 @@ out:
return ret;
}
+static int contains_pending_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 *start, u64 len)
+{
+ struct extent_map *em;
+ int ret = 0;
+
+ list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
+ struct map_lookup *map;
+ int i;
+
+ map = (struct map_lookup *)em->bdev;
+ for (i = 0; i < map->num_stripes; i++) {
+ if (map->stripes[i].dev != device)
+ continue;
+ if (map->stripes[i].physical >= *start + len ||
+ map->stripes[i].physical + em->orig_block_len <=
+ *start)
+ continue;
+ *start = map->stripes[i].physical +
+ em->orig_block_len;
+ ret = 1;
+ }
+ }
+
+ return ret;
+}
+
+
/*
* find_free_dev_extent - find free space in the specified device
* @device: the device which we search the free space in
@@ -1001,7 +1031,8 @@ out:
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*/
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
struct btrfs_key key;
@@ -1025,21 +1056,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
*/
search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+again:
max_hole_start = search_start;
max_hole_size = 0;
hole_size = 0;
if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
ret = -ENOSPC;
- goto error;
+ goto out;
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto error;
- }
path->reada = 2;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
key.objectid = device->devid;
key.offset = search_start;
@@ -1080,6 +1112,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
if (key.offset > search_start) {
hole_size = key.offset - search_start;
+ /*
+ * Have to check before we set max_hole_start, otherwise
+ * we could end up sending back this offset anyway.
+ */
+ if (contains_pending_extent(trans, device,
+ &search_start,
+ hole_size))
+ hole_size = 0;
+
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
@@ -1123,6 +1164,11 @@ next:
max_hole_size = hole_size;
}
+ if (contains_pending_extent(trans, device, &search_start, hole_size)) {
+ btrfs_release_path(path);
+ goto again;
+ }
+
/* See above. */
if (hole_size < num_bytes)
ret = -ENOSPC;
@@ -1131,7 +1177,6 @@ next:
out:
btrfs_free_path(path);
-error:
*start = max_hole_start;
if (len)
*len = max_hole_size;
@@ -1199,10 +1244,10 @@ out:
return ret;
}
-int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device,
- u64 chunk_tree, u64 chunk_objectid,
- u64 chunk_offset, u64 start, u64 num_bytes)
+static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset, u64 start, u64 num_bytes)
{
int ret;
struct btrfs_path *path;
@@ -1243,47 +1288,22 @@ out:
return ret;
}
-static noinline int find_next_chunk(struct btrfs_root *root,
- u64 objectid, u64 *offset)
+static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
- struct btrfs_path *path;
- int ret;
- struct btrfs_key key;
- struct btrfs_chunk *chunk;
- struct btrfs_key found_key;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = objectid;
- key.offset = (u64)-1;
- key.type = BTRFS_CHUNK_ITEM_KEY;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto error;
-
- BUG_ON(ret == 0); /* Corruption */
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct rb_node *n;
+ u64 ret = 0;
- ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
- if (ret) {
- *offset = 0;
- } else {
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
- if (found_key.objectid != objectid)
- *offset = 0;
- else {
- chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_chunk);
- *offset = found_key.offset +
- btrfs_chunk_length(path->nodes[0], chunk);
- }
+ em_tree = &fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ n = rb_last(&em_tree->map);
+ if (n) {
+ em = rb_entry(n, struct extent_map, rb_node);
+ ret = em->start + em->len;
}
- ret = 0;
-error:
- btrfs_free_path(path);
+ read_unlock(&em_tree->lock);
+
return ret;
}
@@ -1329,9 +1349,9 @@ error:
* the device information is stored in the chunk root
* the btrfs_device struct should be fully filled in
*/
-int btrfs_add_device(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_device *device)
+static int btrfs_add_device(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_device *device)
{
int ret;
struct btrfs_path *path;
@@ -1461,31 +1481,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
- printk(KERN_ERR "btrfs: unable to go below four devices "
- "on raid10\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
- printk(KERN_ERR "btrfs: unable to go below two "
- "devices on raid1\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
root->fs_info->fs_devices->rw_devices <= 2) {
- printk(KERN_ERR "btrfs: unable to go below two "
- "devices on raid5\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
root->fs_info->fs_devices->rw_devices <= 3) {
- printk(KERN_ERR "btrfs: unable to go below three "
- "devices on raid6\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
goto out;
}
@@ -1511,8 +1523,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
bh = NULL;
disk_super = NULL;
if (!device) {
- printk(KERN_ERR "btrfs: no missing devices found to "
- "remove\n");
+ ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
goto out;
}
} else {
@@ -1534,15 +1545,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
}
if (device->is_tgtdev_for_dev_replace) {
- pr_err("btrfs: unable to remove the dev_replace target dev\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_TGT_REPLACE;
goto error_brelse;
}
if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
- printk(KERN_ERR "btrfs: unable to remove the only writeable "
- "device\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
goto error_brelse;
}
@@ -1710,8 +1718,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
}
-int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
- struct btrfs_device **device)
+static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
+ struct btrfs_device **device)
{
int ret = 0;
struct btrfs_super_block *disk_super;
@@ -3119,14 +3127,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
if (num_devices == 1)
allowed |= BTRFS_BLOCK_GROUP_DUP;
- else if (num_devices < 4)
+ else if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
- else
- allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6);
-
+ if (num_devices > 2)
+ allowed |= BTRFS_BLOCK_GROUP_RAID5;
+ if (num_devices > 3)
+ allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID6);
if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(!alloc_profile_is_valid(bctl->data.target, 1) ||
(bctl->data.target & ~allowed))) {
@@ -3295,10 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
}
tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
- if (IS_ERR(tsk))
- return PTR_ERR(tsk);
-
- return 0;
+ return PTR_RET(tsk);
}
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
@@ -3607,7 +3611,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
return 0;
}
-struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
.dev_stripes = 1,
@@ -3674,25 +3678,15 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
- u64 features;
-
if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
return;
- features = btrfs_super_incompat_flags(info->super_copy);
- if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
- return;
-
- features |= BTRFS_FEATURE_INCOMPAT_RAID56;
- btrfs_set_super_incompat_flags(info->super_copy, features);
- printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
+ btrfs_set_fs_incompat(info, RAID56);
}
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct map_lookup **map_ret,
- u64 *num_bytes_out, u64 *stripe_size_out,
- u64 start, u64 type)
+ struct btrfs_root *extent_root, u64 start,
+ u64 type)
{
struct btrfs_fs_info *info = extent_root->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -3799,7 +3793,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (total_avail == 0)
continue;
- ret = find_free_dev_extent(device,
+ ret = find_free_dev_extent(trans, device,
max_stripe_size * dev_stripes,
&dev_offset, &max_avail);
if (ret && ret != -ENOSPC)
@@ -3911,12 +3905,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
map->type = type;
map->sub_stripes = sub_stripes;
- *map_ret = map;
num_bytes = stripe_size * data_stripes;
- *stripe_size_out = stripe_size;
- *num_bytes_out = num_bytes;
-
trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
em = alloc_extent_map();
@@ -3929,38 +3919,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em->len = num_bytes;
em->block_start = 0;
em->block_len = em->len;
+ em->orig_block_len = stripe_size;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
+ ret = add_extent_mapping(em_tree, em, 0);
+ if (!ret) {
+ list_add_tail(&em->list, &trans->transaction->pending_chunks);
+ atomic_inc(&em->refs);
+ }
write_unlock(&em_tree->lock);
if (ret) {
free_extent_map(em);
goto error;
}
- for (i = 0; i < map->num_stripes; ++i) {
- struct btrfs_device *device;
- u64 dev_offset;
-
- device = map->stripes[i].dev;
- dev_offset = map->stripes[i].physical;
-
- ret = btrfs_alloc_dev_extent(trans, device,
- info->chunk_root->root_key.objectid,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- start, dev_offset, stripe_size);
- if (ret)
- goto error_dev_extent;
- }
-
ret = btrfs_make_block_group(trans, extent_root, 0, type,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
start, num_bytes);
- if (ret) {
- i = map->num_stripes - 1;
- goto error_dev_extent;
- }
+ if (ret)
+ goto error_del_extent;
free_extent_map(em);
check_raid56_incompat_flag(extent_root->fs_info, type);
@@ -3968,18 +3946,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
kfree(devices_info);
return 0;
-error_dev_extent:
- for (; i >= 0; i--) {
- struct btrfs_device *device;
- int err;
-
- device = map->stripes[i].dev;
- err = btrfs_free_dev_extent(trans, device, start);
- if (err) {
- btrfs_abort_transaction(trans, extent_root, err);
- break;
- }
- }
+error_del_extent:
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
@@ -3994,33 +3961,68 @@ error:
return ret;
}
-static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
- struct map_lookup *map, u64 chunk_offset,
- u64 chunk_size, u64 stripe_size)
+ u64 chunk_offset, u64 chunk_size)
{
- u64 dev_offset;
struct btrfs_key key;
struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- size_t item_size = btrfs_chunk_item_size(map->num_stripes);
- int index = 0;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ size_t item_size;
+ u64 dev_offset;
+ u64 stripe_size;
+ int i = 0;
int ret;
+ em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_crit(extent_root->fs_info, "unable to find logical "
+ "%Lu len %Lu", chunk_offset, chunk_size);
+ return -EINVAL;
+ }
+
+ if (em->start != chunk_offset || em->len != chunk_size) {
+ btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
+ " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
+ chunk_size, em->start, em->len);
+ free_extent_map(em);
+ return -EINVAL;
+ }
+
+ map = (struct map_lookup *)em->bdev;
+ item_size = btrfs_chunk_item_size(map->num_stripes);
+ stripe_size = em->orig_block_len;
+
chunk = kzalloc(item_size, GFP_NOFS);
- if (!chunk)
- return -ENOMEM;
+ if (!chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
- index = 0;
- while (index < map->num_stripes) {
- device = map->stripes[index].dev;
device->bytes_used += stripe_size;
ret = btrfs_update_device(trans, device);
if (ret)
- goto out_free;
- index++;
+ goto out;
+ ret = btrfs_alloc_dev_extent(trans, device,
+ chunk_root->root_key.objectid,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ chunk_offset, dev_offset,
+ stripe_size);
+ if (ret)
+ goto out;
}
spin_lock(&extent_root->fs_info->free_chunk_lock);
@@ -4028,17 +4030,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
map->num_stripes);
spin_unlock(&extent_root->fs_info->free_chunk_lock);
- index = 0;
stripe = &chunk->stripe;
- while (index < map->num_stripes) {
- device = map->stripes[index].dev;
- dev_offset = map->stripes[index].physical;
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
btrfs_set_stack_stripe_devid(stripe, device->devid);
btrfs_set_stack_stripe_offset(stripe, dev_offset);
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
- index++;
}
btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -4056,7 +4056,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
key.offset = chunk_offset;
ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
-
if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
/*
* TODO: Cleanup of inserted chunk root in case of
@@ -4066,8 +4065,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
item_size);
}
-out_free:
+out:
kfree(chunk);
+ free_extent_map(em);
return ret;
}
@@ -4082,27 +4082,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 type)
{
u64 chunk_offset;
- u64 chunk_size;
- u64 stripe_size;
- struct map_lookup *map;
- struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
- int ret;
-
- ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- &chunk_offset);
- if (ret)
- return ret;
- ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
- &stripe_size, chunk_offset, type);
- if (ret)
- return ret;
-
- ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
- chunk_size, stripe_size);
- if (ret)
- return ret;
- return 0;
+ chunk_offset = find_next_chunk(extent_root->fs_info);
+ return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
}
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -4111,66 +4093,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
{
u64 chunk_offset;
u64 sys_chunk_offset;
- u64 chunk_size;
- u64 sys_chunk_size;
- u64 stripe_size;
- u64 sys_stripe_size;
u64 alloc_profile;
- struct map_lookup *map;
- struct map_lookup *sys_map;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
int ret;
- ret = find_next_chunk(fs_info->chunk_root,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
- if (ret)
- return ret;
-
+ chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
- ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
- &stripe_size, chunk_offset, alloc_profile);
+ ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
+ alloc_profile);
if (ret)
return ret;
- sys_chunk_offset = chunk_offset + chunk_size;
-
+ sys_chunk_offset = find_next_chunk(root->fs_info);
alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
- ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
- &sys_chunk_size, &sys_stripe_size,
- sys_chunk_offset, alloc_profile);
+ ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
+ alloc_profile);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
}
ret = btrfs_add_device(trans, fs_info->chunk_root, device);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto out;
- }
-
- /*
- * Modifying chunk tree needs allocating new blocks from both
- * system block group and metadata block group. So we only can
- * do operations require modifying the chunk tree after both
- * block groups were created.
- */
- ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
- chunk_size, stripe_size);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto out;
- }
-
- ret = __finish_chunk_alloc(trans, extent_root, sys_map,
- sys_chunk_offset, sys_chunk_size,
- sys_stripe_size);
if (ret)
btrfs_abort_transaction(trans, root, ret);
-
out:
-
return ret;
}
@@ -4240,9 +4187,25 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, len);
read_unlock(&em_tree->lock);
- BUG_ON(!em);
- BUG_ON(em->start > logical || em->start + em->len < logical);
+ /*
+ * We could return errors for these cases, but that could get ugly and
+ * the callers would probably just stop and exit anyway, so return 1 so
+ * that they don't try to use other copies.
+ */
+ if (!em) {
+ btrfs_emerg(fs_info, "No mapping for %Lu-%Lu\n", logical,
+ logical+len);
+ return 1;
+ }
+
+ if (em->start > logical || em->start + em->len < logical) {
+ btrfs_emerg(fs_info, "Invalid mapping for %Lu-%Lu, got "
+ "%Lu-%Lu\n", logical, logical+len, em->start,
+ em->start + em->len);
+ return 1;
+ }
+
map = (struct map_lookup *)em->bdev;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
ret = map->num_stripes;
@@ -4411,19 +4374,22 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
read_unlock(&em_tree->lock);
if (!em) {
- printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",
- (unsigned long long)logical,
- (unsigned long long)*length);
- BUG();
+ btrfs_crit(fs_info, "unable to find logical %llu len %llu",
+ (unsigned long long)logical,
+ (unsigned long long)*length);
+ return -EINVAL;
+ }
+
+ if (em->start > logical || em->start + em->len < logical) {
+ btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
+ "found %Lu-%Lu\n", logical, em->start,
+ em->start + em->len);
+ return -EINVAL;
}
- BUG_ON(em->start > logical || em->start + em->len < logical);
map = (struct map_lookup *)em->bdev;
offset = logical - em->start;
- if (mirror_num > map->num_stripes)
- mirror_num = 0;
-
stripe_len = map->stripe_len;
stripe_nr = offset;
/*
@@ -5004,42 +4970,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
-static void *merge_stripe_index_into_bio_private(void *bi_private,
- unsigned int stripe_index)
-{
- /*
- * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
- * at most 1.
- * The alternative solution (instead of stealing bits from the
- * pointer) would be to allocate an intermediate structure
- * that contains the old private pointer plus the stripe_index.
- */
- BUG_ON((((uintptr_t)bi_private) & 3) != 0);
- BUG_ON(stripe_index > 3);
- return (void *)(((uintptr_t)bi_private) | stripe_index);
-}
-
-static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
-{
- return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
-}
-
-static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
-{
- return (unsigned int)((uintptr_t)bi_private) & 3;
-}
-
static void btrfs_end_bio(struct bio *bio, int err)
{
- struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
+ struct btrfs_bio *bbio = bio->bi_private;
int is_orig_bio = 0;
if (err) {
atomic_inc(&bbio->error);
if (err == -EIO || err == -EREMOTEIO) {
unsigned int stripe_index =
- extract_stripe_index_from_bio_private(
- bio->bi_private);
+ btrfs_io_bio(bio)->stripe_index;
struct btrfs_device *dev;
BUG_ON(stripe_index >= bbio->num_stripes);
@@ -5069,8 +5009,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
}
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
- bio->bi_bdev = (struct block_device *)
- (unsigned long)bbio->mirror_num;
+ btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the btrfs bio
*/
@@ -5106,9 +5045,9 @@ struct async_sched {
* This will add one bio to the pending list for a device and make sure
* the work struct is scheduled.
*/
-noinline void btrfs_schedule_bio(struct btrfs_root *root,
- struct btrfs_device *device,
- int rw, struct bio *bio)
+static noinline void btrfs_schedule_bio(struct btrfs_root *root,
+ struct btrfs_device *device,
+ int rw, struct bio *bio)
{
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
@@ -5177,7 +5116,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
}
prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
- if ((bio->bi_size >> 9) > max_sectors)
+ if (bio_sectors(bio) > max_sectors)
return 0;
if (!q->merge_bvec_fn)
@@ -5196,8 +5135,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
bio->bi_private = bbio;
- bio->bi_private = merge_stripe_index_into_bio_private(
- bio->bi_private, (unsigned int)dev_nr);
+ btrfs_io_bio(bio)->stripe_index = dev_nr;
bio->bi_end_io = btrfs_end_bio;
bio->bi_sector = physical >> 9;
#ifdef DEBUG
@@ -5258,8 +5196,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
if (atomic_dec_and_test(&bbio->stripes_pending)) {
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
- bio->bi_bdev = (struct block_device *)
- (unsigned long)bbio->mirror_num;
+ btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_sector = logical >> 9;
kfree(bbio);
bio_endio(bio, -EIO);
@@ -5308,10 +5245,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
}
if (map_length < length) {
- printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
- "len %llu\n", (unsigned long long)logical,
- (unsigned long long)length,
- (unsigned long long)map_length);
+ btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
+ (unsigned long long)logical,
+ (unsigned long long)length,
+ (unsigned long long)map_length);
BUG();
}
@@ -5337,7 +5274,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
}
if (dev_nr < total_devs - 1) {
- bio = bio_clone(first_bio, GFP_NOFS);
+ bio = btrfs_bio_clone(first_bio, GFP_NOFS);
BUG_ON(!bio); /* -ENOMEM */
} else {
bio = first_bio;
@@ -5382,7 +5319,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
return NULL;
list_add(&device->dev_list,
&fs_devices->devices);
- device->dev_root = root->fs_info->dev_root;
device->devid = devid;
device->work.func = pending_bios_fn;
device->fs_devices = fs_devices;
@@ -5476,7 +5412,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
}
write_lock(&map_tree->map_tree.lock);
- ret = add_extent_mapping(&map_tree->map_tree, em);
+ ret = add_extent_mapping(&map_tree->map_tree, em, 0);
write_unlock(&map_tree->map_tree.lock);
BUG_ON(ret); /* Tree corruption */
free_extent_map(em);
@@ -5583,8 +5519,8 @@ static int read_one_dev(struct btrfs_root *root,
return -EIO;
if (!device) {
- printk(KERN_WARNING "warning devid %llu missing\n",
- (unsigned long long)devid);
+ btrfs_warn(root->fs_info, "devid %llu missing",
+ (unsigned long long)devid);
device = add_missing_dev(root, devid, dev_uuid);
if (!device)
return -ENOMEM;
@@ -5608,7 +5544,6 @@ static int read_one_dev(struct btrfs_root *root,
}
fill_device_from_item(leaf, dev_item, device);
- device->dev_root = root->fs_info->dev_root;
device->in_fs_metadata = 1;
if (device->writeable && !device->is_tgtdev_for_dev_replace) {
device->fs_devices->total_rw_bytes += device->total_bytes;
@@ -5766,6 +5701,17 @@ error:
return ret;
}
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->dev_root = fs_info->dev_root;
+ mutex_unlock(&fs_devices->device_list_mutex);
+}
+
static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
int i;
@@ -5926,7 +5872,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
btrfs_dev_stat_print_on_error(dev);
}
-void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
+static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
if (!dev->dev_stats_valid)
return;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 062d8604d35b..86705583480d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -152,6 +152,26 @@ struct btrfs_fs_devices {
int rotating;
};
+/*
+ * we need the mirror number and stripe index to be passed around
+ * the call chain while we are processing end_io (especially errors).
+ * Really, what we need is a btrfs_bio structure that has this info
+ * and is properly sized with its stripe array, but we're not there
+ * quite yet. We have our own btrfs bioset, and all of the bios
+ * we allocate are actually btrfs_io_bios. We'll cram as much of
+ * struct btrfs_bio as we can into this over time.
+ */
+struct btrfs_io_bio {
+ unsigned long mirror_num;
+ unsigned long stripe_index;
+ struct bio bio;
+};
+
+static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+{
+ return container_of(bio, struct btrfs_io_bio, bio);
+}
+
struct btrfs_bio_stripe {
struct btrfs_device *dev;
u64 physical;
@@ -254,10 +274,6 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
-int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device,
- u64 chunk_tree, u64 chunk_objectid,
- u64 chunk_offset, u64 start, u64 num_bytes);
int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num);
@@ -282,11 +298,6 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
char *device_path,
struct btrfs_device **device);
-int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
- struct btrfs_device **device);
-int btrfs_add_device(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_root *root, char *device_path);
void btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
@@ -305,12 +316,13 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
-void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
int btrfs_get_dev_stats(struct btrfs_root *root,
struct btrfs_ioctl_get_dev_stats *stats);
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
@@ -321,14 +333,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
int btrfs_scratch_superblock(struct btrfs_device *device);
-void btrfs_schedule_bio(struct btrfs_root *root,
- struct btrfs_device *device,
- int rw, struct bio *bio);
int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
u64 logical, u64 len, int mirror_num);
unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
struct btrfs_mapping_tree *map_tree,
u64 logical);
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ u64 chunk_offset, u64 chunk_size);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
{
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 446a6848c554..05740b9789e4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -406,8 +406,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
XATTR_REPLACE);
}
-int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
+static int btrfs_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array, void *fs_info)
{
const struct xattr *xattr;
struct btrfs_trans_handle *trans = fs_info;
diff --git a/fs/buffer.c b/fs/buffer.c
index b4dcb34c9635..4d7433534f5c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -83,6 +83,40 @@ void unlock_buffer(struct buffer_head *bh)
EXPORT_SYMBOL(unlock_buffer);
/*
+ * Reports whether the page has dirty or writeback buffers. If all the
+ * buffers are unlocked and clean then the PageDirty information is stale.
+ * If any of the buffers are locked, it is assumed they are locked for IO.
+ */
+void buffer_check_dirty_writeback(struct page *page,
+ bool *dirty, bool *writeback)
+{
+ struct buffer_head *head, *bh;
+ *dirty = false;
+ *writeback = false;
+
+ BUG_ON(!PageLocked(page));
+
+ if (!page_has_buffers(page))
+ return;
+
+ if (PageWriteback(page))
+ *writeback = true;
+
+ head = page_buffers(page);
+ bh = head;
+ do {
+ if (buffer_locked(bh))
+ *writeback = true;
+
+ if (buffer_dirty(bh))
+ *dirty = true;
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
+EXPORT_SYMBOL(buffer_check_dirty_writeback);
+
+/*
* Block until a buffer comes unlocked. This doesn't stop it
* from becoming locked again - you have to lock it yourself
* if you want to preserve its state.
@@ -865,8 +899,6 @@ try_again:
/* Link the buffer to its page */
set_bh_page(bh, page, offset);
-
- init_buffer(bh, NULL, NULL);
}
return head;
/*
@@ -1456,7 +1488,8 @@ static void discard_buffer(struct buffer_head * bh)
* block_invalidatepage - invalidate part or all of a buffer-backed page
*
* @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
*
* block_invalidatepage() is called when all or part of the page has become
* invalidated by a truncate operation.
@@ -1467,15 +1500,22 @@ static void discard_buffer(struct buffer_head * bh)
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
-void block_invalidatepage(struct page *page, unsigned long offset)
+void block_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
+ unsigned int stop = length + offset;
BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
goto out;
+ /*
+ * Check for overflow
+ */
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
head = page_buffers(page);
bh = head;
do {
@@ -1483,6 +1523,12 @@ void block_invalidatepage(struct page *page, unsigned long offset)
next = bh->b_this_page;
/*
+ * Are we still fully in range ?
+ */
+ if (next_off > stop)
+ goto out;
+
+ /*
* is this block fully invalidated?
*/
if (offset <= curr_off)
@@ -1503,6 +1549,7 @@ out:
}
EXPORT_SYMBOL(block_invalidatepage);
+
/*
* We attach and possibly dirty the buffers atomically wrt
* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
@@ -2843,7 +2890,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
* they may have been added in ext3_writepage(). Make them
* freeable here, so the page does not leak.
*/
- do_invalidatepage(page, 0);
+ do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
unlock_page(page);
return 0; /* don't care */
}
@@ -2949,7 +2996,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
}
}
-int submit_bh(int rw, struct buffer_head * bh)
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
{
struct bio *bio;
int ret = 0;
@@ -2979,15 +3026,20 @@ int submit_bh(int rw, struct buffer_head * bh)
bio->bi_io_vec[0].bv_offset = bh_offset(bh);
bio->bi_vcnt = 1;
- bio->bi_idx = 0;
bio->bi_size = bh->b_size;
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
+ bio->bi_flags |= bio_flags;
/* Take care of bh's that straddle the end of the device */
guard_bh_eod(rw, bio, bh);
+ if (buffer_meta(bh))
+ rw |= REQ_META;
+ if (buffer_prio(bh))
+ rw |= REQ_PRIO;
+
bio_get(bio);
submit_bio(rw, bio);
@@ -2997,6 +3049,12 @@ int submit_bh(int rw, struct buffer_head * bh)
bio_put(bio);
return ret;
}
+EXPORT_SYMBOL_GPL(_submit_bh);
+
+int submit_bh(int rw, struct buffer_head *bh)
+{
+ return _submit_bh(rw, bh, 0);
+}
EXPORT_SYMBOL(submit_bh);
/**
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 746ce532e130..d4c1206af9fc 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -13,8 +13,6 @@
#include <linux/mount.h>
#include "internal.h"
-#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
-
struct cachefiles_lookup_data {
struct cachefiles_xattr *auxdata; /* auxiliary data */
char *key; /* key path */
@@ -212,20 +210,29 @@ static void cachefiles_update_object(struct fscache_object *_object)
object = container_of(_object, struct cachefiles_object, fscache);
cache = container_of(object->fscache.cache, struct cachefiles_cache,
cache);
+
+ if (!fscache_use_cookie(_object)) {
+ _leave(" [relinq]");
+ return;
+ }
+
cookie = object->fscache.cookie;
if (!cookie->def->get_aux) {
+ fscache_unuse_cookie(_object);
_leave(" [no aux]");
return;
}
auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
if (!auxdata) {
+ fscache_unuse_cookie(_object);
_leave(" [nomem]");
return;
}
auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+ fscache_unuse_cookie(_object);
ASSERTCMP(auxlen, <, 511);
auxdata->len = auxlen + 1;
@@ -263,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
#endif
/* delete retired objects */
- if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
+ if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) &&
_object != cache->cache.fsdef
) {
_debug("- retire object OBJ%x", object->fscache.debug_id);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 8c01c5fcdf75..25badd1aec5c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -38,7 +38,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
printk(KERN_ERR "%sobject: OBJ%x\n",
prefix, object->fscache.debug_id);
printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
- prefix, fscache_object_states[object->fscache.state],
+ prefix, object->fscache.state->name,
object->fscache.flags, work_busy(&object->fscache.work),
object->fscache.events, object->fscache.event_mask);
printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -127,10 +127,10 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
found_dentry:
kdebug("preemptive burial: OBJ%x [%s] %p",
object->fscache.debug_id,
- fscache_object_states[object->fscache.state],
+ object->fscache.state->name,
dentry);
- if (object->fscache.state < FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_live(&object->fscache)) {
printk(KERN_ERR "\n");
printk(KERN_ERR "CacheFiles: Error:"
" Can't preemptively bury live object\n");
@@ -192,7 +192,7 @@ try_again:
/* an old object from a previous incarnation is hogging the slot - we
* need to wait for it to be destroyed */
wait_for_old_object:
- if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_live(&object->fscache)) {
printk(KERN_ERR "\n");
printk(KERN_ERR "CacheFiles: Error:"
" Unexpected object collision\n");
@@ -836,7 +836,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
// dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
/* look up the victim */
- mutex_lock_nested(&dir->d_inode->i_mutex, 1);
+ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
start = jiffies;
victim = lookup_one_len(filename, dir, strlen(filename));
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 480992259707..ebaff368120d 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -12,6 +12,7 @@
#include <linux/mount.h>
#include <linux/slab.h>
#include <linux/file.h>
+#include <linux/swap.h>
#include "internal.h"
/*
@@ -227,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
*/
static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
struct fscache_retrieval *op,
- struct page *netpage,
- struct pagevec *pagevec)
+ struct page *netpage)
{
struct cachefiles_one_read *monitor;
struct address_space *bmapping;
@@ -237,8 +237,6 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
_enter("");
- pagevec_reinit(pagevec);
-
_debug("read back %p{%lu,%d}",
netpage, netpage->index, page_count(netpage));
@@ -283,9 +281,7 @@ installed_new_backing_page:
backpage = newpage;
newpage = NULL;
- page_cache_get(backpage);
- pagevec_add(pagevec, backpage);
- __pagevec_lru_add_file(pagevec);
+ lru_cache_add_file(backpage);
read_backing_page:
ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -452,8 +448,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
if (block) {
/* submit the apparently valid page to the backing fs to be
* read from disk */
- ret = cachefiles_read_backing_file_one(object, op, page,
- &pagevec);
+ ret = cachefiles_read_backing_file_one(object, op, page);
} else if (cachefiles_has_space(cache, 0, 1) == 0) {
/* there's space in the cache we can use */
fscache_mark_page_cached(op, page);
@@ -482,14 +477,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
{
struct cachefiles_one_read *monitor = NULL;
struct address_space *bmapping = object->backer->d_inode->i_mapping;
- struct pagevec lru_pvec;
struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
int ret = 0;
_enter("");
- pagevec_init(&lru_pvec, 0);
-
list_for_each_entry_safe(netpage, _n, list, lru) {
list_del(&netpage->lru);
@@ -534,9 +526,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
backpage = newpage;
newpage = NULL;
- page_cache_get(backpage);
- if (!pagevec_add(&lru_pvec, backpage))
- __pagevec_lru_add_file(&lru_pvec);
+ lru_cache_add_file(backpage);
reread_backing_page:
ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -559,9 +549,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
goto nomem;
}
- page_cache_get(netpage);
- if (!pagevec_add(&lru_pvec, netpage))
- __pagevec_lru_add_file(&lru_pvec);
+ lru_cache_add_file(netpage);
/* install a monitor */
page_cache_get(netpage);
@@ -643,9 +631,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
fscache_mark_page_cached(op, netpage);
- page_cache_get(netpage);
- if (!pagevec_add(&lru_pvec, netpage))
- __pagevec_lru_add_file(&lru_pvec);
+ lru_cache_add_file(netpage);
/* the netpage is unlocked and marked up to date here */
fscache_end_io(op, netpage, 0);
@@ -661,8 +647,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
out:
/* tidy up */
- pagevec_lru_add_file(&lru_pvec);
-
if (newpage)
page_cache_release(newpage);
if (netpage)
@@ -962,12 +946,14 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
}
data = kmap(page);
+ file_start_write(file);
old_fs = get_fs();
set_fs(KERNEL_DS);
ret = file->f_op->write(
file, (const void __user *) data, len, &pos);
set_fs(old_fs);
kunmap(page);
+ file_end_write(file);
if (ret != len)
ret = -EIO;
}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 73b46288b54b..2476e5162609 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -109,13 +109,12 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object,
struct dentry *dentry = object->dentry;
int ret;
- ASSERT(object->fscache.cookie);
ASSERT(dentry);
_enter("%p,#%d", object, auxdata->len);
/* attempt to install the cache metadata directly */
- _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+ _debug("SET #%u", auxdata->len);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
@@ -138,13 +137,12 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
struct dentry *dentry = object->dentry;
int ret;
- ASSERT(object->fscache.cookie);
ASSERT(dentry);
_enter("%p,#%d", object, auxdata->len);
/* attempt to install the cache metadata directly */
- _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+ _debug("SET #%u", auxdata->len);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a60ea977af6f..5318a3b704f6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page)
* dirty page counters appropriately. Only called if there is private
* data on the page.
*/
-static void ceph_invalidatepage(struct page *page, unsigned long offset)
+static void ceph_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode;
struct ceph_inode_info *ci;
@@ -163,20 +164,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
if (!PageDirty(page))
pr_err("%p invalidatepage %p page not dirty\n", inode, page);
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
ci = ceph_inode(inode);
- if (offset == 0) {
- dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
- inode, page, page->index, offset);
+ if (offset == 0 && length == PAGE_CACHE_SIZE) {
+ dout("%p invalidatepage %p idx %lu full dirty page\n",
+ inode, page, page->index);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc);
page->private = 0;
ClearPagePrivate(page);
} else {
- dout("%p invalidatepage %p idx %lu partial dirty page\n",
- inode, page, page->index);
+ dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
+ inode, page, page->index, offset, length);
}
}
@@ -236,15 +237,21 @@ static int ceph_readpage(struct file *filp, struct page *page)
static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
{
struct inode *inode = req->r_inode;
+ struct ceph_osd_data *osd_data;
int rc = req->r_result;
int bytes = le32_to_cpu(msg->hdr.data_len);
+ int num_pages;
int i;
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
/* unlock all pages, zeroing any data we didn't read */
- for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) {
- struct page *page = req->r_pages[i];
+ osd_data = osd_req_op_extent_osd_data(req, 0);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+ num_pages = calc_pages_for((u64)osd_data->alignment,
+ (u64)osd_data->length);
+ for (i = 0; i < num_pages; i++) {
+ struct page *page = osd_data->pages[i];
if (bytes < (int)PAGE_CACHE_SIZE) {
/* zero (remainder of) page */
@@ -257,8 +264,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
SetPageUptodate(page);
unlock_page(page);
page_cache_release(page);
+ bytes -= PAGE_CACHE_SIZE;
}
- kfree(req->r_pages);
+ kfree(osd_data->pages);
}
static void ceph_unlock_page_vector(struct page **pages, int num_pages)
@@ -279,6 +287,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
&ceph_inode_to_client(inode)->client->osdc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct page *page = list_entry(page_list->prev, struct page, lru);
+ struct ceph_vino vino;
struct ceph_osd_request *req;
u64 off;
u64 len;
@@ -303,18 +312,17 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
len = nr_pages << PAGE_CACHE_SHIFT;
dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
off, len);
-
- req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
- off, &len,
- CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
- NULL, 0,
+ vino = ceph_vino(inode);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
+ 1, CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ, NULL,
ci->i_truncate_seq, ci->i_truncate_size,
- NULL, false, 0);
+ false);
if (IS_ERR(req))
return PTR_ERR(req);
/* build page vector */
- nr_pages = len >> PAGE_CACHE_SHIFT;
+ nr_pages = calc_pages_for(0, len);
pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
ret = -ENOMEM;
if (!pages)
@@ -336,11 +344,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
}
pages[i] = page;
}
- req->r_pages = pages;
- req->r_num_pages = nr_pages;
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
req->r_callback = finish_read;
req->r_inode = inode;
+ ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
+
dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
ret = ceph_osdc_start_request(osdc, req, false);
if (ret < 0)
@@ -373,7 +382,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
>> PAGE_SHIFT;
- dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages,
+ dout("readpages %p file %p nr_pages %d max %d\n", inode,
+ file, nr_pages,
max);
while (!list_empty(page_list)) {
rc = start_read(inode, page_list, max);
@@ -429,13 +439,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct ceph_inode_info *ci;
struct ceph_fs_client *fsc;
struct ceph_osd_client *osdc;
- loff_t page_off = page_offset(page);
- int len = PAGE_CACHE_SIZE;
- loff_t i_size;
- int err = 0;
struct ceph_snap_context *snapc, *oldest;
- u64 snap_size = 0;
+ loff_t page_off = page_offset(page);
long writeback_stat;
+ u64 truncate_size, snap_size = 0;
+ u32 truncate_seq;
+ int err = 0, len = PAGE_CACHE_SIZE;
dout("writepage %p idx %lu\n", page, page->index);
@@ -465,13 +474,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
}
ceph_put_snap_context(oldest);
+ spin_lock(&ci->i_ceph_lock);
+ truncate_seq = ci->i_truncate_seq;
+ truncate_size = ci->i_truncate_size;
+ if (!snap_size)
+ snap_size = i_size_read(inode);
+ spin_unlock(&ci->i_ceph_lock);
+
/* is this a partial page at end of file? */
- if (snap_size)
- i_size = snap_size;
- else
- i_size = i_size_read(inode);
- if (i_size < page_off + len)
- len = i_size - page_off;
+ if (page_off >= snap_size) {
+ dout("%p page eof %llu\n", page, snap_size);
+ goto out;
+ }
+ if (snap_size < page_off + len)
+ len = snap_size - page_off;
dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
inode, page, page->index, page_off, len, snapc);
@@ -485,7 +501,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
&ci->i_layout, snapc,
page_off, len,
- ci->i_truncate_seq, ci->i_truncate_size,
+ truncate_seq, truncate_size,
&inode->i_mtime, &page, 1);
if (err < 0) {
dout("writepage setting page/mapping error %d %p\n", err, page);
@@ -548,17 +564,23 @@ static void writepages_finish(struct ceph_osd_request *req,
{
struct inode *inode = req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_data *osd_data;
unsigned wrote;
struct page *page;
+ int num_pages;
int i;
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
int rc = req->r_result;
- u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length);
+ u64 bytes = req->r_ops[0].extent.length;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
long writeback_stat;
unsigned issued = ceph_caps_issued(ci);
+ osd_data = osd_req_op_extent_osd_data(req, 0);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+ num_pages = calc_pages_for((u64)osd_data->alignment,
+ (u64)osd_data->length);
if (rc >= 0) {
/*
* Assume we wrote the pages we originally sent. The
@@ -566,7 +588,7 @@ static void writepages_finish(struct ceph_osd_request *req,
* raced with a truncation and was adjusted at the osd,
* so don't believe the reply.
*/
- wrote = req->r_num_pages;
+ wrote = num_pages;
} else {
wrote = 0;
mapping_set_error(mapping, rc);
@@ -575,8 +597,8 @@ static void writepages_finish(struct ceph_osd_request *req,
inode, rc, bytes, wrote);
/* clean all pages */
- for (i = 0; i < req->r_num_pages; i++) {
- page = req->r_pages[i];
+ for (i = 0; i < num_pages; i++) {
+ page = osd_data->pages[i];
BUG_ON(!page);
WARN_ON(!PageUptodate(page));
@@ -605,35 +627,18 @@ static void writepages_finish(struct ceph_osd_request *req,
unlock_page(page);
}
dout("%p wrote+cleaned %d pages\n", inode, wrote);
- ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
+ ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
- ceph_release_pages(req->r_pages, req->r_num_pages);
- if (req->r_pages_from_pool)
- mempool_free(req->r_pages,
+ ceph_release_pages(osd_data->pages, num_pages);
+ if (osd_data->pages_from_pool)
+ mempool_free(osd_data->pages,
ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
else
- kfree(req->r_pages);
+ kfree(osd_data->pages);
ceph_osdc_put_request(req);
}
/*
- * allocate a page vec, either directly, or if necessary, via a the
- * mempool. we avoid the mempool if we can because req->r_num_pages
- * may be less than the maximum write size.
- */
-static void alloc_page_vec(struct ceph_fs_client *fsc,
- struct ceph_osd_request *req)
-{
- req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
- GFP_NOFS);
- if (!req->r_pages) {
- req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
- req->r_pages_from_pool = 1;
- WARN_ON(!req->r_pages);
- }
-}
-
-/*
* initiate async writeback
*/
static int ceph_writepages_start(struct address_space *mapping,
@@ -641,7 +646,8 @@ static int ceph_writepages_start(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_vino vino = ceph_vino(inode);
pgoff_t index, start, end;
int range_whole = 0;
int should_loop = 1;
@@ -653,22 +659,22 @@ static int ceph_writepages_start(struct address_space *mapping,
unsigned wsize = 1 << inode->i_blkbits;
struct ceph_osd_request *req = NULL;
int do_sync;
- u64 snap_size = 0;
+ u64 truncate_size, snap_size;
+ u32 truncate_seq;
/*
* Include a 'sync' in the OSD request if this is a data
* integrity write (e.g., O_SYNC write or fsync()), or if our
* cap is being revoked.
*/
- do_sync = wbc->sync_mode == WB_SYNC_ALL;
- if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+ if ((wbc->sync_mode == WB_SYNC_ALL) ||
+ ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
do_sync = 1;
dout("writepages_start %p dosync=%d (mode=%s)\n",
inode, do_sync,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- fsc = ceph_inode_to_client(inode);
if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
pr_warning("writepage_start %p on forced umount\n", inode);
return -EIO; /* we're in a forced umount, don't write! */
@@ -699,6 +705,7 @@ static int ceph_writepages_start(struct address_space *mapping,
retry:
/* find oldest snap context with dirty data */
ceph_put_snap_context(snapc);
+ snap_size = 0;
snapc = get_oldest_context(inode, &snap_size);
if (!snapc) {
/* hmm, why does writepages get called when there
@@ -706,8 +713,18 @@ retry:
dout(" no snap context with dirty data?\n");
goto out;
}
+ if (snap_size == 0)
+ snap_size = i_size_read(inode);
dout(" oldest snapc is %p seq %lld (%d snaps)\n",
snapc, snapc->seq, snapc->num_snaps);
+
+ spin_lock(&ci->i_ceph_lock);
+ truncate_seq = ci->i_truncate_seq;
+ truncate_size = ci->i_truncate_size;
+ if (!snap_size)
+ snap_size = i_size_read(inode);
+ spin_unlock(&ci->i_ceph_lock);
+
if (last_snapc && snapc != last_snapc) {
/* if we switched to a newer snapc, restart our scan at the
* start of the original file range. */
@@ -718,10 +735,13 @@ retry:
last_snapc = snapc;
while (!done && index <= end) {
+ int num_ops = do_sync ? 2 : 1;
unsigned i;
int first;
pgoff_t next;
int pvec_pages, locked_pages;
+ struct page **pages = NULL;
+ mempool_t *pool = NULL; /* Becomes non-null if mempool used */
struct page *page;
int want;
u64 offset, len;
@@ -773,11 +793,8 @@ get_more_pages:
dout("waiting on writeback %p\n", page);
wait_on_page_writeback(page);
}
- if ((snap_size && page_offset(page) > snap_size) ||
- (!snap_size &&
- page_offset(page) > i_size_read(inode))) {
- dout("%p page eof %llu\n", page, snap_size ?
- snap_size : i_size_read(inode));
+ if (page_offset(page) >= snap_size) {
+ dout("%p page eof %llu\n", page, snap_size);
done = 1;
unlock_page(page);
break;
@@ -805,34 +822,42 @@ get_more_pages:
break;
}
- /* ok */
+ /*
+ * We have something to write. If this is
+ * the first locked page this time through,
+ * allocate an osd request and a page array
+ * that it will use.
+ */
if (locked_pages == 0) {
+ BUG_ON(pages);
/* prepare async write request */
- offset = (u64) page_offset(page);
+ offset = (u64)page_offset(page);
len = wsize;
req = ceph_osdc_new_request(&fsc->client->osdc,
- &ci->i_layout,
- ceph_vino(inode),
- offset, &len,
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
- snapc, do_sync,
- ci->i_truncate_seq,
- ci->i_truncate_size,
- &inode->i_mtime, true, 0);
-
+ &ci->i_layout, vino,
+ offset, &len, num_ops,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ snapc, truncate_seq,
+ truncate_size, true);
if (IS_ERR(req)) {
rc = PTR_ERR(req);
unlock_page(page);
break;
}
- max_pages = req->r_num_pages;
-
- alloc_page_vec(fsc, req);
req->r_callback = writepages_finish;
req->r_inode = inode;
+
+ max_pages = calc_pages_for(0, (u64)len);
+ pages = kmalloc(max_pages * sizeof (*pages),
+ GFP_NOFS);
+ if (!pages) {
+ pool = fsc->wb_pagevec_pool;
+ pages = mempool_alloc(pool, GFP_NOFS);
+ BUG_ON(!pages);
+ }
}
/* note position of first page in pvec */
@@ -850,7 +875,7 @@ get_more_pages:
}
set_page_writeback(page);
- req->r_pages[locked_pages] = page;
+ pages[locked_pages] = page;
locked_pages++;
next = page->index + 1;
}
@@ -879,18 +904,27 @@ get_more_pages:
pvec.nr -= i-first;
}
- /* submit the write */
- offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
- len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
+ /* Format the osd request message and submit the write */
+
+ offset = page_offset(pages[0]);
+ len = min(snap_size - offset,
(u64)locked_pages << PAGE_CACHE_SHIFT);
dout("writepages got %d pages at %llu~%llu\n",
locked_pages, offset, len);
- /* revise final length, page count */
- req->r_num_pages = locked_pages;
- req->r_request_ops[0].extent.length = cpu_to_le64(len);
- req->r_request_ops[0].payload_len = cpu_to_le32(len);
- req->r_request->hdr.data_len = cpu_to_le32(len);
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
+ !!pool, false);
+
+ pages = NULL; /* request message now owns the pages array */
+ pool = NULL;
+
+ /* Update the write op length in case we changed it */
+
+ osd_req_op_extent_update(req, 0, len);
+
+ vino = ceph_vino(inode);
+ ceph_osdc_build_request(req, offset, snapc, vino.snap,
+ &inode->i_mtime);
rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
BUG_ON(rc);
@@ -1067,51 +1101,23 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_file_info *fi = file->private_data;
struct page *page;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- int r, want, got = 0;
-
- if (fi->fmode & CEPH_FILE_MODE_LAZY)
- want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
- else
- want = CEPH_CAP_FILE_BUFFER;
-
- dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
- inode, ceph_vinop(inode), pos, len, inode->i_size);
- r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len);
- if (r < 0)
- return r;
- dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
- if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) {
- ceph_put_cap_refs(ci, got);
- return -EAGAIN;
- }
+ int r;
do {
/* get a page */
page = grab_cache_page_write_begin(mapping, index, 0);
- if (!page) {
- r = -ENOMEM;
- break;
- }
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
dout("write_begin file %p inode %p page %p %d~%d\n", file,
inode, page, (int)pos, (int)len);
r = ceph_update_writeable_page(file, pos, len, page);
- if (r)
- page_cache_release(page);
} while (r == -EAGAIN);
- if (r) {
- ceph_put_cap_refs(ci, got);
- } else {
- *pagep = page;
- *(int *)fsdata = got;
- }
return r;
}
@@ -1125,12 +1131,10 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
int check_cap = 0;
- int got = (unsigned long)fsdata;
dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
inode, page, (int)pos, (int)copied, (int)len);
@@ -1153,19 +1157,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
up_read(&mdsc->snap_rwsem);
page_cache_release(page);
- if (copied > 0) {
- int dirty;
- spin_lock(&ci->i_ceph_lock);
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
- spin_unlock(&ci->i_ceph_lock);
- if (dirty)
- __mark_inode_dirty(inode, dirty);
- }
-
- dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
- ceph_put_cap_refs(ci, got);
-
if (check_cap)
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 78e2f575247d..25442b40c25a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
spin_unlock(&mdsc->caps_list_lock);
}
-int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+void ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need)
{
int i;
@@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
int have;
int alloc = 0;
LIST_HEAD(newcaps);
- int ret = 0;
dout("reserve caps ctx=%p need=%d\n", ctx, need);
@@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
for (i = have; i < need; i++) {
cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
- if (!cap) {
- ret = -ENOMEM;
- goto out_alloc_count;
- }
+ if (!cap)
+ break;
list_add(&cap->caps_item, &newcaps);
alloc++;
}
- BUG_ON(have + alloc != need);
+ /* we didn't manage to reserve as much as we needed */
+ if (have + alloc != need)
+ pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+ ctx, need, have + alloc);
spin_lock(&mdsc->caps_list_lock);
mdsc->caps_total_count += alloc;
@@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
ctx, mdsc->caps_total_count, mdsc->caps_use_count,
mdsc->caps_reserve_count, mdsc->caps_avail_count);
- return 0;
-
-out_alloc_count:
- /* we didn't manage to reserve as much as we needed */
- pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
- ctx, need, have);
- return ret;
}
int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -490,15 +483,17 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
ci->i_rdcache_gen++;
/*
- * if we are newly issued FILE_SHARED, clear D_COMPLETE; we
+ * if we are newly issued FILE_SHARED, mark dir not complete; we
* don't know what happened to this directory while we didn't
* have the cap.
*/
if ((issued & CEPH_CAP_FILE_SHARED) &&
(had & CEPH_CAP_FILE_SHARED) == 0) {
ci->i_shared_gen++;
- if (S_ISDIR(ci->vfs_inode.i_mode))
- ceph_dir_clear_complete(&ci->vfs_inode);
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ dout(" marking %p NOT complete\n", &ci->vfs_inode);
+ __ceph_dir_clear_complete(ci);
+ }
}
}
@@ -553,6 +548,7 @@ retry:
cap->implemented = 0;
cap->mds = mds;
cap->mds_wanted = 0;
+ cap->mseq = 0;
cap->ci = ci;
__insert_cap_node(ci, cap);
@@ -609,9 +605,11 @@ retry:
__cap_delay_requeue(mdsc, ci);
}
- if (flags & CEPH_CAP_FLAG_AUTH)
- ci->i_auth_cap = cap;
- else if (ci->i_auth_cap == cap) {
+ if (flags & CEPH_CAP_FLAG_AUTH) {
+ if (ci->i_auth_cap == NULL ||
+ ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+ ci->i_auth_cap = cap;
+ } else if (ci->i_auth_cap == cap) {
ci->i_auth_cap = NULL;
spin_lock(&mdsc->cap_dirty_lock);
if (!list_empty(&ci->i_dirty_item)) {
@@ -628,7 +626,10 @@ retry:
cap->cap_id = cap_id;
cap->issued = issued;
cap->implemented |= issued;
- cap->mds_wanted |= wanted;
+ if (mseq > cap->mseq)
+ cap->mds_wanted = wanted;
+ else
+ cap->mds_wanted |= wanted;
cap->seq = seq;
cap->issue_seq = seq;
cap->mseq = mseq;
@@ -689,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
if (implemented)
*implemented |= cap->implemented;
}
+ /*
+ * Exclude caps issued by a non-auth MDS that are being revoked
+ * by the auth MDS. The non-auth MDS should be revoking/exporting
+ * these caps, but the message is delayed.
+ */
+ if (ci->i_auth_cap) {
+ cap = ci->i_auth_cap;
+ have &= ~cap->implemented | cap->issued;
+ }
return have;
}
@@ -796,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/*
* Return true if mask caps are currently being revoked by an MDS.
*/
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+ struct ceph_cap *ocap, int mask)
{
- struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
struct rb_node *p;
- int ret = 0;
- spin_lock(&ci->i_ceph_lock);
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node);
- if (__cap_is_valid(cap) &&
- (cap->implemented & ~cap->issued & mask)) {
- ret = 1;
- break;
- }
+ if (cap != ocap && __cap_is_valid(cap) &&
+ (cap->implemented & ~cap->issued & mask))
+ return 1;
}
+ return 0;
+}
+
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = __ceph_caps_revoking_other(ci, NULL, mask);
spin_unlock(&ci->i_ceph_lock);
dout("ceph_caps_revoking %p %s = %d\n", inode,
ceph_cap_string(mask), ret);
@@ -997,9 +1013,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
return 0;
}
-static void __queue_cap_release(struct ceph_mds_session *session,
- u64 ino, u64 cap_id, u32 migrate_seq,
- u32 issue_seq)
+void __queue_cap_release(struct ceph_mds_session *session,
+ u64 ino, u64 cap_id, u32 migrate_seq,
+ u32 issue_seq)
{
struct ceph_msg *msg;
struct ceph_mds_cap_release *head;
@@ -1974,8 +1990,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+
__ceph_flush_snaps(ci, &session, 1);
+
if (ci->i_flushing_caps) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &cap->session->s_cap_flushing);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
__ceph_caps_used(ci),
__ceph_caps_wanted(ci),
@@ -2046,6 +2069,17 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
goto out;
}
+ /* finish pending truncate */
+ while (ci->i_truncate_pending) {
+ spin_unlock(&ci->i_ceph_lock);
+ if (!(need & CEPH_CAP_FILE_WR))
+ mutex_lock(&inode->i_mutex);
+ __ceph_do_pending_vmtruncate(inode);
+ if (!(need & CEPH_CAP_FILE_WR))
+ mutex_unlock(&inode->i_mutex);
+ spin_lock(&ci->i_ceph_lock);
+ }
+
if (need & CEPH_CAP_FILE_WR) {
if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
@@ -2067,12 +2101,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
}
have = __ceph_caps_issued(ci, &implemented);
- /*
- * disallow writes while a truncate is pending
- */
- if (ci->i_truncate_pending)
- have &= ~CEPH_CAP_FILE_WR;
-
if ((have & need) == need) {
/*
* Look at (implemented & ~have & not) so that we keep waiting
@@ -2466,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
} else {
dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
ceph_cap_string(newcaps));
+ /* is a non-auth MDS revoking the newly granted caps? */
+ if (cap == ci->i_auth_cap &&
+ __ceph_caps_revoking_other(ci, cap, newcaps))
+ check_caps = 2;
+
cap->issued = newcaps;
cap->implemented |= newcaps; /* add bits only, to
* avoid stepping on a
@@ -3035,21 +3068,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
(cap->issued & unless) == 0)) {
if ((cap->issued & drop) &&
(cap->issued & unless) == 0) {
- dout("encode_inode_release %p cap %p %s -> "
- "%s\n", inode, cap,
+ int wanted = __ceph_caps_wanted(ci);
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
+ wanted |= cap->mds_wanted;
+ dout("encode_inode_release %p cap %p "
+ "%s -> %s, wanted %s -> %s\n", inode, cap,
ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & ~drop));
+ ceph_cap_string(cap->issued & ~drop),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(wanted));
+
cap->issued &= ~drop;
cap->implemented &= ~drop;
- if (ci->i_ceph_flags & CEPH_I_NODELAY) {
- int wanted = __ceph_caps_wanted(ci);
- dout(" wanted %s -> %s (act %s)\n",
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(cap->mds_wanted &
- ~wanted),
- ceph_cap_string(wanted));
- cap->mds_wanted &= wanted;
- }
+ cap->mds_wanted = wanted;
} else {
dout("encode_inode_release %p cap %p %s"
" (force)\n", inode, cap,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 6d797f46d772..a40ceda47a32 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -107,15 +107,14 @@ static unsigned fpos_off(loff_t p)
* falling back to a "normal" sync readdir if any dentries in the dir
* are dropped.
*
- * D_COMPLETE tells indicates we have all dentries in the dir. It is
+ * Complete dir indicates that we have all dentries in the dir. It is
* defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
* the MDS if/when the directory is modified).
*/
-static int __dcache_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx)
{
- struct ceph_file_info *fi = filp->private_data;
- struct dentry *parent = filp->f_dentry;
+ struct ceph_file_info *fi = file->private_data;
+ struct dentry *parent = file->f_dentry;
struct inode *dir = parent->d_inode;
struct list_head *p;
struct dentry *dentry, *last;
@@ -126,14 +125,14 @@ static int __dcache_readdir(struct file *filp,
last = fi->dentry;
fi->dentry = NULL;
- dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
+ dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
last);
spin_lock(&parent->d_lock);
/* start at beginning? */
- if (filp->f_pos == 2 || last == NULL ||
- filp->f_pos < ceph_dentry(last)->offset) {
+ if (ctx->pos == 2 || last == NULL ||
+ ctx->pos < ceph_dentry(last)->offset) {
if (list_empty(&parent->d_subdirs))
goto out_unlock;
p = parent->d_subdirs.prev;
@@ -157,11 +156,11 @@ more:
if (!d_unhashed(dentry) && dentry->d_inode &&
ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
- filp->f_pos <= di->offset)
+ ctx->pos <= di->offset)
break;
dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
dentry->d_name.len, dentry->d_name.name, di->offset,
- filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
+ ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
!dentry->d_inode ? " null" : "");
spin_unlock(&dentry->d_lock);
p = p->prev;
@@ -173,33 +172,31 @@ more:
spin_unlock(&dentry->d_lock);
spin_unlock(&parent->d_lock);
- dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
+ dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
- filp->f_pos = di->offset;
- err = filldir(dirent, dentry->d_name.name,
- dentry->d_name.len, di->offset,
+ ctx->pos = di->offset;
+ if (!dir_emit(ctx, dentry->d_name.name,
+ dentry->d_name.len,
ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
- dentry->d_inode->i_mode >> 12);
-
- if (last) {
- if (err < 0) {
+ dentry->d_inode->i_mode >> 12)) {
+ if (last) {
/* remember our position */
fi->dentry = last;
fi->next_offset = di->offset;
- } else {
- dput(last);
}
+ dput(dentry);
+ return 0;
}
- last = dentry;
- if (err < 0)
- goto out;
+ if (last)
+ dput(last);
+ last = dentry;
- filp->f_pos++;
+ ctx->pos++;
/* make sure a dentry wasn't dropped while we didn't have parent lock */
- if (!ceph_dir_test_complete(dir)) {
- dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
+ if (!ceph_dir_is_complete(dir)) {
+ dout(" lost dir complete on %p; falling back to mds\n", dir);
err = -EAGAIN;
goto out;
}
@@ -235,59 +232,59 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
return 0;
}
-static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
- struct ceph_file_info *fi = filp->private_data;
- struct inode *inode = file_inode(filp);
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
- unsigned frag = fpos_frag(filp->f_pos);
- int off = fpos_off(filp->f_pos);
+ unsigned frag = fpos_frag(ctx->pos);
+ int off = fpos_off(ctx->pos);
int err;
u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo;
const int max_entries = fsc->mount_options->max_readdir;
const int max_bytes = fsc->mount_options->max_readdir_bytes;
- dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
+ dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
if (fi->flags & CEPH_F_ATEND)
return 0;
/* always start with . and .. */
- if (filp->f_pos == 0) {
+ if (ctx->pos == 0) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
- fi->dir_release_count = ci->i_release_count;
+ fi->dir_release_count = atomic_read(&ci->i_release_count);
dout("readdir off 0 -> '.'\n");
- if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
+ if (!dir_emit(ctx, ".", 1,
ceph_translate_ino(inode->i_sb, inode->i_ino),
- inode->i_mode >> 12) < 0)
+ inode->i_mode >> 12))
return 0;
- filp->f_pos = 1;
+ ctx->pos = 1;
off = 1;
}
- if (filp->f_pos == 1) {
- ino_t ino = parent_ino(filp->f_dentry);
+ if (ctx->pos == 1) {
+ ino_t ino = parent_ino(file->f_dentry);
dout("readdir off 1 -> '..'\n");
- if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
+ if (!dir_emit(ctx, "..", 2,
ceph_translate_ino(inode->i_sb, ino),
- inode->i_mode >> 12) < 0)
+ inode->i_mode >> 12))
return 0;
- filp->f_pos = 2;
+ ctx->pos = 2;
off = 2;
}
/* can we use the dcache? */
spin_lock(&ci->i_ceph_lock);
- if ((filp->f_pos == 2 || fi->dentry) &&
+ if ((ctx->pos == 2 || fi->dentry) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
- ceph_dir_test_complete(inode) &&
+ __ceph_dir_is_complete(ci) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
spin_unlock(&ci->i_ceph_lock);
- err = __dcache_readdir(filp, dirent, filldir);
+ err = __dcache_readdir(file, ctx);
if (err != -EAGAIN)
return err;
} else {
@@ -327,7 +324,7 @@ more:
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
- req->r_dentry = dget(filp->f_dentry);
+ req->r_dentry = dget(file->f_dentry);
/* hints to request -> mds selection code */
req->r_direct_mode = USE_AUTH_MDS;
req->r_direct_hash = ceph_frag_value(frag);
@@ -350,7 +347,8 @@ more:
if (!req->r_did_prepopulate) {
dout("readdir !did_prepopulate");
- fi->dir_release_count--; /* preclude D_COMPLETE */
+ /* prevent the dir from being marked complete */
+ fi->dir_release_count--;
}
/* note next offset and last dentry name */
@@ -378,15 +376,16 @@ more:
rinfo = &fi->last_readdir->r_reply_info;
dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
rinfo->dir_nr, off, fi->offset);
+
+ ctx->pos = ceph_make_fpos(frag, off);
while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
- u64 pos = ceph_make_fpos(frag, off);
struct ceph_mds_reply_inode *in =
rinfo->dir_in[off - fi->offset].in;
struct ceph_vino vino;
ino_t ino;
dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
- off, off - fi->offset, rinfo->dir_nr, pos,
+ off, off - fi->offset, rinfo->dir_nr, ctx->pos,
rinfo->dir_dname_len[off - fi->offset],
rinfo->dir_dname[off - fi->offset], in);
BUG_ON(!in);
@@ -394,16 +393,15 @@ more:
vino.ino = le64_to_cpu(in->ino);
vino.snap = le64_to_cpu(in->snapid);
ino = ceph_vino_to_ino(vino);
- if (filldir(dirent,
+ if (!dir_emit(ctx,
rinfo->dir_dname[off - fi->offset],
rinfo->dir_dname_len[off - fi->offset],
- pos,
- ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
+ ceph_translate_ino(inode->i_sb, ino), ftype)) {
dout("filldir stopping us...\n");
return 0;
}
off++;
- filp->f_pos = pos + 1;
+ ctx->pos++;
}
if (fi->last_name) {
@@ -416,7 +414,7 @@ more:
if (!ceph_frag_is_rightmost(frag)) {
frag = ceph_frag_next(frag);
off = 0;
- filp->f_pos = ceph_make_fpos(frag, off);
+ ctx->pos = ceph_make_fpos(frag, off);
dout("readdir next frag is %x\n", frag);
goto more;
}
@@ -428,13 +426,14 @@ more:
* the complete dir contents in our cache.
*/
spin_lock(&ci->i_ceph_lock);
- if (ci->i_release_count == fi->dir_release_count) {
- ceph_dir_set_complete(inode);
- ci->i_max_offset = filp->f_pos;
+ if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
+ dout(" marking %p complete\n", inode);
+ __ceph_dir_set_complete(ci, fi->dir_release_count);
+ ci->i_max_offset = ctx->pos;
}
spin_unlock(&ci->i_ceph_lock);
- dout("readdir %p filp %p done.\n", inode, filp);
+ dout("readdir %p file %p done.\n", inode, file);
return 0;
}
@@ -604,7 +603,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
!is_root_ceph_dentry(dir, dentry) &&
- ceph_dir_test_complete(dir) &&
+ __ceph_dir_is_complete(ci) &&
(__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
spin_unlock(&ci->i_ceph_lock);
dout(" dir %p complete, -ENOENT\n", dir);
@@ -1065,44 +1064,6 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
}
/*
- * Set/clear/test dir complete flag on the dir's dentry.
- */
-void ceph_dir_set_complete(struct inode *inode)
-{
- struct dentry *dentry = d_find_any_alias(inode);
-
- if (dentry && ceph_dentry(dentry) &&
- ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) {
- dout(" marking %p (%p) complete\n", inode, dentry);
- set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
- }
- dput(dentry);
-}
-
-void ceph_dir_clear_complete(struct inode *inode)
-{
- struct dentry *dentry = d_find_any_alias(inode);
-
- if (dentry && ceph_dentry(dentry)) {
- dout(" marking %p (%p) complete\n", inode, dentry);
- set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
- }
- dput(dentry);
-}
-
-bool ceph_dir_test_complete(struct inode *inode)
-{
- struct dentry *dentry = d_find_any_alias(inode);
-
- if (dentry && ceph_dentry(dentry)) {
- dout(" marking %p (%p) NOT complete\n", inode, dentry);
- clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
- }
- dput(dentry);
- return false;
-}
-
-/*
* When the VFS prunes a dentry from the cache, we need to clear the
* complete flag on the parent directory.
*
@@ -1110,15 +1071,13 @@ bool ceph_dir_test_complete(struct inode *inode)
*/
static void ceph_d_prune(struct dentry *dentry)
{
- struct ceph_dentry_info *di;
-
dout("ceph_d_prune %p\n", dentry);
/* do we have a valid parent? */
if (IS_ROOT(dentry))
return;
- /* if we are not hashed, we don't affect D_COMPLETE */
+ /* if we are not hashed, we don't affect dir's completeness */
if (d_unhashed(dentry))
return;
@@ -1126,8 +1085,7 @@ static void ceph_d_prune(struct dentry *dentry)
* we hold d_lock, so d_parent is stable, and d_fsdata is never
* cleared until d_release
*/
- di = ceph_dentry(dentry->d_parent);
- clear_bit(CEPH_D_COMPLETE, &di->flags);
+ ceph_dir_clear_complete(dentry->d_parent->d_inode);
}
/*
@@ -1307,7 +1265,7 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
const struct file_operations ceph_dir_fops = {
.read = ceph_read_dir,
- .readdir = ceph_readdir,
+ .iterate = ceph_readdir,
.llseek = ceph_dir_llseek,
.open = ceph_open,
.release = ceph_release,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index bf338d9b67e3..2ddf061c1c4a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -7,6 +7,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
+#include <linux/aio.h>
#include "super.h"
#include "mds_client.h"
@@ -446,19 +447,35 @@ done:
}
/*
- * Write commit callback, called if we requested both an ACK and
- * ONDISK commit reply from the OSD.
+ * Write commit request unsafe callback, called to tell us when a
+ * request is unsafe (that is, in flight--has been handed to the
+ * messenger to send to its target osd). It is called again when
+ * we've received a response message indicating the request is
+ * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request
+ * is completed early (and unsuccessfully) due to a timeout or
+ * interrupt.
+ *
+ * This is used if we requested both an ACK and ONDISK commit reply
+ * from the OSD.
*/
-static void sync_write_commit(struct ceph_osd_request *req,
- struct ceph_msg *msg)
+static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
{
struct ceph_inode_info *ci = ceph_inode(req->r_inode);
- dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
- spin_lock(&ci->i_unsafe_lock);
- list_del_init(&req->r_unsafe_item);
- spin_unlock(&ci->i_unsafe_lock);
- ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+ dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid,
+ unsafe ? "un" : "");
+ if (unsafe) {
+ ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_item,
+ &ci->i_unsafe_writes);
+ spin_unlock(&ci->i_unsafe_lock);
+ } else {
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+ }
}
/*
@@ -470,36 +487,33 @@ static void sync_write_commit(struct ceph_osd_request *req,
* objects, rollback on failure, etc.)
*/
static ssize_t ceph_sync_write(struct file *file, const char __user *data,
- size_t left, loff_t *offset)
+ size_t left, loff_t pos, loff_t *ppos)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_snap_context *snapc;
+ struct ceph_vino vino;
struct ceph_osd_request *req;
+ int num_ops = 1;
struct page **pages;
int num_pages;
- long long unsigned pos;
u64 len;
int written = 0;
int flags;
- int do_sync = 0;
int check_caps = 0;
int page_align, io_align;
unsigned long buf_align;
int ret;
struct timespec mtime = CURRENT_TIME;
+ bool own_pages = false;
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_write on file %p %lld~%u %s\n", file, *offset,
+ dout("sync_write on file %p %lld~%u %s\n", file, pos,
(unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
- if (file->f_flags & O_APPEND)
- pos = i_size_read(inode);
- else
- pos = *offset;
-
ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
if (ret < 0)
return ret;
@@ -516,7 +530,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
flags |= CEPH_OSD_FLAG_ACK;
else
- do_sync = 1;
+ num_ops++; /* Also include a 'startsync' command. */
/*
* we may need to do multiple writes here if we span an object
@@ -526,25 +540,20 @@ more:
io_align = pos & ~PAGE_MASK;
buf_align = (unsigned long)data & ~PAGE_MASK;
len = left;
- if (file->f_flags & O_DIRECT) {
- /* write from beginning of first page, regardless of
- io alignment */
- page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
- num_pages = calc_pages_for((unsigned long)data, len);
- } else {
- page_align = pos & ~PAGE_MASK;
- num_pages = calc_pages_for(pos, len);
- }
+
+ snapc = ci->i_snap_realm->cached_context;
+ vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
- ceph_vino(inode), pos, &len,
- CEPH_OSD_OP_WRITE, flags,
- ci->i_snap_realm->cached_context,
- do_sync,
+ vino, pos, &len, num_ops,
+ CEPH_OSD_OP_WRITE, flags, snapc,
ci->i_truncate_seq, ci->i_truncate_size,
- &mtime, false, page_align);
+ false);
if (IS_ERR(req))
return PTR_ERR(req);
+ /* write from beginning of first page, regardless of io alignment */
+ page_align = file->f_flags & O_DIRECT ? buf_align : io_align;
+ num_pages = calc_pages_for(page_align, len);
if (file->f_flags & O_DIRECT) {
pages = ceph_get_direct_page_vector(data, num_pages, false);
if (IS_ERR(pages)) {
@@ -572,36 +581,20 @@ more:
if ((file->f_flags & O_SYNC) == 0) {
/* get a second commit callback */
- req->r_safe_callback = sync_write_commit;
- req->r_own_pages = 1;
+ req->r_unsafe_callback = ceph_sync_write_unsafe;
+ req->r_inode = inode;
+ own_pages = true;
}
}
- req->r_pages = pages;
- req->r_num_pages = num_pages;
- req->r_inode = inode;
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
+ false, own_pages);
+
+ /* BUG_ON(vino.snap != CEPH_NOSNAP); */
+ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!ret) {
- if (req->r_safe_callback) {
- /*
- * Add to inode unsafe list only after we
- * start_request so that a tid has been assigned.
- */
- spin_lock(&ci->i_unsafe_lock);
- list_add_tail(&req->r_unsafe_item,
- &ci->i_unsafe_writes);
- spin_unlock(&ci->i_unsafe_lock);
- ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
- }
-
+ if (!ret)
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
- if (ret < 0 && req->r_safe_callback) {
- spin_lock(&ci->i_unsafe_lock);
- list_del_init(&req->r_unsafe_item);
- spin_unlock(&ci->i_unsafe_lock);
- ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
- }
- }
if (file->f_flags & O_DIRECT)
ceph_put_page_vector(pages, num_pages, false);
@@ -614,12 +607,12 @@ out:
pos += len;
written += len;
left -= len;
- data += written;
+ data += len;
if (left)
goto more;
ret = written;
- *offset = pos;
+ *ppos = pos;
if (pos > i_size_read(inode))
check_caps = ceph_inode_set_size(inode, pos);
if (check_caps)
@@ -653,7 +646,6 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
inode, ceph_vinop(inode), pos, (unsigned)len, inode);
again:
- __ceph_do_pending_vmtruncate(inode);
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
@@ -717,55 +709,74 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
- loff_t endoff = pos + iov->iov_len;
- int got = 0;
- int ret, err, written;
+ ssize_t count, written = 0;
+ int err, want, got;
+ bool hold_mutex;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
-retry_snap:
- written = 0;
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
- return -ENOSPC;
- __ceph_do_pending_vmtruncate(inode);
+ mutex_lock(&inode->i_mutex);
+ hold_mutex = true;
- /*
- * try to do a buffered write. if we don't have sufficient
- * caps, we'll get -EAGAIN from generic_file_aio_write, or a
- * short write if we only get caps for some pages.
- */
- if (!(iocb->ki_filp->f_flags & O_DIRECT) &&
- !(inode->i_sb->s_flags & MS_SYNCHRONOUS) &&
- !(fi->flags & CEPH_F_SYNC)) {
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
- if (ret >= 0)
- written = ret;
-
- if ((ret >= 0 || ret == -EIOCBQUEUED) &&
- ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
- || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
- err = vfs_fsync_range(file, pos, pos + written - 1, 1);
- if (err < 0)
- ret = err;
- }
- if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff)
- goto out;
+ err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
+ if (err)
+ goto out;
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = file->f_mapping->backing_dev_info;
+
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err)
+ goto out;
+
+ if (count == 0)
+ goto out;
+
+ err = file_remove_suid(file);
+ if (err)
+ goto out;
+
+ err = file_update_time(file);
+ if (err)
+ goto out;
+
+retry_snap:
+ if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) {
+ err = -ENOSPC;
+ goto out;
}
- dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
- inode, ceph_vinop(inode), pos + written,
- (unsigned)iov->iov_len - written, inode->i_size);
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff);
- if (ret < 0)
+ dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
+ inode, ceph_vinop(inode), pos, count, inode->i_size);
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
+ got = 0;
+ err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count);
+ if (err < 0)
goto out;
- dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos + written,
- (unsigned)iov->iov_len - written, ceph_cap_string(got));
- ret = ceph_sync_write(file, iov->iov_base + written,
- iov->iov_len - written, &iocb->ki_pos);
- if (ret >= 0) {
+ dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
+
+ if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
+ (iocb->ki_filp->f_flags & O_DIRECT) ||
+ (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
+ (fi->flags & CEPH_F_SYNC)) {
+ mutex_unlock(&inode->i_mutex);
+ written = ceph_sync_write(file, iov->iov_base, count,
+ pos, &iocb->ki_pos);
+ } else {
+ written = generic_file_buffered_write(iocb, iov, nr_segs,
+ pos, &iocb->ki_pos,
+ count, 0);
+ mutex_unlock(&inode->i_mutex);
+ }
+ hold_mutex = false;
+
+ if (written >= 0) {
int dirty;
spin_lock(&ci->i_ceph_lock);
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
@@ -773,18 +784,33 @@ retry_snap:
if (dirty)
__mark_inode_dirty(inode, dirty);
}
+
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos + written,
- (unsigned)iov->iov_len - written, ceph_cap_string(got));
+ inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+ ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
-out:
- if (ret == -EOLDSNAPC) {
+
+ if (written >= 0 &&
+ ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) ||
+ ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
+ err = vfs_fsync_range(file, pos, pos + written - 1, 1);
+ if (err < 0)
+ written = err;
+ }
+
+ if (written == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
+ mutex_lock(&inode->i_mutex);
+ hold_mutex = true;
goto retry_snap;
}
+out:
+ if (hold_mutex)
+ mutex_unlock(&inode->i_mutex);
+ current->backing_dev_info = NULL;
- return ret;
+ return written ? written : err;
}
/*
@@ -838,16 +864,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
break;
}
- if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
- offset = -EINVAL;
- goto out;
- }
-
- /* Special lock needed here? */
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
mutex_unlock(&inode->i_mutex);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 851814d951cd..f3a2abf28a77 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -302,7 +302,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_version = 0;
ci->i_time_warp_seq = 0;
ci->i_ceph_flags = 0;
- ci->i_release_count = 0;
+ atomic_set(&ci->i_release_count, 1);
+ atomic_set(&ci->i_complete_count, 0);
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
@@ -561,7 +562,6 @@ static int fill_inode(struct inode *inode,
struct ceph_inode_info *ci = ceph_inode(inode);
int i;
int issued = 0, implemented;
- int updating_inode = 0;
struct timespec mtime, atime, ctime;
u32 nsplits;
struct ceph_buffer *xattr_blob = NULL;
@@ -601,7 +601,6 @@ static int fill_inode(struct inode *inode,
(ci->i_version & ~1) >= le64_to_cpu(info->version))
goto no_change;
- updating_inode = 1;
issued = __ceph_caps_issued(ci, &implemented);
issued |= implemented | __ceph_caps_dirty(ci);
@@ -717,6 +716,17 @@ static int fill_inode(struct inode *inode,
ceph_vinop(inode), inode->i_mode);
}
+ /* set dir completion flag? */
+ if (S_ISDIR(inode->i_mode) &&
+ ci->i_files == 0 && ci->i_subdirs == 0 &&
+ ceph_snap(inode) == CEPH_NOSNAP &&
+ (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+ (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+ !__ceph_dir_is_complete(ci)) {
+ dout(" marking %p complete (empty)\n", inode);
+ __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
+ ci->i_max_offset = 2;
+ }
no_change:
spin_unlock(&ci->i_ceph_lock);
@@ -767,19 +777,6 @@ no_change:
__ceph_get_fmode(ci, cap_fmode);
}
- /* set dir completion flag? */
- if (S_ISDIR(inode->i_mode) &&
- updating_inode && /* didn't jump to no_change */
- ci->i_files == 0 && ci->i_subdirs == 0 &&
- ceph_snap(inode) == CEPH_NOSNAP &&
- (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
- (issued & CEPH_CAP_FILE_EXCL) == 0 &&
- !ceph_dir_test_complete(inode)) {
- dout(" marking %p complete (empty)\n", inode);
- ceph_dir_set_complete(inode);
- ci->i_max_offset = 2;
- }
-
/* update delegation info? */
if (dirinfo)
ceph_fill_dirfrag(inode, dirinfo);
@@ -861,7 +858,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
di = ceph_dentry(dn);
spin_lock(&ci->i_ceph_lock);
- if (!ceph_dir_test_complete(inode)) {
+ if (!__ceph_dir_is_complete(ci)) {
spin_unlock(&ci->i_ceph_lock);
return;
}
@@ -906,8 +903,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
} else if (realdn) {
dout("dn %p (%d) spliced with %p (%d) "
"inode %p ino %llx.%llx\n",
- dn, dn->d_count,
- realdn, realdn->d_count,
+ dn, d_count(dn),
+ realdn, d_count(realdn),
realdn->d_inode, ceph_vinop(realdn->d_inode));
dput(dn);
dn = realdn;
@@ -1065,8 +1062,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
/*
* d_move() puts the renamed dentry at the end of
* d_subdirs. We need to assign it an appropriate
- * directory offset so we can behave when holding
- * D_COMPLETE.
+ * directory offset so we can behave when dir is
+ * complete.
*/
ceph_set_dentry_offset(req->r_old_dentry);
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
@@ -1457,7 +1454,7 @@ out:
/*
- * called by trunc_wq; take i_mutex ourselves
+ * called by trunc_wq;
*
* We also truncate in a separate thread as well.
*/
@@ -1494,8 +1491,6 @@ void ceph_queue_vmtruncate(struct inode *inode)
}
/*
- * called with i_mutex held.
- *
* Make sure any pending truncation is applied before doing anything
* that may depend on it.
*/
@@ -1563,6 +1558,12 @@ static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
static const struct inode_operations ceph_symlink_iops = {
.readlink = generic_readlink,
.follow_link = ceph_sym_follow_link,
+ .setattr = ceph_setattr,
+ .getattr = ceph_getattr,
+ .setxattr = ceph_setxattr,
+ .getxattr = ceph_getxattr,
+ .listxattr = ceph_listxattr,
+ .removexattr = ceph_removexattr,
};
/*
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 4a989345b37b..e0b4ef31d3c8 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -208,8 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
- ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout,
- osdc->osdmap);
+
+ ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
+ ceph_file_layout_pg_pool(ci->i_layout));
dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
if (dl.osd >= 0) {
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 202dd3d68be0..ae6d14e82b0f 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -169,7 +169,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
}
/**
- * Must be called with BKL already held. Fills in the passed
+ * Must be called with lock_flocks() already held. Fills in the passed
* counter variables, so you can prepare pagelist metadata before calling
* ceph_encode_locks.
*/
@@ -191,27 +191,23 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
}
/**
- * Encode the flock and fcntl locks for the given inode into the pagelist.
- * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
- * sequential flock locks.
- * Must be called with lock_flocks() already held.
- * If we encounter more of a specific lock type than expected,
- * we return the value 1.
+ * Encode the flock and fcntl locks for the given inode into the ceph_filelock
+ * array. Must be called with inode->i_lock already held.
+ * If we encounter more of a specific lock type than expected, return -ENOSPC.
*/
-int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
- int num_fcntl_locks, int num_flock_locks)
+int ceph_encode_locks_to_buffer(struct inode *inode,
+ struct ceph_filelock *flocks,
+ int num_fcntl_locks, int num_flock_locks)
{
struct file_lock *lock;
- struct ceph_filelock cephlock;
int err = 0;
int seen_fcntl = 0;
int seen_flock = 0;
+ int l = 0;
dout("encoding %d flock and %d fcntl locks", num_flock_locks,
num_fcntl_locks);
- err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32));
- if (err)
- goto fail;
+
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_POSIX) {
++seen_fcntl;
@@ -219,19 +215,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &cephlock);
+ err = lock_to_ceph_filelock(lock, &flocks[l]);
if (err)
goto fail;
- err = ceph_pagelist_append(pagelist, &cephlock,
- sizeof(struct ceph_filelock));
+ ++l;
}
- if (err)
- goto fail;
}
-
- err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32));
- if (err)
- goto fail;
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_FLOCK) {
++seen_flock;
@@ -239,19 +228,51 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &cephlock);
+ err = lock_to_ceph_filelock(lock, &flocks[l]);
if (err)
goto fail;
- err = ceph_pagelist_append(pagelist, &cephlock,
- sizeof(struct ceph_filelock));
+ ++l;
}
- if (err)
- goto fail;
}
fail:
return err;
}
+/**
+ * Copy the encoded flock and fcntl locks from @flocks into the pagelist.
+ * @flocks is read as the fcntl entries first, then the flock entries. Format:
+ * #fcntl locks (le32), fcntl locks, #flock locks (le32), flock locks.
+ * Returns zero on success, else the first ceph_pagelist_append() error.
+ */
+int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+ struct ceph_pagelist *pagelist,
+ int num_fcntl_locks, int num_flock_locks)
+{
+ int err = 0;
+ __le32 nlocks;
+
+ nlocks = cpu_to_le32(num_fcntl_locks); /* counts are appended as le32 */
+ err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+ if (err)
+ goto out_fail;
+
+ err = ceph_pagelist_append(pagelist, flocks,
+ num_fcntl_locks * sizeof(*flocks));
+ if (err)
+ goto out_fail;
+
+ nlocks = cpu_to_le32(num_flock_locks);
+ err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+ if (err)
+ goto out_fail;
+
+ err = ceph_pagelist_append(pagelist,
+ &flocks[num_fcntl_locks], /* flock entries start here */
+ num_flock_locks * sizeof(*flocks));
+out_fail:
+ return err;
+}
+
/*
* Given a pointer to a lock, convert it to a ceph filelock
*/
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 442880d099c9..187bf214444d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -265,7 +265,8 @@ static int parse_reply_info_extra(void **p, void *end,
{
if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
- else if (info->head->op == CEPH_MDS_OP_READDIR)
+ else if (info->head->op == CEPH_MDS_OP_READDIR ||
+ info->head->op == CEPH_MDS_OP_LSSNAP)
return parse_reply_info_dir(p, end, info, features);
else if (info->head->op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features);
@@ -364,9 +365,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
if (atomic_dec_and_test(&s->s_ref)) {
if (s->s_auth.authorizer)
- s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
- s->s_mdsc->fsc->client->monc.auth,
- s->s_auth.authorizer);
+ ceph_auth_destroy_authorizer(
+ s->s_mdsc->fsc->client->monc.auth,
+ s->s_auth.authorizer);
kfree(s);
}
}
@@ -1196,6 +1197,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
session->s_trim_caps--;
if (oissued) {
/* we aren't the only cap.. just remove us */
+ __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
+ cap->mseq, cap->issue_seq);
__ceph_remove_cap(cap);
} else {
/* try to drop referring dentries */
@@ -1388,6 +1391,7 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
num = le32_to_cpu(head->num);
dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
session->s_num_cap_releases += num;
/* requeue completed messages */
@@ -1550,7 +1554,7 @@ retry:
*base = ceph_ino(temp->d_inode);
*plen = len;
dout("build_path on %p %d built %llx '%.*s'\n",
- dentry, dentry->d_count, *base, len, path);
+ dentry, d_count(dentry), *base, len, path);
return path;
}
@@ -1718,8 +1722,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- msg->pages = req->r_pages;
- msg->nr_pages = req->r_num_pages;
+ if (req->r_data_len) {
+ /* outbound data set only by ceph_sync_setxattr() */
+ BUG_ON(!req->r_pages);
+ ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0);
+ }
+
msg->hdr.data_len = cpu_to_le32(req->r_data_len);
msg->hdr.data_off = cpu_to_le16(0);
@@ -1913,6 +1921,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
req = list_entry(tmp_list.next,
struct ceph_mds_request, r_wait);
list_del_init(&req->r_wait);
+ dout(" wake request %p tid %llu\n", req, req->r_tid);
__do_request(mdsc, req);
}
}
@@ -2026,20 +2035,16 @@ out:
}
/*
- * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS
+ * Invalidate dir's completeness, dentry lease state on an aborted MDS
* namespace request.
*/
void ceph_invalidate_dir_request(struct ceph_mds_request *req)
{
struct inode *inode = req->r_locked_dir;
- struct ceph_inode_info *ci = ceph_inode(inode);
- dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode);
- spin_lock(&ci->i_ceph_lock);
- ceph_dir_clear_complete(inode);
- ci->i_release_count++;
- spin_unlock(&ci->i_ceph_lock);
+ dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
+ ceph_dir_clear_complete(inode);
if (req->r_dentry)
ceph_invalidate_dentry_lease(req->r_dentry);
if (req->r_old_dentry)
@@ -2450,6 +2455,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
+ cap->mseq = 0; /* and migrate_seq */
if (recon_state->flock) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -2474,39 +2480,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (recon_state->flock) {
int num_fcntl_locks, num_flock_locks;
- struct ceph_pagelist_cursor trunc_point;
-
- ceph_pagelist_set_cursor(pagelist, &trunc_point);
- do {
- lock_flocks();
- ceph_count_locks(inode, &num_fcntl_locks,
- &num_flock_locks);
- rec.v2.flock_len = (2*sizeof(u32) +
- (num_fcntl_locks+num_flock_locks) *
- sizeof(struct ceph_filelock));
- unlock_flocks();
-
- /* pre-alloc pagelist */
- ceph_pagelist_truncate(pagelist, &trunc_point);
- err = ceph_pagelist_append(pagelist, &rec, reclen);
- if (!err)
- err = ceph_pagelist_reserve(pagelist,
- rec.v2.flock_len);
-
- /* encode locks */
- if (!err) {
- lock_flocks();
- err = ceph_encode_locks(inode,
- pagelist,
- num_fcntl_locks,
- num_flock_locks);
- unlock_flocks();
- }
- } while (err == -ENOSPC);
+ struct ceph_filelock *flocks;
+
+encode_again:
+ spin_lock(&inode->i_lock);
+ ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
+ spin_unlock(&inode->i_lock);
+ flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
+ sizeof(struct ceph_filelock), GFP_NOFS);
+ if (!flocks) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ spin_lock(&inode->i_lock);
+ err = ceph_encode_locks_to_buffer(inode, flocks,
+ num_fcntl_locks,
+ num_flock_locks);
+ spin_unlock(&inode->i_lock);
+ if (err) {
+ kfree(flocks);
+ if (err == -ENOSPC)
+ goto encode_again;
+ goto out_free;
+ }
+ /*
+ * number of encoded locks is stable, so copy to pagelist
+ */
+ rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
+ (num_fcntl_locks+num_flock_locks) *
+ sizeof(struct ceph_filelock));
+ err = ceph_pagelist_append(pagelist, &rec, reclen);
+ if (!err)
+ err = ceph_locks_to_pagelist(flocks, pagelist,
+ num_fcntl_locks,
+ num_flock_locks);
+ kfree(flocks);
} else {
err = ceph_pagelist_append(pagelist, &rec, reclen);
}
-
out_free:
kfree(path);
out_dput:
@@ -2599,11 +2610,13 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
goto fail;
}
- reply->pagelist = pagelist;
if (recon_state.flock)
reply->hdr.version = cpu_to_le16(2);
- reply->hdr.data_len = cpu_to_le32(pagelist->length);
- reply->nr_pages = calc_pages_for(0, pagelist->length);
+ if (pagelist->length) {
+ /* set up outbound data if we have any */
+ reply->hdr.data_len = cpu_to_le32(pagelist->length);
+ ceph_msg_data_add_pagelist(reply, pagelist);
+ }
ceph_con_send(&session->s_con, reply);
mutex_unlock(&session->s_mutex);
@@ -3029,8 +3042,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
fsc->mdsc = mdsc;
mutex_init(&mdsc->mutex);
mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
- if (mdsc->mdsmap == NULL)
+ if (mdsc->mdsmap == NULL) {
+ kfree(mdsc);
return -ENOMEM;
+ }
init_completion(&mdsc->safe_umount_waiters);
init_waitqueue_head(&mdsc->session_close_wq);
@@ -3433,13 +3448,17 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
struct ceph_auth_handshake *auth = &s->s_auth;
if (force_new && auth->authorizer) {
- if (ac->ops && ac->ops->destroy_authorizer)
- ac->ops->destroy_authorizer(ac, auth->authorizer);
+ ceph_auth_destroy_authorizer(ac, auth->authorizer);
auth->authorizer = NULL;
}
- if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) {
- int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
- auth);
+ if (!auth->authorizer) {
+ int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
+ auth);
+ if (ret)
+ return ERR_PTR(ret);
+ } else {
+ int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
+ auth);
if (ret)
return ERR_PTR(ret);
}
@@ -3455,7 +3474,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
struct ceph_mds_client *mdsc = s->s_mdsc;
struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
- return ac->ops->verify_authorizer_reply(ac, s->s_auth.authorizer, len);
+ return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len);
}
static int invalidate_authorizer(struct ceph_connection *con)
@@ -3464,12 +3483,32 @@ static int invalidate_authorizer(struct ceph_connection *con)
struct ceph_mds_client *mdsc = s->s_mdsc;
struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
- if (ac->ops->invalidate_authorizer)
- ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
+ ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
}
+static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr, int *skip)
+{
+ struct ceph_msg *msg;
+ int type = (int) le16_to_cpu(hdr->type);
+ int front_len = (int) le32_to_cpu(hdr->front_len);
+
+ if (con->in_msg)
+ return con->in_msg; /* hand back the existing in_msg; *skip not written */
+ /* Otherwise allocate a new message sized from the header front_len. */
+ *skip = 0;
+ msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
+ if (!msg) {
+ pr_err("unable to allocate msg type %d len %d\n",
+ type, front_len);
+ return NULL;
+ }
+
+ return msg;
+}
+
static const struct ceph_connection_operations mds_con_ops = {
.get = con_get,
.put = con_put,
@@ -3478,6 +3517,7 @@ static const struct ceph_connection_operations mds_con_ops = {
.verify_authorizer_reply = verify_authorizer_reply,
.invalidate_authorizer = invalidate_authorizer,
.peer_reset = peer_reset,
+ .alloc_msg = mds_alloc_msg,
};
/* eof */
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 0d3c9240c61b..132b64eeecd4 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -20,7 +20,10 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
{
int n = 0;
int i;
- char r;
+
+ /* special case for one mds */
+ if (1 == m->m_max_mds && m->m_info[0].state > 0)
+ return 0;
/* count */
for (i = 0; i < m->m_max_mds; i++)
@@ -30,8 +33,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
return -1;
/* pick */
- get_random_bytes(&r, 1);
- n = r % n;
+ n = prandom_u32() % n;
i = 0;
for (i = 0; n > 0; i++, n--)
while (m->m_info[i].state <= 0)
@@ -90,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
u32 num_export_targets;
void *pexport_targets = NULL;
struct ceph_timespec laggy_since;
+ struct ceph_mds_info *info;
ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
global_id = ceph_decode_64(p);
@@ -124,24 +127,27 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
i+1, n, global_id, mds, inc,
ceph_pr_addr(&addr.in_addr),
ceph_mds_state_name(state));
- if (mds >= 0 && mds < m->m_max_mds && state > 0) {
- m->m_info[mds].global_id = global_id;
- m->m_info[mds].state = state;
- m->m_info[mds].addr = addr;
- m->m_info[mds].laggy =
- (laggy_since.tv_sec != 0 ||
- laggy_since.tv_nsec != 0);
- m->m_info[mds].num_export_targets = num_export_targets;
- if (num_export_targets) {
- m->m_info[mds].export_targets =
- kcalloc(num_export_targets, sizeof(u32),
- GFP_NOFS);
- for (j = 0; j < num_export_targets; j++)
- m->m_info[mds].export_targets[j] =
- ceph_decode_32(&pexport_targets);
- } else {
- m->m_info[mds].export_targets = NULL;
- }
+
+ if (mds < 0 || mds >= m->m_max_mds || state <= 0)
+ continue;
+
+ info = &m->m_info[mds];
+ info->global_id = global_id;
+ info->state = state;
+ info->addr = addr;
+ info->laggy = (laggy_since.tv_sec != 0 ||
+ laggy_since.tv_nsec != 0);
+ info->num_export_targets = num_export_targets;
+ if (num_export_targets) {
+ info->export_targets = kcalloc(num_export_targets,
+ sizeof(u32), GFP_NOFS);
+ if (info->export_targets == NULL)
+ goto badmem;
+ for (j = 0; j < num_export_targets; j++)
+ info->export_targets[j] =
+ ceph_decode_32(&pexport_targets);
+ } else {
+ info->export_targets = NULL;
}
}
@@ -168,7 +174,7 @@ bad:
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
ceph_mdsmap_destroy(m);
- return ERR_PTR(-EINVAL);
+ return ERR_PTR(err);
}
void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index cbb2f54a3019..f01645a27752 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -332,10 +332,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
err = -ENOMEM;
if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
goto fail;
- snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
+ snapc = ceph_create_snap_context(num, GFP_NOFS);
if (!snapc)
goto fail;
- atomic_set(&snapc->nref, 1);
/* build (reverse sorted) snap vector */
num = 0;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 6ddc0bca56b2..6627b26a800c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
}
err = -EINVAL;
dev_name_end--; /* back up to ':' separator */
- if (*dev_name_end != ':') {
+ if (dev_name_end < dev_name || *dev_name_end != ':') {
pr_err("device name is missing path (no : separator in %s)\n",
dev_name);
goto out;
@@ -479,6 +479,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
CEPH_FEATURE_FLOCK |
CEPH_FEATURE_DIRLAYOUTHASH;
const unsigned required_features = 0;
+ int page_count;
+ size_t size;
int err = -ENOMEM;
fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
@@ -522,8 +524,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
/* set up mempools */
err = -ENOMEM;
- fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
- fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
+ page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT;
+ size = sizeof (struct page *) * (page_count ? page_count : 1);
+ fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
if (!fsc->wb_pagevec_pool)
goto fail_trunc_wq;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c7b309723dcc..cbded572345e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -204,7 +204,6 @@ struct ceph_inode_xattr {
* Ceph dentry state
*/
struct ceph_dentry_info {
- unsigned long flags;
struct ceph_mds_session *lease_session;
u32 lease_gen, lease_shared_gen;
u32 lease_seq;
@@ -215,18 +214,6 @@ struct ceph_dentry_info {
u64 offset;
};
-/*
- * dentry flags
- *
- * The locking for D_COMPLETE is a bit odd:
- * - we can clear it at almost any time (see ceph_d_prune)
- * - it is only meaningful if:
- * - we hold dir inode i_ceph_lock
- * - we hold dir FILE_SHARED caps
- * - the dentry D_COMPLETE is set
- */
-#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */
-
struct ceph_inode_xattrs_info {
/*
* (still encoded) xattr blob. we avoid the overhead of parsing
@@ -257,7 +244,8 @@ struct ceph_inode_info {
u32 i_time_warp_seq;
unsigned i_ceph_flags;
- unsigned long i_release_count;
+ atomic_t i_release_count;
+ atomic_t i_complete_count;
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
@@ -267,7 +255,7 @@ struct ceph_inode_info {
struct timespec i_rctime;
u64 i_rbytes, i_rfiles, i_rsubdirs;
u64 i_files, i_subdirs;
- u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */
+ u64 i_max_offset; /* largest readdir offset, set with complete dir */
struct rb_root i_fragtree;
struct mutex i_fragtree_mutex;
@@ -436,33 +424,31 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
-static inline void ceph_i_clear(struct inode *inode, unsigned mask)
+static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
+ int release_count)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- spin_lock(&ci->i_ceph_lock);
- ci->i_ceph_flags &= ~mask;
- spin_unlock(&ci->i_ceph_lock);
+ atomic_set(&ci->i_complete_count, release_count);
}
-static inline void ceph_i_set(struct inode *inode, unsigned mask)
+static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
+ atomic_inc(&ci->i_release_count);
+}
- spin_lock(&ci->i_ceph_lock);
- ci->i_ceph_flags |= mask;
- spin_unlock(&ci->i_ceph_lock);
+static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
+{
+ return atomic_read(&ci->i_complete_count) ==
+ atomic_read(&ci->i_release_count);
}
-static inline bool ceph_i_test(struct inode *inode, unsigned mask)
+static inline void ceph_dir_clear_complete(struct inode *inode)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- bool r;
+ __ceph_dir_clear_complete(ceph_inode(inode));
+}
- spin_lock(&ci->i_ceph_lock);
- r = (ci->i_ceph_flags & mask) == mask;
- spin_unlock(&ci->i_ceph_lock);
- return r;
+static inline bool ceph_dir_is_complete(struct inode *inode)
+{
+ return __ceph_dir_is_complete(ceph_inode(inode));
}
@@ -489,13 +475,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
}
/*
- * set/clear directory D_COMPLETE flag
- */
-void ceph_dir_set_complete(struct inode *inode);
-void ceph_dir_clear_complete(struct inode *inode);
-bool ceph_dir_test_complete(struct inode *inode);
-
-/*
* caps helpers
*/
static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
@@ -555,7 +534,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
extern void ceph_caps_init(struct ceph_mds_client *mdsc);
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
-extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need);
extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
@@ -584,7 +563,7 @@ struct ceph_file_info {
u64 next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */
struct dentry *dentry; /* next dentry (for dcache readdir) */
- unsigned long dir_release_count;
+ int dir_release_count;
/* used for -o dirstat read() on directory thing */
char *dir_info;
@@ -755,6 +734,8 @@ static inline void ceph_remove_cap(struct ceph_cap *cap)
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
+extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
+ u64 cap_id, u32 migrate_seq, u32 issue_seq);
extern void ceph_queue_caps_release(struct inode *inode);
extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
@@ -841,8 +822,13 @@ extern const struct export_operations ceph_export_ops;
extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
-extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
- int p_locks, int f_locks);
+extern int ceph_encode_locks_to_buffer(struct inode *inode,
+ struct ceph_filelock *flocks,
+ int num_fcntl_locks,
+ int num_flock_locks);
+extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+ struct ceph_pagelist *pagelist,
+ int num_fcntl_locks, int num_flock_locks);
extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
/* debugfs.c */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9b6b2b6dd164..be661d8f532a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -675,17 +675,18 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
if (!ceph_is_valid_xattr(name))
return -ENODATA;
- spin_lock(&ci->i_ceph_lock);
- dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
err = vxattr->getxattr_cb(ci, value, size);
- goto out;
+ return err;
}
+ spin_lock(&ci->i_ceph_lock);
+ dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+ ci->i_xattrs.version, ci->i_xattrs.index_version);
+
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
(ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto get_xattr;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 2906ee276408..603f18a65c12 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -10,6 +10,7 @@ config CIFS
select CRYPTO_ECB
select CRYPTO_DES
select CRYPTO_SHA256
+ select CRYPTO_CMAC
help
This is the client VFS module for the Common Internet File System
(CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 1d36db114772..a3b56544c21b 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -506,11 +506,11 @@ decode_negTokenInit(unsigned char *security_blob, int length,
/* GSSAPI header */
if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, "Error decoding negTokenInit header");
+ cifs_dbg(FYI, "Error decoding negTokenInit header\n");
return 0;
} else if ((cls != ASN1_APL) || (con != ASN1_CON)
|| (tag != ASN1_EOC)) {
- cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d\n", cls, con, tag);
return 0;
}
@@ -531,52 +531,52 @@ decode_negTokenInit(unsigned char *security_blob, int length,
/* SPNEGO OID not present or garbled -- bail out */
if (!rc) {
- cFYI(1, "Error decoding negTokenInit header");
+ cifs_dbg(FYI, "Error decoding negTokenInit header\n");
return 0;
}
/* SPNEGO */
if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, "Error decoding negTokenInit");
+ cifs_dbg(FYI, "Error decoding negTokenInit\n");
return 0;
} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
|| (tag != ASN1_EOC)) {
- cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
- cls, con, tag, end, *end);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n",
+ cls, con, tag, end, *end);
return 0;
}
/* negTokenInit */
if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, "Error decoding negTokenInit");
+ cifs_dbg(FYI, "Error decoding negTokenInit\n");
return 0;
} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
|| (tag != ASN1_SEQ)) {
- cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
- cls, con, tag, end, *end);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n",
+ cls, con, tag, end, *end);
return 0;
}
/* sequence */
if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, "Error decoding 2nd part of negTokenInit");
+ cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n");
return 0;
} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
|| (tag != ASN1_EOC)) {
- cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
- cls, con, tag, end, *end);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n",
+ cls, con, tag, end, *end);
return 0;
}
/* sequence of */
if (asn1_header_decode
(&ctx, &sequence_end, &cls, &con, &tag) == 0) {
- cFYI(1, "Error decoding 2nd part of negTokenInit");
+ cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n");
return 0;
} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
|| (tag != ASN1_SEQ)) {
- cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
- cls, con, tag, end, *end);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n",
+ cls, con, tag, end, *end);
return 0;
}
@@ -584,15 +584,15 @@ decode_negTokenInit(unsigned char *security_blob, int length,
while (!asn1_eoc_decode(&ctx, sequence_end)) {
rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
if (!rc) {
- cFYI(1, "Error decoding negTokenInit hdr exit2");
+ cifs_dbg(FYI, "Error decoding negTokenInit hdr exit2\n");
return 0;
}
if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
- cFYI(1, "OID len = %d oid = 0x%lx 0x%lx "
- "0x%lx 0x%lx", oidlen, *oid,
- *(oid + 1), *(oid + 2), *(oid + 3));
+ cifs_dbg(FYI, "OID len = %d oid = 0x%lx 0x%lx 0x%lx 0x%lx\n",
+ oidlen, *oid, *(oid + 1), *(oid + 2),
+ *(oid + 3));
if (compare_oid(oid, oidlen, MSKRB5_OID,
MSKRB5_OID_LEN))
@@ -610,7 +610,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
kfree(oid);
}
} else {
- cFYI(1, "Should be an oid what is going on?");
+ cifs_dbg(FYI, "Should be an oid what is going on?\n");
}
}
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 282d6de7e410..6c665bf4a27c 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -92,7 +92,7 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
break;
default:
- cERROR(1, "Unknown network family '%d'", sa->sa_family);
+ cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family);
key_len = 0;
break;
}
@@ -152,7 +152,7 @@ static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
sharename = extract_sharename(tcon->treeName);
if (IS_ERR(sharename)) {
- cFYI(1, "%s: couldn't extract sharename", __func__);
+ cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__);
sharename = NULL;
return 0;
}
@@ -302,7 +302,7 @@ static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
pagevec_init(&pvec, 0);
first = 0;
- cFYI(1, "%s: cifs inode 0x%p now uncached", __func__, cifsi);
+ cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi);
for (;;) {
nr_pages = pagevec_lookup(&pvec,
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index d9ea6ede6a7a..f3ac4154cbb6 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -57,15 +57,32 @@ cifs_dump_mem(char *label, void *data, int length)
}
}
+#ifdef CONFIG_CIFS_DEBUG
+void cifs_vfs_err(const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ /* Bundle fmt + args so one printk emits the prefix and the message. */
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_ERR "CIFS VFS: %pV", &vaf); /* no newline is appended here */
+
+ va_end(args);
+}
+#endif /* CONFIG_CIFS_DEBUG */
+
void cifs_dump_detail(void *buf)
{
#ifdef CONFIG_CIFS_DEBUG2
struct smb_hdr *smb = (struct smb_hdr *)buf;
- cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
- smb->Command, smb->Status.CifsError,
- smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
- cERROR(1, "smb buf %p len %u", smb, smbCalcSize(smb));
+ cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n",
+ smb->Command, smb->Status.CifsError,
+ smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
+ cifs_dbg(VFS, "smb buf %p len %u\n", smb, smbCalcSize(smb));
#endif /* CONFIG_CIFS_DEBUG2 */
}
@@ -78,25 +95,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
if (server == NULL)
return;
- cERROR(1, "Dump pending requests:");
+ cifs_dbg(VFS, "Dump pending requests:\n");
spin_lock(&GlobalMid_Lock);
list_for_each(tmp, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu",
- mid_entry->mid_state,
- le16_to_cpu(mid_entry->command),
- mid_entry->pid,
- mid_entry->callback_data,
- mid_entry->mid);
+ cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n",
+ mid_entry->mid_state,
+ le16_to_cpu(mid_entry->command),
+ mid_entry->pid,
+ mid_entry->callback_data,
+ mid_entry->mid);
#ifdef CONFIG_CIFS_STATS2
- cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
- mid_entry->large_buf,
- mid_entry->resp_buf,
- mid_entry->when_received,
- jiffies);
+ cifs_dbg(VFS, "IsLarge: %d buf: %p time rcv: %ld now: %ld\n",
+ mid_entry->large_buf,
+ mid_entry->resp_buf,
+ mid_entry->when_received,
+ jiffies);
#endif /* STATS2 */
- cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp,
- mid_entry->multiEnd);
+ cifs_dbg(VFS, "IsMult: %d IsEnd: %d\n",
+ mid_entry->multiRsp, mid_entry->multiEnd);
if (mid_entry->resp_buf) {
cifs_dump_detail(mid_entry->resp_buf);
cifs_dump_mem("existing buf: ",
@@ -196,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
tcon->nativeFileSystem);
}
seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
- "\nPathComponentMax: %d Status: 0x%d",
+ "\n\tPathComponentMax: %d Status: 0x%d",
le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
le32_to_cpu(tcon->fsAttrInfo.Attributes),
le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
@@ -207,6 +224,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_puts(m, " type: CDROM ");
else
seq_printf(m, " type: %d ", dev_type);
+ if (server->ops->dump_share_caps)
+ server->ops->dump_share_caps(m, tcon);
if (tcon->need_reconnect)
seq_puts(m, "\tDISCONNECTED ");
@@ -578,9 +597,36 @@ static int cifs_security_flags_proc_open(struct inode *inode, struct file *file)
return single_open(file, cifs_security_flags_proc_show, NULL);
}
+/*
+ * If a MUST auth flag is fully set, overwrite *flags with just that MUST value
+ * so unrelated MAY flags are dropped; otherwise leave *flags alone. The chain
+ * below tries the more secure mechanisms first. MUST_SIGN bits are preserved.
+ */
+static void
+cifs_security_flags_handle_must_flags(unsigned int *flags)
+{
+ unsigned int signflags = *flags & CIFSSEC_MUST_SIGN;
+
+ if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
+ *flags = CIFSSEC_MUST_KRB5;
+ else if ((*flags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
+ *flags = CIFSSEC_MUST_NTLMSSP;
+ else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
+ *flags = CIFSSEC_MUST_NTLMV2;
+ else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
+ *flags = CIFSSEC_MUST_NTLM;
+ else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
+ *flags = CIFSSEC_MUST_LANMAN;
+ else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
+ *flags = CIFSSEC_MUST_PLNTXT;
+ /* Put back the signing requirement captured before the overwrite. */
+ *flags |= signflags;
+}
+
static ssize_t cifs_security_flags_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
+ int rc;
unsigned int flags;
char flags_string[12];
char c;
@@ -603,34 +649,43 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
global_secflags = CIFSSEC_MAX;
return count;
} else if (!isdigit(c)) {
- cERROR(1, "invalid flag %c", c);
+ cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
+ flags_string);
return -EINVAL;
}
}
- /* else we have a number */
- flags = simple_strtoul(flags_string, NULL, 0);
+ /* else we have a number */
+ rc = kstrtouint(flags_string, 0, &flags);
+ if (rc) {
+ cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
+ flags_string);
+ return rc;
+ }
- cFYI(1, "sec flags 0x%x", flags);
+ cifs_dbg(FYI, "sec flags 0x%x\n", flags);
- if (flags <= 0) {
- cERROR(1, "invalid security flags %s", flags_string);
+ if (flags == 0) {
+ cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string);
return -EINVAL;
}
if (flags & ~CIFSSEC_MASK) {
- cERROR(1, "attempt to set unsupported security flags 0x%x",
- flags & ~CIFSSEC_MASK);
+ cifs_dbg(VFS, "Unsupported security flags: 0x%x\n",
+ flags & ~CIFSSEC_MASK);
return -EINVAL;
}
+
+ cifs_security_flags_handle_must_flags(&flags);
+
/* flags look ok - update the global security flags for cifs module */
global_secflags = flags;
if (global_secflags & CIFSSEC_MUST_SIGN) {
/* requiring signing implies signing is allowed */
global_secflags |= CIFSSEC_MAY_SIGN;
- cFYI(1, "packet signing now required");
+ cifs_dbg(FYI, "packet signing now required\n");
} else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) {
- cFYI(1, "packet signing disabled");
+ cifs_dbg(FYI, "packet signing disabled\n");
}
/* BB should we turn on MAY flags for other MUST options? */
return count;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 69ae3d3c3b31..c99b40fb609b 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -25,18 +25,20 @@
void cifs_dump_mem(char *label, void *data, int length);
void cifs_dump_detail(void *);
void cifs_dump_mids(struct TCP_Server_Info *);
-#ifdef CONFIG_CIFS_DEBUG2
-#define DBG2 2
-#else
-#define DBG2 0
-#endif
extern int traceSMB; /* flag which enables the function below */
void dump_smb(void *, int);
#define CIFS_INFO 0x01
#define CIFS_RC 0x02
#define CIFS_TIMER 0x04
+#define VFS 1
+#define FYI 2
extern int cifsFYI;
+#ifdef CONFIG_CIFS_DEBUG2
+#define NOISY 4
+#else
+#define NOISY 0
+#endif
/*
* debug ON
@@ -44,31 +46,21 @@ extern int cifsFYI;
*/
#ifdef CONFIG_CIFS_DEBUG
-/* information message: e.g., configuration, major event */
-#define cifsfyi(fmt, ...) \
-do { \
- if (cifsFYI & CIFS_INFO) \
- printk(KERN_DEBUG "%s: " fmt "\n", \
- __FILE__, ##__VA_ARGS__); \
-} while (0)
-
-#define cFYI(set, fmt, ...) \
-do { \
- if (set) \
- cifsfyi(fmt, ##__VA_ARGS__); \
-} while (0)
+__printf(1, 2) void cifs_vfs_err(const char *fmt, ...);
-#define cifswarn(fmt, ...) \
- printk(KERN_WARNING fmt "\n", ##__VA_ARGS__)
-
-/* error event message: e.g., i/o error */
-#define cifserror(fmt, ...) \
- printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
-
-#define cERROR(set, fmt, ...) \
+/* information message: e.g., configuration, major event */
+#define cifs_dbg(type, fmt, ...) \
do { \
- if (set) \
- cifserror(fmt, ##__VA_ARGS__); \
+ if (type == FYI) { \
+ if (cifsFYI & CIFS_INFO) { \
+ printk(KERN_DEBUG "%s: " fmt, \
+ __FILE__, ##__VA_ARGS__); \
+ } \
+ } else if (type == VFS) { \
+ cifs_vfs_err(fmt, ##__VA_ARGS__); \
+ } else if (type == NOISY && type != 0) { \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+ } \
} while (0)
/*
@@ -76,27 +68,11 @@ do { \
* ---------
*/
#else /* _CIFS_DEBUG */
-#define cifsfyi(fmt, ...) \
+#define cifs_dbg(type, fmt, ...) \
do { \
if (0) \
- printk(KERN_DEBUG "%s: " fmt "\n", \
- __FILE__, ##__VA_ARGS__); \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
} while (0)
-#define cFYI(set, fmt, ...) \
-do { \
- if (0 && set) \
- cifsfyi(fmt, ##__VA_ARGS__); \
-} while (0)
-#define cifserror(fmt, ...) \
-do { \
- if (0) \
- printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
-} while (0)
-#define cERROR(set, fmt, ...) \
-do { \
- if (0 && set) \
- cifserror(fmt, ##__VA_ARGS__); \
-} while (0)
-#endif /* _CIFS_DEBUG */
+#endif
#endif /* _H_CIFS_DEBUG */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 210fce2df308..58df174deb10 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/vfs.h>
#include <linux/fs.h>
+#include <linux/inet.h>
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifsfs.h"
@@ -48,58 +49,74 @@ void cifs_dfs_release_automount_timer(void)
}
/**
- * cifs_get_share_name - extracts share name from UNC
- * @node_name: pointer to UNC string
+ * cifs_build_devname - build a devicename from a UNC and optional prepath
+ * @nodename: pointer to UNC string
+ * @prepath: pointer to prefixpath (or NULL if there isn't one)
*
- * Extracts sharename form full UNC.
- * i.e. strips from UNC trailing path that is not part of share
- * name and fixup missing '\' in the beginning of DFS node refferal
- * if necessary.
- * Returns pointer to share name on success or ERR_PTR on error.
- * Caller is responsible for freeing returned string.
+ * Build a new cifs devicename after chasing a DFS referral. Allocate a buffer
+ * big enough to hold the final thing. Copy the UNC from the nodename, and
+ * concatenate the prepath onto the end of it if there is one.
+ *
+ * Returns pointer to the built string, or a ERR_PTR. Caller is responsible
+ * for freeing the returned string.
*/
-static char *cifs_get_share_name(const char *node_name)
+static char *
+cifs_build_devname(char *nodename, const char *prepath)
{
- int len;
- char *UNC;
- char *pSep;
-
- len = strlen(node_name);
- UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
- GFP_KERNEL);
- if (!UNC)
- return ERR_PTR(-ENOMEM);
+ size_t pplen;
+ size_t unclen;
+ char *dev;
+ char *pos;
+
+ /* skip over any preceding delimiters */
+ nodename += strspn(nodename, "\\");
+ if (!*nodename)
+ return ERR_PTR(-EINVAL);
- /* get share name and server name */
- if (node_name[1] != '\\') {
- UNC[0] = '\\';
- strncpy(UNC+1, node_name, len);
- len++;
- UNC[len] = 0;
- } else {
- strncpy(UNC, node_name, len);
- UNC[len] = 0;
- }
+ /* get length of UNC and set pos to last char */
+ unclen = strlen(nodename);
+ pos = nodename + unclen - 1;
- /* find server name end */
- pSep = memchr(UNC+2, '\\', len-2);
- if (!pSep) {
- cERROR(1, "%s: no server name end in node name: %s",
- __func__, node_name);
- kfree(UNC);
- return ERR_PTR(-EINVAL);
+ /* trim off any trailing delimiters */
+ while (*pos == '\\') {
+ --pos;
+ --unclen;
}
- /* find sharename end */
- pSep++;
- pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC));
- if (pSep) {
- /* trim path up to sharename end
- * now we have share name in UNC */
- *pSep = 0;
+ /* allocate a buffer:
+ * +2 for preceding "//"
+ * +1 for delimiter between UNC and prepath
+ * +1 for trailing NULL
+ */
+ pplen = prepath ? strlen(prepath) : 0;
+ dev = kmalloc(2 + unclen + 1 + pplen + 1, GFP_KERNEL);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ pos = dev;
+ /* add the initial "//" */
+ *pos = '/';
+ ++pos;
+ *pos = '/';
+ ++pos;
+
+ /* copy in the UNC portion from referral */
+ memcpy(pos, nodename, unclen);
+ pos += unclen;
+
+ /* copy the prefixpath remainder (if there is one) */
+ if (pplen) {
+ *pos = '/';
+ ++pos;
+ memcpy(pos, prepath, pplen);
+ pos += pplen;
}
- return UNC;
+ /* NULL terminator */
+ *pos = '\0';
+
+ convert_delimiter(dev, '/');
+ return dev;
}
@@ -123,6 +140,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
{
int rc;
char *mountdata = NULL;
+ const char *prepath = NULL;
int md_len;
char *tkn_e;
char *srvIP = NULL;
@@ -132,7 +150,10 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
if (sb_mountdata == NULL)
return ERR_PTR(-EINVAL);
- *devname = cifs_get_share_name(ref->node_name);
+ if (strlen(fullpath) - ref->path_consumed)
+ prepath = fullpath + ref->path_consumed;
+
+ *devname = cifs_build_devname(ref->node_name, prepath);
if (IS_ERR(*devname)) {
rc = PTR_ERR(*devname);
*devname = NULL;
@@ -141,17 +162,19 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
if (rc < 0) {
- cFYI(1, "%s: Failed to resolve server part of %s to IP: %d",
- __func__, *devname, rc);
+ cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n",
+ __func__, *devname, rc);
goto compose_mount_options_err;
}
- /* md_len = strlen(...) + 12 for 'sep+prefixpath='
- * assuming that we have 'unc=' and 'ip=' in
- * the original sb_mountdata
+ /*
+ * In most cases, we'll be building a shorter string than the original,
+ * but we do have to assume that the address in the ip= option may be
+ * much longer than the original. Add the max length of an address
+ * string to the length of the original string to allow for worst case.
*/
- md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12;
- mountdata = kzalloc(md_len+1, GFP_KERNEL);
+ md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;
+ mountdata = kzalloc(md_len + 1, GFP_KERNEL);
if (mountdata == NULL) {
rc = -ENOMEM;
goto compose_mount_options_err;
@@ -195,29 +218,9 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
strncat(mountdata, &sep, 1);
strcat(mountdata, "ip=");
strcat(mountdata, srvIP);
- strncat(mountdata, &sep, 1);
- strcat(mountdata, "unc=");
- strcat(mountdata, *devname);
-
- /* find & copy prefixpath */
- tkn_e = strchr(ref->node_name + 2, '\\');
- if (tkn_e == NULL) {
- /* invalid unc, missing share name*/
- rc = -EINVAL;
- goto compose_mount_options_err;
- }
-
- tkn_e = strchr(tkn_e + 1, '\\');
- if (tkn_e || (strlen(fullpath) - ref->path_consumed)) {
- strncat(mountdata, &sep, 1);
- strcat(mountdata, "prefixpath=");
- if (tkn_e)
- strcat(mountdata, tkn_e + 1);
- strcat(mountdata, fullpath + ref->path_consumed);
- }
- /*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/
- /*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/
+ /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
+ /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
compose_mount_options_out:
kfree(srvIP);
@@ -260,11 +263,12 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
static void dump_referral(const struct dfs_info3_param *ref)
{
- cFYI(1, "DFS: ref path: %s", ref->path_name);
- cFYI(1, "DFS: node path: %s", ref->node_name);
- cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type);
- cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
- ref->path_consumed);
+ cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name);
+ cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name);
+ cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n",
+ ref->flags, ref->server_type);
+ cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n",
+ ref->ref_flag, ref->path_consumed);
}
/*
@@ -283,7 +287,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
struct vfsmount *mnt;
struct tcon_link *tlink;
- cFYI(1, "in %s", __func__);
+ cifs_dbg(FYI, "in %s\n", __func__);
BUG_ON(IS_ROOT(mntpt));
/*
@@ -320,15 +324,15 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
/* connect to a node */
len = strlen(referrals[i].node_name);
if (len < 2) {
- cERROR(1, "%s: Net Address path too short: %s",
- __func__, referrals[i].node_name);
+ cifs_dbg(VFS, "%s: Net Address path too short: %s\n",
+ __func__, referrals[i].node_name);
mnt = ERR_PTR(-EINVAL);
break;
}
mnt = cifs_dfs_do_refmount(cifs_sb,
full_path, referrals + i);
- cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
- referrals[i].node_name, mnt);
+ cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
+ __func__, referrals[i].node_name, mnt);
if (!IS_ERR(mnt))
goto success;
}
@@ -343,7 +347,7 @@ success:
free_full_path:
kfree(full_path);
cdda_exit:
- cFYI(1, "leaving %s" , __func__);
+ cifs_dbg(FYI, "leaving %s\n" , __func__);
return mnt;
}
@@ -354,11 +358,11 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path)
{
struct vfsmount *newmnt;
- cFYI(1, "in %s", __func__);
+ cifs_dbg(FYI, "in %s\n", __func__);
newmnt = cifs_dfs_do_automount(path->dentry);
if (IS_ERR(newmnt)) {
- cFYI(1, "leaving %s [automount failed]" , __func__);
+ cifs_dbg(FYI, "leaving %s [automount failed]\n" , __func__);
return newmnt;
}
@@ -366,7 +370,7 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path)
mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
schedule_delayed_work(&cifs_dfs_automount_task,
cifs_dfs_mountpoint_expiry_timeout);
- cFYI(1, "leaving %s [ok]" , __func__);
+ cifs_dbg(FYI, "leaving %s [ok]\n" , __func__);
return newmnt;
}
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 10e774761299..a3e932547617 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -37,12 +37,11 @@ cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
int ret;
ret = -ENOMEM;
- payload = kmalloc(prep->datalen, GFP_KERNEL);
+ payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
if (!payload)
goto error;
/* attach the data */
- memcpy(payload, prep->data, prep->datalen);
key->payload.data = payload;
ret = 0;
@@ -164,7 +163,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
dp = description + strlen(description);
sprintf(dp, ";pid=0x%x", current->pid);
- cFYI(1, "key description = %s", description);
+ cifs_dbg(FYI, "key description = %s\n", description);
spnego_key = request_key(&cifs_spnego_key_type, description, "");
#ifdef CONFIG_CIFS_DEBUG2
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 71d5d0a5f6b2..0227b45ef00a 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -227,8 +227,8 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
charlen = codepage->char2uni(from, len, &wchar_to);
if (charlen < 1) {
- cERROR(1, "strtoUTF16: char2uni of 0x%x returned %d",
- *from, charlen);
+ cifs_dbg(VFS, "strtoUTF16: char2uni of 0x%x returned %d\n",
+ *from, charlen);
/* A question mark */
wchar_to = 0x003f;
charlen = 1;
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 4fb097468e21..fe8d6276410a 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -327,14 +327,14 @@ UniToupper(register wchar_t uc)
/*
* UniStrupr: Upper case a unicode string
*/
-static inline wchar_t *
-UniStrupr(register wchar_t *upin)
+static inline __le16 *
+UniStrupr(register __le16 *upin)
{
- register wchar_t *up;
+ register __le16 *up;
up = upin;
while (*up) { /* For all characters */
- *up = UniToupper(*up);
+ *up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
up++;
}
return upin; /* Return input pointer */
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index f1e3f25fe004..51f5e0ee7237 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -63,11 +63,10 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
key->datalen = prep->datalen;
return 0;
}
- payload = kmalloc(prep->datalen, GFP_KERNEL);
+ payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
if (!payload)
return -ENOMEM;
- memcpy(payload, prep->data, prep->datalen);
key->payload.data = payload;
key->datalen = prep->datalen;
return 0;
@@ -219,13 +218,13 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
sidkey = request_key(&cifs_idmap_key_type, desc, "");
if (IS_ERR(sidkey)) {
rc = -EINVAL;
- cFYI(1, "%s: Can't map %cid %u to a SID", __func__,
- sidtype == SIDOWNER ? 'u' : 'g', cid);
+ cifs_dbg(FYI, "%s: Can't map %cid %u to a SID\n",
+ __func__, sidtype == SIDOWNER ? 'u' : 'g', cid);
goto out_revert_creds;
} else if (sidkey->datalen < CIFS_SID_BASE_SIZE) {
rc = -EIO;
- cFYI(1, "%s: Downcall contained malformed key "
- "(datalen=%hu)", __func__, sidkey->datalen);
+ cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n",
+ __func__, sidkey->datalen);
goto invalidate_key;
}
@@ -241,8 +240,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
if (ksid_size > sidkey->datalen) {
rc = -EIO;
- cFYI(1, "%s: Downcall contained malformed key (datalen=%hu, "
- "ksid_size=%u)", __func__, sidkey->datalen, ksid_size);
+ cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu, ksid_size=%u)\n",
+ __func__, sidkey->datalen, ksid_size);
goto invalidate_key;
}
@@ -274,8 +273,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
* Just return an error.
*/
if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) {
- cFYI(1, "%s: %u subauthorities is too many!", __func__,
- psid->num_subauth);
+ cifs_dbg(FYI, "%s: %u subauthorities is too many!\n",
+ __func__, psid->num_subauth);
return -EIO;
}
@@ -287,8 +286,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
sidkey = request_key(&cifs_idmap_key_type, sidstr, "");
if (IS_ERR(sidkey)) {
rc = -EINVAL;
- cFYI(1, "%s: Can't map SID %s to a %cid", __func__, sidstr,
- sidtype == SIDOWNER ? 'u' : 'g');
+ cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n",
+ __func__, sidstr, sidtype == SIDOWNER ? 'u' : 'g');
goto out_revert_creds;
}
@@ -300,8 +299,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
if (sidkey->datalen != sizeof(uid_t)) {
rc = -EIO;
- cFYI(1, "%s: Downcall contained malformed key "
- "(datalen=%hu)", __func__, sidkey->datalen);
+ cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n",
+ __func__, sidkey->datalen);
key_invalidate(sidkey);
goto out_key_put;
}
@@ -346,7 +345,8 @@ init_cifs_idmap(void)
struct key *keyring;
int ret;
- cFYI(1, "Registering the %s key type", cifs_idmap_key_type.name);
+ cifs_dbg(FYI, "Registering the %s key type\n",
+ cifs_idmap_key_type.name);
/* create an override credential set with a special thread keyring in
* which requests are cached
@@ -379,7 +379,7 @@ init_cifs_idmap(void)
cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
root_cred = cred;
- cFYI(1, "cifs idmap keyring: %d", key_serial(keyring));
+ cifs_dbg(FYI, "cifs idmap keyring: %d\n", key_serial(keyring));
return 0;
failed_put_key:
@@ -395,7 +395,7 @@ exit_cifs_idmap(void)
key_revoke(root_cred->thread_keyring);
unregister_key_type(&cifs_idmap_key_type);
put_cred(root_cred);
- cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name);
+ cifs_dbg(FYI, "Unregistered %s key type\n", cifs_idmap_key_type.name);
}
/* copy ntsd, owner sid, and group sid from a security descriptor to another */
@@ -462,14 +462,14 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
*pbits_to_set &= ~S_IXUGO;
return;
} else if (type != ACCESS_ALLOWED) {
- cERROR(1, "unknown access control type %d", type);
+ cifs_dbg(VFS, "unknown access control type %d\n", type);
return;
}
/* else ACCESS_ALLOWED type */
if (flags & GENERIC_ALL) {
*pmode |= (S_IRWXUGO & (*pbits_to_set));
- cFYI(DBG2, "all perms");
+ cifs_dbg(NOISY, "all perms\n");
return;
}
if ((flags & GENERIC_WRITE) ||
@@ -482,7 +482,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
*pmode |= (S_IXUGO & (*pbits_to_set));
- cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode);
+ cifs_dbg(NOISY, "access flags 0x%x mode now 0x%x\n", flags, *pmode);
return;
}
@@ -511,7 +511,8 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
if (mode & S_IXUGO)
*pace_flags |= SET_FILE_EXEC_RIGHTS;
- cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags);
+ cifs_dbg(NOISY, "mode: 0x%x, access flags now 0x%x\n",
+ mode, *pace_flags);
return;
}
@@ -551,24 +552,24 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
/* validate that we do not go past end of acl */
if (le16_to_cpu(pace->size) < 16) {
- cERROR(1, "ACE too small %d", le16_to_cpu(pace->size));
+ cifs_dbg(VFS, "ACE too small %d\n", le16_to_cpu(pace->size));
return;
}
if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
- cERROR(1, "ACL too small to parse ACE");
+ cifs_dbg(VFS, "ACL too small to parse ACE\n");
return;
}
num_subauth = pace->sid.num_subauth;
if (num_subauth) {
int i;
- cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d",
- pace->sid.revision, pace->sid.num_subauth, pace->type,
- pace->flags, le16_to_cpu(pace->size));
+ cifs_dbg(FYI, "ACE revision %d num_auth %d type %d flags %d size %d\n",
+ pace->sid.revision, pace->sid.num_subauth, pace->type,
+ pace->flags, le16_to_cpu(pace->size));
for (i = 0; i < num_subauth; ++i) {
- cFYI(1, "ACE sub_auth[%d]: 0x%x", i,
- le32_to_cpu(pace->sid.sub_auth[i]));
+ cifs_dbg(FYI, "ACE sub_auth[%d]: 0x%x\n",
+ i, le32_to_cpu(pace->sid.sub_auth[i]));
}
/* BB add length check to make sure that we do not have huge
@@ -601,13 +602,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
/* validate that we do not go past end of acl */
if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
- cERROR(1, "ACL too small to parse DACL");
+ cifs_dbg(VFS, "ACL too small to parse DACL\n");
return;
}
- cFYI(DBG2, "DACL revision %d size %d num aces %d",
- le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
- le32_to_cpu(pdacl->num_aces));
+ cifs_dbg(NOISY, "DACL revision %d size %d num aces %d\n",
+ le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
+ le32_to_cpu(pdacl->num_aces));
/* reset rwx permissions for user/group/other.
Also, if num_aces is 0 i.e. DACL has no ACEs,
@@ -627,10 +628,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
return;
ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
GFP_KERNEL);
- if (!ppace) {
- cERROR(1, "DACL memory allocation error");
+ if (!ppace)
return;
- }
for (i = 0; i < num_aces; ++i) {
ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
@@ -703,25 +702,25 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
/* validate that we do not go past end of ACL - sid must be at least 8
bytes long (assuming no sub-auths - e.g. the null SID */
if (end_of_acl < (char *)psid + 8) {
- cERROR(1, "ACL too small to parse SID %p", psid);
+ cifs_dbg(VFS, "ACL too small to parse SID %p\n", psid);
return -EINVAL;
}
#ifdef CONFIG_CIFS_DEBUG2
if (psid->num_subauth) {
int i;
- cFYI(1, "SID revision %d num_auth %d",
- psid->revision, psid->num_subauth);
+ cifs_dbg(FYI, "SID revision %d num_auth %d\n",
+ psid->revision, psid->num_subauth);
for (i = 0; i < psid->num_subauth; i++) {
- cFYI(1, "SID sub_auth[%d]: 0x%x ", i,
- le32_to_cpu(psid->sub_auth[i]));
+ cifs_dbg(FYI, "SID sub_auth[%d]: 0x%x\n",
+ i, le32_to_cpu(psid->sub_auth[i]));
}
/* BB add length check to make sure that we do not have huge
num auths and therefore go off the end */
- cFYI(1, "RID 0x%x",
- le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
+ cifs_dbg(FYI, "RID 0x%x\n",
+ le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
}
#endif
@@ -748,31 +747,33 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
le32_to_cpu(pntsd->gsidoffset));
dacloffset = le32_to_cpu(pntsd->dacloffset);
dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
- cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x "
- "sacloffset 0x%x dacloffset 0x%x",
+ cifs_dbg(NOISY, "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n",
pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
le32_to_cpu(pntsd->gsidoffset),
le32_to_cpu(pntsd->sacloffset), dacloffset);
/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
rc = parse_sid(owner_sid_ptr, end_of_acl);
if (rc) {
- cFYI(1, "%s: Error %d parsing Owner SID", __func__, rc);
+ cifs_dbg(FYI, "%s: Error %d parsing Owner SID\n", __func__, rc);
return rc;
}
rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER);
if (rc) {
- cFYI(1, "%s: Error %d mapping Owner SID to uid", __func__, rc);
+ cifs_dbg(FYI, "%s: Error %d mapping Owner SID to uid\n",
+ __func__, rc);
return rc;
}
rc = parse_sid(group_sid_ptr, end_of_acl);
if (rc) {
- cFYI(1, "%s: Error %d mapping Owner SID to gid", __func__, rc);
+ cifs_dbg(FYI, "%s: Error %d mapping Owner SID to gid\n",
+ __func__, rc);
return rc;
}
rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP);
if (rc) {
- cFYI(1, "%s: Error %d mapping Group SID to gid", __func__, rc);
+ cifs_dbg(FYI, "%s: Error %d mapping Group SID to gid\n",
+ __func__, rc);
return rc;
}
@@ -780,7 +781,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
group_sid_ptr, fattr);
else
- cFYI(1, "no ACL"); /* BB grant all or default perms? */
+ cifs_dbg(FYI, "no ACL\n"); /* BB grant all or default perms? */
return rc;
}
@@ -830,8 +831,8 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
id = from_kuid(&init_user_ns, uid);
rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr);
if (rc) {
- cFYI(1, "%s: Mapping error %d for owner id %d",
- __func__, rc, id);
+ cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n",
+ __func__, rc, id);
kfree(nowner_sid_ptr);
return rc;
}
@@ -850,8 +851,8 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
id = from_kgid(&init_user_ns, gid);
rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr);
if (rc) {
- cFYI(1, "%s: Mapping error %d for group id %d",
- __func__, rc, id);
+ cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n",
+ __func__, rc, id);
kfree(ngroup_sid_ptr);
return rc;
}
@@ -881,7 +882,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
cifs_put_tlink(tlink);
- cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
+ cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen);
if (rc)
return ERR_PTR(rc);
return pntsd;
@@ -918,7 +919,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
cifs_put_tlink(tlink);
free_xid(xid);
- cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
+ cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen);
if (rc)
return ERR_PTR(rc);
return pntsd;
@@ -972,12 +973,12 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
if (rc) {
- cERROR(1, "Unable to open file to set ACL");
+ cifs_dbg(VFS, "Unable to open file to set ACL\n");
goto out;
}
rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag);
- cFYI(DBG2, "SetCIFSACL rc = %d", rc);
+ cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc);
CIFSSMBClose(xid, tcon, fid);
out:
@@ -995,7 +996,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
u32 acllen = 0;
int rc = 0;
- cFYI(DBG2, "converting ACL to mode for %s", path);
+ cifs_dbg(NOISY, "converting ACL to mode for %s\n", path);
if (pfid)
pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
@@ -1005,12 +1006,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
/* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
if (IS_ERR(pntsd)) {
rc = PTR_ERR(pntsd);
- cERROR(1, "%s: error %d getting sec desc", __func__, rc);
+ cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
} else {
rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr);
kfree(pntsd);
if (rc)
- cERROR(1, "parse sec desc failed rc = %d", rc);
+ cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc);
}
return rc;
@@ -1027,13 +1028,13 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
- cFYI(DBG2, "set ACL from mode for %s", path);
+ cifs_dbg(NOISY, "set ACL from mode for %s\n", path);
/* Get the security descriptor */
pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
if (IS_ERR(pntsd)) {
rc = PTR_ERR(pntsd);
- cERROR(1, "%s: error %d getting sec desc", __func__, rc);
+ cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
goto out;
}
@@ -1046,7 +1047,6 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN);
pnntsd = kmalloc(secdesclen, GFP_KERNEL);
if (!pnntsd) {
- cERROR(1, "Unable to allocate security descriptor");
kfree(pntsd);
return -ENOMEM;
}
@@ -1054,12 +1054,12 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
&aclflag);
- cFYI(DBG2, "build_sec_desc rc: %d", rc);
+ cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc);
if (!rc) {
/* Set the security descriptor */
rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag);
- cFYI(DBG2, "set_cifs_acl rc: %d", rc);
+ cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc);
}
kfree(pnntsd);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 652f5051be09..3d8bf941d126 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -50,20 +50,20 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
return -EINVAL;
if (!server->secmech.sdescmd5) {
- cERROR(1, "%s: Can't generate signature", __func__);
+ cifs_dbg(VFS, "%s: Can't generate signature\n", __func__);
return -1;
}
rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
if (rc) {
- cERROR(1, "%s: Could not init md5", __func__);
+ cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
return rc;
}
rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
server->session_key.response, server->session_key.len);
if (rc) {
- cERROR(1, "%s: Could not update with response", __func__);
+ cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
return rc;
}
@@ -71,7 +71,7 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
if (iov[i].iov_len == 0)
continue;
if (iov[i].iov_base == NULL) {
- cERROR(1, "null iovec entry");
+ cifs_dbg(VFS, "null iovec entry\n");
return -EIO;
}
/* The first entry includes a length field (which does not get
@@ -88,8 +88,8 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
iov[i].iov_base, iov[i].iov_len);
}
if (rc) {
- cERROR(1, "%s: Could not update with payload",
- __func__);
+ cifs_dbg(VFS, "%s: Could not update with payload\n",
+ __func__);
return rc;
}
}
@@ -106,7 +106,7 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
if (rc)
- cERROR(1, "%s: Could not generate md5 hash", __func__);
+ cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
return rc;
}
@@ -135,8 +135,8 @@ int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
cpu_to_le32(server->sequence_number);
cifs_pdu->Signature.Sequence.Reserved = 0;
- *pexpected_response_sequence_number = server->sequence_number++;
- server->sequence_number++;
+ *pexpected_response_sequence_number = ++server->sequence_number;
+ ++server->sequence_number;
rc = cifs_calc_signature(rqst, server, smb_signature);
if (rc)
@@ -196,8 +196,8 @@ int cifs_verify_signature(struct smb_rqst *rqst,
/* Do not need to verify session setups with signature "BSRSPYL " */
if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0)
- cFYI(1, "dummy signature received for smb command 0x%x",
- cifs_pdu->Command);
+ cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n",
+ cifs_pdu->Command);
/* save off the origiginal signature so we can modify the smb and check
its signature against what the server sent */
@@ -235,30 +235,30 @@ int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp)
return -EINVAL;
ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
- if (!ses->auth_key.response) {
- cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len);
+ if (!ses->auth_key.response)
return -ENOMEM;
- }
+
ses->auth_key.len = temp_len;
rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp);
if (rc) {
- cFYI(1, "%s Can't generate NTLM response, error: %d",
- __func__, rc);
+ cifs_dbg(FYI, "%s Can't generate NTLM response, error: %d\n",
+ __func__, rc);
return rc;
}
rc = E_md4hash(ses->password, temp_key, nls_cp);
if (rc) {
- cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
+ cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n",
+ __func__, rc);
return rc;
}
rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
if (rc)
- cFYI(1, "%s Can't generate NTLM session key, error: %d",
- __func__, rc);
+ cifs_dbg(FYI, "%s Can't generate NTLM session key, error: %d\n",
+ __func__, rc);
return rc;
}
@@ -276,7 +276,6 @@ int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
- memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
memcpy(lnm_session_key, password_with_pad,
CIFS_ENCPWD_SIZE);
return 0;
@@ -334,7 +333,6 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
if (!ses->auth_key.response) {
ses->auth_key.len = 0;
- cERROR(1, "Challenge target info allocation failure");
return -ENOMEM;
}
@@ -415,12 +413,12 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
int rc = 0;
int len;
char nt_hash[CIFS_NTHASH_SIZE];
- wchar_t *user;
+ __le16 *user;
wchar_t *domain;
wchar_t *server;
if (!ses->server->secmech.sdeschmacmd5) {
- cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash");
+ cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
return -1;
}
@@ -430,27 +428,26 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
CIFS_NTHASH_SIZE);
if (rc) {
- cERROR(1, "%s: Could not set NT Hash as a key", __func__);
+ cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__);
return rc;
}
rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
if (rc) {
- cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5");
+ cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__);
return rc;
}
- /* convert ses->user_name to unicode and uppercase */
+ /* convert ses->user_name to unicode */
len = ses->user_name ? strlen(ses->user_name) : 0;
user = kmalloc(2 + (len * 2), GFP_KERNEL);
if (user == NULL) {
- cERROR(1, "calc_ntlmv2_hash: user mem alloc failure");
rc = -ENOMEM;
return rc;
}
if (len) {
- len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp);
+ len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
UniStrupr(user);
} else {
memset(user, '\0', 2);
@@ -460,7 +457,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
(char *)user, 2 * len);
kfree(user);
if (rc) {
- cERROR(1, "%s: Could not update with user", __func__);
+ cifs_dbg(VFS, "%s: Could not update with user\n", __func__);
return rc;
}
@@ -470,7 +467,6 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
domain = kmalloc(2 + (len * 2), GFP_KERNEL);
if (domain == NULL) {
- cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
rc = -ENOMEM;
return rc;
}
@@ -481,8 +477,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
(char *)domain, 2 * len);
kfree(domain);
if (rc) {
- cERROR(1, "%s: Could not update with domain",
- __func__);
+ cifs_dbg(VFS, "%s: Could not update with domain\n",
+ __func__);
return rc;
}
} else if (ses->serverName) {
@@ -490,7 +486,6 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
server = kmalloc(2 + (len * 2), GFP_KERNEL);
if (server == NULL) {
- cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
rc = -ENOMEM;
return rc;
}
@@ -501,8 +496,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
(char *)server, 2 * len);
kfree(server);
if (rc) {
- cERROR(1, "%s: Could not update with server",
- __func__);
+ cifs_dbg(VFS, "%s: Could not update with server\n",
+ __func__);
return rc;
}
}
@@ -510,7 +505,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
ntlmv2_hash);
if (rc)
- cERROR(1, "%s: Could not generate md5 hash", __func__);
+ cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
return rc;
}
@@ -522,24 +517,25 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
if (!ses->server->secmech.sdeschmacmd5) {
- cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash");
+ cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
return -1;
}
rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
- cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__);
+ cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
+ __func__);
return rc;
}
rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
if (rc) {
- cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
+ cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__);
return rc;
}
- if (ses->server->secType == RawNTLMSSP)
+ if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
memcpy(ses->auth_key.response + offset,
ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
else
@@ -548,14 +544,14 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
ses->auth_key.response + offset, ses->auth_key.len - offset);
if (rc) {
- cERROR(1, "%s: Could not update with response", __func__);
+ cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
return rc;
}
rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
ses->auth_key.response + CIFS_SESS_KEY_SIZE);
if (rc)
- cERROR(1, "%s: Could not generate md5 hash", __func__);
+ cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
return rc;
}
@@ -571,18 +567,19 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
char ntlmv2_hash[16];
unsigned char *tiblob = NULL; /* target info blob */
- if (ses->server->secType == RawNTLMSSP) {
+ if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) {
if (!ses->domainName) {
rc = find_domain_name(ses, nls_cp);
if (rc) {
- cERROR(1, "error %d finding domain name", rc);
+ cifs_dbg(VFS, "error %d finding domain name\n",
+ rc);
goto setup_ntlmv2_rsp_ret;
}
}
} else {
rc = build_avpair_blob(ses, nls_cp);
if (rc) {
- cERROR(1, "error %d building av pair blob", rc);
+ cifs_dbg(VFS, "error %d building av pair blob\n", rc);
goto setup_ntlmv2_rsp_ret;
}
}
@@ -595,7 +592,6 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
if (!ses->auth_key.response) {
rc = ENOMEM;
ses->auth_key.len = 0;
- cERROR(1, "%s: Can't allocate auth blob", __func__);
goto setup_ntlmv2_rsp_ret;
}
ses->auth_key.len += baselen;
@@ -613,14 +609,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
/* calculate ntlmv2_hash */
rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
if (rc) {
- cERROR(1, "could not get v2 hash rc %d", rc);
+ cifs_dbg(VFS, "could not get v2 hash rc %d\n", rc);
goto setup_ntlmv2_rsp_ret;
}
/* calculate first part of the client response (CR1) */
rc = CalcNTLMv2_response(ses, ntlmv2_hash);
if (rc) {
- cERROR(1, "Could not calculate CR1 rc: %d", rc);
+ cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc);
goto setup_ntlmv2_rsp_ret;
}
@@ -628,13 +624,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
- cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__);
+ cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
+ __func__);
goto setup_ntlmv2_rsp_ret;
}
rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
if (rc) {
- cERROR(1, "%s: Could not init hmacmd5", __func__);
+ cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
goto setup_ntlmv2_rsp_ret;
}
@@ -642,14 +639,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
ses->auth_key.response + CIFS_SESS_KEY_SIZE,
CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
- cERROR(1, "%s: Could not update with response", __func__);
+ cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
goto setup_ntlmv2_rsp_ret;
}
rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
ses->auth_key.response);
if (rc)
- cERROR(1, "%s: Could not generate md5 hash", __func__);
+ cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
setup_ntlmv2_rsp_ret:
kfree(tiblob);
@@ -671,7 +668,7 @@ calc_seckey(struct cifs_ses *ses)
tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm_arc4)) {
rc = PTR_ERR(tfm_arc4);
- cERROR(1, "could not allocate crypto API arc4");
+ cifs_dbg(VFS, "could not allocate crypto API arc4\n");
return rc;
}
@@ -680,7 +677,8 @@ calc_seckey(struct cifs_ses *ses)
rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
CIFS_SESS_KEY_SIZE);
if (rc) {
- cERROR(1, "%s: Could not set response as a key", __func__);
+ cifs_dbg(VFS, "%s: Could not set response as a key\n",
+ __func__);
return rc;
}
@@ -689,7 +687,7 @@ calc_seckey(struct cifs_ses *ses)
rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
if (rc) {
- cERROR(1, "could not encrypt session key rc: %d", rc);
+ cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc);
crypto_free_blkcipher(tfm_arc4);
return rc;
}
@@ -707,6 +705,9 @@ calc_seckey(struct cifs_ses *ses)
void
cifs_crypto_shash_release(struct TCP_Server_Info *server)
{
+ if (server->secmech.cmacaes)
+ crypto_free_shash(server->secmech.cmacaes);
+
if (server->secmech.hmacsha256)
crypto_free_shash(server->secmech.hmacsha256);
@@ -716,6 +717,8 @@ cifs_crypto_shash_release(struct TCP_Server_Info *server)
if (server->secmech.hmacmd5)
crypto_free_shash(server->secmech.hmacmd5);
+ kfree(server->secmech.sdesccmacaes);
+
kfree(server->secmech.sdeschmacsha256);
kfree(server->secmech.sdeschmacmd5);
@@ -731,29 +734,35 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
if (IS_ERR(server->secmech.hmacmd5)) {
- cERROR(1, "could not allocate crypto hmacmd5");
+ cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
return PTR_ERR(server->secmech.hmacmd5);
}
server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
if (IS_ERR(server->secmech.md5)) {
- cERROR(1, "could not allocate crypto md5");
+ cifs_dbg(VFS, "could not allocate crypto md5\n");
rc = PTR_ERR(server->secmech.md5);
goto crypto_allocate_md5_fail;
}
server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
if (IS_ERR(server->secmech.hmacsha256)) {
- cERROR(1, "could not allocate crypto hmacsha256\n");
+ cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
rc = PTR_ERR(server->secmech.hmacsha256);
goto crypto_allocate_hmacsha256_fail;
}
+ server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0);
+ if (IS_ERR(server->secmech.cmacaes)) {
+ cifs_dbg(VFS, "could not allocate crypto cmac-aes");
+ rc = PTR_ERR(server->secmech.cmacaes);
+ goto crypto_allocate_cmacaes_fail;
+ }
+
size = sizeof(struct shash_desc) +
crypto_shash_descsize(server->secmech.hmacmd5);
server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
if (!server->secmech.sdeschmacmd5) {
- cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5");
rc = -ENOMEM;
goto crypto_allocate_hmacmd5_sdesc_fail;
}
@@ -764,7 +773,6 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
crypto_shash_descsize(server->secmech.md5);
server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
if (!server->secmech.sdescmd5) {
- cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5");
rc = -ENOMEM;
goto crypto_allocate_md5_sdesc_fail;
}
@@ -775,15 +783,28 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
crypto_shash_descsize(server->secmech.hmacsha256);
server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
if (!server->secmech.sdeschmacsha256) {
- cERROR(1, "%s: Can't alloc hmacsha256\n", __func__);
rc = -ENOMEM;
goto crypto_allocate_hmacsha256_sdesc_fail;
}
server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
server->secmech.sdeschmacsha256->shash.flags = 0x0;
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->secmech.cmacaes);
+ server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL);
+ if (!server->secmech.sdesccmacaes) {
+ cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__);
+ rc = -ENOMEM;
+ goto crypto_allocate_cmacaes_sdesc_fail;
+ }
+ server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes;
+ server->secmech.sdesccmacaes->shash.flags = 0x0;
+
return 0;
+crypto_allocate_cmacaes_sdesc_fail:
+ kfree(server->secmech.sdeschmacsha256);
+
crypto_allocate_hmacsha256_sdesc_fail:
kfree(server->secmech.sdescmd5);
@@ -791,6 +812,9 @@ crypto_allocate_md5_sdesc_fail:
kfree(server->secmech.sdeschmacmd5);
crypto_allocate_hmacmd5_sdesc_fail:
+ crypto_free_shash(server->secmech.cmacaes);
+
+crypto_allocate_cmacaes_fail:
crypto_free_shash(server->secmech.hmacsha256);
crypto_allocate_hmacsha256_fail:
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 345fc89c4286..4bdd547dbf6f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -161,7 +161,7 @@ cifs_read_super(struct super_block *sb)
#ifdef CONFIG_CIFS_NFSD_EXPORT
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
- cFYI(1, "export ops supported");
+ cifs_dbg(FYI, "export ops supported\n");
sb->s_export_op = &cifs_export_ops;
}
#endif /* CONFIG_CIFS_NFSD_EXPORT */
@@ -169,7 +169,7 @@ cifs_read_super(struct super_block *sb)
return 0;
out_no_root:
- cERROR(1, "cifs_read_super: get root inode failed");
+ cifs_dbg(VFS, "%s: get root inode failed\n", __func__);
return rc;
}
@@ -312,11 +312,14 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
}
static void
-cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
+cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
{
+ if (ses->sectype == Unspecified)
+ return;
+
seq_printf(s, ",sec=");
- switch (server->secType) {
+ switch (ses->sectype) {
case LANMAN:
seq_printf(s, "lanman");
break;
@@ -338,7 +341,7 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
break;
}
- if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (ses->sign)
seq_printf(s, "i");
}
@@ -369,12 +372,9 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
- cifs_show_security(s, tcon->ses->server);
+ cifs_show_security(s, tcon->ses);
cifs_show_cache_flavor(s, cifs_sb);
- seq_printf(s, ",unc=");
- seq_escape(s, tcon->treeName, " \t\n\\");
-
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
seq_printf(s, ",multiuser");
else if (tcon->ses->user_name)
@@ -502,7 +502,7 @@ static void cifs_umount_begin(struct super_block *sb)
/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
/* cancel_notify_requests(tcon); */
if (tcon->ses && tcon->ses->server) {
- cFYI(1, "wake up tasks now - umount begin not complete");
+ cifs_dbg(FYI, "wake up tasks now - umount begin not complete\n");
wake_up_all(&tcon->ses->server->request_q);
wake_up_all(&tcon->ses->server->response_q);
msleep(1); /* yield */
@@ -573,7 +573,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
if (full_path == NULL)
return ERR_PTR(-ENOMEM);
- cFYI(1, "Get root dentry for %s", full_path);
+ cifs_dbg(FYI, "Get root dentry for %s\n", full_path);
sep = CIFS_DIR_SEP(cifs_sb);
dentry = dget(sb->s_root);
@@ -632,7 +632,7 @@ cifs_do_mount(struct file_system_type *fs_type,
struct cifs_mnt_data mnt_data;
struct dentry *root;
- cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
+ cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
volume_info = cifs_get_volume_info((char *)data, dev_name);
if (IS_ERR(volume_info))
@@ -655,7 +655,8 @@ cifs_do_mount(struct file_system_type *fs_type,
rc = cifs_mount(cifs_sb, volume_info);
if (rc) {
if (!(flags & MS_SILENT))
- cERROR(1, "cifs_mount failed w/return code = %d", rc);
+ cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
+ rc);
root = ERR_PTR(rc);
goto out_mountdata;
}
@@ -675,7 +676,7 @@ cifs_do_mount(struct file_system_type *fs_type,
}
if (sb->s_root) {
- cFYI(1, "Use existing superblock");
+ cifs_dbg(FYI, "Use existing superblock\n");
cifs_umount(cifs_sb);
} else {
rc = cifs_read_super(sb);
@@ -691,7 +692,7 @@ cifs_do_mount(struct file_system_type *fs_type,
if (IS_ERR(root))
goto out_super;
- cFYI(1, "dentry root is: %p", root);
+ cifs_dbg(FYI, "dentry root is: %p\n", root);
goto out;
out_super:
@@ -723,7 +724,8 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
rc = filemap_fdatawrite(inode->i_mapping);
if (rc)
- cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
+ cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n",
+ rc, inode);
return written;
}
@@ -766,7 +768,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
{
- /* note that this is called by vfs setlease with lock_flocks held
+ /* note that this is called by vfs setlease with i_lock held
to protect *lease from going away */
struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
@@ -969,7 +971,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
};
const struct file_operations cifs_dir_ops = {
- .readdir = cifs_readdir,
+ .iterate = cifs_readdir,
.release = cifs_closedir,
.read = generic_read_dir,
.unlocked_ioctl = cifs_ioctl,
@@ -1030,7 +1032,10 @@ cifs_init_request_bufs(void)
} else {
CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
}
-/* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */
+/*
+ cifs_dbg(VFS, "CIFSMaxBufSize %d 0x%x\n",
+ CIFSMaxBufSize, CIFSMaxBufSize);
+*/
cifs_req_cachep = kmem_cache_create("cifs_request",
CIFSMaxBufSize + max_hdr_size, 0,
SLAB_HWCACHE_ALIGN, NULL);
@@ -1041,7 +1046,7 @@ cifs_init_request_bufs(void)
cifs_min_rcv = 1;
else if (cifs_min_rcv > 64) {
cifs_min_rcv = 64;
- cERROR(1, "cifs_min_rcv set to maximum (64)");
+ cifs_dbg(VFS, "cifs_min_rcv set to maximum (64)\n");
}
cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
@@ -1072,7 +1077,7 @@ cifs_init_request_bufs(void)
cifs_min_small = 2;
else if (cifs_min_small > 256) {
cifs_min_small = 256;
- cFYI(1, "cifs_min_small set to maximum (256)");
+ cifs_dbg(FYI, "cifs_min_small set to maximum (256)\n");
}
cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
@@ -1163,10 +1168,11 @@ init_cifs(void)
if (cifs_max_pending < 2) {
cifs_max_pending = 2;
- cFYI(1, "cifs_max_pending set to min of 2");
+ cifs_dbg(FYI, "cifs_max_pending set to min of 2\n");
} else if (cifs_max_pending > CIFS_MAX_REQ) {
cifs_max_pending = CIFS_MAX_REQ;
- cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ);
+ cifs_dbg(FYI, "cifs_max_pending set to max of %u\n",
+ CIFS_MAX_REQ);
}
cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
@@ -1235,7 +1241,7 @@ out_clean_proc:
static void __exit
exit_cifs(void)
{
- cFYI(DBG2, "exit_cifs");
+ cifs_dbg(NOISY, "exit_cifs\n");
unregister_filesystem(&cifs_fs_type);
cifs_dfs_release_automount_timer();
#ifdef CONFIG_CIFS_ACL
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0e32c3446ce9..ea723a5e8226 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -101,7 +101,7 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
extern const struct file_operations cifs_dir_ops;
extern int cifs_dir_open(struct inode *inode, struct file *file);
-extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
+extern int cifs_readdir(struct file *file, struct dir_context *ctx);
/* Functions related to dir entries */
extern const struct dentry_operations cifs_dentry_ops;
@@ -132,5 +132,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.0"
+#define CIFS_VERSION "2.01"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4f07f6fbe494..e66b08882548 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -101,20 +101,14 @@ enum statusEnum {
};
enum securityEnum {
- LANMAN = 0, /* Legacy LANMAN auth */
+ Unspecified = 0, /* not specified */
+ LANMAN, /* Legacy LANMAN auth */
NTLM, /* Legacy NTLM012 auth with NTLM hash */
NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
-/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
Kerberos, /* Kerberos via SPNEGO */
};
-enum protocolEnum {
- TCP = 0,
- SCTP
- /* Netbios frames protocol not supported at this time */
-};
-
struct session_key {
unsigned int len;
char *response;
@@ -131,9 +125,11 @@ struct cifs_secmech {
struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
struct crypto_shash *md5; /* md5 hash function */
struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */
+ struct crypto_shash *cmacaes; /* block-cipher based MAC function */
struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */
+ struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */
};
/* per smb session structure/fields */
@@ -181,6 +177,7 @@ enum smb_version {
Smb_20,
Smb_21,
Smb_30,
+ Smb_302,
};
struct mid_q_entry;
@@ -228,6 +225,7 @@ struct smb_version_operations {
void (*dump_detail)(void *);
void (*clear_stats)(struct cifs_tcon *);
void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
+ void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
/* verify the message */
int (*check_message)(char *, unsigned int);
bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
@@ -367,6 +365,8 @@ struct smb_version_operations {
void (*set_lease_key)(struct inode *, struct cifs_fid *fid);
/* generate new lease key */
void (*new_lease_key)(struct cifs_fid *fid);
+ /* The next two functions will need to be changed to per smb session */
+ void (*generate_signingkey)(struct TCP_Server_Info *server);
int (*calc_signature)(struct smb_rqst *rqst,
struct TCP_Server_Info *server);
};
@@ -387,6 +387,8 @@ struct smb_version_values {
unsigned int cap_nt_find;
unsigned int cap_large_files;
unsigned int oplock_read;
+ __u16 signing_enabled;
+ __u16 signing_required;
};
#define HEADER_SIZE(server) (server->vals->header_size)
@@ -407,7 +409,8 @@ struct smb_vol {
kgid_t backupgid;
umode_t file_mode;
umode_t dir_mode;
- unsigned secFlg;
+ enum securityEnum sectype; /* sectype requested via mnt opts */
+ bool sign; /* was signing requested via mnt opts? */
bool retry:1;
bool intr:1;
bool setuids:1;
@@ -441,6 +444,7 @@ struct smb_vol {
bool mfsymlinks:1; /* use Minshall+French Symlinks */
bool multiuser:1;
bool rwpidforward:1; /* pid forward for read/write operations */
+ bool nosharesock;
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -514,6 +518,7 @@ struct TCP_Server_Info {
struct task_struct *tsk;
char server_GUID[16];
__u16 sec_mode;
+ bool sign; /* is signing enabled on this connection? */
bool session_estab; /* mark when very first sess is established */
#ifdef CONFIG_CIFS_SMB2
int echo_credits; /* echo reserved slots */
@@ -521,7 +526,6 @@ struct TCP_Server_Info {
bool echoes:1; /* enable echoes */
#endif
u16 dialect; /* dialect index that server chose */
- enum securityEnum secType;
bool oplocks:1; /* enable oplocks */
unsigned int maxReq; /* Clients should submit no more */
/* than maxReq distinct unanswered SMBs to the server when using */
@@ -540,12 +544,17 @@ struct TCP_Server_Info {
int timeAdj; /* Adjust for difference in server time zone in sec */
__u64 CurrentMid; /* multiplex id - rotating counter */
char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
+ char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
/* 16th byte of RFC1001 workstation name is always null */
char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
__u32 sequence_number; /* for signing, protected by srv_mutex */
struct session_key session_key;
unsigned long lstrp; /* when we got last response from this server */
struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
+#define CIFS_NEGFLAVOR_LANMAN 0 /* wct == 13, LANMAN */
+#define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */
+#define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */
+ char negflavor; /* NEGOTIATE response flavor */
/* extended security flavors that server supports */
bool sec_ntlmssp; /* supports NTLMSSP */
bool sec_kerberosu2u; /* supports U2U Kerberos */
@@ -697,7 +706,6 @@ struct cifs_ses {
enum statusEnum status;
unsigned overrideSecFlg; /* if non-zero override global sec flags */
__u16 ipc_tid; /* special tid for connection to IPC share */
- __u16 flags;
__u16 vcnum;
char *serverOS; /* name of operating system underlying server */
char *serverNOS; /* name of network operating system of server */
@@ -714,21 +722,14 @@ struct cifs_ses {
char *password;
struct session_key auth_key;
struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
+ enum securityEnum sectype; /* what security flavor was specified? */
+ bool sign; /* is signing required? */
bool need_reconnect:1; /* connection reset, uid now invalid */
#ifdef CONFIG_CIFS_SMB2
__u16 session_flags;
#endif /* CONFIG_CIFS_SMB2 */
};
-/* no more than one of the following three session flags may be set */
-#define CIFS_SES_NT4 1
-#define CIFS_SES_OS2 2
-#define CIFS_SES_W9X 4
-/* following flag is set for old servers such as OS2 (and Win95?)
- which do not negotiate NTLM or POSIX dialects, but instead
- negotiate one of the older LANMAN dialects */
-#define CIFS_SES_LANMAN 8
-
static inline bool
cap_unix(struct cifs_ses *ses)
{
@@ -816,7 +817,7 @@ struct cifs_tcon {
#ifdef CONFIG_CIFS_SMB2
bool print:1; /* set if connection to printer share */
bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
- __u32 capabilities;
+ __le32 capabilities;
__u32 share_flags;
__u32 maximal_access;
__u32 vol_serial_number;
@@ -1348,7 +1349,7 @@ require use of the stronger protocol */
#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
-#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP)
+#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
/*
@@ -1494,4 +1495,7 @@ extern struct smb_version_values smb21_values;
#define SMB30_VERSION_STRING "3.0"
extern struct smb_version_operations smb30_operations;
extern struct smb_version_values smb30_values;
+#define SMB302_VERSION_STRING "3.02"
+/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
+extern struct smb_version_values smb302_values;
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index e996ff6b26d1..11ca24a8e054 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -142,6 +142,11 @@
*/
#define CIFS_SESS_KEY_SIZE (16)
+/*
+ * Size of the smb3 signing key
+ */
+#define SMB3_SIGN_KEY_SIZE (16)
+
#define CIFS_CLIENT_CHALLENGE_SIZE (8)
#define CIFS_SERVER_CHALLENGE_SIZE (8)
#define CIFS_HMAC_MD5_HASH_SIZE (16)
@@ -531,7 +536,7 @@ typedef struct lanman_neg_rsp {
#define READ_RAW_ENABLE 1
#define WRITE_RAW_ENABLE 2
#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE)
-
+#define SMB1_CLIENT_GUID_SIZE (16)
typedef struct negotiate_rsp {
struct smb_hdr hdr; /* wct = 17 */
__le16 DialectIndex; /* 0xFFFF = no dialect acceptable */
@@ -553,7 +558,7 @@ typedef struct negotiate_rsp {
/* followed by 16 bytes of server GUID */
/* then security blob if cap_extended_security negotiated */
struct {
- unsigned char GUID[16];
+ unsigned char GUID[SMB1_CLIENT_GUID_SIZE];
unsigned char SecurityBlob[1];
} __attribute__((packed)) extended_response;
} __attribute__((packed)) u;
@@ -1315,6 +1320,14 @@ typedef struct smb_com_ntransact_rsp {
/* parms and data follow */
} __attribute__((packed)) NTRANSACT_RSP;
+/*
+ * See MS-SMB 2.2.7.2.1.1
+ *
+ * The two offsets and the length are declared with little-endian types
+ * (__le64 / __le32), and the struct is marked __packed.
+ * NOTE(review): the per-field meanings noted below are inferred from the
+ * field names only - confirm against the spec section cited above.
+ */
+struct srv_copychunk {
+ __le64 SourceOffset; /* presumably the offset to copy from - TODO confirm */
+ __le64 DestinationOffset; /* presumably the offset to copy to - TODO confirm */
+ __le32 CopyLength; /* presumably the number of bytes to copy - TODO confirm */
+ __u32 Reserved; /* not declared with an endian-annotated type */
+} __packed;
+
typedef struct smb_com_transaction_ioctl_req {
struct smb_hdr hdr; /* wct = 23 */
__u8 MaxSetupCount;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index f450f0683ddd..c8ff018fae68 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -45,17 +45,17 @@ extern void _free_xid(unsigned int);
#define get_xid() \
({ \
unsigned int __xid = _get_xid(); \
- cFYI(1, "CIFS VFS: in %s as Xid: %u with uid: %d", \
- __func__, __xid, \
- from_kuid(&init_user_ns, current_fsuid())); \
+ cifs_dbg(FYI, "CIFS VFS: in %s as Xid: %u with uid: %d\n", \
+ __func__, __xid, \
+ from_kuid(&init_user_ns, current_fsuid())); \
__xid; \
})
#define free_xid(curr_xid) \
do { \
_free_xid(curr_xid); \
- cFYI(1, "CIFS VFS: leaving %s (xid = %u) rc = %d", \
- __func__, curr_xid, (int)rc); \
+ cifs_dbg(FYI, "CIFS VFS: leaving %s (xid = %u) rc = %d\n", \
+ __func__, curr_xid, (int)rc); \
} while (0)
extern int init_cifs_idmap(void);
extern void exit_cifs_idmap(void);
@@ -118,6 +118,8 @@ extern void header_assemble(struct smb_hdr *, char /* command */ ,
extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
struct cifs_ses *ses,
void **request_buf);
+extern enum securityEnum select_sectype(struct TCP_Server_Info *server,
+ enum securityEnum requested);
extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_cp);
extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -212,6 +214,7 @@ extern int cifs_negotiate_protocol(const unsigned int xid,
struct cifs_ses *ses);
extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
struct nls_table *nls_info);
+extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required);
extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses);
extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
@@ -433,6 +436,7 @@ extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
extern int calc_seckey(struct cifs_ses *);
+extern void generate_smb3signingkey(struct TCP_Server_Info *);
#ifdef CONFIG_CIFS_WEAK_PW_HASH
extern int calc_lanman_hash(const char *password, const char *cryptkey,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 8e2e799e7a24..a89c4cb4e6cf 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -139,8 +139,8 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
if (smb_command != SMB_COM_WRITE_ANDX &&
smb_command != SMB_COM_OPEN_ANDX &&
smb_command != SMB_COM_TREE_DISCONNECT) {
- cFYI(1, "can not send cmd %d while umounting",
- smb_command);
+ cifs_dbg(FYI, "can not send cmd %d while umounting\n",
+ smb_command);
return -ENODEV;
}
}
@@ -163,7 +163,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
* back on-line
*/
if (!tcon->retry) {
- cFYI(1, "gave up waiting on reconnect in smb_init");
+ cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
return -EHOSTDOWN;
}
}
@@ -191,7 +191,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
cifs_mark_open_files_invalid(tcon);
rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
mutex_unlock(&ses->session_mutex);
- cFYI(1, "reconnect tcon rc = %d", rc);
+ cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
if (rc)
goto out;
@@ -367,6 +367,185 @@ vt2_err:
return -EINVAL;
}
+/*
+ * Handle the extended-security form of the negotiate response: record the
+ * server GUID and, when a security blob follows it, hand the blob to
+ * decode_negTokenInit().
+ *
+ * Returns 0 on success, -EIO when the byte count is too small to hold the
+ * GUID, and -EINVAL when decode_negTokenInit() does not return 1.
+ */
+static int
+decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr)
+{
+	struct TCP_Server_Info *server = ses->server;
+	char *guid = pSMBr->u.extended_response.GUID;
+	u16 remaining = get_bcc(&pSMBr->hdr);
+	bool shared;
+
+	if (remaining < SMB1_CLIENT_GUID_SIZE)
+		return -EIO;
+
+	/* srv_count is sampled under cifs_tcp_ses_lock, then acted on unlocked */
+	spin_lock(&cifs_tcp_ses_lock);
+	shared = server->srv_count > 1;
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	if (!shared) {
+		memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+	} else if (memcmp(server->server_GUID, guid,
+			  SMB1_CLIENT_GUID_SIZE) != 0) {
+		cifs_dbg(FYI, "server UID changed\n");
+		memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+	}
+
+	/* nothing after the GUID: flag sec_ntlmssp instead of decoding a blob */
+	if (remaining == SMB1_CLIENT_GUID_SIZE) {
+		server->sec_ntlmssp = true;
+		return 0;
+	}
+
+	remaining -= SMB1_CLIENT_GUID_SIZE;
+	if (decode_negTokenInit(pSMBr->u.extended_response.SecurityBlob,
+				remaining, server) != 1)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * cifs_enable_signing - reconcile the client's and server's signing policy
+ * @server: connection whose negotiated sec_mode is checked against
+ *          server->vals->signing_required / signing_enabled
+ * @mnt_sign_required: true when the caller already requires signing; when
+ *          false, global_secflags is consulted for CIFSSEC_MUST_SIGN
+ *
+ * Sets server->sign to true when either side requires signing and the other
+ * side permits it; server->sign is not written otherwise.
+ *
+ * Returns 0 on success, or -EOPNOTSUPP when one side requires signing and
+ * the other does not allow it.
+ */
+int
+cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
+{
+	bool srv_sign_required = server->sec_mode & server->vals->signing_required;
+	bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled;
+	bool mnt_sign_enabled;
+
+	/*
+	 * Is signing required by mnt options? If not then check
+	 * global_secflags to see if it is there.
+	 */
+	if (!mnt_sign_required)
+		mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) ==
+						CIFSSEC_MUST_SIGN);
+
+	/*
+	 * If signing is required then it's automatically enabled too,
+	 * otherwise, check to see if the secflags allow it.
+	 */
+	mnt_sign_enabled = mnt_sign_required ||
+				(global_secflags & CIFSSEC_MAY_SIGN);
+
+	/* If server requires signing, does client allow it? */
+	if (srv_sign_required) {
+		if (!mnt_sign_enabled) {
+			cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!\n");
+			return -EOPNOTSUPP;
+		}
+		server->sign = true;
+	}
+
+	/* If client requires signing, does server allow it? */
+	if (mnt_sign_required) {
+		if (!srv_sign_enabled) {
+			cifs_dbg(VFS, "Server does not support signing!\n");
+			return -EOPNOTSUPP;
+		}
+		server->sign = true;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+/*
+ * Decode a negotiate response for the LANMAN dialects.
+ *
+ * The response buffer is reinterpreted as a struct lanman_neg_rsp and its
+ * security mode, request limit, buffer size, VC limit, raw-mode capability,
+ * time adjustment and crypt key are copied into @server.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when server->dialect is neither
+ * LANMAN_PROT nor LANMAN2_PROT, and -EIO when no CIFS_CRYPTO_KEY_SIZE key
+ * was supplied although sec_mode has SECMODE_PW_ENCRYPT set.
+ */
+static int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+ __s16 tmp;
+ struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
+
+ if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT)
+ return -EOPNOTSUPP;
+
+ server->sec_mode = le16_to_cpu(rsp->SecurityMode);
+ /* cap the server's MaxMpxCount at cifs_max_pending; the result is also
+ passed to set_credits() */
+ server->maxReq = min_t(unsigned int,
+ le16_to_cpu(rsp->MaxMpxCount),
+ cifs_max_pending);
+ set_credits(server, server->maxReq);
+ server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
+ server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
+ /* even though we do not use raw we might as well set this
+ accurately, in case we ever find a need for it */
+ if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
+ server->max_rw = 0xFF00;
+ server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
+ } else {
+ server->max_rw = 0;/* do not need to use raw anyway */
+ server->capabilities = CAP_MPX_MODE;
+ }
+ tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
+ if (tmp == -1) {
+ /* OS/2 often does not set timezone therefore
+ * we must use server time to calc time zone.
+ * Could deviate slightly from the right zone.
+ * Smallest defined timezone difference is 15 minutes
+ * (i.e. Nepal). Rounding up/down is done to match
+ * this requirement.
+ */
+ int val, seconds, remain, result;
+ struct timespec ts, utc;
+ utc = CURRENT_TIME;
+ ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
+ rsp->SrvTime.Time, 0);
+ cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
+ (int)ts.tv_sec, (int)utc.tv_sec,
+ (int)(utc.tv_sec - ts.tv_sec));
+ /* round |val| to the nearest multiple of MIN_TZ_ADJ (a remainder of
+ exactly half rounds up), then restore the sign of val */
+ val = (int)(utc.tv_sec - ts.tv_sec);
+ seconds = abs(val);
+ result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
+ remain = seconds % MIN_TZ_ADJ;
+ if (remain >= (MIN_TZ_ADJ / 2))
+ result += MIN_TZ_ADJ;
+ if (val < 0)
+ result = -result;
+ server->timeAdj = result;
+ } else {
+ /* NOTE(review): the * 60 suggests ServerTimeZone is reported in
+ minutes - confirm against the protocol definition */
+ server->timeAdj = (int)tmp;
+ server->timeAdj *= 60; /* also in seconds */
+ }
+ cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);
+
+
+ /* BB get server time for time conversions and add
+ code to use it and timezone since this is not UTC */
+
+ /* the key is copied only when its length is exactly
+ CIFS_CRYPTO_KEY_SIZE; otherwise it is an error only if the server
+ set SECMODE_PW_ENCRYPT */
+ if (rsp->EncryptionKeyLength ==
+ cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
+ memcpy(server->cryptkey, rsp->EncryptionKey,
+ CIFS_CRYPTO_KEY_SIZE);
+ } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
+ return -EIO; /* need cryptkey unless plain text */
+ }
+
+ cifs_dbg(FYI, "LANMAN negotiated\n");
+ return 0;
+}
+#else
+/*
+ * Stub used when CONFIG_CIFS_WEAK_PW_HASH is not set: logs that the module
+ * lacks the support and returns -EOPNOTSUPP without touching @server.
+ */
+static inline int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+ cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n");
+ return -EOPNOTSUPP;
+}
+#endif
+
+/*
+ * Decide whether the negotiate request should carry SMBFLG2_EXT_SEC.
+ *
+ * RawNTLMSSP and Kerberos always ask for it; Unspecified asks for it only
+ * when global_secflags permits Kerberos or NTLMSSP; every other security
+ * type does not.
+ */
+static bool
+should_set_ext_sec_flag(enum securityEnum sectype)
+{
+	if (sectype == RawNTLMSSP || sectype == Kerberos)
+		return true;
+
+	if (sectype == Unspecified &&
+	    (global_secflags & (CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)))
+		return true;
+
+	return false;
+}
+
int
CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
{
@@ -375,41 +554,24 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
int rc = 0;
int bytes_returned;
int i;
- struct TCP_Server_Info *server;
+ struct TCP_Server_Info *server = ses->server;
u16 count;
- unsigned int secFlags;
- if (ses->server)
- server = ses->server;
- else {
- rc = -EIO;
- return rc;
+ if (!server) {
+ WARN(1, "%s: server is NULL!\n", __func__);
+ return -EIO;
}
+
rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ ,
(void **) &pSMB, (void **) &pSMBr);
if (rc)
return rc;
- /* if any of auth flags (ie not sign or seal) are overriden use them */
- if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
- secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */
- else /* if override flags set only sign/seal OR them with global auth */
- secFlags = global_secflags | ses->overrideSecFlg;
-
- cFYI(1, "secFlags 0x%x", secFlags);
-
pSMB->hdr.Mid = get_next_mid(server);
pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
- if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
- pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
- else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
- cFYI(1, "Kerberos only mechanism, enable extended security");
- pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
- } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
- pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
- else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
- cFYI(1, "NTLMSSP only mechanism, enable extended security");
+ if (should_set_ext_sec_flag(ses->sectype)) {
+ cifs_dbg(FYI, "Requesting extended security.");
pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
}
@@ -428,7 +590,7 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
goto neg_err_exit;
server->dialect = le16_to_cpu(pSMBr->DialectIndex);
- cFYI(1, "Dialect: %d", server->dialect);
+ cifs_dbg(FYI, "Dialect: %d\n", server->dialect);
/* Check wct = 1 error case */
if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
/* core returns wct = 1, but we do not ask for core - otherwise
@@ -436,129 +598,20 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
could not negotiate a common dialect */
rc = -EOPNOTSUPP;
goto neg_err_exit;
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- } else if ((pSMBr->hdr.WordCount == 13)
- && ((server->dialect == LANMAN_PROT)
- || (server->dialect == LANMAN2_PROT))) {
- __s16 tmp;
- struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
-
- if ((secFlags & CIFSSEC_MAY_LANMAN) ||
- (secFlags & CIFSSEC_MAY_PLNTXT))
- server->secType = LANMAN;
- else {
- cERROR(1, "mount failed weak security disabled"
- " in /proc/fs/cifs/SecurityFlags");
- rc = -EOPNOTSUPP;
- goto neg_err_exit;
- }
- server->sec_mode = le16_to_cpu(rsp->SecurityMode);
- server->maxReq = min_t(unsigned int,
- le16_to_cpu(rsp->MaxMpxCount),
- cifs_max_pending);
- set_credits(server, server->maxReq);
- server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
- server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
- /* even though we do not use raw we might as well set this
- accurately, in case we ever find a need for it */
- if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
- server->max_rw = 0xFF00;
- server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
- } else {
- server->max_rw = 0;/* do not need to use raw anyway */
- server->capabilities = CAP_MPX_MODE;
- }
- tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
- if (tmp == -1) {
- /* OS/2 often does not set timezone therefore
- * we must use server time to calc time zone.
- * Could deviate slightly from the right zone.
- * Smallest defined timezone difference is 15 minutes
- * (i.e. Nepal). Rounding up/down is done to match
- * this requirement.
- */
- int val, seconds, remain, result;
- struct timespec ts, utc;
- utc = CURRENT_TIME;
- ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
- rsp->SrvTime.Time, 0);
- cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d",
- (int)ts.tv_sec, (int)utc.tv_sec,
- (int)(utc.tv_sec - ts.tv_sec));
- val = (int)(utc.tv_sec - ts.tv_sec);
- seconds = abs(val);
- result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
- remain = seconds % MIN_TZ_ADJ;
- if (remain >= (MIN_TZ_ADJ / 2))
- result += MIN_TZ_ADJ;
- if (val < 0)
- result = -result;
- server->timeAdj = result;
- } else {
- server->timeAdj = (int)tmp;
- server->timeAdj *= 60; /* also in seconds */
- }
- cFYI(1, "server->timeAdj: %d seconds", server->timeAdj);
-
-
- /* BB get server time for time conversions and add
- code to use it and timezone since this is not UTC */
-
- if (rsp->EncryptionKeyLength ==
- cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
- memcpy(ses->server->cryptkey, rsp->EncryptionKey,
- CIFS_CRYPTO_KEY_SIZE);
- } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
- rc = -EIO; /* need cryptkey unless plain text */
- goto neg_err_exit;
- }
-
- cFYI(1, "LANMAN negotiated");
- /* we will not end up setting signing flags - as no signing
- was in LANMAN and server did not return the flags on */
- goto signing_check;
-#else /* weak security disabled */
} else if (pSMBr->hdr.WordCount == 13) {
- cERROR(1, "mount failed, cifs module not built "
- "with CIFS_WEAK_PW_HASH support");
- rc = -EOPNOTSUPP;
-#endif /* WEAK_PW_HASH */
- goto neg_err_exit;
+ server->negflavor = CIFS_NEGFLAVOR_LANMAN;
+ rc = decode_lanman_negprot_rsp(server, pSMBr);
+ goto signing_check;
} else if (pSMBr->hdr.WordCount != 17) {
/* unknown wct */
rc = -EOPNOTSUPP;
goto neg_err_exit;
}
- /* else wct == 17 NTLM */
+ /* else wct == 17, NTLM or better */
+
server->sec_mode = pSMBr->SecurityMode;
if ((server->sec_mode & SECMODE_USER) == 0)
- cFYI(1, "share mode security");
-
- if ((server->sec_mode & SECMODE_PW_ENCRYPT) == 0)
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
-#endif /* CIFS_WEAK_PW_HASH */
- cERROR(1, "Server requests plain text password"
- " but client support disabled");
-
- if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
- server->secType = NTLMv2;
- else if (secFlags & CIFSSEC_MAY_NTLM)
- server->secType = NTLM;
- else if (secFlags & CIFSSEC_MAY_NTLMV2)
- server->secType = NTLMv2;
- else if (secFlags & CIFSSEC_MAY_KRB5)
- server->secType = Kerberos;
- else if (secFlags & CIFSSEC_MAY_NTLMSSP)
- server->secType = RawNTLMSSP;
- else if (secFlags & CIFSSEC_MAY_LANMAN)
- server->secType = LANMAN;
- else {
- rc = -EOPNOTSUPP;
- cERROR(1, "Invalid security type");
- goto neg_err_exit;
- }
- /* else ... any others ...? */
+ cifs_dbg(FYI, "share mode security\n");
/* one byte, so no need to convert this or EncryptionKeyLen from
little endian */
@@ -568,100 +621,34 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
/* probably no need to store and check maxvcs */
server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
- cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
+ cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf);
server->capabilities = le32_to_cpu(pSMBr->Capabilities);
server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
server->timeAdj *= 60;
+
if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
+ server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
CIFS_CRYPTO_KEY_SIZE);
} else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
server->capabilities & CAP_EXTENDED_SECURITY) &&
(pSMBr->EncryptionKeyLength == 0)) {
- /* decode security blob */
- count = get_bcc(&pSMBr->hdr);
- if (count < 16) {
- rc = -EIO;
- goto neg_err_exit;
- }
- spin_lock(&cifs_tcp_ses_lock);
- if (server->srv_count > 1) {
- spin_unlock(&cifs_tcp_ses_lock);
- if (memcmp(server->server_GUID,
- pSMBr->u.extended_response.
- GUID, 16) != 0) {
- cFYI(1, "server UID changed");
- memcpy(server->server_GUID,
- pSMBr->u.extended_response.GUID,
- 16);
- }
- } else {
- spin_unlock(&cifs_tcp_ses_lock);
- memcpy(server->server_GUID,
- pSMBr->u.extended_response.GUID, 16);
- }
-
- if (count == 16) {
- server->secType = RawNTLMSSP;
- } else {
- rc = decode_negTokenInit(pSMBr->u.extended_response.
- SecurityBlob, count - 16,
- server);
- if (rc == 1)
- rc = 0;
- else
- rc = -EINVAL;
- if (server->secType == Kerberos) {
- if (!server->sec_kerberos &&
- !server->sec_mskerberos)
- rc = -EOPNOTSUPP;
- } else if (server->secType == RawNTLMSSP) {
- if (!server->sec_ntlmssp)
- rc = -EOPNOTSUPP;
- } else
- rc = -EOPNOTSUPP;
- }
+ server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
+ rc = decode_ext_sec_blob(ses, pSMBr);
} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
rc = -EIO; /* no crypt key only if plain text pwd */
- goto neg_err_exit;
- } else
- server->capabilities &= ~CAP_EXTENDED_SECURITY;
-
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-signing_check:
-#endif
- if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
- /* MUST_SIGN already includes the MAY_SIGN FLAG
- so if this is zero it means that signing is disabled */
- cFYI(1, "Signing disabled");
- if (server->sec_mode & SECMODE_SIGN_REQUIRED) {
- cERROR(1, "Server requires "
- "packet signing to be enabled in "
- "/proc/fs/cifs/SecurityFlags.");
- rc = -EOPNOTSUPP;
- }
- server->sec_mode &=
- ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
- } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
- /* signing required */
- cFYI(1, "Must sign - secFlags 0x%x", secFlags);
- if ((server->sec_mode &
- (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
- cERROR(1, "signing required but server lacks support");
- rc = -EOPNOTSUPP;
- } else
- server->sec_mode |= SECMODE_SIGN_REQUIRED;
} else {
- /* signing optional ie CIFSSEC_MAY_SIGN */
- if ((server->sec_mode & SECMODE_SIGN_REQUIRED) == 0)
- server->sec_mode &=
- ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
+ server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
+ server->capabilities &= ~CAP_EXTENDED_SECURITY;
}
+signing_check:
+ if (!rc)
+ rc = cifs_enable_signing(server, ses->sign);
neg_err_exit:
cifs_buf_release(pSMB);
- cFYI(1, "negprot rc %d", rc);
+ cifs_dbg(FYI, "negprot rc %d\n", rc);
return rc;
}
@@ -671,7 +658,7 @@ CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon)
struct smb_hdr *smb_buffer;
int rc = 0;
- cFYI(1, "In tree disconnect");
+ cifs_dbg(FYI, "In tree disconnect\n");
/* BB: do we need to check this? These should never be NULL. */
if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
@@ -693,7 +680,7 @@ CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon)
rc = SendReceiveNoRsp(xid, tcon->ses, (char *)smb_buffer, 0);
if (rc)
- cFYI(1, "Tree disconnect failed %d", rc);
+ cifs_dbg(FYI, "Tree disconnect failed %d\n", rc);
/* No need to return error on this operation if tid invalidated and
closed on server already e.g. due to tcp session crashing */
@@ -728,7 +715,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
struct smb_rqst rqst = { .rq_iov = &iov,
.rq_nvec = 1 };
- cFYI(1, "In echo request");
+ cifs_dbg(FYI, "In echo request\n");
rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb);
if (rc)
@@ -747,7 +734,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback,
server, CIFS_ASYNC_OP | CIFS_ECHO_OP);
if (rc)
- cFYI(1, "Echo request failed: %d", rc);
+ cifs_dbg(FYI, "Echo request failed: %d\n", rc);
cifs_small_buf_release(smb);
@@ -760,7 +747,7 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses)
LOGOFF_ANDX_REQ *pSMB;
int rc = 0;
- cFYI(1, "In SMBLogoff for session disconnect");
+ cifs_dbg(FYI, "In SMBLogoff for session disconnect\n");
/*
* BB: do we need to check validity of ses and server? They should
@@ -782,9 +769,8 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses)
pSMB->hdr.Mid = get_next_mid(ses->server);
- if (ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
- pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+ if (ses->server->sign)
+ pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
pSMB->hdr.Uid = ses->Suid;
@@ -814,7 +800,7 @@ CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon,
int bytes_returned = 0;
__u16 params, param_offset, offset, byte_count;
- cFYI(1, "In POSIX delete");
+ cifs_dbg(FYI, "In POSIX delete\n");
PsxDelete:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -866,7 +852,7 @@ PsxDelete:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "Posix delete returned %d", rc);
+ cifs_dbg(FYI, "Posix delete returned %d\n", rc);
cifs_buf_release(pSMB);
cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes);
@@ -914,7 +900,7 @@ DelFileRetry:
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes);
if (rc)
- cFYI(1, "Error in RMFile = %d", rc);
+ cifs_dbg(FYI, "Error in RMFile = %d\n", rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -934,7 +920,7 @@ CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
int name_len;
int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
- cFYI(1, "In CIFSSMBRmDir");
+ cifs_dbg(FYI, "In CIFSSMBRmDir\n");
RmDirRetry:
rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -960,7 +946,7 @@ RmDirRetry:
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_rmdirs);
if (rc)
- cFYI(1, "Error in RMDir = %d", rc);
+ cifs_dbg(FYI, "Error in RMDir = %d\n", rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -979,7 +965,7 @@ CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
int name_len;
int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
- cFYI(1, "In CIFSSMBMkDir");
+ cifs_dbg(FYI, "In CIFSSMBMkDir\n");
MkDirRetry:
rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -1005,7 +991,7 @@ MkDirRetry:
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_mkdirs);
if (rc)
- cFYI(1, "Error in Mkdir = %d", rc);
+ cifs_dbg(FYI, "Error in Mkdir = %d\n", rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -1029,7 +1015,7 @@ CIFSPOSIXCreate(const unsigned int xid, struct cifs_tcon *tcon,
OPEN_PSX_REQ *pdata;
OPEN_PSX_RSP *psx_rsp;
- cFYI(1, "In POSIX Create");
+ cifs_dbg(FYI, "In POSIX Create\n");
PsxCreat:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -1083,11 +1069,11 @@ PsxCreat:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Posix create returned %d", rc);
+ cifs_dbg(FYI, "Posix create returned %d\n", rc);
goto psx_create_err;
}
- cFYI(1, "copying inode info");
+ cifs_dbg(FYI, "copying inode info\n");
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
if (rc || get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)) {
@@ -1109,11 +1095,11 @@ PsxCreat:
/* check to make sure response data is there */
if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) {
pRetData->Type = cpu_to_le32(-1); /* unknown */
- cFYI(DBG2, "unknown type");
+ cifs_dbg(NOISY, "unknown type\n");
} else {
if (get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)
+ sizeof(FILE_UNIX_BASIC_INFO)) {
- cERROR(1, "Open response data too small");
+ cifs_dbg(VFS, "Open response data too small\n");
pRetData->Type = cpu_to_le32(-1);
goto psx_create_err;
}
@@ -1160,7 +1146,7 @@ static __u16 convert_disposition(int disposition)
ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
break;
default:
- cFYI(1, "unknown disposition %d", disposition);
+ cifs_dbg(FYI, "unknown disposition %d\n", disposition);
ofun = SMBOPEN_OAPPEND; /* regular open */
}
return ofun;
@@ -1251,7 +1237,7 @@ OldOpenRetry:
(struct smb_hdr *)pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_opens);
if (rc) {
- cFYI(1, "Error in Open = %d", rc);
+ cifs_dbg(FYI, "Error in Open = %d\n", rc);
} else {
/* BB verify if wct == 15 */
@@ -1364,7 +1350,7 @@ openRetry:
(struct smb_hdr *)pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_opens);
if (rc) {
- cFYI(1, "Error in Open = %d", rc);
+ cifs_dbg(FYI, "Error in Open = %d\n", rc);
} else {
*pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
*netfid = pSMBr->Fid; /* cifs fid stays in le */
@@ -1425,8 +1411,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
char *buf = server->smallbuf;
unsigned int buflen = get_rfc1002_length(buf) + 4;
- cFYI(1, "%s: mid=%llu offset=%llu bytes=%u", __func__,
- mid->mid, rdata->offset, rdata->bytes);
+ cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n",
+ __func__, mid->mid, rdata->offset, rdata->bytes);
/*
* read the rest of READ_RSP header (sans Data array), or whatever we
@@ -1447,16 +1433,16 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
/* Was the SMB read successful? */
rdata->result = server->ops->map_error(buf, false);
if (rdata->result != 0) {
- cFYI(1, "%s: server returned error %d", __func__,
- rdata->result);
+ cifs_dbg(FYI, "%s: server returned error %d\n",
+ __func__, rdata->result);
return cifs_readv_discard(server, mid);
}
/* Is there enough to get to the rest of the READ_RSP header? */
if (server->total_read < server->vals->read_rsp_size) {
- cFYI(1, "%s: server returned short header. got=%u expected=%zu",
- __func__, server->total_read,
- server->vals->read_rsp_size);
+ cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n",
+ __func__, server->total_read,
+ server->vals->read_rsp_size);
rdata->result = -EIO;
return cifs_readv_discard(server, mid);
}
@@ -1468,19 +1454,19 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
* is beyond the EOF. Treat it as if the data starts just after
* the header.
*/
- cFYI(1, "%s: data offset (%u) inside read response header",
- __func__, data_offset);
+ cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n",
+ __func__, data_offset);
data_offset = server->total_read;
} else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
/* data_offset is beyond the end of smallbuf */
- cFYI(1, "%s: data offset (%u) beyond end of smallbuf",
- __func__, data_offset);
+ cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n",
+ __func__, data_offset);
rdata->result = -EIO;
return cifs_readv_discard(server, mid);
}
- cFYI(1, "%s: total_read=%u data_offset=%u", __func__,
- server->total_read, data_offset);
+ cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n",
+ __func__, server->total_read, data_offset);
len = data_offset - server->total_read;
if (len > 0) {
@@ -1496,8 +1482,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
/* set up first iov for signature check */
rdata->iov.iov_base = buf;
rdata->iov.iov_len = server->total_read;
- cFYI(1, "0: iov_base=%p iov_len=%zu",
- rdata->iov.iov_base, rdata->iov.iov_len);
+ cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+ rdata->iov.iov_base, rdata->iov.iov_len);
/* how much data is in the response? */
data_len = server->ops->read_data_length(buf);
@@ -1514,8 +1500,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
server->total_read += length;
rdata->bytes = length;
- cFYI(1, "total_read=%u buflen=%u remaining=%u", server->total_read,
- buflen, data_len);
+ cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
+ server->total_read, buflen, data_len);
/* discard anything left over */
if (server->total_read < buflen)
@@ -1538,21 +1524,21 @@ cifs_readv_callback(struct mid_q_entry *mid)
.rq_pagesz = rdata->pagesz,
.rq_tailsz = rdata->tailsz };
- cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__,
- mid->mid, mid->mid_state, rdata->result, rdata->bytes);
+ cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
+ __func__, mid->mid, mid->mid_state, rdata->result,
+ rdata->bytes);
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
/* result already set, check signature */
- if (server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ if (server->sign) {
int rc = 0;
rc = cifs_verify_signature(&rqst, server,
- mid->sequence_number + 1);
+ mid->sequence_number);
if (rc)
- cERROR(1, "SMB signature verification returned "
- "error = %d", rc);
+ cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+ rc);
}
/* FIXME: should this be counted toward the initiating task? */
task_io_account_read(rdata->bytes);
@@ -1582,8 +1568,8 @@ cifs_async_readv(struct cifs_readdata *rdata)
struct smb_rqst rqst = { .rq_iov = &rdata->iov,
.rq_nvec = 1 };
- cFYI(1, "%s: offset=%llu bytes=%u", __func__,
- rdata->offset, rdata->bytes);
+ cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
+ __func__, rdata->offset, rdata->bytes);
if (tcon->ses->capabilities & CAP_LARGE_FILES)
wct = 12;
@@ -1653,7 +1639,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
struct cifs_tcon *tcon = io_parms->tcon;
unsigned int count = io_parms->length;
- cFYI(1, "Reading %d bytes on fid %d", count, netfid);
+ cifs_dbg(FYI, "Reading %d bytes on fid %d\n", count, netfid);
if (tcon->ses->capabilities & CAP_LARGE_FILES)
wct = 12;
else {
@@ -1701,7 +1687,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
pSMBr = (READ_RSP *)iov[0].iov_base;
if (rc) {
- cERROR(1, "Send error in read = %d", rc);
+ cifs_dbg(VFS, "Send error in read = %d\n", rc);
} else {
int data_length = le16_to_cpu(pSMBr->DataLengthHigh);
data_length = data_length << 16;
@@ -1711,7 +1697,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
/*check that DataLength would not go beyond end of SMB */
if ((data_length > CIFSMaxBufSize)
|| (data_length > count)) {
- cFYI(1, "bad length %d for count %d",
+ cifs_dbg(FYI, "bad length %d for count %d\n",
data_length, count);
rc = -EIO;
*nbytes = 0;
@@ -1719,7 +1705,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
pReadData = (char *) (&pSMBr->hdr.Protocol) +
le16_to_cpu(pSMBr->DataOffset);
/* if (rc = copy_to_user(buf, pReadData, data_length)) {
- cERROR(1, "Faulting on read rc = %d",rc);
+ cifs_dbg(VFS, "Faulting on read rc = %d\n",rc);
rc = -EFAULT;
}*/ /* can not use copy_to_user when using page cache*/
if (*buf)
@@ -1767,7 +1753,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
*nbytes = 0;
- /* cFYI(1, "write at %lld %d bytes", offset, count);*/
+ /* cifs_dbg(FYI, "write at %lld %d bytes\n", offset, count);*/
if (tcon->ses == NULL)
return -ECONNABORTED;
@@ -1852,7 +1838,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
(struct smb_hdr *) pSMBr, &bytes_returned, long_op);
cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
if (rc) {
- cFYI(1, "Send error in write = %d", rc);
+ cifs_dbg(FYI, "Send error in write = %d\n", rc);
} else {
*nbytes = le16_to_cpu(pSMBr->CountHigh);
*nbytes = (*nbytes) << 16;
@@ -1959,7 +1945,7 @@ cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
/* this would overflow */
if (nr_pages == 0) {
- cERROR(1, "%s: called with nr_pages == 0!", __func__);
+ cifs_dbg(VFS, "%s: called with nr_pages == 0!\n", __func__);
return NULL;
}
@@ -2075,7 +2061,8 @@ cifs_async_writev(struct cifs_writedata *wdata)
rqst.rq_pagesz = wdata->pagesz;
rqst.rq_tailsz = wdata->tailsz;
- cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes);
+ cifs_dbg(FYI, "async write at %llu %u bytes\n",
+ wdata->offset, wdata->bytes);
smb->DataLengthLow = cpu_to_le16(wdata->bytes & 0xFFFF);
smb->DataLengthHigh = cpu_to_le16(wdata->bytes >> 16);
@@ -2123,7 +2110,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
*nbytes = 0;
- cFYI(1, "write2 at %lld %d bytes", (long long)offset, count);
+ cifs_dbg(FYI, "write2 at %lld %d bytes\n", (long long)offset, count);
if (tcon->ses->capabilities & CAP_LARGE_FILES) {
wct = 14;
@@ -2182,7 +2169,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
if (rc) {
- cFYI(1, "Send error Write2 = %d", rc);
+ cifs_dbg(FYI, "Send error Write2 = %d\n", rc);
} else if (resp_buf_type == 0) {
/* presumably this can not happen, but best to be safe */
rc = -EIO;
@@ -2223,7 +2210,8 @@ int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon,
int resp_buf_type;
__u16 count;
- cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock);
+ cifs_dbg(FYI, "cifs_lockv num lock %d num unlock %d\n",
+ num_lock, num_unlock);
rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
if (rc)
@@ -2249,7 +2237,7 @@ int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon,
cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
if (rc)
- cFYI(1, "Send error in cifs_lockv = %d", rc);
+ cifs_dbg(FYI, "Send error in cifs_lockv = %d\n", rc);
return rc;
}
@@ -2268,7 +2256,8 @@ CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon,
int flags = 0;
__u16 count;
- cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock);
+ cifs_dbg(FYI, "CIFSSMBLock timeout %d numLock %d\n",
+ (int)waitFlag, numLock);
rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
if (rc)
@@ -2317,7 +2306,7 @@ CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon,
}
cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
if (rc)
- cFYI(1, "Send error in Lock = %d", rc);
+ cifs_dbg(FYI, "Send error in Lock = %d\n", rc);
/* Note: On -EAGAIN error only caller can retry on handle based calls
since file handle passed in no longer valid */
@@ -2341,7 +2330,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
__u16 params, param_offset, offset, byte_count, count;
struct kvec iov[1];
- cFYI(1, "Posix Lock");
+ cifs_dbg(FYI, "Posix Lock\n");
rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
@@ -2408,7 +2397,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
}
if (rc) {
- cFYI(1, "Send error in Posix Lock = %d", rc);
+ cifs_dbg(FYI, "Send error in Posix Lock = %d\n", rc);
} else if (pLockData) {
/* lock structure can be returned on get */
__u16 data_offset;
@@ -2465,7 +2454,7 @@ CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
{
int rc = 0;
CLOSE_REQ *pSMB = NULL;
- cFYI(1, "In CIFSSMBClose");
+ cifs_dbg(FYI, "In CIFSSMBClose\n");
/* do not retry on dead session on close */
rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB);
@@ -2482,7 +2471,7 @@ CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
if (rc) {
if (rc != -EINTR) {
/* EINTR is expected when user ctl-c to kill app */
- cERROR(1, "Send error in Close = %d", rc);
+ cifs_dbg(VFS, "Send error in Close = %d\n", rc);
}
}
@@ -2498,7 +2487,7 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
{
int rc = 0;
FLUSH_REQ *pSMB = NULL;
- cFYI(1, "In CIFSSMBFlush");
+ cifs_dbg(FYI, "In CIFSSMBFlush\n");
rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
if (rc)
@@ -2509,7 +2498,7 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_flushes);
if (rc)
- cERROR(1, "Send error in Flush = %d", rc);
+ cifs_dbg(VFS, "Send error in Flush = %d\n", rc);
return rc;
}
@@ -2527,7 +2516,7 @@ CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
__u16 count;
int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
- cFYI(1, "In CIFSSMBRename");
+ cifs_dbg(FYI, "In CIFSSMBRename\n");
renameRetry:
rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -2574,7 +2563,7 @@ renameRetry:
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_renames);
if (rc)
- cFYI(1, "Send error in rename = %d", rc);
+ cifs_dbg(FYI, "Send error in rename = %d\n", rc);
cifs_buf_release(pSMB);
@@ -2598,7 +2587,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
int len_of_str;
__u16 params, param_offset, offset, count, byte_count;
- cFYI(1, "Rename to File by handle");
+ cifs_dbg(FYI, "Rename to File by handle\n");
rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB,
(void **) &pSMBr);
if (rc)
@@ -2655,7 +2644,8 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
cifs_stats_inc(&pTcon->stats.cifs_stats.num_t2renames);
if (rc)
- cFYI(1, "Send error in Rename (by file handle) = %d", rc);
+ cifs_dbg(FYI, "Send error in Rename (by file handle) = %d\n",
+ rc);
cifs_buf_release(pSMB);
@@ -2677,7 +2667,7 @@ CIFSSMBCopy(const unsigned int xid, struct cifs_tcon *tcon,
int name_len, name_len2;
__u16 count;
- cFYI(1, "In CIFSSMBCopy");
+ cifs_dbg(FYI, "In CIFSSMBCopy\n");
copyRetry:
rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -2722,8 +2712,8 @@ copyRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in copy = %d with %d files copied",
- rc, le16_to_cpu(pSMBr->CopyCount));
+ cifs_dbg(FYI, "Send error in copy = %d with %d files copied\n",
+ rc, le16_to_cpu(pSMBr->CopyCount));
}
cifs_buf_release(pSMB);
@@ -2747,7 +2737,7 @@ CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon,
int bytes_returned = 0;
__u16 params, param_offset, offset, byte_count;
- cFYI(1, "In Symlink Unix style");
+ cifs_dbg(FYI, "In Symlink Unix style\n");
createSymLinkRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -2812,7 +2802,8 @@ createSymLinkRetry:
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_symlinks);
if (rc)
- cFYI(1, "Send error in SetPathInfo create symlink = %d", rc);
+ cifs_dbg(FYI, "Send error in SetPathInfo create symlink = %d\n",
+ rc);
cifs_buf_release(pSMB);
@@ -2836,7 +2827,7 @@ CIFSUnixCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
int bytes_returned = 0;
__u16 params, param_offset, offset, byte_count;
- cFYI(1, "In Create Hard link Unix style");
+ cifs_dbg(FYI, "In Create Hard link Unix style\n");
createHardLinkRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -2898,7 +2889,8 @@ createHardLinkRetry:
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks);
if (rc)
- cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc);
+ cifs_dbg(FYI, "Send error in SetPathInfo (hard link) = %d\n",
+ rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -2920,7 +2912,7 @@ CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
__u16 count;
int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
- cFYI(1, "In CIFSCreateHardLink");
+ cifs_dbg(FYI, "In CIFSCreateHardLink\n");
winCreateHardLinkRetry:
rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB,
@@ -2972,7 +2964,7 @@ winCreateHardLinkRetry:
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks);
if (rc)
- cFYI(1, "Send error in hard link (NT rename) = %d", rc);
+ cifs_dbg(FYI, "Send error in hard link (NT rename) = %d\n", rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -2995,7 +2987,7 @@ CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
__u16 params, byte_count;
char *data_start;
- cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName);
+ cifs_dbg(FYI, "In QPathSymLinkInfo (Unix) for path %s\n", searchName);
querySymLinkRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -3042,7 +3034,7 @@ querySymLinkRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QuerySymLinkInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QuerySymLinkInfo = %d\n", rc);
} else {
/* decode response */
@@ -3097,7 +3089,8 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon,
struct smb_com_transaction_ioctl_req *pSMB;
struct smb_com_transaction_ioctl_rsp *pSMBr;
- cFYI(1, "In Windows reparse style QueryLink for path %s", searchName);
+ cifs_dbg(FYI, "In Windows reparse style QueryLink for path %s\n",
+ searchName);
rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
(void **) &pSMBr);
if (rc)
@@ -3125,7 +3118,7 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon,
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc);
} else { /* decode response */
__u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
__u32 data_count = le32_to_cpu(pSMBr->DataCount);
@@ -3149,7 +3142,7 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon,
if ((reparse_buf->LinkNamesBuf +
reparse_buf->TargetNameOffset +
reparse_buf->TargetNameLen) > end_of_smb) {
- cFYI(1, "reparse buf beyond SMB");
+ cifs_dbg(FYI, "reparse buf beyond SMB\n");
rc = -EIO;
goto qreparse_out;
}
@@ -3170,12 +3163,11 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon,
}
} else {
rc = -EIO;
- cFYI(1, "Invalid return data count on "
- "get reparse info ioctl");
+ cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n");
}
symlinkinfo[buflen] = 0; /* just in case so the caller
does not go off the end of the buffer */
- cFYI(1, "readlink result - %s", symlinkinfo);
+ cifs_dbg(FYI, "readlink result - %s\n", symlinkinfo);
}
qreparse_out:
@@ -3198,7 +3190,10 @@ static void cifs_convert_ace(posix_acl_xattr_entry *ace,
ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag);
ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
- /* cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */
+/*
+ cifs_dbg(FYI, "perm %d tag %d id %d\n",
+ ace->e_perm, ace->e_tag, ace->e_id);
+*/
return;
}
@@ -3224,8 +3219,8 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
size += sizeof(struct cifs_posix_ace) * count;
/* check if we would go beyond end of SMB */
if (size_of_data_area < size) {
- cFYI(1, "bad CIFS POSIX ACL size %d vs. %d",
- size_of_data_area, size);
+ cifs_dbg(FYI, "bad CIFS POSIX ACL size %d vs. %d\n",
+ size_of_data_area, size);
return -EINVAL;
}
} else if (acl_type & ACL_TYPE_DEFAULT) {
@@ -3272,7 +3267,10 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
cifs_ace->cifs_uid = cpu_to_le64(-1);
} else
cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
- /*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/
+/*
+ cifs_dbg(FYI, "perm %d tag %d id %d\n",
+ ace->e_perm, ace->e_tag, ace->e_id);
+*/
return rc;
}
@@ -3290,12 +3288,11 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
return 0;
count = posix_acl_xattr_count((size_t)buflen);
- cFYI(1, "setting acl with %d entries from buf of length %d and "
- "version of %d",
- count, buflen, le32_to_cpu(local_acl->a_version));
+ cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n",
+ count, buflen, le32_to_cpu(local_acl->a_version));
if (le32_to_cpu(local_acl->a_version) != 2) {
- cFYI(1, "unknown POSIX ACL version %d",
- le32_to_cpu(local_acl->a_version));
+ cifs_dbg(FYI, "unknown POSIX ACL version %d\n",
+ le32_to_cpu(local_acl->a_version));
return 0;
}
cifs_acl->version = cpu_to_le16(1);
@@ -3304,7 +3301,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
else if (acl_type == ACL_TYPE_DEFAULT)
cifs_acl->default_entry_count = cpu_to_le16(count);
else {
- cFYI(1, "unknown ACL type %d", acl_type);
+ cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
return 0;
}
for (i = 0; i < count; i++) {
@@ -3337,7 +3334,7 @@ CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
int name_len;
__u16 params, byte_count;
- cFYI(1, "In GetPosixACL (Unix) for path %s", searchName);
+ cifs_dbg(FYI, "In GetPosixACL (Unix) for path %s\n", searchName);
queryAclRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -3390,7 +3387,7 @@ queryAclRetry:
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get);
if (rc) {
- cFYI(1, "Send error in Query POSIX ACL = %d", rc);
+ cifs_dbg(FYI, "Send error in Query POSIX ACL = %d\n", rc);
} else {
/* decode response */
@@ -3427,7 +3424,7 @@ CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
int bytes_returned = 0;
__u16 params, byte_count, data_count, param_offset, offset;
- cFYI(1, "In SetPosixACL (Unix) for path %s", fileName);
+ cifs_dbg(FYI, "In SetPosixACL (Unix) for path %s\n", fileName);
setAclRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -3482,7 +3479,7 @@ setAclRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "Set POSIX ACL returned %d", rc);
+ cifs_dbg(FYI, "Set POSIX ACL returned %d\n", rc);
setACLerrorExit:
cifs_buf_release(pSMB);
@@ -3502,7 +3499,7 @@ CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
int bytes_returned;
__u16 params, byte_count;
- cFYI(1, "In GetExtAttr");
+ cifs_dbg(FYI, "In GetExtAttr\n");
if (tcon == NULL)
return -ENODEV;
@@ -3541,7 +3538,7 @@ GetExtAttrRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "error %d in GetExtAttr", rc);
+ cifs_dbg(FYI, "error %d in GetExtAttr\n", rc);
} else {
/* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3556,7 +3553,7 @@ GetExtAttrRetry:
struct file_chattr_info *pfinfo;
/* BB Do we need a cast or hash here ? */
if (count != 16) {
- cFYI(1, "Illegal size ret in GetExtAttr");
+ cifs_dbg(FYI, "Illegal size ret in GetExtAttr\n");
rc = -EIO;
goto GetExtAttrOut;
}
@@ -3644,21 +3641,21 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
/* should we also check that parm and data areas do not overlap? */
if (*ppparm > end_of_smb) {
- cFYI(1, "parms start after end of smb");
+ cifs_dbg(FYI, "parms start after end of smb\n");
return -EINVAL;
} else if (parm_count + *ppparm > end_of_smb) {
- cFYI(1, "parm end after end of smb");
+ cifs_dbg(FYI, "parm end after end of smb\n");
return -EINVAL;
} else if (*ppdata > end_of_smb) {
- cFYI(1, "data starts after end of smb");
+ cifs_dbg(FYI, "data starts after end of smb\n");
return -EINVAL;
} else if (data_count + *ppdata > end_of_smb) {
- cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
- *ppdata, data_count, (data_count + *ppdata),
- end_of_smb, pSMBr);
+ cifs_dbg(FYI, "data %p + count %d (%p) past smb end %p start %p\n",
+ *ppdata, data_count, (data_count + *ppdata),
+ end_of_smb, pSMBr);
return -EINVAL;
} else if (parm_count + data_count > bcc) {
- cFYI(1, "parm count and data count larger than SMB");
+ cifs_dbg(FYI, "parm count and data count larger than SMB\n");
return -EINVAL;
}
*pdatalen = data_count;
@@ -3676,7 +3673,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
QUERY_SEC_DESC_REQ *pSMB;
struct kvec iov[1];
- cFYI(1, "GetCifsACL");
+ cifs_dbg(FYI, "GetCifsACL\n");
*pbuflen = 0;
*acl_inf = NULL;
@@ -3701,7 +3698,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get);
if (rc) {
- cFYI(1, "Send error in QuerySecDesc = %d", rc);
+ cifs_dbg(FYI, "Send error in QuerySecDesc = %d\n", rc);
} else { /* decode response */
__le32 *parm;
__u32 parm_len;
@@ -3716,7 +3713,8 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
goto qsec_out;
pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
- cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf);
+ cifs_dbg(FYI, "smb %p parm %p data %p\n",
+ pSMBr, parm, *acl_inf);
if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
rc = -EIO; /* bad smb */
@@ -3728,8 +3726,8 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
acl_len = le32_to_cpu(*parm);
if (acl_len != *pbuflen) {
- cERROR(1, "acl length %d does not match %d",
- acl_len, *pbuflen);
+ cifs_dbg(VFS, "acl length %d does not match %d\n",
+ acl_len, *pbuflen);
if (*pbuflen > acl_len)
*pbuflen = acl_len;
}
@@ -3738,16 +3736,15 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
header followed by the smallest SID */
if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
(*pbuflen >= 64 * 1024)) {
- cERROR(1, "bad acl length %d", *pbuflen);
+ cifs_dbg(VFS, "bad acl length %d\n", *pbuflen);
rc = -EINVAL;
*pbuflen = 0;
} else {
- *acl_inf = kmalloc(*pbuflen, GFP_KERNEL);
+ *acl_inf = kmemdup(pdata, *pbuflen, GFP_KERNEL);
if (*acl_inf == NULL) {
*pbuflen = 0;
rc = -ENOMEM;
}
- memcpy(*acl_inf, pdata, *pbuflen);
}
}
qsec_out:
@@ -3809,9 +3806,10 @@ setCifsAclRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
- cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc);
+ cifs_dbg(FYI, "SetCIFSACL bytes_returned: %d, rc: %d\n",
+ bytes_returned, rc);
if (rc)
- cFYI(1, "Set CIFS ACL returned %d", rc);
+ cifs_dbg(FYI, "Set CIFS ACL returned %d\n", rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -3835,7 +3833,7 @@ SMBQueryInformation(const unsigned int xid, struct cifs_tcon *tcon,
int bytes_returned;
int name_len;
- cFYI(1, "In SMBQPath path %s", search_name);
+ cifs_dbg(FYI, "In SMBQPath path %s\n", search_name);
QInfRetry:
rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -3862,7 +3860,7 @@ QInfRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QueryInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QueryInfo = %d\n", rc);
} else if (data) {
struct timespec ts;
__u32 time = le32_to_cpu(pSMBr->last_write_time);
@@ -3932,11 +3930,12 @@ QFileInfoRetry:
pSMB->Pad = 0;
pSMB->Fid = netfid;
inc_rfc1001_len(pSMB, byte_count);
+ pSMB->t2.ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QPathInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3973,7 +3972,7 @@ CIFSSMBQPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
int name_len;
__u16 params, byte_count;
- /* cFYI(1, "In QPathInfo path %s", search_name); */
+ /* cifs_dbg(FYI, "In QPathInfo path %s\n", search_name); */
QPathInfoRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -4023,7 +4022,7 @@ QPathInfoRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QPathInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4100,18 +4099,17 @@ UnixQFileInfoRetry:
pSMB->Pad = 0;
pSMB->Fid = netfid;
inc_rfc1001_len(pSMB, byte_count);
+ pSMB->t2.ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QPathInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
- cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response. "
- "Unix Extensions can be disabled on mount "
- "by specifying the nosfu mount option.");
+ cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n");
rc = -EIO; /* bad smb */
} else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4143,7 +4141,7 @@ CIFSSMBUnixQPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
int name_len;
__u16 params, byte_count;
- cFYI(1, "In QPathInfo (Unix) the path %s", searchName);
+ cifs_dbg(FYI, "In QPathInfo (Unix) the path %s\n", searchName);
UnixQPathInfoRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -4190,14 +4188,12 @@ UnixQPathInfoRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QPathInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
- cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response. "
- "Unix Extensions can be disabled on mount "
- "by specifying the nosfu mount option.");
+ cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n");
rc = -EIO; /* bad smb */
} else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4231,7 +4227,7 @@ CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon,
__u16 params, byte_count;
struct nls_table *nls_codepage;
- cFYI(1, "In FindFirst for %s", searchName);
+ cifs_dbg(FYI, "In FindFirst for %s\n", searchName);
findFirstRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -4314,7 +4310,7 @@ findFirstRetry:
if (rc) {/* BB add logic to retry regular search if Unix search
rejected unexpectedly by server */
/* BB Add code to handle unsupported level rc */
- cFYI(1, "Error in FindFirst = %d", rc);
+ cifs_dbg(FYI, "Error in FindFirst = %d\n", rc);
cifs_buf_release(pSMB);
@@ -4352,7 +4348,7 @@ findFirstRetry:
psrch_inf->entries_in_buffer;
lnoff = le16_to_cpu(parms->LastNameOffset);
if (CIFSMaxBufSize < lnoff) {
- cERROR(1, "ignoring corrupt resume name");
+ cifs_dbg(VFS, "ignoring corrupt resume name\n");
psrch_inf->last_entry = NULL;
return rc;
}
@@ -4383,7 +4379,7 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int name_len;
__u16 params, byte_count;
- cFYI(1, "In FindNext");
+ cifs_dbg(FYI, "In FindNext\n");
if (psrch_inf->endOfSearch)
return -ENOENT;
@@ -4444,7 +4440,7 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon,
cifs_buf_release(pSMB);
rc = 0; /* search probably was closed at end of search*/
} else
- cFYI(1, "FindNext returned = %d", rc);
+ cifs_dbg(FYI, "FindNext returned = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4479,15 +4475,15 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon,
psrch_inf->entries_in_buffer;
lnoff = le16_to_cpu(parms->LastNameOffset);
if (CIFSMaxBufSize < lnoff) {
- cERROR(1, "ignoring corrupt resume name");
+ cifs_dbg(VFS, "ignoring corrupt resume name\n");
psrch_inf->last_entry = NULL;
return rc;
} else
psrch_inf->last_entry =
psrch_inf->srch_entries_start + lnoff;
-/* cFYI(1, "fnxt2 entries in buf %d index_of_last %d",
- psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
+/* cifs_dbg(FYI, "fnxt2 entries in buf %d index_of_last %d\n",
+ psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
/* BB fixme add unlock here */
}
@@ -4512,7 +4508,7 @@ CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
FINDCLOSE_REQ *pSMB = NULL;
- cFYI(1, "In CIFSSMBFindClose");
+ cifs_dbg(FYI, "In CIFSSMBFindClose\n");
rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
/* no sense returning error if session restarted
@@ -4526,7 +4522,7 @@ CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon,
pSMB->ByteCount = 0;
rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc)
- cERROR(1, "Send error in FindClose = %d", rc);
+ cifs_dbg(VFS, "Send error in FindClose = %d\n", rc);
cifs_stats_inc(&tcon->stats.cifs_stats.num_fclose);
@@ -4548,7 +4544,7 @@ CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon,
int name_len, bytes_returned;
__u16 params, byte_count;
- cFYI(1, "In GetSrvInodeNum for %s", search_name);
+ cifs_dbg(FYI, "In GetSrvInodeNum for %s\n", search_name);
if (tcon == NULL)
return -ENODEV;
@@ -4599,7 +4595,7 @@ GetInodeNumberRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "error %d in QueryInternalInfo", rc);
+ cifs_dbg(FYI, "error %d in QueryInternalInfo\n", rc);
} else {
/* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4614,7 +4610,7 @@ GetInodeNumberRetry:
struct file_internal_info *pfinfo;
/* BB Do we need a cast or hash here ? */
if (count < 8) {
- cFYI(1, "Illegal size ret in QryIntrnlInf");
+ cifs_dbg(FYI, "Illegal size ret in QryIntrnlInf\n");
rc = -EIO;
goto GetInodeNumOut;
}
@@ -4655,16 +4651,16 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
*num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
if (*num_of_nodes < 1) {
- cERROR(1, "num_referrals: must be at least > 0,"
- "but we get num_referrals = %d", *num_of_nodes);
+ cifs_dbg(VFS, "num_referrals: must be at least > 0, but we get num_referrals = %d\n",
+ *num_of_nodes);
rc = -EINVAL;
goto parse_DFS_referrals_exit;
}
ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
if (ref->VersionNumber != cpu_to_le16(3)) {
- cERROR(1, "Referrals of V%d version are not supported,"
- "should be V3", le16_to_cpu(ref->VersionNumber));
+ cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n",
+ le16_to_cpu(ref->VersionNumber));
rc = -EINVAL;
goto parse_DFS_referrals_exit;
}
@@ -4673,14 +4669,12 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
data_end = (char *)(&(pSMBr->PathConsumed)) +
le16_to_cpu(pSMBr->t2.DataCount);
- cFYI(1, "num_referrals: %d dfs flags: 0x%x ...",
- *num_of_nodes,
- le32_to_cpu(pSMBr->DFSFlags));
+ cifs_dbg(FYI, "num_referrals: %d dfs flags: 0x%x ...\n",
+ *num_of_nodes, le32_to_cpu(pSMBr->DFSFlags));
- *target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
- *num_of_nodes, GFP_KERNEL);
+ *target_nodes = kcalloc(*num_of_nodes, sizeof(struct dfs_info3_param),
+ GFP_KERNEL);
if (*target_nodes == NULL) {
- cERROR(1, "Failed to allocate buffer for target_nodes");
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
}
@@ -4759,7 +4753,7 @@ CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses,
*num_of_nodes = 0;
*target_nodes = NULL;
- cFYI(1, "In GetDFSRefer the path %s", search_name);
+ cifs_dbg(FYI, "In GetDFSRefer the path %s\n", search_name);
if (ses == NULL)
return -ENODEV;
getDFSRetry:
@@ -4792,11 +4786,8 @@ getDFSRetry:
strncpy(pSMB->RequestFileName, search_name, name_len);
}
- if (ses->server) {
- if (ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
- pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
- }
+ if (ses->server && ses->server->sign)
+ pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
pSMB->hdr.Uid = ses->Suid;
@@ -4827,7 +4818,7 @@ getDFSRetry:
rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in GetDFSRefer = %d", rc);
+ cifs_dbg(FYI, "Send error in GetDFSRefer = %d\n", rc);
goto GetDFSRefExit;
}
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4838,9 +4829,8 @@ getDFSRetry:
goto GetDFSRefExit;
}
- cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d",
- get_bcc(&pSMBr->hdr),
- le16_to_cpu(pSMBr->t2.DataOffset));
+ cifs_dbg(FYI, "Decoding GetDFSRefer response BCC: %d Offset %d\n",
+ get_bcc(&pSMBr->hdr), le16_to_cpu(pSMBr->t2.DataOffset));
/* parse returned result into more usable form */
rc = parse_DFS_referrals(pSMBr, num_of_nodes,
@@ -4869,7 +4859,7 @@ SMBOldQFSInfo(const unsigned int xid, struct cifs_tcon *tcon,
int bytes_returned = 0;
__u16 params, byte_count;
- cFYI(1, "OldQFSInfo");
+ cifs_dbg(FYI, "OldQFSInfo\n");
oldQFSInfoRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -4902,7 +4892,7 @@ oldQFSInfoRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QFSInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4910,7 +4900,7 @@ oldQFSInfoRetry:
rc = -EIO; /* bad smb */
else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
- cFYI(1, "qfsinf resp BCC: %d Offset %d",
+ cifs_dbg(FYI, "qfsinf resp BCC: %d Offset %d\n",
get_bcc(&pSMBr->hdr), data_offset);
response_data = (FILE_SYSTEM_ALLOC_INFO *)
@@ -4923,10 +4913,10 @@ oldQFSInfoRetry:
le32_to_cpu(response_data->TotalAllocationUnits);
FSData->f_bfree = FSData->f_bavail =
le32_to_cpu(response_data->FreeAllocationUnits);
- cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
- (unsigned long long)FSData->f_blocks,
- (unsigned long long)FSData->f_bfree,
- FSData->f_bsize);
+ cifs_dbg(FYI, "Blocks: %lld Free: %lld Block size %ld\n",
+ (unsigned long long)FSData->f_blocks,
+ (unsigned long long)FSData->f_bfree,
+ FSData->f_bsize);
}
}
cifs_buf_release(pSMB);
@@ -4949,7 +4939,7 @@ CIFSSMBQFSInfo(const unsigned int xid, struct cifs_tcon *tcon,
int bytes_returned = 0;
__u16 params, byte_count;
- cFYI(1, "In QFSInfo");
+ cifs_dbg(FYI, "In QFSInfo\n");
QFSInfoRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -4982,7 +4972,7 @@ QFSInfoRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QFSInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -5003,10 +4993,10 @@ QFSInfoRetry:
le64_to_cpu(response_data->TotalAllocationUnits);
FSData->f_bfree = FSData->f_bavail =
le64_to_cpu(response_data->FreeAllocationUnits);
- cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
- (unsigned long long)FSData->f_blocks,
- (unsigned long long)FSData->f_bfree,
- FSData->f_bsize);
+ cifs_dbg(FYI, "Blocks: %lld Free: %lld Block size %ld\n",
+ (unsigned long long)FSData->f_blocks,
+ (unsigned long long)FSData->f_bfree,
+ FSData->f_bsize);
}
}
cifs_buf_release(pSMB);
@@ -5028,7 +5018,7 @@ CIFSSMBQFSAttributeInfo(const unsigned int xid, struct cifs_tcon *tcon)
int bytes_returned = 0;
__u16 params, byte_count;
- cFYI(1, "In QFSAttributeInfo");
+ cifs_dbg(FYI, "In QFSAttributeInfo\n");
QFSAttributeRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -5062,7 +5052,7 @@ QFSAttributeRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cERROR(1, "Send error in QFSAttributeInfo = %d", rc);
+ cifs_dbg(VFS, "Send error in QFSAttributeInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -5098,7 +5088,7 @@ CIFSSMBQFSDeviceInfo(const unsigned int xid, struct cifs_tcon *tcon)
int bytes_returned = 0;
__u16 params, byte_count;
- cFYI(1, "In QFSDeviceInfo");
+ cifs_dbg(FYI, "In QFSDeviceInfo\n");
QFSDeviceRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -5133,7 +5123,7 @@ QFSDeviceRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QFSDeviceInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QFSDeviceInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -5169,7 +5159,7 @@ CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon)
int bytes_returned = 0;
__u16 params, byte_count;
- cFYI(1, "In QFSUnixInfo");
+ cifs_dbg(FYI, "In QFSUnixInfo\n");
QFSUnixRetry:
rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
(void **) &pSMB, (void **) &pSMBr);
@@ -5203,7 +5193,7 @@ QFSUnixRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cERROR(1, "Send error in QFSUnixInfo = %d", rc);
+ cifs_dbg(VFS, "Send error in QFSUnixInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -5238,7 +5228,7 @@ CIFSSMBSetFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon, __u64 cap)
int bytes_returned = 0;
__u16 params, param_offset, offset, byte_count;
- cFYI(1, "In SETFSUnixInfo");
+ cifs_dbg(FYI, "In SETFSUnixInfo\n");
SETFSUnixRetry:
/* BB switch to small buf init to save memory */
rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
@@ -5286,7 +5276,7 @@ SETFSUnixRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cERROR(1, "Send error in SETFSUnixInfo = %d", rc);
+ cifs_dbg(VFS, "Send error in SETFSUnixInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
if (rc)
@@ -5314,7 +5304,7 @@ CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon,
int bytes_returned = 0;
__u16 params, byte_count;
- cFYI(1, "In QFSPosixInfo");
+ cifs_dbg(FYI, "In QFSPosixInfo\n");
QFSPosixRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -5348,7 +5338,7 @@ QFSPosixRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QFSUnixInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QFSUnixInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -5410,7 +5400,7 @@ CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
__u16 params, byte_count, data_count, param_offset, offset;
- cFYI(1, "In SetEOF");
+ cifs_dbg(FYI, "In SetEOF\n");
SetEOFRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -5476,7 +5466,7 @@ SetEOFRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "SetPathInfo (file size) returned %d", rc);
+ cifs_dbg(FYI, "SetPathInfo (file size) returned %d\n", rc);
cifs_buf_release(pSMB);
@@ -5495,8 +5485,8 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
__u16 params, param_offset, offset, byte_count, count;
- cFYI(1, "SetFileSize (via SetFileInfo) %lld",
- (long long)size);
+ cifs_dbg(FYI, "SetFileSize (via SetFileInfo) %lld\n",
+ (long long)size);
rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
if (rc)
@@ -5553,7 +5543,8 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc) {
- cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc);
+ cifs_dbg(FYI, "Send error in SetFileInfo (SetFileSize) = %d\n",
+ rc);
}
/* Note: On -EAGAIN error only caller can retry on handle based calls
@@ -5577,7 +5568,7 @@ CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
__u16 params, param_offset, offset, byte_count, count;
- cFYI(1, "Set Times (via SetFileInfo)");
+ cifs_dbg(FYI, "Set Times (via SetFileInfo)\n");
rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
if (rc)
@@ -5623,7 +5614,8 @@ CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc)
- cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
+ cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n",
+ rc);
/* Note: On -EAGAIN error only caller can retry on handle based calls
since file handle passed in no longer valid */
@@ -5640,7 +5632,7 @@ CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
__u16 params, param_offset, offset, byte_count, count;
- cFYI(1, "Set File Disposition (via SetFileInfo)");
+ cifs_dbg(FYI, "Set File Disposition (via SetFileInfo)\n");
rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
if (rc)
@@ -5682,7 +5674,7 @@ CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon,
*data_offset = delete_file ? 1 : 0;
rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc)
- cFYI(1, "Send error in SetFileDisposition = %d", rc);
+ cifs_dbg(FYI, "Send error in SetFileDisposition = %d\n", rc);
return rc;
}
@@ -5700,7 +5692,7 @@ CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
char *data_offset;
__u16 params, param_offset, offset, byte_count, count;
- cFYI(1, "In SetTimes");
+ cifs_dbg(FYI, "In SetTimes\n");
SetTimesRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -5756,7 +5748,7 @@ SetTimesRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "SetPathInfo (times) returned %d", rc);
+ cifs_dbg(FYI, "SetPathInfo (times) returned %d\n", rc);
cifs_buf_release(pSMB);
@@ -5781,7 +5773,7 @@ CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, char *fileName,
int bytes_returned;
int name_len;
- cFYI(1, "In SetAttrLegacy");
+ cifs_dbg(FYI, "In SetAttrLegacy\n");
SetAttrLgcyRetry:
rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
@@ -5807,7 +5799,7 @@ SetAttrLgcyRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "Error in LegacySetAttr = %d", rc);
+ cifs_dbg(FYI, "Error in LegacySetAttr = %d\n", rc);
cifs_buf_release(pSMB);
@@ -5875,7 +5867,7 @@ CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
u16 params, param_offset, offset, byte_count, count;
- cFYI(1, "Set Unix Info (via SetFileInfo)");
+ cifs_dbg(FYI, "Set Unix Info (via SetFileInfo)\n");
rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
if (rc)
@@ -5921,7 +5913,8 @@ CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc)
- cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
+ cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n",
+ rc);
/* Note: On -EAGAIN error only caller can retry on handle based calls
since file handle passed in no longer valid */
@@ -5943,7 +5936,7 @@ CIFSSMBUnixSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
FILE_UNIX_BASIC_INFO *data_offset;
__u16 params, param_offset, offset, count, byte_count;
- cFYI(1, "In SetUID/GID/Mode");
+ cifs_dbg(FYI, "In SetUID/GID/Mode\n");
setPermsRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -5999,7 +5992,7 @@ setPermsRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "SetPathInfo (perms) returned %d", rc);
+ cifs_dbg(FYI, "SetPathInfo (perms) returned %d\n", rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -6036,7 +6029,7 @@ CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon,
__u16 params, byte_count, data_offset;
unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0;
- cFYI(1, "In Query All EAs path %s", searchName);
+ cifs_dbg(FYI, "In Query All EAs path %s\n", searchName);
QAllEAsRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -6083,7 +6076,7 @@ QAllEAsRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QueryAllEAs = %d", rc);
+ cifs_dbg(FYI, "Send error in QueryAllEAs = %d\n", rc);
goto QAllEAsOut;
}
@@ -6111,16 +6104,16 @@ QAllEAsRetry:
(((char *) &pSMBr->hdr.Protocol) + data_offset);
list_len = le32_to_cpu(ea_response_data->list_len);
- cFYI(1, "ea length %d", list_len);
+ cifs_dbg(FYI, "ea length %d\n", list_len);
if (list_len <= 8) {
- cFYI(1, "empty EA list returned from server");
+ cifs_dbg(FYI, "empty EA list returned from server\n");
goto QAllEAsOut;
}
/* make sure list_len doesn't go past end of SMB */
end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr);
if ((char *)ea_response_data + list_len > end_of_smb) {
- cFYI(1, "EA list appears to go beyond SMB");
+ cifs_dbg(FYI, "EA list appears to go beyond SMB\n");
rc = -EIO;
goto QAllEAsOut;
}
@@ -6137,7 +6130,7 @@ QAllEAsRetry:
temp_ptr += 4;
/* make sure we can read name_len and value_len */
if (list_len < 0) {
- cFYI(1, "EA entry goes beyond length of list");
+ cifs_dbg(FYI, "EA entry goes beyond length of list\n");
rc = -EIO;
goto QAllEAsOut;
}
@@ -6146,7 +6139,7 @@ QAllEAsRetry:
value_len = le16_to_cpu(temp_fea->value_len);
list_len -= name_len + 1 + value_len;
if (list_len < 0) {
- cFYI(1, "EA entry goes beyond length of list");
+ cifs_dbg(FYI, "EA entry goes beyond length of list\n");
rc = -EIO;
goto QAllEAsOut;
}
@@ -6214,7 +6207,7 @@ CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
int bytes_returned = 0;
__u16 params, param_offset, byte_count, offset, count;
- cFYI(1, "In SetEA");
+ cifs_dbg(FYI, "In SetEA\n");
SetEARetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -6296,7 +6289,7 @@ SetEARetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "SetPathInfo (EA) returned %d", rc);
+ cifs_dbg(FYI, "SetPathInfo (EA) returned %d\n", rc);
cifs_buf_release(pSMB);
@@ -6339,7 +6332,7 @@ int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon,
struct dir_notify_req *dnotify_req;
int bytes_returned;
- cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
+ cifs_dbg(FYI, "In CIFSSMBNotify for file handle %d\n", (int)netfid);
rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
(void **) &pSMBr);
if (rc)
@@ -6368,7 +6361,7 @@ int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon,
(struct smb_hdr *)pSMBr, &bytes_returned,
CIFS_ASYNC_OP);
if (rc) {
- cFYI(1, "Error in Notify = %d", rc);
+ cifs_dbg(FYI, "Error in Notify = %d\n", rc);
} else {
/* Add file to outstanding requests */
/* BB change to kmem cache alloc */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 21b3a291c327..afcb8a1a33b7 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -85,7 +85,7 @@ enum {
Opt_acl, Opt_noacl, Opt_locallease,
Opt_sign, Opt_seal, Opt_noac,
Opt_fsc, Opt_mfsymlinks,
- Opt_multiuser, Opt_sloppy,
+ Opt_multiuser, Opt_sloppy, Opt_nosharesock,
/* Mount options which take numeric value */
Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -95,9 +95,7 @@ enum {
/* Mount options which take string value */
Opt_user, Opt_pass, Opt_ip,
- Opt_unc, Opt_domain,
- Opt_srcaddr, Opt_prefixpath,
- Opt_iocharset,
+ Opt_domain, Opt_srcaddr, Opt_iocharset,
Opt_netbiosname, Opt_servern,
Opt_ver, Opt_vers, Opt_sec, Opt_cache,
@@ -167,6 +165,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_mfsymlinks, "mfsymlinks" },
{ Opt_multiuser, "multiuser" },
{ Opt_sloppy, "sloppy" },
+ { Opt_nosharesock, "nosharesock" },
{ Opt_backupuid, "backupuid=%s" },
{ Opt_backupgid, "backupgid=%s" },
@@ -193,14 +192,14 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_blank_ip, "addr=" },
{ Opt_ip, "ip=%s" },
{ Opt_ip, "addr=%s" },
- { Opt_unc, "unc=%s" },
- { Opt_unc, "target=%s" },
- { Opt_unc, "path=%s" },
+ { Opt_ignore, "unc=%s" },
+ { Opt_ignore, "target=%s" },
+ { Opt_ignore, "path=%s" },
{ Opt_domain, "dom=%s" },
{ Opt_domain, "domain=%s" },
{ Opt_domain, "workgroup=%s" },
{ Opt_srcaddr, "srcaddr=%s" },
- { Opt_prefixpath, "prefixpath=%s" },
+ { Opt_ignore, "prefixpath=%s" },
{ Opt_iocharset, "iocharset=%s" },
{ Opt_netbiosname, "netbiosname=%s" },
{ Opt_servern, "servern=%s" },
@@ -277,6 +276,7 @@ static const match_table_t cifs_smb_version_tokens = {
{ Smb_20, SMB20_VERSION_STRING},
{ Smb_21, SMB21_VERSION_STRING },
{ Smb_30, SMB30_VERSION_STRING },
+ { Smb_302, SMB302_VERSION_STRING },
};
static int ip_connect(struct TCP_Server_Info *server);
@@ -318,11 +318,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
server->max_read = 0;
#endif
- cFYI(1, "Reconnecting tcp session");
+ cifs_dbg(FYI, "Reconnecting tcp session\n");
/* before reconnecting the tcp session, mark the smb session (uid)
and the tid bad so they are not used until reconnected */
- cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
+ cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect\n",
+ __func__);
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &server->smb_ses_list) {
ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
@@ -336,15 +337,14 @@ cifs_reconnect(struct TCP_Server_Info *server)
spin_unlock(&cifs_tcp_ses_lock);
/* do not want to be sending data on a socket we are freeing */
- cFYI(1, "%s: tearing down socket", __func__);
+ cifs_dbg(FYI, "%s: tearing down socket\n", __func__);
mutex_lock(&server->srv_mutex);
if (server->ssocket) {
- cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
- server->ssocket->flags);
+ cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n",
+ server->ssocket->state, server->ssocket->flags);
kernel_sock_shutdown(server->ssocket, SHUT_WR);
- cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx",
- server->ssocket->state,
- server->ssocket->flags);
+ cifs_dbg(FYI, "Post shutdown state: 0x%x Flags: 0x%lx\n",
+ server->ssocket->state, server->ssocket->flags);
sock_release(server->ssocket);
server->ssocket = NULL;
}
@@ -358,7 +358,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
/* mark submitted MIDs for retry and issue callback */
INIT_LIST_HEAD(&retry_list);
- cFYI(1, "%s: moving mids to private list", __func__);
+ cifs_dbg(FYI, "%s: moving mids to private list\n", __func__);
spin_lock(&GlobalMid_Lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
@@ -368,7 +368,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
}
spin_unlock(&GlobalMid_Lock);
- cFYI(1, "%s: issuing mid callbacks", __func__);
+ cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
list_for_each_safe(tmp, tmp2, &retry_list) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
list_del_init(&mid_entry->qhead);
@@ -381,7 +381,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
/* we should try only the port we connected to before */
rc = generic_ip_connect(server);
if (rc) {
- cFYI(1, "reconnect error %d", rc);
+ cifs_dbg(FYI, "reconnect error %d\n", rc);
msleep(3000);
} else {
atomic_inc(&tcpSesReconnectCount);
@@ -415,8 +415,8 @@ cifs_echo_request(struct work_struct *work)
rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS;
if (rc)
- cFYI(1, "Unable to send echo request to server: %s",
- server->hostname);
+ cifs_dbg(FYI, "Unable to send echo request to server: %s\n",
+ server->hostname);
requeue_echo:
queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL);
@@ -428,7 +428,7 @@ allocate_buffers(struct TCP_Server_Info *server)
if (!server->bigbuf) {
server->bigbuf = (char *)cifs_buf_get();
if (!server->bigbuf) {
- cERROR(1, "No memory for large SMB response");
+ cifs_dbg(VFS, "No memory for large SMB response\n");
msleep(3000);
/* retry will check if exiting */
return false;
@@ -441,7 +441,7 @@ allocate_buffers(struct TCP_Server_Info *server)
if (!server->smallbuf) {
server->smallbuf = (char *)cifs_small_buf_get();
if (!server->smallbuf) {
- cERROR(1, "No memory for SMB response");
+ cifs_dbg(VFS, "No memory for SMB response\n");
msleep(1000);
/* retry will check if exiting */
return false;
@@ -471,9 +471,8 @@ server_unresponsive(struct TCP_Server_Info *server)
*/
if (server->tcpStatus == CifsGood &&
time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) {
- cERROR(1, "Server %s has not responded in %d seconds. "
- "Reconnecting...", server->hostname,
- (2 * SMB_ECHO_INTERVAL) / HZ);
+ cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n",
+ server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ);
cifs_reconnect(server);
wake_up(&server->response_q);
return true;
@@ -584,8 +583,8 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
length = 0;
continue;
} else if (length <= 0) {
- cFYI(1, "Received no data or error: expecting %d "
- "got %d", to_read, length);
+ cifs_dbg(FYI, "Received no data or error: expecting %d\n"
+ "got %d", to_read, length);
cifs_reconnect(server);
total_read = -EAGAIN;
break;
@@ -619,17 +618,17 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
/* Regular SMB response */
return true;
case RFC1002_SESSION_KEEP_ALIVE:
- cFYI(1, "RFC 1002 session keep alive");
+ cifs_dbg(FYI, "RFC 1002 session keep alive\n");
break;
case RFC1002_POSITIVE_SESSION_RESPONSE:
- cFYI(1, "RFC 1002 positive session response");
+ cifs_dbg(FYI, "RFC 1002 positive session response\n");
break;
case RFC1002_NEGATIVE_SESSION_RESPONSE:
/*
* We get this from Windows 98 instead of an error on
* SMB negprot response.
*/
- cFYI(1, "RFC 1002 negative session response");
+ cifs_dbg(FYI, "RFC 1002 negative session response\n");
/* give server a second to clean up */
msleep(1000);
/*
@@ -643,7 +642,7 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
wake_up(&server->response_q);
break;
default:
- cERROR(1, "RFC 1002 unknown response type 0x%x", type);
+ cifs_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type);
cifs_reconnect(server);
}
@@ -729,7 +728,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
spin_lock(&GlobalMid_Lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- cFYI(1, "Clearing mid 0x%llx", mid_entry->mid);
+ cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid);
mid_entry->mid_state = MID_SHUTDOWN;
list_move(&mid_entry->qhead, &dispose_list);
}
@@ -738,7 +737,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
/* now walk dispose list and issue callbacks */
list_for_each_safe(tmp, tmp2, &dispose_list) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- cFYI(1, "Callback mid 0x%llx", mid_entry->mid);
+ cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid);
list_del_init(&mid_entry->qhead);
mid_entry->callback(mid_entry);
}
@@ -755,7 +754,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
* least 45 seconds before giving up on a request getting a
* response and going ahead and killing cifsd.
*/
- cFYI(1, "Wait for exit from demultiplex thread");
+ cifs_dbg(FYI, "Wait for exit from demultiplex thread\n");
msleep(46000);
/*
* If threads still have not exited they are probably never
@@ -782,8 +781,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
/* make sure this will fit in a large buffer */
if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - 4) {
- cERROR(1, "SMB response too long (%u bytes)",
- pdu_length);
+ cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
cifs_reconnect(server);
wake_up(&server->response_q);
return -EAGAIN;
@@ -841,7 +839,7 @@ cifs_demultiplex_thread(void *p)
struct mid_q_entry *mid_entry;
current->flags |= PF_MEMALLOC;
- cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
+ cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current));
length = atomic_inc_return(&tcpSesAllocCount);
if (length > 1)
@@ -871,14 +869,14 @@ cifs_demultiplex_thread(void *p)
*/
pdu_length = get_rfc1002_length(buf);
- cFYI(1, "RFC1002 header 0x%x", pdu_length);
+ cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length);
if (!is_smb_response(server, buf[0]))
continue;
/* make sure we have enough to get to the MID */
if (pdu_length < HEADER_SIZE(server) - 1 - 4) {
- cERROR(1, "SMB response too short (%u bytes)",
- pdu_length);
+ cifs_dbg(VFS, "SMB response too short (%u bytes)\n",
+ pdu_length);
cifs_reconnect(server);
wake_up(&server->response_q);
continue;
@@ -910,8 +908,8 @@ cifs_demultiplex_thread(void *p)
mid_entry->callback(mid_entry);
} else if (!server->ops->is_oplock_break ||
!server->ops->is_oplock_break(buf, server)) {
- cERROR(1, "No task to wake, unknown frame received! "
- "NumMids %d", atomic_read(&midCount));
+ cifs_dbg(VFS, "No task to wake, unknown frame received! NumMids %d\n",
+ atomic_read(&midCount));
cifs_dump_mem("Received Data is: ", buf,
HEADER_SIZE(server));
#ifdef CONFIG_CIFS_DEBUG2
@@ -1028,46 +1026,51 @@ static int cifs_parse_security_flavors(char *value,
substring_t args[MAX_OPT_ARGS];
+ /*
+ * With mount options, the last one should win. Reset any existing
+ * settings back to default.
+ */
+ vol->sectype = Unspecified;
+ vol->sign = false;
+
switch (match_token(value, cifs_secflavor_tokens, args)) {
- case Opt_sec_krb5:
- vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_SIGN;
- break;
- case Opt_sec_krb5i:
- vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN;
- break;
case Opt_sec_krb5p:
- /* vol->secFlg |= CIFSSEC_MUST_SEAL | CIFSSEC_MAY_KRB5; */
- cERROR(1, "Krb5 cifs privacy not supported");
- break;
- case Opt_sec_ntlmssp:
- vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
+ cifs_dbg(VFS, "sec=krb5p is not supported!\n");
+ return 1;
+ case Opt_sec_krb5i:
+ vol->sign = true;
+ /* Fallthrough */
+ case Opt_sec_krb5:
+ vol->sectype = Kerberos;
break;
case Opt_sec_ntlmsspi:
- vol->secFlg |= CIFSSEC_MAY_NTLMSSP | CIFSSEC_MUST_SIGN;
- break;
- case Opt_ntlm:
- /* ntlm is default so can be turned off too */
- vol->secFlg |= CIFSSEC_MAY_NTLM;
+ vol->sign = true;
+ /* Fallthrough */
+ case Opt_sec_ntlmssp:
+ vol->sectype = RawNTLMSSP;
break;
case Opt_sec_ntlmi:
- vol->secFlg |= CIFSSEC_MAY_NTLM | CIFSSEC_MUST_SIGN;
- break;
- case Opt_sec_ntlmv2:
- vol->secFlg |= CIFSSEC_MAY_NTLMV2;
+ vol->sign = true;
+ /* Fallthrough */
+ case Opt_ntlm:
+ vol->sectype = NTLM;
break;
case Opt_sec_ntlmv2i:
- vol->secFlg |= CIFSSEC_MAY_NTLMV2 | CIFSSEC_MUST_SIGN;
+ vol->sign = true;
+ /* Fallthrough */
+ case Opt_sec_ntlmv2:
+ vol->sectype = NTLMv2;
break;
#ifdef CONFIG_CIFS_WEAK_PW_HASH
case Opt_sec_lanman:
- vol->secFlg |= CIFSSEC_MAY_LANMAN;
+ vol->sectype = LANMAN;
break;
#endif
case Opt_sec_none:
vol->nullauth = 1;
break;
default:
- cERROR(1, "bad security option: %s", value);
+ cifs_dbg(VFS, "bad security option: %s\n", value);
return 1;
}
@@ -1093,7 +1096,7 @@ cifs_parse_cache_flavor(char *value, struct smb_vol *vol)
vol->strict_io = false;
break;
default:
- cERROR(1, "bad cache= option: %s", value);
+ cifs_dbg(VFS, "bad cache= option: %s\n", value);
return 1;
}
return 0;
@@ -1122,9 +1125,13 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
vol->ops = &smb30_operations;
vol->vals = &smb30_values;
break;
+ case Smb_302:
+ vol->ops = &smb30_operations; /* currently identical with 3.0 */
+ vol->vals = &smb302_values;
+ break;
#endif
default:
- cERROR(1, "Unknown vers= option specified: %s", value);
+ cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
return 1;
}
return 0;
@@ -1255,20 +1262,24 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
separator[0] = options[4];
options += 5;
} else {
- cFYI(1, "Null separator not allowed");
+ cifs_dbg(FYI, "Null separator not allowed\n");
}
}
vol->backupuid_specified = false; /* no backup intent for a user */
vol->backupgid_specified = false; /* no backup intent for a group */
- /*
- * For now, we ignore -EINVAL errors under the assumption that the
- * unc= and prefixpath= options will be usable.
- */
- if (cifs_parse_devname(devname, vol) == -ENOMEM) {
- printk(KERN_ERR "CIFS: Unable to allocate memory to parse "
- "device string.\n");
- goto out_nomem;
+ switch (cifs_parse_devname(devname, vol)) {
+ case 0:
+ break;
+ case -ENOMEM:
+ cifs_dbg(VFS, "Unable to allocate memory for devname.\n");
+ goto cifs_parse_mount_err;
+ case -EINVAL:
+ cifs_dbg(VFS, "Malformed UNC in devname.\n");
+ goto cifs_parse_mount_err;
+ default:
+ cifs_dbg(VFS, "Unknown error parsing devname.\n");
+ goto cifs_parse_mount_err;
}
while ((data = strsep(&options, separator)) != NULL) {
@@ -1423,7 +1434,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->local_lease = 1;
break;
case Opt_sign:
- vol->secFlg |= CIFSSEC_MUST_SIGN;
+ vol->sign = true;
break;
case Opt_seal:
/* we do not do the following in secFlags because seal
@@ -1440,8 +1451,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
break;
case Opt_fsc:
#ifndef CONFIG_CIFS_FSCACHE
- cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE "
- "kernel config option set");
+ cifs_dbg(VFS, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n");
goto cifs_parse_mount_err;
#endif
vol->fsc = true;
@@ -1455,59 +1465,62 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_sloppy:
sloppy = true;
break;
+ case Opt_nosharesock:
+ vol->nosharesock = true;
+ break;
/* Numeric Values */
case Opt_backupuid:
if (get_option_uid(args, &vol->backupuid)) {
- cERROR(1, "%s: Invalid backupuid value",
- __func__);
+ cifs_dbg(VFS, "%s: Invalid backupuid value\n",
+ __func__);
goto cifs_parse_mount_err;
}
vol->backupuid_specified = true;
break;
case Opt_backupgid:
if (get_option_gid(args, &vol->backupgid)) {
- cERROR(1, "%s: Invalid backupgid value",
- __func__);
+ cifs_dbg(VFS, "%s: Invalid backupgid value\n",
+ __func__);
goto cifs_parse_mount_err;
}
vol->backupgid_specified = true;
break;
case Opt_uid:
if (get_option_uid(args, &vol->linux_uid)) {
- cERROR(1, "%s: Invalid uid value",
- __func__);
+ cifs_dbg(VFS, "%s: Invalid uid value\n",
+ __func__);
goto cifs_parse_mount_err;
}
uid_specified = true;
break;
case Opt_cruid:
if (get_option_uid(args, &vol->cred_uid)) {
- cERROR(1, "%s: Invalid cruid value",
- __func__);
+ cifs_dbg(VFS, "%s: Invalid cruid value\n",
+ __func__);
goto cifs_parse_mount_err;
}
break;
case Opt_gid:
if (get_option_gid(args, &vol->linux_gid)) {
- cERROR(1, "%s: Invalid gid value",
- __func__);
+ cifs_dbg(VFS, "%s: Invalid gid value\n",
+ __func__);
goto cifs_parse_mount_err;
}
gid_specified = true;
break;
case Opt_file_mode:
if (get_option_ul(args, &option)) {
- cERROR(1, "%s: Invalid file_mode value",
- __func__);
+ cifs_dbg(VFS, "%s: Invalid file_mode value\n",
+ __func__);
goto cifs_parse_mount_err;
}
vol->file_mode = option;
break;
case Opt_dirmode:
if (get_option_ul(args, &option)) {
- cERROR(1, "%s: Invalid dir_mode value",
- __func__);
+ cifs_dbg(VFS, "%s: Invalid dir_mode value\n",
+ __func__);
goto cifs_parse_mount_err;
}
vol->dir_mode = option;
@@ -1515,37 +1528,37 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_port:
if (get_option_ul(args, &option) ||
option > USHRT_MAX) {
- cERROR(1, "%s: Invalid port value", __func__);
+ cifs_dbg(VFS, "%s: Invalid port value\n",
+ __func__);
goto cifs_parse_mount_err;
}
port = (unsigned short)option;
break;
case Opt_rsize:
if (get_option_ul(args, &option)) {
- cERROR(1, "%s: Invalid rsize value",
- __func__);
+ cifs_dbg(VFS, "%s: Invalid rsize value\n",
+ __func__);
goto cifs_parse_mount_err;
}
vol->rsize = option;
break;
case Opt_wsize:
if (get_option_ul(args, &option)) {
- cERROR(1, "%s: Invalid wsize value",
- __func__);
+ cifs_dbg(VFS, "%s: Invalid wsize value\n",
+ __func__);
goto cifs_parse_mount_err;
}
vol->wsize = option;
break;
case Opt_actimeo:
if (get_option_ul(args, &option)) {
- cERROR(1, "%s: Invalid actimeo value",
- __func__);
+ cifs_dbg(VFS, "%s: Invalid actimeo value\n",
+ __func__);
goto cifs_parse_mount_err;
}
vol->actimeo = HZ * option;
if (vol->actimeo > CIFS_MAX_ACTIMEO) {
- cERROR(1, "CIFS: attribute cache"
- "timeout too large");
+ cifs_dbg(VFS, "attribute cache timeout too large\n");
goto cifs_parse_mount_err;
}
break;
@@ -1568,11 +1581,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
goto cifs_parse_mount_err;
}
vol->username = kstrdup(string, GFP_KERNEL);
- if (!vol->username) {
- printk(KERN_WARNING "CIFS: no memory "
- "for username\n");
+ if (!vol->username)
goto cifs_parse_mount_err;
- }
break;
case Opt_blank_pass:
/* passwords have to be handled differently
@@ -1660,30 +1670,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
got_ip = true;
break;
- case Opt_unc:
- string = vol->UNC;
- vol->UNC = match_strdup(args);
- if (vol->UNC == NULL)
- goto out_nomem;
-
- convert_delimiter(vol->UNC, '\\');
- if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') {
- printk(KERN_ERR "CIFS: UNC Path does not "
- "begin with // or \\\\\n");
- goto cifs_parse_mount_err;
- }
-
- /* Compare old unc= option to new one */
- if (!string || strcmp(string, vol->UNC))
- printk(KERN_WARNING "CIFS: the value of the "
- "unc= mount option does not match the "
- "device string. Using the unc= option "
- "for now. In 3.10, that option will "
- "be ignored and the contents of the "
- "device string will be used "
- "instead. (%s != %s)\n", string,
- vol->UNC);
- break;
case Opt_domain:
string = match_strdup(args);
if (string == NULL)
@@ -1701,7 +1687,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
"for domainname\n");
goto cifs_parse_mount_err;
}
- cFYI(1, "Domain name set");
+ cifs_dbg(FYI, "Domain name set\n");
break;
case Opt_srcaddr:
string = match_strdup(args);
@@ -1716,26 +1702,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
goto cifs_parse_mount_err;
}
break;
- case Opt_prefixpath:
- /* skip over any leading delimiter */
- if (*args[0].from == '/' || *args[0].from == '\\')
- args[0].from++;
-
- string = vol->prepath;
- vol->prepath = match_strdup(args);
- if (vol->prepath == NULL)
- goto out_nomem;
- /* Compare old prefixpath= option to new one */
- if (!string || strcmp(string, vol->prepath))
- printk(KERN_WARNING "CIFS: the value of the "
- "prefixpath= mount option does not "
- "match the device string. Using the "
- "prefixpath= option for now. In 3.10, "
- "that option will be ignored and the "
- "contents of the device string will be "
- "used instead.(%s != %s)\n", string,
- vol->prepath);
- break;
case Opt_iocharset:
string = match_strdup(args);
if (string == NULL)
@@ -1759,7 +1725,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
/* if iocharset not set then load_nls_default
* is used by caller
*/
- cFYI(1, "iocharset set to %s", string);
+ cifs_dbg(FYI, "iocharset set to %s\n", string);
break;
case Opt_netbiosname:
string = match_strdup(args);
@@ -1873,20 +1839,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
#ifndef CONFIG_KEYS
/* Muliuser mounts require CONFIG_KEYS support */
if (vol->multiuser) {
- cERROR(1, "Multiuser mounts require kernels with "
- "CONFIG_KEYS enabled.");
+ cifs_dbg(VFS, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n");
goto cifs_parse_mount_err;
}
#endif
if (!vol->UNC) {
- cERROR(1, "CIFS mount error: No usable UNC path provided in "
- "device string or in unc= option!");
+ cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n");
goto cifs_parse_mount_err;
}
/* make sure UNC has a share name */
if (!strchr(vol->UNC + 3, '\\')) {
- cERROR(1, "Malformed UNC. Unable to find share name.");
+ cifs_dbg(VFS, "Malformed UNC. Unable to find share name.\n");
goto cifs_parse_mount_err;
}
@@ -2027,47 +1991,21 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
static bool
match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
{
- unsigned int secFlags;
-
- if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
- secFlags = vol->secFlg;
- else
- secFlags = global_secflags | vol->secFlg;
-
- switch (server->secType) {
- case LANMAN:
- if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT)))
- return false;
- break;
- case NTLMv2:
- if (!(secFlags & CIFSSEC_MAY_NTLMV2))
- return false;
- break;
- case NTLM:
- if (!(secFlags & CIFSSEC_MAY_NTLM))
- return false;
- break;
- case Kerberos:
- if (!(secFlags & CIFSSEC_MAY_KRB5))
- return false;
- break;
- case RawNTLMSSP:
- if (!(secFlags & CIFSSEC_MAY_NTLMSSP))
- return false;
- break;
- default:
- /* shouldn't happen */
+ /*
+ * The select_sectype function should either return the vol->sectype
+ * that was specified, or "Unspecified" if that sectype was not
+ * compatible with the given NEGOTIATE request.
+ */
+ if (select_sectype(server, vol->sectype) == Unspecified)
return false;
- }
- /* now check if signing mode is acceptable */
- if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
- (server->sec_mode & SECMODE_SIGN_REQUIRED))
- return false;
- else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
- (server->sec_mode &
- (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
- return false;
+ /*
+ * Now check if signing mode is acceptable. No need to check
+ * global_secflags at this point since if MUST_SIGN is set then
+ * the server->sign had better be too.
+ */
+ if (vol->sign && !server->sign)
+ return false;
return true;
}
@@ -2076,6 +2014,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
{
struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr;
+ if (vol->nosharesock)
+ return 0;
+
if ((server->vals != vol->vals) || (server->ops != vol->ops))
return 0;
@@ -2107,7 +2048,7 @@ cifs_find_tcp_session(struct smb_vol *vol)
++server->srv_count;
spin_unlock(&cifs_tcp_ses_lock);
- cFYI(1, "Existing tcp session with server found");
+ cifs_dbg(FYI, "Existing tcp session with server found\n");
return server;
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -2154,7 +2095,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
struct TCP_Server_Info *tcp_ses = NULL;
int rc;
- cFYI(1, "UNC: %s", volume_info->UNC);
+ cifs_dbg(FYI, "UNC: %s\n", volume_info->UNC);
/* see if we already have a matching tcp_ses */
tcp_ses = cifs_find_tcp_session(volume_info);
@@ -2169,7 +2110,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
rc = cifs_crypto_shash_allocate(tcp_ses);
if (rc) {
- cERROR(1, "could not setup hash structures rc %d", rc);
+ cifs_dbg(VFS, "could not setup hash structures rc %d\n", rc);
goto out_err;
}
@@ -2216,7 +2157,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
rc = ip_connect(tcp_ses);
if (rc < 0) {
- cERROR(1, "Error connecting to socket. Aborting operation");
+ cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n");
goto out_err_crypto_release;
}
@@ -2229,7 +2170,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
tcp_ses, "cifsd");
if (IS_ERR(tcp_ses->tsk)) {
rc = PTR_ERR(tcp_ses->tsk);
- cERROR(1, "error %d create cifsd thread", rc);
+ cifs_dbg(VFS, "error %d create cifsd thread\n", rc);
module_put(THIS_MODULE);
goto out_err_crypto_release;
}
@@ -2265,7 +2206,11 @@ out_err:
static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
{
- switch (ses->server->secType) {
+ if (vol->sectype != Unspecified &&
+ vol->sectype != ses->sectype)
+ return 0;
+
+ switch (ses->sectype) {
case Kerberos:
if (!uid_eq(vol->cred_uid, ses->cred_uid))
return 0;
@@ -2316,7 +2261,7 @@ cifs_put_smb_ses(struct cifs_ses *ses)
unsigned int xid;
struct TCP_Server_Info *server = ses->server;
- cFYI(1, "%s: ses_count=%d", __func__, ses->ses_count);
+ cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
spin_lock(&cifs_tcp_ses_lock);
if (--ses->ses_count > 0) {
spin_unlock(&cifs_tcp_ses_lock);
@@ -2368,23 +2313,24 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr);
break;
default:
- cFYI(1, "Bad ss_family (%hu)", server->dstaddr.ss_family);
+ cifs_dbg(FYI, "Bad ss_family (%hu)\n",
+ server->dstaddr.ss_family);
rc = -EINVAL;
goto out_err;
}
- cFYI(1, "%s: desc=%s", __func__, desc);
+ cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc);
key = request_key(&key_type_logon, desc, "");
if (IS_ERR(key)) {
if (!ses->domainName) {
- cFYI(1, "domainName is NULL");
+ cifs_dbg(FYI, "domainName is NULL\n");
rc = PTR_ERR(key);
goto out_err;
}
/* didn't work, try to find a domain key */
sprintf(desc, "cifs:d:%s", ses->domainName);
- cFYI(1, "%s: desc=%s", __func__, desc);
+ cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc);
key = request_key(&key_type_logon, desc, "");
if (IS_ERR(key)) {
rc = PTR_ERR(key);
@@ -2402,32 +2348,34 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
/* find first : in payload */
payload = (char *)upayload->data;
delim = strnchr(payload, upayload->datalen, ':');
- cFYI(1, "payload=%s", payload);
+ cifs_dbg(FYI, "payload=%s\n", payload);
if (!delim) {
- cFYI(1, "Unable to find ':' in payload (datalen=%d)",
- upayload->datalen);
+ cifs_dbg(FYI, "Unable to find ':' in payload (datalen=%d)\n",
+ upayload->datalen);
rc = -EINVAL;
goto out_key_put;
}
len = delim - payload;
if (len > MAX_USERNAME_SIZE || len <= 0) {
- cFYI(1, "Bad value from username search (len=%zd)", len);
+ cifs_dbg(FYI, "Bad value from username search (len=%zd)\n",
+ len);
rc = -EINVAL;
goto out_key_put;
}
vol->username = kstrndup(payload, len, GFP_KERNEL);
if (!vol->username) {
- cFYI(1, "Unable to allocate %zd bytes for username", len);
+ cifs_dbg(FYI, "Unable to allocate %zd bytes for username\n",
+ len);
rc = -ENOMEM;
goto out_key_put;
}
- cFYI(1, "%s: username=%s", __func__, vol->username);
+ cifs_dbg(FYI, "%s: username=%s\n", __func__, vol->username);
len = key->datalen - (len + 1);
if (len > MAX_PASSWORD_SIZE || len <= 0) {
- cFYI(1, "Bad len for password search (len=%zd)", len);
+ cifs_dbg(FYI, "Bad len for password search (len=%zd)\n", len);
rc = -EINVAL;
kfree(vol->username);
vol->username = NULL;
@@ -2437,7 +2385,8 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
++delim;
vol->password = kstrndup(delim, len, GFP_KERNEL);
if (!vol->password) {
- cFYI(1, "Unable to allocate %zd bytes for password", len);
+ cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n",
+ len);
rc = -ENOMEM;
kfree(vol->username);
vol->username = NULL;
@@ -2449,7 +2398,7 @@ out_key_put:
key_put(key);
out_err:
kfree(desc);
- cFYI(1, "%s: returning %d", __func__, rc);
+ cifs_dbg(FYI, "%s: returning %d\n", __func__, rc);
return rc;
}
#else /* ! CONFIG_KEYS */
@@ -2474,7 +2423,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
ses = cifs_find_smb_ses(server, volume_info);
if (ses) {
- cFYI(1, "Existing smb sess found (status=%d)", ses->status);
+ cifs_dbg(FYI, "Existing smb sess found (status=%d)\n",
+ ses->status);
mutex_lock(&ses->session_mutex);
rc = cifs_negotiate_protocol(xid, ses);
@@ -2486,7 +2436,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
return ERR_PTR(rc);
}
if (ses->need_reconnect) {
- cFYI(1, "Session needs reconnect");
+ cifs_dbg(FYI, "Session needs reconnect\n");
rc = cifs_setup_session(xid, ses,
volume_info->local_nls);
if (rc) {
@@ -2505,7 +2455,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
return ses;
}
- cFYI(1, "Existing smb sess not found");
+ cifs_dbg(FYI, "Existing smb sess not found\n");
ses = sesInfoAlloc();
if (ses == NULL)
goto get_ses_fail;
@@ -2537,7 +2487,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
ses->cred_uid = volume_info->cred_uid;
ses->linux_uid = volume_info->linux_uid;
- ses->overrideSecFlg = volume_info->secFlg;
+ ses->sectype = volume_info->sectype;
+ ses->sign = volume_info->sign;
mutex_lock(&ses->session_mutex);
rc = cifs_negotiate_protocol(xid, ses);
@@ -2595,7 +2546,7 @@ cifs_put_tcon(struct cifs_tcon *tcon)
unsigned int xid;
struct cifs_ses *ses = tcon->ses;
- cFYI(1, "%s: tc_count=%d", __func__, tcon->tc_count);
+ cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
spin_lock(&cifs_tcp_ses_lock);
if (--tcon->tc_count > 0) {
spin_unlock(&cifs_tcp_ses_lock);
@@ -2623,12 +2574,11 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
tcon = cifs_find_tcon(ses, volume_info->UNC);
if (tcon) {
- cFYI(1, "Found match on UNC path");
+ cifs_dbg(FYI, "Found match on UNC path\n");
/* existing tcon already has a reference */
cifs_put_smb_ses(ses);
if (tcon->seal != volume_info->seal)
- cERROR(1, "transport encryption setting "
- "conflicts with existing tid");
+ cifs_dbg(VFS, "transport encryption setting conflicts with existing tid\n");
return tcon;
}
@@ -2660,13 +2610,13 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
rc = ses->server->ops->tree_connect(xid, ses, volume_info->UNC, tcon,
volume_info->local_nls);
free_xid(xid);
- cFYI(1, "Tcon rc = %d", rc);
+ cifs_dbg(FYI, "Tcon rc = %d\n", rc);
if (rc)
goto out_fail;
if (volume_info->nodfs) {
tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
- cFYI(1, "DFS disabled (%d)", tcon->Flags);
+ cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags);
}
tcon->seal = volume_info->seal;
/*
@@ -2820,7 +2770,7 @@ get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path,
strcpy(temp_unc + 2 + strlen(ses->serverName), "\\IPC$");
rc = ses->server->ops->tree_connect(xid, ses, temp_unc, NULL,
nls_codepage);
- cFYI(1, "Tcon rc = %d ipc_tid = %d", rc, ses->ipc_tid);
+ cifs_dbg(FYI, "Tcon rc = %d ipc_tid = %d\n", rc, ses->ipc_tid);
kfree(temp_unc);
}
if (rc == 0)
@@ -2898,13 +2848,11 @@ bind_socket(struct TCP_Server_Info *server)
saddr4 = (struct sockaddr_in *)&server->srcaddr;
saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
if (saddr6->sin6_family == AF_INET6)
- cERROR(1, "cifs: "
- "Failed to bind to: %pI6c, error: %d",
- &saddr6->sin6_addr, rc);
+ cifs_dbg(VFS, "Failed to bind to: %pI6c, error: %d\n",
+ &saddr6->sin6_addr, rc);
else
- cERROR(1, "cifs: "
- "Failed to bind to: %pI4, error: %d",
- &saddr4->sin_addr.s_addr, rc);
+ cifs_dbg(VFS, "Failed to bind to: %pI4, error: %d\n",
+ &saddr4->sin_addr.s_addr, rc);
}
}
return rc;
@@ -3009,13 +2957,13 @@ generic_ip_connect(struct TCP_Server_Info *server)
rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
IPPROTO_TCP, &socket, 1);
if (rc < 0) {
- cERROR(1, "Error %d creating socket", rc);
+ cifs_dbg(VFS, "Error %d creating socket\n", rc);
server->ssocket = NULL;
return rc;
}
/* BB other socket options to set KEEPALIVE, NODELAY? */
- cFYI(1, "Socket created");
+ cifs_dbg(FYI, "Socket created\n");
server->ssocket = socket;
socket->sk->sk_allocation = GFP_NOFS;
if (sfamily == AF_INET6)
@@ -3049,16 +2997,17 @@ generic_ip_connect(struct TCP_Server_Info *server)
rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
(char *)&val, sizeof(val));
if (rc)
- cFYI(1, "set TCP_NODELAY socket option error %d", rc);
+ cifs_dbg(FYI, "set TCP_NODELAY socket option error %d\n",
+ rc);
}
- cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
+ cifs_dbg(FYI, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx\n",
socket->sk->sk_sndbuf,
socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
rc = socket->ops->connect(socket, saddr, slen, 0);
if (rc < 0) {
- cFYI(1, "Error %d connecting to server", rc);
+ cifs_dbg(FYI, "Error %d connecting to server\n", rc);
sock_release(socket);
server->ssocket = NULL;
return rc;
@@ -3116,19 +3065,19 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
if (vol_info && vol_info->no_linux_ext) {
tcon->fsUnixInfo.Capability = 0;
tcon->unix_ext = 0; /* Unix Extensions disabled */
- cFYI(1, "Linux protocol extensions disabled");
+ cifs_dbg(FYI, "Linux protocol extensions disabled\n");
return;
} else if (vol_info)
tcon->unix_ext = 1; /* Unix Extensions supported */
if (tcon->unix_ext == 0) {
- cFYI(1, "Unix extensions disabled so not set on reconnect");
+ cifs_dbg(FYI, "Unix extensions disabled so not set on reconnect\n");
return;
}
if (!CIFSSMBQFSUnixInfo(xid, tcon)) {
__u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
- cFYI(1, "unix caps which server supports %lld", cap);
+ cifs_dbg(FYI, "unix caps which server supports %lld\n", cap);
/* check for reconnect case in which we do not
want to change the mount behavior if we can avoid it */
if (vol_info == NULL) {
@@ -3138,22 +3087,22 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
- cERROR(1, "POSIXPATH support change");
+ cifs_dbg(VFS, "POSIXPATH support change\n");
cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
} else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
- cERROR(1, "possible reconnect error");
- cERROR(1, "server disabled POSIX path support");
+ cifs_dbg(VFS, "possible reconnect error\n");
+ cifs_dbg(VFS, "server disabled POSIX path support\n");
}
}
if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)
- cERROR(1, "per-share encryption not supported yet");
+ cifs_dbg(VFS, "per-share encryption not supported yet\n");
cap &= CIFS_UNIX_CAP_MASK;
if (vol_info && vol_info->no_psx_acl)
cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
else if (CIFS_UNIX_POSIX_ACL_CAP & cap) {
- cFYI(1, "negotiated posix acl support");
+ cifs_dbg(FYI, "negotiated posix acl support\n");
if (cifs_sb)
cifs_sb->mnt_cifs_flags |=
CIFS_MOUNT_POSIXACL;
@@ -3162,43 +3111,38 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
if (vol_info && vol_info->posix_paths == 0)
cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) {
- cFYI(1, "negotiate posix pathnames");
+ cifs_dbg(FYI, "negotiate posix pathnames\n");
if (cifs_sb)
cifs_sb->mnt_cifs_flags |=
CIFS_MOUNT_POSIX_PATHS;
}
- cFYI(1, "Negotiate caps 0x%x", (int)cap);
+ cifs_dbg(FYI, "Negotiate caps 0x%x\n", (int)cap);
#ifdef CONFIG_CIFS_DEBUG2
if (cap & CIFS_UNIX_FCNTL_CAP)
- cFYI(1, "FCNTL cap");
+ cifs_dbg(FYI, "FCNTL cap\n");
if (cap & CIFS_UNIX_EXTATTR_CAP)
- cFYI(1, "EXTATTR cap");
+ cifs_dbg(FYI, "EXTATTR cap\n");
if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
- cFYI(1, "POSIX path cap");
+ cifs_dbg(FYI, "POSIX path cap\n");
if (cap & CIFS_UNIX_XATTR_CAP)
- cFYI(1, "XATTR cap");
+ cifs_dbg(FYI, "XATTR cap\n");
if (cap & CIFS_UNIX_POSIX_ACL_CAP)
- cFYI(1, "POSIX ACL cap");
+ cifs_dbg(FYI, "POSIX ACL cap\n");
if (cap & CIFS_UNIX_LARGE_READ_CAP)
- cFYI(1, "very large read cap");
+ cifs_dbg(FYI, "very large read cap\n");
if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
- cFYI(1, "very large write cap");
+ cifs_dbg(FYI, "very large write cap\n");
if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP)
- cFYI(1, "transport encryption cap");
+ cifs_dbg(FYI, "transport encryption cap\n");
if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)
- cFYI(1, "mandatory transport encryption cap");
+ cifs_dbg(FYI, "mandatory transport encryption cap\n");
#endif /* CIFS_DEBUG2 */
if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
if (vol_info == NULL) {
- cFYI(1, "resetting capabilities failed");
+ cifs_dbg(FYI, "resetting capabilities failed\n");
} else
- cERROR(1, "Negotiating Unix capabilities "
- "with the server failed. Consider "
- "mounting with the Unix Extensions "
- "disabled if problems are found "
- "by specifying the nounix mount "
- "option.");
+ cifs_dbg(VFS, "Negotiating Unix capabilities with the server failed. Consider mounting with the Unix Extensions disabled if problems are found by specifying the nounix mount option.\n");
}
}
@@ -3223,8 +3167,8 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_gid = pvolume_info->linux_gid;
cifs_sb->mnt_file_mode = pvolume_info->file_mode;
cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
- cFYI(1, "file mode: 0x%hx dir mode: 0x%hx",
- cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
+ cifs_dbg(FYI, "file mode: 0x%hx dir mode: 0x%hx\n",
+ cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
cifs_sb->actimeo = pvolume_info->actimeo;
cifs_sb->local_nls = pvolume_info->local_nls;
@@ -3273,21 +3217,19 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
if (pvolume_info->strict_io)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
if (pvolume_info->direct_io) {
- cFYI(1, "mounting share using direct i/o");
+ cifs_dbg(FYI, "mounting share using direct i/o\n");
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
}
if (pvolume_info->mfsymlinks) {
if (pvolume_info->sfu_emul) {
- cERROR(1, "mount option mfsymlinks ignored if sfu "
- "mount option is used");
+ cifs_dbg(VFS, "mount option mfsymlinks ignored if sfu mount option is used\n");
} else {
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
}
}
if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
- cERROR(1, "mount option dynperm ignored if cifsacl "
- "mount option supported");
+ cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n");
}
static void
@@ -3332,14 +3274,14 @@ build_unc_path_to_root(const struct smb_vol *vol,
pos = full_path + unc_len;
if (pplen) {
- *pos++ = CIFS_DIR_SEP(cifs_sb);
- strncpy(pos, vol->prepath, pplen);
+ *pos = CIFS_DIR_SEP(cifs_sb);
+ strncpy(pos + 1, vol->prepath, pplen);
pos += pplen;
}
*pos = '\0'; /* add trailing null */
convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
- cFYI(1, "%s: full_path=%s", __func__, full_path);
+ cifs_dbg(FYI, "%s: full_path=%s\n", __func__, full_path);
return full_path;
}
@@ -3410,14 +3352,14 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
return -EINVAL;
if (volume_info->nullauth) {
- cFYI(1, "Anonymous login");
+ cifs_dbg(FYI, "Anonymous login\n");
kfree(volume_info->username);
volume_info->username = NULL;
} else if (volume_info->username) {
/* BB fixme parse for domain name here */
- cFYI(1, "Username: %s", volume_info->username);
+ cifs_dbg(FYI, "Username: %s\n", volume_info->username);
} else {
- cifserror("No username specified");
+ cifs_dbg(VFS, "No username specified\n");
/* In userspace mount helper we can get user name from alternate
locations such as env variables and files on disk */
return -EINVAL;
@@ -3430,7 +3372,7 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
} else {
volume_info->local_nls = load_nls(volume_info->iocharset);
if (volume_info->local_nls == NULL) {
- cERROR(1, "CIFS mount error: iocharset %s not found",
+ cifs_dbg(VFS, "CIFS mount error: iocharset %s not found\n",
volume_info->iocharset);
return -ELIBACC;
}
@@ -3709,7 +3651,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
NTLMv2 password here) */
#ifdef CONFIG_CIFS_WEAK_PW_HASH
if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
- (ses->server->secType == LANMAN))
+ (ses->sectype == LANMAN))
calc_lanman_hash(tcon->password, ses->server->cryptkey,
ses->server->sec_mode &
SECMODE_PW_ENCRYPT ? true : false,
@@ -3727,8 +3669,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
}
}
- if (ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (ses->server->sign)
smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
if (ses->capabilities & CAP_STATUS32) {
@@ -3780,18 +3721,18 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
if (length == 3) {
if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
(bcc_ptr[2] == 'C')) {
- cFYI(1, "IPC connection");
+ cifs_dbg(FYI, "IPC connection\n");
tcon->ipc = 1;
}
} else if (length == 2) {
if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) {
/* the most common case */
- cFYI(1, "disk share connection");
+ cifs_dbg(FYI, "disk share connection\n");
}
}
bcc_ptr += length + 1;
bytes_left -= (length + 1);
- strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
+ strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
/* mostly informational -- no need to fail on error here */
kfree(tcon->nativeFileSystem);
@@ -3799,7 +3740,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
bytes_left, is_unicode,
nls_codepage);
- cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem);
+ cifs_dbg(FYI, "nativeFileSystem=%s\n", tcon->nativeFileSystem);
if ((smb_buffer_response->WordCount == 3) ||
(smb_buffer_response->WordCount == 7))
@@ -3807,7 +3748,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
else
tcon->Flags = 0;
- cFYI(1, "Tcon flags: 0x%x ", tcon->Flags);
+ cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags);
} else if ((rc == 0) && tcon == NULL) {
/* all we need to save for IPC$ connection */
ses->ipc_tid = smb_buffer_response->Tid;
@@ -3880,31 +3821,32 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
int rc = -ENOSYS;
struct TCP_Server_Info *server = ses->server;
- ses->flags = 0;
ses->capabilities = server->capabilities;
if (linuxExtEnabled == 0)
ses->capabilities &= (~server->vals->cap_unix);
- cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
+ cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n",
server->sec_mode, server->capabilities, server->timeAdj);
if (server->ops->sess_setup)
rc = server->ops->sess_setup(xid, ses, nls_info);
if (rc) {
- cERROR(1, "Send error in SessSetup = %d", rc);
+ cifs_dbg(VFS, "Send error in SessSetup = %d\n", rc);
} else {
- mutex_lock(&ses->server->srv_mutex);
+ mutex_lock(&server->srv_mutex);
if (!server->session_estab) {
server->session_key.response = ses->auth_key.response;
server->session_key.len = ses->auth_key.len;
server->sequence_number = 0x2;
server->session_estab = true;
ses->auth_key.response = NULL;
+ if (server->ops->generate_signingkey)
+ server->ops->generate_signingkey(server);
}
mutex_unlock(&server->srv_mutex);
- cFYI(1, "CIFS Session Established successfully");
+ cifs_dbg(FYI, "CIFS Session Established successfully\n");
spin_lock(&GlobalMid_Lock);
ses->status = CifsGood;
ses->need_reconnect = false;
@@ -3923,23 +3865,11 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
static int
cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
{
- switch (ses->server->secType) {
- case Kerberos:
- vol->secFlg = CIFSSEC_MUST_KRB5;
+ vol->sectype = ses->sectype;
+
+ /* krb5 is special, since we don't need username or pw */
+ if (vol->sectype == Kerberos)
return 0;
- case NTLMv2:
- vol->secFlg = CIFSSEC_MUST_NTLMV2;
- break;
- case NTLM:
- vol->secFlg = CIFSSEC_MUST_NTLM;
- break;
- case RawNTLMSSP:
- vol->secFlg = CIFSSEC_MUST_NTLMSSP;
- break;
- case LANMAN:
- vol->secFlg = CIFSSEC_MUST_LANMAN;
- break;
- }
return cifs_set_cifscreds(vol, ses);
}
@@ -3965,6 +3895,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
vol_info->nocase = master_tcon->nocase;
vol_info->local_lease = master_tcon->local_lease;
vol_info->no_linux_ext = !master_tcon->unix_ext;
+ vol_info->sectype = master_tcon->ses->sectype;
+ vol_info->sign = master_tcon->ses->sign;
rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
if (rc) {
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 1cd016217448..5175aebf6737 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -102,7 +102,7 @@ cifs_bp_rename_retry:
namelen += (1 + temp->d_name.len);
temp = temp->d_parent;
if (temp == NULL) {
- cERROR(1, "corrupt dentry");
+ cifs_dbg(VFS, "corrupt dentry\n");
rcu_read_unlock();
return NULL;
}
@@ -124,12 +124,12 @@ cifs_bp_rename_retry:
full_path[namelen] = dirsep;
strncpy(full_path + namelen + 1, temp->d_name.name,
temp->d_name.len);
- cFYI(0, "name: %s", full_path + namelen);
+ cifs_dbg(FYI, "name: %s\n", full_path + namelen);
}
spin_unlock(&temp->d_lock);
temp = temp->d_parent;
if (temp == NULL) {
- cERROR(1, "corrupt dentry");
+ cifs_dbg(VFS, "corrupt dentry\n");
rcu_read_unlock();
kfree(full_path);
return NULL;
@@ -137,8 +137,8 @@ cifs_bp_rename_retry:
}
rcu_read_unlock();
if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) {
- cFYI(1, "did not end path lookup where expected. namelen=%d "
- "dfsplen=%d", namelen, dfsplen);
+ cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n",
+ namelen, dfsplen);
/* presumably this is only possible if racing with a rename
of one of the parent directories (we can not lock the dentries
above us to prevent this, but retrying should be harmless) */
@@ -178,7 +178,7 @@ check_name(struct dentry *direntry)
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) {
for (i = 0; i < direntry->d_name.len; i++) {
if (direntry->d_name.name[i] == '\\') {
- cFYI(1, "Invalid file name");
+ cifs_dbg(FYI, "Invalid file name\n");
return -EINVAL;
}
}
@@ -291,7 +291,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
else if ((oflags & O_CREAT) == O_CREAT)
disposition = FILE_OPEN_IF;
else
- cFYI(1, "Create flag not set in create function");
+ cifs_dbg(FYI, "Create flag not set in create function\n");
/*
* BB add processing to set equivalent of mode - e.g. via CreateX with
@@ -323,7 +323,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
desired_access, create_options, fid, oplock,
buf, cifs_sb);
if (rc) {
- cFYI(1, "cifs_create returned 0x%x", rc);
+ cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
goto out;
}
@@ -389,7 +389,8 @@ cifs_create_get_file_info:
cifs_create_set_dentry:
if (rc != 0) {
- cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
+ cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n",
+ rc);
if (server->ops->close)
server->ops->close(xid, tcon, fid);
goto out;
@@ -452,12 +453,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
xid = get_xid();
- cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p",
- inode, direntry->d_name.name, direntry);
+ cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n",
+ inode, direntry->d_name.name, direntry);
tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
- if (IS_ERR(tlink))
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
goto out_free_xid;
+ }
tcon = tlink_tcon(tlink);
server = tcon->ses->server;
@@ -518,8 +521,8 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
__u32 oplock;
int created = FILE_CREATED;
- cFYI(1, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p",
- inode, direntry->d_name.name, direntry);
+ cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n",
+ inode, direntry->d_name.name, direntry);
tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
rc = PTR_ERR(tlink);
@@ -613,7 +616,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
goto mknod_out;
- cFYI(1, "sfu compat create special file");
+ cifs_dbg(FYI, "sfu compat create special file\n");
buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
if (buf == NULL) {
@@ -688,8 +691,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
xid = get_xid();
- cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p",
- parent_dir_inode, direntry->d_name.name, direntry);
+ cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n",
+ parent_dir_inode, direntry->d_name.name, direntry);
/* check whether path exists */
@@ -715,11 +718,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
}
if (direntry->d_inode != NULL) {
- cFYI(1, "non-NULL inode in lookup");
+ cifs_dbg(FYI, "non-NULL inode in lookup\n");
} else {
- cFYI(1, "NULL inode in lookup");
+ cifs_dbg(FYI, "NULL inode in lookup\n");
}
- cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode);
+ cifs_dbg(FYI, "Full path: %s inode = 0x%p\n",
+ full_path, direntry->d_inode);
if (pTcon->unix_ext) {
rc = cifs_get_inode_info_unix(&newInode, full_path,
@@ -742,7 +746,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
/* if it was once a directory (but how can we tell?) we could do
shrink_dcache_parent(direntry); */
} else if (rc != -EACCES) {
- cERROR(1, "Unexpected lookup error %d", rc);
+ cifs_dbg(VFS, "Unexpected lookup error %d\n", rc);
/* We special case check for Access Denied - since that
is a common return code */
}
@@ -807,7 +811,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
{
int rc = 0;
- cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name);
+ cifs_dbg(FYI, "In cifs d_delete, name = %s\n", direntry->d_name.name);
return rc;
} */
@@ -818,8 +822,7 @@ const struct dentry_operations cifs_dentry_ops = {
/* d_delete: cifs_d_delete, */ /* not needed except for debugging */
};
-static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
- struct qstr *q)
+static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
{
struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
unsigned long hash;
@@ -834,12 +837,10 @@ static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
return 0;
}
-static int cifs_ci_compare(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
- struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
+ struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls;
if ((name->len == len) &&
(nls_strnicmp(codepage, name->name, str, len) == 0))
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 1d2d91d9bf65..7ede7306599f 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -34,7 +34,7 @@
/**
* dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
- * @unc: UNC path specifying the server
+ * @unc: UNC path specifying the server (with '/' as delimiter)
* @ip_addr: Where to return the IP address.
*
* The IP address will be returned in string form, and the caller is
@@ -55,7 +55,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
len = strlen(unc);
if (len < 3) {
- cFYI(1, "%s: unc is too short: %s", __func__, unc);
+ cifs_dbg(FYI, "%s: unc is too short: %s\n", __func__, unc);
return -EINVAL;
}
@@ -64,12 +64,12 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
hostname = unc + 2;
/* Search for server name delimiter */
- sep = memchr(hostname, '\\', len);
+ sep = memchr(hostname, '/', len);
if (sep)
len = sep - hostname;
else
- cFYI(1, "%s: probably server name is whole unc: %s",
- __func__, unc);
+ cifs_dbg(FYI, "%s: probably server name is whole unc: %s\n",
+ __func__, unc);
/* Try to interpret hostname as an IPv4 or IPv6 address */
rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len);
@@ -79,11 +79,11 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
/* Perform the upcall */
rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
if (rc < 0)
- cFYI(1, "%s: unable to resolve: %*.*s",
- __func__, len, len, hostname);
+ cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n",
+ __func__, len, len, hostname);
else
- cFYI(1, "%s: resolved: %*.*s to %s",
- __func__, len, len, hostname, *ip_addr);
+ cifs_dbg(FYI, "%s: resolved: %*.*s to %s\n",
+ __func__, len, len, hostname, *ip_addr);
return rc;
name_is_IP_address:
@@ -92,7 +92,8 @@ name_is_IP_address:
return -ENOMEM;
memcpy(name, hostname, len);
name[len] = 0;
- cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name);
+ cifs_dbg(FYI, "%s: unc is IP, skipping dns upcall: %s\n",
+ __func__, name);
*ip_addr = name;
return 0;
}
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 9c7ecdccf2f3..ce8b7f677c58 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -49,7 +49,7 @@
static struct dentry *cifs_get_parent(struct dentry *dentry)
{
/* BB need to add code here eventually to enable export via NFSD */
- cFYI(1, "get parent for %p", dentry);
+ cifs_dbg(FYI, "get parent for %p\n", dentry);
return ERR_PTR(-EACCES);
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7a0dd99e4507..91d8629e69a2 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -78,9 +78,8 @@ static u32 cifs_posix_convert_flags(unsigned int flags)
if (flags & O_EXCL)
posix_flags |= SMB_O_EXCL;
} else if (flags & O_EXCL)
- cFYI(1, "Application %s pid %d has incorrectly set O_EXCL flag"
- "but not O_CREAT on file open. Ignoring O_EXCL",
- current->comm, current->tgid);
+ cifs_dbg(FYI, "Application %s pid %d has incorrectly set O_EXCL flag but not O_CREAT on file open. Ignoring O_EXCL\n",
+ current->comm, current->tgid);
if (flags & O_TRUNC)
posix_flags |= SMB_O_TRUNC;
@@ -123,7 +122,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
struct tcon_link *tlink;
struct cifs_tcon *tcon;
- cFYI(1, "posix open %s", full_path);
+ cifs_dbg(FYI, "posix open %s\n", full_path);
presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
if (presp_data == NULL)
@@ -308,7 +307,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
*/
if (oplock == server->vals->oplock_read &&
cifs_has_mand_locks(cinode)) {
- cFYI(1, "Reset oplock val from read to None due to mand locks");
+ cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n");
oplock = 0;
}
@@ -374,8 +373,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
list_del(&cifs_file->tlist);
if (list_empty(&cifsi->openFileList)) {
- cFYI(1, "closing last open instance for inode %p",
- cifs_file->dentry->d_inode);
+ cifs_dbg(FYI, "closing last open instance for inode %p\n",
+ cifs_file->dentry->d_inode);
/*
* In strict cache mode we need invalidate mapping on the last
* close because it may cause a error when we open this file
@@ -454,7 +453,7 @@ int cifs_open(struct inode *inode, struct file *file)
goto out;
}
- cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
+ cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n",
inode, file->f_flags, full_path);
if (server->oplocks)
@@ -470,16 +469,13 @@ int cifs_open(struct inode *inode, struct file *file)
cifs_sb->mnt_file_mode /* ignored */,
file->f_flags, &oplock, &fid.netfid, xid);
if (rc == 0) {
- cFYI(1, "posix open succeeded");
+ cifs_dbg(FYI, "posix open succeeded\n");
posix_open_ok = true;
} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
if (tcon->ses->serverNOS)
- cERROR(1, "server %s of type %s returned"
- " unexpected error on SMB posix open"
- ", disabling posix open support."
- " Check if server update available.",
- tcon->ses->serverName,
- tcon->ses->serverNOS);
+ cifs_dbg(VFS, "server %s of type %s returned unexpected error on SMB posix open, disabling posix open support. Check if server update available.\n",
+ tcon->ses->serverName,
+ tcon->ses->serverNOS);
tcon->broken_posix_open = true;
} else if ((rc != -EIO) && (rc != -EREMOTE) &&
(rc != -EOPNOTSUPP)) /* path not found or net err */
@@ -621,8 +617,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
return rc;
}
- cFYI(1, "inode = 0x%p file flags 0x%x for %s", inode, cfile->f_flags,
- full_path);
+ cifs_dbg(FYI, "inode = 0x%p file flags 0x%x for %s\n",
+ inode, cfile->f_flags, full_path);
if (tcon->ses->server->oplocks)
oplock = REQ_OPLOCK;
@@ -643,7 +639,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
cifs_sb->mnt_file_mode /* ignored */,
oflags, &oplock, &fid.netfid, xid);
if (rc == 0) {
- cFYI(1, "posix reopen succeeded");
+ cifs_dbg(FYI, "posix reopen succeeded\n");
goto reopen_success;
}
/*
@@ -672,8 +668,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
NULL, cifs_sb);
if (rc) {
mutex_unlock(&cfile->fh_mutex);
- cFYI(1, "cifs_reopen returned 0x%x", rc);
- cFYI(1, "oplock: %d", oplock);
+ cifs_dbg(FYI, "cifs_reopen returned 0x%x\n", rc);
+ cifs_dbg(FYI, "oplock: %d\n", oplock);
goto reopen_error_exit;
}
@@ -729,7 +725,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
struct TCP_Server_Info *server;
char *buf;
- cFYI(1, "Closedir inode = 0x%p", inode);
+ cifs_dbg(FYI, "Closedir inode = 0x%p\n", inode);
if (cfile == NULL)
return rc;
@@ -738,7 +734,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
tcon = tlink_tcon(cfile->tlink);
server = tcon->ses->server;
- cFYI(1, "Freeing private data in close dir");
+ cifs_dbg(FYI, "Freeing private data in close dir\n");
spin_lock(&cifs_file_list_lock);
if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
cfile->invalidHandle = true;
@@ -747,7 +743,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
rc = server->ops->close_dir(xid, tcon, &cfile->fid);
else
rc = -ENOSYS;
- cFYI(1, "Closing uncompleted readdir with rc %d", rc);
+ cifs_dbg(FYI, "Closing uncompleted readdir with rc %d\n", rc);
/* not much we can do if it fails anyway, ignore rc */
rc = 0;
} else
@@ -755,7 +751,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
buf = cfile->srch_inf.ntwrk_buf_start;
if (buf) {
- cFYI(1, "closedir free smb buf in srch struct");
+ cifs_dbg(FYI, "closedir free smb buf in srch struct\n");
cfile->srch_inf.ntwrk_buf_start = NULL;
if (cfile->srch_inf.smallBuf)
cifs_small_buf_release(buf);
@@ -1003,7 +999,7 @@ try_again:
rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next);
if (!rc)
goto try_again;
- locks_delete_block(flock);
+ posix_unblock_lock(flock);
}
return rc;
}
@@ -1096,6 +1092,7 @@ struct lock_to_push {
static int
cifs_push_posix_locks(struct cifsFileInfo *cfile)
{
+ struct inode *inode = cfile->dentry->d_inode;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct file_lock *flock, **before;
unsigned int count = 0, i = 0;
@@ -1106,12 +1103,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
xid = get_xid();
- lock_flocks();
- cifs_for_each_lock(cfile->dentry->d_inode, before) {
+ spin_lock(&inode->i_lock);
+ cifs_for_each_lock(inode, before) {
if ((*before)->fl_flags & FL_POSIX)
count++;
}
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
INIT_LIST_HEAD(&locks_to_send);
@@ -1130,8 +1127,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
}
el = locks_to_send.next;
- lock_flocks();
- cifs_for_each_lock(cfile->dentry->d_inode, before) {
+ spin_lock(&inode->i_lock);
+ cifs_for_each_lock(inode, before) {
flock = *before;
if ((flock->fl_flags & FL_POSIX) == 0)
continue;
@@ -1140,7 +1137,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
* The list ended. We don't have enough allocated
* structures - something is really wrong.
*/
- cERROR(1, "Can't push all brlocks!");
+ cifs_dbg(VFS, "Can't push all brlocks!\n");
break;
}
length = 1 + flock->fl_end - flock->fl_start;
@@ -1156,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
lck->offset = flock->fl_start;
el = el->next;
}
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
int stored_rc;
@@ -1213,47 +1210,46 @@ cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock,
bool *wait_flag, struct TCP_Server_Info *server)
{
if (flock->fl_flags & FL_POSIX)
- cFYI(1, "Posix");
+ cifs_dbg(FYI, "Posix\n");
if (flock->fl_flags & FL_FLOCK)
- cFYI(1, "Flock");
+ cifs_dbg(FYI, "Flock\n");
if (flock->fl_flags & FL_SLEEP) {
- cFYI(1, "Blocking lock");
+ cifs_dbg(FYI, "Blocking lock\n");
*wait_flag = true;
}
if (flock->fl_flags & FL_ACCESS)
- cFYI(1, "Process suspended by mandatory locking - "
- "not implemented yet");
+ cifs_dbg(FYI, "Process suspended by mandatory locking - not implemented yet\n");
if (flock->fl_flags & FL_LEASE)
- cFYI(1, "Lease on file - not implemented yet");
+ cifs_dbg(FYI, "Lease on file - not implemented yet\n");
if (flock->fl_flags &
(~(FL_POSIX | FL_FLOCK | FL_SLEEP |
FL_ACCESS | FL_LEASE | FL_CLOSE)))
- cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags);
+ cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags);
*type = server->vals->large_lock_type;
if (flock->fl_type == F_WRLCK) {
- cFYI(1, "F_WRLCK ");
+ cifs_dbg(FYI, "F_WRLCK\n");
*type |= server->vals->exclusive_lock_type;
*lock = 1;
} else if (flock->fl_type == F_UNLCK) {
- cFYI(1, "F_UNLCK");
+ cifs_dbg(FYI, "F_UNLCK\n");
*type |= server->vals->unlock_lock_type;
*unlock = 1;
/* Check if unlock includes more than one lock range */
} else if (flock->fl_type == F_RDLCK) {
- cFYI(1, "F_RDLCK");
+ cifs_dbg(FYI, "F_RDLCK\n");
*type |= server->vals->shared_lock_type;
*lock = 1;
} else if (flock->fl_type == F_EXLCK) {
- cFYI(1, "F_EXLCK");
+ cifs_dbg(FYI, "F_EXLCK\n");
*type |= server->vals->exclusive_lock_type;
*lock = 1;
} else if (flock->fl_type == F_SHLCK) {
- cFYI(1, "F_SHLCK");
+ cifs_dbg(FYI, "F_SHLCK\n");
*type |= server->vals->shared_lock_type;
*lock = 1;
} else
- cFYI(1, "Unknown type of lock");
+ cifs_dbg(FYI, "Unknown type of lock\n");
}
static int
@@ -1296,8 +1292,8 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
type, 0, 1, false);
flock->fl_type = F_UNLCK;
if (rc != 0)
- cERROR(1, "Error unlocking previously locked "
- "range %d during test of lock", rc);
+ cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
+ rc);
return 0;
}
@@ -1316,8 +1312,8 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
type | server->vals->shared_lock_type, 0, 1, false);
flock->fl_type = F_RDLCK;
if (rc != 0)
- cERROR(1, "Error unlocking previously locked "
- "range %d during test of lock", rc);
+ cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
+ rc);
} else
flock->fl_type = F_WRLCK;
@@ -1508,8 +1504,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
if (!CIFS_I(inode)->clientCanCacheAll &&
CIFS_I(inode)->clientCanCacheRead) {
cifs_invalidate_mapping(inode);
- cFYI(1, "Set no oplock for inode=%p due to mand locks",
- inode);
+ cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n",
+ inode);
CIFS_I(inode)->clientCanCacheRead = false;
}
@@ -1546,9 +1542,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
rc = -EACCES;
xid = get_xid();
- cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld "
- "end: %lld", cmd, flock->fl_flags, flock->fl_type,
- flock->fl_start, flock->fl_end);
+ cifs_dbg(FYI, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld end: %lld\n",
+ cmd, flock->fl_flags, flock->fl_type,
+ flock->fl_start, flock->fl_end);
cfile = (struct cifsFileInfo *)file->private_data;
tcon = tlink_tcon(cfile->tlink);
@@ -1620,8 +1616,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
cifs_sb = CIFS_SB(dentry->d_sb);
- cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
- *offset, dentry->d_name.name);
+ cifs_dbg(FYI, "write %zd bytes to offset %lld of %s\n",
+ write_size, *offset, dentry->d_name.name);
tcon = tlink_tcon(open_file->tlink);
server = tcon->ses->server;
@@ -1736,7 +1732,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
it being zero) during stress testcases so we need to check for it */
if (cifs_inode == NULL) {
- cERROR(1, "Null inode passed to cifs_writeable_file");
+ cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n");
dump_stack();
return NULL;
}
@@ -1848,7 +1844,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
else if (bytes_written < 0)
rc = bytes_written;
} else {
- cFYI(1, "No writeable filehandles for inode");
+ cifs_dbg(FYI, "No writeable filehandles for inode\n");
rc = -EIO;
}
@@ -2015,7 +2011,7 @@ retry:
wdata->cfile = find_writable_file(CIFS_I(mapping->host),
false);
if (!wdata->cfile) {
- cERROR(1, "No writable handles for inode");
+ cifs_dbg(VFS, "No writable handles for inode\n");
rc = -EBADF;
break;
}
@@ -2076,7 +2072,7 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
/* BB add check for wbc flags */
page_cache_get(page);
if (!PageUptodate(page))
- cFYI(1, "ppw - page not up to date");
+ cifs_dbg(FYI, "ppw - page not up to date\n");
/*
* Set the "writeback" flag, and clear "dirty" in the radix tree.
@@ -2127,7 +2123,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
else
pid = current->tgid;
- cFYI(1, "write_end for page %p from pos %lld with %d bytes",
+ cifs_dbg(FYI, "write_end for page %p from pos %lld with %d bytes\n",
page, pos, copied);
if (PageChecked(page)) {
@@ -2191,13 +2187,13 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
xid = get_xid();
- cFYI(1, "Sync file - name: %s datasync: 0x%x",
- file->f_path.dentry->d_name.name, datasync);
+ cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n",
+ file->f_path.dentry->d_name.name, datasync);
if (!CIFS_I(inode)->clientCanCacheRead) {
rc = cifs_invalidate_mapping(inode);
if (rc) {
- cFYI(1, "rc: %d during invalidate phase", rc);
+ cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc);
rc = 0; /* don't care about it in fsync */
}
}
@@ -2233,8 +2229,8 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
xid = get_xid();
- cFYI(1, "Sync file - name: %s datasync: 0x%x",
- file->f_path.dentry->d_name.name, datasync);
+ cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n",
+ file->f_path.dentry->d_name.name, datasync);
tcon = tlink_tcon(smbfile->tlink);
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
@@ -2262,7 +2258,7 @@ int cifs_flush(struct file *file, fl_owner_t id)
if (file->f_mode & FMODE_WRITE)
rc = filemap_write_and_wait(inode->i_mapping);
- cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
+ cifs_dbg(FYI, "Flush inode %p file %p rc %d\n", inode, file, rc);
return rc;
}
@@ -2520,8 +2516,6 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
BUG_ON(iocb->ki_pos != pos);
- sb_start_write(inode->i_sb);
-
/*
* We need to hold the sem to be sure nobody modifies lock list
* with a brlock that prevents writing.
@@ -2545,7 +2539,6 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
}
up_read(&cinode->lock_sem);
- sb_end_write(inode->i_sb);
return rc;
}
@@ -2582,8 +2575,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
* an old data.
*/
cifs_invalidate_mapping(inode);
- cFYI(1, "Set no oplock for inode=%p after a write operation",
- inode);
+ cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
+ inode);
cinode->clientCanCacheRead = false;
}
return written;
@@ -2759,15 +2752,15 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
/* enough data to fill the page */
iov.iov_base = kmap(page);
iov.iov_len = PAGE_SIZE;
- cFYI(1, "%u: iov_base=%p iov_len=%zu",
- i, iov.iov_base, iov.iov_len);
+ cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n",
+ i, iov.iov_base, iov.iov_len);
len -= PAGE_SIZE;
} else if (len > 0) {
/* enough for partial page, fill and zero the rest */
iov.iov_base = kmap(page);
iov.iov_len = len;
- cFYI(1, "%u: iov_base=%p iov_len=%zu",
- i, iov.iov_base, iov.iov_len);
+ cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n",
+ i, iov.iov_base, iov.iov_len);
memset(iov.iov_base + len, '\0', PAGE_SIZE - len);
rdata->tailsz = len;
len = 0;
@@ -2827,7 +2820,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
pid = current->tgid;
if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- cFYI(1, "attempting read on write only file instance");
+ cifs_dbg(FYI, "attempting read on write only file instance\n");
do {
cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
@@ -3006,7 +2999,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
pid = current->tgid;
if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- cFYI(1, "attempting read on write only file instance");
+ cifs_dbg(FYI, "attempting read on write only file instance\n");
for (total_read = 0, cur_offset = read_data; read_size > total_read;
total_read += bytes_read, cur_offset += bytes_read) {
@@ -3097,7 +3090,8 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
xid = get_xid();
rc = cifs_revalidate_file(file);
if (rc) {
- cFYI(1, "Validation prior to mmap failed, error=%d", rc);
+ cifs_dbg(FYI, "Validation prior to mmap failed, error=%d\n",
+ rc);
free_xid(xid);
return rc;
}
@@ -3150,7 +3144,7 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
/* determine the eof that the server (probably) has */
eof = CIFS_I(rdata->mapping->host)->server_eof;
eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
- cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index);
+ cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
rdata->tailsz = PAGE_CACHE_SIZE;
for (i = 0; i < nr_pages; i++) {
@@ -3160,15 +3154,15 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
/* enough data to fill the page */
iov.iov_base = kmap(page);
iov.iov_len = PAGE_CACHE_SIZE;
- cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
- i, page->index, iov.iov_base, iov.iov_len);
+ cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
+ i, page->index, iov.iov_base, iov.iov_len);
len -= PAGE_CACHE_SIZE;
} else if (len > 0) {
/* enough for partial page, fill and zero the rest */
iov.iov_base = kmap(page);
iov.iov_len = len;
- cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
- i, page->index, iov.iov_base, iov.iov_len);
+ cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
+ i, page->index, iov.iov_base, iov.iov_len);
memset(iov.iov_base + len,
'\0', PAGE_CACHE_SIZE - len);
rdata->tailsz = len;
@@ -3248,8 +3242,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rc = 0;
INIT_LIST_HEAD(&tmplist);
- cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file,
- mapping, num_pages);
+ cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
+ __func__, file, mapping, num_pages);
/*
* Start with the page at end of list and move it to private
@@ -3379,7 +3373,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
if (rc < 0)
goto io_error;
else
- cFYI(1, "Bytes read %d", rc);
+ cifs_dbg(FYI, "Bytes read %d\n", rc);
file_inode(file)->i_atime =
current_fs_time(file_inode(file)->i_sb);
@@ -3417,7 +3411,7 @@ static int cifs_readpage(struct file *file, struct page *page)
return rc;
}
- cFYI(1, "readpage %p at offset %d 0x%x",
+ cifs_dbg(FYI, "readpage %p at offset %d 0x%x\n",
page, (int)offset, (int)offset);
rc = cifs_readpage_worker(file, page, &offset);
@@ -3484,7 +3478,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
struct page *page;
int rc = 0;
- cFYI(1, "write_begin from %lld len %d", (long long)pos, len);
+ cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len);
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
@@ -3553,11 +3547,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
return cifs_fscache_release_page(page, gfp);
}
-static void cifs_invalidate_page(struct page *page, unsigned long offset)
+static void cifs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
}
@@ -3573,7 +3568,7 @@ static int cifs_launder_page(struct page *page)
.range_end = range_end,
};
- cFYI(1, "Launder page: %p", page);
+ cifs_dbg(FYI, "Launder page: %p\n", page);
if (clear_page_dirty_for_io(page))
rc = cifs_writepage_locked(page, &wbc);
@@ -3593,8 +3588,8 @@ void cifs_oplock_break(struct work_struct *work)
if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead &&
cifs_has_mand_locks(cinode)) {
- cFYI(1, "Reset oplock to None for inode=%p due to mand locks",
- inode);
+ cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n",
+ inode);
cinode->clientCanCacheRead = false;
}
@@ -3609,12 +3604,12 @@ void cifs_oplock_break(struct work_struct *work)
mapping_set_error(inode->i_mapping, rc);
cifs_invalidate_mapping(inode);
}
- cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
+ cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc);
}
rc = cifs_push_locks(cfile);
if (rc)
- cERROR(1, "Push locks rc = %d", rc);
+ cifs_dbg(VFS, "Push locks rc = %d\n", rc);
/*
* releasing stale oplock after recent reconnect of smb session using
@@ -3625,7 +3620,7 @@ void cifs_oplock_break(struct work_struct *work)
if (!cfile->oplock_break_cancelled) {
rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid,
cinode);
- cFYI(1, "Oplock release rc = %d", rc);
+ cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
}
}
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 42e5363b4102..2f4bc5a58054 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -28,14 +28,14 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
server->fscache =
fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
&cifs_fscache_server_index_def, server);
- cFYI(1, "%s: (0x%p/0x%p)", __func__, server,
- server->fscache);
+ cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+ __func__, server, server->fscache);
}
void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
{
- cFYI(1, "%s: (0x%p/0x%p)", __func__, server,
- server->fscache);
+ cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+ __func__, server, server->fscache);
fscache_relinquish_cookie(server->fscache, 0);
server->fscache = NULL;
}
@@ -47,13 +47,13 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
tcon->fscache =
fscache_acquire_cookie(server->fscache,
&cifs_fscache_super_index_def, tcon);
- cFYI(1, "%s: (0x%p/0x%p)", __func__, server->fscache,
- tcon->fscache);
+ cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+ __func__, server->fscache, tcon->fscache);
}
void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
{
- cFYI(1, "%s: (0x%p)", __func__, tcon->fscache);
+ cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache);
fscache_relinquish_cookie(tcon->fscache, 0);
tcon->fscache = NULL;
}
@@ -70,8 +70,8 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
&cifs_fscache_inode_object_def, cifsi);
- cFYI(1, "%s: got FH cookie (0x%p/0x%p)", __func__,
- tcon->fscache, cifsi->fscache);
+ cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n",
+ __func__, tcon->fscache, cifsi->fscache);
}
}
@@ -80,7 +80,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode)
struct cifsInodeInfo *cifsi = CIFS_I(inode);
if (cifsi->fscache) {
- cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache);
+ cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
fscache_relinquish_cookie(cifsi->fscache, 0);
cifsi->fscache = NULL;
}
@@ -91,7 +91,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode)
struct cifsInodeInfo *cifsi = CIFS_I(inode);
if (cifsi->fscache) {
- cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache);
+ cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
fscache_uncache_all_inode_pages(cifsi->fscache, inode);
fscache_relinquish_cookie(cifsi->fscache, 1);
cifsi->fscache = NULL;
@@ -120,8 +120,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
cifs_sb_master_tcon(cifs_sb)->fscache,
&cifs_fscache_inode_object_def,
cifsi);
- cFYI(1, "%s: new cookie 0x%p oldcookie 0x%p",
- __func__, cifsi->fscache, old);
+ cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n",
+ __func__, cifsi->fscache, old);
}
}
@@ -131,8 +131,8 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp)
struct inode *inode = page->mapping->host;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
- cFYI(1, "%s: (0x%p/0x%p)", __func__, page,
- cifsi->fscache);
+ cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+ __func__, page, cifsi->fscache);
if (!fscache_maybe_release_page(cifsi->fscache, page, gfp))
return 0;
}
@@ -143,7 +143,7 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp)
static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx,
int error)
{
- cFYI(1, "%s: (0x%p/%d)", __func__, page, error);
+ cifs_dbg(FYI, "%s: (0x%p/%d)\n", __func__, page, error);
if (!error)
SetPageUptodate(page);
unlock_page(page);
@@ -156,8 +156,8 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
{
int ret;
- cFYI(1, "%s: (fsc:%p, p:%p, i:0x%p", __func__,
- CIFS_I(inode)->fscache, page, inode);
+ cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n",
+ __func__, CIFS_I(inode)->fscache, page, inode);
ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page,
cifs_readpage_from_fscache_complete,
NULL,
@@ -165,15 +165,15 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
switch (ret) {
case 0: /* page found in fscache, read submitted */
- cFYI(1, "%s: submitted", __func__);
+ cifs_dbg(FYI, "%s: submitted\n", __func__);
return ret;
case -ENOBUFS: /* page won't be cached */
case -ENODATA: /* page not in cache */
- cFYI(1, "%s: %d", __func__, ret);
+ cifs_dbg(FYI, "%s: %d\n", __func__, ret);
return 1;
default:
- cERROR(1, "unknown error ret = %d", ret);
+ cifs_dbg(VFS, "unknown error ret = %d\n", ret);
}
return ret;
}
@@ -188,8 +188,8 @@ int __cifs_readpages_from_fscache(struct inode *inode,
{
int ret;
- cFYI(1, "%s: (0x%p/%u/0x%p)", __func__,
- CIFS_I(inode)->fscache, *nr_pages, inode);
+ cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n",
+ __func__, CIFS_I(inode)->fscache, *nr_pages, inode);
ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping,
pages, nr_pages,
cifs_readpage_from_fscache_complete,
@@ -197,16 +197,16 @@ int __cifs_readpages_from_fscache(struct inode *inode,
mapping_gfp_mask(mapping));
switch (ret) {
case 0: /* read submitted to the cache for all pages */
- cFYI(1, "%s: submitted", __func__);
+ cifs_dbg(FYI, "%s: submitted\n", __func__);
return ret;
case -ENOBUFS: /* some pages are not cached and can't be */
case -ENODATA: /* some pages are not cached */
- cFYI(1, "%s: no page", __func__);
+ cifs_dbg(FYI, "%s: no page\n", __func__);
return 1;
default:
- cFYI(1, "unknown error ret = %d", ret);
+ cifs_dbg(FYI, "unknown error ret = %d\n", ret);
}
return ret;
@@ -216,8 +216,8 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
{
int ret;
- cFYI(1, "%s: (fsc: %p, p: %p, i: %p)", __func__,
- CIFS_I(inode)->fscache, page, inode);
+ cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n",
+ __func__, CIFS_I(inode)->fscache, page, inode);
ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL);
if (ret != 0)
fscache_uncache_page(CIFS_I(inode)->fscache, page);
@@ -228,7 +228,7 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct fscache_cookie *cookie = cifsi->fscache;
- cFYI(1, "%s: (0x%p/0x%p)", __func__, page, cookie);
+ cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie);
fscache_wait_on_page_write(cookie, page);
fscache_uncache_page(cookie, page);
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 20887bf63121..20efd81266c6 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -91,30 +91,32 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
- cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid);
+ cifs_dbg(FYI, "%s: revalidating inode %llu\n",
+ __func__, cifs_i->uniqueid);
if (inode->i_state & I_NEW) {
- cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid);
+ cifs_dbg(FYI, "%s: inode %llu is new\n",
+ __func__, cifs_i->uniqueid);
return;
}
/* don't bother with revalidation if we have an oplock */
if (cifs_i->clientCanCacheRead) {
- cFYI(1, "%s: inode %llu is oplocked", __func__,
- cifs_i->uniqueid);
+ cifs_dbg(FYI, "%s: inode %llu is oplocked\n",
+ __func__, cifs_i->uniqueid);
return;
}
/* revalidate if mtime or size have changed */
if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
cifs_i->server_eof == fattr->cf_eof) {
- cFYI(1, "%s: inode %llu is unchanged", __func__,
- cifs_i->uniqueid);
+ cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
+ __func__, cifs_i->uniqueid);
return;
}
- cFYI(1, "%s: invalidating inode %llu mapping", __func__,
- cifs_i->uniqueid);
+ cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n",
+ __func__, cifs_i->uniqueid);
cifs_i->invalid_mapping = true;
}
@@ -169,7 +171,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
inode->i_flags |= S_AUTOMOUNT;
- cifs_set_ops(inode);
+ if (inode->i_state & I_NEW)
+ cifs_set_ops(inode);
}
void
@@ -240,7 +243,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
/* safest to call it a file if we do not know */
fattr->cf_mode |= S_IFREG;
fattr->cf_dtype = DT_REG;
- cFYI(1, "unknown type %d", le32_to_cpu(info->Type));
+ cifs_dbg(FYI, "unknown type %d\n", le32_to_cpu(info->Type));
break;
}
@@ -279,7 +282,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- cFYI(1, "creating fake fattr for DFS referral");
+ cifs_dbg(FYI, "creating fake fattr for DFS referral\n");
memset(fattr, 0, sizeof(*fattr));
fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
@@ -329,7 +332,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
struct tcon_link *tlink;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- cFYI(1, "Getting info on %s", full_path);
+ cifs_dbg(FYI, "Getting info on %s\n", full_path);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -355,7 +358,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
if (tmprc)
- cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+ cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
}
if (*pinode == NULL) {
@@ -422,7 +425,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
&buf_type);
if ((rc == 0) && (bytes_read >= 8)) {
if (memcmp("IntxBLK", pbuf, 8) == 0) {
- cFYI(1, "Block device");
+ cifs_dbg(FYI, "Block device\n");
fattr->cf_mode |= S_IFBLK;
fattr->cf_dtype = DT_BLK;
if (bytes_read == 24) {
@@ -434,7 +437,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
fattr->cf_rdev = MKDEV(mjr, mnr);
}
} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
- cFYI(1, "Char device");
+ cifs_dbg(FYI, "Char device\n");
fattr->cf_mode |= S_IFCHR;
fattr->cf_dtype = DT_CHR;
if (bytes_read == 24) {
@@ -446,7 +449,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
fattr->cf_rdev = MKDEV(mjr, mnr);
}
} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
- cFYI(1, "Symlink");
+ cifs_dbg(FYI, "Symlink\n");
fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
} else {
@@ -497,10 +500,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
else if (rc > 3) {
mode = le32_to_cpu(*((__le32 *)ea_value));
fattr->cf_mode &= ~SFBITS_MASK;
- cFYI(1, "special bits 0%o org mode 0%o", mode,
- fattr->cf_mode);
+ cifs_dbg(FYI, "special bits 0%o org mode 0%o\n",
+ mode, fattr->cf_mode);
fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
- cFYI(1, "special mode bits 0%o", mode);
+ cifs_dbg(FYI, "special mode bits 0%o\n", mode);
}
return 0;
@@ -635,11 +638,11 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
tcon = tlink_tcon(tlink);
server = tcon->ses->server;
- cFYI(1, "Getting info on %s", full_path);
+ cifs_dbg(FYI, "Getting info on %s\n", full_path);
if ((data == NULL) && (*inode != NULL)) {
if (CIFS_I(*inode)->clientCanCacheRead) {
- cFYI(1, "No need to revalidate cached inode sizes");
+ cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
goto cgii_exit;
}
}
@@ -714,7 +717,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
tcon, cifs_sb, full_path,
&fattr.cf_uniqueid, data);
if (tmprc) {
- cFYI(1, "GetSrvInodeNum rc %d", tmprc);
+ cifs_dbg(FYI, "GetSrvInodeNum rc %d\n",
+ tmprc);
fattr.cf_uniqueid = iunique(sb, ROOT_I);
cifs_autodisable_serverino(cifs_sb);
}
@@ -729,7 +733,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
if (tmprc)
- cFYI(1, "cifs_sfu_type failed: %d", tmprc);
+ cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc);
}
#ifdef CONFIG_CIFS_ACL
@@ -737,8 +741,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, full_path, fid);
if (rc) {
- cFYI(1, "%s: Getting ACL failed with error: %d",
- __func__, rc);
+ cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n",
+ __func__, rc);
goto cgii_exit;
}
}
@@ -752,7 +756,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
if (tmprc)
- cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+ cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
}
if (!*inode) {
@@ -836,7 +840,7 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
struct inode *inode;
retry_iget5_locked:
- cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid);
+ cifs_dbg(FYI, "looking for uniqueid=%llu\n", fattr->cf_uniqueid);
/* hash down to 32-bits on 32-bit arch */
hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
@@ -899,7 +903,7 @@ struct inode *cifs_root_iget(struct super_block *sb)
#endif
if (rc && tcon->ipc) {
- cFYI(1, "ipc connection - fake read inode");
+ cifs_dbg(FYI, "ipc connection - fake read inode\n");
spin_lock(&inode->i_lock);
inode->i_mode |= S_IFDIR;
set_nlink(inode, 2);
@@ -958,7 +962,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid,
* server times.
*/
if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
- cFYI(1, "CIFS - CTIME changed");
+ cifs_dbg(FYI, "CIFS - CTIME changed\n");
info_buf.ChangeTime =
cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
} else
@@ -1127,7 +1131,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
struct iattr *attrs = NULL;
__u32 dosattr = 0, origattr = 0;
- cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
+ cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -1150,7 +1154,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
rc = CIFSPOSIXDelFile(xid, tcon, full_path,
SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- cFYI(1, "posix del rc %d", rc);
+ cifs_dbg(FYI, "posix del rc %d\n", rc);
if ((rc == 0) || (rc == -ENOENT))
goto psx_del_no_retry;
}
@@ -1320,7 +1324,7 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
if (rc == -EOPNOTSUPP)
goto posix_mkdir_out;
else if (rc) {
- cFYI(1, "posix mkdir returned 0x%x", rc);
+ cifs_dbg(FYI, "posix mkdir returned 0x%x\n", rc);
d_drop(dentry);
goto posix_mkdir_out;
}
@@ -1342,11 +1346,12 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
d_instantiate(dentry, newinode);
#ifdef CONFIG_CIFS_DEBUG2
- cFYI(1, "instantiated dentry %p %s to inode %p", dentry,
- dentry->d_name.name, newinode);
+ cifs_dbg(FYI, "instantiated dentry %p %s to inode %p\n",
+ dentry, dentry->d_name.name, newinode);
if (newinode->i_nlink != 2)
- cFYI(1, "unexpected number of links %d", newinode->i_nlink);
+ cifs_dbg(FYI, "unexpected number of links %d\n",
+ newinode->i_nlink);
#endif
posix_mkdir_out:
@@ -1368,7 +1373,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
struct TCP_Server_Info *server;
char *full_path;
- cFYI(1, "In cifs_mkdir, mode = 0x%hx inode = 0x%p", mode, inode);
+ cifs_dbg(FYI, "In cifs_mkdir, mode = 0x%hx inode = 0x%p\n",
+ mode, inode);
cifs_sb = CIFS_SB(inode->i_sb);
tlink = cifs_sb_tlink(cifs_sb);
@@ -1402,7 +1408,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
/* BB add setting the equivalent of mode via CreateX w/ACLs */
rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb);
if (rc) {
- cFYI(1, "cifs_mkdir returned 0x%x", rc);
+ cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc);
d_drop(direntry);
goto mkdir_out;
}
@@ -1432,7 +1438,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
char *full_path = NULL;
struct cifsInodeInfo *cifsInode;
- cFYI(1, "cifs_rmdir, inode = 0x%p", inode);
+ cifs_dbg(FYI, "cifs_rmdir, inode = 0x%p\n", inode);
xid = get_xid();
@@ -1681,8 +1687,8 @@ cifs_invalidate_mapping(struct inode *inode)
if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
rc = invalidate_inode_pages2(inode->i_mapping);
if (rc) {
- cERROR(1, "%s: could not invalidate inode %p", __func__,
- inode);
+ cifs_dbg(VFS, "%s: could not invalidate inode %p\n",
+ __func__, inode);
cifs_i->invalid_mapping = true;
}
}
@@ -1732,8 +1738,8 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
goto out;
}
- cFYI(1, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time "
- "%ld jiffies %ld", full_path, inode, inode->i_count.counter,
+ cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n",
+ full_path, inode, inode->i_count.counter,
dentry, dentry->d_time, jiffies);
if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
@@ -1883,7 +1889,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
else
rc = -ENOSYS;
cifsFileInfo_put(open_file);
- cFYI(1, "SetFSize for attrs rc = %d", rc);
+ cifs_dbg(FYI, "SetFSize for attrs rc = %d\n", rc);
if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
unsigned int bytes_written;
@@ -1894,7 +1900,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
io_parms.length = attrs->ia_size;
rc = CIFSSMBWrite(xid, &io_parms, &bytes_written,
NULL, NULL, 1);
- cFYI(1, "Wrt seteof rc %d", rc);
+ cifs_dbg(FYI, "Wrt seteof rc %d\n", rc);
}
} else
rc = -EINVAL;
@@ -1920,7 +1926,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
attrs->ia_size, cifs_sb, false);
else
rc = -ENOSYS;
- cFYI(1, "SetEOF by path (setattrs) rc = %d", rc);
+ cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc);
if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
__u16 netfid;
int oplock = 0;
@@ -1940,7 +1946,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
io_parms.length = attrs->ia_size;
rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL,
NULL, 1);
- cFYI(1, "wrt seteof rc %d", rc);
+ cifs_dbg(FYI, "wrt seteof rc %d\n", rc);
CIFSSMBClose(xid, tcon, netfid);
}
}
@@ -1971,7 +1977,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
struct cifs_unix_set_info_args *args = NULL;
struct cifsFileInfo *open_file;
- cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x",
+ cifs_dbg(FYI, "setattr_unix on file %s attrs->ia_valid=0x%x\n",
direntry->d_name.name, attrs->ia_valid);
xid = get_xid();
@@ -2114,7 +2120,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
xid = get_xid();
- cFYI(1, "setattr on file %s attrs->iavalid 0x%x",
+ cifs_dbg(FYI, "setattr on file %s attrs->iavalid 0x%x\n",
direntry->d_name.name, attrs->ia_valid);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
@@ -2166,8 +2172,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,
uid, gid);
if (rc) {
- cFYI(1, "%s: Setting id failed with error: %d",
- __func__, rc);
+ cifs_dbg(FYI, "%s: Setting id failed with error: %d\n",
+ __func__, rc);
goto cifs_setattr_exit;
}
}
@@ -2188,8 +2194,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
rc = id_mode_to_cifs_acl(inode, full_path, mode,
INVALID_UID, INVALID_GID);
if (rc) {
- cFYI(1, "%s: Setting ACL failed with error: %d",
- __func__, rc);
+ cifs_dbg(FYI, "%s: Setting ACL failed with error: %d\n",
+ __func__, rc);
goto cifs_setattr_exit;
}
} else
@@ -2277,7 +2283,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
#if 0
void cifs_delete_inode(struct inode *inode)
{
- cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode);
+ cifs_dbg(FYI, "In cifs_delete_inode, inode = 0x%p\n", inode);
/* may have to add back in if and when safe distributed caching of
directories added e.g. via FindNotify */
}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 6c9f1214cf0b..3e0845585853 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -44,7 +44,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
xid = get_xid();
- cFYI(1, "ioctl file %p cmd %u arg %lu", filep, command, arg);
+ cifs_dbg(FYI, "ioctl file %p cmd %u arg %lu\n", filep, command, arg);
cifs_sb = CIFS_SB(inode->i_sb);
@@ -83,11 +83,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
* &ExtAttrMask);
*/
}
- cFYI(1, "set flags not implemented yet");
+ cifs_dbg(FYI, "set flags not implemented yet\n");
break;
#endif /* CONFIG_CIFS_POSIX */
default:
- cFYI(1, "unsupported ioctl");
+ cifs_dbg(FYI, "unsupported ioctl\n");
break;
}
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 9f6c4c45d21e..b83c3f5646bd 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -56,14 +56,14 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
md5 = crypto_alloc_shash("md5", 0, 0);
if (IS_ERR(md5)) {
rc = PTR_ERR(md5);
- cERROR(1, "%s: Crypto md5 allocation error %d", __func__, rc);
+ cifs_dbg(VFS, "%s: Crypto md5 allocation error %d\n",
+ __func__, rc);
return rc;
}
size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
sdescmd5 = kmalloc(size, GFP_KERNEL);
if (!sdescmd5) {
rc = -ENOMEM;
- cERROR(1, "%s: Memory allocation failure", __func__);
goto symlink_hash_err;
}
sdescmd5->shash.tfm = md5;
@@ -71,17 +71,17 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
rc = crypto_shash_init(&sdescmd5->shash);
if (rc) {
- cERROR(1, "%s: Could not init md5 shash", __func__);
+ cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__);
goto symlink_hash_err;
}
rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len);
if (rc) {
- cERROR(1, "%s: Could not update with link_str", __func__);
+ cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
goto symlink_hash_err;
}
rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
if (rc)
- cERROR(1, "%s: Could not generate md5 hash", __func__);
+ cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
symlink_hash_err:
crypto_free_shash(md5);
@@ -115,7 +115,7 @@ CIFSParseMFSymlink(const u8 *buf,
rc = symlink_hash(link_len, link_str, md5_hash);
if (rc) {
- cFYI(1, "%s: MD5 hash failure: %d", __func__, rc);
+ cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
return rc;
}
@@ -154,7 +154,7 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
rc = symlink_hash(link_len, link_str, md5_hash);
if (rc) {
- cFYI(1, "%s: MD5 hash failure: %d", __func__, rc);
+ cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
return rc;
}
@@ -521,7 +521,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
if (!full_path)
goto out;
- cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
+ cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode);
rc = -EACCES;
/*
@@ -578,8 +578,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
goto symlink_exit;
}
- cFYI(1, "Full path: %s", full_path);
- cFYI(1, "symname is %s", symname);
+ cifs_dbg(FYI, "Full path: %s\n", full_path);
+ cifs_dbg(FYI, "symname is %s\n", symname);
/* BB what if DFS and this volume is on different share? BB */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
@@ -601,8 +601,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
inode->i_sb, xid, NULL);
if (rc != 0) {
- cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
- rc);
+ cifs_dbg(FYI, "Create symlink ok, getinodeinfo fail rc = %d\n",
+ rc);
} else {
d_instantiate(direntry, newinode);
}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1b15bf839f37..f7d4b2285efe 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -54,7 +54,7 @@ _get_xid(void)
if (GlobalTotalActiveXid > GlobalMaxActiveXid)
GlobalMaxActiveXid = GlobalTotalActiveXid;
if (GlobalTotalActiveXid > 65000)
- cFYI(1, "warning: more than 65000 requests active");
+ cifs_dbg(FYI, "warning: more than 65000 requests active\n");
xid = GlobalCurrentXid++;
spin_unlock(&GlobalMid_Lock);
return xid;
@@ -91,7 +91,7 @@ void
sesInfoFree(struct cifs_ses *buf_to_free)
{
if (buf_to_free == NULL) {
- cFYI(1, "Null buffer passed to sesInfoFree");
+ cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n");
return;
}
@@ -130,7 +130,7 @@ void
tconInfoFree(struct cifs_tcon *buf_to_free)
{
if (buf_to_free == NULL) {
- cFYI(1, "Null buffer passed to tconInfoFree");
+ cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n");
return;
}
atomic_dec(&tconInfoAllocCount);
@@ -180,7 +180,7 @@ void
cifs_buf_release(void *buf_to_free)
{
if (buf_to_free == NULL) {
- /* cFYI(1, "Null buffer passed to cifs_buf_release");*/
+ /* cifs_dbg(FYI, "Null buffer passed to cifs_buf_release\n");*/
return;
}
mempool_free(buf_to_free, cifs_req_poolp);
@@ -216,7 +216,7 @@ cifs_small_buf_release(void *buf_to_free)
{
if (buf_to_free == NULL) {
- cFYI(1, "Null buffer passed to cifs_small_buf_release");
+ cifs_dbg(FYI, "Null buffer passed to cifs_small_buf_release\n");
return;
}
mempool_free(buf_to_free, cifs_sm_req_poolp);
@@ -267,8 +267,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
if (treeCon->nocase)
buffer->Flags |= SMBFLG_CASELESS;
if ((treeCon->ses) && (treeCon->ses->server))
- if (treeCon->ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (treeCon->ses->server->sign)
buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
}
@@ -282,15 +281,15 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
{
/* does it have the right SMB "signature" ? */
if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
- cERROR(1, "Bad protocol string signature header 0x%x",
- *(unsigned int *)smb->Protocol);
+ cifs_dbg(VFS, "Bad protocol string signature header 0x%x\n",
+ *(unsigned int *)smb->Protocol);
return 1;
}
/* Make sure that message ids match */
if (mid != smb->Mid) {
- cERROR(1, "Mids do not match. received=%u expected=%u",
- smb->Mid, mid);
+ cifs_dbg(VFS, "Mids do not match. received=%u expected=%u\n",
+ smb->Mid, mid);
return 1;
}
@@ -302,7 +301,7 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
if (smb->Command == SMB_COM_LOCKING_ANDX)
return 0;
- cERROR(1, "Server sent request, not response. mid=%u", smb->Mid);
+ cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", smb->Mid);
return 1;
}
@@ -313,8 +312,8 @@ checkSMB(char *buf, unsigned int total_read)
__u16 mid = smb->Mid;
__u32 rfclen = be32_to_cpu(smb->smb_buf_length);
__u32 clc_len; /* calculated length */
- cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x",
- total_read, rfclen);
+ cifs_dbg(FYI, "checkSMB Length: 0x%x, smb_buf_length: 0x%x\n",
+ total_read, rfclen);
/* is this frame too small to even get to a BCC? */
if (total_read < 2 + sizeof(struct smb_hdr)) {
@@ -340,9 +339,9 @@ checkSMB(char *buf, unsigned int total_read)
tmp[sizeof(struct smb_hdr)+1] = 0;
return 0;
}
- cERROR(1, "rcvd invalid byte count (bcc)");
+ cifs_dbg(VFS, "rcvd invalid byte count (bcc)\n");
} else {
- cERROR(1, "Length less than smb header size");
+ cifs_dbg(VFS, "Length less than smb header size\n");
}
return -EIO;
}
@@ -353,8 +352,8 @@ checkSMB(char *buf, unsigned int total_read)
clc_len = smbCalcSize(smb);
if (4 + rfclen != total_read) {
- cERROR(1, "Length read does not match RFC1001 length %d",
- rfclen);
+ cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n",
+ rfclen);
return -EIO;
}
@@ -365,12 +364,12 @@ checkSMB(char *buf, unsigned int total_read)
if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF))
return 0; /* bcc wrapped */
}
- cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
- clc_len, 4 + rfclen, smb->Mid);
+ cifs_dbg(FYI, "Calculated size %u vs length %u mismatch for mid=%u\n",
+ clc_len, 4 + rfclen, smb->Mid);
if (4 + rfclen < clc_len) {
- cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
- rfclen, smb->Mid);
+ cifs_dbg(VFS, "RFC1001 size %u smaller than SMB for mid=%u\n",
+ rfclen, smb->Mid);
return -EIO;
} else if (rfclen > clc_len + 512) {
/*
@@ -382,8 +381,8 @@ checkSMB(char *buf, unsigned int total_read)
* trailing data, we choose limit the amount of extra
* data to 512 bytes.
*/
- cERROR(1, "RFC1001 size %u more than 512 bytes larger "
- "than SMB for mid=%u", rfclen, smb->Mid);
+ cifs_dbg(VFS, "RFC1001 size %u more than 512 bytes larger than SMB for mid=%u\n",
+ rfclen, smb->Mid);
return -EIO;
}
}
@@ -401,7 +400,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
struct cifsInodeInfo *pCifsInode;
struct cifsFileInfo *netfile;
- cFYI(1, "Checking for oplock break or dnotify response");
+ cifs_dbg(FYI, "Checking for oplock break or dnotify response\n");
if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
(pSMB->hdr.Flags & SMBFLG_RESPONSE)) {
struct smb_com_transaction_change_notify_rsp *pSMBr =
@@ -413,15 +412,15 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
pnotify = (struct file_notify_information *)
((char *)&pSMBr->hdr.Protocol + data_offset);
- cFYI(1, "dnotify on %s Action: 0x%x",
+ cifs_dbg(FYI, "dnotify on %s Action: 0x%x\n",
pnotify->FileName, pnotify->Action);
/* cifs_dump_mem("Rcvd notify Data: ",buf,
sizeof(struct smb_hdr)+60); */
return true;
}
if (pSMBr->hdr.Status.CifsError) {
- cFYI(1, "notify err 0x%d",
- pSMBr->hdr.Status.CifsError);
+ cifs_dbg(FYI, "notify err 0x%d\n",
+ pSMBr->hdr.Status.CifsError);
return true;
}
return false;
@@ -435,7 +434,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
large dirty files cached on the client */
if ((NT_STATUS_INVALID_HANDLE) ==
le32_to_cpu(pSMB->hdr.Status.CifsError)) {
- cFYI(1, "invalid handle on oplock break");
+ cifs_dbg(FYI, "invalid handle on oplock break\n");
return true;
} else if (ERRbadfid ==
le16_to_cpu(pSMB->hdr.Status.DosError.Error)) {
@@ -447,7 +446,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
if (pSMB->hdr.WordCount != 8)
return false;
- cFYI(1, "oplock type 0x%d level 0x%d",
+ cifs_dbg(FYI, "oplock type 0x%d level 0x%d\n",
pSMB->LockType, pSMB->OplockLevel);
if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
return false;
@@ -469,7 +468,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
if (pSMB->Fid != netfile->fid.netfid)
continue;
- cFYI(1, "file id match, oplock break");
+ cifs_dbg(FYI, "file id match, oplock break\n");
pCifsInode = CIFS_I(netfile->dentry->d_inode);
cifs_set_oplock_level(pCifsInode,
@@ -484,12 +483,12 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
}
spin_unlock(&cifs_file_list_lock);
spin_unlock(&cifs_tcp_ses_lock);
- cFYI(1, "No matching file for oplock break");
+ cifs_dbg(FYI, "No matching file for oplock break\n");
return true;
}
}
spin_unlock(&cifs_tcp_ses_lock);
- cFYI(1, "Can not process oplock break for non-existent connection");
+ cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
return true;
}
@@ -536,12 +535,8 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
{
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
- cERROR(1, "Autodisabling the use of server inode numbers on "
- "%s. This server doesn't seem to support them "
- "properly. Hardlinks will not be recognized on this "
- "mount. Consider mounting with the \"noserverino\" "
- "option to silence this message.",
- cifs_sb_master_tcon(cifs_sb)->treeName);
+ cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s. This server doesn't seem to support them properly. Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n",
+ cifs_sb_master_tcon(cifs_sb)->treeName);
}
}
@@ -552,13 +547,13 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
if (oplock == OPLOCK_EXCLUSIVE) {
cinode->clientCanCacheAll = true;
cinode->clientCanCacheRead = true;
- cFYI(1, "Exclusive Oplock granted on inode %p",
- &cinode->vfs_inode);
+ cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
+ &cinode->vfs_inode);
} else if (oplock == OPLOCK_READ) {
cinode->clientCanCacheAll = false;
cinode->clientCanCacheRead = true;
- cFYI(1, "Level II Oplock granted on inode %p",
- &cinode->vfs_inode);
+ cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
+ &cinode->vfs_inode);
} else {
cinode->clientCanCacheAll = false;
cinode->clientCanCacheRead = false;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index c0b25b28be6c..af847e1cf1c1 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -150,8 +150,8 @@ cifs_inet_pton(const int address_family, const char *cp, int len, void *dst)
else if (address_family == AF_INET6)
ret = in6_pton(cp, len, dst , '\\', NULL);
- cFYI(DBG2, "address conversion returned %d for %*.*s",
- ret, len, len, cp);
+ cifs_dbg(NOISY, "address conversion returned %d for %*.*s\n",
+ ret, len, len, cp);
if (ret > 0)
ret = 1;
return ret;
@@ -887,7 +887,7 @@ map_smb_to_linux_error(char *buf, bool logErr)
}
/* else ERRHRD class errors or junk - return EIO */
- cFYI(1, "Mapping smb error code 0x%x to POSIX err %d",
+ cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n",
le32_to_cpu(smb->Status.CifsError), rc);
/* generic corrective action e.g. reconnect SMB session on
@@ -951,20 +951,20 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
SMB_TIME *st = (SMB_TIME *)&time;
SMB_DATE *sd = (SMB_DATE *)&date;
- cFYI(1, "date %d time %d", date, time);
+ cifs_dbg(FYI, "date %d time %d\n", date, time);
sec = 2 * st->TwoSeconds;
min = st->Minutes;
if ((sec > 59) || (min > 59))
- cERROR(1, "illegal time min %d sec %d", min, sec);
+ cifs_dbg(VFS, "illegal time min %d sec %d\n", min, sec);
sec += (min * 60);
sec += 60 * 60 * st->Hours;
if (st->Hours > 24)
- cERROR(1, "illegal hours %d", st->Hours);
+ cifs_dbg(VFS, "illegal hours %d\n", st->Hours);
days = sd->Day;
month = sd->Month;
if ((days > 31) || (month > 12)) {
- cERROR(1, "illegal date, month %d day: %d", month, days);
+ cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days);
if (month > 12)
month = 12;
}
@@ -990,7 +990,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
ts.tv_sec = sec + offset;
- /* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */
+ /* cifs_dbg(FYI, "sec after cnvrt dos to unix time %d\n",sec); */
ts.tv_nsec = 0;
return ts;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index df40cc5fd13a..ab8778469394 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -48,15 +48,15 @@ static void dump_cifs_file_struct(struct file *file, char *label)
if (file) {
cf = file->private_data;
if (cf == NULL) {
- cFYI(1, "empty cifs private file data");
+ cifs_dbg(FYI, "empty cifs private file data\n");
return;
}
if (cf->invalidHandle)
- cFYI(1, "invalid handle");
+ cifs_dbg(FYI, "invalid handle\n");
if (cf->srch_inf.endOfSearch)
- cFYI(1, "end of search");
+ cifs_dbg(FYI, "end of search\n");
if (cf->srch_inf.emptyDir)
- cFYI(1, "empty dir");
+ cifs_dbg(FYI, "empty dir\n");
}
}
#else
@@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
struct super_block *sb = parent->d_inode->i_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- cFYI(1, "%s: for %s", __func__, name->name);
+ cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
dentry = d_hash_and_lookup(parent, name);
if (unlikely(IS_ERR(dentry)))
@@ -126,6 +126,22 @@ out:
dput(dentry);
}
+/*
+ * Is it possible that this directory might turn out to be a DFS referral
+ * once we go to try and use it?
+ */
+static bool
+cifs_dfs_is_possible(struct cifs_sb_info *cifs_sb)
+{
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+ if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+ return true;
+#endif
+ return false;
+}
+
static void
cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
{
@@ -135,6 +151,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
fattr->cf_dtype = DT_DIR;
+ /*
+ * Windows CIFS servers generally make DFS referrals look
+ * like directories in FIND_* responses with the reparse
+ * attribute flag also set (since DFS junctions are
+ * reparse points). We must revalidate at least these
+ * directory inodes before trying to use them (if
+ * they are DFS we will get PATH_NOT_COVERED back
+ * when queried directly and can then try to connect
+ * to the DFS target)
+ */
+ if (cifs_dfs_is_possible(cifs_sb) &&
+ (fattr->cf_cifsattrs & ATTR_REPARSE))
+ fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
} else {
fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
fattr->cf_dtype = DT_REG;
@@ -233,7 +262,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
fid,
cifs_sb->local_nls);
if (CIFSSMBClose(xid, ptcon, fid)) {
- cFYI(1, "Error closing temporary reparsepoint open");
+ cifs_dbg(FYI, "Error closing temporary reparsepoint open\n");
}
}
}
@@ -285,7 +314,7 @@ initiate_cifs_search(const unsigned int xid, struct file *file)
goto error_exit;
}
- cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
+ cifs_dbg(FYI, "Full path: %s start at: %lld\n", full_path, file->f_pos);
ffirst_retry:
/* test for Unix extensions */
@@ -336,7 +365,7 @@ static int cifs_unicode_bytelen(const char *str)
if (ustr[len] == 0)
return len << 1;
}
- cFYI(1, "Unicode string longer than PATH_MAX found");
+ cifs_dbg(FYI, "Unicode string longer than PATH_MAX found\n");
return len << 1;
}
@@ -353,18 +382,18 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
pfData->FileNameLength;
} else
new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
- cFYI(1, "new entry %p old entry %p", new_entry, old_entry);
+ cifs_dbg(FYI, "new entry %p old entry %p\n", new_entry, old_entry);
/* validate that new_entry is not past end of SMB */
if (new_entry >= end_of_smb) {
- cERROR(1, "search entry %p began after end of SMB %p old entry %p",
- new_entry, end_of_smb, old_entry);
+ cifs_dbg(VFS, "search entry %p began after end of SMB %p old entry %p\n",
+ new_entry, end_of_smb, old_entry);
return NULL;
} else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
(new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb))
|| ((level != SMB_FIND_FILE_INFO_STANDARD) &&
(new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) {
- cERROR(1, "search entry %p extends after end of SMB %p",
- new_entry, end_of_smb);
+ cifs_dbg(VFS, "search entry %p extends after end of SMB %p\n",
+ new_entry, end_of_smb);
return NULL;
} else
return new_entry;
@@ -457,7 +486,7 @@ static int cifs_fill_dirent(struct cifs_dirent *de, const void *info,
cifs_fill_dirent_std(de, info);
break;
default:
- cFYI(1, "Unknown findfirst level %d", level);
+ cifs_dbg(FYI, "Unknown findfirst level %d\n", level);
return -EINVAL;
}
@@ -537,14 +566,14 @@ static int cifs_save_resume_key(const char *current_entry,
* every entry (do not increment for . or .. entry).
*/
static int
-find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
+find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
struct file *file, char **current_entry, int *num_to_ret)
{
__u16 search_flags;
int rc = 0;
int pos_in_buf = 0;
loff_t first_entry_in_buffer;
- loff_t index_to_find = file->f_pos;
+ loff_t index_to_find = pos;
struct cifsFileInfo *cfile = file->private_data;
struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
struct TCP_Server_Info *server = tcon->ses->server;
@@ -572,7 +601,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
if (((index_to_find < cfile->srch_inf.index_of_last_entry) &&
is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) {
/* close and restart search */
- cFYI(1, "search backing up - close and restart search");
+ cifs_dbg(FYI, "search backing up - close and restart search\n");
spin_lock(&cifs_file_list_lock);
if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
cfile->invalidHandle = true;
@@ -582,7 +611,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
} else
spin_unlock(&cifs_file_list_lock);
if (cfile->srch_inf.ntwrk_buf_start) {
- cFYI(1, "freeing SMB ff cache buf on search rewind");
+ cifs_dbg(FYI, "freeing SMB ff cache buf on search rewind\n");
if (cfile->srch_inf.smallBuf)
cifs_small_buf_release(cfile->srch_inf.
ntwrk_buf_start);
@@ -593,7 +622,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
}
rc = initiate_cifs_search(xid, file);
if (rc) {
- cFYI(1, "error %d reinitiating a search on rewind",
+ cifs_dbg(FYI, "error %d reinitiating a search on rewind\n",
rc);
return rc;
}
@@ -608,7 +637,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
while ((index_to_find >= cfile->srch_inf.index_of_last_entry) &&
(rc == 0) && !cfile->srch_inf.endOfSearch) {
- cFYI(1, "calling findnext2");
+ cifs_dbg(FYI, "calling findnext2\n");
rc = server->ops->query_dir_next(xid, tcon, &cfile->fid,
search_flags,
&cfile->srch_inf);
@@ -631,7 +660,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
first_entry_in_buffer = cfile->srch_inf.index_of_last_entry
- cfile->srch_inf.entries_in_buffer;
pos_in_buf = index_to_find - first_entry_in_buffer;
- cFYI(1, "found entry - pos_in_buf %d", pos_in_buf);
+ cifs_dbg(FYI, "found entry - pos_in_buf %d\n", pos_in_buf);
for (i = 0; (i < (pos_in_buf)) && (cur_ent != NULL); i++) {
/* go entry by entry figuring out which is first */
@@ -640,19 +669,18 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
}
if ((cur_ent == NULL) && (i < pos_in_buf)) {
/* BB fixme - check if we should flag this error */
- cERROR(1, "reached end of buf searching for pos in buf"
- " %d index to find %lld rc %d", pos_in_buf,
- index_to_find, rc);
+ cifs_dbg(VFS, "reached end of buf searching for pos in buf %d index to find %lld rc %d\n",
+ pos_in_buf, index_to_find, rc);
}
rc = 0;
*current_entry = cur_ent;
} else {
- cFYI(1, "index not in buffer - could not findnext into it");
+ cifs_dbg(FYI, "index not in buffer - could not findnext into it\n");
return 0;
}
if (pos_in_buf >= cfile->srch_inf.entries_in_buffer) {
- cFYI(1, "can not return entries pos_in_buf beyond last");
+ cifs_dbg(FYI, "can not return entries pos_in_buf beyond last\n");
*num_to_ret = 0;
} else
*num_to_ret = cfile->srch_inf.entries_in_buffer - pos_in_buf;
@@ -660,8 +688,9 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
-static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
- void *dirent, char *scratch_buf, unsigned int max_len)
+static int cifs_filldir(char *find_entry, struct file *file,
+ struct dir_context *ctx,
+ char *scratch_buf, unsigned int max_len)
{
struct cifsFileInfo *file_info = file->private_data;
struct super_block *sb = file->f_path.dentry->d_sb;
@@ -678,8 +707,8 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
return rc;
if (de.namelen > max_len) {
- cERROR(1, "bad search response length %zd past smb end",
- de.namelen);
+ cifs_dbg(VFS, "bad search response length %zd past smb end\n",
+ de.namelen);
return -EINVAL;
}
@@ -741,13 +770,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
cifs_prime_dcache(file->f_dentry, &name, &fattr);
ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
- rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
- fattr.cf_dtype);
- return rc;
+ return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype);
}
-int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
+int cifs_readdir(struct file *file, struct dir_context *ctx)
{
int rc = 0;
unsigned int xid;
@@ -768,108 +795,91 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
*/
if (file->private_data == NULL) {
rc = initiate_cifs_search(xid, file);
- cFYI(1, "initiate cifs search rc %d", rc);
+ cifs_dbg(FYI, "initiate cifs search rc %d\n", rc);
if (rc)
goto rddir2_exit;
}
- switch ((int) file->f_pos) {
- case 0:
- if (filldir(direntry, ".", 1, file->f_pos,
- file_inode(file)->i_ino, DT_DIR) < 0) {
- cERROR(1, "Filldir for current dir failed");
- rc = -ENOMEM;
- break;
- }
- file->f_pos++;
- case 1:
- if (filldir(direntry, "..", 2, file->f_pos,
- parent_ino(file->f_path.dentry), DT_DIR) < 0) {
- cERROR(1, "Filldir for parent dir failed");
- rc = -ENOMEM;
- break;
- }
- file->f_pos++;
- default:
- /* 1) If search is active,
- is in current search buffer?
- if it before then restart search
- if after then keep searching till find it */
-
- if (file->private_data == NULL) {
- rc = -EINVAL;
- free_xid(xid);
- return rc;
- }
- cifsFile = file->private_data;
- if (cifsFile->srch_inf.endOfSearch) {
- if (cifsFile->srch_inf.emptyDir) {
- cFYI(1, "End of search, empty dir");
- rc = 0;
- break;
- }
- } /* else {
- cifsFile->invalidHandle = true;
- tcon->ses->server->close(xid, tcon, &cifsFile->fid);
- } */
+ if (!dir_emit_dots(file, ctx))
+ goto rddir2_exit;
- tcon = tlink_tcon(cifsFile->tlink);
- rc = find_cifs_entry(xid, tcon, file, &current_entry,
- &num_to_fill);
- if (rc) {
- cFYI(1, "fce error %d", rc);
- goto rddir2_exit;
- } else if (current_entry != NULL) {
- cFYI(1, "entry %lld found", file->f_pos);
- } else {
- cFYI(1, "could not find entry");
+ /* 1) If search is active,
+ is in current search buffer?
+ if it before then restart search
+ if after then keep searching till find it */
+
+ if (file->private_data == NULL) {
+ rc = -EINVAL;
+ goto rddir2_exit;
+ }
+ cifsFile = file->private_data;
+ if (cifsFile->srch_inf.endOfSearch) {
+ if (cifsFile->srch_inf.emptyDir) {
+ cifs_dbg(FYI, "End of search, empty dir\n");
+ rc = 0;
goto rddir2_exit;
}
- cFYI(1, "loop through %d times filling dir for net buf %p",
- num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
- max_len = tcon->ses->server->ops->calc_smb_size(
- cifsFile->srch_inf.ntwrk_buf_start);
- end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
-
- tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
- if (tmp_buf == NULL) {
- rc = -ENOMEM;
+ } /* else {
+ cifsFile->invalidHandle = true;
+ tcon->ses->server->close(xid, tcon, &cifsFile->fid);
+ } */
+
+ tcon = tlink_tcon(cifsFile->tlink);
+ rc = find_cifs_entry(xid, tcon, ctx->pos, file, &current_entry,
+ &num_to_fill);
+ if (rc) {
+ cifs_dbg(FYI, "fce error %d\n", rc);
+ goto rddir2_exit;
+ } else if (current_entry != NULL) {
+ cifs_dbg(FYI, "entry %lld found\n", ctx->pos);
+ } else {
+ cifs_dbg(FYI, "could not find entry\n");
+ goto rddir2_exit;
+ }
+ cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
+ num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
+ max_len = tcon->ses->server->ops->calc_smb_size(
+ cifsFile->srch_inf.ntwrk_buf_start);
+ end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
+
+ tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+ if (tmp_buf == NULL) {
+ rc = -ENOMEM;
+ goto rddir2_exit;
+ }
+
+ for (i = 0; i < num_to_fill; i++) {
+ if (current_entry == NULL) {
+ /* evaluate whether this case is an error */
+ cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n",
+ num_to_fill, i);
break;
}
-
- for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
- if (current_entry == NULL) {
- /* evaluate whether this case is an error */
- cERROR(1, "past SMB end, num to fill %d i %d",
- num_to_fill, i);
- break;
- }
- /*
- * if buggy server returns . and .. late do we want to
- * check for that here?
- */
- rc = cifs_filldir(current_entry, file, filldir,
- direntry, tmp_buf, max_len);
- if (rc == -EOVERFLOW) {
+ /*
+ * if buggy server returns . and .. late do we want to
+ * check for that here?
+ */
+ rc = cifs_filldir(current_entry, file, ctx,
+ tmp_buf, max_len);
+ if (rc) {
+ if (rc > 0)
rc = 0;
- break;
- }
-
- file->f_pos++;
- if (file->f_pos ==
- cifsFile->srch_inf.index_of_last_entry) {
- cFYI(1, "last entry in buf at pos %lld %s",
- file->f_pos, tmp_buf);
- cifs_save_resume_key(current_entry, cifsFile);
- break;
- } else
- current_entry =
- nxt_dir_entry(current_entry, end_of_smb,
- cifsFile->srch_inf.info_level);
+ break;
}
- kfree(tmp_buf);
- break;
- } /* end switch */
+
+ ctx->pos++;
+ if (ctx->pos ==
+ cifsFile->srch_inf.index_of_last_entry) {
+ cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
+ ctx->pos, tmp_buf);
+ cifs_save_resume_key(current_entry, cifsFile);
+ break;
+ } else
+ current_entry =
+ nxt_dir_entry(current_entry, end_of_smb,
+ cifsFile->srch_inf.info_level);
+ }
+ kfree(tmp_buf);
rddir2_exit:
free_xid(xid);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 76809f4d3428..79358e341fd2 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -138,8 +138,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
- if (ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (ses->server->sign)
pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
if (ses->capabilities & CAP_UNICODE) {
@@ -283,11 +282,11 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
int len;
char *data = *pbcc_area;
- cFYI(1, "bleft %d", bleft);
+ cifs_dbg(FYI, "bleft %d\n", bleft);
kfree(ses->serverOS);
ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
- cFYI(1, "serverOS=%s", ses->serverOS);
+ cifs_dbg(FYI, "serverOS=%s\n", ses->serverOS);
len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
data += len;
bleft -= len;
@@ -296,7 +295,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
kfree(ses->serverNOS);
ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
- cFYI(1, "serverNOS=%s", ses->serverNOS);
+ cifs_dbg(FYI, "serverNOS=%s\n", ses->serverNOS);
len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
data += len;
bleft -= len;
@@ -305,41 +304,38 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
kfree(ses->serverDomain);
ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
- cFYI(1, "serverDomain=%s", ses->serverDomain);
+ cifs_dbg(FYI, "serverDomain=%s\n", ses->serverDomain);
return;
}
-static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
- struct cifs_ses *ses,
- const struct nls_table *nls_cp)
+static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
+ struct cifs_ses *ses,
+ const struct nls_table *nls_cp)
{
- int rc = 0;
int len;
char *bcc_ptr = *pbcc_area;
- cFYI(1, "decode sessetup ascii. bleft %d", bleft);
+ cifs_dbg(FYI, "decode sessetup ascii. bleft %d\n", bleft);
len = strnlen(bcc_ptr, bleft);
if (len >= bleft)
- return rc;
+ return;
kfree(ses->serverOS);
ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
if (ses->serverOS)
strncpy(ses->serverOS, bcc_ptr, len);
- if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
- cFYI(1, "OS/2 server");
- ses->flags |= CIFS_SES_OS2;
- }
+ /* serverOS is NULL if the kzalloc() above failed; do not pass it to strncmp() */
+ if (ses->serverOS && strncmp(ses->serverOS, "OS/2", 4) == 0)
+ cifs_dbg(FYI, "OS/2 server\n");
bcc_ptr += len + 1;
bleft -= len + 1;
len = strnlen(bcc_ptr, bleft);
if (len >= bleft)
- return rc;
+ return;
kfree(ses->serverNOS);
@@ -352,16 +348,14 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
len = strnlen(bcc_ptr, bleft);
if (len > bleft)
- return rc;
+ return;
/* No domain field in LANMAN case. Domain is
returned by old servers in the SMB negprot response */
/* BB For newer servers which do not support Unicode,
but thus do return domain here we could add parsing
for it later, but it is not very important */
- cFYI(1, "ascii: bytes left %d", bleft);
-
- return rc;
+ cifs_dbg(FYI, "ascii: bytes left %d\n", bleft);
}
int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
@@ -373,16 +367,18 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
- cERROR(1, "challenge blob len %d too small", blob_len);
+ cifs_dbg(VFS, "challenge blob len %d too small\n", blob_len);
return -EINVAL;
}
if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
- cERROR(1, "blob signature incorrect %s", pblob->Signature);
+ cifs_dbg(VFS, "blob signature incorrect %s\n",
+ pblob->Signature);
return -EINVAL;
}
if (pblob->MessageType != NtLmChallenge) {
- cERROR(1, "Incorrect message type %d", pblob->MessageType);
+ cifs_dbg(VFS, "Incorrect message type %d\n",
+ pblob->MessageType);
return -EINVAL;
}
@@ -395,16 +391,17 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
if (tioffset > blob_len || tioffset + tilen > blob_len) {
- cERROR(1, "tioffset + tilen too high %u + %u", tioffset, tilen);
+ cifs_dbg(VFS, "tioffset + tilen too high %u + %u\n",
+ tioffset, tilen);
return -EINVAL;
}
if (tilen) {
- ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
+ ses->auth_key.response = kmemdup(bcc_ptr + tioffset, tilen,
+ GFP_KERNEL);
if (!ses->auth_key.response) {
- cERROR(1, "Challenge target info allocation failure");
+ cifs_dbg(VFS, "Challenge target info alloc failure\n");
return -ENOMEM;
}
- memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen);
ses->auth_key.len = tilen;
}
@@ -429,8 +426,7 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
- if (ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ if (ses->server->sign) {
flags |= NTLMSSP_NEGOTIATE_SIGN;
if (!ses->server->session_estab)
flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
@@ -468,8 +464,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
- if (ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ if (ses->server->sign) {
flags |= NTLMSSP_NEGOTIATE_SIGN;
if (!ses->server->session_estab)
flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
@@ -486,7 +481,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
rc = setup_ntlmv2_rsp(ses, nls_cp);
if (rc) {
- cERROR(1, "Error %d during NTLMSSP authentication", rc);
+ cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc);
goto setup_ntlmv2_ret;
}
memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
@@ -555,6 +550,56 @@ setup_ntlmv2_ret:
return rc;
}
+/*
+ * select_sectype - choose the authentication type for a session setup
+ * @server: server whose negotiated flavor (server->negflavor) is consulted
+ * @requested: type asked for by the caller, or Unspecified to auto-select
+ *
+ * An explicitly requested type is returned unchanged only when it is one of
+ * the types listed for the server's negflavor. For Unspecified, the first
+ * candidate permitted by global_secflags (and, for the extended flavor,
+ * also flagged on the server) is picked in the order coded below.
+ *
+ * Returns the selected type, or Unspecified when no suitable type exists.
+ * Every path through each inner switch returns, so control never reaches
+ * the next outer case label.
+ */
+enum securityEnum
+select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
+{
+ switch (server->negflavor) {
+ case CIFS_NEGFLAVOR_EXTENDED:
+ switch (requested) {
+ case Kerberos:
+ case RawNTLMSSP:
+ return requested;
+ case Unspecified:
+ /* auto-select: RawNTLMSSP is tried before Kerberos */
+ if (server->sec_ntlmssp &&
+ (global_secflags & CIFSSEC_MAY_NTLMSSP))
+ return RawNTLMSSP;
+ if ((server->sec_kerberos || server->sec_mskerberos) &&
+ (global_secflags & CIFSSEC_MAY_KRB5))
+ return Kerberos;
+ /* Fallthrough */
+ default:
+ /* nothing usable, or a requested type not listed for this flavor */
+ return Unspecified;
+ }
+ case CIFS_NEGFLAVOR_UNENCAP:
+ switch (requested) {
+ case NTLM:
+ case NTLMv2:
+ return requested;
+ case Unspecified:
+ /* auto-select: NTLMv2 is tried before NTLM */
+ if (global_secflags & CIFSSEC_MAY_NTLMV2)
+ return NTLMv2;
+ if (global_secflags & CIFSSEC_MAY_NTLM)
+ return NTLM;
+ /* Fallthrough */
+ default:
+ return Unspecified;
+ }
+ case CIFS_NEGFLAVOR_LANMAN:
+ switch (requested) {
+ case LANMAN:
+ return requested;
+ case Unspecified:
+ /* auto-select: LANMAN is the only candidate for this flavor */
+ if (global_secflags & CIFSSEC_MAY_LANMAN)
+ return LANMAN;
+ /* Fallthrough */
+ default:
+ return Unspecified;
+ }
+ default:
+ /* negflavor is none of the three handled above */
+ return Unspecified;
+ }
+}
+
int
CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_cp)
@@ -576,11 +621,18 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
u16 blob_len;
char *ntlmsspblob = NULL;
- if (ses == NULL)
+ if (ses == NULL) {
+ WARN(1, "%s: ses == NULL!", __func__);
+ return -EINVAL;
+ }
+
+ type = select_sectype(ses->server, ses->sectype);
+ cifs_dbg(FYI, "sess setup type %d\n", type);
+ if (type == Unspecified) {
+ cifs_dbg(VFS, "Unable to select appropriate authentication method!\n");
return -EINVAL;
+ }
- type = ses->server->secType;
- cFYI(1, "sess setup type %d", type);
if (type == RawNTLMSSP) {
/* if memory allocation is successful, caller of this function
* frees it.
@@ -640,8 +692,6 @@ ssetup_ntlmssp_authenticate:
}
bcc_ptr = str_area;
- ses->flags &= ~CIFS_SES_LANMAN;
-
iov[1].iov_base = NULL;
iov[1].iov_len = 0;
@@ -665,7 +715,6 @@ ssetup_ntlmssp_authenticate:
ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
true : false, lnm_session_key);
- ses->flags |= CIFS_SES_LANMAN;
memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
bcc_ptr += CIFS_AUTH_RESP_SIZE;
@@ -674,7 +723,7 @@ ssetup_ntlmssp_authenticate:
changed to do higher than lanman dialect and
we reconnected would we ever calc signing_key? */
- cFYI(1, "Negotiating LANMAN setting up strings");
+ cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
/* Unicode not allowed for LANMAN dialects */
ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
#endif
@@ -688,7 +737,8 @@ ssetup_ntlmssp_authenticate:
/* calculate ntlm response and session key */
rc = setup_ntlm_response(ses, nls_cp);
if (rc) {
- cERROR(1, "Error %d during NTLM authentication", rc);
+ cifs_dbg(VFS, "Error %d during NTLM authentication\n",
+ rc);
goto ssetup_exit;
}
@@ -718,7 +768,8 @@ ssetup_ntlmssp_authenticate:
/* calculate nlmv2 response and session key */
rc = setup_ntlmv2_rsp(ses, nls_cp);
if (rc) {
- cERROR(1, "Error %d during NTLMv2 authentication", rc);
+ cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n",
+ rc);
goto ssetup_exit;
}
memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
@@ -754,21 +805,21 @@ ssetup_ntlmssp_authenticate:
/* check version field to make sure that cifs.upcall is
sending us a response in an expected form */
if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
- cERROR(1, "incorrect version of cifs.upcall (expected"
- " %d but got %d)",
+ cifs_dbg(VFS, "incorrect version of cifs.upcall "
+ "(expected %d but got %d)\n",
CIFS_SPNEGO_UPCALL_VERSION, msg->version);
rc = -EKEYREJECTED;
goto ssetup_exit;
}
- ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL);
+ ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
+ GFP_KERNEL);
if (!ses->auth_key.response) {
- cERROR(1, "Kerberos can't allocate (%u bytes) memory",
+ cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory\n",
msg->sesskey_len);
rc = -ENOMEM;
goto ssetup_exit;
}
- memcpy(ses->auth_key.response, msg->data, msg->sesskey_len);
ses->auth_key.len = msg->sesskey_len;
pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
@@ -790,18 +841,18 @@ ssetup_ntlmssp_authenticate:
/* BB: is this right? */
ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
#else /* ! CONFIG_CIFS_UPCALL */
- cERROR(1, "Kerberos negotiated but upcall support disabled!");
+ cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
rc = -ENOSYS;
goto ssetup_exit;
#endif /* CONFIG_CIFS_UPCALL */
} else if (type == RawNTLMSSP) {
if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
- cERROR(1, "NTLMSSP requires Unicode support");
+ cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
rc = -ENOSYS;
goto ssetup_exit;
}
- cFYI(1, "ntlmssp session setup phase %d", phase);
+ cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase);
pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
capabilities |= CAP_EXTENDED_SECURITY;
pSMB->req.Capabilities |= cpu_to_le32(capabilities);
@@ -824,7 +875,6 @@ ssetup_ntlmssp_authenticate:
5*sizeof(struct _AUTHENTICATE_MESSAGE),
GFP_KERNEL);
if (!ntlmsspblob) {
- cERROR(1, "Can't allocate NTLMSSP blob");
rc = -ENOMEM;
goto ssetup_exit;
}
@@ -844,7 +894,7 @@ ssetup_ntlmssp_authenticate:
smb_buf->Uid = ses->Suid;
break;
default:
- cERROR(1, "invalid phase %d", phase);
+ cifs_dbg(VFS, "invalid phase %d\n", phase);
rc = -ENOSYS;
goto ssetup_exit;
}
@@ -855,7 +905,7 @@ ssetup_ntlmssp_authenticate:
}
unicode_oslm_strings(&bcc_ptr, nls_cp);
} else {
- cERROR(1, "secType %d not supported!", type);
+ cifs_dbg(VFS, "secType %d not supported!\n", type);
rc = -ENOSYS;
goto ssetup_exit;
}
@@ -880,7 +930,7 @@ ssetup_ntlmssp_authenticate:
(smb_buf->Status.CifsError ==
cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
if (phase != NtLmNegotiate) {
- cERROR(1, "Unexpected more processing error");
+ cifs_dbg(VFS, "Unexpected more processing error\n");
goto ssetup_exit;
}
/* NTLMSSP Negotiate sent now processing challenge (response) */
@@ -892,14 +942,14 @@ ssetup_ntlmssp_authenticate:
if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
rc = -EIO;
- cERROR(1, "bad word count %d", smb_buf->WordCount);
+ cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
goto ssetup_exit;
}
action = le16_to_cpu(pSMB->resp.Action);
if (action & GUEST_LOGIN)
- cFYI(1, "Guest login"); /* BB mark SesInfo struct? */
+ cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
- cFYI(1, "UID = %llu ", ses->Suid);
+ cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
/* response can have either 3 or 4 word count - Samba sends 3 */
/* and lanman response is 3 */
bytes_remaining = get_bcc(smb_buf);
@@ -908,7 +958,8 @@ ssetup_ntlmssp_authenticate:
if (smb_buf->WordCount == 4) {
blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
if (blob_len > bytes_remaining) {
- cERROR(1, "bad security blob length %d", blob_len);
+ cifs_dbg(VFS, "bad security blob length %d\n",
+ blob_len);
rc = -EINVAL;
goto ssetup_exit;
}
@@ -933,8 +984,7 @@ ssetup_ntlmssp_authenticate:
}
decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
} else {
- rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining,
- ses, nls_cp);
+ decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
}
ssetup_exit:
@@ -946,7 +996,7 @@ ssetup_exit:
kfree(ntlmsspblob);
ntlmsspblob = NULL;
if (resp_buf_type == CIFS_SMALL_BUFFER) {
- cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
+ cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base);
cifs_small_buf_release(iov[0].iov_base);
} else if (resp_buf_type == CIFS_LARGE_BUFFER)
cifs_buf_release(iov[0].iov_base);
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 47bc5a87f94e..e813f04511d8 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -61,10 +61,13 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf,
*/
--server->sequence_number;
rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
+ if (rc < 0)
+ server->sequence_number--;
+
mutex_unlock(&server->srv_mutex);
- cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
- in_buf->Mid, rc);
+ cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n",
+ in_buf->Mid, rc);
return rc;
}
@@ -249,7 +252,7 @@ check2ndT2(char *buf)
/* check for plausible wct, bcc and t2 data and parm sizes */
/* check for parm and data offset going beyond end of smb */
if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */
- cFYI(1, "invalid transact2 word count");
+ cifs_dbg(FYI, "invalid transact2 word count\n");
return -EINVAL;
}
@@ -261,18 +264,18 @@ check2ndT2(char *buf)
if (total_data_size == data_in_this_rsp)
return 0;
else if (total_data_size < data_in_this_rsp) {
- cFYI(1, "total data %d smaller than data in frame %d",
- total_data_size, data_in_this_rsp);
+ cifs_dbg(FYI, "total data %d smaller than data in frame %d\n",
+ total_data_size, data_in_this_rsp);
return -EINVAL;
}
remaining = total_data_size - data_in_this_rsp;
- cFYI(1, "missing %d bytes from transact2, check next response",
- remaining);
+ cifs_dbg(FYI, "missing %d bytes from transact2, check next response\n",
+ remaining);
if (total_data_size > CIFSMaxBufSize) {
- cERROR(1, "TotalDataSize %d is over maximum buffer %d",
- total_data_size, CIFSMaxBufSize);
+ cifs_dbg(VFS, "TotalDataSize %d is over maximum buffer %d\n",
+ total_data_size, CIFSMaxBufSize);
return -EINVAL;
}
return remaining;
@@ -293,28 +296,28 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
if (tgt_total_cnt != src_total_cnt)
- cFYI(1, "total data count of primary and secondary t2 differ "
- "source=%hu target=%hu", src_total_cnt, tgt_total_cnt);
+ cifs_dbg(FYI, "total data count of primary and secondary t2 differ source=%hu target=%hu\n",
+ src_total_cnt, tgt_total_cnt);
total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
remaining = tgt_total_cnt - total_in_tgt;
if (remaining < 0) {
- cFYI(1, "Server sent too much data. tgt_total_cnt=%hu "
- "total_in_tgt=%hu", tgt_total_cnt, total_in_tgt);
+ cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n",
+ tgt_total_cnt, total_in_tgt);
return -EPROTO;
}
if (remaining == 0) {
/* nothing to do, ignore */
- cFYI(1, "no more data remains");
+ cifs_dbg(FYI, "no more data remains\n");
return 0;
}
total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount);
if (remaining < total_in_src)
- cFYI(1, "transact2 2nd response contains too much data");
+ cifs_dbg(FYI, "transact2 2nd response contains too much data\n");
/* find end of first SMB data area */
data_area_of_tgt = (char *)&pSMBt->hdr.Protocol +
@@ -329,7 +332,8 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
total_in_tgt += total_in_src;
/* is the result too big for the field? */
if (total_in_tgt > USHRT_MAX) {
- cFYI(1, "coalesced DataCount too large (%u)", total_in_tgt);
+ cifs_dbg(FYI, "coalesced DataCount too large (%u)\n",
+ total_in_tgt);
return -EPROTO;
}
put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount);
@@ -339,7 +343,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
byte_count += total_in_src;
/* is the result too big for the field? */
if (byte_count > USHRT_MAX) {
- cFYI(1, "coalesced BCC too large (%u)", byte_count);
+ cifs_dbg(FYI, "coalesced BCC too large (%u)\n", byte_count);
return -EPROTO;
}
put_bcc(byte_count, target_hdr);
@@ -348,7 +352,8 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
byte_count += total_in_src;
/* don't allow buffer to overflow */
if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
- cFYI(1, "coalesced BCC exceeds buffer size (%u)", byte_count);
+ cifs_dbg(FYI, "coalesced BCC exceeds buffer size (%u)\n",
+ byte_count);
return -ENOBUFS;
}
target_hdr->smb_buf_length = cpu_to_be32(byte_count);
@@ -358,12 +363,12 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
if (remaining != total_in_src) {
/* more responses to go */
- cFYI(1, "waiting for more secondary responses");
+ cifs_dbg(FYI, "waiting for more secondary responses\n");
return 1;
}
/* we are done */
- cFYI(1, "found the last secondary response");
+ cifs_dbg(FYI, "found the last secondary response\n");
return 0;
}
@@ -388,7 +393,7 @@ cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server,
}
if (!server->large_buf) {
/*FIXME: switch to already allocated largebuf?*/
- cERROR(1, "1st trans2 resp needs bigbuf");
+ cifs_dbg(VFS, "1st trans2 resp needs bigbuf\n");
} else {
/* Have first buffer */
mid->resp_buf = buf;
@@ -444,8 +449,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
* WRITEX header, not including the 4 byte RFC1001 length.
*/
if (!(server->capabilities & CAP_LARGE_WRITE_X) ||
- (!(server->capabilities & CAP_UNIX) &&
- (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED))))
+ (!(server->capabilities & CAP_UNIX) && server->sign))
wsize = min_t(unsigned int, wsize,
server->maxBuf - sizeof(WRITE_REQ) + 4);
@@ -760,24 +764,17 @@ smb_set_file_info(struct inode *inode, const char *full_path,
}
tcon = tlink_tcon(tlink);
- /*
- * NT4 apparently returns success on this call, but it doesn't really
- * work.
- */
- if (!(tcon->ses->flags & CIFS_SES_NT4)) {
- rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf,
- cifs_sb->local_nls,
+ rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc == 0) {
- cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
- goto out;
- } else if (rc != -EOPNOTSUPP && rc != -EINVAL)
- goto out;
+ if (rc == 0) {
+ cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
+ goto out;
+ } else if (rc != -EOPNOTSUPP && rc != -EINVAL) {
+ goto out;
}
- cFYI(1, "calling SetFileInfo since SetPathInfo for times not supported "
- "by this server");
+ cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
&netfid, &oplock, NULL, cifs_sb->local_nls,
@@ -960,4 +957,6 @@ struct smb_version_values smb1_values = {
.cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
.cap_large_files = CAP_LARGE_FILES,
.oplock_read = OPLOCK_READ,
+ .signing_enabled = SECMODE_SIGN_ENABLED,
+ .signing_required = SECMODE_SIGN_REQUIRED,
};
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 71e6aed4b382..5da1b55a2258 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -43,13 +43,13 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
cinode->clientCanCacheAll = true;
cinode->clientCanCacheRead = true;
- cFYI(1, "Exclusive Oplock granted on inode %p",
- &cinode->vfs_inode);
+ cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
+ &cinode->vfs_inode);
} else if (oplock == SMB2_OPLOCK_LEVEL_II) {
cinode->clientCanCacheAll = false;
cinode->clientCanCacheRead = true;
- cFYI(1, "Level II Oplock granted on inode %p",
- &cinode->vfs_inode);
+ cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
+ &cinode->vfs_inode);
} else {
cinode->clientCanCacheAll = false;
cinode->clientCanCacheRead = false;
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 7c0e2143e775..c38350851b08 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -54,5 +54,7 @@
#define SMB2_SIGNATURE_SIZE (16)
#define SMB2_NTLMV2_SESSKEY_SIZE (16)
#define SMB2_HMACSHA256_SIZE (32)
+#define SMB2_CMACAES_SIZE (16)
+#define SMB3_SIGNKEY_SIZE (16)
#endif /* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 706482452df4..fff6dfba6204 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -92,7 +92,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
(FILE_BASIC_INFO *)data);
break;
default:
- cERROR(1, "Invalid command");
+ cifs_dbg(VFS, "Invalid command\n");
break;
}
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 494c912c76fe..7c2f45c06fc2 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -2472,7 +2472,7 @@ map_smb2_to_linux_error(char *buf, bool log_err)
/* on error mapping not found - return EIO */
- cFYI(1, "Mapping SMB2 status code %d to POSIX err %d",
+ cifs_dbg(FYI, "Mapping SMB2 status code %d to POSIX err %d\n",
smb2err, rc);
return rc;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 7b1c5e3287fb..b0c43345cd98 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -45,17 +45,17 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
if (hdr->Command == SMB2_OPLOCK_BREAK)
return 0;
else
- cERROR(1, "Received Request not response");
+ cifs_dbg(VFS, "Received Request not response\n");
}
} else { /* bad signature or mid */
if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER)
- cERROR(1, "Bad protocol string signature header %x",
- *(unsigned int *) hdr->ProtocolId);
+ cifs_dbg(VFS, "Bad protocol string signature header %x\n",
+ *(unsigned int *) hdr->ProtocolId);
if (mid != hdr->MessageId)
- cERROR(1, "Mids do not match: %llu and %llu", mid,
- hdr->MessageId);
+ cifs_dbg(VFS, "Mids do not match: %llu and %llu\n",
+ mid, hdr->MessageId);
}
- cERROR(1, "Bad SMB detected. The Mid=%llu", hdr->MessageId);
+ cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", hdr->MessageId);
return 1;
}
@@ -101,7 +101,8 @@ smb2_check_message(char *buf, unsigned int length)
int command;
/* BB disable following printk later */
- cFYI(1, "%s length: 0x%x, smb_buf_length: 0x%x", __func__, length, len);
+ cifs_dbg(FYI, "%s length: 0x%x, smb_buf_length: 0x%x\n",
+ __func__, length, len);
/*
* Add function to do table lookup of StructureSize by command
@@ -117,12 +118,13 @@ smb2_check_message(char *buf, unsigned int length)
*/
return 0;
} else {
- cERROR(1, "Length less than SMB header size");
+ cifs_dbg(VFS, "Length less than SMB header size\n");
}
return 1;
}
if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - 4) {
- cERROR(1, "SMB length greater than maximum, mid=%llu", mid);
+ cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n",
+ mid);
return 1;
}
@@ -130,14 +132,14 @@ smb2_check_message(char *buf, unsigned int length)
return 1;
if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
- cERROR(1, "Illegal structure size %u",
- le16_to_cpu(hdr->StructureSize));
+ cifs_dbg(VFS, "Illegal structure size %u\n",
+ le16_to_cpu(hdr->StructureSize));
return 1;
}
command = le16_to_cpu(hdr->Command);
if (command >= NUMBER_OF_SMB2_COMMANDS) {
- cERROR(1, "Illegal SMB2 command %d", command);
+ cifs_dbg(VFS, "Illegal SMB2 command %d\n", command);
return 1;
}
@@ -145,30 +147,30 @@ smb2_check_message(char *buf, unsigned int length)
if (command != SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0 ||
pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) {
/* error packets have 9 byte structure size */
- cERROR(1, "Illegal response size %u for command %d",
- le16_to_cpu(pdu->StructureSize2), command);
+ cifs_dbg(VFS, "Illegal response size %u for command %d\n",
+ le16_to_cpu(pdu->StructureSize2), command);
return 1;
} else if (command == SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0)
&& (le16_to_cpu(pdu->StructureSize2) != 44)
&& (le16_to_cpu(pdu->StructureSize2) != 36)) {
/* special case for SMB2.1 lease break message */
- cERROR(1, "Illegal response size %d for oplock break",
- le16_to_cpu(pdu->StructureSize2));
+ cifs_dbg(VFS, "Illegal response size %d for oplock break\n",
+ le16_to_cpu(pdu->StructureSize2));
return 1;
}
}
if (4 + len != length) {
- cERROR(1, "Total length %u RFC1002 length %u mismatch mid %llu",
- length, 4 + len, mid);
+ cifs_dbg(VFS, "Total length %u RFC1002 length %u mismatch mid %llu\n",
+ length, 4 + len, mid);
return 1;
}
clc_len = smb2_calc_size(hdr);
if (4 + len != clc_len) {
- cFYI(1, "Calculated size %u length %u mismatch mid %llu",
- clc_len, 4 + len, mid);
+ cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n",
+ clc_len, 4 + len, mid);
/* Windows 7 server returns 24 bytes more */
if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE)
return 0;
@@ -264,10 +266,14 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength);
break;
case SMB2_IOCTL:
+ *off = le32_to_cpu(
+ ((struct smb2_ioctl_rsp *)hdr)->OutputOffset);
+ *len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount);
+ break;
case SMB2_CHANGE_NOTIFY:
default:
/* BB FIXME for unimplemented cases above */
- cERROR(1, "no length check for command");
+ cifs_dbg(VFS, "no length check for command\n");
break;
}
@@ -276,20 +282,20 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
* we have little choice but to ignore the data area in this case.
*/
if (*off > 4096) {
- cERROR(1, "offset %d too large, data area ignored", *off);
+ cifs_dbg(VFS, "offset %d too large, data area ignored\n", *off);
*len = 0;
*off = 0;
} else if (*off < 0) {
- cERROR(1, "negative offset %d to data invalid ignore data area",
- *off);
+ cifs_dbg(VFS, "negative offset %d to data invalid ignore data area\n",
+ *off);
*off = 0;
*len = 0;
} else if (*len < 0) {
- cERROR(1, "negative data length %d invalid, data area ignored",
- *len);
+ cifs_dbg(VFS, "negative data length %d invalid, data area ignored\n",
+ *len);
*len = 0;
} else if (*len > 128 * 1024) {
- cERROR(1, "data area larger than 128K: %d", *len);
+ cifs_dbg(VFS, "data area larger than 128K: %d\n", *len);
*len = 0;
}
@@ -324,7 +330,7 @@ smb2_calc_size(void *buf)
goto calc_size_exit;
smb2_get_data_area_len(&offset, &data_length, hdr);
- cFYI(1, "SMB2 data length %d offset %d", data_length, offset);
+ cifs_dbg(FYI, "SMB2 data length %d offset %d\n", data_length, offset);
if (data_length > 0) {
/*
@@ -335,15 +341,15 @@ smb2_calc_size(void *buf)
* the size of the RFC1001 hdr.
*/
if (offset + 4 + 1 < len) {
- cERROR(1, "data area offset %d overlaps SMB2 header %d",
- offset + 4 + 1, len);
+ cifs_dbg(VFS, "data area offset %d overlaps SMB2 header %d\n",
+ offset + 4 + 1, len);
data_length = 0;
} else {
len = 4 + offset + data_length;
}
}
calc_size_exit:
- cFYI(1, "SMB2 len %d", len);
+ cifs_dbg(FYI, "SMB2 len %d\n", len);
return len;
}
@@ -405,7 +411,7 @@ cifs_ses_oplock_break(struct work_struct *work)
rc = SMB2_lease_break(0, tlink_tcon(lw->tlink), lw->lease_key,
lw->lease_state);
- cFYI(1, "Lease release rc %d", rc);
+ cifs_dbg(FYI, "Lease release rc %d\n", rc);
cifs_put_tlink(lw->tlink);
kfree(lw);
}
@@ -426,15 +432,13 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED);
lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL);
- if (!lw) {
- cERROR(1, "Memory allocation failed during lease break check");
+ if (!lw)
return false;
- }
INIT_WORK(&lw->lease_break, cifs_ses_oplock_break);
lw->lease_state = rsp->NewLeaseState;
- cFYI(1, "Checking for lease break");
+ cifs_dbg(FYI, "Checking for lease break\n");
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
@@ -455,9 +459,9 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
SMB2_LEASE_KEY_SIZE))
continue;
- cFYI(1, "found in the open list");
- cFYI(1, "lease key match, lease break 0x%d",
- le32_to_cpu(rsp->NewLeaseState));
+ cifs_dbg(FYI, "found in the open list\n");
+ cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
+ le32_to_cpu(rsp->NewLeaseState));
smb2_set_oplock_level(cinode,
smb2_map_lease_to_oplock(rsp->NewLeaseState));
@@ -489,9 +493,9 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
&lw->lease_break);
}
- cFYI(1, "found in the pending open list");
- cFYI(1, "lease key match, lease break 0x%d",
- le32_to_cpu(rsp->NewLeaseState));
+ cifs_dbg(FYI, "found in the pending open list\n");
+ cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
+ le32_to_cpu(rsp->NewLeaseState));
open->oplock =
smb2_map_lease_to_oplock(rsp->NewLeaseState);
@@ -506,7 +510,7 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
}
spin_unlock(&cifs_tcp_ses_lock);
kfree(lw);
- cFYI(1, "Can not process lease break - no lease matched");
+ cifs_dbg(FYI, "Can not process lease break - no lease matched\n");
return false;
}
@@ -520,7 +524,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
struct cifsInodeInfo *cinode;
struct cifsFileInfo *cfile;
- cFYI(1, "Checking for oplock break");
+ cifs_dbg(FYI, "Checking for oplock break\n");
if (rsp->hdr.Command != SMB2_OPLOCK_BREAK)
return false;
@@ -533,7 +537,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
return false;
}
- cFYI(1, "oplock level 0x%d", rsp->OplockLevel);
+ cifs_dbg(FYI, "oplock level 0x%d\n", rsp->OplockLevel);
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
@@ -553,7 +557,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
cfile->fid.volatile_fid)
continue;
- cFYI(1, "file id match, oplock break");
+ cifs_dbg(FYI, "file id match, oplock break\n");
cinode = CIFS_I(cfile->dentry->d_inode);
if (!cinode->clientCanCacheAll &&
@@ -573,11 +577,11 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
}
spin_unlock(&cifs_file_list_lock);
spin_unlock(&cifs_tcp_ses_lock);
- cFYI(1, "No matching file for oplock break");
+ cifs_dbg(FYI, "No matching file for oplock break\n");
return true;
}
}
spin_unlock(&cifs_tcp_ses_lock);
- cFYI(1, "Can not process oplock break for non-existent connection");
+ cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
return false;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index bceffe7b8f8d..6d15cab95b99 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -38,13 +38,13 @@ change_conf(struct TCP_Server_Info *server)
case 1:
server->echoes = false;
server->oplocks = false;
- cERROR(1, "disabling echoes and oplocks");
+ cifs_dbg(VFS, "disabling echoes and oplocks\n");
break;
case 2:
server->echoes = true;
server->oplocks = false;
server->echo_credits = 1;
- cFYI(1, "disabling oplocks");
+ cifs_dbg(FYI, "disabling oplocks\n");
break;
default:
server->echoes = true;
@@ -147,10 +147,10 @@ smb2_dump_detail(void *buf)
#ifdef CONFIG_CIFS_DEBUG2
struct smb2_hdr *smb = (struct smb2_hdr *)buf;
- cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d",
- smb->Command, smb->Status, smb->Flags, smb->MessageId,
- smb->ProcessId);
- cERROR(1, "smb buf %p len %u", smb, smb2_calc_size(smb));
+ cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
+ smb->Command, smb->Status, smb->Flags, smb->MessageId,
+ smb->ProcessId);
+ cifs_dbg(VFS, "smb buf %p len %u\n", smb, smb2_calc_size(smb));
#endif
}
@@ -281,6 +281,25 @@ smb2_clear_stats(struct cifs_tcon *tcon)
}
static void
+smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
+{
+ seq_puts(m, "\n\tShare Capabilities:");
+ if (tcon->capabilities & SMB2_SHARE_CAP_DFS)
+ seq_puts(m, " DFS,");
+ if (tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
+ seq_puts(m, " CONTINUOUS AVAILABILITY,");
+ if (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT)
+ seq_puts(m, " SCALEOUT,");
+ if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER)
+ seq_puts(m, " CLUSTER,");
+ if (tcon->capabilities & SMB2_SHARE_CAP_ASYMMETRIC)
+ seq_puts(m, " ASYMMETRIC,");
+ if (tcon->capabilities == 0)
+ seq_puts(m, " None");
+ seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
+}
+
+static void
smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
{
#ifdef CONFIG_CIFS_STATS
@@ -292,7 +311,6 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
seq_printf(m, "\nSessionSetups: %d sent %d failed",
atomic_read(&sent[SMB2_SESSION_SETUP_HE]),
atomic_read(&failed[SMB2_SESSION_SETUP_HE]));
-#define SMB2LOGOFF 0x0002 /* trivial request/resp */
seq_printf(m, "\nLogoffs: %d sent %d failed",
atomic_read(&sent[SMB2_LOGOFF_HE]),
atomic_read(&failed[SMB2_LOGOFF_HE]));
@@ -436,7 +454,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
&oplock, NULL);
kfree(utf16_path);
if (rc) {
- cERROR(1, "open dir failed");
+ cifs_dbg(VFS, "open dir failed\n");
return rc;
}
@@ -448,7 +466,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_query_directory(xid, tcon, persistent_fid, volatile_fid, 0,
srch_inf);
if (rc) {
- cERROR(1, "query directory failed");
+ cifs_dbg(VFS, "query directory failed\n");
SMB2_close(xid, tcon, persistent_fid, volatile_fid);
}
return rc;
@@ -645,6 +663,7 @@ struct smb_version_operations smb30_operations = {
.dump_detail = smb2_dump_detail,
.clear_stats = smb2_clear_stats,
.print_stats = smb2_print_stats,
+ .dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
@@ -690,6 +709,7 @@ struct smb_version_operations smb30_operations = {
.get_lease_key = smb2_get_lease_key,
.set_lease_key = smb2_set_lease_key,
.new_lease_key = smb2_new_lease_key,
+ .generate_signingkey = generate_smb3signingkey,
.calc_signature = smb3_calc_signature,
};
@@ -709,6 +729,8 @@ struct smb_version_values smb20_values = {
.cap_nt_find = SMB2_NT_FIND,
.cap_large_files = SMB2_LARGE_FILES,
.oplock_read = SMB2_OPLOCK_LEVEL_II,
+ .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
};
struct smb_version_values smb21_values = {
@@ -727,6 +749,8 @@ struct smb_version_values smb21_values = {
.cap_nt_find = SMB2_NT_FIND,
.cap_large_files = SMB2_LARGE_FILES,
.oplock_read = SMB2_OPLOCK_LEVEL_II,
+ .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
};
struct smb_version_values smb30_values = {
@@ -745,4 +769,26 @@ struct smb_version_values smb30_values = {
.cap_nt_find = SMB2_NT_FIND,
.cap_large_files = SMB2_LARGE_FILES,
.oplock_read = SMB2_OPLOCK_LEVEL_II,
+ .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+};
+
+struct smb_version_values smb302_values = {
+ .version_string = SMB302_VERSION_STRING,
+ .protocol_id = SMB302_PROT_ID,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .large_lock_type = 0,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+ .header_size = sizeof(struct smb2_hdr),
+ .max_header_size = MAX_SMB2_HDR_SIZE,
+ .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .lock_cmd = SMB2_LOCK,
+ .cap_unix = 0,
+ .cap_nt_find = SMB2_NT_FIND,
+ .cap_large_files = SMB2_LARGE_FILES,
+ .oplock_read = SMB2_OPLOCK_LEVEL_II,
+ .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
};
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 41d9d0725f0f..2b312e4eeaa6 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1,7 +1,7 @@
/*
* fs/cifs/smb2pdu.c
*
- * Copyright (C) International Business Machines Corp., 2009, 2012
+ * Copyright (C) International Business Machines Corp., 2009, 2013
* Etersoft, 2012
* Author(s): Steve French (sfrench@us.ibm.com)
* Pavel Shilovsky (pshilovsky@samba.org) 2012
@@ -108,19 +108,33 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
if (!tcon)
goto out;
+ /* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */
+ /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */
+ /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */
+ if ((tcon->ses) &&
+ (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+ hdr->CreditCharge = cpu_to_le16(1);
+ /* else CreditCharge MBZ */
+
hdr->TreeId = tcon->tid;
/* Uid is not converted */
if (tcon->ses)
hdr->SessionId = tcon->ses->Suid;
- /* BB check following DFS flags BB */
- /* BB do we have to add check for SHI1005_FLAGS_DFS_ROOT too? */
- if (tcon->share_flags & SHI1005_FLAGS_DFS)
- hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS;
- /* BB how does SMB2 do case sensitive? */
- /* if (tcon->nocase)
- hdr->Flags |= SMBFLG_CASELESS; */
- if (tcon->ses && tcon->ses->server &&
- (tcon->ses->server->sec_mode & SECMODE_SIGN_REQUIRED))
+
+ /*
+ * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have
+ * to pass the path on the Open SMB prefixed by \\server\share.
+ * Not sure when we would need to do the augmented path (if ever) and
+ * setting this flag breaks the SMB2 open operation since it is
+ * illegal to send an empty path name (without \\server\share prefix)
+ * when the DFS flag is set in the SMB open header. We could
+ * consider setting the flag on all operations other than open
+ * but it is safer to not set it for now.
+ */
+/* if (tcon->share_flags & SHI1005_FLAGS_DFS)
+ hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */
+
+ if (tcon->ses && tcon->ses->server && tcon->ses->server->sign)
hdr->Flags |= SMB2_FLAGS_SIGNED;
out:
pdu->StructureSize2 = cpu_to_le16(parmsize);
@@ -155,8 +169,8 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
if ((smb2_command != SMB2_WRITE) &&
(smb2_command != SMB2_CREATE) &&
(smb2_command != SMB2_TREE_DISCONNECT)) {
- cFYI(1, "can not send cmd %d while umounting",
- smb2_command);
+ cifs_dbg(FYI, "can not send cmd %d while umounting\n",
+ smb2_command);
return -ENODEV;
}
}
@@ -200,7 +214,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
* back on-line
*/
if (!tcon->retry) {
- cFYI(1, "gave up waiting on reconnect in smb_init");
+ cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
return -EHOSTDOWN;
}
}
@@ -227,7 +241,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
cifs_mark_open_files_invalid(tcon);
rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage);
mutex_unlock(&tcon->ses->session_mutex);
- cFYI(1, "reconnect tcon rc = %d", rc);
+ cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
if (rc)
goto out;
atomic_inc(&tconInfoReconnectCount);
@@ -328,34 +342,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
struct kvec iov[1];
int rc = 0;
int resp_buftype;
- struct TCP_Server_Info *server;
- unsigned int sec_flags;
- u16 temp = 0;
+ struct TCP_Server_Info *server = ses->server;
int blob_offset, blob_length;
char *security_blob;
int flags = CIFS_NEG_OP;
- cFYI(1, "Negotiate protocol");
+ cifs_dbg(FYI, "Negotiate protocol\n");
- if (ses->server)
- server = ses->server;
- else {
- rc = -EIO;
- return rc;
+ if (!server) {
+ WARN(1, "%s: server is NULL!\n", __func__);
+ return -EIO;
}
rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req);
if (rc)
return rc;
- /* if any of auth flags (ie not sign or seal) are overriden use them */
- if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
- sec_flags = ses->overrideSecFlg; /* BB FIXME fix sign flags?*/
- else /* if override flags set only sign/seal OR them with global auth */
- sec_flags = global_secflags | ses->overrideSecFlg;
-
- cFYI(1, "sec_flags 0x%x", sec_flags);
-
req->hdr.SessionId = 0;
req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
@@ -364,12 +366,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
inc_rfc1001_len(req, 2);
/* only one of SMB2 signing flags may be set in SMB2 request */
- if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN)
- temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
- else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */
- temp = SMB2_NEGOTIATE_SIGNING_ENABLED;
-
- req->SecurityMode = cpu_to_le16(temp);
+ if (ses->sign)
+ req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
+ else if (global_secflags & CIFSSEC_MAY_SIGN)
+ req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
+ else
+ req->SecurityMode = 0;
req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities);
@@ -389,24 +391,28 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
if (rc != 0)
goto neg_exit;
- cFYI(1, "mode 0x%x", rsp->SecurityMode);
+ cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode);
/* BB we may eventually want to match the negotiated vs. requested
dialect, even though we are only requesting one at a time */
if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID))
- cFYI(1, "negotiated smb2.0 dialect");
+ cifs_dbg(FYI, "negotiated smb2.0 dialect\n");
else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID))
- cFYI(1, "negotiated smb2.1 dialect");
+ cifs_dbg(FYI, "negotiated smb2.1 dialect\n");
else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID))
- cFYI(1, "negotiated smb3.0 dialect");
+ cifs_dbg(FYI, "negotiated smb3.0 dialect\n");
+ else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID))
+ cifs_dbg(FYI, "negotiated smb3.02 dialect\n");
else {
- cERROR(1, "Illegal dialect returned by server %d",
- le16_to_cpu(rsp->DialectRevision));
+ cifs_dbg(VFS, "Illegal dialect returned by server %d\n",
+ le16_to_cpu(rsp->DialectRevision));
rc = -EIO;
goto neg_exit;
}
server->dialect = le16_to_cpu(rsp->DialectRevision);
+ /* SMB2 only has an extended negflavor */
+ server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
server->maxBuf = le32_to_cpu(rsp->MaxTransactSize);
server->max_read = le32_to_cpu(rsp->MaxReadSize);
server->max_write = le32_to_cpu(rsp->MaxWriteSize);
@@ -418,45 +424,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
&rsp->hdr);
- if (blob_length == 0) {
- cERROR(1, "missing security blob on negprot");
- rc = -EIO;
- goto neg_exit;
- }
-
- cFYI(1, "sec_flags 0x%x", sec_flags);
- if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
- cFYI(1, "Signing required");
- if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED |
- SMB2_NEGOTIATE_SIGNING_ENABLED))) {
- cERROR(1, "signing required but server lacks support");
- rc = -EOPNOTSUPP;
- goto neg_exit;
- }
- server->sec_mode |= SECMODE_SIGN_REQUIRED;
- } else if (sec_flags & CIFSSEC_MAY_SIGN) {
- cFYI(1, "Signing optional");
- if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
- cFYI(1, "Server requires signing");
- server->sec_mode |= SECMODE_SIGN_REQUIRED;
- } else {
- server->sec_mode &=
- ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
- }
- } else {
- cFYI(1, "Signing disabled");
- if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
- cERROR(1, "Server requires packet signing to be enabled"
- " in /proc/fs/cifs/SecurityFlags.");
- rc = -EOPNOTSUPP;
- goto neg_exit;
- }
- server->sec_mode &=
- ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
- }
+ /*
+ * See MS-SMB2 section 2.2.4: if no blob, client picks default which
+ * for us will be
+ * ses->sectype = RawNTLMSSP;
+ * but for time being this is our only auth choice so doesn't matter.
+ * We just found a server which sets blob length to zero expecting raw.
+ */
+ if (blob_length == 0)
+ cifs_dbg(FYI, "missing security blob on negprot\n");
+ rc = cifs_enable_signing(server, ses->sign);
#ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */
- rc = decode_neg_token_init(security_blob, blob_length,
+ if (rc)
+ goto neg_exit;
+ if (blob_length)
+ rc = decode_neg_token_init(security_blob, blob_length,
&server->sec_type);
if (rc == 1)
rc = 0;
@@ -481,21 +464,17 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
int rc = 0;
int resp_buftype;
__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
- struct TCP_Server_Info *server;
- unsigned int sec_flags;
- u8 temp = 0;
+ struct TCP_Server_Info *server = ses->server;
u16 blob_length = 0;
char *security_blob;
char *ntlmssp_blob = NULL;
bool use_spnego = false; /* else use raw ntlmssp */
- cFYI(1, "Session Setup");
+ cifs_dbg(FYI, "Session Setup\n");
- if (ses->server)
- server = ses->server;
- else {
- rc = -EIO;
- return rc;
+ if (!server) {
+ WARN(1, "%s: server is NULL!\n", __func__);
+ return -EIO;
}
/*
@@ -506,7 +485,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
if (!ses->ntlmssp)
return -ENOMEM;
- ses->server->secType = RawNTLMSSP;
+ /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */
+ ses->sectype = RawNTLMSSP;
ssetup_ntlmssp_authenticate:
if (phase == NtLmChallenge)
@@ -516,28 +496,19 @@ ssetup_ntlmssp_authenticate:
if (rc)
return rc;
- /* if any of auth flags (ie not sign or seal) are overriden use them */
- if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
- sec_flags = ses->overrideSecFlg; /* BB FIXME fix sign flags?*/
- else /* if override flags set only sign/seal OR them with global auth */
- sec_flags = global_secflags | ses->overrideSecFlg;
-
- cFYI(1, "sec_flags 0x%x", sec_flags);
-
req->hdr.SessionId = 0; /* First session, not a reauthenticate */
req->VcNumber = 0; /* MBZ */
/* to enable echos and oplocks */
req->hdr.CreditRequest = cpu_to_le16(3);
/* only one of SMB2 signing flags may be set in SMB2 request */
- if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN)
- temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
- else if (ses->server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED)
- temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
- else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */
- temp = SMB2_NEGOTIATE_SIGNING_ENABLED;
-
- req->SecurityMode = temp;
+ if (server->sign)
+ req->SecurityMode = SMB2_NEGOTIATE_SIGNING_REQUIRED;
+ else if (global_secflags & CIFSSEC_MAY_SIGN) /* one flag unlike MUST_ */
+ req->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED;
+ else
+ req->SecurityMode = 0;
+
req->Capabilities = 0;
req->Channel = 0; /* MBZ */
@@ -558,7 +529,7 @@ ssetup_ntlmssp_authenticate:
sizeof(struct _NEGOTIATE_MESSAGE),
ntlmssp_blob); */
/* BB eventually need to add this */
- cERROR(1, "spnego not supported for SMB2 yet");
+ cifs_dbg(VFS, "spnego not supported for SMB2 yet\n");
rc = -EOPNOTSUPP;
kfree(ntlmssp_blob);
goto ssetup_exit;
@@ -572,14 +543,14 @@ ssetup_ntlmssp_authenticate:
ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500,
GFP_KERNEL);
if (ntlmssp_blob == NULL) {
- cERROR(1, "failed to malloc ntlmssp blob");
rc = -ENOMEM;
goto ssetup_exit;
}
rc = build_ntlmssp_auth_blob(ntlmssp_blob, &blob_length, ses,
nls_cp);
if (rc) {
- cFYI(1, "build_ntlmssp_auth_blob failed %d", rc);
+ cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n",
+ rc);
goto ssetup_exit; /* BB double check error handling */
}
if (use_spnego) {
@@ -587,7 +558,7 @@ ssetup_ntlmssp_authenticate:
&security_blob,
blob_length,
ntlmssp_blob); */
- cERROR(1, "spnego not supported for SMB2 yet");
+ cifs_dbg(VFS, "spnego not supported for SMB2 yet\n");
rc = -EOPNOTSUPP;
kfree(ntlmssp_blob);
goto ssetup_exit;
@@ -595,7 +566,7 @@ ssetup_ntlmssp_authenticate:
security_blob = ntlmssp_blob;
}
} else {
- cERROR(1, "illegal ntlmssp phase");
+ cifs_dbg(VFS, "illegal ntlmssp phase\n");
rc = -EIO;
goto ssetup_exit;
}
@@ -620,13 +591,13 @@ ssetup_ntlmssp_authenticate:
if (resp_buftype != CIFS_NO_BUFFER &&
rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) {
if (phase != NtLmNegotiate) {
- cERROR(1, "Unexpected more processing error");
+ cifs_dbg(VFS, "Unexpected more processing error\n");
goto ssetup_exit;
}
if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 !=
le16_to_cpu(rsp->SecurityBufferOffset)) {
- cERROR(1, "Invalid security buffer offset %d",
- le16_to_cpu(rsp->SecurityBufferOffset));
+ cifs_dbg(VFS, "Invalid security buffer offset %d\n",
+ le16_to_cpu(rsp->SecurityBufferOffset));
rc = -EIO;
goto ssetup_exit;
}
@@ -667,7 +638,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
int rc = 0;
struct TCP_Server_Info *server;
- cFYI(1, "disconnect session %p", ses);
+ cifs_dbg(FYI, "disconnect session %p\n", ses);
if (ses && (ses->server))
server = ses->server;
@@ -680,7 +651,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
/* since no tcon, smb2_init can not do this, so do here */
req->hdr.SessionId = ses->Suid;
- if (server->sec_mode & SECMODE_SIGN_REQUIRED)
+ if (server->sign)
req->hdr.Flags |= SMB2_FLAGS_SIGNED;
rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0);
@@ -711,7 +682,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
struct TCP_Server_Info *server;
__le16 *unc_path = NULL;
- cFYI(1, "TCON");
+ cifs_dbg(FYI, "TCON\n");
if ((ses->server) && tree)
server = ses->server;
@@ -775,29 +746,30 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
}
if (rsp->ShareType & SMB2_SHARE_TYPE_DISK)
- cFYI(1, "connection to disk share");
+ cifs_dbg(FYI, "connection to disk share\n");
else if (rsp->ShareType & SMB2_SHARE_TYPE_PIPE) {
tcon->ipc = true;
- cFYI(1, "connection to pipe share");
+ cifs_dbg(FYI, "connection to pipe share\n");
} else if (rsp->ShareType & SMB2_SHARE_TYPE_PRINT) {
tcon->print = true;
- cFYI(1, "connection to printer");
+ cifs_dbg(FYI, "connection to printer\n");
} else {
- cERROR(1, "unknown share type %d", rsp->ShareType);
+ cifs_dbg(VFS, "unknown share type %d\n", rsp->ShareType);
rc = -EOPNOTSUPP;
goto tcon_error_exit;
}
tcon->share_flags = le32_to_cpu(rsp->ShareFlags);
+ tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */
tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
tcon->tidStatus = CifsGood;
tcon->need_reconnect = false;
tcon->tid = rsp->hdr.TreeId;
- strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
+ strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
- cERROR(1, "DFS capability contradicts DFS flag");
+ cifs_dbg(VFS, "DFS capability contradicts DFS flag\n");
tcon_exit:
free_rsp_buf(resp_buftype, rsp);
@@ -806,7 +778,7 @@ tcon_exit:
tcon_error_exit:
if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) {
- cERROR(1, "BAD_NETWORK_NAME: %s", tree);
+ cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
tcon->bad_network_name = true;
}
goto tcon_exit;
@@ -820,7 +792,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
struct TCP_Server_Info *server;
struct cifs_ses *ses = tcon->ses;
- cFYI(1, "Tree Disconnect");
+ cifs_dbg(FYI, "Tree Disconnect\n");
if (ses && (ses->server))
server = ses->server;
@@ -846,12 +818,10 @@ create_lease_buf(u8 *lease_key, u8 oplock)
{
struct create_lease *buf;
- buf = kmalloc(sizeof(struct create_lease), GFP_KERNEL);
+ buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL);
if (!buf)
return NULL;
- memset(buf, 0, sizeof(struct create_lease));
-
buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key));
buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8)));
if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE)
@@ -925,7 +895,7 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
int rc = 0;
int num_iovecs = 2;
- cFYI(1, "create/open");
+ cifs_dbg(FYI, "create/open\n");
if (ses && (ses->server))
server = ses->server;
@@ -1039,6 +1009,122 @@ creat_exit:
return rc;
}
+/*
+ * SMB2 IOCTL is used for both IOCTLs and FSCTLs
+ */
+int
+SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+ u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data,
+ u32 indatalen, char **out_data, u32 *plen /* returned data len */)
+{
+ struct smb2_ioctl_req *req;
+ struct smb2_ioctl_rsp *rsp;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses = tcon->ses;
+ struct kvec iov[2];
+ int resp_buftype;
+ int num_iovecs;
+ int rc = 0;
+
+ cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+ /* zero out returned data len, in case of error */
+ if (plen)
+ *plen = 0;
+
+ if (ses && (ses->server))
+ server = ses->server;
+ else
+ return -EIO;
+
+ rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req);
+ if (rc)
+ return rc;
+
+ req->CtlCode = cpu_to_le32(opcode);
+ req->PersistentFileId = persistent_fid;
+ req->VolatileFileId = volatile_fid;
+
+ if (indatalen) {
+ req->InputCount = cpu_to_le32(indatalen);
+ /* do not set InputOffset if no input data */
+ req->InputOffset =
+ cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4);
+ iov[1].iov_base = in_data;
+ iov[1].iov_len = indatalen;
+ num_iovecs = 2;
+ } else
+ num_iovecs = 1;
+
+ req->OutputOffset = 0;
+ req->OutputCount = 0; /* MBZ */
+
+ /*
+ * Could increase MaxOutputResponse, but that would require more
+ * than one credit. Windows typically sets this smaller, but for some
+ * ioctls it may be useful to allow server to send more. No point
+ * limiting what the server can send as long as fits in one credit
+ */
+ req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */
+
+ if (is_fsctl)
+ req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
+ else
+ req->Flags = 0;
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for rfc1002 length field */
+ iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+ if (indatalen)
+ inc_rfc1001_len(req, indatalen);
+
+ rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
+ rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
+
+ if (rc != 0) {
+ if (tcon)
+ cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+ goto ioctl_exit;
+ }
+
+ /* check if caller wants to look at return data or just return rc */
+ if ((plen == NULL) || (out_data == NULL))
+ goto ioctl_exit;
+
+ *plen = le32_to_cpu(rsp->OutputCount);
+
+ /* We check for obvious errors in the output buffer length and offset */
+ if (*plen == 0)
+ goto ioctl_exit; /* server returned no data */
+ else if (*plen > 0xFF00) {
+ cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen);
+ *plen = 0;
+ rc = -EIO;
+ goto ioctl_exit;
+ }
+
+ if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) {
+ cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
+ le32_to_cpu(rsp->OutputOffset));
+ *plen = 0;
+ rc = -EIO;
+ goto ioctl_exit;
+ }
+
+ *out_data = kmalloc(*plen, GFP_KERNEL);
+ if (*out_data == NULL) {
+ rc = -ENOMEM;
+ goto ioctl_exit;
+ }
+
+ memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
+ *plen);
+ioctl_exit:
+ free_rsp_buf(resp_buftype, rsp);
+ return rc;
+}
+
int
SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid)
@@ -1051,7 +1137,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
int resp_buftype;
int rc = 0;
- cFYI(1, "Close");
+ cifs_dbg(FYI, "Close\n");
if (ses && (ses->server))
server = ses->server;
@@ -1097,20 +1183,20 @@ validate_buf(unsigned int offset, unsigned int buffer_length,
if (buffer_length < min_buf_size) {
- cERROR(1, "buffer length %d smaller than minimum size %d",
- buffer_length, min_buf_size);
+ cifs_dbg(VFS, "buffer length %d smaller than minimum size %d\n",
+ buffer_length, min_buf_size);
return -EINVAL;
}
/* check if beyond RFC1001 maximum length */
if ((smb_len > 0x7FFFFF) || (buffer_length > 0x7FFFFF)) {
- cERROR(1, "buffer length %d or smb length %d too large",
- buffer_length, smb_len);
+ cifs_dbg(VFS, "buffer length %d or smb length %d too large\n",
+ buffer_length, smb_len);
return -EINVAL;
}
if ((begin_of_buf > end_of_smb) || (end_of_buf > end_of_smb)) {
- cERROR(1, "illegal server response, bad offset to data");
+ cifs_dbg(VFS, "illegal server response, bad offset to data\n");
return -EINVAL;
}
@@ -1155,7 +1241,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
struct TCP_Server_Info *server;
struct cifs_ses *ses = tcon->ses;
- cFYI(1, "Query Info");
+ cifs_dbg(FYI, "Query Info\n");
if (ses && (ses->server))
server = ses->server;
@@ -1247,7 +1333,7 @@ SMB2_echo(struct TCP_Server_Info *server)
struct smb_rqst rqst = { .rq_iov = &iov,
.rq_nvec = 1 };
- cFYI(1, "In echo request");
+ cifs_dbg(FYI, "In echo request\n");
rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req);
if (rc)
@@ -1262,7 +1348,7 @@ SMB2_echo(struct TCP_Server_Info *server)
rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, server,
CIFS_ECHO_OP);
if (rc)
- cFYI(1, "Echo request failed: %d", rc);
+ cifs_dbg(FYI, "Echo request failed: %d\n", rc);
cifs_small_buf_release(req);
return rc;
@@ -1279,7 +1365,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
int resp_buftype;
int rc = 0;
- cFYI(1, "Flush");
+ cifs_dbg(FYI, "Flush\n");
if (ses && (ses->server))
server = ses->server;
@@ -1379,21 +1465,21 @@ smb2_readv_callback(struct mid_q_entry *mid)
.rq_pagesz = rdata->pagesz,
.rq_tailsz = rdata->tailsz };
- cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__,
- mid->mid, mid->mid_state, rdata->result, rdata->bytes);
+ cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
+ __func__, mid->mid, mid->mid_state, rdata->result,
+ rdata->bytes);
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
credits_received = le16_to_cpu(buf->CreditRequest);
/* result already set, check signature */
- if (server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ if (server->sign) {
int rc;
rc = smb2_verify_signature(&rqst, server);
if (rc)
- cERROR(1, "SMB signature verification returned "
- "error = %d", rc);
+ cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+ rc);
}
/* FIXME: should this be counted toward the initiating task? */
task_io_account_read(rdata->bytes);
@@ -1426,8 +1512,8 @@ smb2_async_readv(struct cifs_readdata *rdata)
struct smb_rqst rqst = { .rq_iov = &rdata->iov,
.rq_nvec = 1 };
- cFYI(1, "%s: offset=%llu bytes=%u", __func__,
- rdata->offset, rdata->bytes);
+ cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
+ __func__, rdata->offset, rdata->bytes);
io_parms.tcon = tlink_tcon(rdata->cfile->tlink);
io_parms.offset = rdata->offset;
@@ -1481,13 +1567,13 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
if (rc) {
cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
- cERROR(1, "Send error in read = %d", rc);
+ cifs_dbg(VFS, "Send error in read = %d\n", rc);
} else {
*nbytes = le32_to_cpu(rsp->DataLength);
if ((*nbytes > CIFS_MAX_MSGSIZE) ||
(*nbytes > io_parms->length)) {
- cFYI(1, "bad length %d for count %d", *nbytes,
- io_parms->length);
+ cifs_dbg(FYI, "bad length %d for count %d\n",
+ *nbytes, io_parms->length);
rc = -EIO;
*nbytes = 0;
}
@@ -1597,7 +1683,8 @@ smb2_async_writev(struct cifs_writedata *wdata)
rqst.rq_pagesz = wdata->pagesz;
rqst.rq_tailsz = wdata->tailsz;
- cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes);
+ cifs_dbg(FYI, "async write at %llu %u bytes\n",
+ wdata->offset, wdata->bytes);
req->Length = cpu_to_le32(wdata->bytes);
@@ -1670,7 +1757,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
if (rc) {
cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE);
- cERROR(1, "Send error in write = %d", rc);
+ cifs_dbg(VFS, "Send error in write = %d\n", rc);
} else
*nbytes = le32_to_cpu(rsp->DataLength);
@@ -1696,14 +1783,14 @@ num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size)
((char *)entryptr + next_offset);
if ((char *)entryptr + size > end_of_buf) {
- cERROR(1, "malformed search entry would overflow");
+ cifs_dbg(VFS, "malformed search entry would overflow\n");
break;
}
len = le32_to_cpu(entryptr->FileNameLength);
if ((char *)entryptr + len + size > end_of_buf) {
- cERROR(1, "directory entry name would overflow frame "
- "end of buf %p", end_of_buf);
+ cifs_dbg(VFS, "directory entry name would overflow frame end of buf %p\n",
+ end_of_buf);
break;
}
@@ -1759,8 +1846,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1;
break;
default:
- cERROR(1, "info level %u isn't supported",
- srch_inf->info_level);
+ cifs_dbg(VFS, "info level %u isn't supported\n",
+ srch_inf->info_level);
rc = -EINVAL;
goto qdir_exit;
}
@@ -1824,15 +1911,15 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
num_entries(srch_inf->srch_entries_start, end_of_smb,
&srch_inf->last_entry, info_buf_size);
srch_inf->index_of_last_entry += srch_inf->entries_in_buffer;
- cFYI(1, "num entries %d last_index %lld srch start %p srch end %p",
- srch_inf->entries_in_buffer, srch_inf->index_of_last_entry,
- srch_inf->srch_entries_start, srch_inf->last_entry);
+ cifs_dbg(FYI, "num entries %d last_index %lld srch start %p srch end %p\n",
+ srch_inf->entries_in_buffer, srch_inf->index_of_last_entry,
+ srch_inf->srch_entries_start, srch_inf->last_entry);
if (resp_buftype == CIFS_LARGE_BUFFER)
srch_inf->smallBuf = false;
else if (resp_buftype == CIFS_SMALL_BUFFER)
srch_inf->smallBuf = true;
else
- cERROR(1, "illegal search buffer type");
+ cifs_dbg(VFS, "illegal search buffer type\n");
if (rsp->hdr.Status == STATUS_NO_MORE_FILES)
srch_inf->endOfSearch = 1;
@@ -2017,7 +2104,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
struct smb2_oplock_break *req = NULL;
- cFYI(1, "SMB2_oplock_break");
+ cifs_dbg(FYI, "SMB2_oplock_break\n");
rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
if (rc)
@@ -2033,7 +2120,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
- cFYI(1, "Send error in Oplock Break = %d", rc);
+ cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc);
}
return rc;
@@ -2058,7 +2145,7 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
int rc;
struct smb2_query_info_req *req;
- cFYI(1, "Query FSInfo level %d", level);
+ cifs_dbg(FYI, "Query FSInfo level %d\n", level);
if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
return -EIO;
@@ -2131,7 +2218,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
int resp_buf_type;
unsigned int count;
- cFYI(1, "smb2_lockv num lock %d", num_lock);
+ cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock);
rc = small_smb2_init(SMB2_LOCK, tcon, (void **) &req);
if (rc)
@@ -2155,7 +2242,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
if (rc) {
- cFYI(1, "Send error in smb2_lockv = %d", rc);
+ cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc);
cifs_stats_fail_inc(tcon, SMB2_LOCK_HE);
}
@@ -2186,7 +2273,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
struct smb2_lease_ack *req = NULL;
- cFYI(1, "SMB2_lease_break");
+ cifs_dbg(FYI, "SMB2_lease_break\n");
rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
if (rc)
@@ -2204,7 +2291,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
- cFYI(1, "Send error in Lease Break = %d", rc);
+ cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc);
}
return rc;
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 4cb4ced258cb..f31043b26bd3 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -1,7 +1,7 @@
/*
* fs/cifs/smb2pdu.h
*
- * Copyright (c) International Business Machines Corp., 2009, 2010
+ * Copyright (c) International Business Machines Corp., 2009, 2013
* Etersoft, 2012
* Author(s): Steve French (sfrench@us.ibm.com)
* Pavel Shilovsky (pshilovsky@samba.org) 2012
@@ -170,6 +170,7 @@ struct smb2_negotiate_req {
#define SMB20_PROT_ID 0x0202
#define SMB21_PROT_ID 0x0210
#define SMB30_PROT_ID 0x0300
+#define SMB302_PROT_ID 0x0302
#define BAD_PROT_ID 0xFFFF
/* SecurityMode flags */
@@ -283,10 +284,17 @@ struct smb2_tree_connect_rsp {
#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400
#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000
-#define SHI1005_FLAGS_ENABLE_HASH 0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000
+#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000
+#define SHI1005_FLAGS_ALL 0x0000FF33
/* Possible share capabilities */
-#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008)
+#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */
+#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
+#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */
+#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */
+#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
struct smb2_tree_disconnect_req {
struct smb2_hdr hdr;
@@ -477,6 +485,75 @@ struct create_lease {
struct lease_context lcontext;
} __packed;
+/* this goes in the ioctl buffer when doing a copychunk request */
+struct copychunk_ioctl {
+ char SourceKey[24];
+ __le32 ChunkCount; /* we are only sending 1 */
+ __le32 Reserved;
+ /* array will only be one chunk long for us */
+ __le64 SourceOffset;
+ __le64 TargetOffset;
+ __le32 Length; /* how many bytes to copy */
+ __u32 Reserved2;
+} __packed;
+
+/* Response and Request are the same format */
+struct validate_negotiate_info {
+ __le32 Capabilities;
+ __u8 Guid[SMB2_CLIENT_GUID_SIZE];
+ __le16 SecurityMode;
+ __le16 DialectCount;
+ __le16 Dialect[1];
+} __packed;
+
+#define RSS_CAPABLE 0x00000001
+#define RDMA_CAPABLE 0x00000002
+
+struct network_interface_info_ioctl_rsp {
+ __le32 Next; /* next interface. zero if this is last one */
+ __le32 IfIndex;
+ __le32 Capability; /* RSS or RDMA Capable */
+ __le32 Reserved;
+ __le64 LinkSpeed;
+ char SockAddr_Storage[128];
+} __packed;
+
+#define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */
+
+struct smb2_ioctl_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 57 */
+ __u16 Reserved;
+ __le32 CtlCode;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __le32 InputOffset;
+ __le32 InputCount;
+ __le32 MaxInputResponse;
+ __le32 OutputOffset;
+ __le32 OutputCount;
+ __le32 MaxOutputResponse;
+ __le32 Flags;
+ __u32 Reserved2;
+ char Buffer[0];
+} __packed;
+
+struct smb2_ioctl_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 (57 is the request's size; 48 fixed bytes follow hdr here) */
+ __u16 Reserved;
+ __le32 CtlCode;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __le32 InputOffset;
+ __le32 InputCount;
+ __le32 OutputOffset;
+ __le32 OutputCount;
+ __le32 Flags;
+ __u32 Reserved2;
+ /* char * buffer[] */
+} __packed;
+
/* Currently defined values for close flags */
#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
struct smb2_close_req {
@@ -517,17 +594,25 @@ struct smb2_flush_rsp {
__le16 Reserved;
} __packed;
+/* For read request Flags field below, following flag is defined for SMB3.02 */
+#define SMB2_READFLAG_READ_UNBUFFERED 0x01
+
+/*
+ * Channel field for read and write: exactly one of the following values can
+ * be set. Each value must be unique so the two RDMA variants can be told
+ * apart on the wire.
+ */
+#define SMB2_CHANNEL_NONE 0x00000000
+#define SMB2_CHANNEL_RDMA_V1 0x00000001 /* SMB3 or later */
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000002 /* SMB3.02 or later */
+
struct smb2_read_req {
struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 49 */
__u8 Padding; /* offset from start of SMB2 header to place read */
- __u8 Reserved;
+ __u8 Flags; /* MBZ unless SMB3.02 or later */
__le32 Length;
__le64 Offset;
__u64 PersistentFileId; /* opaque endianness */
__u64 VolatileFileId; /* opaque endianness */
__le32 MinimumCount;
- __le32 Channel; /* Reserved MBZ */
+ __le32 Channel; /* MBZ except for SMB3 or later */
__le32 RemainingBytes;
__le16 ReadChannelInfoOffset; /* Reserved MBZ */
__le16 ReadChannelInfoLength; /* Reserved MBZ */
@@ -545,8 +630,9 @@ struct smb2_read_rsp {
__u8 Buffer[1];
} __packed;
-/* For write request Flags field below the following flag is defined: */
-#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001
+/* For write request Flags field below the following flags are defined: */
+#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */
+#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */
struct smb2_write_req {
struct smb2_hdr hdr;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 2aa3535e38ce..d4e1eb807457 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -111,6 +111,10 @@ extern int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon,
__u32 desired_access, __u32 create_disposition,
__u32 file_attributes, __u32 create_options,
__u8 *oplock, struct smb2_file_all_info *buf);
+extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, u32 opcode,
+ bool is_fsctl, char *in_data, u32 indatalen,
+ char **out_data, u32 *plen /* returned data len */);
extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 8dd73e61d762..09b4fbaadeb6 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -55,13 +55,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
rc = crypto_shash_setkey(server->secmech.hmacsha256,
server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
if (rc) {
- cERROR(1, "%s: Could not update with response\n", __func__);
+ cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
return rc;
}
rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
if (rc) {
- cERROR(1, "%s: Could not init md5\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
return rc;
}
@@ -69,7 +69,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (iov[i].iov_len == 0)
continue;
if (iov[i].iov_base == NULL) {
- cERROR(1, "null iovec entry");
+ cifs_dbg(VFS, "null iovec entry\n");
return -EIO;
}
/*
@@ -90,8 +90,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
iov[i].iov_base, iov[i].iov_len);
}
if (rc) {
- cERROR(1, "%s: Could not update with payload\n",
- __func__);
+ cifs_dbg(VFS, "%s: Could not update with payload\n",
+ __func__);
return rc;
}
}
@@ -109,18 +109,162 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
sigptr);
if (rc)
- cERROR(1, "%s: Could not generate sha256 hash\n", __func__);
+ cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
return rc;
}
+void
+generate_smb3signingkey(struct TCP_Server_Info *server)
+{ /*
+ * Fills server->smb3signingkey. HMAC-SHA256, keyed with
+ * server->session_key.response, is fed the byte strings i, "SMB2AESCMAC",
+ * a single zero byte, "SmbSign" and L, in that order; the first
+ * SMB3_SIGNKEY_SIZE bytes of the digest become the signing key.
+ * On any crypto error a message is logged and the key is left all-zero
+ * (it is cleared before the first step); no status reaches the caller.
+ */
+ unsigned char zero = 0x0;
+ __u8 i[4] = {0, 0, 0, 1}; /* presumably a big-endian 32-bit counter of 1 -- confirm against the KDF spec */
+ __u8 L[4] = {0, 0, 0, 128}; /* presumably the derived key length in bits, big-endian -- confirm */
+ int rc = 0;
+ unsigned char prfhash[SMB2_HMACSHA256_SIZE];
+ unsigned char *hashptr = prfhash;
+
+ memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
+ memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); /* key stays zeroed if any step below fails */
+
+ rc = crypto_shash_setkey(server->secmech.hmacsha256,
+ server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not set with session key\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+ i, 4);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with n\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+ "SMB2AESCMAC", 12); /* length 12 includes the terminating NUL */
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+ &zero, 1);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with zero\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+ "SmbSign", 8); /* length 8 includes the terminating NUL */
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+ L, 4);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with L\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
+ hashptr);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ memcpy(server->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE); /* digest is truncated to the signing-key size */
+
+smb3signkey_ret:
+ return;
+}
+
int
smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
- cFYI(1, "smb3 signatures not supported yet");
- return -EOPNOTSUPP;
+ int i, rc;
+ unsigned char smb3_signature[SMB2_CMACAES_SIZE];
+ unsigned char *sigptr = smb3_signature;
+ struct kvec *iov = rqst->rq_iov;
+ int n_vec = rqst->rq_nvec;
+ struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
+
+ memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
+ memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
+
+ rc = crypto_shash_setkey(server->secmech.cmacaes,
+ server->smb3signingkey, SMB2_CMACAES_SIZE);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
+ return rc;
+ }
+
+ rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
+ return rc;
+ }
+
+ for (i = 0; i < n_vec; i++) {
+ if (iov[i].iov_len == 0)
+ continue;
+ if (iov[i].iov_base == NULL) {
+ cifs_dbg(VFS, "null iovec entry");
+ return -EIO;
+ }
+ /*
+ * The first entry includes a length field (which does not get
+ * signed that occupies the first 4 bytes before the header).
+ */
+ if (i == 0) {
+ if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
+ break; /* nothing to sign or corrupt header */
+ rc =
+ crypto_shash_update(
+ &server->secmech.sdesccmacaes->shash,
+ iov[i].iov_base + 4, iov[i].iov_len - 4);
+ } else {
+ rc =
+ crypto_shash_update(
+ &server->secmech.sdesccmacaes->shash,
+ iov[i].iov_base, iov[i].iov_len);
+ }
+ if (rc) {
+ cifs_dbg(VFS, "%s: Couldn't update cmac aes with payload\n",
+ __func__);
+ return rc;
+ }
+ }
+
+ /* now hash over the rq_pages array */
+ for (i = 0; i < rqst->rq_npages; i++) {
+ struct kvec p_iov;
+
+ cifs_rqst_page_to_kvec(rqst, i, &p_iov);
+ crypto_shash_update(&server->secmech.sdesccmacaes->shash,
+ p_iov.iov_base, p_iov.iov_len);
+ kunmap(rqst->rq_pages[i]);
+ }
+
+ rc = crypto_shash_final(&server->secmech.sdesccmacaes->shash,
+ sigptr);
+ if (rc)
+ cifs_dbg(VFS, "%s: Could not generate cmac aes\n", __func__);
+
+ memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+
+ return rc;
}
/* must be called with server->srv_mutex held */
@@ -163,8 +307,8 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
/* Do not need to verify session setups with signature "BSRSPYL " */
if (memcmp(smb2_pdu->Signature, "BSRSPYL ", 8) == 0)
- cFYI(1, "dummy signature received for smb command 0x%x",
- smb2_pdu->Command);
+ cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n",
+ smb2_pdu->Command);
/*
* Save off the original signature so we can modify the smb and check
@@ -205,7 +349,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer,
struct mid_q_entry *temp;
if (server == NULL) {
- cERROR(1, "Null TCP session in smb2_mid_entry_alloc");
+ cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n");
return NULL;
}
@@ -241,7 +385,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf,
return -ENOENT;
if (ses->server->tcpStatus == CifsNeedReconnect) {
- cFYI(1, "tcp session dead - return to caller to retry");
+ cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
return -EAGAIN;
}
@@ -275,14 +419,13 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
dump_smb(mid->resp_buf, min_t(u32, 80, len));
/* convert the length into a more usable form */
- if ((len > 24) &&
- (server->sec_mode & (SECMODE_SIGN_REQUIRED|SECMODE_SIGN_ENABLED))) {
+ if (len > 24 && server->sign) {
int rc;
rc = smb2_verify_signature(&rqst, server);
if (rc)
- cERROR(1, "SMB signature verification returned error = "
- "%d", rc);
+ cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+ rc);
}
return map_smb2_to_linux_error(mid->resp_buf, log_error);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index a0a58fbe2c10..43eb1367b103 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -78,7 +78,7 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm_des)) {
rc = PTR_ERR(tfm_des);
- cERROR(1, "could not allocate des crypto API");
+ cifs_dbg(VFS, "could not allocate des crypto API\n");
goto smbhash_err;
}
@@ -91,7 +91,7 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8);
if (rc)
- cERROR(1, "could not encrypt crypt key rc: %d", rc);
+ cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc);
crypto_free_blkcipher(tfm_des);
smbhash_err:
@@ -139,14 +139,14 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
md4 = crypto_alloc_shash("md4", 0, 0);
if (IS_ERR(md4)) {
rc = PTR_ERR(md4);
- cERROR(1, "%s: Crypto md4 allocation error %d", __func__, rc);
+ cifs_dbg(VFS, "%s: Crypto md4 allocation error %d\n",
+ __func__, rc);
return rc;
}
size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
sdescmd4 = kmalloc(size, GFP_KERNEL);
if (!sdescmd4) {
rc = -ENOMEM;
- cERROR(1, "%s: Memory allocation failure", __func__);
goto mdfour_err;
}
sdescmd4->shash.tfm = md4;
@@ -154,17 +154,17 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
rc = crypto_shash_init(&sdescmd4->shash);
if (rc) {
- cERROR(1, "%s: Could not init md4 shash", __func__);
+ cifs_dbg(VFS, "%s: Could not init md4 shash\n", __func__);
goto mdfour_err;
}
rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len);
if (rc) {
- cERROR(1, "%s: Could not update with link_str", __func__);
+ cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
goto mdfour_err;
}
rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
if (rc)
- cERROR(1, "%s: Could not genereate md4 hash", __func__);
+ cifs_dbg(VFS, "%s: Could not generate md4 hash\n", __func__);
mdfour_err:
crypto_free_shash(md4);
@@ -238,7 +238,8 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24,
rc = E_md4hash(passwd, p16, codepage);
if (rc) {
- cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
+ cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n",
+ __func__, rc);
return rc;
}
memcpy(p21, p16, 16);
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 7056b891e087..d952ee48f4dc 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -1,7 +1,7 @@
/*
* fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
*
- * Copyright (c) International Business Machines Corp., 2002,2009
+ * Copyright (c) International Business Machines Corp., 2002,2013
* Author(s): Steve French (sfrench@us.ibm.com)
*
* This library is free software; you can redistribute it and/or modify
@@ -22,7 +22,7 @@
/* IOCTL information */
/*
* List of ioctl/fsctl function codes that are or could be useful in the
- * future to remote clients like cifs or SMB2 client. There is probably
+ * future to remote clients like cifs or SMB2/SMB3 client. This is probably
* a slightly larger set of fsctls that NTFS local filesystem could handle,
* including the seven below that we do not have struct definitions for.
* Even with protocol definitions for most of these now available, we still
@@ -30,7 +30,13 @@
* remotely. Some of the following, such as the encryption/compression ones
* could be invoked from tools via a specialized hook into the VFS rather
* than via the standard vfs entry points
+ *
+ * See MS-SMB2 Section 2.2.31 (last checked June 2013, all of that list are
+ * below). Additional detail on less common ones can be found in MS-FSCC
+ * section 2.3.
*/
+#define FSCTL_DFS_GET_REFERRALS 0x00060194
+#define FSCTL_DFS_GET_REFERRALS_EX 0x000601B0
#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008
@@ -71,14 +77,31 @@
#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
+#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */
#define FSCTL_SIS_LINK_FILES 0x0009C104
#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */
#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */
/* strange that the number for this op is not sequential with previous op */
#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */
+/* Enumerate previous versions of a file */
+#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
+/* Retrieve an opaque file reference for server-side data movement ie copy */
+#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
+#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
+#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */
+/* Perform server-side data movement */
+#define FSCTL_SRV_COPYCHUNK 0x001440F2
+#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
+#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */
+#define FSCTL_SRV_READ_HASH 0x001441BB /* BB add struct */
#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
#define IO_REPARSE_TAG_HSM 0xC0000004
#define IO_REPARSE_TAG_SIS 0x80000007
+
+/* fsctl flags */
+/* If Flags is set to this value, the request is an FSCTL not ioctl request */
+#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
+
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 1a528680ec5a..6fdcb1b4a106 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -49,7 +49,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
struct mid_q_entry *temp;
if (server == NULL) {
- cERROR(1, "Null TCP session in AllocMidQEntry");
+ cifs_dbg(VFS, "Null TCP session in AllocMidQEntry\n");
return NULL;
}
@@ -61,7 +61,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
temp->mid = smb_buffer->Mid; /* always LE */
temp->pid = current->pid;
temp->command = cpu_to_le16(smb_buffer->Command);
- cFYI(1, "For smb_command %d", smb_buffer->Command);
+ cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
/* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
/* when mid allocated can be before when sent */
temp->when_alloc = jiffies;
@@ -179,17 +179,11 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
*/
rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec],
n_vec - first_vec, remaining);
- if (rc == -ENOSPC || rc == -EAGAIN) {
- /*
- * Catch if a low level driver returns -ENOSPC. This
- * WARN_ON will be removed by 3.10 if no one reports
- * seeing this.
- */
- WARN_ON_ONCE(rc == -ENOSPC);
+ if (rc == -EAGAIN) {
i++;
if (i >= 14 || (!server->noblocksnd && (i > 2))) {
- cERROR(1, "sends on sock %p stuck for 15 "
- "seconds", ssocket);
+ cifs_dbg(VFS, "sends on sock %p stuck for 15 seconds\n",
+ ssocket);
rc = -EAGAIN;
break;
}
@@ -209,14 +203,14 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
}
if (rc > remaining) {
- cERROR(1, "sent %d requested %d", rc, remaining);
+ cifs_dbg(VFS, "sent %d requested %d\n", rc, remaining);
break;
}
if (rc == 0) {
/* should never happen, letting socket clear before
retrying is our only obvious option here */
- cERROR(1, "tcp sent no data");
+ cifs_dbg(VFS, "tcp sent no data\n");
msleep(500);
continue;
}
@@ -291,7 +285,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
if (ssocket == NULL)
return -ENOTSOCK;
- cFYI(1, "Sending smb: smb_len=%u", smb_buf_length);
+ cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length);
dump_smb(iov[0].iov_base, iov[0].iov_len);
/* cork the socket */
@@ -324,8 +318,8 @@ uncork:
(char *)&val, sizeof(val));
if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
- cFYI(1, "partial send (wanted=%u sent=%zu): terminating "
- "session", smb_buf_length + 4, total_len);
+ cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n",
+ smb_buf_length + 4, total_len);
/*
* If we have only sent part of an SMB then the next SMB could
* be taken as the remainder of this one. We need to kill the
@@ -335,7 +329,8 @@ uncork:
}
if (rc < 0 && rc != -EINTR)
- cERROR(1, "Error %d sending data on socket to server", rc);
+ cifs_dbg(VFS, "Error %d sending data on socket to server\n",
+ rc);
else
rc = 0;
@@ -427,7 +422,7 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
}
if (ses->server->tcpStatus == CifsNeedReconnect) {
- cFYI(1, "tcp session dead - return to caller to retry");
+ cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
return -EAGAIN;
}
@@ -452,7 +447,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
{
int error;
- error = wait_event_freezekillable(server->response_q,
+ error = wait_event_freezekillable_unsafe(server->response_q,
midQ->mid_state != MID_REQUEST_SUBMITTED);
if (error < 0)
return -ERESTARTSYS;
@@ -468,7 +463,7 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
struct mid_q_entry *mid;
/* enable signing if server requires it */
- if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (server->sign)
hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
mid = AllocMidQEntry(hdr, server);
@@ -527,6 +522,9 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
rc = smb_send_rqst(server, rqst);
cifs_in_send_dec(server);
cifs_save_when_sent(mid);
+
+ if (rc < 0)
+ server->sequence_number -= 2;
mutex_unlock(&server->srv_mutex);
if (rc == 0)
@@ -559,7 +557,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
iov[0].iov_len = get_rfc1002_length(in_buf) + 4;
flags |= CIFS_NO_RESP;
rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
- cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);
+ cifs_dbg(NOISY, "SendRcvNoRsp flags %d rc %d\n", flags, rc);
return rc;
}
@@ -569,8 +567,8 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
{
int rc = 0;
- cFYI(1, "%s: cmd=%d mid=%llu state=%d", __func__,
- le16_to_cpu(mid->command), mid->mid, mid->mid_state);
+ cifs_dbg(FYI, "%s: cmd=%d mid=%llu state=%d\n",
+ __func__, le16_to_cpu(mid->command), mid->mid, mid->mid_state);
spin_lock(&GlobalMid_Lock);
switch (mid->mid_state) {
@@ -588,8 +586,8 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
break;
default:
list_del_init(&mid->qhead);
- cERROR(1, "%s: invalid mid state mid=%llu state=%d", __func__,
- mid->mid, mid->mid_state);
+ cifs_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n",
+ __func__, mid->mid, mid->mid_state);
rc = -EIO;
}
spin_unlock(&GlobalMid_Lock);
@@ -614,7 +612,7 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
dump_smb(mid->resp_buf, min_t(u32, 92, len));
/* convert the length into a more usable form */
- if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ if (server->sign) {
struct kvec iov;
int rc = 0;
struct smb_rqst rqst = { .rq_iov = &iov,
@@ -624,10 +622,10 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
iov.iov_len = len;
/* FIXME: add code to kill session */
rc = cifs_verify_signature(&rqst, server,
- mid->sequence_number + 1);
+ mid->sequence_number);
if (rc)
- cERROR(1, "SMB signature verification returned error = "
- "%d", rc);
+ cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+ rc);
}
/* BB special case reconnect tid and uid here? */
@@ -672,7 +670,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
if ((ses == NULL) || (ses->server == NULL)) {
cifs_small_buf_release(buf);
- cERROR(1, "Null session");
+ cifs_dbg(VFS, "Null session\n");
return -EIO;
}
@@ -716,6 +714,8 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
cifs_in_send_dec(ses->server);
cifs_save_when_sent(midQ);
+ if (rc < 0)
+ ses->server->sequence_number -= 2;
mutex_unlock(&ses->server->srv_mutex);
if (rc < 0) {
@@ -752,7 +752,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
if (!midQ->resp_buf || midQ->mid_state != MID_RESPONSE_RECEIVED) {
rc = -EIO;
- cFYI(1, "Bad MID state?");
+ cifs_dbg(FYI, "Bad MID state?\n");
goto out;
}
@@ -788,11 +788,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
struct mid_q_entry *midQ;
if (ses == NULL) {
- cERROR(1, "Null smb session");
+ cifs_dbg(VFS, "Null smb session\n");
return -EIO;
}
if (ses->server == NULL) {
- cERROR(1, "Null tcp session");
+ cifs_dbg(VFS, "Null tcp session\n");
return -EIO;
}
@@ -805,8 +805,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
MAX_CIFS_HDR_SIZE - 4) {
- cERROR(1, "Illegal length, greater than maximum frame, %d",
- be32_to_cpu(in_buf->smb_buf_length));
+ cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
+ be32_to_cpu(in_buf->smb_buf_length));
return -EIO;
}
@@ -840,6 +840,10 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
cifs_in_send_dec(ses->server);
cifs_save_when_sent(midQ);
+
+ if (rc < 0)
+ ses->server->sequence_number -= 2;
+
mutex_unlock(&ses->server->srv_mutex);
if (rc < 0)
@@ -871,7 +875,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (!midQ->resp_buf || !out_buf ||
midQ->mid_state != MID_RESPONSE_RECEIVED) {
rc = -EIO;
- cERROR(1, "Bad MID state?");
+ cifs_dbg(VFS, "Bad MID state?\n");
goto out;
}
@@ -921,13 +925,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_ses *ses;
if (tcon == NULL || tcon->ses == NULL) {
- cERROR(1, "Null smb session");
+ cifs_dbg(VFS, "Null smb session\n");
return -EIO;
}
ses = tcon->ses;
if (ses->server == NULL) {
- cERROR(1, "Null tcp session");
+ cifs_dbg(VFS, "Null tcp session\n");
return -EIO;
}
@@ -940,8 +944,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
MAX_CIFS_HDR_SIZE - 4) {
- cERROR(1, "Illegal length, greater than maximum frame, %d",
- be32_to_cpu(in_buf->smb_buf_length));
+ cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
+ be32_to_cpu(in_buf->smb_buf_length));
return -EIO;
}
@@ -973,6 +977,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
cifs_in_send_dec(ses->server);
cifs_save_when_sent(midQ);
+
+ if (rc < 0)
+ ses->server->sequence_number -= 2;
+
mutex_unlock(&ses->server->srv_mutex);
if (rc < 0) {
@@ -1038,7 +1046,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
/* rcvd frame is ok */
if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) {
rc = -EIO;
- cERROR(1, "Bad MID state?");
+ cifs_dbg(VFS, "Bad MID state?\n");
goto out;
}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 5142f2c60278..09afda4cc58e 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -68,12 +68,12 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
goto remove_ea_exit;
}
if (ea_name == NULL) {
- cFYI(1, "Null xattr names not supported");
+ cifs_dbg(FYI, "Null xattr names not supported\n");
} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
&& (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) {
- cFYI(1,
- "illegal xattr request %s (only user namespace supported)",
- ea_name);
+ cifs_dbg(FYI,
+ "illegal xattr request %s (only user namespace supported)\n",
+ ea_name);
/* BB what if no namespace prefix? */
/* Should we just pass them to server, except for
system and perhaps security prefixes? */
@@ -134,19 +134,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
search server for EAs or streams to
returns as xattrs */
if (value_size > MAX_EA_VALUE_SIZE) {
- cFYI(1, "size of EA value too large");
+ cifs_dbg(FYI, "size of EA value too large\n");
rc = -EOPNOTSUPP;
goto set_ea_exit;
}
if (ea_name == NULL) {
- cFYI(1, "Null xattr names not supported");
+ cifs_dbg(FYI, "Null xattr names not supported\n");
} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
== 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto set_ea_exit;
if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
- cFYI(1, "attempt to set cifs inode metadata");
+ cifs_dbg(FYI, "attempt to set cifs inode metadata\n");
ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
@@ -167,8 +167,6 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
struct cifs_ntsd *pacl;
pacl = kmalloc(value_size, GFP_KERNEL);
if (!pacl) {
- cFYI(1, "%s: Can't allocate memory for ACL",
- __func__);
rc = -ENOMEM;
} else {
memcpy(pacl, ea_value, value_size);
@@ -179,7 +177,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
kfree(pacl);
}
#else
- cFYI(1, "Set CIFS ACL not supported yet");
+ cifs_dbg(FYI, "Set CIFS ACL not supported yet\n");
#endif /* CONFIG_CIFS_ACL */
} else {
int temp;
@@ -193,9 +191,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
ACL_TYPE_ACCESS, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- cFYI(1, "set POSIX ACL rc %d", rc);
+ cifs_dbg(FYI, "set POSIX ACL rc %d\n", rc);
#else
- cFYI(1, "set POSIX ACL not supported");
+ cifs_dbg(FYI, "set POSIX ACL not supported\n");
#endif
} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -206,13 +204,13 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
ACL_TYPE_DEFAULT, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- cFYI(1, "set POSIX default ACL rc %d", rc);
+ cifs_dbg(FYI, "set POSIX default ACL rc %d\n", rc);
#else
- cFYI(1, "set default POSIX ACL not supported");
+ cifs_dbg(FYI, "set default POSIX ACL not supported\n");
#endif
} else {
- cFYI(1, "illegal xattr request %s (only user namespace"
- " supported)", ea_name);
+ cifs_dbg(FYI, "illegal xattr request %s (only user namespace supported)\n",
+ ea_name);
/* BB what if no namespace prefix? */
/* Should we just pass them to server, except for
system and perhaps security prefixes? */
@@ -263,14 +261,14 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
/* return dos attributes as pseudo xattr */
/* return alt name if available as pseudo attr */
if (ea_name == NULL) {
- cFYI(1, "Null xattr names not supported");
+ cifs_dbg(FYI, "Null xattr names not supported\n");
} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
== 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto get_ea_exit;
if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
- cFYI(1, "attempt to query cifs inode metadata");
+ cifs_dbg(FYI, "attempt to query cifs inode metadata\n");
/* revalidate/getattr then populate from inode */
} /* BB add else when above is implemented */
ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
@@ -295,7 +293,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
#else
- cFYI(1, "Query POSIX ACL not supported yet");
+ cifs_dbg(FYI, "Query POSIX ACL not supported yet\n");
#endif /* CONFIG_CIFS_POSIX */
} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -307,7 +305,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
#else
- cFYI(1, "Query POSIX default ACL not supported yet");
+ cifs_dbg(FYI, "Query POSIX default ACL not supported yet\n");
#endif /* CONFIG_CIFS_POSIX */
} else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
@@ -319,8 +317,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
full_path, &acllen);
if (IS_ERR(pacl)) {
rc = PTR_ERR(pacl);
- cERROR(1, "%s: error %zd getting sec desc",
- __func__, rc);
+ cifs_dbg(VFS, "%s: error %zd getting sec desc\n",
+ __func__, rc);
} else {
if (ea_value) {
if (acllen > buf_size)
@@ -332,18 +330,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
kfree(pacl);
}
#else
- cFYI(1, "Query CIFS ACL not supported yet");
+ cifs_dbg(FYI, "Query CIFS ACL not supported yet\n");
#endif /* CONFIG_CIFS_ACL */
} else if (strncmp(ea_name,
XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
- cFYI(1, "Trusted xattr namespace not supported yet");
+ cifs_dbg(FYI, "Trusted xattr namespace not supported yet\n");
} else if (strncmp(ea_name,
XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
- cFYI(1, "Security xattr namespace not supported yet");
+ cifs_dbg(FYI, "Security xattr namespace not supported yet\n");
} else
- cFYI(1,
- "illegal xattr request %s (only user namespace supported)",
- ea_name);
+ cifs_dbg(FYI,
+ "illegal xattr request %s (only user namespace supported)\n",
+ ea_name);
/* We could add an additional check for streams ie
if proc/fs/cifs/streamstoxattr is set then
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index b7d3a05c062c..190effc6a6fa 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -43,15 +43,14 @@ static int coda_rename(struct inode *old_inode, struct dentry *old_dentry,
struct inode *new_inode, struct dentry *new_dentry);
/* dir file-ops */
-static int coda_readdir(struct file *file, void *buf, filldir_t filldir);
+static int coda_readdir(struct file *file, struct dir_context *ctx);
/* dentry ops */
static int coda_dentry_revalidate(struct dentry *de, unsigned int flags);
static int coda_dentry_delete(const struct dentry *);
/* support routines */
-static int coda_venus_readdir(struct file *coda_file, void *buf,
- filldir_t filldir);
+static int coda_venus_readdir(struct file *, struct dir_context *);
/* same as fs/bad_inode.c */
static int coda_return_EIO(void)
@@ -85,7 +84,7 @@ const struct inode_operations coda_dir_inode_operations =
const struct file_operations coda_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = coda_readdir,
+ .iterate = coda_readdir,
.open = coda_open,
.release = coda_release,
.fsync = coda_fsync,
@@ -378,7 +377,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
/* file operations for directories */
-static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
+static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
{
struct coda_file_info *cfi;
struct file *host_file;
@@ -391,30 +390,19 @@ static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
if (!host_file->f_op)
return -ENOTDIR;
- if (host_file->f_op->readdir)
- {
- /* potemkin case: we were handed a directory inode.
- * We can't use vfs_readdir because we have to keep the file
- * position in sync between the coda_file and the host_file.
- * and as such we need grab the inode mutex. */
+ if (host_file->f_op->iterate) {
struct inode *host_inode = file_inode(host_file);
-
mutex_lock(&host_inode->i_mutex);
- host_file->f_pos = coda_file->f_pos;
-
ret = -ENOENT;
if (!IS_DEADDIR(host_inode)) {
- ret = host_file->f_op->readdir(host_file, buf, filldir);
+ ret = host_file->f_op->iterate(host_file, ctx);
file_accessed(host_file);
}
-
- coda_file->f_pos = host_file->f_pos;
mutex_unlock(&host_inode->i_mutex);
+ return ret;
}
- else /* Venus: we must read Venus dirents from a file */
- ret = coda_venus_readdir(coda_file, buf, filldir);
-
- return ret;
+ /* Venus: we must read Venus dirents from a file */
+ return coda_venus_readdir(coda_file, ctx);
}
static inline unsigned int CDT2DT(unsigned char cdt)
@@ -437,10 +425,8 @@ static inline unsigned int CDT2DT(unsigned char cdt)
}
/* support routines */
-static int coda_venus_readdir(struct file *coda_file, void *buf,
- filldir_t filldir)
+static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx)
{
- int result = 0; /* # of entries returned */
struct coda_file_info *cfi;
struct coda_inode_info *cii;
struct file *host_file;
@@ -462,23 +448,12 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
vdir = kmalloc(sizeof(*vdir), GFP_KERNEL);
if (!vdir) return -ENOMEM;
- if (coda_file->f_pos == 0) {
- ret = filldir(buf, ".", 1, 0, de->d_inode->i_ino, DT_DIR);
- if (ret < 0)
- goto out;
- result++;
- coda_file->f_pos++;
- }
- if (coda_file->f_pos == 1) {
- ret = filldir(buf, "..", 2, 1, parent_ino(de), DT_DIR);
- if (ret < 0)
- goto out;
- result++;
- coda_file->f_pos++;
- }
+ if (!dir_emit_dots(coda_file, ctx))
+ goto out;
+
while (1) {
/* read entries from the directory file */
- ret = kernel_read(host_file, coda_file->f_pos - 2, (char *)vdir,
+ ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir,
sizeof(*vdir));
if (ret < 0) {
printk(KERN_ERR "coda readdir: read dir %s failed %d\n",
@@ -507,32 +482,23 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
/* Make sure we skip '.' and '..', we already got those */
if (name.name[0] == '.' && (name.len == 1 ||
- (vdir->d_name[1] == '.' && name.len == 2)))
+ (name.name[1] == '.' && name.len == 2)))
vdir->d_fileno = name.len = 0;
/* skip null entries */
if (vdir->d_fileno && name.len) {
- /* try to look up this entry in the dcache, that way
- * userspace doesn't have to worry about breaking
- * getcwd by having mismatched inode numbers for
- * internal volume mountpoints. */
- ino = find_inode_number(de, &name);
- if (!ino) ino = vdir->d_fileno;
-
+ ino = vdir->d_fileno;
type = CDT2DT(vdir->d_type);
- ret = filldir(buf, name.name, name.len,
- coda_file->f_pos, ino, type);
- /* failure means no space for filling in this round */
- if (ret < 0) break;
- result++;
+ if (!dir_emit(ctx, name.name, name.len, ino, type))
+ break;
}
/* we'll always have progress because d_reclen is unsigned and
* we've already established it is non-zero. */
- coda_file->f_pos += vdir->d_reclen;
+ ctx->pos += vdir->d_reclen;
}
out:
kfree(vdir);
- return result ? result : ret;
+ return 0;
}
/* called when a cache lookup succeeds */
@@ -560,7 +526,7 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
if (cii->c_flags & C_FLUSH)
coda_flag_inode_children(inode, C_FLUSH);
- if (de->d_count > 1)
+ if (d_count(de) > 1)
/* pretend it's valid, but don't change the flags */
goto out;
diff --git a/fs/coda/file.c b/fs/coda/file.c
index fa4c100bdc7d..380b798f8443 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -79,6 +79,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo
return -EINVAL;
host_inode = file_inode(host_file);
+ file_start_write(host_file);
mutex_lock(&coda_inode->i_mutex);
ret = host_file->f_op->write(host_file, buf, count, ppos);
@@ -87,6 +88,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC;
mutex_unlock(&coda_inode->i_mutex);
+ file_end_write(host_file);
return ret;
}
diff --git a/fs/compat.c b/fs/compat.c
index d487985dd0ea..6af20de2c1a3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -44,10 +44,10 @@
#include <linux/signal.h>
#include <linux/poll.h>
#include <linux/mm.h>
-#include <linux/eventpoll.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -68,8 +68,6 @@ int compat_printk(const char *fmt, ...)
return ret;
}
-#include "read_write.h"
-
/*
* Not all architectures have sys_utime, so implement this in terms
* of sys_utimes.
@@ -834,6 +832,7 @@ struct compat_old_linux_dirent {
};
struct compat_readdir_callback {
+ struct dir_context ctx;
struct compat_old_linux_dirent __user *dirent;
int result;
};
@@ -875,15 +874,15 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
{
int error;
struct fd f = fdget(fd);
- struct compat_readdir_callback buf;
+ struct compat_readdir_callback buf = {
+ .ctx.actor = compat_fillonedir,
+ .dirent = dirent
+ };
if (!f.file)
return -EBADF;
- buf.result = 0;
- buf.dirent = dirent;
-
- error = vfs_readdir(f.file, compat_fillonedir, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (buf.result)
error = buf.result;
@@ -899,6 +898,7 @@ struct compat_linux_dirent {
};
struct compat_getdents_callback {
+ struct dir_context ctx;
struct compat_linux_dirent __user *current_dir;
struct compat_linux_dirent __user *previous;
int count;
@@ -953,7 +953,11 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
{
struct fd f;
struct compat_linux_dirent __user * lastdirent;
- struct compat_getdents_callback buf;
+ struct compat_getdents_callback buf = {
+ .ctx.actor = compat_filldir,
+ .current_dir = dirent,
+ .count = count
+ };
int error;
if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -963,17 +967,12 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
if (!f.file)
return -EBADF;
- buf.current_dir = dirent;
- buf.previous = NULL;
- buf.count = count;
- buf.error = 0;
-
- error = vfs_readdir(f.file, compat_filldir, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
lastdirent = buf.previous;
if (lastdirent) {
- if (put_user(f.file->f_pos, &lastdirent->d_off))
+ if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
error = count - buf.count;
@@ -985,6 +984,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64
struct compat_getdents_callback64 {
+ struct dir_context ctx;
struct linux_dirent64 __user *current_dir;
struct linux_dirent64 __user *previous;
int count;
@@ -1038,7 +1038,11 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
{
struct fd f;
struct linux_dirent64 __user * lastdirent;
- struct compat_getdents_callback64 buf;
+ struct compat_getdents_callback64 buf = {
+ .ctx.actor = compat_filldir64,
+ .current_dir = dirent,
+ .count = count
+ };
int error;
if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -1048,17 +1052,12 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
if (!f.file)
return -EBADF;
- buf.current_dir = dirent;
- buf.previous = NULL;
- buf.count = count;
- buf.error = 0;
-
- error = vfs_readdir(f.file, compat_filldir64, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
lastdirent = buf.previous;
if (lastdirent) {
- typeof(lastdirent->d_off) d_off = f.file->f_pos;
+ typeof(lastdirent->d_off) d_off = buf.ctx.pos;
if (__put_user_unaligned(d_off, &lastdirent->d_off))
error = -EFAULT;
else
@@ -1069,210 +1068,6 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
}
#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
-static ssize_t compat_do_readv_writev(int type, struct file *file,
- const struct compat_iovec __user *uvector,
- unsigned long nr_segs, loff_t *pos)
-{
- compat_ssize_t tot_len;
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- ssize_t ret;
- io_fn_t fn;
- iov_fn_t fnv;
-
- ret = -EINVAL;
- if (!file->f_op)
- goto out;
-
- ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
- UIO_FASTIOV, iovstack, &iov);
- if (ret <= 0)
- goto out;
-
- tot_len = ret;
- ret = rw_verify_area(type, file, pos, tot_len);
- if (ret < 0)
- goto out;
-
- fnv = NULL;
- if (type == READ) {
- fn = file->f_op->read;
- fnv = file->f_op->aio_read;
- } else {
- fn = (io_fn_t)file->f_op->write;
- fnv = file->f_op->aio_write;
- }
-
- if (fnv)
- ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
- pos, fnv);
- else
- ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
-
-out:
- if (iov != iovstack)
- kfree(iov);
- if ((ret + (type == READ)) > 0) {
- if (type == READ)
- fsnotify_access(file);
- else
- fsnotify_modify(file);
- }
- return ret;
-}
-
-static size_t compat_readv(struct file *file,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos)
-{
- ssize_t ret = -EBADF;
-
- if (!(file->f_mode & FMODE_READ))
- goto out;
-
- ret = -EINVAL;
- if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
- goto out;
-
- ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
-
-out:
- if (ret > 0)
- add_rchar(current, ret);
- inc_syscr(current);
- return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
- unsigned long vlen)
-{
- struct fd f = fdget(fd);
- ssize_t ret;
- loff_t pos;
-
- if (!f.file)
- return -EBADF;
- pos = f.file->f_pos;
- ret = compat_readv(f.file, vec, vlen, &pos);
- f.file->f_pos = pos;
- fdput(f);
- return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_preadv64(unsigned long fd, const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos)
-{
- struct fd f;
- ssize_t ret;
-
- if (pos < 0)
- return -EINVAL;
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
- ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PREAD)
- ret = compat_readv(f.file, vec, vlen, &pos);
- fdput(f);
- return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
- unsigned long vlen, u32 pos_low, u32 pos_high)
-{
- loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return compat_sys_preadv64(fd, vec, vlen, pos);
-}
-
-static size_t compat_writev(struct file *file,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos)
-{
- ssize_t ret = -EBADF;
-
- if (!(file->f_mode & FMODE_WRITE))
- goto out;
-
- ret = -EINVAL;
- if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
- goto out;
-
- ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
-
-out:
- if (ret > 0)
- add_wchar(current, ret);
- inc_syscw(current);
- return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
- unsigned long vlen)
-{
- struct fd f = fdget(fd);
- ssize_t ret;
- loff_t pos;
-
- if (!f.file)
- return -EBADF;
- pos = f.file->f_pos;
- ret = compat_writev(f.file, vec, vlen, &pos);
- f.file->f_pos = pos;
- fdput(f);
- return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_pwritev64(unsigned long fd, const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos)
-{
- struct fd f;
- ssize_t ret;
-
- if (pos < 0)
- return -EINVAL;
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
- ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PWRITE)
- ret = compat_writev(f.file, vec, vlen, &pos);
- fdput(f);
- return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
- unsigned long vlen, u32 pos_low, u32 pos_high)
-{
- loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return compat_sys_pwritev64(fd, vec, vlen, pos);
-}
-
-asmlinkage long
-compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
- unsigned int nr_segs, unsigned int flags)
-{
- unsigned i;
- struct iovec __user *iov;
- if (nr_segs > UIO_MAXIOV)
- return -EINVAL;
- iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
- for (i = 0; i < nr_segs; i++) {
- struct compat_iovec v;
- if (get_user(v.iov_base, &iov32[i].iov_base) ||
- get_user(v.iov_len, &iov32[i].iov_len) ||
- put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
- put_user(v.iov_len, &iov[i].iov_len))
- return -EFAULT;
- }
- return sys_vmsplice(fd, iov, nr_segs, flags);
-}
-
/*
* Exactly like fs/open.c:sys_open(), except that it doesn't set the
* O_LARGEFILE flag.
@@ -1658,84 +1453,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
return ret;
}
-#ifdef CONFIG_EPOLL
-
-asmlinkage long compat_sys_epoll_pwait(int epfd,
- struct compat_epoll_event __user *events,
- int maxevents, int timeout,
- const compat_sigset_t __user *sigmask,
- compat_size_t sigsetsize)
-{
- long err;
- compat_sigset_t csigmask;
- sigset_t ksigmask, sigsaved;
-
- /*
- * If the caller wants a certain signal mask to be set during the wait,
- * we apply it here.
- */
- if (sigmask) {
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
- return -EFAULT;
- sigset_from_compat(&ksigmask, &csigmask);
- sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
-
- err = sys_epoll_wait(epfd, events, maxevents, timeout);
-
- /*
- * If we changed the signal mask, we need to restore the original one.
- * In case we've got a signal while waiting, we do not restore the
- * signal mask yet, and we allow do_signal() to deliver the signal on
- * the way back to userspace, before the signal mask is restored.
- */
- if (sigmask) {
- if (err == -EINTR) {
- memcpy(&current->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- } else
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- }
-
- return err;
-}
-
-#endif /* CONFIG_EPOLL */
-
-#ifdef CONFIG_SIGNALFD
-
-asmlinkage long compat_sys_signalfd4(int ufd,
- const compat_sigset_t __user *sigmask,
- compat_size_t sigsetsize, int flags)
-{
- compat_sigset_t ss32;
- sigset_t tmp;
- sigset_t __user *ksigmask;
-
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
- return -EFAULT;
- sigset_from_compat(&tmp, &ss32);
- ksigmask = compat_alloc_user_space(sizeof(sigset_t));
- if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
- return -EFAULT;
-
- return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags);
-}
-
-asmlinkage long compat_sys_signalfd(int ufd,
- const compat_sigset_t __user *sigmask,
- compat_size_t sigsetsize)
-{
- return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0);
-}
-#endif /* CONFIG_SIGNALFD */
-
#ifdef CONFIG_FHANDLE
/*
* Exactly like fs/open.c:sys_open_by_handle_at(), except that it
@@ -1747,25 +1464,3 @@ COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
return do_handle_open(mountdirfd, handle, flags);
}
#endif
-
-#ifdef __ARCH_WANT_COMPAT_SYS_SENDFILE
-asmlinkage long compat_sys_sendfile(int out_fd, int in_fd,
- compat_off_t __user *offset, compat_size_t count)
-{
- loff_t pos;
- off_t off;
- ssize_t ret;
-
- if (offset) {
- if (unlikely(get_user(off, offset)))
- return -EFAULT;
- pos = off;
- ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
- if (unlikely(put_user(pos, offset)))
- return -EFAULT;
- return ret;
- }
-
- return do_sendfile(out_fd, in_fd, NULL, count, 0);
-}
-#endif /* __ARCH_WANT_COMPAT_SYS_SENDFILE */
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 3ced75f765ca..5d19acfa7c6c 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -66,7 +66,6 @@
#include <linux/gigaset_dev.h>
#ifdef CONFIG_BLOCK
-#include <linux/loop.h>
#include <linux/cdrom.h>
#include <linux/fd.h>
#include <scsi/scsi.h>
@@ -608,7 +607,6 @@ struct serial_struct32 {
static int serial_struct_ioctl(unsigned fd, unsigned cmd,
struct serial_struct32 __user *ss32)
{
- typedef struct serial_struct SS;
typedef struct serial_struct32 SS32;
int err;
struct serial_struct ss;
@@ -955,8 +953,6 @@ COMPATIBLE_IOCTL(MTIOCTOP)
/* Socket level stuff */
COMPATIBLE_IOCTL(FIOQSIZE)
#ifdef CONFIG_BLOCK
-/* loop */
-IGNORE_IOCTL(LOOP_CLR_FD)
/* md calls this on random blockdevs */
IGNORE_IOCTL(RAID_VERSION)
/* qemu/qemu-img might call these two on plain files for probing */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7aabc6ad4e9b..5e7c60c1cb63 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -387,7 +387,7 @@ static void remove_dir(struct dentry * d)
if (d->d_inode)
simple_rmdir(parent->d_inode,d);
- pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
+ pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d));
dput(parent);
}
@@ -1532,84 +1532,66 @@ static inline unsigned char dt_type(struct configfs_dirent *sd)
return (sd->s_mode >> 12) & 15;
}
-static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int configfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file->f_path.dentry;
struct super_block *sb = dentry->d_sb;
struct configfs_dirent * parent_sd = dentry->d_fsdata;
- struct configfs_dirent *cursor = filp->private_data;
+ struct configfs_dirent *cursor = file->private_data;
struct list_head *p, *q = &cursor->s_sibling;
ino_t ino = 0;
- int i = filp->f_pos;
- switch (i) {
- case 0:
- ino = dentry->d_inode->i_ino;
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
- break;
- filp->f_pos++;
- i++;
- /* fallthrough */
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
- break;
- filp->f_pos++;
- i++;
- /* fallthrough */
- default:
- if (filp->f_pos == 2) {
- spin_lock(&configfs_dirent_lock);
- list_move(q, &parent_sd->s_children);
- spin_unlock(&configfs_dirent_lock);
- }
- for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
- struct configfs_dirent *next;
- const char * name;
- int len;
- struct inode *inode = NULL;
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+ if (ctx->pos == 2) {
+ spin_lock(&configfs_dirent_lock);
+ list_move(q, &parent_sd->s_children);
+ spin_unlock(&configfs_dirent_lock);
+ }
+ for (p = q->next; p != &parent_sd->s_children; p = p->next) {
+ struct configfs_dirent *next;
+ const char *name;
+ int len;
+ struct inode *inode = NULL;
+
+ next = list_entry(p, struct configfs_dirent, s_sibling);
+ if (!next->s_element)
+ continue;
- next = list_entry(p, struct configfs_dirent,
- s_sibling);
- if (!next->s_element)
- continue;
-
- name = configfs_get_name(next);
- len = strlen(name);
-
- /*
- * We'll have a dentry and an inode for
- * PINNED items and for open attribute
- * files. We lock here to prevent a race
- * with configfs_d_iput() clearing
- * s_dentry before calling iput().
- *
- * Why do we go to the trouble? If
- * someone has an attribute file open,
- * the inode number should match until
- * they close it. Beyond that, we don't
- * care.
- */
- spin_lock(&configfs_dirent_lock);
- dentry = next->s_dentry;
- if (dentry)
- inode = dentry->d_inode;
- if (inode)
- ino = inode->i_ino;
- spin_unlock(&configfs_dirent_lock);
- if (!inode)
- ino = iunique(sb, 2);
+ name = configfs_get_name(next);
+ len = strlen(name);
+
+ /*
+ * We'll have a dentry and an inode for
+ * PINNED items and for open attribute
+ * files. We lock here to prevent a race
+ * with configfs_d_iput() clearing
+ * s_dentry before calling iput().
+ *
+ * Why do we go to the trouble? If
+ * someone has an attribute file open,
+ * the inode number should match until
+ * they close it. Beyond that, we don't
+ * care.
+ */
+ spin_lock(&configfs_dirent_lock);
+ dentry = next->s_dentry;
+ if (dentry)
+ inode = dentry->d_inode;
+ if (inode)
+ ino = inode->i_ino;
+ spin_unlock(&configfs_dirent_lock);
+ if (!inode)
+ ino = iunique(sb, 2);
- if (filldir(dirent, name, len, filp->f_pos, ino,
- dt_type(next)) < 0)
- return 0;
+ if (!dir_emit(ctx, name, len, ino, dt_type(next)))
+ return 0;
- spin_lock(&configfs_dirent_lock);
- list_move(q, p);
- spin_unlock(&configfs_dirent_lock);
- p = q;
- filp->f_pos++;
- }
+ spin_lock(&configfs_dirent_lock);
+ list_move(q, p);
+ spin_unlock(&configfs_dirent_lock);
+ p = q;
+ ctx->pos++;
}
return 0;
}
@@ -1661,7 +1643,7 @@ const struct file_operations configfs_dir_operations = {
.release = configfs_dir_close,
.llseek = configfs_dir_lseek,
.read = generic_read_dir,
- .readdir = configfs_readdir,
+ .iterate = configfs_readdir,
};
int configfs_register_subsystem(struct configfs_subsystem *subsys)
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 2b6cb23dd14e..1d1c41f1014d 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
mutex_lock(&buffer->mutex);
len = fill_write_buffer(buffer, buf, count);
if (len > 0)
- len = flush_write_buffer(file->f_path.dentry, buffer, count);
+ len = flush_write_buffer(file->f_path.dentry, buffer, len);
if (len > 0)
*ppos += len;
mutex_unlock(&buffer->mutex);
diff --git a/fs/coredump.c b/fs/coredump.c
index c6479658d487..72f816d6cad9 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -45,69 +45,79 @@
#include <trace/events/sched.h>
int core_uses_pid;
-char core_pattern[CORENAME_MAX_SIZE] = "core";
unsigned int core_pipe_limit;
+char core_pattern[CORENAME_MAX_SIZE] = "core";
+static int core_name_size = CORENAME_MAX_SIZE;
struct core_name {
char *corename;
int used, size;
};
-static atomic_t call_count = ATOMIC_INIT(1);
/* The maximal length of core_pattern is also specified in sysctl.c */
-static int expand_corename(struct core_name *cn)
+static int expand_corename(struct core_name *cn, int size)
{
- char *old_corename = cn->corename;
+ char *corename = krealloc(cn->corename, size, GFP_KERNEL);
- cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
- cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
-
- if (!cn->corename) {
- kfree(old_corename);
+ if (!corename)
return -ENOMEM;
- }
+ if (size > core_name_size) /* racy but harmless */
+ core_name_size = size;
+
+ cn->size = ksize(corename);
+ cn->corename = corename;
return 0;
}
+static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
+{
+ int free, need;
+
+again:
+ free = cn->size - cn->used;
+ need = vsnprintf(cn->corename + cn->used, free, fmt, arg);
+ if (need < free) {
+ cn->used += need;
+ return 0;
+ }
+
+ if (!expand_corename(cn, cn->size + need - free + 1))
+ goto again;
+
+ return -ENOMEM;
+}
+
static int cn_printf(struct core_name *cn, const char *fmt, ...)
{
- char *cur;
- int need;
- int ret;
va_list arg;
+ int ret;
va_start(arg, fmt);
- need = vsnprintf(NULL, 0, fmt, arg);
+ ret = cn_vprintf(cn, fmt, arg);
va_end(arg);
- if (likely(need < cn->size - cn->used - 1))
- goto out_printf;
+ return ret;
+}
- ret = expand_corename(cn);
- if (ret)
- goto expand_fail;
+static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
+{
+ int cur = cn->used;
+ va_list arg;
+ int ret;
-out_printf:
- cur = cn->corename + cn->used;
va_start(arg, fmt);
- vsnprintf(cur, need + 1, fmt, arg);
+ ret = cn_vprintf(cn, fmt, arg);
va_end(arg);
- cn->used += need;
- return 0;
-expand_fail:
+ for (; cur < cn->used; ++cur) {
+ if (cn->corename[cur] == '/')
+ cn->corename[cur] = '!';
+ }
return ret;
}
-static void cn_escape(char *str)
-{
- for (; *str; str++)
- if (*str == '/')
- *str = '!';
-}
-
static int cn_print_exe_file(struct core_name *cn)
{
struct file *exe_file;
@@ -115,12 +125,8 @@ static int cn_print_exe_file(struct core_name *cn)
int ret;
exe_file = get_mm_exe_file(current->mm);
- if (!exe_file) {
- char *commstart = cn->corename + cn->used;
- ret = cn_printf(cn, "%s (path unknown)", current->comm);
- cn_escape(commstart);
- return ret;
- }
+ if (!exe_file)
+ return cn_esc_printf(cn, "%s (path unknown)", current->comm);
pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
if (!pathbuf) {
@@ -134,9 +140,7 @@ static int cn_print_exe_file(struct core_name *cn)
goto free_buf;
}
- cn_escape(path);
-
- ret = cn_printf(cn, "%s", path);
+ ret = cn_esc_printf(cn, "%s", path);
free_buf:
kfree(pathbuf);
@@ -157,19 +161,19 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
int pid_in_pattern = 0;
int err = 0;
- cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
- cn->corename = kmalloc(cn->size, GFP_KERNEL);
cn->used = 0;
-
- if (!cn->corename)
+ cn->corename = NULL;
+ if (expand_corename(cn, core_name_size))
return -ENOMEM;
+ cn->corename[0] = '\0';
+
+ if (ispipe)
+ ++pat_ptr;
/* Repeat as long as we have more pattern to process and more output
space */
while (*pat_ptr) {
if (*pat_ptr != '%') {
- if (*pat_ptr == 0)
- goto out;
err = cn_printf(cn, "%c", *pat_ptr++);
} else {
switch (*++pat_ptr) {
@@ -210,22 +214,16 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
break;
}
/* hostname */
- case 'h': {
- char *namestart = cn->corename + cn->used;
+ case 'h':
down_read(&uts_sem);
- err = cn_printf(cn, "%s",
+ err = cn_esc_printf(cn, "%s",
utsname()->nodename);
up_read(&uts_sem);
- cn_escape(namestart);
break;
- }
/* executable */
- case 'e': {
- char *commstart = cn->corename + cn->used;
- err = cn_printf(cn, "%s", current->comm);
- cn_escape(commstart);
+ case 'e':
+ err = cn_esc_printf(cn, "%s", current->comm);
break;
- }
case 'E':
err = cn_print_exe_file(cn);
break;
@@ -244,6 +242,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
return err;
}
+out:
/* Backward compatibility with core_uses_pid:
*
* If core_pattern does not include a %p (as is the default)
@@ -254,7 +253,6 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
if (err)
return err;
}
-out:
return ispipe;
}
@@ -263,7 +261,6 @@ static int zap_process(struct task_struct *start, int exit_code)
struct task_struct *t;
int nr = 0;
- start->signal->flags = SIGNAL_GROUP_EXIT;
start->signal->group_exit_code = exit_code;
start->signal->group_stop_count = 0;
@@ -280,8 +277,8 @@ static int zap_process(struct task_struct *start, int exit_code)
return nr;
}
-static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
- struct core_state *core_state, int exit_code)
+static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+ struct core_state *core_state, int exit_code)
{
struct task_struct *g, *p;
unsigned long flags;
@@ -291,11 +288,16 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
if (!signal_group_exit(tsk->signal)) {
mm->core_state = core_state;
nr = zap_process(tsk, exit_code);
+ tsk->signal->group_exit_task = tsk;
+ /* ignore all signals except SIGKILL, see prepare_signal() */
+ tsk->signal->flags = SIGNAL_GROUP_COREDUMP;
+ clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
}
spin_unlock_irq(&tsk->sighand->siglock);
if (unlikely(nr < 0))
return nr;
+ tsk->flags = PF_DUMPCORE;
if (atomic_read(&mm->mm_users) == nr + 1)
goto done;
/*
@@ -340,6 +342,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
if (unlikely(p->mm == mm)) {
lock_task_sighand(p, &flags);
nr += zap_process(p, exit_code);
+ p->signal->flags = SIGNAL_GROUP_EXIT;
unlock_task_sighand(p, &flags);
}
break;
@@ -386,11 +389,18 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
return core_waiters;
}
-static void coredump_finish(struct mm_struct *mm)
+static void coredump_finish(struct mm_struct *mm, bool core_dumped)
{
struct core_thread *curr, *next;
struct task_struct *task;
+ spin_lock_irq(&current->sighand->siglock);
+ if (core_dumped && !__fatal_signal_pending(current))
+ current->signal->group_exit_code |= 0x80;
+ current->signal->group_exit_task = NULL;
+ current->signal->flags = SIGNAL_GROUP_EXIT;
+ spin_unlock_irq(&current->sighand->siglock);
+
next = mm->core_state->dumper.next;
while ((curr = next) != NULL) {
next = curr->next;
@@ -407,26 +417,38 @@ static void coredump_finish(struct mm_struct *mm)
mm->core_state = NULL;
}
-static void wait_for_dump_helpers(struct file *file)
+static bool dump_interrupted(void)
{
- struct pipe_inode_info *pipe;
+ /*
+ * SIGKILL or freezing() interrupt the coredumping. Perhaps we
+ * can do try_to_freeze() and check __fatal_signal_pending(),
+ * but then we need to teach dump_write() to restart and clear
+ * TIF_SIGPENDING.
+ */
+ return signal_pending(current);
+}
- pipe = file_inode(file)->i_pipe;
+static void wait_for_dump_helpers(struct file *file)
+{
+ struct pipe_inode_info *pipe = file->private_data;
pipe_lock(pipe);
pipe->readers++;
pipe->writers--;
+ wake_up_interruptible_sync(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ pipe_unlock(pipe);
- while ((pipe->readers > 1) && (!signal_pending(current))) {
- wake_up_interruptible_sync(&pipe->wait);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
- pipe_wait(pipe);
- }
+ /*
+ * We actually want wait_event_freezable() but then we need
+ * to clear TIF_SIGPENDING and improve dump_interrupted().
+ */
+ wait_event_interruptible(pipe->wait, pipe->readers == 1);
+ pipe_lock(pipe);
pipe->readers--;
pipe->writers++;
pipe_unlock(pipe);
-
}
/*
@@ -471,6 +493,7 @@ void do_coredump(siginfo_t *siginfo)
int ispipe;
struct files_struct *displaced;
bool need_nonrelative = false;
+ bool core_dumped = false;
static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
.siginfo = siginfo,
@@ -514,22 +537,17 @@ void do_coredump(siginfo_t *siginfo)
old_cred = override_creds(cred);
- /*
- * Clear any false indication of pending signals that might
- * be seen by the filesystem code called to write the core file.
- */
- clear_thread_flag(TIF_SIGPENDING);
-
ispipe = format_corename(&cn, &cprm);
- if (ispipe) {
+ if (ispipe) {
int dump_count;
char **helper_argv;
+ struct subprocess_info *sub_info;
if (ispipe < 0) {
printk(KERN_WARNING "format_corename failed\n");
printk(KERN_WARNING "Aborting core\n");
- goto fail_corename;
+ goto fail_unlock;
}
if (cprm.limit == 1) {
@@ -564,22 +582,27 @@ void do_coredump(siginfo_t *siginfo)
goto fail_dropcount;
}
- helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
+ helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL);
if (!helper_argv) {
printk(KERN_WARNING "%s failed to allocate memory\n",
__func__);
goto fail_dropcount;
}
- retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
- NULL, UMH_WAIT_EXEC, umh_pipe_setup,
- NULL, &cprm);
+ retval = -ENOMEM;
+ sub_info = call_usermodehelper_setup(helper_argv[0],
+ helper_argv, NULL, GFP_KERNEL,
+ umh_pipe_setup, NULL, &cprm);
+ if (sub_info)
+ retval = call_usermodehelper_exec(sub_info,
+ UMH_WAIT_EXEC);
+
argv_free(helper_argv);
if (retval) {
- printk(KERN_INFO "Core dump to %s pipe failed\n",
+ printk(KERN_INFO "Core dump to |%s pipe failed\n",
cn.corename);
goto close_fail;
- }
+ }
} else {
struct inode *inode;
@@ -629,10 +652,11 @@ void do_coredump(siginfo_t *siginfo)
goto close_fail;
if (displaced)
put_files_struct(displaced);
- retval = binfmt->core_dump(&cprm);
- if (retval)
- current->signal->group_exit_code |= 0x80;
-
+ if (!dump_interrupted()) {
+ file_start_write(cprm.file);
+ core_dumped = binfmt->core_dump(&cprm);
+ file_end_write(cprm.file);
+ }
if (ispipe && core_pipe_limit)
wait_for_dump_helpers(cprm.file);
close_fail:
@@ -643,8 +667,7 @@ fail_dropcount:
atomic_dec(&core_dump_count);
fail_unlock:
kfree(cn.corename);
-fail_corename:
- coredump_finish(mm);
+ coredump_finish(mm, core_dumped);
revert_creds(old_cred);
fail_creds:
put_cred(cred);
@@ -659,7 +682,9 @@ fail:
*/
int dump_write(struct file *file, const void *addr, int nr)
{
- return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+ return !dump_interrupted() &&
+ access_ok(VERIFY_READ, addr, nr) &&
+ file->f_op->write(file, addr, nr, &file->f_pos) == nr;
}
EXPORT_SYMBOL(dump_write);
@@ -668,7 +693,8 @@ int dump_seek(struct file *file, loff_t off)
int ret = 1;
if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
- if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
+ if (dump_interrupted() ||
+ file->f_op->llseek(file, off, SEEK_CUR) < 0)
return 0;
} else {
char *buf = (char *)get_zeroed_page(GFP_KERNEL);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 35b1c7bd18b7..e501ac3a49ff 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -349,18 +349,17 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
/*
* Read a cramfs directory entry.
*/
-static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int cramfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
char *buf;
unsigned int offset;
- int copied;
/* Offset within the thing. */
- offset = filp->f_pos;
- if (offset >= inode->i_size)
+ if (ctx->pos >= inode->i_size)
return 0;
+ offset = ctx->pos;
/* Directory entries are always 4-byte aligned */
if (offset & 3)
return -EINVAL;
@@ -369,14 +368,13 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (!buf)
return -ENOMEM;
- copied = 0;
while (offset < inode->i_size) {
struct cramfs_inode *de;
unsigned long nextoffset;
char *name;
ino_t ino;
umode_t mode;
- int namelen, error;
+ int namelen;
mutex_lock(&read_mutex);
de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN);
@@ -402,13 +400,10 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
break;
namelen--;
}
- error = filldir(dirent, buf, namelen, offset, ino, mode >> 12);
- if (error)
+ if (!dir_emit(ctx, buf, namelen, ino, mode >> 12))
break;
- offset = nextoffset;
- filp->f_pos = offset;
- copied++;
+ ctx->pos = offset = nextoffset;
}
kfree(buf);
return 0;
@@ -547,7 +542,7 @@ static const struct address_space_operations cramfs_aops = {
static const struct file_operations cramfs_directory_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = cramfs_readdir,
+ .iterate = cramfs_readdir,
};
static const struct inode_operations cramfs_dir_inode_operations = {
diff --git a/fs/dcache.c b/fs/dcache.c
index e8bc3420d63e..87bdb5329c3c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -337,23 +337,6 @@ static void dentry_lru_del(struct dentry *dentry)
}
}
-/*
- * Remove a dentry that is unreferenced and about to be pruned
- * (unhashed and destroyed) from the LRU, and inform the file system.
- * This wrapper should be called _prior_ to unhashing a victim dentry.
- */
-static void dentry_lru_prune(struct dentry *dentry)
-{
- if (!list_empty(&dentry->d_lru)) {
- if (dentry->d_flags & DCACHE_OP_PRUNE)
- dentry->d_op->d_prune(dentry);
-
- spin_lock(&dcache_lru_lock);
- __dentry_lru_del(dentry);
- spin_unlock(&dcache_lru_lock);
- }
-}
-
static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
{
spin_lock(&dcache_lru_lock);
@@ -486,11 +469,13 @@ relock:
if (ref)
dentry->d_count--;
/*
- * if dentry was on the d_lru list delete it from there.
* inform the fs via d_prune that this dentry is about to be
* unhashed and destroyed.
*/
- dentry_lru_prune(dentry);
+ if (dentry->d_flags & DCACHE_OP_PRUNE)
+ dentry->d_op->d_prune(dentry);
+
+ dentry_lru_del(dentry);
/* if it was on the hash then remove it */
__d_drop(dentry);
return d_kill(dentry, parent);
@@ -919,11 +904,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
struct inode *inode;
/*
- * remove the dentry from the lru, and inform
- * the fs that this dentry is about to be
+ * inform the fs that this dentry is about to be
* unhashed and destroyed.
*/
- dentry_lru_prune(dentry);
+ if (dentry->d_flags & DCACHE_OP_PRUNE)
+ dentry->d_op->d_prune(dentry);
+
+ dentry_lru_del(dentry);
__d_shrink(dentry);
if (dentry->d_count != 0) {
@@ -1230,8 +1217,10 @@ void shrink_dcache_parent(struct dentry * parent)
LIST_HEAD(dispose);
int found;
- while ((found = select_parent(parent, &dispose)) != 0)
+ while ((found = select_parent(parent, &dispose)) != 0) {
shrink_dentry_list(&dispose);
+ cond_resched();
+ }
}
EXPORT_SYMBOL(shrink_dcache_parent);
@@ -1623,6 +1612,10 @@ EXPORT_SYMBOL(d_obtain_alias);
* If a dentry was found and moved, then it is returned. Otherwise NULL
* is returned. This matches the expected return value of ->lookup.
*
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
*/
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
@@ -1647,8 +1640,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
security_d_instantiate(dentry, inode);
d_rehash(dentry);
}
- } else
- d_add(dentry, inode);
+ } else {
+ d_instantiate(dentry, inode);
+ if (d_unhashed(dentry))
+ d_rehash(dentry);
+ }
return new;
}
EXPORT_SYMBOL(d_splice_alias);
@@ -1734,7 +1730,7 @@ EXPORT_SYMBOL(d_add_ci);
* Do the slow-case of the dentry name compare.
*
* Unlike the dentry_cmp() function, we need to atomically
- * load the name, length and inode information, so that the
+ * load the name and length information, so that the
* filesystem can rely on them, and can use the 'name' and
* 'len' information without worrying about walking off the
* end of memory etc.
@@ -1752,22 +1748,18 @@ enum slow_d_compare {
static noinline enum slow_d_compare slow_dentry_cmp(
const struct dentry *parent,
- struct inode *inode,
struct dentry *dentry,
unsigned int seq,
const struct qstr *name)
{
int tlen = dentry->d_name.len;
const char *tname = dentry->d_name.name;
- struct inode *i = dentry->d_inode;
if (read_seqcount_retry(&dentry->d_seq, seq)) {
cpu_relax();
return D_COMP_SEQRETRY;
}
- if (parent->d_op->d_compare(parent, inode,
- dentry, i,
- tlen, tname, name))
+ if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
return D_COMP_NOMATCH;
return D_COMP_OK;
}
@@ -1777,7 +1769,6 @@ static noinline enum slow_d_compare slow_dentry_cmp(
* @parent: parent dentry
* @name: qstr of name we wish to find
* @seqp: returns d_seq value at the point where the dentry was found
- * @inode: returns dentry->d_inode when the inode was found valid.
* Returns: dentry, or NULL
*
* __d_lookup_rcu is the dcache lookup function for rcu-walk name
@@ -1804,7 +1795,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
*/
struct dentry *__d_lookup_rcu(const struct dentry *parent,
const struct qstr *name,
- unsigned *seqp, struct inode *inode)
+ unsigned *seqp)
{
u64 hashlen = name->hash_len;
const unsigned char *str = name->name;
@@ -1838,11 +1829,10 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
seqretry:
/*
* The dentry sequence count protects us from concurrent
- * renames, and thus protects inode, parent and name fields.
+ * renames, and thus protects parent and name fields.
*
* The caller must perform a seqcount check in order
- * to do anything useful with the returned dentry,
- * including using the 'd_inode' pointer.
+ * to do anything useful with the returned dentry.
*
* NOTE! We do a "raw" seqcount_begin here. That means that
* we don't wait for the sequence count to stabilize if it
@@ -1856,12 +1846,12 @@ seqretry:
continue;
if (d_unhashed(dentry))
continue;
- *seqp = seq;
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
if (dentry->d_name.hash != hashlen_hash(hashlen))
continue;
- switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) {
+ *seqp = seq;
+ switch (slow_dentry_cmp(parent, dentry, seq, name)) {
case D_COMP_OK:
return dentry;
case D_COMP_NOMATCH:
@@ -1873,6 +1863,7 @@ seqretry:
if (dentry->d_name.hash_len != hashlen)
continue;
+ *seqp = seq;
if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
return dentry;
}
@@ -1970,9 +1961,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
if (parent->d_flags & DCACHE_OP_COMPARE) {
int tlen = dentry->d_name.len;
const char *tname = dentry->d_name.name;
- if (parent->d_op->d_compare(parent, parent->d_inode,
- dentry, dentry->d_inode,
- tlen, tname, name))
+ if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
goto next;
} else {
if (dentry->d_name.len != len)
@@ -2009,7 +1998,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
*/
name->hash = full_name_hash(name->name, name->len);
if (dir->d_flags & DCACHE_OP_HASH) {
- int err = dir->d_op->d_hash(dir, dir->d_inode, name);
+ int err = dir->d_op->d_hash(dir, name);
if (unlikely(err < 0))
return ERR_PTR(err);
}
@@ -2408,8 +2397,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
dentry->d_parent = dentry;
list_del_init(&dentry->d_u.d_child);
anon->d_parent = dparent;
- list_del(&anon->d_u.d_child);
- list_add(&anon->d_u.d_child, &dparent->d_subdirs);
+ list_move(&anon->d_u.d_child, &dparent->d_subdirs);
write_seqcount_end(&dentry->d_seq);
write_seqcount_end(&anon->d_seq);
@@ -2980,34 +2968,21 @@ rename_retry:
goto again;
}
-/**
- * find_inode_number - check for dentry with name
- * @dir: directory to check
- * @name: Name to find.
- *
- * Check whether a dentry already exists for the given name,
- * and return the inode number if it has an inode. Otherwise
- * 0 is returned.
- *
- * This routine is used to post-process directory listings for
- * filesystems using synthetic inode numbers, and is necessary
- * to keep getcwd() working.
- */
-
-ino_t find_inode_number(struct dentry *dir, struct qstr *name)
+void d_tmpfile(struct dentry *dentry, struct inode *inode)
{
- struct dentry * dentry;
- ino_t ino = 0;
-
- dentry = d_hash_and_lookup(dir, name);
- if (!IS_ERR_OR_NULL(dentry)) {
- if (dentry->d_inode)
- ino = dentry->d_inode->i_ino;
- dput(dentry);
- }
- return ino;
+ inode_dec_link_count(inode);
+ BUG_ON(dentry->d_name.name != dentry->d_iname ||
+ !hlist_unhashed(&dentry->d_alias) ||
+ !d_unlinked(dentry));
+ spin_lock(&dentry->d_parent->d_lock);
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
+ (unsigned long long)inode->i_ino);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&dentry->d_parent->d_lock);
+ d_instantiate(dentry, inode);
}
-EXPORT_SYMBOL(find_inode_number);
+EXPORT_SYMBOL(d_tmpfile);
static __initdata unsigned long dhash_entries;
static int __init set_dhash_entries(char *str)
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 17c779967828..ab5954b50267 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -25,6 +25,7 @@
#include <linux/dcookies.h>
#include <linux/mutex.h>
#include <linux/path.h>
+#include <linux/compat.h>
#include <asm/uaccess.h>
/* The dcookies are allocated from a kmem_cache and
@@ -145,7 +146,7 @@ out:
/* And here is where the userspace process can look up the cookie value
* to retrieve the path.
*/
-SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len)
+SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len)
{
unsigned long cookie = (unsigned long)cookie64;
int err = -EINVAL;
@@ -201,12 +202,16 @@ out:
mutex_unlock(&dcookie_mutex);
return err;
}
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_lookup_dcookie(u64 cookie64, long buf, long len)
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len)
{
- return SYSC_lookup_dcookie(cookie64, (char __user *) buf, (size_t) len);
+#ifdef __BIG_ENDIAN
+ return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len);
+#else
+ return sys_lookup_dcookie(((u64)w1 << 32) | w0, buf, len);
+#endif
}
-SYSCALL_ALIAS(sys_lookup_dcookie, SyS_lookup_dcookie);
#endif
static int dcookie_init(void)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index c5ca6ae5a30c..63146295153b 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -21,6 +21,7 @@
#include <linux/debugfs.h>
#include <linux/io.h>
#include <linux/slab.h>
+#include <linux/atomic.h>
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -403,6 +404,47 @@ struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);
+static int debugfs_atomic_t_set(void *data, u64 val)
+{
+ atomic_set((atomic_t *)data, val);
+ return 0;
+}
+static int debugfs_atomic_t_get(void *data, u64 *val)
+{
+ *val = atomic_read((atomic_t *)data);
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+ debugfs_atomic_t_set, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
+
+/**
+ * debugfs_create_atomic_t - create a debugfs file that is used to read and
+ * write an atomic_t value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ * from.
+ */
+struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
+ struct dentry *parent, atomic_t *value)
+{
+ /* if there are no write bits set, make read only */
+ if (!(mode & S_IWUGO))
+ return debugfs_create_file(name, mode, parent, value,
+ &fops_atomic_t_ro);
+ /* if there are no read bits set, make write only */
+ if (!(mode & S_IRUGO))
+ return debugfs_create_file(name, mode, parent, value,
+ &fops_atomic_t_wo);
+
+ return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
static ssize_t read_file_bool(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
@@ -431,6 +473,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
+ buf[buf_size] = '\0';
if (strtobool(buf, &bv) == 0)
*val = bv;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f853263cf74f..7ab90f5081ee 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,6 +37,7 @@
#include <linux/uio.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>
+#include <linux/aio.h>
/*
* How many user pages to map in one call to get_user_pages(). This determines
@@ -441,8 +442,8 @@ static struct bio *dio_await_one(struct dio *dio)
static int dio_bio_complete(struct dio *dio, struct bio *bio)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec;
- int page_no;
+ struct bio_vec *bvec;
+ unsigned i;
if (!uptodate)
dio->io_error = -EIO;
@@ -450,8 +451,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
if (dio->is_async && dio->rw == READ) {
bio_check_pages_dirty(bio); /* transfers ownership */
} else {
- for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
- struct page *page = bvec[page_no].bv_page;
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
if (dio->rw == READ && !PageCompound(page))
set_page_dirty_lock(page);
@@ -672,12 +673,6 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
if (sdio->final_block_in_bio != sdio->cur_page_block ||
cur_offset != bio_next_offset)
dio_bio_submit(dio, sdio);
- /*
- * Submit now if the underlying fs is about to perform a
- * metadata read
- */
- else if (sdio->boundary)
- dio_bio_submit(dio, sdio);
}
if (sdio->bio == NULL) {
@@ -737,16 +732,6 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
sdio->cur_page_block +
(sdio->cur_page_len >> sdio->blkbits) == blocknr) {
sdio->cur_page_len += len;
-
- /*
- * If sdio->boundary then we want to schedule the IO now to
- * avoid metadata seeks.
- */
- if (sdio->boundary) {
- ret = dio_send_cur_page(dio, sdio, map_bh);
- page_cache_release(sdio->cur_page);
- sdio->cur_page = NULL;
- }
goto out;
}
@@ -758,7 +743,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
page_cache_release(sdio->cur_page);
sdio->cur_page = NULL;
if (ret)
- goto out;
+ return ret;
}
page_cache_get(page); /* It is in dio */
@@ -768,6 +753,16 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
sdio->cur_page_block = blocknr;
sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
out:
+ /*
+ * If sdio->boundary then we want to schedule the IO now to
+ * avoid metadata seeks.
+ */
+ if (sdio->boundary) {
+ ret = dio_send_cur_page(dio, sdio, map_bh);
+ dio_bio_submit(dio, sdio);
+ page_cache_release(sdio->cur_page);
+ sdio->cur_page = NULL;
+ }
return ret;
}
@@ -969,7 +964,8 @@ do_holes:
this_chunk_bytes = this_chunk_blocks << blkbits;
BUG_ON(this_chunk_bytes == 0);
- sdio->boundary = buffer_boundary(map_bh);
+ if (this_chunk_blocks == sdio->blocks_available)
+ sdio->boundary = buffer_boundary(map_bh);
ret = submit_page_section(dio, sdio, page,
offset_in_page,
this_chunk_bytes,
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 7d58d5b112b5..76feb4b60fa6 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -138,8 +138,9 @@ static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
const char *buf, size_t len)
{
- strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN);
- strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN);
+ strlcpy(dlm_config.ci_cluster_name, buf,
+ sizeof(dlm_config.ci_cluster_name));
+ strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
return len;
}
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 1b1146670c4b..e223a911a834 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2038,8 +2038,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
if (b == 1) {
int len = receive_extralen(ms);
- if (len > DLM_RESNAME_MAXLEN)
- len = DLM_RESNAME_MAXLEN;
+ if (len > r->res_ls->ls_lvblen)
+ len = r->res_ls->ls_lvblen;
memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
lkb->lkb_lvbseq = ms->m_lvbseq;
}
@@ -3893,8 +3893,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
if (!lkb->lkb_lvbptr)
return -ENOMEM;
len = receive_extralen(ms);
- if (len > DLM_RESNAME_MAXLEN)
- len = DLM_RESNAME_MAXLEN;
+ if (len > ls->ls_lvblen)
+ len = ls->ls_lvblen;
memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
}
return 0;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 3ca79d3253b9..88556dc0458e 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -883,17 +883,24 @@ int dlm_release_lockspace(void *lockspace, int force)
void dlm_stop_lockspaces(void)
{
struct dlm_ls *ls;
+ int count;
restart:
+ count = 0;
spin_lock(&lslist_lock);
list_for_each_entry(ls, &lslist, ls_list) {
- if (!test_bit(LSFL_RUNNING, &ls->ls_flags))
+ if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) {
+ count++;
continue;
+ }
spin_unlock(&lslist_lock);
log_error(ls, "no userland control daemon, stopping lockspace");
dlm_ls_stop(ls);
goto restart;
}
spin_unlock(&lslist_lock);
+
+ if (count)
+ log_print("dlm user daemon left %d lockspaces", count);
}
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 4f5ad246582f..d90909ec6aa6 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -53,7 +53,6 @@
#include <linux/sctp.h>
#include <linux/slab.h>
#include <net/sctp/sctp.h>
-#include <net/sctp/user.h>
#include <net/ipv6.h>
#include "dlm_internal.h"
@@ -126,6 +125,7 @@ struct connection {
struct connection *othercon;
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
+ bool try_new_addr;
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -144,6 +144,7 @@ struct dlm_node_addr {
struct list_head list;
int nodeid;
int addr_count;
+ int curr_addr_index;
struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
};
@@ -310,7 +311,7 @@ static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
}
static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
- struct sockaddr *sa_out)
+ struct sockaddr *sa_out, bool try_new_addr)
{
struct sockaddr_storage sas;
struct dlm_node_addr *na;
@@ -320,8 +321,16 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
spin_lock(&dlm_node_addrs_spin);
na = find_node_addr(nodeid);
- if (na && na->addr_count)
- memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage));
+ if (na && na->addr_count) {
+ if (try_new_addr) {
+ na->curr_addr_index++;
+ if (na->curr_addr_index == na->addr_count)
+ na->curr_addr_index = 0;
+ }
+
+ memcpy(&sas, na->addr[na->curr_addr_index ],
+ sizeof(struct sockaddr_storage));
+ }
spin_unlock(&dlm_node_addrs_spin);
if (!na)
@@ -353,19 +362,22 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
{
struct dlm_node_addr *na;
int rv = -EEXIST;
+ int addr_i;
spin_lock(&dlm_node_addrs_spin);
list_for_each_entry(na, &dlm_node_addrs, list) {
if (!na->addr_count)
continue;
- if (!addr_compare(na->addr[0], addr))
- continue;
-
- *nodeid = na->nodeid;
- rv = 0;
- break;
+ for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
+ if (addr_compare(na->addr[addr_i], addr)) {
+ *nodeid = na->nodeid;
+ rv = 0;
+ goto unlock;
+ }
+ }
}
+unlock:
spin_unlock(&dlm_node_addrs_spin);
return rv;
}
@@ -561,8 +573,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd)
static void sctp_init_failed_foreach(struct connection *con)
{
+
+ /*
+ * Don't try to recover base con and handle race where the
+ * other node's assoc init creates a assoc and we get that
+ * notification, then we get a notification that our attempt
+ * failed due. This happens when we are still trying the primary
+ * address, but the other node has already tried secondary addrs
+ * and found one that worked.
+ */
+ if (!con->nodeid || con->sctp_assoc)
+ return;
+
+ log_print("Retrying SCTP association init for node %d\n", con->nodeid);
+
+ con->try_new_addr = true;
con->sctp_assoc = 0;
- if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+ if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) {
if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
queue_work(send_workqueue, &con->swork);
}
@@ -579,15 +606,56 @@ static void sctp_init_failed(void)
mutex_unlock(&connections_lock);
}
+static void retry_failed_sctp_send(struct connection *recv_con,
+ struct sctp_send_failed *sn_send_failed,
+ char *buf)
+{
+ int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed);
+ struct dlm_mhandle *mh;
+ struct connection *con;
+ char *retry_buf;
+ int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
+
+ log_print("Retry sending %d bytes to node id %d", len, nodeid);
+
+ con = nodeid2con(nodeid, 0);
+ if (!con) {
+ log_print("Could not look up con for nodeid %d\n",
+ nodeid);
+ return;
+ }
+
+ mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf);
+ if (!mh) {
+ log_print("Could not allocate buf for retry.");
+ return;
+ }
+ memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len);
+ dlm_lowcomms_commit_buffer(mh);
+
+ /*
+ * If we got a assoc changed event before the send failed event then
+ * we only need to retry the send.
+ */
+ if (con->sctp_assoc) {
+ if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+ queue_work(send_workqueue, &con->swork);
+ } else
+ sctp_init_failed_foreach(con);
+}
+
/* Something happened to an association */
static void process_sctp_notification(struct connection *con,
struct msghdr *msg, char *buf)
{
union sctp_notification *sn = (union sctp_notification *)buf;
- if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+ switch (sn->sn_header.sn_type) {
+ case SCTP_SEND_FAILED:
+ retry_failed_sctp_send(con, &sn->sn_send_failed, buf);
+ break;
+ case SCTP_ASSOC_CHANGE:
switch (sn->sn_assoc_change.sac_state) {
-
case SCTP_COMM_UP:
case SCTP_RESTART:
{
@@ -662,9 +730,11 @@ static void process_sctp_notification(struct connection *con,
log_print("connecting to %d sctp association %d",
nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
+ new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id;
+ new_con->try_new_addr = false;
/* Send any pending writes */
clear_bit(CF_CONNECT_PENDING, &new_con->flags);
- clear_bit(CF_INIT_PENDING, &con->flags);
+ clear_bit(CF_INIT_PENDING, &new_con->flags);
if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
queue_work(send_workqueue, &new_con->swork);
}
@@ -683,14 +753,10 @@ static void process_sctp_notification(struct connection *con,
}
break;
- /* We don't know which INIT failed, so clear the PENDING flags
- * on them all. if assoc_id is zero then it will then try
- * again */
-
case SCTP_CANT_STR_ASSOC:
{
+ /* Will retry init when we get the send failed notification */
log_print("Can't start SCTP association - retrying");
- sctp_init_failed();
}
break;
@@ -699,6 +765,8 @@ static void process_sctp_notification(struct connection *con,
(int)sn->sn_assoc_change.sac_assoc_id,
sn->sn_assoc_change.sac_state);
}
+ default:
+ ; /* fall through */
}
}
@@ -958,6 +1026,24 @@ static void free_entry(struct writequeue_entry *e)
kfree(e);
}
+/*
+ * writequeue_entry_complete - try to delete and free write queue entry
+ * @e: write queue entry to try to delete
+ * @completed: bytes completed
+ *
+ * writequeue_lock must be held.
+ */
+static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
+{
+ e->offset += completed;
+ e->len -= completed;
+
+ if (e->len == 0 && e->users == 0) {
+ list_del(&e->list);
+ free_entry(e);
+ }
+}
+
/* Initiate an SCTP association.
This is a special case of send_to_sock() in that we don't yet have a
peeled-off socket for this association, so we use the listening socket
@@ -977,15 +1063,14 @@ static void sctp_init_assoc(struct connection *con)
int addrlen;
struct kvec iov[1];
+ mutex_lock(&con->sock_mutex);
if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
- return;
-
- if (con->retries++ > MAX_CONNECT_RETRIES)
- return;
+ goto unlock;
- if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) {
+ if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr,
+ con->try_new_addr)) {
log_print("no address for nodeid %d", con->nodeid);
- return;
+ goto unlock;
}
base_con = nodeid2con(0, 0);
BUG_ON(base_con == NULL);
@@ -1003,17 +1088,25 @@ static void sctp_init_assoc(struct connection *con)
if (list_empty(&con->writequeue)) {
spin_unlock(&con->writequeue_lock);
log_print("writequeue empty for nodeid %d", con->nodeid);
- return;
+ goto unlock;
}
e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
len = e->len;
offset = e->offset;
- spin_unlock(&con->writequeue_lock);
/* Send the first block off the write queue */
iov[0].iov_base = page_address(e->page)+offset;
iov[0].iov_len = len;
+ spin_unlock(&con->writequeue_lock);
+
+ if (rem_addr.ss_family == AF_INET) {
+ struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr;
+ log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr);
+ } else {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr;
+ log_print("Trying to connect to %pI6", &sin6->sin6_addr);
+ }
cmsg = CMSG_FIRSTHDR(&outmessage);
cmsg->cmsg_level = IPPROTO_SCTP;
@@ -1021,8 +1114,9 @@ static void sctp_init_assoc(struct connection *con)
cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
sinfo = CMSG_DATA(cmsg);
memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
- sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid());
+ sinfo->sinfo_ppid = cpu_to_le32(con->nodeid);
outmessage.msg_controllen = cmsg->cmsg_len;
+ sinfo->sinfo_flags |= SCTP_ADDR_OVER;
ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
if (ret < 0) {
@@ -1035,15 +1129,12 @@ static void sctp_init_assoc(struct connection *con)
}
else {
spin_lock(&con->writequeue_lock);
- e->offset += ret;
- e->len -= ret;
-
- if (e->len == 0 && e->users == 0) {
- list_del(&e->list);
- free_entry(e);
- }
+ writequeue_entry_complete(e, ret);
spin_unlock(&con->writequeue_lock);
}
+
+unlock:
+ mutex_unlock(&con->sock_mutex);
}
/* Connect a new socket to its peer */
@@ -1075,7 +1166,7 @@ static void tcp_connect_to_sock(struct connection *con)
goto out_err;
memset(&saddr, 0, sizeof(saddr));
- result = nodeid_to_addr(con->nodeid, &saddr, NULL);
+ result = nodeid_to_addr(con->nodeid, &saddr, NULL, false);
if (result < 0) {
log_print("no address for nodeid %d", con->nodeid);
goto out_err;
@@ -1254,6 +1345,7 @@ static int sctp_listen_for_all(void)
int result = -EINVAL, num = 1, i, addr_len;
struct connection *con = nodeid2con(0, GFP_NOFS);
int bufsize = NEEDED_RMEM;
+ int one = 1;
if (!con)
return -ENOMEM;
@@ -1288,6 +1380,11 @@ static int sctp_listen_for_all(void)
goto create_delsock;
}
+ result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
+ sizeof(one));
+ if (result < 0)
+ log_print("Could not set SCTP NODELAY error %d\n", result);
+
/* Init con struct */
sock->sk->sk_user_data = con;
con->sock = sock;
@@ -1493,13 +1590,7 @@ static void send_to_sock(struct connection *con)
}
spin_lock(&con->writequeue_lock);
- e->offset += ret;
- e->len -= ret;
-
- if (e->len == 0 && e->users == 0) {
- list_del(&e->list);
- free_entry(e);
- }
+ writequeue_entry_complete(e, ret);
}
spin_unlock(&con->writequeue_lock);
out:
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 01fd5c11a7fb..f704458ea5f5 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -247,6 +247,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
struct dlm_ls *ls;
struct plock_op *op;
int rv;
+ unsigned char fl_flags = fl->fl_flags;
ls = dlm_find_lockspace_local(lockspace);
if (!ls)
@@ -258,9 +259,18 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
goto out;
}
- if (posix_lock_file_wait(file, fl) < 0)
- log_error(ls, "dlm_posix_unlock: vfs unlock error %llx",
- (unsigned long long)number);
+ /* cause the vfs unlock to return ENOENT if lock is not found */
+ fl->fl_flags |= FL_EXISTS;
+
+ rv = posix_lock_file_wait(file, fl);
+ if (rv == -ENOENT) {
+ rv = 0;
+ goto out_free;
+ }
+ if (rv < 0) {
+ log_error(ls, "dlm_posix_unlock: vfs unlock error %d %llx",
+ rv, (unsigned long long)number);
+ }
op->info.optype = DLM_PLOCK_OP_UNLOCK;
op->info.pid = fl->fl_pid;
@@ -296,9 +306,11 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
if (rv == -ENOENT)
rv = 0;
+out_free:
kfree(op);
out:
dlm_put_lockspace(ls);
+ fl->fl_flags = fl_flags;
return rv;
}
EXPORT_SYMBOL_GPL(dlm_posix_unlock);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index d5c25db4398f..d10757635b9c 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -37,16 +37,8 @@
#include <asm/unaligned.h>
#include "ecryptfs_kernel.h"
-static int
-ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *dst_page, int dst_offset,
- struct page *src_page, int src_offset, int size,
- unsigned char *iv);
-static int
-ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *dst_page, int dst_offset,
- struct page *src_page, int src_offset, int size,
- unsigned char *iv);
+#define DECRYPT 0
+#define ENCRYPT 1
/**
* ecryptfs_to_hex
@@ -243,7 +235,7 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
struct ecryptfs_key_sig *key_sig, *key_sig_tmp;
if (crypt_stat->tfm)
- crypto_free_blkcipher(crypt_stat->tfm);
+ crypto_free_ablkcipher(crypt_stat->tfm);
if (crypt_stat->hash_tfm)
crypto_free_hash(crypt_stat->hash_tfm);
list_for_each_entry_safe(key_sig, key_sig_tmp,
@@ -319,26 +311,40 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
return i;
}
+struct extent_crypt_result { /* wait state shared between crypt_scatterlist() and extent_crypt_complete() */
+ struct completion completion; /* completed by extent_crypt_complete() */
+ int rc; /* status code stored by extent_crypt_complete() */
+};
+/* Request callback installed by crypt_scatterlist(): stores rc and completes the waiter. */
+static void extent_crypt_complete(struct crypto_async_request *req, int rc)
+{
+ struct extent_crypt_result *ecr = req->data;
+ /* -EINPROGRESS is not recorded; return without completing */
+ if (rc == -EINPROGRESS)
+ return;
+
+ ecr->rc = rc;
+ complete(&ecr->completion);
+}
+
/**
- * encrypt_scatterlist
+ * crypt_scatterlist
* @crypt_stat: Pointer to the crypt_stat struct to initialize.
- * @dest_sg: Destination of encrypted data
- * @src_sg: Data to be encrypted
- * @size: Length of data to be encrypted
- * @iv: iv to use during encryption
+ * @dst_sg: Destination of the data after performing the crypto operation
+ * @src_sg: Data to be encrypted or decrypted
+ * @size: Length of data
+ * @iv: IV to use
+ * @op: ENCRYPT or DECRYPT to indicate the desired operation
*
- * Returns the number of bytes encrypted; negative value on error
+ * Returns the number of bytes encrypted or decrypted; negative value on error
*/
-static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
- struct scatterlist *dest_sg,
- struct scatterlist *src_sg, int size,
- unsigned char *iv)
+static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
+ struct scatterlist *dst_sg,
+ struct scatterlist *src_sg, int size,
+ unsigned char *iv, int op)
{
- struct blkcipher_desc desc = {
- .tfm = crypt_stat->tfm,
- .info = iv,
- .flags = CRYPTO_TFM_REQ_MAY_SLEEP
- };
+ struct ablkcipher_request *req = NULL;
+ struct extent_crypt_result ecr;
int rc = 0;
BUG_ON(!crypt_stat || !crypt_stat->tfm
@@ -349,63 +355,88 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
ecryptfs_dump_hex(crypt_stat->key,
crypt_stat->key_size);
}
- /* Consider doing this once, when the file is opened */
+
+ init_completion(&ecr.completion);
+
mutex_lock(&crypt_stat->cs_tfm_mutex);
- if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
- rc = crypto_blkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
- crypt_stat->key_size);
- crypt_stat->flags |= ECRYPTFS_KEY_SET;
- }
- if (rc) {
- ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n",
- rc);
+ req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
+ if (!req) {
mutex_unlock(&crypt_stat->cs_tfm_mutex);
- rc = -EINVAL;
+ rc = -ENOMEM;
goto out;
}
- ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size);
- crypto_blkcipher_encrypt_iv(&desc, dest_sg, src_sg, size);
+
+ ablkcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ extent_crypt_complete, &ecr);
+ /* Consider doing this once, when the file is opened */
+ if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
+ rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
+ crypt_stat->key_size);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR,
+ "Error setting key; rc = [%d]\n",
+ rc);
+ mutex_unlock(&crypt_stat->cs_tfm_mutex);
+ rc = -EINVAL;
+ goto out;
+ }
+ crypt_stat->flags |= ECRYPTFS_KEY_SET;
+ }
mutex_unlock(&crypt_stat->cs_tfm_mutex);
+ ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
+ rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) :
+ crypto_ablkcipher_decrypt(req);
+ if (rc == -EINPROGRESS || rc == -EBUSY) {
+ struct extent_crypt_result *ecr = req->base.data;
+
+ wait_for_completion(&ecr->completion);
+ rc = ecr->rc;
+ INIT_COMPLETION(ecr->completion);
+ }
out:
+ ablkcipher_request_free(req);
return rc;
}
/**
- * ecryptfs_lower_offset_for_extent
+ * lower_offset_for_page
*
* Convert an eCryptfs page index into a lower byte offset
*/
-static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num,
- struct ecryptfs_crypt_stat *crypt_stat)
+static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
+ struct page *page)
{
- (*offset) = ecryptfs_lower_header_size(crypt_stat)
- + (crypt_stat->extent_size * extent_num);
+ return ecryptfs_lower_header_size(crypt_stat) +
+ ((loff_t)page->index << PAGE_CACHE_SHIFT); /* widen to loff_t before shifting so a narrower index type cannot drop high bits */
}
/**
- * ecryptfs_encrypt_extent
- * @enc_extent_page: Allocated page into which to encrypt the data in
- * @page
+ * crypt_extent
* @crypt_stat: crypt_stat containing cryptographic context for the
* encryption operation
- * @page: Page containing plaintext data extent to encrypt
+ * @dst_page: The page to write the result into
+ * @src_page: The page to read from
* @extent_offset: Page extent offset for use in generating IV
+ * @op: ENCRYPT or DECRYPT to indicate the desired operation
*
- * Encrypts one extent of data.
+ * Encrypts or decrypts one extent of data.
*
* Return zero on success; non-zero otherwise
*/
-static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
- struct ecryptfs_crypt_stat *crypt_stat,
- struct page *page,
- unsigned long extent_offset)
+static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
+ struct page *dst_page,
+ struct page *src_page,
+ unsigned long extent_offset, int op)
{
+ pgoff_t page_index = op == ENCRYPT ? src_page->index : dst_page->index;
loff_t extent_base;
char extent_iv[ECRYPTFS_MAX_IV_BYTES];
+ struct scatterlist src_sg, dst_sg;
+ size_t extent_size = crypt_stat->extent_size;
int rc;
- extent_base = (((loff_t)page->index)
- * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
+ extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size));
rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
(extent_base + extent_offset));
if (rc) {
@@ -414,15 +445,21 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
(unsigned long long)(extent_base + extent_offset), rc);
goto out;
}
- rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0,
- page, (extent_offset
- * crypt_stat->extent_size),
- crypt_stat->extent_size, extent_iv);
+
+ sg_init_table(&src_sg, 1);
+ sg_init_table(&dst_sg, 1);
+
+ sg_set_page(&src_sg, src_page, extent_size,
+ extent_offset * extent_size);
+ sg_set_page(&dst_sg, dst_page, extent_size,
+ extent_offset * extent_size);
+
+ rc = crypt_scatterlist(crypt_stat, &dst_sg, &src_sg, extent_size,
+ extent_iv, op);
if (rc < 0) {
- printk(KERN_ERR "%s: Error attempting to encrypt page with "
- "page->index = [%ld], extent_offset = [%ld]; "
- "rc = [%d]\n", __func__, page->index, extent_offset,
- rc);
+ printk(KERN_ERR "%s: Error attempting to crypt page with "
+ "page_index = [%ld], extent_offset = [%ld]; "
+ "rc = [%d]\n", __func__, page_index, extent_offset, rc);
goto out;
}
rc = 0;
@@ -453,6 +490,7 @@ int ecryptfs_encrypt_page(struct page *page)
char *enc_extent_virt;
struct page *enc_extent_page = NULL;
loff_t extent_offset;
+ loff_t lower_offset;
int rc = 0;
ecryptfs_inode = page->mapping->host;
@@ -466,75 +504,35 @@ int ecryptfs_encrypt_page(struct page *page)
"encrypted extent\n");
goto out;
}
- enc_extent_virt = kmap(enc_extent_page);
+
for (extent_offset = 0;
extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
extent_offset++) {
- loff_t offset;
-
- rc = ecryptfs_encrypt_extent(enc_extent_page, crypt_stat, page,
- extent_offset);
+ rc = crypt_extent(crypt_stat, enc_extent_page, page,
+ extent_offset, ENCRYPT);
if (rc) {
printk(KERN_ERR "%s: Error encrypting extent; "
"rc = [%d]\n", __func__, rc);
goto out;
}
- ecryptfs_lower_offset_for_extent(
- &offset, ((((loff_t)page->index)
- * (PAGE_CACHE_SIZE
- / crypt_stat->extent_size))
- + extent_offset), crypt_stat);
- rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt,
- offset, crypt_stat->extent_size);
- if (rc < 0) {
- ecryptfs_printk(KERN_ERR, "Error attempting "
- "to write lower page; rc = [%d]"
- "\n", rc);
- goto out;
- }
}
- rc = 0;
-out:
- if (enc_extent_page) {
- kunmap(enc_extent_page);
- __free_page(enc_extent_page);
- }
- return rc;
-}
-static int ecryptfs_decrypt_extent(struct page *page,
- struct ecryptfs_crypt_stat *crypt_stat,
- struct page *enc_extent_page,
- unsigned long extent_offset)
-{
- loff_t extent_base;
- char extent_iv[ECRYPTFS_MAX_IV_BYTES];
- int rc;
-
- extent_base = (((loff_t)page->index)
- * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
- rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
- (extent_base + extent_offset));
- if (rc) {
- ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
- "extent [0x%.16llx]; rc = [%d]\n",
- (unsigned long long)(extent_base + extent_offset), rc);
- goto out;
- }
- rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
- (extent_offset
- * crypt_stat->extent_size),
- enc_extent_page, 0,
- crypt_stat->extent_size, extent_iv);
+ lower_offset = lower_offset_for_page(crypt_stat, page);
+ enc_extent_virt = kmap(enc_extent_page);
+ rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
+ PAGE_CACHE_SIZE);
+ kunmap(enc_extent_page);
if (rc < 0) {
- printk(KERN_ERR "%s: Error attempting to decrypt to page with "
- "page->index = [%ld], extent_offset = [%ld]; "
- "rc = [%d]\n", __func__, page->index, extent_offset,
- rc);
+ ecryptfs_printk(KERN_ERR,
+ "Error attempting to write lower page; rc = [%d]\n",
+ rc);
goto out;
}
rc = 0;
out:
+ if (enc_extent_page) {
+ __free_page(enc_extent_page);
+ }
return rc;
}
@@ -558,43 +556,33 @@ int ecryptfs_decrypt_page(struct page *page)
{
struct inode *ecryptfs_inode;
struct ecryptfs_crypt_stat *crypt_stat;
- char *enc_extent_virt;
- struct page *enc_extent_page = NULL;
+ char *page_virt;
unsigned long extent_offset;
+ loff_t lower_offset;
int rc = 0;
ecryptfs_inode = page->mapping->host;
crypt_stat =
&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
- enc_extent_page = alloc_page(GFP_USER);
- if (!enc_extent_page) {
- rc = -ENOMEM;
- ecryptfs_printk(KERN_ERR, "Error allocating memory for "
- "encrypted extent\n");
+
+ lower_offset = lower_offset_for_page(crypt_stat, page);
+ page_virt = kmap(page);
+ rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE,
+ ecryptfs_inode);
+ kunmap(page);
+ if (rc < 0) {
+ ecryptfs_printk(KERN_ERR,
+ "Error attempting to read lower page; rc = [%d]\n",
+ rc);
goto out;
}
- enc_extent_virt = kmap(enc_extent_page);
+
for (extent_offset = 0;
extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
extent_offset++) {
- loff_t offset;
-
- ecryptfs_lower_offset_for_extent(
- &offset, ((page->index * (PAGE_CACHE_SIZE
- / crypt_stat->extent_size))
- + extent_offset), crypt_stat);
- rc = ecryptfs_read_lower(enc_extent_virt, offset,
- crypt_stat->extent_size,
- ecryptfs_inode);
- if (rc < 0) {
- ecryptfs_printk(KERN_ERR, "Error attempting "
- "to read lower page; rc = [%d]"
- "\n", rc);
- goto out;
- }
- rc = ecryptfs_decrypt_extent(page, crypt_stat, enc_extent_page,
- extent_offset);
+ rc = crypt_extent(crypt_stat, page, page,
+ extent_offset, DECRYPT);
if (rc) {
printk(KERN_ERR "%s: Error encrypting extent; "
"rc = [%d]\n", __func__, rc);
@@ -602,116 +590,9 @@ int ecryptfs_decrypt_page(struct page *page)
}
}
out:
- if (enc_extent_page) {
- kunmap(enc_extent_page);
- __free_page(enc_extent_page);
- }
- return rc;
-}
-
-/**
- * decrypt_scatterlist
- * @crypt_stat: Cryptographic context
- * @dest_sg: The destination scatterlist to decrypt into
- * @src_sg: The source scatterlist to decrypt from
- * @size: The number of bytes to decrypt
- * @iv: The initialization vector to use for the decryption
- *
- * Returns the number of bytes decrypted; negative value on error
- */
-static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
- struct scatterlist *dest_sg,
- struct scatterlist *src_sg, int size,
- unsigned char *iv)
-{
- struct blkcipher_desc desc = {
- .tfm = crypt_stat->tfm,
- .info = iv,
- .flags = CRYPTO_TFM_REQ_MAY_SLEEP
- };
- int rc = 0;
-
- /* Consider doing this once, when the file is opened */
- mutex_lock(&crypt_stat->cs_tfm_mutex);
- rc = crypto_blkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
- crypt_stat->key_size);
- if (rc) {
- ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n",
- rc);
- mutex_unlock(&crypt_stat->cs_tfm_mutex);
- rc = -EINVAL;
- goto out;
- }
- ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size);
- rc = crypto_blkcipher_decrypt_iv(&desc, dest_sg, src_sg, size);
- mutex_unlock(&crypt_stat->cs_tfm_mutex);
- if (rc) {
- ecryptfs_printk(KERN_ERR, "Error decrypting; rc = [%d]\n",
- rc);
- goto out;
- }
- rc = size;
-out:
return rc;
}
-/**
- * ecryptfs_encrypt_page_offset
- * @crypt_stat: The cryptographic context
- * @dst_page: The page to encrypt into
- * @dst_offset: The offset in the page to encrypt into
- * @src_page: The page to encrypt from
- * @src_offset: The offset in the page to encrypt from
- * @size: The number of bytes to encrypt
- * @iv: The initialization vector to use for the encryption
- *
- * Returns the number of bytes encrypted
- */
-static int
-ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *dst_page, int dst_offset,
- struct page *src_page, int src_offset, int size,
- unsigned char *iv)
-{
- struct scatterlist src_sg, dst_sg;
-
- sg_init_table(&src_sg, 1);
- sg_init_table(&dst_sg, 1);
-
- sg_set_page(&src_sg, src_page, size, src_offset);
- sg_set_page(&dst_sg, dst_page, size, dst_offset);
- return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
-}
-
-/**
- * ecryptfs_decrypt_page_offset
- * @crypt_stat: The cryptographic context
- * @dst_page: The page to decrypt into
- * @dst_offset: The offset in the page to decrypt into
- * @src_page: The page to decrypt from
- * @src_offset: The offset in the page to decrypt from
- * @size: The number of bytes to decrypt
- * @iv: The initialization vector to use for the decryption
- *
- * Returns the number of bytes decrypted
- */
-static int
-ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *dst_page, int dst_offset,
- struct page *src_page, int src_offset, int size,
- unsigned char *iv)
-{
- struct scatterlist src_sg, dst_sg;
-
- sg_init_table(&src_sg, 1);
- sg_set_page(&src_sg, src_page, size, src_offset);
-
- sg_init_table(&dst_sg, 1);
- sg_set_page(&dst_sg, dst_page, size, dst_offset);
-
- return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
-}
-
#define ECRYPTFS_MAX_SCATTERLIST_LEN 4
/**
@@ -746,8 +627,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
crypt_stat->cipher, "cbc");
if (rc)
goto out_unlock;
- crypt_stat->tfm = crypto_alloc_blkcipher(full_alg_name, 0,
- CRYPTO_ALG_ASYNC);
+ crypt_stat->tfm = crypto_alloc_ablkcipher(full_alg_name, 0, 0);
kfree(full_alg_name);
if (IS_ERR(crypt_stat->tfm)) {
rc = PTR_ERR(crypt_stat->tfm);
@@ -757,7 +637,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
crypt_stat->cipher);
goto out_unlock;
}
- crypto_blkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_ablkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
rc = 0;
out_unlock:
mutex_unlock(&crypt_stat->cs_tfm_mutex);
@@ -2182,12 +2062,11 @@ out:
*/
int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
size_t *plaintext_name_size,
- struct dentry *ecryptfs_dir_dentry,
+ struct super_block *sb,
const char *name, size_t name_size)
{
struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
- &ecryptfs_superblock_to_private(
- ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
+ &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
char *decoded_name;
size_t decoded_name_size;
size_t packet_size;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index dd299b389d4e..df19d34a033b 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -38,6 +38,7 @@
#include <linux/nsproxy.h>
#include <linux/backing-dev.h>
#include <linux/ecryptfs.h>
+#include <linux/crypto.h>
#define ECRYPTFS_DEFAULT_IV_BYTES 16
#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096
@@ -233,7 +234,7 @@ struct ecryptfs_crypt_stat {
size_t extent_shift;
unsigned int extent_mask;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
- struct crypto_blkcipher *tfm;
+ struct crypto_ablkcipher *tfm;
struct crypto_hash *hash_tfm; /* Crypto context for generating
* the initialization vectors */
unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
@@ -574,7 +575,7 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
struct inode *ecryptfs_inode);
int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
size_t *decrypted_name_size,
- struct dentry *ecryptfs_dentry,
+ struct super_block *sb,
const char *name, size_t name_size);
int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
int ecryptfs_encrypt_and_encode_filename(
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 63b1f54b6a1f..992cf95830b5 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,6 +31,7 @@
#include <linux/security.h>
#include <linux/compat.h>
#include <linux/fs_stack.h>
+#include <linux/aio.h>
#include "ecryptfs_kernel.h"
/**
@@ -48,7 +49,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
unsigned long nr_segs, loff_t pos)
{
ssize_t rc;
- struct path lower;
+ struct path *path;
struct file *file = iocb->ki_filp;
rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
@@ -59,17 +60,16 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
if (-EIOCBQUEUED == rc)
rc = wait_on_sync_kiocb(iocb);
if (rc >= 0) {
- lower.dentry = ecryptfs_dentry_to_lower(file->f_path.dentry);
- lower.mnt = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry);
- touch_atime(&lower);
+ path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
+ touch_atime(path);
}
return rc;
}
struct ecryptfs_getdents_callback {
- void *dirent;
- struct dentry *dentry;
- filldir_t filldir;
+ struct dir_context ctx; /* context passed to iterate_dir() on the lower file */
+ struct dir_context *caller; /* the dir_context ecryptfs_readdir() was called with */
+ struct super_block *sb; /* passed to ecryptfs_decode_and_decrypt_filename() */
int filldir_called; /* incremented on every ecryptfs_filldir() call */
int entries_written; /* incremented when ecryptfs_filldir() emits an entry to the caller */
};
@@ -87,7 +87,7 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
buf->filldir_called++;
rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
- buf->dentry, lower_name,
+ buf->sb, lower_name,
lower_namelen);
if (rc) {
printk(KERN_ERR "%s: Error attempting to decode and decrypt "
@@ -95,9 +95,10 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
rc);
goto out;
}
- rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type);
+ buf->caller->pos = buf->ctx.pos;
+ rc = !dir_emit(buf->caller, name, name_size, ino, d_type);
kfree(name);
- if (rc >= 0)
+ if (!rc)
buf->entries_written++;
out:
return rc;
@@ -106,27 +107,22 @@ out:
/**
* ecryptfs_readdir
* @file: The eCryptfs directory file
- * @dirent: Directory entry handle
- * @filldir: The filldir callback function
+ * @ctx: The actor to feed the entries to
*/
-static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
{
int rc;
struct file *lower_file;
- struct inode *inode;
- struct ecryptfs_getdents_callback buf;
-
+ struct inode *inode = file_inode(file);
+ struct ecryptfs_getdents_callback buf = {
+ .ctx.actor = ecryptfs_filldir,
+ .caller = ctx,
+ .sb = inode->i_sb,
+ };
lower_file = ecryptfs_file_to_lower(file);
- lower_file->f_pos = file->f_pos;
- inode = file_inode(file);
- memset(&buf, 0, sizeof(buf));
- buf.dirent = dirent;
- buf.dentry = file->f_path.dentry;
- buf.filldir = filldir;
- buf.filldir_called = 0;
- buf.entries_written = 0;
- rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
- file->f_pos = lower_file->f_pos;
+ lower_file->f_pos = ctx->pos;
+ rc = iterate_dir(lower_file, &buf.ctx);
+ ctx->pos = buf.ctx.pos;
if (rc < 0)
goto out;
if (buf.filldir_called && !buf.entries_written)
@@ -294,6 +290,12 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
static int
ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
+ int rc;
+ /* write out and wait on this file's own mapping first; start/end are not used */
+ rc = filemap_write_and_wait(file->f_mapping);
+ if (rc)
+ return rc;
+ /* then sync the lower file, forwarding datasync unchanged */
return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
}
@@ -337,7 +339,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
#endif
const struct file_operations ecryptfs_dir_fops = {
- .readdir = ecryptfs_readdir,
+ .iterate = ecryptfs_readdir,
.read = generic_read_dir,
.unlocked_ioctl = ecryptfs_unlocked_ioctl,
#ifdef CONFIG_COMPAT
@@ -358,7 +360,7 @@ const struct file_operations ecryptfs_main_fops = {
.aio_read = ecryptfs_read_update_atime,
.write = do_sync_write,
.aio_write = generic_file_aio_write,
- .readdir = ecryptfs_readdir,
+ .iterate = ecryptfs_readdir,
.unlocked_ioctl = ecryptfs_unlocked_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ecryptfs_compat_ioctl,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5eab400e2590..67e9b6339691 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -358,7 +358,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
- BUG_ON(!lower_dentry->d_count);
+ BUG_ON(!d_count(lower_dentry));
ecryptfs_set_dentry_private(dentry, dentry_info);
ecryptfs_set_dentry_lower(dentry, lower_dentry);
@@ -679,7 +679,7 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
set_fs(old_fs);
if (rc < 0)
goto out;
- rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
+ rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb,
lower_buf, rc);
out:
kfree(lower_buf);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e924cf45aad9..eb1c5979ecaf 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -120,16 +120,15 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
struct file **lower_file)
{
const struct cred *cred = current_cred();
- struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+ struct path *path = ecryptfs_dentry_to_lower_path(dentry);
int rc;
- rc = ecryptfs_privileged_open(lower_file, lower_dentry, lower_mnt,
+ rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
cred);
if (rc) {
printk(KERN_ERR "Error opening lower file "
"for lower_dentry [0x%p] and lower_mnt [0x%p]; "
- "rc = [%d]\n", lower_dentry, lower_mnt, rc);
+ "rc = [%d]\n", path->dentry, path->mnt, rc);
(*lower_file) = NULL;
}
return rc;
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 49ff8ea08f1c..e57380e5f6bd 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -247,14 +247,13 @@ int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
goto unlock;
}
msg_size = (sizeof(*msg) + msg->data_len);
- msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL);
+ msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL);
if (!msg_ctx->msg) {
rc = -ENOMEM;
printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
"GFP_KERNEL memory\n", __func__, msg_size);
goto unlock;
}
- memcpy(msg_ctx->msg, msg, msg_size);
msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE;
wake_up_process(msg_ctx->task);
rc = 0;
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 6a160539cd23..09fe622274e4 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -232,17 +232,10 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
struct inode *ecryptfs_inode)
{
struct file *lower_file;
- mm_segment_t fs_save;
- ssize_t rc;
-
lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file;
if (!lower_file)
return -EIO;
- fs_save = get_fs();
- set_fs(get_ds());
- rc = vfs_read(lower_file, data, size, &offset);
- set_fs(fs_save);
- return rc;
+ return kernel_read(lower_file, offset, data, size);
}
/**
diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig
new file mode 100644
index 000000000000..367bbb10c543
--- /dev/null
+++ b/fs/efivarfs/Kconfig
@@ -0,0 +1,12 @@
+config EFIVAR_FS
+ tristate "EFI Variable filesystem"
+ depends on EFI
+ help
+ efivarfs is a replacement filesystem for the old EFI
+ variable support via sysfs, as it doesn't suffer from the
+ same 1024-byte variable size limit.
+
+ To compile this file system support as a module, choose M
+ here. The module will be called efivarfs.
+
+ If unsure, say N.
diff --git a/fs/efivarfs/Makefile b/fs/efivarfs/Makefile
new file mode 100644
index 000000000000..955d478177d5
--- /dev/null
+++ b/fs/efivarfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the efivarfs filesystem
+#
+
+obj-$(CONFIG_EFIVAR_FS) += efivarfs.o
+
+efivarfs-objs := inode.o file.o super.o
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
new file mode 100644
index 000000000000..8dd524f32284
--- /dev/null
+++ b/fs/efivarfs/file.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/efi.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+/*
+ * Write handler: userbuf holds a u32 attributes word followed by the data
+ * that is passed to efivar_entry_set_get_size(). Returns count on success
+ * or a negative errno. ppos is not used.
+ */
+static ssize_t efivarfs_file_write(struct file *file,
+ const char __user *userbuf, size_t count, loff_t *ppos)
+{
+ struct efivar_entry *var = file->private_data;
+ void *data;
+ u32 attributes;
+ struct inode *inode = file->f_mapping->host;
+ unsigned long datasize = count - sizeof(attributes); /* not used until count has been checked below */
+ ssize_t bytes = 0;
+ bool set = false;
+ /* the buffer must at least contain the attributes word */
+ if (count < sizeof(attributes))
+ return -EINVAL;
+
+ if (copy_from_user(&attributes, userbuf, sizeof(attributes)))
+ return -EFAULT;
+ /* reject any attribute bit outside EFI_VARIABLE_MASK */
+ if (attributes & ~(EFI_VARIABLE_MASK))
+ return -EINVAL;
+
+ data = kmalloc(datasize, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ /* the payload follows the attributes word in the user buffer */
+ if (copy_from_user(data, userbuf + sizeof(attributes), datasize)) {
+ bytes = -EFAULT;
+ goto out;
+ }
+
+ bytes = efivar_entry_set_get_size(var, attributes, &datasize,
+ data, &set);
+ if (!set && bytes) { /* error with 'set' still false */
+ if (bytes == -ENOENT)
+ bytes = -EIO; /* -ENOENT is reported to the caller as -EIO on this path */
+ goto out;
+ }
+ /* NOTE(review): -ENOENT with 'set' true presumably means the variable is gone - confirm against efivar_entry_set_get_size() */
+ if (bytes == -ENOENT) {
+ drop_nlink(inode);
+ d_delete(file->f_dentry);
+ dput(file->f_dentry);
+ } else {
+ mutex_lock(&inode->i_mutex);
+ i_size_write(inode, datasize + sizeof(attributes)); /* inode size = attributes word + datasize */
+ mutex_unlock(&inode->i_mutex);
+ }
+ /* report the whole buffer as consumed */
+ bytes = count;
+
+out:
+ kfree(data);
+
+ return bytes;
+}
+/* Read handler: builds a buffer of the u32 attributes followed by the data and copies from it at *ppos. */
+static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct efivar_entry *var = file->private_data;
+ unsigned long datasize = 0;
+ u32 attributes;
+ void *data;
+ ssize_t size = 0;
+ int err;
+
+ err = efivar_entry_size(var, &datasize);
+
+ /*
+ * efivarfs represents uncommitted variables with
+ * zero-length files. Reading them should return EOF.
+ */
+ if (err == -ENOENT)
+ return 0;
+ else if (err)
+ return err;
+ /* room for the attributes word plus datasize bytes */
+ data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL);
+
+ if (!data)
+ return -ENOMEM;
+ /* the data is placed after the slot reserved for the attributes word */
+ size = efivar_entry_get(var, &attributes, &datasize,
+ data + sizeof(attributes));
+ if (size) /* a non-zero result is returned to the caller as-is */
+ goto out_free;
+
+ memcpy(data, &attributes, sizeof(attributes));
+ size = simple_read_from_buffer(userbuf, count, ppos,
+ data, datasize + sizeof(attributes));
+out_free:
+ kfree(data);
+
+ return size;
+}
+/* File operations wiring up the efivarfs read and write handlers defined in this file. */
+const struct file_operations efivarfs_file_operations = {
+ .open = simple_open,
+ .read = efivarfs_file_read,
+ .write = efivarfs_file_write,
+ .llseek = no_llseek,
+};
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
new file mode 100644
index 000000000000..7e787fb90293
--- /dev/null
+++ b/fs/efivarfs/inode.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/efi.h>
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+
+/*
+ * Allocate a new inode on @sb and initialise it for efivarfs.
+ *
+ * The inode gets a fresh inode number, @mode as its mode and the current
+ * time for all three timestamps. Regular files receive the efivarfs file
+ * operations; directories receive the efivarfs directory inode operations,
+ * simple_dir_operations and one additional link. Any other file type is
+ * returned without operations attached. @dir and @dev are not used.
+ *
+ * Returns the new inode, or NULL if new_inode() fails.
+ */
+struct inode *efivarfs_get_inode(struct super_block *sb,
+				const struct inode *dir, int mode, dev_t dev)
+{
+	struct inode *node = new_inode(sb);
+	int type = mode & S_IFMT;
+
+	if (!node)
+		return NULL;
+
+	node->i_ino = get_next_ino();
+	node->i_mode = mode;
+	node->i_atime = node->i_mtime = node->i_ctime = CURRENT_TIME;
+
+	if (type == S_IFREG) {
+		node->i_fop = &efivarfs_file_operations;
+	} else if (type == S_IFDIR) {
+		node->i_op = &efivarfs_dir_inode_operations;
+		node->i_fop = &simple_dir_operations;
+		inc_nlink(node);
+	}
+
+	return node;
+}
+
+/*
+ * Return true if 'str' is a valid efivarfs filename of the form,
+ *
+ *	VariableName-12345678-1234-1234-1234-1234567891bc
+ *
+ * @str: candidate name; only the first 'len' bytes are examined
+ * @len: length of 'str' in bytes
+ *
+ * The name must end in a GUID of EFI_VARIABLE_GUID_LEN characters (hex
+ * digits with '-' at offsets 8, 13, 18 and 23), preceded by a '-' and by
+ * at least one character of variable name.
+ */
+bool efivarfs_valid_name(const char *str, int len)
+{
+	static const char dashes[EFI_VARIABLE_GUID_LEN] = {
+		[8] = 1, [13] = 1, [18] = 1, [23] = 1
+	};
+	const char *s;
+	int i;
+
+	/*
+	 * We need a GUID, plus at least one letter for the variable name,
+	 * plus the '-' separator
+	 */
+	if (len < EFI_VARIABLE_GUID_LEN + 2)
+		return false;
+
+	/*
+	 * Only form the pointer to the GUID once the length is known to be
+	 * sufficient, so it can never point in front of 'str'.
+	 */
+	s = str + len - EFI_VARIABLE_GUID_LEN;
+
+	/* GUID must be preceded by a '-' */
+	if (*(s - 1) != '-')
+		return false;
+
+	/*
+	 * Validate that 's' is of the correct format, e.g.
+	 *
+	 *	12345678-1234-1234-1234-123456789abc
+	 */
+	for (i = 0; i < EFI_VARIABLE_GUID_LEN; i++) {
+		if (dashes[i]) {
+			if (*s++ != '-')
+				return false;
+		} else {
+			if (!isxdigit(*s++))
+				return false;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Fill the 16 bytes of @guid from the textual GUID at @str.
+ *
+ * Each output byte is built from two adjacent hex digits of @str. The
+ * first three dash-separated groups are stored with their bytes in reverse
+ * order; the remaining digits are stored in the order they appear. The
+ * digits are not validated here.
+ */
+static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid)
+{
+	/* Offset within @str of the high hex digit for each guid->b[] byte */
+	static const unsigned char hi_digit[16] = {
+		 6,  4,  2,  0, 11,  9, 16, 14,
+		19, 21, 24, 26, 28, 30, 32, 34
+	};
+	int i;
+
+	for (i = 0; i < 16; i++)
+		guid->b[i] = hex_to_bin(str[hi_digit[i]]) << 4 |
+			     hex_to_bin(str[hi_digit[i] + 1]);
+}
+
+static int efivarfs_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool excl)
+{ /*
+ * Create handler for the efivarfs directory.
+ *
+ * The name in @dentry must have the form "<VariableName>-<GUID>". It is
+ * split into a variable name and a vendor GUID, which are stored in a
+ * freshly zeroed efivar_entry. The entry is attached to the new inode via
+ * i_private and added to efivarfs_list. @excl is not consulted.
+ *
+ * Returns 0 on success, -EINVAL for a malformed name, or -ENOMEM if the
+ * inode or the entry cannot be allocated.
+ */
+ struct inode *inode;
+ struct efivar_entry *var;
+ int namelen, i = 0, err = 0;
+
+ if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len))
+ return -EINVAL;
+
+ inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0);
+ if (!inode)
+ return -ENOMEM;
+
+ var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL);
+ if (!var) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* length of the variable name itself: remove GUID and separator */
+ namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1;
+
+ efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1, /* the GUID text starts right after the '-' separator */
+ &var->var.VendorGuid);
+
+ for (i = 0; i < namelen; i++)
+ var->var.VariableName[i] = dentry->d_name.name[i]; /* one file-name byte per VariableName element */
+
+ var->var.VariableName[i] = '\0';
+
+ inode->i_private = var;
+
+ efivar_entry_add(var, &efivarfs_list);
+ d_instantiate(dentry, inode);
+ dget(dentry); /* extra reference; NOTE(review): presumably pins the dentry until the dput() in unlink - confirm */
+out:
+ if (err) { /* err is only set when the entry allocation failed, so var is NULL here */
+ kfree(var);
+ iput(inode);
+ }
+ return err;
+}
+
+/*
+ * Unlink handler: delete the EFI variable behind @dentry.
+ *
+ * The efivar_entry is taken from the inode's i_private. Once
+ * efivar_entry_delete() succeeds, the inode's link count is dropped and
+ * one reference on @dentry is released. @dir is not used.
+ *
+ * Returns 0 on success, or -EINVAL if efivar_entry_delete() reports any
+ * failure (its own error code is not passed on).
+ */
+static int efivarfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct efivar_entry *var = dentry->d_inode->i_private;
+
+	if (efivar_entry_delete(var))
+		return -EINVAL;
+
+	drop_nlink(dentry->d_inode);
+	dput(dentry);
+	return 0;
+}
+
+/*
+ * Handle negative dentry.
+ *
+ * This lookup never attaches an inode: a name longer than NAME_MAX is
+ * rejected with -ENAMETOOLONG, any other name is added with d_add() and a
+ * NULL inode, and NULL is returned. @dir and @flags are not used.
+ */
+static struct dentry *efivarfs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+ d_add(dentry, NULL);
+ return NULL;
+}
+
+const struct inode_operations efivarfs_dir_inode_operations = { /* inode operations set on S_IFDIR inodes by efivarfs_get_inode() */
+ .lookup = efivarfs_lookup, /* only ever produces negative dentries */
+ .unlink = efivarfs_unlink, /* deletes the variable via efivar_entry_delete() */
+ .create = efivarfs_create, /* adds a new entry to efivarfs_list */
+};
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
new file mode 100644
index 000000000000..b5ff16addb7c
--- /dev/null
+++ b/fs/efivarfs/internal.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef EFIVAR_FS_INTERNAL_H
+#define EFIVAR_FS_INTERNAL_H
+
+#include <linux/list.h>
+
+extern const struct file_operations efivarfs_file_operations;
+extern const struct inode_operations efivarfs_dir_inode_operations;
+extern bool efivarfs_valid_name(const char *str, int len);
+extern struct inode *efivarfs_get_inode(struct super_block *sb,
+ const struct inode *dir, int mode, dev_t dev);
+
+extern struct list_head efivarfs_list;
+
+#endif /* EFIVAR_FS_INTERNAL_H */
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
new file mode 100644
index 000000000000..a8766b880c07
--- /dev/null
+++ b/fs/efivarfs/super.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ctype.h>
+#include <linux/efi.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/ucs2_string.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+
+#include "internal.h"
+
+LIST_HEAD(efivarfs_list);
+
+static void efivarfs_evict_inode(struct inode *inode)
+{ /* evict_inode hook from efivarfs_ops: only calls clear_inode(); the efivar_entry in i_private is not freed here */
+ clear_inode(inode);
+}
+
+static const struct super_operations efivarfs_ops = { /* installed as sb->s_op by efivarfs_fill_super() */
+ .statfs = simple_statfs,
+ .drop_inode = generic_delete_inode, /* NOTE(review): presumably makes inodes go away on the final iput instead of being cached - confirm */
+ .evict_inode = efivarfs_evict_inode,
+ .show_options = generic_show_options,
+};
+
+static struct super_block *efivarfs_sb;
+
+/*
+ * Compare two efivarfs file names.
+ *
+ * An efivarfs filename is composed of two parts,
+ *
+ * 1. A case-sensitive variable name
+ * 2. A case-insensitive GUID
+ *
+ * So we need to perform a case-sensitive match on part 1 and a
+ * case-insensitive match on part 2.
+ *
+ * One name is given as @str/@len, the other as @name. @parent and @dentry
+ * are not used. Returns 0 when the names match and non-zero otherwise.
+ *
+ * NOTE(review): 'guid' goes negative if @len is below
+ * EFI_VARIABLE_GUID_LEN; this presumably cannot happen because names are
+ * validated in efivarfs_d_hash() first - confirm.
+ */
+static int efivarfs_d_compare(const struct dentry *parent,
+ const struct dentry *dentry,
+ unsigned int len, const char *str,
+ const struct qstr *name)
+{
+ int guid = len - EFI_VARIABLE_GUID_LEN; /* offset at which the GUID part starts */
+
+ if (name->len != len)
+ return 1;
+
+ /* Case-sensitive compare for the variable name */
+ if (memcmp(str, name->name, guid))
+ return 1;
+
+ /* Case-insensitive compare for the GUID */
+ return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN);
+}
+
+/*
+ * Hash an efivarfs file name into qstr->hash.
+ *
+ * The variable-name part (including the '-' separator) is hashed as-is,
+ * while the trailing GUID is hashed in lower case so that names differing
+ * only in GUID case get the same hash, matching efivarfs_d_compare().
+ * Every character of the name contributes to the hash. @dentry is not
+ * used.
+ *
+ * Returns 0 on success or -EINVAL if the name fails efivarfs_valid_name().
+ */
+static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
+{
+	unsigned long hash = init_name_hash();
+	const unsigned char *s = qstr->name;
+	unsigned int len = qstr->len;
+	unsigned int guid_start, i;
+
+	if (!efivarfs_valid_name(s, len))
+		return -EINVAL;
+
+	/* A valid name is always longer than the GUID, so this cannot wrap */
+	guid_start = len - EFI_VARIABLE_GUID_LEN;
+
+	/* Variable name and separator are case-sensitive. */
+	for (i = 0; i < guid_start; i++)
+		hash = partial_name_hash(s[i], hash);
+
+	/* GUID is case-insensitive: fold all EFI_VARIABLE_GUID_LEN characters. */
+	for (; i < len; i++)
+		hash = partial_name_hash(tolower(s[i]), hash);
+
+	qstr->hash = end_name_hash(hash);
+	return 0;
+}
+
+/*
+ * Retaining negative dentries for an in-memory filesystem just wastes
+ * memory and lookup time: arrange for them to be deleted immediately.
+ *
+ * Used as the d_delete operation in efivarfs_d_ops; returns 1
+ * unconditionally, for every dentry it is asked about.
+ */
+static int efivarfs_delete_dentry(const struct dentry *dentry)
+{
+ return 1;
+}
+
+/*
+ * Dentry operations installed as the superblock default (sb->s_d_op) by
+ * efivarfs_fill_super(). The table is never modified, so it is const.
+ */
+static const struct dentry_operations efivarfs_d_ops = {
+	.d_compare = efivarfs_d_compare,
+	.d_hash = efivarfs_d_hash,
+	.d_delete = efivarfs_delete_dentry,
+};
+
+/*
+ * Allocate a dentry called @name beneath @parent.
+ *
+ * The name is run through efivarfs_d_hash() first, which both validates
+ * it and fills in the hash that d_alloc() receives.
+ *
+ * Returns the new dentry, ERR_PTR(-ENOMEM) if d_alloc() fails, or an
+ * ERR_PTR() carrying the error reported by efivarfs_d_hash().
+ */
+static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
+{
+	struct dentry *new_dentry;
+	struct qstr qname;
+	int ret;
+
+	qname.name = name;
+	qname.len = strlen(name);
+
+	ret = efivarfs_d_hash(NULL, &qname);
+	if (ret)
+		return ERR_PTR(ret);
+
+	new_dentry = d_alloc(parent, &qname);
+	if (!new_dentry)
+		return ERR_PTR(-ENOMEM);
+
+	return new_dentry;
+}
+
+static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
+ unsigned long name_size, void *data)
+{ /*
+ * Callback handed to efivar_init() by efivarfs_fill_super(); @data is the
+ * super_block being filled.
+ *
+ * For the variable (@name16, @vendor) it allocates an efivar_entry, builds
+ * the file name "<name>-<guid>", creates an S_IFREG | 0644 inode and a
+ * dentry under the root, sets the inode size to the variable size plus
+ * sizeof(Attributes), adds the entry to efivarfs_list and hashes the
+ * dentry with d_add().
+ *
+ * Returns 0 on success, -ENOMEM if the entry, name or inode cannot be
+ * allocated, or the error from efivarfs_alloc_dentry().
+ */
+ struct super_block *sb = (struct super_block *)data;
+ struct efivar_entry *entry;
+ struct inode *inode = NULL;
+ struct dentry *dentry, *root = sb->s_root;
+ unsigned long size = 0;
+ char *name;
+ int len, i;
+ int err = -ENOMEM;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL); /* not zeroed: only the fields written below are initialised here */
+ if (!entry)
+ return err;
+
+ memcpy(entry->var.VariableName, name16, name_size); /* NOTE(review): name_size is trusted to fit VariableName - confirm the caller bounds it */
+ memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t));
+
+ len = ucs2_strlen(entry->var.VariableName);
+
+ /* name, plus '-', plus GUID, plus NUL*/
+ name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL);
+ if (!name)
+ goto fail;
+
+ for (i = 0; i < len; i++)
+ name[i] = entry->var.VariableName[i] & 0xFF; /* keep only the low byte of each 16-bit character */
+
+ name[len] = '-';
+
+ efi_guid_unparse(&entry->var.VendorGuid, name + len + 1);
+
+ name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
+
+ inode = efivarfs_get_inode(sb, root->d_inode, S_IFREG | 0644, 0);
+ if (!inode)
+ goto fail_name;
+
+ dentry = efivarfs_alloc_dentry(root, name);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto fail_inode;
+ }
+
+ /* copied by the above to local storage in the dentry. */
+ kfree(name);
+
+ efivar_entry_size(entry, &size); /* return value is not checked; size was initialised to 0 above */
+ efivar_entry_add(entry, &efivarfs_list);
+
+ mutex_lock(&inode->i_mutex);
+ inode->i_private = entry;
+ i_size_write(inode, size + sizeof(entry->var.Attributes)); /* file size = attribute word + variable data */
+ mutex_unlock(&inode->i_mutex);
+ d_add(dentry, inode);
+
+ return 0;
+
+fail_inode:
+ iput(inode);
+fail_name:
+ kfree(name);
+fail:
+ kfree(entry);
+ return err;
+}
+
+static int efivarfs_destroy(struct efivar_entry *entry, void *data)
+{ /*
+ * Iterator callback passed to __efivar_entry_iter(): detaches @entry with
+ * efivar_entry_remove() and frees it. @data is unused; always returns 0.
+ */
+ efivar_entry_remove(entry);
+ kfree(entry);
+ return 0;
+}
+
+static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
+{ /*
+ * Fill callback given to mount_single() by efivarfs_mount().
+ *
+ * Sets up the superblock fields and the root directory, then calls
+ * efivar_init() with efivarfs_callback() to create one file per variable.
+ * @data and @silent are not used.
+ *
+ * Returns 0 on success, -ENOMEM if the root inode or root dentry cannot be
+ * created, or the error from efivar_init().
+ */
+ struct inode *inode = NULL;
+ struct dentry *root;
+ int err;
+
+ efivarfs_sb = sb;
+
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_blocksize = PAGE_CACHE_SIZE;
+ sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_magic = EFIVARFS_MAGIC;
+ sb->s_op = &efivarfs_ops;
+ sb->s_d_op = &efivarfs_d_ops; /* default dentry ops: efivarfs name hashing and comparison */
+ sb->s_time_gran = 1;
+
+ inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0);
+ if (!inode)
+ return -ENOMEM;
+ inode->i_op = &efivarfs_dir_inode_operations; /* same value efivarfs_get_inode() already set for S_IFDIR */
+
+ root = d_make_root(inode);
+ sb->s_root = root;
+ if (!root)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&efivarfs_list);
+
+ err = efivar_init(efivarfs_callback, (void *)sb, false,
+ true, &efivarfs_list);
+ if (err) /* on failure, unlink and free every entry still on efivarfs_list */
+ __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
+
+ return err;
+}
+
+static struct dentry *efivarfs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{ /* mount hook: delegates to mount_single() with efivarfs_fill_super() as the fill callback; dev_name is unused */
+ return mount_single(fs_type, flags, data, efivarfs_fill_super);
+}
+
+static void efivarfs_kill_sb(struct super_block *sb)
+{ /* kill_sb hook: tear the superblock down, clear the cached pointer, then release the variable list */
+ kill_litter_super(sb);
+ efivarfs_sb = NULL;
+
+ /* Remove all entries and destroy: efivarfs_destroy() unlinks and frees each entry on efivarfs_list */
+ __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
+}
+
+static struct file_system_type efivarfs_type = { /* filesystem type registered by efivarfs_init() */
+ .name = "efivarfs",
+ .mount = efivarfs_mount, /* mount_single() based */
+ .kill_sb = efivarfs_kill_sb, /* also frees the entries on efivarfs_list */
+};
+
+/*
+ * Module init: register the efivarfs filesystem type.
+ *
+ * Registration is skipped, and 0 is still returned, when
+ * efi_enabled(EFI_RUNTIME_SERVICES) is false or when efivars_kobject()
+ * returns NULL. Otherwise the result of register_filesystem() is returned.
+ */
+static __init int efivarfs_init(void)
+{
+	/* Nothing to expose in either case: succeed without registering */
+	if (!efi_enabled(EFI_RUNTIME_SERVICES) || !efivars_kobject())
+		return 0;
+
+	return register_filesystem(&efivarfs_type);
+}
+
+MODULE_AUTHOR("Matthew Garrett, Jeremy Kerr");
+MODULE_DESCRIPTION("EFI Variable Filesystem");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_FS("efivarfs");
+
+module_init(efivarfs_init);
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 055a9e9ca747..b72307ccdf7a 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -7,40 +7,38 @@
#include <linux/buffer_head.h>
#include "efs.h"
-static int efs_readdir(struct file *, void *, filldir_t);
+static int efs_readdir(struct file *, struct dir_context *);
const struct file_operations efs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = efs_readdir,
+ .iterate = efs_readdir,
};
const struct inode_operations efs_dir_inode_operations = {
.lookup = efs_lookup,
};
-static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
- struct inode *inode = file_inode(filp);
- struct buffer_head *bh;
-
- struct efs_dir *dirblock;
- struct efs_dentry *dirslot;
- efs_ino_t inodenum;
+static int efs_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
efs_block_t block;
- int slot, namelen;
- char *nameptr;
+ int slot;
if (inode->i_size & (EFS_DIRBSIZE-1))
printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n");
/* work out where this entry can be found */
- block = filp->f_pos >> EFS_DIRBSIZE_BITS;
+ block = ctx->pos >> EFS_DIRBSIZE_BITS;
/* each block contains at most 256 slots */
- slot = filp->f_pos & 0xff;
+ slot = ctx->pos & 0xff;
/* look at all blocks */
while (block < inode->i_blocks) {
+ struct efs_dir *dirblock;
+ struct buffer_head *bh;
+
/* read the dir block */
bh = sb_bread(inode->i_sb, efs_bmap(inode, block));
@@ -57,11 +55,14 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
break;
}
- while (slot < dirblock->slots) {
- if (dirblock->space[slot] == 0) {
- slot++;
+ for (; slot < dirblock->slots; slot++) {
+ struct efs_dentry *dirslot;
+ efs_ino_t inodenum;
+ const char *nameptr;
+ int namelen;
+
+ if (dirblock->space[slot] == 0)
continue;
- }
dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot));
@@ -72,39 +73,29 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
#ifdef DEBUG
printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen);
#endif
- if (namelen > 0) {
- /* found the next entry */
- filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-
- /* copy filename and data in dirslot */
- filldir(dirent, nameptr, namelen, filp->f_pos, inodenum, DT_UNKNOWN);
-
- /* sanity check */
- if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
- printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
- slot++;
- continue;
- }
-
- /* store position of next slot */
- if (++slot == dirblock->slots) {
- slot = 0;
- block++;
- }
+ if (!namelen)
+ continue;
+ /* found the next entry */
+ ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
+
+ /* sanity check */
+ if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
+ printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
+ continue;
+ }
+
+ /* copy filename and data in dirslot */
+ if (!dir_emit(ctx, nameptr, namelen, inodenum, DT_UNKNOWN)) {
brelse(bh);
- filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
- goto out;
+ return 0;
}
- slot++;
}
brelse(bh);
slot = 0;
block++;
}
-
- filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-out:
+ ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
return 0;
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9fec1836057a..9ad17b15b454 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,12 +34,14 @@
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
+#include <linux/freezer.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/compat.h>
/*
* LOCKING:
@@ -104,7 +106,7 @@
struct epoll_filefd {
struct file *file;
int fd;
-};
+} __packed;
/*
* Structure used to track possible nested calls, for too deep recursions
@@ -128,6 +130,8 @@ struct nested_calls {
/*
* Each file descriptor added to the eventpoll interface will
* have an entry of this type linked to the "rbr" RB tree.
+ * Avoid increasing the size of this struct, there can be many thousands
+ * of these on a server and we do not want this to take another cache line.
*/
struct epitem {
/* RB tree node used to link this structure to the eventpoll RB tree */
@@ -158,7 +162,7 @@ struct epitem {
struct list_head fllink;
/* wakeup_source used when EPOLLWAKEUP is set */
- struct wakeup_source *ws;
+ struct wakeup_source __rcu *ws;
/* The structure that describe the interested events and the source fd */
struct epoll_event event;
@@ -536,6 +540,38 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
}
}
+/*
+ * Return the wakeup source attached to @epi (may be NULL).
+ *
+ * call only when ep->mtx is held: the pointer is fetched with
+ * rcu_dereference_check() using lockdep_is_held(&epi->ep->mtx) as the
+ * condition, and no rcu_read_lock() is taken here.
+ */
+static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
+{
+ return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
+}
+
+/*
+ * Call __pm_stay_awake() on the wakeup source of @epi, if it has one.
+ *
+ * call only when ep->mtx is held: the source is looked up through
+ * ep_wakeup_source().
+ */
+static inline void ep_pm_stay_awake(struct epitem *epi)
+{
+ struct wakeup_source *ws = ep_wakeup_source(epi);
+
+ if (ws) /* items without a wakeup source are silently skipped */
+ __pm_stay_awake(ws);
+}
+
+/*
+ * Report whether @epi currently has a wakeup source attached. Only the
+ * pointer value is tested (rcu_access_pointer()); it is never dereferenced.
+ */
+static inline bool ep_has_wakeup_source(struct epitem *epi)
+{
+	return rcu_access_pointer(epi->ws) != NULL;
+}
+
+/*
+ * Call __pm_stay_awake() on the wakeup source of @epi, if it has one.
+ *
+ * call when ep->mtx cannot be held (ep_poll_callback): the pointer is read
+ * with rcu_dereference() and used entirely inside an
+ * rcu_read_lock()/rcu_read_unlock() section.
+ */
+static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
+{
+ struct wakeup_source *ws;
+
+ rcu_read_lock();
+ ws = rcu_dereference(epi->ws);
+ if (ws)
+ __pm_stay_awake(ws);
+ rcu_read_unlock();
+}
+
/**
* ep_scan_ready_list - Scans the ready list in a way that makes possible for
* the scan code, to call f_op->poll(). Also allows for
@@ -599,7 +635,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
*/
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
}
}
/*
@@ -668,7 +704,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags);
- wakeup_source_unregister(epi->ws);
+ wakeup_source_unregister(ep_wakeup_source(epi));
/* At this point it is safe to free the eventpoll item */
kmem_cache_free(epi_cache, epi);
@@ -711,11 +747,15 @@ static void ep_free(struct eventpoll *ep)
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
* us during this operation. So we can avoid the lock on "ep->lock".
+ * We do not need to lock ep->mtx, either, we only do it to prevent
+ * a lockdep warning.
*/
+ mutex_lock(&ep->mtx);
while ((rbp = rb_first(&ep->rbr)) != NULL) {
epi = rb_entry(rbp, struct epitem, rbn);
ep_remove(ep, epi);
}
+ mutex_unlock(&ep->mtx);
mutex_unlock(&epmutex);
mutex_destroy(&ep->mtx);
@@ -734,6 +774,13 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
return 0;
}
+/*
+ * Poll the file behind @epi on the item's behalf.
+ *
+ * The item's event mask is stored in pt->_key before the file's ->poll
+ * method is called, and the returned bits are masked with the item's
+ * event mask, so only events the item asked for are reported.
+ */
+static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
+{
+	struct file *target = epi->ffd.file;
+
+	pt->_key = epi->event.events;
+
+	return target->f_op->poll(target, pt) & epi->event.events;
+}
+
static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
{
@@ -741,10 +788,9 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
poll_table pt;
init_poll_funcptr(&pt, NULL);
+
list_for_each_entry_safe(epi, tmp, head, rdllink) {
- pt._key = epi->event.events;
- if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
- epi->event.events)
+ if (ep_item_poll(epi, &pt))
return POLLIN | POLLRDNORM;
else {
/*
@@ -752,7 +798,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
* callback, but it's not actually ready, as far as
* caller requested events goes. We can remove it here.
*/
- __pm_relax(epi->ws);
+ __pm_relax(ep_wakeup_source(epi));
list_del_init(&epi->rdllink);
}
}
@@ -984,7 +1030,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
/* If this file is already in the ready list we exit soon */
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake_rcu(epi);
}
/*
@@ -1146,6 +1192,7 @@ static int reverse_path_check(void)
static int ep_create_wakeup_source(struct epitem *epi)
{
const char *name;
+ struct wakeup_source *ws;
if (!epi->ep->ws) {
epi->ep->ws = wakeup_source_register("eventpoll");
@@ -1154,17 +1201,29 @@ static int ep_create_wakeup_source(struct epitem *epi)
}
name = epi->ffd.file->f_path.dentry->d_name.name;
- epi->ws = wakeup_source_register(name);
- if (!epi->ws)
+ ws = wakeup_source_register(name);
+
+ if (!ws)
return -ENOMEM;
+ rcu_assign_pointer(epi->ws, ws);
return 0;
}
-static void ep_destroy_wakeup_source(struct epitem *epi)
+/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
+static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
- wakeup_source_unregister(epi->ws);
- epi->ws = NULL;
+ struct wakeup_source *ws = ep_wakeup_source(epi);
+
+ RCU_INIT_POINTER(epi->ws, NULL);
+
+ /*
+ * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
+ * used internally by wakeup_source_remove, too (called by
+ * wakeup_source_unregister), so we cannot use call_rcu
+ */
+ synchronize_rcu();
+ wakeup_source_unregister(ws);
}
/*
@@ -1199,13 +1258,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
if (error)
goto error_create_wakeup_source;
} else {
- epi->ws = NULL;
+ RCU_INIT_POINTER(epi->ws, NULL);
}
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
- epq.pt._key = event->events;
/*
* Attach the item to the poll hooks and get current event bits.
@@ -1214,7 +1272,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
* this operation completes, the poll callback can start hitting
* the new item.
*/
- revents = tfile->f_op->poll(tfile, &epq.pt);
+ revents = ep_item_poll(epi, &epq.pt);
/*
* We have to check if something went wrong during the poll wait queue
@@ -1247,7 +1305,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
/* If the file is already "ready" we drop it inside the ready list */
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
@@ -1288,7 +1346,7 @@ error_unregister:
list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags);
- wakeup_source_unregister(epi->ws);
+ wakeup_source_unregister(ep_wakeup_source(epi));
error_create_wakeup_source:
kmem_cache_free(epi_cache, epi);
@@ -1314,12 +1372,11 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* f_op->poll() call and the new event set registering.
*/
epi->event.events = event->events; /* need barrier below */
- pt._key = event->events;
epi->event.data = event->data; /* protected by mtx */
if (epi->event.events & EPOLLWAKEUP) {
- if (!epi->ws)
+ if (!ep_has_wakeup_source(epi))
ep_create_wakeup_source(epi);
- } else if (epi->ws) {
+ } else if (ep_has_wakeup_source(epi)) {
ep_destroy_wakeup_source(epi);
}
@@ -1347,7 +1404,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* Get current event bits. We can safely use the file* here because
* its usage count has been increased by the caller of this function.
*/
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
+ revents = ep_item_poll(epi, &pt);
/*
* If the item is "hot" and it is not registered inside the ready
@@ -1357,7 +1414,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
spin_lock_irq(&ep->lock);
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
@@ -1383,6 +1440,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
unsigned int revents;
struct epitem *epi;
struct epoll_event __user *uevent;
+ struct wakeup_source *ws;
poll_table pt;
init_poll_funcptr(&pt, NULL);
@@ -1405,14 +1463,16 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
* instead, but then epi->ws would temporarily be out of sync
* with ep_is_linked().
*/
- if (epi->ws && epi->ws->active)
- __pm_stay_awake(ep->ws);
- __pm_relax(epi->ws);
+ ws = ep_wakeup_source(epi);
+ if (ws) {
+ if (ws->active)
+ __pm_stay_awake(ep->ws);
+ __pm_relax(ws);
+ }
+
list_del_init(&epi->rdllink);
- pt._key = epi->event.events;
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
- epi->event.events;
+ revents = ep_item_poll(epi, &pt);
/*
* If the event mask intersect the caller-requested one,
@@ -1424,7 +1484,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
if (__put_user(revents, &uevent->events) ||
__put_user(epi->event.data, &uevent->data)) {
list_add(&epi->rdllink, head);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
return eventcnt ? eventcnt : -EFAULT;
}
eventcnt++;
@@ -1444,7 +1504,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
* poll callback will queue them in ep->ovflist.
*/
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
}
}
}
@@ -1543,7 +1603,8 @@ fetch_events:
}
spin_unlock_irqrestore(&ep->lock, flags);
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!freezable_schedule_hrtimeout_range(to, slack,
+ HRTIMER_MODE_ABS))
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
@@ -1916,8 +1977,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
return -EINVAL;
if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
return -EFAULT;
- sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ sigsaved = current->blocked;
+ set_current_blocked(&ksigmask);
}
error = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -1934,12 +1995,58 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
sizeof(sigsaved));
set_restore_sigmask();
} else
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ set_current_blocked(&sigsaved);
}
return error;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
+ struct epoll_event __user *, events,
+ int, maxevents, int, timeout,
+ const compat_sigset_t __user *, sigmask,
+ compat_size_t, sigsetsize)
+{ /*
+ * Compat entry point for epoll_pwait: waits through sys_epoll_wait() with
+ * an optional temporary signal mask supplied as a compat_sigset_t.
+ *
+ * Returns the result of sys_epoll_wait(), -EINVAL if @sigsetsize is not
+ * sizeof(compat_sigset_t), or -EFAULT if @sigmask cannot be copied in.
+ */
+ long err;
+ compat_sigset_t csigmask;
+ sigset_t ksigmask, sigsaved;
+
+ /*
+ * If the caller wants a certain signal mask to be set during the wait,
+ * we apply it here.
+ */
+ if (sigmask) {
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
+ return -EFAULT;
+ sigset_from_compat(&ksigmask, &csigmask); /* convert the compat mask into ksigmask */
+ sigsaved = current->blocked; /* remember the current mask so it can be restored below */
+ set_current_blocked(&ksigmask);
+ }
+
+ err = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+ /*
+ * If we changed the signal mask, we need to restore the original one.
+ * In case we've got a signal while waiting, we do not restore the
+ * signal mask yet, and we allow do_signal() to deliver the signal on
+ * the way back to userspace, before the signal mask is restored.
+ */
+ if (sigmask) {
+ if (err == -EINTR) {
+ memcpy(&current->saved_sigmask, &sigsaved,
+ sizeof(sigsaved));
+ set_restore_sigmask();
+ } else
+ set_current_blocked(&sigsaved);
+ }
+
+ return err;
+}
+#endif
+
static int __init eventpoll_init(void)
{
struct sysinfo si;
@@ -1964,6 +2071,12 @@ static int __init eventpoll_init(void)
/* Initialize the structure used to perform file's f_op->poll() calls */
ep_nested_calls_init(&poll_readywalk_ncalls);
+ /*
+ * We can have many thousands of epitems, so prevent this from
+ * using an extra cache line on 64-bit (and smaller) CPUs
+ */
+ BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
+
/* Allocates slab cache used to allocate "struct epitem" items */
epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
diff --git a/fs/exec.c b/fs/exec.c
index a96a4885bbbf..9c73def87642 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -110,13 +110,14 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
static const struct open_flags uselib_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
.acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
- .intent = LOOKUP_OPEN
+ .intent = LOOKUP_OPEN,
+ .lookup_flags = LOOKUP_FOLLOW,
};
if (IS_ERR(tmp))
goto out;
- file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
+ file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
putname(tmp);
error = PTR_ERR(file);
if (IS_ERR(file))
@@ -613,7 +614,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* when the old and new regions overlap clear from new_end.
*/
free_pgd_range(&tlb, new_end, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : 0);
+ vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
} else {
/*
* otherwise, clean from old_start; this is done to not touch
@@ -622,7 +623,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* for the others its just a little faster.
*/
free_pgd_range(&tlb, old_start, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : 0);
+ vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
}
tlb_finish_mmu(&tlb, new_end, old_end);
@@ -756,10 +757,11 @@ struct file *open_exec(const char *name)
static const struct open_flags open_exec_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
.acc_mode = MAY_EXEC | MAY_OPEN,
- .intent = LOOKUP_OPEN
+ .intent = LOOKUP_OPEN,
+ .lookup_flags = LOOKUP_FOLLOW,
};
- file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags, LOOKUP_FOLLOW);
+ file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags);
if (IS_ERR(file))
goto out;
@@ -802,6 +804,15 @@ int kernel_read(struct file *file, loff_t offset,
EXPORT_SYMBOL(kernel_read);
+ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
+{ /*
+ * Read up to @len bytes at offset @pos of @file into the user address
+ * @addr through the file's ->read method and, if any bytes were read,
+ * flush the instruction cache for that region.
+ *
+ * Returns whatever ->read returned. The flush always covers the full
+ * requested range [addr, addr + len), not just the bytes actually read.
+ */
+ ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos);
+ if (res > 0)
+ flush_icache_range(addr, addr + len);
+ return res;
+}
+EXPORT_SYMBOL(read_code);
+
static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
@@ -898,11 +909,13 @@ static int de_thread(struct task_struct *tsk)
sig->notify_count = -1; /* for exit_notify() */
for (;;) {
+ threadgroup_change_begin(tsk);
write_lock_irq(&tasklist_lock);
if (likely(leader->exit_state))
break;
__set_current_state(TASK_KILLABLE);
write_unlock_irq(&tasklist_lock);
+ threadgroup_change_end(tsk);
schedule();
if (unlikely(__fatal_signal_pending(tsk)))
goto killed;
@@ -919,6 +932,7 @@ static int de_thread(struct task_struct *tsk)
* also take its birthdate (always earlier than our own).
*/
tsk->start_time = leader->start_time;
+ tsk->real_start_time = leader->real_start_time;
BUG_ON(!same_thread_group(leader, tsk));
BUG_ON(has_group_leader_pid(tsk));
@@ -934,9 +948,8 @@ static int de_thread(struct task_struct *tsk)
* Note: The old leader also uses this pid until release_task
* is called. Odd but simple and correct.
*/
- detach_pid(tsk, PIDTYPE_PID);
tsk->pid = leader->pid;
- attach_pid(tsk, PIDTYPE_PID, task_pid(leader));
+ change_pid(tsk, PIDTYPE_PID, task_pid(leader));
transfer_pid(leader, tsk, PIDTYPE_PGID);
transfer_pid(leader, tsk, PIDTYPE_SID);
@@ -960,6 +973,7 @@ static int de_thread(struct task_struct *tsk)
if (unlikely(leader->ptrace))
__wake_up_parent(leader, leader->parent);
write_unlock_irq(&tasklist_lock);
+ threadgroup_change_end(tsk);
release_task(leader);
}
@@ -1027,17 +1041,7 @@ EXPORT_SYMBOL_GPL(get_task_comm);
void set_task_comm(struct task_struct *tsk, char *buf)
{
task_lock(tsk);
-
trace_task_rename(tsk, buf);
-
- /*
- * Threads may access current->comm without holding
- * the task lock, so write the string carefully.
- * Readers without a lock may see incomplete new
- * names but are safe from non-terminating string reads.
- */
- memset(tsk->comm, 0, TASK_COMM_LEN);
- wmb();
strlcpy(tsk->comm, buf, sizeof(tsk->comm));
task_unlock(tsk);
perf_event_comm(tsk);
@@ -1133,13 +1137,6 @@ void setup_new_exec(struct linux_binprm * bprm)
set_dumpable(current->mm, suid_dumpable);
}
- /*
- * Flush performance counters when crossing a
- * security domain:
- */
- if (!get_dumpable(current->mm))
- perf_event_exit_task(current);
-
/* An exec changes our domain. We are no longer part of the thread
group */
@@ -1203,6 +1200,15 @@ void install_exec_creds(struct linux_binprm *bprm)
commit_creds(bprm->cred);
bprm->cred = NULL;
+
+ /*
+ * Disable monitoring for regular users
+ * when executing setuid binaries. Must
+ * wait until new credentials are committed
+ * by commit_creds() above
+ */
+ if (get_dumpable(current->mm) != SUID_DUMP_USER)
+ perf_event_exit_task(current);
/*
* cred_guard_mutex must be held at least to this point to prevent
* ptrace_attach() from altering our determination of the task's
@@ -1459,7 +1465,6 @@ static int do_execve_common(const char *filename,
struct files_struct *displaced;
bool clear_in_exec;
int retval;
- const struct cred *cred = current_cred();
/*
* We move the actual failure in case of RLIMIT_NPROC excess from
@@ -1468,7 +1473,7 @@ static int do_execve_common(const char *filename,
* whether NPROC limit is still exceeded.
*/
if ((current->flags & PF_NPROC_EXCEEDED) &&
- atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
+ atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
retval = -EAGAIN;
goto out_ret;
}
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 46375896cfc0..49f51ab4caac 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -239,22 +239,19 @@ void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
}
static int
-exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+exofs_readdir(struct file *file, struct dir_context *ctx)
{
- loff_t pos = filp->f_pos;
- struct inode *inode = file_inode(filp);
+ loff_t pos = ctx->pos;
+ struct inode *inode = file_inode(file);
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
- unsigned char *types = NULL;
- int need_revalidate = (filp->f_version != inode->i_version);
+ int need_revalidate = (file->f_version != inode->i_version);
if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
return 0;
- types = exofs_filetype_table;
-
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
struct exofs_dir_entry *de;
@@ -263,7 +260,7 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (IS_ERR(page)) {
EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
inode->i_ino);
- filp->f_pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_CACHE_SIZE - offset;
return PTR_ERR(page);
}
kaddr = page_address(page);
@@ -271,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (offset) {
offset = exofs_validate_entry(kaddr, offset,
chunk_mask);
- filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
}
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
need_revalidate = 0;
}
de = (struct exofs_dir_entry *)(kaddr + offset);
@@ -288,27 +285,24 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
return -EIO;
}
if (de->inode_no) {
- int over;
- unsigned char d_type = DT_UNKNOWN;
+ unsigned char t;
- if (types && de->file_type < EXOFS_FT_MAX)
- d_type = types[de->file_type];
+ if (de->file_type < EXOFS_FT_MAX)
+ t = exofs_filetype_table[de->file_type];
+ else
+ t = DT_UNKNOWN;
- offset = (char *)de - kaddr;
- over = filldir(dirent, de->name, de->name_len,
- (n<<PAGE_CACHE_SHIFT) | offset,
+ if (!dir_emit(ctx, de->name, de->name_len,
le64_to_cpu(de->inode_no),
- d_type);
- if (over) {
+ t)) {
exofs_put_page(page);
return 0;
}
}
- filp->f_pos += le16_to_cpu(de->rec_len);
+ ctx->pos += le16_to_cpu(de->rec_len);
}
exofs_put_page(page);
}
-
return 0;
}
@@ -669,5 +663,5 @@ not_empty:
const struct file_operations exofs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = exofs_readdir,
+ .iterate = exofs_readdir,
};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d1f80abd8828..2ec8eb1ab269 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
return 0;
}
-static void exofs_invalidatepage(struct page *page, unsigned long offset)
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
+ EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+ page->index, offset, length);
WARN_ON(1);
}
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index f936cb50dc0d..b74422888604 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -401,7 +401,7 @@ static void _clear_bio(struct bio *bio)
struct bio_vec *bv;
unsigned i;
- __bio_for_each_segment(bv, bio, i, 0) {
+ bio_for_each_segment_all(bv, bio, i) {
unsigned this_count = bv->bv_len;
if (likely(PAGE_SIZE == this_count))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index b963f38ac298..7682b970d0f1 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -432,7 +432,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
if (!bio)
continue;
- __bio_for_each_segment(bv, bio, i, 0) {
+ bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
SetPageUptodate(page);
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 262fc9940982..293bc2e47a73 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -212,6 +212,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
}
struct getdents_callback {
+ struct dir_context ctx;
char *name; /* name that was found. It already points to a
buffer NAME_MAX+1 is size */
unsigned long ino; /* the inum we are looking for */
@@ -254,7 +255,11 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
struct inode *dir = path->dentry->d_inode;
int error;
struct file *file;
- struct getdents_callback buffer;
+ struct getdents_callback buffer = {
+ .ctx.actor = filldir_one,
+ .name = name,
+ .ino = child->d_inode->i_ino
+ };
error = -ENOTDIR;
if (!dir || !S_ISDIR(dir->i_mode))
@@ -271,17 +276,14 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
goto out;
error = -EINVAL;
- if (!file->f_op->readdir)
+ if (!file->f_op->iterate)
goto out_close;
- buffer.name = name;
- buffer.ino = child->d_inode->i_ino;
- buffer.found = 0;
buffer.sequence = 0;
while (1) {
int old_seq = buffer.sequence;
- error = vfs_readdir(file, filldir_one, &buffer);
+ error = iterate_dir(file, &buffer.ctx);
if (buffer.found) {
error = 0;
break;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 4237722bfd27..6e1d4ab09d72 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -287,17 +287,17 @@ static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
}
static int
-ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
+ext2_readdir(struct file *file, struct dir_context *ctx)
{
- loff_t pos = filp->f_pos;
- struct inode *inode = file_inode(filp);
+ loff_t pos = ctx->pos;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
unsigned char *types = NULL;
- int need_revalidate = filp->f_version != inode->i_version;
+ int need_revalidate = file->f_version != inode->i_version;
if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
return 0;
@@ -314,16 +314,16 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
ext2_error(sb, __func__,
"bad page in #%lu",
inode->i_ino);
- filp->f_pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_CACHE_SIZE - offset;
return PTR_ERR(page);
}
kaddr = page_address(page);
if (unlikely(need_revalidate)) {
if (offset) {
offset = ext2_validate_entry(kaddr, offset, chunk_mask);
- filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
}
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
need_revalidate = 0;
}
de = (ext2_dirent *)(kaddr+offset);
@@ -336,22 +336,19 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
return -EIO;
}
if (de->inode) {
- int over;
unsigned char d_type = DT_UNKNOWN;
if (types && de->file_type < EXT2_FT_MAX)
d_type = types[de->file_type];
- offset = (char *)de - kaddr;
- over = filldir(dirent, de->name, de->name_len,
- (n<<PAGE_CACHE_SHIFT) | offset,
- le32_to_cpu(de->inode), d_type);
- if (over) {
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le32_to_cpu(de->inode),
+ d_type)) {
ext2_put_page(page);
return 0;
}
}
- filp->f_pos += ext2_rec_len_from_disk(de->rec_len);
+ ctx->pos += ext2_rec_len_from_disk(de->rec_len);
}
ext2_put_page(page);
}
@@ -724,7 +721,7 @@ not_empty:
const struct file_operations ext2_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = ext2_readdir,
+ .iterate = ext2_readdir,
.unlocked_ioctl = ext2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fe60cc1117d8..0a87bb10998d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,6 +31,7 @@
#include <linux/mpage.h>
#include <linux/fiemap.h>
#include <linux/namei.h>
+#include <linux/aio.h>
#include "ext2.h"
#include "acl.h"
#include "xip.h"
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 73b0d9519836..256dd5f4c1c4 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -119,6 +119,29 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
return ext2_add_nondir(dentry, inode);
}
+static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode = ext2_new_inode(dir, mode, NULL);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &ext2_file_inode_operations;
+ if (ext2_use_xip(inode->i_sb)) {
+ inode->i_mapping->a_ops = &ext2_aops_xip;
+ inode->i_fop = &ext2_xip_file_operations;
+ } else if (test_opt(inode->i_sb, NOBH)) {
+ inode->i_mapping->a_ops = &ext2_nobh_aops;
+ inode->i_fop = &ext2_file_operations;
+ } else {
+ inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_fop = &ext2_file_operations;
+ }
+ mark_inode_dirty(inode);
+ d_tmpfile(dentry, inode);
+ unlock_new_inode(inode);
+ return 0;
+}
+
static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct inode * inode;
@@ -398,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = {
#endif
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
+ .tmpfile = ext2_tmpfile,
};
const struct inode_operations ext2_special_inode_operations = {
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 87eccbbca255..f522425aaa24 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -28,8 +28,7 @@ static unsigned char ext3_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
-static int ext3_dx_readdir(struct file * filp,
- void * dirent, filldir_t filldir);
+static int ext3_dx_readdir(struct file *, struct dir_context *);
static unsigned char get_dtype(struct super_block *sb, int filetype)
{
@@ -91,36 +90,30 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
return error_msg == NULL ? 1 : 0;
}
-static int ext3_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int ext3_readdir(struct file *file, struct dir_context *ctx)
{
- int error = 0;
unsigned long offset;
- int i, stored;
+ int i;
struct ext3_dir_entry_2 *de;
int err;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- int ret = 0;
int dir_has_error = 0;
if (is_dx_dir(inode)) {
- err = ext3_dx_readdir(filp, dirent, filldir);
- if (err != ERR_BAD_DX_DIR) {
- ret = err;
- goto out;
- }
+ err = ext3_dx_readdir(file, ctx);
+ if (err != ERR_BAD_DX_DIR)
+ return err;
/*
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- EXT3_I(file_inode(filp))->i_flags &= ~EXT3_INDEX_FL;
+ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
}
- stored = 0;
- offset = filp->f_pos & (sb->s_blocksize - 1);
+ offset = ctx->pos & (sb->s_blocksize - 1);
- while (!error && !stored && filp->f_pos < inode->i_size) {
- unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb);
+ while (ctx->pos < inode->i_size) {
+ unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
struct buffer_head map_bh;
struct buffer_head *bh = NULL;
@@ -129,12 +122,12 @@ static int ext3_readdir(struct file * filp,
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
- if (!ra_has_index(&filp->f_ra, index))
+ if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_inode->i_mapping,
- &filp->f_ra, filp,
+ &file->f_ra, file,
index, 1);
- filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+ file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
bh = ext3_bread(NULL, inode, blk, 0, &err);
}
@@ -146,22 +139,21 @@ static int ext3_readdir(struct file * filp,
if (!dir_has_error) {
ext3_error(sb, __func__, "directory #%lu "
"contains a hole at offset %lld",
- inode->i_ino, filp->f_pos);
+ inode->i_ino, ctx->pos);
dir_has_error = 1;
}
/* corrupt size? Maybe no more blocks to read */
- if (filp->f_pos > inode->i_blocks << 9)
+ if (ctx->pos > inode->i_blocks << 9)
break;
- filp->f_pos += sb->s_blocksize - offset;
+ ctx->pos += sb->s_blocksize - offset;
continue;
}
-revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (filp->f_version != inode->i_version) {
+ if (offset && file->f_version != inode->i_version) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ext3_dir_entry_2 *)
(bh->b_data + i);
@@ -177,53 +169,40 @@ revalidate:
i += ext3_rec_len_from_disk(de->rec_len);
}
offset = i;
- filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+ ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
}
- while (!error && filp->f_pos < inode->i_size
+ while (ctx->pos < inode->i_size
&& offset < sb->s_blocksize) {
de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
bh, offset)) {
- /* On error, skip the f_pos to the
+ /* On error, skip ctx->pos to the
next block. */
- filp->f_pos = (filp->f_pos |
+ ctx->pos = (ctx->pos |
(sb->s_blocksize - 1)) + 1;
- brelse (bh);
- ret = stored;
- goto out;
+ break;
}
offset += ext3_rec_len_from_disk(de->rec_len);
if (le32_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- u64 version = filp->f_version;
-
- error = filldir(dirent, de->name,
- de->name_len,
- filp->f_pos,
- le32_to_cpu(de->inode),
- get_dtype(sb, de->file_type));
- if (error)
- break;
- if (version != filp->f_version)
- goto revalidate;
- stored ++;
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type))) {
+ brelse(bh);
+ return 0;
+ }
}
- filp->f_pos += ext3_rec_len_from_disk(de->rec_len);
+ ctx->pos += ext3_rec_len_from_disk(de->rec_len);
}
offset = 0;
brelse (bh);
+ if (ctx->pos < inode->i_size)
+ if (!dir_relax(inode))
+ return 0;
}
-out:
- return ret;
+ return 0;
}
static inline int is_32bit_api(void)
@@ -452,62 +431,54 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
* for all entres on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
-static int call_filldir(struct file * filp, void * dirent,
- filldir_t filldir, struct fname *fname)
+static bool call_filldir(struct file *file, struct dir_context *ctx,
+ struct fname *fname)
{
- struct dir_private_info *info = filp->private_data;
- loff_t curr_pos;
- struct inode *inode = file_inode(filp);
- struct super_block * sb;
- int error;
-
- sb = inode->i_sb;
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
if (!fname) {
printk("call_filldir: called with null fname?!?\n");
- return 0;
+ return true;
}
- curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+ ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
while (fname) {
- error = filldir(dirent, fname->name,
- fname->name_len, curr_pos,
+ if (!dir_emit(ctx, fname->name, fname->name_len,
fname->inode,
- get_dtype(sb, fname->file_type));
- if (error) {
- filp->f_pos = curr_pos;
+ get_dtype(sb, fname->file_type))) {
info->extra_fname = fname;
- return error;
+ return false;
}
fname = fname->next;
}
- return 0;
+ return true;
}
-static int ext3_dx_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
{
- struct dir_private_info *info = filp->private_data;
- struct inode *inode = file_inode(filp);
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
struct fname *fname;
int ret;
if (!info) {
- info = ext3_htree_create_dir_info(filp, filp->f_pos);
+ info = ext3_htree_create_dir_info(file, ctx->pos);
if (!info)
return -ENOMEM;
- filp->private_data = info;
+ file->private_data = info;
}
- if (filp->f_pos == ext3_get_htree_eof(filp))
+ if (ctx->pos == ext3_get_htree_eof(file))
return 0; /* EOF */
/* Some one has messed with f_pos; reset the world */
- if (info->last_pos != filp->f_pos) {
+ if (info->last_pos != ctx->pos) {
free_rb_tree_fname(&info->root);
info->curr_node = NULL;
info->extra_fname = NULL;
- info->curr_hash = pos2maj_hash(filp, filp->f_pos);
- info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+ info->curr_hash = pos2maj_hash(file, ctx->pos);
+ info->curr_minor_hash = pos2min_hash(file, ctx->pos);
}
/*
@@ -515,7 +486,7 @@ static int ext3_dx_readdir(struct file * filp,
* chain, return them first.
*/
if (info->extra_fname) {
- if (call_filldir(filp, dirent, filldir, info->extra_fname))
+ if (!call_filldir(file, ctx, info->extra_fname))
goto finished;
info->extra_fname = NULL;
goto next_node;
@@ -529,17 +500,17 @@ static int ext3_dx_readdir(struct file * filp,
* cached entries.
*/
if ((!info->curr_node) ||
- (filp->f_version != inode->i_version)) {
+ (file->f_version != inode->i_version)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
- filp->f_version = inode->i_version;
- ret = ext3_htree_fill_tree(filp, info->curr_hash,
+ file->f_version = inode->i_version;
+ ret = ext3_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
return ret;
if (ret == 0) {
- filp->f_pos = ext3_get_htree_eof(filp);
+ ctx->pos = ext3_get_htree_eof(file);
break;
}
info->curr_node = rb_first(&info->root);
@@ -548,7 +519,7 @@ static int ext3_dx_readdir(struct file * filp,
fname = rb_entry(info->curr_node, struct fname, rb_hash);
info->curr_hash = fname->hash;
info->curr_minor_hash = fname->minor_hash;
- if (call_filldir(filp, dirent, filldir, fname))
+ if (!call_filldir(file, ctx, fname))
break;
next_node:
info->curr_node = rb_next(info->curr_node);
@@ -559,7 +530,7 @@ static int ext3_dx_readdir(struct file * filp,
info->curr_minor_hash = fname->minor_hash;
} else {
if (info->next_hash == ~0) {
- filp->f_pos = ext3_get_htree_eof(filp);
+ ctx->pos = ext3_get_htree_eof(file);
break;
}
info->curr_hash = info->next_hash;
@@ -567,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp,
}
}
finished:
- info->last_pos = filp->f_pos;
+ info->last_pos = ctx->pos;
return 0;
}
@@ -582,7 +553,7 @@ static int ext3_release_dir (struct inode * inode, struct file * filp)
const struct file_operations ext3_dir_operations = {
.llseek = ext3_dir_llseek,
.read = generic_read_dir,
- .readdir = ext3_readdir,
+ .iterate = ext3_readdir,
.unlocked_ioctl = ext3_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext3_compat_ioctl,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index b31dbd4c46ad..1cb9c7e10c6f 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -48,9 +48,13 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_ext3_sync_file_enter(file, datasync);
- if (inode->i_sb->s_flags & MS_RDONLY)
+ if (inode->i_sb->s_flags & MS_RDONLY) {
+ /* Make sure that we read updated state */
+ smp_rmb();
+ if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
+ return -EROFS;
return 0;
-
+ }
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret)
goto out;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index d512c4bc4ad7..2bd85486b879 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -27,6 +27,7 @@
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/namei.h>
+#include <linux/aio.h>
#include "ext3.h"
#include "xattr.h"
#include "acl.h"
@@ -218,7 +219,8 @@ void ext3_evict_inode (struct inode *inode)
*/
if (inode->i_nlink && ext3_should_journal_data(inode) &&
EXT3_SB(inode->i_sb)->s_journal &&
- (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+ (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+ inode->i_ino != EXT3_JOURNAL_INO) {
tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
@@ -1823,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
}
-static void ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT3_JOURNAL(page->mapping->host);
- trace_ext3_invalidatepage(page, offset);
+ trace_ext3_invalidatepage(page, offset, length);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- journal_invalidatepage(journal, page, offset);
+ journal_invalidatepage(journal, page, offset, length);
}
static int ext3_releasepage(struct page *page, gfp_t wait)
@@ -1982,6 +1985,7 @@ static const struct address_space_operations ext3_ordered_aops = {
.direct_IO = ext3_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .is_dirty_writeback = buffer_check_dirty_writeback,
.error_remove_page = generic_error_remove_page,
};
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 692de13e3596..998ea111e537 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -576,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
+((char *)de - bh->b_data))) {
- /* On error, skip the f_pos to the next block. */
- dir_file->f_pos = (dir_file->f_pos |
- (dir->i_sb->s_blocksize - 1)) + 1;
- brelse (bh);
- return count;
+ /* silently ignore the rest of the block */
+ break;
}
ext3fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
@@ -1762,6 +1759,45 @@ retry:
return err;
}
+static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ handle_t *handle;
+ struct inode *inode;
+ int err, retries = 0;
+
+ dquot_initialize(dir);
+
+retry:
+ handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+ 4 + EXT3_XATTR_TRANS_BLOCKS);
+
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ inode = ext3_new_inode (handle, dir, NULL, mode);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ inode->i_op = &ext3_file_inode_operations;
+ inode->i_fop = &ext3_file_operations;
+ ext3_set_aops(inode);
+ err = ext3_orphan_add(handle, inode);
+ if (err)
+ goto err_drop_inode;
+ mark_inode_dirty(inode);
+ d_tmpfile(dentry, inode);
+ unlock_new_inode(inode);
+ }
+ ext3_journal_stop(handle);
+ if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+ return err;
+err_drop_inode:
+ ext3_journal_stop(handle);
+ unlock_new_inode(inode);
+ iput(inode);
+ return err;
+}
+
static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
{
handle_t *handle;
@@ -2303,7 +2339,7 @@ static int ext3_link (struct dentry * old_dentry,
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS);
+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2317,6 +2353,11 @@ retry:
err = ext3_add_entry(handle, dentry, inode);
if (!err) {
ext3_mark_inode_dirty(handle, inode);
+ /* this can happen only for tmpfile being
+ * linked the first time
+ */
+ if (inode->i_nlink == 1)
+ ext3_orphan_del(handle, inode);
d_instantiate(dentry, inode);
} else {
drop_nlink(inode);
@@ -2519,6 +2560,7 @@ const struct inode_operations ext3_dir_inode_operations = {
.mkdir = ext3_mkdir,
.rmdir = ext3_rmdir,
.mknod = ext3_mknod,
+ .tmpfile = ext3_tmpfile,
.rename = ext3_rename,
.setattr = ext3_setattr,
#ifdef CONFIG_EXT3_FS_XATTR
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index fb5120a5505c..c47f14750722 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -174,6 +174,11 @@ static void ext3_handle_error(struct super_block *sb)
if (test_opt (sb, ERRORS_RO)) {
ext3_msg(sb, KERN_CRIT,
"error: remounting filesystem read-only");
+ /*
+ * Make sure updated value of ->s_mount_state will be visible
+ * before ->s_flags update.
+ */
+ smp_wmb();
sb->s_flags |= MS_RDONLY;
}
ext3_commit_super(sb, es, 1);
@@ -291,8 +296,14 @@ void ext3_abort(struct super_block *sb, const char *function,
ext3_msg(sb, KERN_CRIT,
"error: remounting filesystem read-only");
EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
- sb->s_flags |= MS_RDONLY;
set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
+ /*
+ * Make sure updated value of ->s_mount_state will be visible
+ * before ->s_flags update.
+ */
+ smp_wmb();
+ sb->s_flags |= MS_RDONLY;
+
if (EXT3_SB(sb)->s_journal)
journal_abort(EXT3_SB(sb)->s_journal, -EIO);
}
@@ -362,22 +373,19 @@ fail:
/*
* Release the journal device
*/
-static int ext3_blkdev_put(struct block_device *bdev)
+static void ext3_blkdev_put(struct block_device *bdev)
{
- return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
-static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
+static void ext3_blkdev_remove(struct ext3_sb_info *sbi)
{
struct block_device *bdev;
- int ret = -ENODEV;
-
bdev = sbi->journal_bdev;
if (bdev) {
- ret = ext3_blkdev_put(bdev);
+ ext3_blkdev_put(bdev);
sbi->journal_bdev = NULL;
}
- return ret;
}
static inline struct inode *orphan_list_entry(struct list_head *l)
@@ -2067,7 +2075,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
- sb->s_flags |= MS_SNAP_STABLE;
return 0;
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 987358740cb9..efea5d5c44ce 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -71,4 +71,5 @@ config EXT4_DEBUG
Enables run-time debugging support for the ext4 filesystem.
If you select Y here, then you will be able to turn on debugging
- with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
+ with a command such as:
+ echo 1 > /sys/module/ext4/parameters/mballoc_debug
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 92e68b33fffd..58339393fa6e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -30,6 +30,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
*/
/*
+ * Calculate block group number for a given block number
+ */
+ext4_group_t ext4_get_group_number(struct super_block *sb,
+ ext4_fsblk_t block)
+{
+ ext4_group_t group;
+
+ if (test_opt2(sb, STD_GROUP_SIZE))
+ group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+ block) >>
+ (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
+ else
+ ext4_get_group_no_and_offset(sb, block, &group, NULL);
+ return group;
+}
+
+/*
* Calculate the block group number and offset into the block/cluster
* allocation bitmap, given a block number
*/
@@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
}
-static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
- ext4_group_t block_group)
+/*
+ * Check whether the 'block' lives within the 'block_group'. Returns 1 if so
+ * and 0 otherwise.
+ */
+static inline int ext4_block_in_group(struct super_block *sb,
+ ext4_fsblk_t block,
+ ext4_group_t block_group)
{
ext4_group_t actual_group;
- ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
- if (actual_group == block_group)
- return 1;
- return 0;
+
+ actual_group = ext4_get_group_number(sb, block);
+ return (actual_group == block_group) ? 1 : 0;
}
/* Return the number of clusters used for file system metadata; this
@@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
trace_ext4_read_block_bitmap_load(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
- submit_bh(READ, bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
return bh;
verify:
ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -478,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
s64 nclusters, unsigned int flags)
{
- s64 free_clusters, dirty_clusters, root_clusters;
+ s64 free_clusters, dirty_clusters, rsv, resv_clusters;
struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
free_clusters = percpu_counter_read_positive(fcc);
dirty_clusters = percpu_counter_read_positive(dcc);
+ resv_clusters = atomic64_read(&sbi->s_resv_clusters);
/*
* r_blocks_count should always be multiple of the cluster ratio so
* we are safe to do a plane bit shift only.
*/
- root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+ rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+ resv_clusters;
- if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+ if (free_clusters - (nclusters + rsv + dirty_clusters) <
EXT4_FREECLUSTERS_WATERMARK) {
free_clusters = percpu_counter_sum_positive(fcc);
dirty_clusters = percpu_counter_sum_positive(dcc);
@@ -499,15 +522,21 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
/* Check whether we have space after accounting for current
* dirty clusters & root reserved clusters.
*/
- if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
+ if (free_clusters >= (rsv + nclusters + dirty_clusters))
return 1;
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
capable(CAP_SYS_RESOURCE) ||
- (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+ (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+ if (free_clusters >= (nclusters + dirty_clusters +
+ resv_clusters))
+ return 1;
+ }
+ /* No free blocks. Let's see if we can dip into reserved pool */
+ if (flags & EXT4_MB_USE_RESERVED) {
if (free_clusters >= (nclusters + dirty_clusters))
return 1;
}
@@ -653,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
static inline int test_root(ext4_group_t a, int b)
{
- int num = b;
-
- while (a > num)
- num *= b;
- return num == a;
+ while (1) {
+ if (a < b)
+ return 0;
+ if (a == b)
+ return 1;
+ if ((a % b) != 0)
+ return 0;
+ a = a / b;
+ }
}
static int ext4_group_sparse(ext4_group_t group)
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d8cd1f0f4661..3c7d288ae94c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -29,8 +29,7 @@
#include "ext4.h"
#include "xattr.h"
-static int ext4_dx_readdir(struct file *filp,
- void *dirent, filldir_t filldir);
+static int ext4_dx_readdir(struct file *, struct dir_context *);
/**
* Check if the given dir-inode refers to an htree-indexed directory
@@ -46,7 +45,8 @@ static int is_dx_dir(struct inode *inode)
if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX) &&
((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
- ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+ ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
+ ext4_has_inline_data(inode)))
return 1;
return 0;
@@ -102,59 +102,56 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
return 1;
}
-static int ext4_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
+static int ext4_readdir(struct file *file, struct dir_context *ctx)
{
- int error = 0;
unsigned int offset;
int i, stored;
struct ext4_dir_entry_2 *de;
int err;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- int ret = 0;
int dir_has_error = 0;
- if (ext4_has_inline_data(inode)) {
- int has_inline_data = 1;
- ret = ext4_read_inline_dir(filp, dirent, filldir,
- &has_inline_data);
- if (has_inline_data)
- return ret;
- }
-
if (is_dx_dir(inode)) {
- err = ext4_dx_readdir(filp, dirent, filldir);
+ err = ext4_dx_readdir(file, ctx);
if (err != ERR_BAD_DX_DIR) {
- ret = err;
- goto out;
+ return err;
}
/*
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- ext4_clear_inode_flag(file_inode(filp),
+ ext4_clear_inode_flag(file_inode(file),
EXT4_INODE_INDEX);
}
+
+ if (ext4_has_inline_data(inode)) {
+ int has_inline_data = 1;
+ int ret = ext4_read_inline_dir(file, ctx,
+ &has_inline_data);
+ if (has_inline_data)
+ return ret;
+ }
+
stored = 0;
- offset = filp->f_pos & (sb->s_blocksize - 1);
+ offset = ctx->pos & (sb->s_blocksize - 1);
- while (!error && !stored && filp->f_pos < inode->i_size) {
+ while (ctx->pos < inode->i_size) {
struct ext4_map_blocks map;
struct buffer_head *bh = NULL;
- map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+ map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
map.m_len = 1;
err = ext4_map_blocks(NULL, inode, &map, 0);
if (err > 0) {
pgoff_t index = map.m_pblk >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
- if (!ra_has_index(&filp->f_ra, index))
+ if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_inode->i_mapping,
- &filp->f_ra, filp,
+ &file->f_ra, file,
index, 1);
- filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+ file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
}
@@ -164,16 +161,16 @@ static int ext4_readdir(struct file *filp,
*/
if (!bh) {
if (!dir_has_error) {
- EXT4_ERROR_FILE(filp, 0,
+ EXT4_ERROR_FILE(file, 0,
"directory contains a "
"hole at offset %llu",
- (unsigned long long) filp->f_pos);
+ (unsigned long long) ctx->pos);
dir_has_error = 1;
}
/* corrupt size? Maybe no more blocks to read */
- if (filp->f_pos > inode->i_blocks << 9)
+ if (ctx->pos > inode->i_blocks << 9)
break;
- filp->f_pos += sb->s_blocksize - offset;
+ ctx->pos += sb->s_blocksize - offset;
continue;
}
@@ -181,21 +178,20 @@ static int ext4_readdir(struct file *filp,
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(inode,
(struct ext4_dir_entry *)bh->b_data)) {
- EXT4_ERROR_FILE(filp, 0, "directory fails checksum "
+ EXT4_ERROR_FILE(file, 0, "directory fails checksum "
"at offset %llu",
- (unsigned long long)filp->f_pos);
- filp->f_pos += sb->s_blocksize - offset;
+ (unsigned long long)ctx->pos);
+ ctx->pos += sb->s_blocksize - offset;
brelse(bh);
continue;
}
set_buffer_verified(bh);
-revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (filp->f_version != inode->i_version) {
+ if (file->f_version != inode->i_version) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ext4_dir_entry_2 *)
(bh->b_data + i);
@@ -212,57 +208,46 @@ revalidate:
sb->s_blocksize);
}
offset = i;
- filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+ ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
}
- while (!error && filp->f_pos < inode->i_size
+ while (ctx->pos < inode->i_size
&& offset < sb->s_blocksize) {
de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
- if (ext4_check_dir_entry(inode, filp, de, bh,
+ if (ext4_check_dir_entry(inode, file, de, bh,
bh->b_data, bh->b_size,
offset)) {
/*
- * On error, skip the f_pos to the next block
+ * On error, skip to the next block
*/
- filp->f_pos = (filp->f_pos |
+ ctx->pos = (ctx->pos |
(sb->s_blocksize - 1)) + 1;
- brelse(bh);
- ret = stored;
- goto out;
+ break;
}
offset += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- u64 version = filp->f_version;
-
- error = filldir(dirent, de->name,
+ if (!dir_emit(ctx, de->name,
de->name_len,
- filp->f_pos,
le32_to_cpu(de->inode),
- get_dtype(sb, de->file_type));
- if (error)
- break;
- if (version != filp->f_version)
- goto revalidate;
- stored++;
+ get_dtype(sb, de->file_type))) {
+ brelse(bh);
+ return 0;
+ }
}
- filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+ ctx->pos += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
}
offset = 0;
brelse(bh);
+ if (ctx->pos < inode->i_size) {
+ if (!dir_relax(inode))
+ return 0;
+ }
}
-out:
- return ret;
+ return 0;
}
static inline int is_32bit_api(void)
@@ -490,16 +475,12 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
* for all entres on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
-static int call_filldir(struct file *filp, void *dirent,
- filldir_t filldir, struct fname *fname)
+static int call_filldir(struct file *file, struct dir_context *ctx,
+ struct fname *fname)
{
- struct dir_private_info *info = filp->private_data;
- loff_t curr_pos;
- struct inode *inode = file_inode(filp);
- struct super_block *sb;
- int error;
-
- sb = inode->i_sb;
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
if (!fname) {
ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
@@ -507,47 +488,44 @@ static int call_filldir(struct file *filp, void *dirent,
inode->i_ino, current->comm);
return 0;
}
- curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+ ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
while (fname) {
- error = filldir(dirent, fname->name,
- fname->name_len, curr_pos,
+ if (!dir_emit(ctx, fname->name,
+ fname->name_len,
fname->inode,
- get_dtype(sb, fname->file_type));
- if (error) {
- filp->f_pos = curr_pos;
+ get_dtype(sb, fname->file_type))) {
info->extra_fname = fname;
- return error;
+ return 1;
}
fname = fname->next;
}
return 0;
}
-static int ext4_dx_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
{
- struct dir_private_info *info = filp->private_data;
- struct inode *inode = file_inode(filp);
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
struct fname *fname;
int ret;
if (!info) {
- info = ext4_htree_create_dir_info(filp, filp->f_pos);
+ info = ext4_htree_create_dir_info(file, ctx->pos);
if (!info)
return -ENOMEM;
- filp->private_data = info;
+ file->private_data = info;
}
- if (filp->f_pos == ext4_get_htree_eof(filp))
+ if (ctx->pos == ext4_get_htree_eof(file))
return 0; /* EOF */
/* Some one has messed with f_pos; reset the world */
- if (info->last_pos != filp->f_pos) {
+ if (info->last_pos != ctx->pos) {
free_rb_tree_fname(&info->root);
info->curr_node = NULL;
info->extra_fname = NULL;
- info->curr_hash = pos2maj_hash(filp, filp->f_pos);
- info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+ info->curr_hash = pos2maj_hash(file, ctx->pos);
+ info->curr_minor_hash = pos2min_hash(file, ctx->pos);
}
/*
@@ -555,7 +533,7 @@ static int ext4_dx_readdir(struct file *filp,
* chain, return them first.
*/
if (info->extra_fname) {
- if (call_filldir(filp, dirent, filldir, info->extra_fname))
+ if (call_filldir(file, ctx, info->extra_fname))
goto finished;
info->extra_fname = NULL;
goto next_node;
@@ -569,17 +547,17 @@ static int ext4_dx_readdir(struct file *filp,
* cached entries.
*/
if ((!info->curr_node) ||
- (filp->f_version != inode->i_version)) {
+ (file->f_version != inode->i_version)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
- filp->f_version = inode->i_version;
- ret = ext4_htree_fill_tree(filp, info->curr_hash,
+ file->f_version = inode->i_version;
+ ret = ext4_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
return ret;
if (ret == 0) {
- filp->f_pos = ext4_get_htree_eof(filp);
+ ctx->pos = ext4_get_htree_eof(file);
break;
}
info->curr_node = rb_first(&info->root);
@@ -588,7 +566,7 @@ static int ext4_dx_readdir(struct file *filp,
fname = rb_entry(info->curr_node, struct fname, rb_hash);
info->curr_hash = fname->hash;
info->curr_minor_hash = fname->minor_hash;
- if (call_filldir(filp, dirent, filldir, fname))
+ if (call_filldir(file, ctx, fname))
break;
next_node:
info->curr_node = rb_next(info->curr_node);
@@ -599,7 +577,7 @@ static int ext4_dx_readdir(struct file *filp,
info->curr_minor_hash = fname->minor_hash;
} else {
if (info->next_hash == ~0) {
- filp->f_pos = ext4_get_htree_eof(filp);
+ ctx->pos = ext4_get_htree_eof(file);
break;
}
info->curr_hash = info->next_hash;
@@ -607,7 +585,7 @@ static int ext4_dx_readdir(struct file *filp,
}
}
finished:
- info->last_pos = filp->f_pos;
+ info->last_pos = ctx->pos;
return 0;
}
@@ -622,7 +600,7 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
const struct file_operations ext4_dir_operations = {
.llseek = ext4_dir_llseek,
.read = generic_read_dir,
- .readdir = ext4_readdir,
+ .iterate = ext4_readdir,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3b83cd604796..b577e45425b0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t;
#define EXT4_MB_STREAM_ALLOC 0x0800
/* Use reserved root blocks if needed */
#define EXT4_MB_USE_ROOT_BLOCKS 0x1000
+/* Use blocks from reserved pool */
+#define EXT4_MB_USE_RESERVED 0x2000
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -175,58 +177,34 @@ struct ext4_map_blocks {
};
/*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
- struct inode *inode;
- sector_t b_blocknr; /* start block number of extent */
- size_t b_size; /* size of extent */
- unsigned long b_state; /* state of the extent */
- unsigned long first_page, next_page; /* extent of pages */
- struct writeback_control *wbc;
- int io_done;
- int pages_written;
- int retval;
-};
-
-/*
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
-#define EXT4_IO_END_ERROR 0x0002
-#define EXT4_IO_END_DIRECT 0x0004
-
-struct ext4_io_page {
- struct page *p_page;
- atomic_t p_count;
-};
-
-#define MAX_IO_PAGES 128
+#define EXT4_IO_END_DIRECT 0x0002
/*
- * For converting uninitialized extents on a work queue.
- *
- * 'page' is only used from the writepage() path; 'pages' is only used for
- * buffered writes; they are used to keep page references until conversion
- * takes place. For AIO/DIO, neither field is filled in.
+ * For converting uninitialized extents on a work queue. 'handle' is used for
+ * buffered writeback.
*/
typedef struct ext4_io_end {
struct list_head list; /* per-file finished IO list */
+ handle_t *handle; /* handle reserved for extent
+ * conversion */
struct inode *inode; /* file being written to */
+ struct bio *bio; /* Linked list of completed
+ * bios covering the extent */
unsigned int flag; /* unwritten or not */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
struct kiocb *iocb; /* iocb struct for AIO */
int result; /* error value for AIO */
- int num_io_pages; /* for writepages() */
- struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */
+ atomic_t count; /* reference counter */
} ext4_io_end_t;
struct ext4_io_submit {
int io_op;
struct bio *io_bio;
ext4_io_end_t *io_end;
- struct ext4_io_page *io_page;
sector_t io_next_block;
};
@@ -403,7 +381,7 @@ struct flex_groups {
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -557,9 +535,8 @@ enum {
#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002
#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
EXT4_GET_BLOCKS_CREATE)
- /* Caller is from the delayed allocation writeout path,
- so set the magic i_delalloc_reserve_flag after taking the
- inode allocation semaphore for */
+ /* Caller is from the delayed allocation writeout path
+ * finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
/* caller is from the direct IO path, request to creation of an
unitialized extents if not allocated, split the uninitialized
@@ -571,8 +548,9 @@ enum {
/* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
- /* Punch out blocks of an extent */
-#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020
+ /* Eventual metadata allocation (due to growing extent tree)
+ * should not fail, so try to use reserved blocks for that.*/
+#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
/* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
/* Request will not result in inode size update (user for fallocate) */
@@ -593,11 +571,6 @@ enum {
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
/*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001
-
-/*
* ioctl commands
*/
#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
@@ -616,6 +589,7 @@ enum {
#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
+#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -890,6 +864,7 @@ struct ext4_inode_info {
rwlock_t i_es_lock;
struct list_head i_es_lru;
unsigned int i_es_lru_nr; /* protected by i_es_lock */
+ unsigned long i_touch_when; /* jiffies of last access */
/* ialloc */
ext4_group_t i_last_alloc_group;
@@ -914,12 +889,22 @@ struct ext4_inode_info {
qsize_t i_reserved_quota;
#endif
- /* completed IOs that might need unwritten extents handling */
- struct list_head i_completed_io_list;
+ /* Lock protecting lists below */
spinlock_t i_completed_io_lock;
+ /*
+ * Completed IOs that need unwritten extents handling and have
+ * transaction reserved
+ */
+ struct list_head i_rsv_conversion_list;
+ /*
+ * Completed IOs that need unwritten extents handling and don't have
+ * transaction reserved
+ */
+ struct list_head i_unrsv_conversion_list;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
- struct work_struct i_unwritten_work; /* deferred extent conversion */
+ struct work_struct i_rsv_conversion_work;
+ struct work_struct i_unrsv_conversion_work;
spinlock_t i_block_reservation_lock;
@@ -949,7 +934,7 @@ struct ext4_inode_info {
#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */
/*
- * Mount flags
+ * Mount flags set via mount options or defaults
*/
#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
@@ -981,8 +966,16 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
+/*
+ * Mount flags set either automatically (could not be set by mount option)
+ * based on per file system feature or property or in special cases such as
+ * distinguishing between explicit mount option definition and default.
+ */
#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly
specified delalloc */
+#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group
+ size of blocksize * 8
+ blocks */
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
@@ -1179,6 +1172,7 @@ struct ext4_sb_info {
unsigned int s_mount_flags;
unsigned int s_def_mount_opt;
ext4_fsblk_t s_sb_block;
+ atomic64_t s_resv_clusters;
kuid_t s_resuid;
kgid_t s_resgid;
unsigned short s_mount_state;
@@ -1247,7 +1241,6 @@ struct ext4_sb_info {
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
- unsigned int s_max_writeback_mb_bump;
unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
@@ -1283,8 +1276,10 @@ struct ext4_sb_info {
struct flex_groups *s_flex_groups;
ext4_group_t s_flex_groups_allocated;
- /* workqueue for dio unwritten */
- struct workqueue_struct *dio_unwritten_wq;
+ /* workqueue for unreserved extent conversions (dio) */
+ struct workqueue_struct *unrsv_conversion_wq;
+ /* workqueue for reserved extent conversions (buffered io) */
+ struct workqueue_struct *rsv_conversion_wq;
/* timer for periodic error stats printing */
struct timer_list s_err_report;
@@ -1309,6 +1304,7 @@ struct ext4_sb_info {
/* Reclaim extents from extent status tree */
struct shrinker s_es_shrinker;
struct list_head s_es_lru;
+ unsigned long s_es_last_sorted;
struct percpu_counter s_extent_cache_cnt;
spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
};
@@ -1333,6 +1329,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
return ino == EXT4_ROOT_INO ||
ino == EXT4_USR_QUOTA_INO ||
ino == EXT4_GRP_QUOTA_INO ||
+ ino == EXT4_BOOT_LOADER_INO ||
ino == EXT4_JOURNAL_INO ||
ino == EXT4_RESIZE_INO ||
(ino >= EXT4_FIRST_INO(sb) &&
@@ -1343,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ /* Writeback has to have conversion transaction reserved */
+ WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
+ !(io_end->flag & EXT4_IO_END_DIRECT));
io_end->flag |= EXT4_IO_END_UNWRITTEN;
atomic_inc(&EXT4_I(inode)->i_unwritten);
}
@@ -1374,6 +1374,7 @@ enum {
EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
+ EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1784,9 +1785,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
*/
#define ERR_BAD_DX_DIR -75000
-void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
- ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-
/*
* Timeout and state flag for lazy initialization inode thread.
*/
@@ -1908,6 +1906,13 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
struct buffer_head *bh);
/* balloc.c */
+extern void ext4_get_group_no_and_offset(struct super_block *sb,
+ ext4_fsblk_t blocknr,
+ ext4_group_t *blockgrpp,
+ ext4_grpblk_t *offsetp);
+extern ext4_group_t ext4_get_group_number(struct super_block *sb,
+ ext4_fsblk_t block);
+
extern void ext4_validate_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
unsigned int block_group,
@@ -1995,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_unwritten_io(struct inode *);
/* hash.c */
extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2084,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
-extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2092,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
- struct address_space *mapping, loff_t from,
- loff_t length, int flags);
+extern int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2107,9 +2114,10 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs);
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
-extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
-extern void ext4_ind_truncate(struct inode *inode);
-extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
+extern void ext4_ind_truncate(handle_t *, struct inode *inode);
+extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t first, ext4_lblk_t stop);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2117,6 +2125,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
+extern int ext4_ind_migrate(struct inode *inode);
/* namei.c */
extern int ext4_dirent_csum_verify(struct inode *inode,
@@ -2160,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
char nbuf[16]);
+
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
const char *, ...);
-#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
- __LINE__, ## message)
extern __printf(5, 6)
-void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
const char *, ...);
extern __printf(5, 6)
-void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
unsigned int, int);
extern __printf(4, 5)
void __ext4_abort(struct super_block *, const char *, unsigned int,
const char *, ...);
-#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
- __LINE__, ## message)
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
const char *, ...);
-#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
- __LINE__, ## message)
extern __printf(3, 4)
-void ext4_msg(struct super_block *, const char *, const char *, ...);
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
const char *, unsigned int, const char *);
-#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \
- __LINE__, msg)
extern __printf(7, 8)
void __ext4_grp_locked_error(const char *, unsigned int,
struct super_block *, ext4_group_t,
unsigned long, ext4_fsblk_t,
const char *, ...);
-#define ext4_grp_locked_error(sb, grp, message...) \
- __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+ __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...) \
+ __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...) \
+ __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...) \
+ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...) \
+ __ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg) \
+ __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+ __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+ fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_inode(inode, "", 0, block, " "); \
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_file(file, "", 0, block, " "); \
+} while (0)
+#define ext4_error(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error(sb, "", 0, " "); \
+} while (0)
+#define ext4_abort(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_abort(sb, "", 0, " "); \
+} while (0)
+#define ext4_warning(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_warning(sb, "", 0, " "); \
+} while (0)
+#define ext4_msg(sb, level, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_msg(sb, "", " "); \
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg) \
+ __dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \
+} while (0)
+
+#endif
+
extern void ext4_update_dynamic_rev(struct super_block *sb);
extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
__u32 compat);
@@ -2306,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
{
struct ext4_group_info ***grp_info;
long indexv, indexh;
+ BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
grp_info = EXT4_SB(sb)->s_group_info;
indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -2509,8 +2573,13 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
struct inode *parent,
struct inode *inode);
extern int ext4_read_inline_dir(struct file *filp,
- void *dirent, filldir_t filldir,
+ struct dir_context *ctx,
int *has_inline_data);
+extern int htree_inlinedir_to_tree(struct file *dir_file,
+ struct inode *dir, ext4_lblk_t block,
+ struct dx_hash_info *hinfo,
+ __u32 start_hash, __u32 start_minor_hash,
+ int *has_inline_data);
extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir,
@@ -2547,6 +2616,24 @@ extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
extern int ext4_handle_dirty_dirent_node(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
+#define S_SHIFT 12
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+ [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
+};
+
+static inline void ext4_set_de_type(struct super_block *sb,
+ struct ext4_dir_entry_2 *de,
+ umode_t mode) {
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+ de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
/* symlink.c */
extern const struct inode_operations ext4_symlink_inode_operations;
@@ -2569,19 +2656,18 @@ struct ext4_extent;
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
- int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern void ext4_ext_truncate(struct inode *);
-extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
- loff_t length);
+extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2609,18 +2695,27 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
/* move_extent.c */
+extern void ext4_double_down_write_data_sem(struct inode *first,
+ struct inode *second);
+extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
+ struct inode *donor_inode);
+void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
+void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 start_orig, __u64 start_donor,
__u64 len, __u64 *moved_len);
/* page-io.c */
extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
-extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern void ext4_end_io_work(struct work_struct *work);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+ struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_end_io_unrsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
@@ -2633,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
extern int ext4_mmp_csum_verify(struct super_block *sb,
struct mmp_struct *mmp);
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
enum ext4_state_bits {
BH_Uninit /* blocks are allocated but uninitialized on disk */
- = BH_JBDPrivateStart,
+ = BH_JBDPrivateStart,
BH_AllocFromCluster, /* allocated blocks were part of already
- * allocated cluster. Note that this flag will
- * never, ever appear in a buffer_head's state
- * flag. See EXT4_MAP_FROM_CLUSTER to see where
- * this is used. */
+ * allocated cluster. */
};
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 8643ff5bbeb7..51bc821ade90 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
0xffff);
}
+#define ext4_ext_dirty(handle, inode, path) \
+ __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+ struct inode *inode, struct ext4_ext_path *path);
+
#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7058975e3a55..72a3600aedbd 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -38,29 +38,43 @@ static void ext4_put_nojournal(handle_t *handle)
/*
* Wrappers for jbd2_journal_start/end.
*/
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int nblocks)
+static int ext4_journal_check_start(struct super_block *sb)
{
journal_t *journal;
- trace_ext4_journal_start(sb, nblocks, _RET_IP_);
+ might_sleep();
if (sb->s_flags & MS_RDONLY)
- return ERR_PTR(-EROFS);
-
+ return -EROFS;
WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
journal = EXT4_SB(sb)->s_journal;
- if (!journal)
- return ext4_get_nojournal();
/*
* Special case here: if the journal has aborted behind our
* backs (eg. EIO in the commit thread), then we still need to
* take the FS itself readonly cleanly.
*/
- if (is_journal_aborted(journal)) {
+ if (journal && is_journal_aborted(journal)) {
ext4_abort(sb, "Detected aborted journal");
- return ERR_PTR(-EROFS);
+ return -EROFS;
}
- return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+ return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+ int type, int blocks, int rsv_blocks)
+{
+ journal_t *journal;
+ int err;
+
+ trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+ err = ext4_journal_check_start(sb);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ journal = EXT4_SB(sb)->s_journal;
+ if (!journal)
+ return ext4_get_nojournal();
+ return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+ type, line);
}
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -84,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
return err;
}
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+ int type)
+{
+ struct super_block *sb;
+ int err;
+
+ if (!ext4_handle_valid(handle))
+ return ext4_get_nojournal();
+
+ sb = handle->h_journal->j_private;
+ trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+ _RET_IP_);
+ err = ext4_journal_check_start(sb);
+ if (err < 0) {
+ jbd2_journal_free_reserved(handle);
+ return ERR_PTR(err);
+ }
+
+ err = jbd2_journal_start_reserved(handle, type, line);
+ if (err < 0)
+ return ERR_PTR(err);
+ return handle;
+}
+
void ext4_journal_abort_handle(const char *caller, unsigned int line,
const char *err_fn, struct buffer_head *bh,
handle_t *handle, int err)
@@ -113,6 +151,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
{
int err = 0;
+ might_sleep();
+
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
if (err)
@@ -209,6 +249,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
{
int err = 0;
+ might_sleep();
+
+ set_buffer_meta(bh);
+ set_buffer_prio(bh);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
if (err) {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 4c216b1bf20c..2877258d9497 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -29,11 +29,13 @@
* block to complete the transaction.
*
* For extents-enabled fs we may have to allocate and modify up to
- * 5 levels of tree + root which are stored in the inode. */
+ * 5 levels of tree, data block (for each of these we need bitmap + group
+ * summaries), root which is stored in the inode, sb
+ */
#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
- ? 27U : 8U)
+ ? 20U : 8U)
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
@@ -132,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode)
#define EXT4_HT_MIGRATE 8
#define EXT4_HT_MOVE_EXTENTS 9
#define EXT4_HT_XATTR 10
-#define EXT4_HT_MAX 11
+#define EXT4_HT_EXT_CONVERT 11
+#define EXT4_HT_MAX 12
/**
* struct ext4_journal_cb_entry - Base structure for callback information.
@@ -194,16 +197,20 @@ static inline void ext4_journal_callback_add(handle_t *handle,
* ext4_journal_callback_del: delete a registered callback
* @handle: active journal transaction handle on which callback was registered
* @jce: registered journal callback entry to unregister
+ * Return true if object was successfully removed
*/
-static inline void ext4_journal_callback_del(handle_t *handle,
+static inline bool ext4_journal_callback_try_del(handle_t *handle,
struct ext4_journal_cb_entry *jce)
{
+ bool deleted;
struct ext4_sb_info *sbi =
EXT4_SB(handle->h_transaction->t_journal->j_private);
spin_lock(&sbi->s_md_lock);
+ deleted = !list_empty(&jce->jce_list);
list_del_init(&jce->jce_list);
spin_unlock(&sbi->s_md_lock);
+ return deleted;
}
int
@@ -259,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int nblocks);
+ int type, int blocks, int rsv_blocks);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -294,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
}
#define ext4_journal_start_sb(sb, type, nblocks) \
- __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
#define ext4_journal_start(inode, type, nblocks) \
- __ext4_journal_start((inode), __LINE__, (type), (nblocks))
+ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
static inline handle_t *__ext4_journal_start(struct inode *inode,
unsigned int line, int type,
- int nblocks)
+ int blocks, int rsv_blocks)
{
- return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
+ return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+ rsv_blocks);
}
#define ext4_journal_stop(handle) \
__ext4_journal_stop(__func__, __LINE__, (handle))
+#define ext4_journal_start_reserved(handle, type) \
+ __ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+ int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+ if (ext4_handle_valid(handle))
+ jbd2_journal_free_reserved(handle);
+}
+
static inline handle_t *ext4_journal_current_handle(void)
{
return journal_current_handle();
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9c6d06dcef8b..7097b0f680e6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -157,11 +157,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
* - ENOMEM
* - EIO
*/
-#define ext4_ext_dirty(handle, inode, path) \
- __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
-static int __ext4_ext_dirty(const char *where, unsigned int line,
- handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+ struct inode *inode, struct ext4_ext_path *path)
{
int err;
if (path->p_bh) {
@@ -1813,39 +1810,101 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
}
depth = ext_depth(inode);
ex = path[depth].p_ext;
+ eh = path[depth].p_hdr;
if (unlikely(path[depth].p_hdr == NULL)) {
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
return -EIO;
}
/* try to insert block into found extent and return */
- if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
- && ext4_can_extents_be_merged(inode, ex, newext)) {
- ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
- ext4_ext_is_uninitialized(newext),
- ext4_ext_get_actual_len(newext),
- le32_to_cpu(ex->ee_block),
- ext4_ext_is_uninitialized(ex),
- ext4_ext_get_actual_len(ex),
- ext4_ext_pblock(ex));
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- return err;
+ if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) {
/*
- * ext4_can_extents_be_merged should have checked that either
- * both extents are uninitialized, or both aren't. Thus we
- * need to check only one of them here.
+ * Try to see whether we should rather test the extent on
+ * right from ex, or from the left of ex. This is because
+ * ext4_ext_find_extent() can return either extent on the
+ * left, or on the right from the searched position. This
+ * will make merging more effective.
*/
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
- ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ if (ex < EXT_LAST_EXTENT(eh) &&
+ (le32_to_cpu(ex->ee_block) +
+ ext4_ext_get_actual_len(ex) <
+ le32_to_cpu(newext->ee_block))) {
+ ex += 1;
+ goto prepend;
+ } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
+ (le32_to_cpu(newext->ee_block) +
+ ext4_ext_get_actual_len(newext) <
+ le32_to_cpu(ex->ee_block)))
+ ex -= 1;
+
+ /* Try to append newext to ex */
+ if (ext4_can_extents_be_merged(inode, ex, newext)) {
+ ext_debug("append [%d]%d block to %u:[%d]%d"
+ "(from %llu)\n",
+ ext4_ext_is_uninitialized(newext),
+ ext4_ext_get_actual_len(newext),
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_is_uninitialized(ex),
+ ext4_ext_get_actual_len(ex),
+ ext4_ext_pblock(ex));
+ err = ext4_ext_get_access(handle, inode,
+ path + depth);
+ if (err)
+ return err;
+
+ /*
+ * ext4_can_extents_be_merged should have checked
+ * that either both extents are uninitialized, or
+ * both aren't. Thus we need to check only one of
+ * them here.
+ */
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(newext));
- if (uninitialized)
- ext4_ext_mark_uninitialized(ex);
- eh = path[depth].p_hdr;
- nearex = ex;
- goto merge;
+ if (uninitialized)
+ ext4_ext_mark_uninitialized(ex);
+ eh = path[depth].p_hdr;
+ nearex = ex;
+ goto merge;
+ }
+
+prepend:
+ /* Try to prepend newext to ex */
+ if (ext4_can_extents_be_merged(inode, newext, ex)) {
+ ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
+ "(from %llu)\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_is_uninitialized(newext),
+ ext4_ext_get_actual_len(newext),
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_is_uninitialized(ex),
+ ext4_ext_get_actual_len(ex),
+ ext4_ext_pblock(ex));
+ err = ext4_ext_get_access(handle, inode,
+ path + depth);
+ if (err)
+ return err;
+
+ /*
+ * ext4_can_extents_be_merged should have checked
+ * that either both extents are uninitialized, or
+ * both aren't. Thus we need to check only one of
+ * them here.
+ */
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex->ee_block = newext->ee_block;
+ ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ + ext4_ext_get_actual_len(newext));
+ if (uninitialized)
+ ext4_ext_mark_uninitialized(ex);
+ eh = path[depth].p_hdr;
+ nearex = ex;
+ goto merge;
+ }
}
depth = ext_depth(inode);
@@ -1880,8 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
* There is no free space in the found leaf.
* We're gonna add a new leaf in the tree.
*/
- if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
- flags = EXT4_MB_USE_ROOT_BLOCKS;
+ if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+ flags = EXT4_MB_USE_RESERVED;
err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
if (err)
goto cleanup;
@@ -2066,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
next_del = ext4_find_delayed_extent(inode, &es);
if (!exists && next_del) {
exists = 1;
- flags |= FIEMAP_EXTENT_DELALLOC;
+ flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
}
up_read(&EXT4_I(inode)->i_data_sem);
@@ -2269,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
}
/*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
*
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worst case, each tree level
+ * index/leaf needs to be changed in case of a tree split.
*
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree to split more
+ * than once, but this is really rare.
*/
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
int index;
int depth;
@@ -2290,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
depth = ext_depth(inode);
- if (chunk)
+ if (extents <= 1)
index = depth * 2;
else
index = depth * 3;
@@ -2298,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
return index;
}
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+ else if (ext4_should_journal_data(inode))
+ return EXT4_FREE_BLOCKS_FORGET;
+ return 0;
+}
+
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent *ex,
- ext4_fsblk_t *partial_cluster,
+ long long *partial_cluster,
ext4_lblk_t from, ext4_lblk_t to)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
ext4_fsblk_t pblk;
- int flags = 0;
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
- else if (ext4_should_journal_data(inode))
- flags |= EXT4_FREE_BLOCKS_FORGET;
+ int flags = get_default_free_blocks_flags(inode);
/*
* For bigalloc file systems, we never free a partial cluster
@@ -2329,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* partial cluster here.
*/
pblk = ext4_ext_pblock(ex) + ee_len - 1;
- if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+ if ((*partial_cluster > 0) &&
+ (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, *partial_cluster),
sbi->s_cluster_ratio, flags);
@@ -2355,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
&& to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
ext4_lblk_t num;
+ unsigned int unaligned;
num = le32_to_cpu(ex->ee_block) + ee_len - from;
pblk = ext4_ext_pblock(ex) + ee_len - num;
- ext_debug("free last %u blocks starting %llu\n", num, pblk);
+ /*
+ * Usually we want to free partial cluster at the end of the
+ * extent, except for the situation when the cluster is still
+ * used by any other extent (partial_cluster is negative).
+ */
+ if (*partial_cluster < 0 &&
+ -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+ flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+ ext_debug("free last %u blocks starting %llu partial %lld\n",
+ num, pblk, *partial_cluster);
ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
/*
* If the block range to be freed didn't start at the
* beginning of a cluster, and we removed the entire
- * extent, save the partial cluster here, since we
- * might need to delete if we determine that the
- * truncate operation has removed all of the blocks in
- * the cluster.
+ * extent and the cluster is not used by any other extent,
+ * save the partial cluster here, since we might need to
+ * delete if we determine that the truncate operation has
+ * removed all of the blocks in the cluster.
+ *
+ * On the other hand, if we did not manage to free the whole
+ * extent, we have to mark the cluster as used (store negative
+ * cluster number in partial_cluster).
*/
- if (pblk & (sbi->s_cluster_ratio - 1) &&
- (ee_len == num))
+ unaligned = pblk & (sbi->s_cluster_ratio - 1);
+ if (unaligned && (ee_len == num) &&
+ (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
*partial_cluster = EXT4_B2C(sbi, pblk);
- else
+ else if (unaligned)
+ *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+ else if (*partial_cluster > 0)
*partial_cluster = 0;
- } else if (from == le32_to_cpu(ex->ee_block)
- && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
- /* head removal */
- ext4_lblk_t num;
- ext4_fsblk_t start;
-
- num = to - from;
- start = ext4_ext_pblock(ex);
-
- ext_debug("free first %u blocks starting %llu\n", num, start);
- ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
- } else {
- printk(KERN_INFO "strange request: removal(2) "
- "%u-%u from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), ee_len);
- }
+ } else
+ ext4_error(sbi->s_sb, "strange request: removal(2) "
+ "%u-%u from %u:%u\n",
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
return 0;
}
@@ -2402,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* @handle: The journal handle
* @inode: The files inode
* @path: The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ * have been released from it. It is negative in case
+ * the cluster is still used.
* @start: The first block to remove
* @end: The last block to remove
*/
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+ struct ext4_ext_path *path,
+ long long *partial_cluster,
ext4_lblk_t start, ext4_lblk_t end)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2420,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
unsigned short ex_ee_len;
unsigned uninitialized = 0;
struct ext4_extent *ex;
+ ext4_fsblk_t pblk;
/* the header must be checked already in ext4_ext_remove_space() */
ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2431,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
return -EIO;
}
/* find where to start removing */
- ex = EXT_LAST_EXTENT(eh);
+ ex = path[depth].p_ext;
+ if (!ex)
+ ex = EXT_LAST_EXTENT(eh);
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2458,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
/* If this extent is beyond the end of the hole, skip it */
if (end < ex_ee_block) {
+ /*
+ * We're going to skip this extent and move to another,
+ * so if this extent is not cluster aligned we have
+ * to mark the current cluster as used to avoid
+ * accidentally freeing it later on
+ */
+ pblk = ext4_ext_pblock(ex);
+ if (pblk & (sbi->s_cluster_ratio - 1))
+ *partial_cluster =
+ -((long long)EXT4_B2C(sbi, pblk));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2533,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
sizeof(struct ext4_extent));
}
le16_add_cpu(&eh->eh_entries, -1);
- } else
+ } else if (*partial_cluster > 0)
*partial_cluster = 0;
err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2551,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
err = ext4_ext_correct_indexes(handle, inode, path);
/*
- * If there is still a entry in the leaf node, check to see if
- * it references the partial cluster. This is the only place
- * where it could; if it doesn't, we can free the cluster.
+ * Free the partial cluster only if the current extent does not
+ * reference it. Otherwise we might free a used cluster.
*/
- if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+ if (*partial_cluster > 0 &&
(EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
*partial_cluster)) {
- int flags = EXT4_FREE_BLOCKS_FORGET;
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ int flags = get_default_free_blocks_flags(inode);
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, *partial_cluster),
@@ -2599,13 +2680,13 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
return 1;
}
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
- ext4_lblk_t end)
+int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
{
struct super_block *sb = inode->i_sb;
int depth = ext_depth(inode);
struct ext4_ext_path *path = NULL;
- ext4_fsblk_t partial_cluster = 0;
+ long long partial_cluster = 0;
handle_t *handle;
int i = 0, err = 0;
@@ -2617,7 +2698,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
return PTR_ERR(handle);
again:
- trace_ext4_ext_remove_space(inode, start, depth);
+ trace_ext4_ext_remove_space(inode, start, end, depth);
/*
* Check if we are removing extents inside the extent tree. If that
@@ -2667,12 +2748,14 @@ again:
/*
* Split the extent in two so that 'end' is the last
- * block in the first new extent
+ * block in the first new extent. Also we should not
+ * fail removing space due to ENOSPC so try to use
+ * reserved block if that happens.
*/
err = ext4_split_extent_at(handle, inode, path,
- end + 1, split_flag,
- EXT4_GET_BLOCKS_PRE_IO |
- EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+ end + 1, split_flag,
+ EXT4_GET_BLOCKS_PRE_IO |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL);
if (err < 0)
goto out;
@@ -2783,17 +2866,14 @@ again:
}
}
- trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
- path->p_hdr->eh_entries);
+ trace_ext4_ext_remove_space_done(inode, start, end, depth,
+ partial_cluster, path->p_hdr->eh_entries);
/* If we still have something in the partial cluster and we have removed
* even the first extent, then we should free the blocks in the partial
* cluster as well. */
- if (partial_cluster && path->p_hdr->eh_entries == 0) {
- int flags = EXT4_FREE_BLOCKS_FORGET;
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+ int flags = get_default_free_blocks_flags(inode);
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(EXT4_SB(sb), partial_cluster),
@@ -3147,35 +3227,35 @@ out:
static int ext4_ext_convert_to_initialized(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path *path)
+ struct ext4_ext_path *path,
+ int flags)
{
struct ext4_sb_info *sbi;
struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
struct ext4_extent zero_ex;
- struct ext4_extent *ex;
+ struct ext4_extent *ex, *abut_ex;
ext4_lblk_t ee_block, eof_block;
- unsigned int ee_len, depth;
- int allocated, max_zeroout = 0;
+ unsigned int ee_len, depth, map_len = map->m_len;
+ int allocated = 0, max_zeroout = 0;
int err = 0;
int split_flag = 0;
ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
"block %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)map->m_lblk, map->m_len);
+ (unsigned long long)map->m_lblk, map_len);
sbi = EXT4_SB(inode->i_sb);
eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
inode->i_sb->s_blocksize_bits;
- if (eof_block < map->m_lblk + map->m_len)
- eof_block = map->m_lblk + map->m_len;
+ if (eof_block < map->m_lblk + map_len)
+ eof_block = map->m_lblk + map_len;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- allocated = ee_len - (map->m_lblk - ee_block);
zero_ex.ee_len = 0;
trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
@@ -3186,77 +3266,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
/*
* Attempt to transfer newly initialized blocks from the currently
- * uninitialized extent to its left neighbor. This is much cheaper
+ * uninitialized extent to its neighbor. This is much cheaper
* than an insertion followed by a merge as those involve costly
- * memmove() calls. This is the common case in steady state for
- * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
- * writes.
+ * memmove() calls. Transferring to the left is the common case in
+ * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
+ * followed by append writes.
*
* Limitations of the current logic:
- * - L1: we only deal with writes at the start of the extent.
- * The approach could be extended to writes at the end
- * of the extent but this scenario was deemed less common.
- * - L2: we do not deal with writes covering the whole extent.
+ * - L1: we do not deal with writes covering the whole extent.
* This would require removing the extent if the transfer
* is possible.
- * - L3: we only attempt to merge with an extent stored in the
+ * - L2: we only attempt to merge with an extent stored in the
* same extent tree node.
*/
- if ((map->m_lblk == ee_block) && /*L1*/
- (map->m_len < ee_len) && /*L2*/
- (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/
- struct ext4_extent *prev_ex;
+ if ((map->m_lblk == ee_block) &&
+ /* See if we can merge left */
+ (map_len < ee_len) && /*L1*/
+ (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/
ext4_lblk_t prev_lblk;
ext4_fsblk_t prev_pblk, ee_pblk;
- unsigned int prev_len, write_len;
+ unsigned int prev_len;
- prev_ex = ex - 1;
- prev_lblk = le32_to_cpu(prev_ex->ee_block);
- prev_len = ext4_ext_get_actual_len(prev_ex);
- prev_pblk = ext4_ext_pblock(prev_ex);
+ abut_ex = ex - 1;
+ prev_lblk = le32_to_cpu(abut_ex->ee_block);
+ prev_len = ext4_ext_get_actual_len(abut_ex);
+ prev_pblk = ext4_ext_pblock(abut_ex);
ee_pblk = ext4_ext_pblock(ex);
- write_len = map->m_len;
/*
- * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+ * A transfer of blocks from 'ex' to 'abut_ex' is allowed
* upon those conditions:
- * - C1: prev_ex is initialized,
- * - C2: prev_ex is logically abutting ex,
- * - C3: prev_ex is physically abutting ex,
- * - C4: prev_ex can receive the additional blocks without
+ * - C1: abut_ex is initialized,
+ * - C2: abut_ex is logically abutting ex,
+ * - C3: abut_ex is physically abutting ex,
+ * - C4: abut_ex can receive the additional blocks without
* overflowing the (initialized) length limit.
*/
- if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/
+ if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/
((prev_lblk + prev_len) == ee_block) && /*C2*/
((prev_pblk + prev_len) == ee_pblk) && /*C3*/
- (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/
+ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
trace_ext4_ext_convert_to_initialized_fastpath(inode,
- map, ex, prev_ex);
+ map, ex, abut_ex);
- /* Shift the start of ex by 'write_len' blocks */
- ex->ee_block = cpu_to_le32(ee_block + write_len);
- ext4_ext_store_pblock(ex, ee_pblk + write_len);
- ex->ee_len = cpu_to_le16(ee_len - write_len);
+ /* Shift the start of ex by 'map_len' blocks */
+ ex->ee_block = cpu_to_le32(ee_block + map_len);
+ ext4_ext_store_pblock(ex, ee_pblk + map_len);
+ ex->ee_len = cpu_to_le16(ee_len - map_len);
ext4_ext_mark_uninitialized(ex); /* Restore the flag */
- /* Extend prev_ex by 'write_len' blocks */
- prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+ /* Extend abut_ex by 'map_len' blocks */
+ abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
+
+ /* Result: number of initialized blocks past m_lblk */
+ allocated = map_len;
+ }
+ } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
+ (map_len < ee_len) && /*L1*/
+ ex < EXT_LAST_EXTENT(eh)) { /*L2*/
+ /* See if we can merge right */
+ ext4_lblk_t next_lblk;
+ ext4_fsblk_t next_pblk, ee_pblk;
+ unsigned int next_len;
+
+ abut_ex = ex + 1;
+ next_lblk = le32_to_cpu(abut_ex->ee_block);
+ next_len = ext4_ext_get_actual_len(abut_ex);
+ next_pblk = ext4_ext_pblock(abut_ex);
+ ee_pblk = ext4_ext_pblock(ex);
+
+ /*
+ * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+ * upon those conditions:
+ * - C1: abut_ex is initialized,
+ * - C2: abut_ex is logically abutting ex,
+ * - C3: abut_ex is physically abutting ex,
+ * - C4: abut_ex can receive the additional blocks without
+ * overflowing the (initialized) length limit.
+ */
+ if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/
+ ((map->m_lblk + map_len) == next_lblk) && /*C2*/
+ ((ee_pblk + ee_len) == next_pblk) && /*C3*/
+ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ trace_ext4_ext_convert_to_initialized_fastpath(inode,
+ map, ex, abut_ex);
- /* Mark the block containing both extents as dirty */
- ext4_ext_dirty(handle, inode, path + depth);
+ /* Shift the start of abut_ex by 'map_len' blocks */
+ abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
+ ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
+ ex->ee_len = cpu_to_le16(ee_len - map_len);
+ ext4_ext_mark_uninitialized(ex); /* Restore the flag */
- /* Update path to point to the right extent */
- path[depth].p_ext = prev_ex;
+ /* Extend abut_ex by 'map_len' blocks */
+ abut_ex->ee_len = cpu_to_le16(next_len + map_len);
/* Result: number of initialized blocks past m_lblk */
- allocated = write_len;
- goto out;
+ allocated = map_len;
}
}
+ if (allocated) {
+ /* Mark the block containing both extents as dirty */
+ ext4_ext_dirty(handle, inode, path + depth);
+
+ /* Update path to point to the right extent */
+ path[depth].p_ext = abut_ex;
+ goto out;
+ } else
+ allocated = ee_len - (map->m_lblk - ee_block);
WARN_ON(map->m_lblk < ee_block);
/*
@@ -3330,7 +3454,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
}
allocated = ext4_split_extent(handle, inode, path,
- &split_map, split_flag, 0);
+ &split_map, split_flag, flags);
if (allocated < 0)
err = allocated;
@@ -3537,7 +3661,7 @@ int ext4_find_delalloc_range(struct inode *inode,
{
struct extent_status es;
- ext4_es_find_delayed_extent(inode, lblk_start, &es);
+ ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
if (es.es_len == 0)
return 0; /* there is no delay extent in this tree */
else if (es.es_lblk <= lblk_start &&
@@ -3650,6 +3774,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
flags, allocated);
ext4_ext_show_leaf(inode, path);
+ /*
+ * When writing into uninitialized space, we should not fail to
+ * allocate metadata blocks for the new extent block if needed.
+ */
+ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
+
trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
allocated, newblock);
@@ -3713,7 +3843,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
}
/* buffered write, writepage time, convert*/
- ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
+ ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
out:
@@ -4252,53 +4382,18 @@ out2:
}
out3:
- trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
+ trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated);
return err ? err : allocated;
}
-void ext4_ext_truncate(struct inode *inode)
+void ext4_ext_truncate(handle_t *handle, struct inode *inode)
{
- struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
ext4_lblk_t last_block;
- handle_t *handle;
- loff_t page_len;
int err = 0;
/*
- * finish any pending end_io work so we won't run the risk of
- * converting any truncated blocks to initialized later
- */
- ext4_flush_unwritten_io(inode);
-
- /*
- * probably first extent we're gonna free will be last in block
- */
- err = ext4_writepage_trans_blocks(inode);
- handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err);
- if (IS_ERR(handle))
- return;
-
- if (inode->i_size % PAGE_CACHE_SIZE != 0) {
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
- err = ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0);
-
- if (err)
- goto out_stop;
- }
-
- if (ext4_orphan_add(handle, inode))
- goto out_stop;
-
- down_write(&EXT4_I(inode)->i_data_sem);
-
- ext4_discard_preallocations(inode);
-
- /*
* TODO: optimization is possible here.
* Probably we need not scan at all,
* because page truncation is enough.
@@ -4313,29 +4408,6 @@ void ext4_ext_truncate(struct inode *inode)
err = ext4_es_remove_extent(inode, last_block,
EXT_MAX_BLOCKS - last_block);
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
-
- /* In a multi-transaction truncate, we only make the final
- * transaction synchronous.
- */
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
-
- up_write(&EXT4_I(inode)->i_data_sem);
-
-out_stop:
- /*
- * If this was a simple ftruncate() and the file will remain alive,
- * then we need to clear up the orphan record which we created above.
- * However, if this was a real unlink then we were called by
- * ext4_delete_inode(), and we allow that function to clean up the
- * orphan info for us.
- */
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
-
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_stop(handle);
}
static void ext4_falloc_update_inode(struct inode *inode,
@@ -4393,7 +4465,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
- return ext4_punch_hole(file, offset, len);
+ return ext4_punch_hole(inode, offset, len);
ret = ext4_convert_inline_data(inode);
if (ret)
@@ -4495,10 +4567,9 @@ retry:
* function, to convert the fallocated extents after IO is completed.
* Returns 0 on success.
*/
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len)
{
- handle_t *handle;
unsigned int max_blocks;
int ret = 0;
int ret2 = 0;
@@ -4513,16 +4584,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
map.m_lblk);
/*
- * credits to insert 1 extent into extent tree
+ * This is somewhat ugly but the idea is clear: When transaction is
+ * reserved, everything goes into it. Otherwise we rather start several
+ * smaller transactions for conversion of each extent separately.
*/
- credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ credits = 0;
+ } else {
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ }
while (ret >= 0 && ret < max_blocks) {
map.m_lblk += ret;
map.m_len = (max_blocks -= ret);
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- break;
+ if (credits) {
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
}
ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_IO_CONVERT_EXT);
@@ -4533,10 +4620,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
inode->i_ino, map.m_lblk,
map.m_len, ret);
ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
- if (ret <= 0 || ret2 )
+ if (credits)
+ ret2 = ext4_journal_stop(handle);
+ if (ret <= 0 || ret2)
break;
}
+ if (!credits)
+ ret2 = ext4_journal_stop(handle);
return ret > 0 ? ret2 : ret;
}
@@ -4555,9 +4645,10 @@ static int ext4_find_delayed_extent(struct inode *inode,
struct extent_status es;
ext4_lblk_t block, next_del;
- ext4_es_find_delayed_extent(inode, newes->es_lblk, &es);
-
if (newes->es_pblk == 0) {
+ ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
+ newes->es_lblk + newes->es_len - 1, &es);
+
/*
* No extent in extent-tree contains block @newes->es_pblk,
* then the block may stay in 1)a hole or 2)delayed-extent.
@@ -4577,7 +4668,7 @@ static int ext4_find_delayed_extent(struct inode *inode,
}
block = newes->es_lblk + newes->es_len;
- ext4_es_find_delayed_extent(inode, block, &es);
+ ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
if (es.es_len == 0)
next_del = EXT_MAX_BLOCKS;
else
@@ -4605,7 +4696,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
error = ext4_get_inode_loc(inode, &iloc);
if (error)
return error;
- physical = iloc.bh->b_blocknr << blockbits;
+ physical = (__u64)iloc.bh->b_blocknr << blockbits;
offset = EXT4_GOOD_OLD_INODE_SIZE +
EXT4_I(inode)->i_extra_isize;
physical += offset;
@@ -4613,7 +4704,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
flags |= FIEMAP_EXTENT_DATA_INLINE;
brelse(iloc.bh);
} else { /* external block */
- physical = EXT4_I(inode)->i_file_acl << blockbits;
+ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
length = inode->i_sb->s_blocksize;
}
@@ -4623,187 +4714,6 @@ static int ext4_xattr_fiemap(struct inode *inode,
return (error < 0 ? error : 0);
}
-/*
- * ext4_ext_punch_hole
- *
- * Punches a hole of "length" bytes in a file starting
- * at byte "offset"
- *
- * @inode: The inode of the file to punch a hole in
- * @offset: The starting byte offset of the hole
- * @length: The length of the hole
- *
- * Returns the number of blocks removed or negative on err
- */
-int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
-{
- struct inode *inode = file_inode(file);
- struct super_block *sb = inode->i_sb;
- ext4_lblk_t first_block, stop_block;
- struct address_space *mapping = inode->i_mapping;
- handle_t *handle;
- loff_t first_page, last_page, page_len;
- loff_t first_page_offset, last_page_offset;
- int credits, err = 0;
-
- /*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- err = filemap_write_and_wait_range(mapping,
- offset, offset + length - 1);
-
- if (err)
- return err;
- }
-
- mutex_lock(&inode->i_mutex);
- /* It's not possible punch hole on append only file */
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
- err = -EPERM;
- goto out_mutex;
- }
- if (IS_SWAPFILE(inode)) {
- err = -ETXTBSY;
- goto out_mutex;
- }
-
- /* No need to punch hole beyond i_size */
- if (offset >= inode->i_size)
- goto out_mutex;
-
- /*
- * If the hole extends beyond i_size, set the hole
- * to end after the page that contains i_size
- */
- if (offset + length > inode->i_size) {
- length = inode->i_size +
- PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
- offset;
- }
-
- first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- last_page = (offset + length) >> PAGE_CACHE_SHIFT;
-
- first_page_offset = first_page << PAGE_CACHE_SHIFT;
- last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
- /* Now release the pages */
- if (last_page_offset > first_page_offset) {
- truncate_pagecache_range(inode, first_page_offset,
- last_page_offset - 1);
- }
-
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- err = ext4_flush_unwritten_io(inode);
- if (err)
- goto out_dio;
- inode_dio_wait(inode);
-
- credits = ext4_writepage_trans_blocks(inode);
- handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto out_dio;
- }
-
-
- /*
- * Now we need to zero out the non-page-aligned data in the
- * pages at the start and tail of the hole, and unmap the buffer
- * heads for the block aligned regions of the page that were
- * completely zeroed.
- */
- if (first_page > last_page) {
- /*
- * If the file space being truncated is contained within a page
- * just zero out and unmap the middle of that page
- */
- err = ext4_discard_partial_page_buffers(handle,
- mapping, offset, length, 0);
-
- if (err)
- goto out;
- } else {
- /*
- * zero out and unmap the partial page that contains
- * the start of the hole
- */
- page_len = first_page_offset - offset;
- if (page_len > 0) {
- err = ext4_discard_partial_page_buffers(handle, mapping,
- offset, page_len, 0);
- if (err)
- goto out;
- }
-
- /*
- * zero out and unmap the partial page that contains
- * the end of the hole
- */
- page_len = offset + length - last_page_offset;
- if (page_len > 0) {
- err = ext4_discard_partial_page_buffers(handle, mapping,
- last_page_offset, page_len, 0);
- if (err)
- goto out;
- }
- }
-
- /*
- * If i_size is contained in the last page, we need to
- * unmap and zero the partial page after i_size
- */
- if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
- inode->i_size % PAGE_CACHE_SIZE != 0) {
-
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
- if (page_len > 0) {
- err = ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0);
-
- if (err)
- goto out;
- }
- }
-
- first_block = (offset + sb->s_blocksize - 1) >>
- EXT4_BLOCK_SIZE_BITS(sb);
- stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
-
- /* If there are no blocks to remove, return now */
- if (first_block >= stop_block)
- goto out;
-
- down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
-
- err = ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
- err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
-
- ext4_discard_preallocations(inode);
-
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
-
- up_write(&EXT4_I(inode)->i_data_sem);
-
-out:
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_stop(handle);
-out_dio:
- ext4_inode_resume_unlocked_dio(inode);
-out_mutex:
- mutex_unlock(&inode->i_mutex);
- return err;
-}
-
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index fe3337a85ede..ee018d5f397e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -10,6 +10,7 @@
* Ext4 extents status tree core functions.
*/
#include <linux/rbtree.h>
+#include <linux/list_sort.h>
#include "ext4.h"
#include "extents_status.h"
#include "ext4_extents.h"
@@ -232,14 +233,16 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
}
/*
- * ext4_es_find_delayed_extent: find the 1st delayed extent covering @es->lblk
- * if it exists, otherwise, the next extent after @es->lblk.
+ * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
+ * @lblk if it exists, otherwise, the next delayed extent after @lblk.
*
* @inode: the inode which owns delayed extents
* @lblk: the offset where we start to search
+ * @end: the offset where we stop searching
* @es: delayed extent that we found
*/
-void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+void ext4_es_find_delayed_extent_range(struct inode *inode,
+ ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es)
{
struct ext4_es_tree *tree = NULL;
@@ -247,7 +250,8 @@ void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
struct rb_node *node;
BUG_ON(es == NULL);
- trace_ext4_es_find_delayed_extent_enter(inode, lblk);
+ BUG_ON(end < lblk);
+ trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
read_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
@@ -270,6 +274,10 @@ out:
if (es1 && !ext4_es_is_delayed(es1)) {
while ((node = rb_next(&es1->rb_node)) != NULL) {
es1 = rb_entry(node, struct extent_status, rb_node);
+ if (es1->es_lblk > end) {
+ es1 = NULL;
+ break;
+ }
if (ext4_es_is_delayed(es1))
break;
}
@@ -284,8 +292,7 @@ out:
read_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_lru_add(inode);
- trace_ext4_es_find_delayed_extent_exit(inode, es);
+ trace_ext4_es_find_delayed_extent_range_exit(inode, es);
}
static struct extent_status *
@@ -665,7 +672,6 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_lru_add(inode);
ext4_es_print_tree(inode);
return err;
@@ -727,7 +733,6 @@ out:
read_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_lru_add(inode);
trace_ext4_es_lookup_extent_exit(inode, es, found);
return found;
}
@@ -871,12 +876,28 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
EXTENT_STATUS_WRITTEN);
}
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
+{
+ struct ext4_inode_info *eia, *eib;
+ eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+ eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+
+ if (eia->i_touch_when == eib->i_touch_when)
+ return 0;
+ if (time_after(eia->i_touch_when, eib->i_touch_when))
+ return 1;
+ else
+ return -1;
+}
+
static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
struct ext4_sb_info *sbi = container_of(shrink,
struct ext4_sb_info, s_es_shrinker);
struct ext4_inode_info *ei;
- struct list_head *cur, *tmp, scanned;
+ struct list_head *cur, *tmp;
+ LIST_HEAD(skiped);
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk = 0;
@@ -886,23 +907,41 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
if (!nr_to_scan)
return ret;
- INIT_LIST_HEAD(&scanned);
-
spin_lock(&sbi->s_es_lru_lock);
+
+ /*
+ * If the inode that is at the head of LRU list is newer than
+ * last_sorted time, that means that we need to sort this list.
+ */
+ ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
+ if (sbi->s_es_last_sorted < ei->i_touch_when) {
+ list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+ sbi->s_es_last_sorted = jiffies;
+ }
+
list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
- list_move_tail(cur, &scanned);
+ /*
+ * If we have already reclaimed all extents from extent
+ * status tree, just stop the loop immediately.
+ */
+ if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+ break;
ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
- read_lock(&ei->i_es_lock);
- if (ei->i_es_lru_nr == 0) {
- read_unlock(&ei->i_es_lock);
+ /* Skip the inode that is newer than the last_sorted time */
+ if (sbi->s_es_last_sorted < ei->i_touch_when) {
+ list_move_tail(cur, &skiped);
continue;
}
- read_unlock(&ei->i_es_lock);
+
+ if (ei->i_es_lru_nr == 0)
+ continue;
write_lock(&ei->i_es_lock);
ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+ if (ei->i_es_lru_nr == 0)
+ list_del_init(&ei->i_es_lru);
write_unlock(&ei->i_es_lock);
nr_shrunk += ret;
@@ -910,7 +949,9 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
if (nr_to_scan == 0)
break;
}
- list_splice_tail(&scanned, &sbi->s_es_lru);
+
+ /* Move the newer inodes into the tail of the LRU list. */
+ list_splice_tail(&skiped, &sbi->s_es_lru);
spin_unlock(&sbi->s_es_lru_lock);
ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
@@ -918,21 +959,19 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
return ret;
}
-void ext4_es_register_shrinker(struct super_block *sb)
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
- struct ext4_sb_info *sbi;
-
- sbi = EXT4_SB(sb);
INIT_LIST_HEAD(&sbi->s_es_lru);
spin_lock_init(&sbi->s_es_lru_lock);
+ sbi->s_es_last_sorted = 0;
sbi->s_es_shrinker.shrink = ext4_es_shrink;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
register_shrinker(&sbi->s_es_shrinker);
}
-void ext4_es_unregister_shrinker(struct super_block *sb)
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
- unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+ unregister_shrinker(&sbi->s_es_shrinker);
}
void ext4_es_lru_add(struct inode *inode)
@@ -940,11 +979,14 @@ void ext4_es_lru_add(struct inode *inode)
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ei->i_touch_when = jiffies;
+
+ if (!list_empty(&ei->i_es_lru))
+ return;
+
spin_lock(&sbi->s_es_lru_lock);
if (list_empty(&ei->i_es_lru))
list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
- else
- list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
spin_unlock(&sbi->s_es_lru_lock);
}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index d8e2d4dc311e..e936730cc5b0 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -39,6 +39,7 @@
EXTENT_STATUS_DELAYED | \
EXTENT_STATUS_HOLE)
+struct ext4_sb_info;
struct ext4_extent;
struct extent_status {
@@ -62,7 +63,8 @@ extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
unsigned long long status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
-extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+extern void ext4_es_find_delayed_extent_range(struct inode *inode,
+ ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status *es);
@@ -118,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es,
es->es_pblk = block;
}
-extern void ext4_es_register_shrinker(struct super_block *sb);
-extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_lru_add(struct inode *inode);
extern void ext4_es_lru_del(struct inode *inode);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 64848b595b24..6f4cc567c382 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,6 +23,7 @@
#include <linux/jbd2.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/aio.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include "ext4.h"
@@ -311,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
blkbits = inode->i_sb->s_blocksize_bits;
startoff = *offset;
lastoff = startoff;
- endoff = (map->m_lblk + map->m_len) << blkbits;
+ endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
index = startoff >> PAGE_CACHE_SHIFT;
end = endoff >> PAGE_CACHE_SHIFT;
@@ -456,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
if (last != start)
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
break;
}
@@ -464,10 +465,10 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
* If there is a delay extent at this offset,
* it will be as a data.
*/
- ext4_es_find_delayed_extent(inode, last, &es);
+ ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
if (last != start)
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
break;
}
@@ -485,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
}
last++;
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
} while (last <= end);
mutex_unlock(&inode->i_mutex);
@@ -493,17 +494,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
if (dataoff > isize)
return -ENXIO;
- if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
- return -EINVAL;
- if (dataoff > maxsize)
- return -EINVAL;
-
- if (dataoff != file->f_pos) {
- file->f_pos = dataoff;
- file->f_version = 0;
- }
-
- return dataoff;
+ return vfs_setpos(file, dataoff, maxsize);
}
/*
@@ -539,7 +530,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
last += ret;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
@@ -547,10 +538,10 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
* If there is a delay extent at this offset,
* we will skip this extent.
*/
- ext4_es_find_delayed_extent(inode, last, &es);
+ ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
last = es.es_lblk + es.es_len;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
@@ -565,7 +556,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
&map, &holeoff);
if (!unwritten) {
last += ret;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
}
@@ -579,17 +570,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
if (holeoff > isize)
holeoff = isize;
- if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
- return -EINVAL;
- if (holeoff > maxsize)
- return -EINVAL;
-
- if (holeoff != file->f_pos) {
- file->f_pos = holeoff;
- file->f_version = 0;
- }
-
- return holeoff;
+ return vfs_setpos(file, holeoff, maxsize);
}
/*
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 3278e64e57b6..a8bc47f75fa0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode)
return ret;
}
-/**
- * __sync_file - generic_file_fsync without the locking and filemap_write
- * @inode: inode to sync
- * @datasync: only sync essential metadata if true
- *
- * This is just generic_file_fsync without the locking. This is needed for
- * nojournal mode to make sure this inodes data/metadata makes it to disk
- * properly. The i_mutex should be held already.
- */
-static int __sync_inode(struct inode *inode, int datasync)
-{
- int err;
- int ret;
-
- ret = sync_mapping_buffers(inode->i_mapping);
- if (!(inode->i_state & I_DIRTY))
- return ret;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- return ret;
-
- err = sync_inode_metadata(inode, 1);
- if (ret == 0)
- ret = err;
- return ret;
-}
-
/*
* akpm: A new design for ext4_sync_file().
*
@@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int ret, err;
+ int ret = 0, err;
tid_t commit_tid;
bool needs_barrier = false;
@@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_ext4_sync_file_enter(file, datasync);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
- mutex_lock(&inode->i_mutex);
-
- if (inode->i_sb->s_flags & MS_RDONLY)
- goto out;
-
- ret = ext4_flush_unwritten_io(inode);
- if (ret < 0)
+ if (inode->i_sb->s_flags & MS_RDONLY) {
+ /* Make sure that we read updated s_mount_flags value */
+ smp_rmb();
+ if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ ret = -EROFS;
goto out;
+ }
if (!journal) {
- ret = __sync_inode(inode, datasync);
+ ret = generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
}
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
@@ -166,15 +139,13 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_trans_will_send_data_barrier(journal, commit_tid))
needs_barrier = true;
- jbd2_log_start_commit(journal, commit_tid);
- ret = jbd2_log_wait_commit(journal, commit_tid);
+ ret = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
ret = err;
}
- out:
- mutex_unlock(&inode->i_mutex);
+out:
trace_ext4_sync_file_exit(inode, ret);
return ret;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 6c5bb8d993fe..f03598c6ffd3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
trace_ext4_load_inode_bitmap(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
- submit_bh(READ, bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
put_bh(bh);
@@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ei = EXT4_I(inode);
sbi = EXT4_SB(sb);
+ /*
+ * Initialize owners and quota early so that we don't have to account
+ * for quota initialization worst case in standard inode creating
+ * transaction
+ */
+ if (owner) {
+ inode->i_mode = mode;
+ i_uid_write(inode, owner[0]);
+ i_gid_write(inode, owner[1]);
+ } else if (test_opt(sb, GRPID)) {
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = dir->i_gid;
+ } else
+ inode_init_owner(inode, dir, mode);
+ dquot_initialize(inode);
+
if (!goal)
goal = sbi->s_inode_goal;
@@ -697,7 +714,7 @@ got_group:
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp)
- goto fail;
+ goto out;
/*
* Check free inodes count before loading bitmap.
@@ -711,7 +728,7 @@ got_group:
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
if (!inode_bitmap_bh)
- goto fail;
+ goto out;
repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
@@ -730,16 +747,20 @@ repeat_in_this_group:
if (!handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
- handle_type, nblocks);
+ handle_type, nblocks,
+ 0);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
- goto fail;
+ ext4_std_error(sb, err);
+ goto out;
}
}
BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
- if (err)
- goto fail;
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
ext4_lock_group(sb, group);
ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
ext4_unlock_group(sb, group);
@@ -755,8 +776,10 @@ repeat_in_this_group:
got:
BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
- if (err)
- goto fail;
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
/* We may have to initialize the block bitmap if it isn't already */
if (ext4_has_group_desc_csum(sb) &&
@@ -768,7 +791,8 @@ got:
err = ext4_journal_get_write_access(handle, block_bitmap_bh);
if (err) {
brelse(block_bitmap_bh);
- goto fail;
+ ext4_std_error(sb, err);
+ goto out;
}
BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
@@ -787,14 +811,18 @@ got:
ext4_unlock_group(sb, group);
brelse(block_bitmap_bh);
- if (err)
- goto fail;
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
}
BUFFER_TRACE(group_desc_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, group_desc_bh);
- if (err)
- goto fail;
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
@@ -840,8 +868,10 @@ got:
BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
- if (err)
- goto fail;
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
@@ -851,16 +881,6 @@ got:
flex_group = ext4_flex_group(sbi, group);
atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
}
- if (owner) {
- inode->i_mode = mode;
- i_uid_write(inode, owner[0]);
- i_gid_write(inode, owner[1]);
- } else if (test_opt(sb, GRPID)) {
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = dir->i_gid;
- } else
- inode_init_owner(inode, dir, mode);
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
@@ -889,7 +909,9 @@ got:
* twice.
*/
err = -EIO;
- goto fail;
+ ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
+ inode->i_ino);
+ goto out;
}
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
@@ -899,7 +921,6 @@ got:
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
__u32 csum;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
@@ -918,7 +939,6 @@ got:
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
- dquot_initialize(inode);
err = dquot_alloc_inode(inode);
if (err)
goto fail_drop;
@@ -952,24 +972,17 @@ got:
ext4_debug("allocating inode %lu\n", inode->i_ino);
trace_ext4_allocate_inode(inode, dir, mode);
- goto really_out;
-fail:
- ext4_std_error(sb, err);
-out:
- iput(inode);
- ret = ERR_PTR(err);
-really_out:
brelse(inode_bitmap_bh);
return ret;
fail_free_drop:
dquot_free_inode(inode);
-
fail_drop:
- dquot_drop(inode);
- inode->i_flags |= S_NOQUOTA;
clear_nlink(inode);
unlock_new_inode(inode);
+out:
+ dquot_drop(inode);
+ inode->i_flags |= S_NOQUOTA;
iput(inode);
brelse(inode_bitmap_bh);
return ERR_PTR(err);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index a04183127ef0..87b30cd357e7 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -20,6 +20,7 @@
* (sct@redhat.com), 1993, 1998
*/
+#include <linux/aio.h>
#include "ext4_jbd2.h"
#include "truncate.h"
#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */
@@ -292,131 +293,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
}
/**
- * ext4_alloc_blocks: multiple allocate blocks needed for a branch
- * @handle: handle for this transaction
- * @inode: inode which needs allocated blocks
- * @iblock: the logical block to start allocated at
- * @goal: preferred physical block of allocation
- * @indirect_blks: the number of blocks need to allocate for indirect
- * blocks
- * @blks: number of desired blocks
- * @new_blocks: on return it will store the new block numbers for
- * the indirect blocks(if needed) and the first direct block,
- * @err: on return it will store the error code
- *
- * This function will return the number of blocks allocated as
- * requested by the passed-in parameters.
- */
-static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, ext4_fsblk_t goal,
- int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
-{
- struct ext4_allocation_request ar;
- int target, i;
- unsigned long count = 0, blk_allocated = 0;
- int index = 0;
- ext4_fsblk_t current_block = 0;
- int ret = 0;
-
- /*
- * Here we try to allocate the requested multiple blocks at once,
- * on a best-effort basis.
- * To build a branch, we should allocate blocks for
- * the indirect blocks(if not allocated yet), and at least
- * the first direct block of this branch. That's the
- * minimum number of blocks need to allocate(required)
- */
- /* first we try to allocate the indirect blocks */
- target = indirect_blks;
- while (target > 0) {
- count = target;
- /* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_meta_blocks(handle, inode, goal,
- 0, &count, err);
- if (*err)
- goto failed_out;
-
- if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
- EXT4_ERROR_INODE(inode,
- "current_block %llu + count %lu > %d!",
- current_block, count,
- EXT4_MAX_BLOCK_FILE_PHYS);
- *err = -EIO;
- goto failed_out;
- }
-
- target -= count;
- /* allocate blocks for indirect blocks */
- while (index < indirect_blks && count) {
- new_blocks[index++] = current_block++;
- count--;
- }
- if (count > 0) {
- /*
- * save the new block number
- * for the first direct block
- */
- new_blocks[index] = current_block;
- WARN(1, KERN_INFO "%s returned more blocks than "
- "requested\n", __func__);
- break;
- }
- }
-
- target = blks - count ;
- blk_allocated = count;
- if (!target)
- goto allocated;
- /* Now allocate data blocks */
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = target;
- ar.logical = iblock;
- if (S_ISREG(inode->i_mode))
- /* enable in-core preallocation only for regular files */
- ar.flags = EXT4_MB_HINT_DATA;
-
- current_block = ext4_mb_new_blocks(handle, &ar, err);
- if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
- EXT4_ERROR_INODE(inode,
- "current_block %llu + ar.len %d > %d!",
- current_block, ar.len,
- EXT4_MAX_BLOCK_FILE_PHYS);
- *err = -EIO;
- goto failed_out;
- }
-
- if (*err && (target == blks)) {
- /*
- * if the allocation failed and we didn't allocate
- * any blocks before
- */
- goto failed_out;
- }
- if (!*err) {
- if (target == blks) {
- /*
- * save the new block number
- * for the first direct block
- */
- new_blocks[index] = current_block;
- }
- blk_allocated += ar.len;
- }
-allocated:
- /* total number of blocks allocated for direct blocks */
- ret = blk_allocated;
- *err = 0;
- return ret;
-failed_out:
- for (i = 0; i < index; i++)
- ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
- return ret;
-}
-
-/**
* ext4_alloc_branch - allocate and set up a chain of blocks.
* @handle: handle for this transaction
* @inode: owner
@@ -448,60 +324,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
int *blks, ext4_fsblk_t goal,
ext4_lblk_t *offsets, Indirect *branch)
{
- int blocksize = inode->i_sb->s_blocksize;
- int i, n = 0;
- int err = 0;
- struct buffer_head *bh;
- int num;
- ext4_fsblk_t new_blocks[4];
- ext4_fsblk_t current_block;
+ struct ext4_allocation_request ar;
+ struct buffer_head * bh;
+ ext4_fsblk_t b, new_blocks[4];
+ __le32 *p;
+ int i, j, err, len = 1;
- num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
- *blks, new_blocks, &err);
- if (err)
- return err;
-
- branch[0].key = cpu_to_le32(new_blocks[0]);
/*
- * metadata blocks and data blocks are allocated.
+ * Set up for the direct block allocation
*/
- for (n = 1; n <= indirect_blks; n++) {
- /*
- * Get buffer_head for parent block, zero it out
- * and set the pointer to new one, then send
- * parent to disk.
- */
- bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+ memset(&ar, 0, sizeof(ar));
+ ar.inode = inode;
+ ar.len = *blks;
+ ar.logical = iblock;
+ if (S_ISREG(inode->i_mode))
+ ar.flags = EXT4_MB_HINT_DATA;
+
+ for (i = 0; i <= indirect_blks; i++) {
+ if (i == indirect_blks) {
+ ar.goal = goal;
+ new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err);
+ } else
+ goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode,
+ goal, 0, NULL, &err);
+ if (err) {
+ i--;
+ goto failed;
+ }
+ branch[i].key = cpu_to_le32(new_blocks[i]);
+ if (i == 0)
+ continue;
+
+ bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]);
if (unlikely(!bh)) {
err = -ENOMEM;
goto failed;
}
-
- branch[n].bh = bh;
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
err = ext4_journal_get_create_access(handle, bh);
if (err) {
- /* Don't brelse(bh) here; it's done in
- * ext4_journal_forget() below */
unlock_buffer(bh);
goto failed;
}
- memset(bh->b_data, 0, blocksize);
- branch[n].p = (__le32 *) bh->b_data + offsets[n];
- branch[n].key = cpu_to_le32(new_blocks[n]);
- *branch[n].p = branch[n].key;
- if (n == indirect_blks) {
- current_block = new_blocks[n];
- /*
- * End of chain, update the last new metablock of
- * the chain to point to the new allocated
- * data blocks numbers
- */
- for (i = 1; i < num; i++)
- *(branch[n].p + i) = cpu_to_le32(++current_block);
- }
+ memset(bh->b_data, 0, bh->b_size);
+ p = branch[i].p = (__le32 *) bh->b_data + offsets[i];
+ b = new_blocks[i];
+
+ if (i == indirect_blks)
+ len = ar.len;
+ for (j = 0; j < len; j++)
+ *p++ = cpu_to_le32(b++);
+
BUFFER_TRACE(bh, "marking uptodate");
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -511,25 +386,16 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
if (err)
goto failed;
}
- *blks = num;
- return err;
+ *blks = ar.len;
+ return 0;
failed:
- /* Allocation failed, free what we already allocated */
- ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
- for (i = 1; i <= n ; i++) {
- /*
- * branch[i].bh is newly allocated, so there is no
- * need to revoke the block, which is why we don't
- * need to set EXT4_FREE_BLOCKS_METADATA.
- */
- ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
- EXT4_FREE_BLOCKS_FORGET);
+ for (; i >= 0; i--) {
+ if (i != indirect_blks && branch[i].bh)
+ ext4_forget(handle, 1, inode, branch[i].bh,
+ branch[i].bh->b_blocknr);
+ ext4_free_blocks(handle, inode, NULL, new_blocks[i],
+ (i == indirect_blks) ? ar.len : 1, 0);
}
- for (i = n+1; i < indirect_blks; i++)
- ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
-
- ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
-
return err;
}
@@ -758,7 +624,7 @@ cleanup:
partial--;
}
out:
- trace_ext4_ind_map_blocks_exit(inode, map, err);
+ trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
return err;
}
@@ -809,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
retry:
if (rw == READ && ext4_should_dioread_nolock(inode)) {
- if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
- mutex_lock(&inode->i_mutex);
- ext4_flush_unwritten_io(inode);
- mutex_unlock(&inode->i_mutex);
- }
/*
* Nolock dioread optimization may be dynamically disabled
* via ext4_inode_block_unlocked_dio(). Check inode's state
@@ -913,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
}
-int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
{
- int indirects;
-
- /* if nrblocks are contiguous */
- if (chunk) {
- /*
- * With N contiguous data blocks, we need at most
- * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
- * 2 dindirect blocks, and 1 tindirect block
- */
- return DIV_ROUND_UP(nrblocks,
- EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
- }
/*
- * if nrblocks are not contiguous, worse case, each block touch
- * a indirect block, and each indirect block touch a double indirect
- * block, plus a triple indirect block
+ * With N contiguous data blocks, we need at most
+ * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+ * 2 dindirect blocks, and 1 tindirect block
*/
- indirects = nrblocks * 2 + 1;
- return indirects;
+ return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
}
/*
@@ -941,26 +793,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* be able to restart the transaction at a conventient checkpoint to make
* sure we don't overflow the journal.
*
- * start_transaction gets us a new handle for a truncate transaction,
- * and extend_transaction tries to extend the existing one a bit. If
+ * Try to extend this transaction for the purposes of truncation. If
* extend fails, we need to propagate the failure up and restart the
* transaction in the top-level truncate loop. --sct
- */
-static handle_t *start_transaction(struct inode *inode)
-{
- handle_t *result;
-
- result = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
- ext4_blocks_for_truncate(inode));
- if (!IS_ERR(result))
- return result;
-
- ext4_std_error(inode->i_sb, PTR_ERR(result));
- return result;
-}
-
-/*
- * Try to extend this transaction for the purposes of truncation.
*
* Returns 0 if we managed to create more room. If we can't create more
* room, and the transaction must be restarted we return 1.
@@ -1091,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
__le32 *last)
{
__le32 *p;
- int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+ int flags = EXT4_FREE_BLOCKS_VALIDATED;
int err;
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+ else if (ext4_should_journal_data(inode))
+ flags |= EXT4_FREE_BLOCKS_FORGET;
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
count)) {
@@ -1353,68 +1190,30 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
}
-void ext4_ind_truncate(struct inode *inode)
+void ext4_ind_truncate(handle_t *handle, struct inode *inode)
{
- handle_t *handle;
struct ext4_inode_info *ei = EXT4_I(inode);
__le32 *i_data = ei->i_data;
int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
- struct address_space *mapping = inode->i_mapping;
ext4_lblk_t offsets[4];
Indirect chain[4];
Indirect *partial;
__le32 nr = 0;
int n = 0;
ext4_lblk_t last_block, max_block;
- loff_t page_len;
unsigned blocksize = inode->i_sb->s_blocksize;
- int err;
-
- handle = start_transaction(inode);
- if (IS_ERR(handle))
- return; /* AKPM: return what? */
last_block = (inode->i_size + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
- if (inode->i_size % PAGE_CACHE_SIZE != 0) {
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
- err = ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0);
-
- if (err)
- goto out_stop;
- }
-
if (last_block != max_block) {
n = ext4_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
- goto out_stop; /* error */
+ return;
}
- /*
- * OK. This truncate is going to happen. We add the inode to the
- * orphan list, so that if this truncate spans multiple transactions,
- * and we crash, we will resume the truncate when the filesystem
- * recovers. It also marks the inode dirty, to catch the new size.
- *
- * Implication: the file must always be in a sane, consistent
- * truncatable state while each transaction commits.
- */
- if (ext4_orphan_add(handle, inode))
- goto out_stop;
-
- /*
- * From here we block out all ext4_get_block() callers who want to
- * modify the block allocation tree.
- */
- down_write(&ei->i_data_sem);
-
- ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
/*
@@ -1431,7 +1230,7 @@ void ext4_ind_truncate(struct inode *inode)
* It is unnecessary to free any data blocks if last_block is
* equal to the indirect block limit.
*/
- goto out_unlock;
+ return;
} else if (n == 1) { /* direct blocks */
ext4_free_data(handle, inode, NULL, i_data+offsets[0],
i_data + EXT4_NDIR_BLOCKS);
@@ -1491,31 +1290,6 @@ do_indirects:
case EXT4_TIND_BLOCK:
;
}
-
-out_unlock:
- up_write(&ei->i_data_sem);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
-
- /*
- * In a multi-transaction truncate, we only make the final transaction
- * synchronous
- */
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
-out_stop:
- /*
- * If this was a simple ftruncate(), and the file will remain alive
- * then we need to clear up the orphan record which we created above.
- * However, if this was a real unlink then we were called by
- * ext4_delete_inode(), and we allow that function to clean up the
- * orphan info for us.
- */
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
-
- ext4_journal_stop(handle);
- trace_ext4_truncate_exit(inode);
}
static int free_hole_blocks(handle_t *handle, struct inode *inode,
@@ -1569,8 +1343,8 @@ err:
return ret;
}
-static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t first, ext4_lblk_t stop)
+int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t first, ext4_lblk_t stop)
{
int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
int level, ret = 0;
@@ -1604,157 +1378,3 @@ err:
return ret;
}
-int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length)
-{
- struct inode *inode = file_inode(file);
- struct super_block *sb = inode->i_sb;
- ext4_lblk_t first_block, stop_block;
- struct address_space *mapping = inode->i_mapping;
- handle_t *handle = NULL;
- loff_t first_page, last_page, page_len;
- loff_t first_page_offset, last_page_offset;
- int err = 0;
-
- /*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- err = filemap_write_and_wait_range(mapping,
- offset, offset + length - 1);
- if (err)
- return err;
- }
-
- mutex_lock(&inode->i_mutex);
- /* It's not possible punch hole on append only file */
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
- err = -EPERM;
- goto out_mutex;
- }
- if (IS_SWAPFILE(inode)) {
- err = -ETXTBSY;
- goto out_mutex;
- }
-
- /* No need to punch hole beyond i_size */
- if (offset >= inode->i_size)
- goto out_mutex;
-
- /*
- * If the hole extents beyond i_size, set the hole
- * to end after the page that contains i_size
- */
- if (offset + length > inode->i_size) {
- length = inode->i_size +
- PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
- offset;
- }
-
- first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- last_page = (offset + length) >> PAGE_CACHE_SHIFT;
-
- first_page_offset = first_page << PAGE_CACHE_SHIFT;
- last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
- /* Now release the pages */
- if (last_page_offset > first_page_offset) {
- truncate_pagecache_range(inode, first_page_offset,
- last_page_offset - 1);
- }
-
- /* Wait all existing dio works, newcomers will block on i_mutex */
- inode_dio_wait(inode);
-
- handle = start_transaction(inode);
- if (IS_ERR(handle))
- goto out_mutex;
-
- /*
- * Now we need to zero out the non-page-aligned data in the
- * pages at the start and tail of the hole, and unmap the buffer
- * heads for the block aligned regions of the page that were
- * completely zerod.
- */
- if (first_page > last_page) {
- /*
- * If the file space being truncated is contained within a page
- * just zero out and unmap the middle of that page
- */
- err = ext4_discard_partial_page_buffers(handle,
- mapping, offset, length, 0);
- if (err)
- goto out;
- } else {
- /*
- * Zero out and unmap the paritial page that contains
- * the start of the hole
- */
- page_len = first_page_offset - offset;
- if (page_len > 0) {
- err = ext4_discard_partial_page_buffers(handle, mapping,
- offset, page_len, 0);
- if (err)
- goto out;
- }
-
- /*
- * Zero out and unmap the partial page that contains
- * the end of the hole
- */
- page_len = offset + length - last_page_offset;
- if (page_len > 0) {
- err = ext4_discard_partial_page_buffers(handle, mapping,
- last_page_offset, page_len, 0);
- if (err)
- goto out;
- }
- }
-
- /*
- * If i_size contained in the last page, we need to
- * unmap and zero the paritial page after i_size
- */
- if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
- inode->i_size % PAGE_CACHE_SIZE != 0) {
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
- if (page_len > 0) {
- err = ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0);
- if (err)
- goto out;
- }
- }
-
- first_block = (offset + sb->s_blocksize - 1) >>
- EXT4_BLOCK_SIZE_BITS(sb);
- stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
-
- if (first_block >= stop_block)
- goto out;
-
- down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
-
- err = ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
- err = ext4_free_hole_blocks(handle, inode, first_block, stop_block);
-
- ext4_discard_preallocations(inode);
-
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
-
- up_write(&EXT4_I(inode)->i_data_sem);
-
-out:
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_stop(handle);
-
-out_mutex:
- mutex_unlock(&inode->i_mutex);
-
- return err;
-}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index c0fd1a123f7d..d9ecbf1113a7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -19,7 +19,8 @@
#define EXT4_XATTR_SYSTEM_DATA "data"
#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
-#define EXT4_INLINE_DOTDOT_SIZE 4
+#define EXT4_INLINE_DOTDOT_OFFSET 2
+#define EXT4_INLINE_DOTDOT_SIZE 4
int ext4_get_inline_size(struct inode *inode)
{
@@ -71,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
entry = (struct ext4_xattr_entry *)
((void *)raw_inode + EXT4_I(inode)->i_inline_off);
- free += le32_to_cpu(entry->e_value_size);
+ free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
goto out;
}
@@ -1289,19 +1290,133 @@ out:
return ret;
}
-int ext4_read_inline_dir(struct file *filp,
- void *dirent, filldir_t filldir,
+/*
+ * Fill a red-black tree with the directory entries of an inlined dir.
+ * Returns the number of entries stored in the tree, or a negative error.
+ * If the dir is no longer inline, *has_inline_data is cleared, 0 returned.
+ */
+int htree_inlinedir_to_tree(struct file *dir_file,
+ struct inode *dir, ext4_lblk_t block,
+ struct dx_hash_info *hinfo,
+ __u32 start_hash, __u32 start_minor_hash,
+ int *has_inline_data)
+{
+ int err = 0, count = 0;
+ unsigned int parent_ino;
+ int pos;
+ struct ext4_dir_entry_2 *de;
+ struct inode *inode = file_inode(dir_file);
+ int ret, inline_size = 0;
+ struct ext4_iloc iloc;
+ void *dir_buf = NULL;
+ struct ext4_dir_entry_2 fake;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ inline_size = ext4_get_inline_size(inode);
+ dir_buf = kmalloc(inline_size, GFP_NOFS);
+ if (!dir_buf) {
+ ret = -ENOMEM;
+ up_read(&EXT4_I(inode)->xattr_sem);
+ goto out;
+ }
+
+ ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (ret < 0)
+ goto out;
+
+ pos = 0;
+ parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+ while (pos < inline_size) {
+ /*
+ * As inlined dir doesn't store any information about '.' and
+ * only the inode number of '..' is stored, we have to handle
+ * them differently.
+ */
+ if (pos == 0) {
+ fake.inode = cpu_to_le32(inode->i_ino);
+ fake.name_len = 1;
+ strcpy(fake.name, ".");
+ fake.rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(fake.name_len),
+ inline_size);
+ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+ de = &fake;
+ pos = EXT4_INLINE_DOTDOT_OFFSET;
+ } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
+ fake.inode = cpu_to_le32(parent_ino);
+ fake.name_len = 2;
+ strcpy(fake.name, "..");
+ fake.rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(fake.name_len),
+ inline_size);
+ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+ de = &fake;
+ pos = EXT4_INLINE_DOTDOT_SIZE;
+ } else {
+ de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
+ pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
+ if (ext4_check_dir_entry(inode, dir_file, de,
+ iloc.bh, dir_buf,
+ inline_size, pos)) {
+ ret = count; /* bad entry: report what was stored */
+ goto out;
+ }
+ }
+
+ ext4fs_dirhash(de->name, de->name_len, hinfo);
+ if ((hinfo->hash < start_hash) ||
+ ((hinfo->hash == start_hash) &&
+ (hinfo->minor_hash < start_minor_hash)))
+ continue;
+ if (de->inode == 0)
+ continue;
+ err = ext4_htree_store_dirent(dir_file,
+ hinfo->hash, hinfo->minor_hash, de);
+ if (err) {
+ ret = err; /* out: returns ret, so the error must land there */
+ goto out;
+ }
+ count++;
+ }
+ ret = count;
+out:
+ kfree(dir_buf);
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * This function is called when the volume was created with
+ * dir_index disabled. In order to keep f_pos persistent
+ * after we convert from an inlined dir to a block based one,
+ * we just pretend that we are a normal dir and return the
+ * offset as if '.' and '..' really take up space.
+ *
+ */
+int ext4_read_inline_dir(struct file *file,
+ struct dir_context *ctx,
int *has_inline_data)
{
- int error = 0;
unsigned int offset, parent_ino;
- int i, stored;
+ int i;
struct ext4_dir_entry_2 *de;
struct super_block *sb;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
int ret, inline_size = 0;
struct ext4_iloc iloc;
void *dir_buf = NULL;
+ int dotdot_offset, dotdot_size, extra_offset, extra_size;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -1328,96 +1443,89 @@ int ext4_read_inline_dir(struct file *filp,
goto out;
sb = inode->i_sb;
- stored = 0;
parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+ offset = ctx->pos;
- while (!error && !stored && filp->f_pos < inode->i_size) {
-revalidate:
- /*
- * If the version has changed since the last call to
- * readdir(2), then we might be pointing to an invalid
- * dirent right now. Scan from the start of the inline
- * dir to make sure.
- */
- if (filp->f_version != inode->i_version) {
- for (i = 0;
- i < inode->i_size && i < offset;) {
- if (!i) {
- /* skip "." and ".." if needed. */
- i += EXT4_INLINE_DOTDOT_SIZE;
- continue;
- }
- de = (struct ext4_dir_entry_2 *)
- (dir_buf + i);
- /* It's too expensive to do a full
- * dirent test each time round this
- * loop, but we do have to test at
- * least that it is non-zero. A
- * failure will be detected in the
- * dirent test below. */
- if (ext4_rec_len_from_disk(de->rec_len,
- inline_size) < EXT4_DIR_REC_LEN(1))
- break;
- i += ext4_rec_len_from_disk(de->rec_len,
- inline_size);
- }
- offset = i;
- filp->f_pos = offset;
- filp->f_version = inode->i_version;
- }
+ /*
+ * dotdot_offset and dotdot_size are the offset of ".." and the
+ * combined size of "." and ".." if the dir were block based, while
+ * in an inline dir they only occupy EXT4_INLINE_DOTDOT_SIZE bytes.
+ * So we use extra_offset and extra_size to translate between the
+ * two layouts during the inline dir iteration.
+ */
+ dotdot_offset = EXT4_DIR_REC_LEN(1);
+ dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
+ extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
+ extra_size = extra_offset + inline_size;
- while (!error && filp->f_pos < inode->i_size) {
- if (filp->f_pos == 0) {
- error = filldir(dirent, ".", 1, 0, inode->i_ino,
- DT_DIR);
- if (error)
- break;
- stored++;
-
- error = filldir(dirent, "..", 2, 0, parent_ino,
- DT_DIR);
- if (error)
- break;
- stored++;
-
- filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE;
+ /*
+ * If the version has changed since the last call to
+ * readdir(2), then we might be pointing to an invalid
+ * dirent right now. Scan from the start of the inline
+ * dir to make sure.
+ */
+ if (file->f_version != inode->i_version) {
+ for (i = 0; i < extra_size && i < offset;) {
+ /*
+ * "." is with offset 0 and
+ * ".." is dotdot_offset.
+ */
+ if (!i) {
+ i = dotdot_offset;
+ continue;
+ } else if (i == dotdot_offset) {
+ i = dotdot_size;
continue;
}
+ /* for other entry, the real offset in
+ * the buf has to be tuned accordingly.
+ */
+ de = (struct ext4_dir_entry_2 *)
+ (dir_buf + i - extra_offset);
+ /* It's too expensive to do a full
+ * dirent test each time round this
+ * loop, but we do have to test at
+ * least that it is non-zero. A
+ * failure will be detected in the
+ * dirent test below. */
+ if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+ < EXT4_DIR_REC_LEN(1))
+ break;
+ i += ext4_rec_len_from_disk(de->rec_len,
+ extra_size);
+ }
+ offset = i;
+ ctx->pos = offset;
+ file->f_version = inode->i_version;
+ }
- de = (struct ext4_dir_entry_2 *)(dir_buf + offset);
- if (ext4_check_dir_entry(inode, filp, de,
- iloc.bh, dir_buf,
- inline_size, offset)) {
- ret = stored;
+ while (ctx->pos < extra_size) {
+ if (ctx->pos == 0) {
+ if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
+ goto out;
+ ctx->pos = dotdot_offset;
+ continue;
+ }
+
+ if (ctx->pos == dotdot_offset) {
+ if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
+ goto out;
+ ctx->pos = dotdot_size;
+ continue;
+ }
+
+ de = (struct ext4_dir_entry_2 *)
+ (dir_buf + ctx->pos - extra_offset);
+ if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
+ extra_size, ctx->pos))
+ goto out;
+ if (le32_to_cpu(de->inode)) {
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type)))
goto out;
- }
- offset += ext4_rec_len_from_disk(de->rec_len,
- inline_size);
- if (le32_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- u64 version = filp->f_version;
-
- error = filldir(dirent, de->name,
- de->name_len,
- filp->f_pos,
- le32_to_cpu(de->inode),
- get_dtype(sb, de->file_type));
- if (error)
- break;
- if (version != filp->f_version)
- goto revalidate;
- stored++;
- }
- filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
- inline_size);
}
- offset = 0;
+ ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
}
out:
kfree(dir_buf);
@@ -1702,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode,
if (error)
goto out;
- physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+ physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
physical += offsetof(struct ext4_inode, i_block);
length = i_size_read(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b3a5213bc73e..0188e65e1f58 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
+#include <linux/aio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -55,21 +56,21 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
__u16 csum_hi = 0;
__u32 csum;
- csum_lo = raw->i_checksum_lo;
+ csum_lo = le16_to_cpu(raw->i_checksum_lo);
raw->i_checksum_lo = 0;
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
- csum_hi = raw->i_checksum_hi;
+ csum_hi = le16_to_cpu(raw->i_checksum_hi);
raw->i_checksum_hi = 0;
}
csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
EXT4_INODE_SIZE(inode->i_sb));
- raw->i_checksum_lo = csum_lo;
+ raw->i_checksum_lo = cpu_to_le16(csum_lo);
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
- raw->i_checksum_hi = csum_hi;
+ raw->i_checksum_hi = cpu_to_le16(csum_hi);
return csum;
}
@@ -131,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
- struct inode *inode, struct page *page, loff_t from,
- loff_t length, int flags);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
/*
* Test whether an inode is a fast symlink.
@@ -210,12 +211,12 @@ void ext4_evict_inode(struct inode *inode)
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
- jbd2_log_start_commit(journal, commit_tid);
- jbd2_log_wait_commit(journal, commit_tid);
+ jbd2_complete_transaction(journal, commit_tid);
filemap_write_and_wait(&inode->i_data);
}
truncate_inode_pages(&inode->i_data, 0);
- ext4_ioend_shutdown(inode);
+
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
goto no_delete;
}
@@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode)
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
- ext4_ioend_shutdown(inode);
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
if (is_bad_inode(inode))
goto no_delete;
@@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func,
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
- unsigned int max_pages)
-{
- struct address_space *mapping = inode->i_mapping;
- pgoff_t index;
- struct pagevec pvec;
- pgoff_t num = 0;
- int i, nr_pages, done = 0;
-
- if (max_pages == 0)
- return 0;
- pagevec_init(&pvec, 0);
- while (!done) {
- index = idx;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- (pgoff_t)PAGEVEC_SIZE);
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct buffer_head *bh, *head;
-
- lock_page(page);
- if (unlikely(page->mapping != mapping) ||
- !PageDirty(page) ||
- PageWriteback(page) ||
- page->index != idx) {
- done = 1;
- unlock_page(page);
- break;
- }
- if (page_has_buffers(page)) {
- bh = head = page_buffers(page);
- do {
- if (!buffer_delay(bh) &&
- !buffer_unwritten(bh))
- done = 1;
- bh = bh->b_this_page;
- } while (!done && (bh != head));
- }
- unlock_page(page);
- if (done)
- break;
- idx++;
- num++;
- if (num >= max_pages) {
- done = 1;
- break;
- }
- }
- pagevec_release(&pvec);
- }
- return num;
-}
-
#ifdef ES_AGGRESSIVE_TEST
static void ext4_map_blocks_es_recheck(handle_t *handle,
struct inode *inode,
@@ -573,6 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
"logical block %lu\n", inode->i_ino, flags, map->m_len,
(unsigned long) map->m_lblk);
+ ext4_es_lru_add(inode);
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
@@ -1081,31 +1024,56 @@ retry_journal:
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
+ int ret;
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
set_buffer_uptodate(bh);
- return ext4_handle_dirty_metadata(handle, NULL, bh);
+ ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+ clear_buffer_meta(bh);
+ clear_buffer_prio(bh);
+ return ret;
}
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+/*
+ * We need to pick up the new inode size which generic_commit_write gave us
+ * `file' can be NULL - eg, when called from page_symlink().
+ *
+ * ext4 never places buffers on inode->i_mapping->private_list. metadata
+ * buffers are managed internally.
+ */
+static int ext4_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
- int i_size_changed = 0;
- struct inode *inode = mapping->host;
handle_t *handle = ext4_journal_current_handle();
+ struct inode *inode = mapping->host;
+ int ret = 0, ret2;
+ int i_size_changed = 0;
- if (ext4_has_inline_data(inode))
- copied = ext4_write_inline_data_end(inode, pos, len,
- copied, page);
- else
+ trace_ext4_write_end(inode, pos, len, copied);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto errout;
+ }
+ }
+
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ if (ret < 0)
+ goto errout;
+ copied = ret;
+ } else
copied = block_write_end(file, mapping, pos,
len, copied, page, fsdata);
/*
* No need to use i_size_read() here, the i_size
- * cannot change under us because we hold i_mutex.
+ * cannot change under us because we hole i_mutex.
*
* But it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
@@ -1115,10 +1083,10 @@ static int ext4_generic_write_end(struct file *file,
i_size_changed = 1;
}
- if (pos + copied > EXT4_I(inode)->i_disksize) {
+ if (pos + copied > EXT4_I(inode)->i_disksize) {
/* We need to mark inode dirty even if
* new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
+ * but greater than i_disksize. (hint delalloc)
*/
ext4_update_i_disksize(inode, (pos + copied));
i_size_changed = 1;
@@ -1135,87 +1103,13 @@ static int ext4_generic_write_end(struct file *file,
if (i_size_changed)
ext4_mark_inode_dirty(handle, inode);
- return copied;
-}
-
-/*
- * We need to pick up the new inode size which generic_commit_write gave us
- * `file' can be NULL - eg, when called from page_symlink().
- *
- * ext4 never places buffers on inode->i_mapping->private_list. metadata
- * buffers are managed internally.
- */
-static int ext4_ordered_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = mapping->host;
- int ret = 0, ret2;
-
- trace_ext4_ordered_write_end(inode, pos, len, copied);
- ret = ext4_jbd2_file_inode(handle, inode);
-
- if (ret == 0) {
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- copied = ret2;
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
- /* if we have allocated more blocks and copied
- * less. We will have blocks allocated outside
- * inode->i_size. So truncate them
- */
- ext4_orphan_add(handle, inode);
- if (ret2 < 0)
- ret = ret2;
- } else {
- unlock_page(page);
- page_cache_release(page);
- }
-
- ret2 = ext4_journal_stop(handle);
- if (!ret)
- ret = ret2;
-
- if (pos + len > inode->i_size) {
- ext4_truncate_failed_write(inode);
- /*
- * If truncate failed early the inode might still be
- * on the orphan list; we need to make sure the inode
- * is removed from the orphan list in that case.
- */
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
- }
-
-
- return ret ? ret : copied;
-}
-
-static int ext4_writeback_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = mapping->host;
- int ret = 0, ret2;
-
- trace_ext4_writeback_write_end(inode, pos, len, copied);
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- copied = ret2;
if (pos + len > inode->i_size && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
*/
ext4_orphan_add(handle, inode);
-
- if (ret2 < 0)
- ret = ret2;
-
+errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1465,21 +1359,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
}
static void ext4_da_page_release_reservation(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
int to_release = 0;
struct buffer_head *head, *bh;
unsigned int curr_off = 0;
struct inode *inode = page->mapping->host;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int stop = offset + length;
int num_clusters;
ext4_fsblk_t lblk;
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
head = page_buffers(page);
bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
+ if (next_off > stop)
+ break;
+
if ((offset <= curr_off) && (buffer_delay(bh))) {
to_release++;
clear_buffer_delay(bh);
@@ -1510,140 +1411,43 @@ static void ext4_da_page_release_reservation(struct page *page,
* Delayed allocation stuff
*/
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
- struct ext4_map_blocks *map)
-{
- struct pagevec pvec;
- unsigned long index, end;
- int ret = 0, err, nr_pages, i;
- struct inode *inode = mpd->inode;
- struct address_space *mapping = inode->i_mapping;
- loff_t size = i_size_read(inode);
- unsigned int len, block_start;
- struct buffer_head *bh, *page_bufs = NULL;
- sector_t pblock = 0, cur_logical = 0;
- struct ext4_io_submit io_submit;
+struct mpage_da_data {
+ struct inode *inode;
+ struct writeback_control *wbc;
- BUG_ON(mpd->next_page <= mpd->first_page);
- memset(&io_submit, 0, sizeof(io_submit));
+ pgoff_t first_page; /* The first page to write */
+ pgoff_t next_page; /* Current page to examine */
+ pgoff_t last_page; /* Last page to examine */
/*
- * We need to start from the first_page to the next_page - 1
- * to make sure we also write the mapped dirty buffer_heads.
- * If we look at mpd->b_blocknr we would only be looking
- * at the currently mapped buffer_heads.
+ * Extent to map - this can be after first_page because that can be
+ * fully mapped. We somewhat abuse m_flags to store whether the extent
+ * is delalloc or unwritten.
*/
- index = mpd->first_page;
- end = mpd->next_page - 1;
-
- pagevec_init(&pvec, 0);
- while (index <= end) {
- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- int skip_page = 0;
- struct page *page = pvec.pages[i];
-
- index = page->index;
- if (index > end)
- break;
-
- if (index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
- else
- len = PAGE_CACHE_SIZE;
- if (map) {
- cur_logical = index << (PAGE_CACHE_SHIFT -
- inode->i_blkbits);
- pblock = map->m_pblk + (cur_logical -
- map->m_lblk);
- }
- index++;
-
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
-
- bh = page_bufs = page_buffers(page);
- block_start = 0;
- do {
- if (map && (cur_logical >= map->m_lblk) &&
- (cur_logical <= (map->m_lblk +
- (map->m_len - 1)))) {
- if (buffer_delay(bh)) {
- clear_buffer_delay(bh);
- bh->b_blocknr = pblock;
- }
- if (buffer_unwritten(bh) ||
- buffer_mapped(bh))
- BUG_ON(bh->b_blocknr != pblock);
- if (map->m_flags & EXT4_MAP_UNINIT)
- set_buffer_uninit(bh);
- clear_buffer_unwritten(bh);
- }
-
- /*
- * skip page if block allocation undone and
- * block is dirty
- */
- if (ext4_bh_delay_or_unwritten(NULL, bh))
- skip_page = 1;
- bh = bh->b_this_page;
- block_start += bh->b_size;
- cur_logical++;
- pblock++;
- } while (bh != page_bufs);
-
- if (skip_page) {
- unlock_page(page);
- continue;
- }
-
- clear_page_dirty_for_io(page);
- err = ext4_bio_write_page(&io_submit, page, len,
- mpd->wbc);
- if (!err)
- mpd->pages_written++;
- /*
- * In error case, we have to continue because
- * remaining pages are still locked
- */
- if (ret == 0)
- ret = err;
- }
- pagevec_release(&pvec);
- }
- ext4_io_submit(&io_submit);
- return ret;
-}
+ struct ext4_map_blocks map;
+ struct ext4_io_submit io_submit; /* IO submission data */
+};
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+ bool invalidate)
{
int nr_pages, i;
pgoff_t index, end;
struct pagevec pvec;
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
- ext4_lblk_t start, last;
+
+ /* This is necessary when next_page == 0. */
+ if (mpd->first_page >= mpd->next_page)
+ return;
index = mpd->first_page;
end = mpd->next_page - 1;
-
- start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- ext4_es_remove_extent(inode, start, last - start + 1);
+ if (invalidate) {
+ ext4_lblk_t start, last;
+ start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, start, last - start + 1);
+ }
pagevec_init(&pvec, 0);
while (index <= end) {
@@ -1656,236 +1460,40 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
break;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- block_invalidatepage(page, 0);
- ClearPageUptodate(page);
+ if (invalidate) {
+ block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ ClearPageUptodate(page);
+ }
unlock_page(page);
}
index = pvec.pages[nr_pages - 1]->index + 1;
pagevec_release(&pvec);
}
- return;
}
static void ext4_print_free_blocks(struct inode *inode)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct super_block *sb = inode->i_sb;
+ struct ext4_inode_info *ei = EXT4_I(inode);
ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
EXT4_C2B(EXT4_SB(inode->i_sb),
- ext4_count_free_clusters(inode->i_sb)));
+ ext4_count_free_clusters(sb)));
ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
- (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
+ (long long) EXT4_C2B(EXT4_SB(sb),
percpu_counter_sum(&sbi->s_freeclusters_counter)));
ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
- (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
+ (long long) EXT4_C2B(EXT4_SB(sb),
percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
ext4_msg(sb, KERN_CRIT, "Block reservation details");
ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
- EXT4_I(inode)->i_reserved_data_blocks);
+ ei->i_reserved_data_blocks);
ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
- EXT4_I(inode)->i_reserved_meta_blocks);
- return;
-}
-
-/*
- * mpage_da_map_and_submit - go through given space, map them
- * if necessary, and then submit them for I/O
- *
- * @mpd - bh describing space
- *
- * The function skips space we know is already mapped to disk blocks.
- *
- */
-static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
-{
- int err, blks, get_blocks_flags;
- struct ext4_map_blocks map, *mapp = NULL;
- sector_t next = mpd->b_blocknr;
- unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
- loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
- handle_t *handle = NULL;
-
- /*
- * If the blocks are mapped already, or we couldn't accumulate
- * any blocks, then proceed immediately to the submission stage.
- */
- if ((mpd->b_size == 0) ||
- ((mpd->b_state & (1 << BH_Mapped)) &&
- !(mpd->b_state & (1 << BH_Delay)) &&
- !(mpd->b_state & (1 << BH_Unwritten))))
- goto submit_io;
-
- handle = ext4_journal_current_handle();
- BUG_ON(!handle);
-
- /*
- * Call ext4_map_blocks() to allocate any delayed allocation
- * blocks, or to convert an uninitialized extent to be
- * initialized (in the case where we have written into
- * one or more preallocated blocks).
- *
- * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
- * indicate that we are on the delayed allocation path. This
- * affects functions in many different parts of the allocation
- * call path. This flag exists primarily because we don't
- * want to change *many* call functions, so ext4_map_blocks()
- * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
- * inode's allocation semaphore is taken.
- *
- * If the blocks in questions were delalloc blocks, set
- * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
- * variables are updated after the blocks have been allocated.
- */
- map.m_lblk = next;
- map.m_len = max_blocks;
- get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
- if (ext4_should_dioread_nolock(mpd->inode))
- get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (mpd->b_state & (1 << BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
-
- blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
- if (blks < 0) {
- struct super_block *sb = mpd->inode->i_sb;
-
- err = blks;
- /*
- * If get block returns EAGAIN or ENOSPC and there
- * appears to be free blocks we will just let
- * mpage_da_submit_io() unlock all of the pages.
- */
- if (err == -EAGAIN)
- goto submit_io;
-
- if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
- mpd->retval = err;
- goto submit_io;
- }
-
- /*
- * get block failure will cause us to loop in
- * writepages, because a_ops->writepage won't be able
- * to make progress. The page will be redirtied by
- * writepage and writepages will again try to write
- * the same.
- */
- if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
- ext4_msg(sb, KERN_CRIT,
- "delayed block allocation failed for inode %lu "
- "at logical offset %llu with max blocks %zd "
- "with error %d", mpd->inode->i_ino,
- (unsigned long long) next,
- mpd->b_size >> mpd->inode->i_blkbits, err);
- ext4_msg(sb, KERN_CRIT,
- "This should not happen!! Data will be lost");
- if (err == -ENOSPC)
- ext4_print_free_blocks(mpd->inode);
- }
- /* invalidate all the pages */
- ext4_da_block_invalidatepages(mpd);
-
- /* Mark this page range as having been completed */
- mpd->io_done = 1;
- return;
- }
- BUG_ON(blks == 0);
-
- mapp = &map;
- if (map.m_flags & EXT4_MAP_NEW) {
- struct block_device *bdev = mpd->inode->i_sb->s_bdev;
- int i;
-
- for (i = 0; i < map.m_len; i++)
- unmap_underlying_metadata(bdev, map.m_pblk + i);
- }
-
- /*
- * Update on-disk size along with block allocation.
- */
- disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
- if (disksize > i_size_read(mpd->inode))
- disksize = i_size_read(mpd->inode);
- if (disksize > EXT4_I(mpd->inode)->i_disksize) {
- ext4_update_i_disksize(mpd->inode, disksize);
- err = ext4_mark_inode_dirty(handle, mpd->inode);
- if (err)
- ext4_error(mpd->inode->i_sb,
- "Failed to mark inode %lu dirty",
- mpd->inode->i_ino);
- }
-
-submit_io:
- mpage_da_submit_io(mpd, mapp);
- mpd->io_done = 1;
-}
-
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
- (1 << BH_Delay) | (1 << BH_Unwritten))
-
-/*
- * mpage_add_bh_to_extent - try to add one more block to extent of blocks
- *
- * @mpd->lbh - extent of blocks
- * @logical - logical number of the block in the file
- * @b_state - b_state of the buffer head added
- *
- * the function is used to collect contig. blocks in same state
- */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
- unsigned long b_state)
-{
- sector_t next;
- int blkbits = mpd->inode->i_blkbits;
- int nrblocks = mpd->b_size >> blkbits;
-
- /*
- * XXX Don't go larger than mballoc is willing to allocate
- * This is a stopgap solution. We eventually need to fold
- * mpage_da_submit_io() into this function and then call
- * ext4_map_blocks() multiple times in a loop
- */
- if (nrblocks >= (8*1024*1024 >> blkbits))
- goto flush_it;
-
- /* check if the reserved journal credits might overflow */
- if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
- if (nrblocks >= EXT4_MAX_TRANS_DATA) {
- /*
- * With non-extent format we are limited by the journal
- * credit available. Total credit needed to insert
- * nrblocks contiguous blocks is dependent on the
- * nrblocks. So limit nrblocks.
- */
- goto flush_it;
- }
- }
- /*
- * First block in the extent
- */
- if (mpd->b_size == 0) {
- mpd->b_blocknr = logical;
- mpd->b_size = 1 << blkbits;
- mpd->b_state = b_state & BH_FLAGS;
- return;
- }
-
- next = mpd->b_blocknr + nrblocks;
- /*
- * Can we merge the block to our big extent?
- */
- if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
- mpd->b_size += 1 << blkbits;
- return;
- }
-
-flush_it:
- /*
- * We couldn't merge the block to our extent, so we
- * need to flush current extent and start new one
- */
- mpage_da_map_and_submit(mpd);
+ ei->i_reserved_meta_blocks);
+ ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u",
+ ei->i_allocated_meta_blocks);
return;
}
@@ -1921,6 +1529,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
"logical block %lu\n", inode->i_ino, map->m_len,
(unsigned long) map->m_lblk);
+ ext4_es_lru_add(inode);
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, iblock, &es)) {
@@ -2194,7 +1804,7 @@ out:
* lock so we have to do some magic.
*
* This function can get called via...
- * - ext4_da_writepages after taking page lock (have journal handle)
+ * - ext4_writepages after taking page lock (have journal handle)
* - journal_submit_inode_data_buffers (no journal handle)
* - shrink_page_list via the kswapd/direct reclaim (no journal handle)
* - grab_page_cache when doing write_begin (have journal handle)
@@ -2272,76 +1882,405 @@ static int ext4_writepage(struct page *page,
*/
return __ext4_journalled_writepage(page, len);
- memset(&io_submit, 0, sizeof(io_submit));
+ ext4_io_submit_init(&io_submit, wbc);
+ io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_submit.io_end) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return -ENOMEM;
+ }
ret = ext4_bio_write_page(&io_submit, page, len, wbc);
ext4_io_submit(&io_submit);
+ /* Drop io_end reference we got from init */
+ ext4_put_io_end_defer(io_submit.io_end);
return ret;
}
+#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+
/*
- * This is called via ext4_da_writepages() to
- * calculate the total number of credits to reserve to fit
- * a single extent allocation into a single transaction,
- * ext4_da_writpeages() will loop calling this before
- * the block allocation.
+ * mballoc gives us at most this number of blocks...
+ * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
+ * The rest of mballoc seems to handle chunks up to full group size.
*/
+#define MAX_WRITEPAGES_EXTENT_LEN 2048
-static int ext4_da_writepages_trans_blocks(struct inode *inode)
+/*
+ * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
+ *
+ * @mpd - extent of blocks
+ * @lblk - logical number of the block in the file
+ * @b_state - b_state of the buffer head added
+ *
+ * the function is used to collect contig. blocks in same state
+ */
+static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+ unsigned long b_state)
+{
+ struct ext4_map_blocks *map = &mpd->map;
+
+ /* Don't go larger than mballoc is willing to allocate */
+ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+ return 0;
+
+ /* First block in the extent? */
+ if (map->m_len == 0) {
+ map->m_lblk = lblk;
+ map->m_len = 1;
+ map->m_flags = b_state & BH_FLAGS;
+ return 1;
+ }
+
+ /* Can we merge the block to our big extent? */
+ if (lblk == map->m_lblk + map->m_len &&
+ (b_state & BH_FLAGS) == map->m_flags) {
+ map->m_len++;
+ return 1;
+ }
+ return 0;
+}
+
+static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
+ struct buffer_head *head,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
+{
+ struct inode *inode = mpd->inode;
+ ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+ >> inode->i_blkbits;
+
+ do {
+ BUG_ON(buffer_locked(bh));
+
+ if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+ (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
+ lblk >= blocks) {
+ /* Found extent to map? */
+ if (mpd->map.m_len)
+ return false;
+ if (lblk >= blocks)
+ return true;
+ continue;
+ }
+ if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
+ return false;
+ } while (lblk++, (bh = bh->b_this_page) != head);
+ return true;
+}
+
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+ int len;
+ loff_t size = i_size_read(mpd->inode);
+ int err;
+
+ BUG_ON(page->index != mpd->first_page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ clear_page_dirty_for_io(page);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+ if (!err)
+ mpd->wbc->nr_to_write--;
+ mpd->first_page++;
+
+ return err;
+}
+
+/*
+ * mpage_map_and_submit_buffers - update buffers corresponding to changed
+ * extent and submit fully mapped pages for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ *
+ * Scan buffers corresponding to changed extent (we expect corresponding pages
+ * to be already locked) and update buffer state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits,
+ * and mark buffers as uninit when we perform writes to uninitialized extents
+ * and do extent conversion after IO is finished. If the last page is not fully
+ * mapped, we update mpd->map to the next extent in the last page that needs
+ * mapping. Otherwise we submit the page for IO.
+ */
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+{
+ struct pagevec pvec;
+ int nr_pages, i;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *head, *bh;
+ int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+ ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+ >> inode->i_blkbits;
+ pgoff_t start, end;
+ ext4_lblk_t lblk;
+ sector_t pblock;
+ int err;
+
+ start = mpd->map.m_lblk >> bpp_bits;
+ end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+ lblk = start << bpp_bits;
+ pblock = mpd->map.m_pblk;
+
+ pagevec_init(&pvec, 0);
+ while (start <= end) {
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+ PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page->index > end)
+ break;
+ /* Upto 'end' pages must be contiguous */
+ BUG_ON(page->index != start);
+ bh = head = page_buffers(page);
+ do {
+ if (lblk < mpd->map.m_lblk)
+ continue;
+ if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+ /*
+ * Buffer after end of mapped extent.
+ * Find next buffer in the page to map.
+ */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ add_page_bufs_to_extent(mpd, head, bh,
+ lblk);
+ pagevec_release(&pvec);
+ return 0;
+ }
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock++;
+ }
+ clear_buffer_unwritten(bh);
+ } while (++lblk < blocks &&
+ (bh = bh->b_this_page) != head);
+
+ /*
+ * FIXME: This is going to break if dioread_nolock
+ * supports blocksize < pagesize as we will try to
+ * convert potentially unmapped parts of inode.
+ */
+ mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+ /* Page fully mapped - let IO run! */
+ err = mpage_submit_page(mpd, page);
+ if (err < 0) {
+ pagevec_release(&pvec);
+ return err;
+ }
+ start++;
+ }
+ pagevec_release(&pvec);
+ }
+ /* Extent fully mapped and matches with page boundary. We are done. */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ return 0;
+}
+
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
{
- int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int get_blocks_flags;
+ int err;
+ trace_ext4_da_write_pages_extent(inode, map);
/*
- * With non-extent format the journal credit needed to
- * insert nrblocks contiguous block is dependent on
- * number of contiguous block. So we will limit
- * number of contiguous block to a sane value
+ * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
+ * to convert an uninitialized extent to be initialized (in the case
+ * where we have written into one or more preallocated blocks). It is
+ * possible that we're going to need more metadata blocks than
+ * previously reserved. However we must not fail because we're in
+ * writeback and there is nothing we can do about it so it might result
+ * in data loss. So use reserved blocks to allocate metadata if
+ * possible.
+ *
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
+ * in question are delalloc blocks. This affects functions in many
+ * different parts of the allocation call path. This flag exists
+ * primarily because we don't want to change *many* call functions, so
+ * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
+ * once the inode's allocation semaphore is taken.
*/
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
- (max_blocks > EXT4_MAX_TRANS_DATA))
- max_blocks = EXT4_MAX_TRANS_DATA;
+ get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL;
+ if (ext4_should_dioread_nolock(inode))
+ get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+ if (map->m_flags & (1 << BH_Delay))
+ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+ err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
+ if (err < 0)
+ return err;
+ if (map->m_flags & EXT4_MAP_UNINIT) {
+ if (!mpd->io_submit.io_end->handle &&
+ ext4_handle_valid(handle)) {
+ mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+ handle->h_rsv_handle = NULL;
+ }
+ ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ }
- return ext4_chunk_trans_blocks(inode, max_blocks);
+ BUG_ON(map->m_len == 0);
+ if (map->m_flags & EXT4_MAP_NEW) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int i;
+
+ for (i = 0; i < map->m_len; i++)
+ unmap_underlying_metadata(bdev, map->m_pblk + i);
+ }
+ return 0;
}
/*
- * write_cache_pages_da - walk the list of dirty pages of the given
- * address space and accumulate pages that need writing, and call
- * mpage_da_map_and_submit to map a single contiguous memory region
- * and then write them.
+ * mpage_map_and_submit_extent - map extent starting at mpd->map.m_lblk of
+ * length mpd->map.m_len and submit pages underlying it for IO
+ *
+ * @handle - handle for journal operations
+ * @mpd - extent to map
+ *
+ * The function maps the extent starting at mpd->map.m_lblk of length
+ * mpd->map.m_len. If it is delayed, blocks are allocated; if it is unwritten,
+ * we may need to convert them to initialized or split the described range from
+ * a larger unwritten extent. Note that we need not map all the described range
+ * since allocation can return fewer blocks or the range is covered by more
+ * unwritten extents. We cannot map more because we are limited by reserved
+ * transaction credits. On the other hand we always make sure that the last
+ * touched page is fully mapped so that it can be written out (and thus forward
+ * progress is guaranteed). After mapping we submit all mapped pages for IO.
*/
-static int write_cache_pages_da(handle_t *handle,
- struct address_space *mapping,
- struct writeback_control *wbc,
- struct mpage_da_data *mpd,
- pgoff_t *done_index)
+static int mpage_map_and_submit_extent(handle_t *handle,
+ struct mpage_da_data *mpd,
+ bool *give_up_on_write)
{
- struct buffer_head *bh, *head;
- struct inode *inode = mapping->host;
- struct pagevec pvec;
- unsigned int nr_pages;
- sector_t logical;
- pgoff_t index, end;
- long nr_to_write = wbc->nr_to_write;
- int i, tag, ret = 0;
-
- memset(mpd, 0, sizeof(struct mpage_da_data));
- mpd->wbc = wbc;
- mpd->inode = inode;
- pagevec_init(&pvec, 0);
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int err;
+ loff_t disksize;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ mpd->io_submit.io_end->offset =
+ ((loff_t)map->m_lblk) << inode->i_blkbits;
+ while (map->m_len) {
+ err = mpage_map_one_extent(handle, mpd);
+ if (err < 0) {
+ struct super_block *sb = inode->i_sb;
+
+ if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ goto invalidate_dirty_pages;
+ /*
+ * Let the upper layers retry transient errors.
+ * In the case of ENOSPC, if ext4_count_free_clusters()
+ * is non-zero, a commit should free up blocks.
+ */
+ if ((err == -ENOMEM) ||
+ (err == -ENOSPC && ext4_count_free_clusters(sb)))
+ return err;
+ ext4_msg(sb, KERN_CRIT,
+ "Delayed block allocation failed for "
+ "inode %lu at logical offset %llu with"
+ " max blocks %u with error %d",
+ inode->i_ino,
+ (unsigned long long)map->m_lblk,
+ (unsigned)map->m_len, -err);
+ ext4_msg(sb, KERN_CRIT,
+ "This should not happen!! Data will "
+ "be lost\n");
+ if (err == -ENOSPC)
+ ext4_print_free_blocks(inode);
+ invalidate_dirty_pages:
+ *give_up_on_write = true;
+ return err;
+ }
+ /*
+ * Update buffer state, submit mapped pages, and get us new
+ * extent to map
+ */
+ err = mpage_map_and_submit_buffers(mpd);
+ if (err < 0)
+ return err;
+ }
+
+ /* Update on-disk size after IO is submitted */
+ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ int err2;
+
+ ext4_update_i_disksize(inode, disksize);
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ if (err2)
+ ext4_error(inode->i_sb,
+ "Failed to mark inode %lu dirty",
+ inode->i_ino);
+ if (!err)
+ err = err2;
+ }
+ return err;
+}
+
+/*
+ * Calculate the total number of credits to reserve for one writepages
+ * iteration. This is called from ext4_writepages(). We map an extent of
+ * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
+ * bpp - 1 blocks in bpp different extents.
+ */
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+ int bpp = ext4_journal_blocks_per_page(inode);
+
+ return ext4_meta_trans_blocks(inode,
+ MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
+}
+
+/*
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ * and underlying extent to map
+ *
+ * @mpd - where to look for pages
+ *
+ * Walk dirty pages in the mapping. If they are fully mapped, submit them for
+ * IO immediately. When we find a page which isn't mapped we start accumulating
+ * extent of buffers underlying these pages that needs mapping (formed by
+ * either delayed or unwritten buffers). We also lock the pages containing
+ * these buffers. The extent found is returned in @mpd structure (starting at
+ * mpd->map.m_lblk with length mpd->map.m_len blocks).
+ *
+ * Note that this function can attach bios to one io_end structure which are
+ * neither logically nor physically contiguous. Although it may seem as an
+ * unnecessary complication, it is actually inevitable in blocksize < pagesize
+ * case as we need to track IO to all buffers underlying a page in one io_end.
+ */
+static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
+{
+ struct address_space *mapping = mpd->inode->i_mapping;
+ struct pagevec pvec;
+ unsigned int nr_pages;
+ pgoff_t index = mpd->first_page;
+ pgoff_t end = mpd->last_page;
+ int tag;
+ int i, err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ext4_lblk_t lblk;
+ struct buffer_head *head;
+
+ if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
- *done_index = index;
+ pagevec_init(&pvec, 0);
+ mpd->map.m_len = 0;
+ mpd->next_page = index;
while (index <= end) {
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
- return 0;
+ goto out;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -2356,31 +2295,21 @@ static int write_cache_pages_da(handle_t *handle,
if (page->index > end)
goto out;
- *done_index = page->index + 1;
-
- /*
- * If we can't merge this page, and we have
- * accumulated an contiguous region, write it
- */
- if ((mpd->next_page != page->index) &&
- (mpd->next_page != mpd->first_page)) {
- mpage_da_map_and_submit(mpd);
- goto ret_extent_tail;
- }
+ /* If we can't merge this page, we are done. */
+ if (mpd->map.m_len > 0 && mpd->next_page != page->index)
+ goto out;
lock_page(page);
-
/*
- * If the page is no longer dirty, or its
- * mapping no longer corresponds to inode we
- * are writing (which means it has been
- * truncated or invalidated), or the page is
- * already under writeback and we are not
- * doing a data integrity writeback, skip the page
+ * If the page is no longer dirty, or its mapping no
+ * longer corresponds to inode we are writing (which
+ * means it has been truncated or invalidated), or the
+ * page is already under writeback and we are not doing
+ * a data integrity writeback, skip the page
*/
if (!PageDirty(page) ||
(PageWriteback(page) &&
- (wbc->sync_mode == WB_SYNC_NONE)) ||
+ (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
unlikely(page->mapping != mapping)) {
unlock_page(page);
continue;
@@ -2389,106 +2318,70 @@ static int write_cache_pages_da(handle_t *handle,
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
- /*
- * If we have inline data and arrive here, it means that
- * we will soon create the block for the 1st page, so
- * we'd better clear the inline data here.
- */
- if (ext4_has_inline_data(inode)) {
- BUG_ON(ext4_test_inode_state(inode,
- EXT4_STATE_MAY_INLINE_DATA));
- ext4_destroy_inline_data(handle, inode);
- }
-
- if (mpd->next_page != page->index)
+ if (mpd->map.m_len == 0)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
- logical = (sector_t) page->index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
/* Add all dirty buffers to mpd */
+ lblk = ((ext4_lblk_t)page->index) <<
+ (PAGE_CACHE_SHIFT - blkbits);
head = page_buffers(page);
- bh = head;
- do {
- BUG_ON(buffer_locked(bh));
- /*
- * We need to try to allocate unmapped blocks
- * in the same page. Otherwise we won't make
- * progress with the page in ext4_writepage
- */
- if (ext4_bh_delay_or_unwritten(NULL, bh)) {
- mpage_add_bh_to_extent(mpd, logical,
- bh->b_state);
- if (mpd->io_done)
- goto ret_extent_tail;
- } else if (buffer_dirty(bh) &&
- buffer_mapped(bh)) {
- /*
- * mapped dirty buffer. We need to
- * update the b_state because we look
- * at b_state in mpage_da_map_blocks.
- * We don't update b_size because if we
- * find an unmapped buffer_head later
- * we need to use the b_state flag of
- * that buffer_head.
- */
- if (mpd->b_size == 0)
- mpd->b_state =
- bh->b_state & BH_FLAGS;
- }
- logical++;
- } while ((bh = bh->b_this_page) != head);
-
- if (nr_to_write > 0) {
- nr_to_write--;
- if (nr_to_write == 0 &&
- wbc->sync_mode == WB_SYNC_NONE)
- /*
- * We stop writing back only if we are
- * not doing integrity sync. In case of
- * integrity sync we have to keep going
- * because someone may be concurrently
- * dirtying pages, and we might have
- * synced a lot of newly appeared dirty
- * pages, but have not synced all of the
- * old dirty pages.
- */
+ if (!add_page_bufs_to_extent(mpd, head, head, lblk))
+ goto out;
+ /* So far everything mapped? Submit the page for IO. */
+ if (mpd->map.m_len == 0) {
+ err = mpage_submit_page(mpd, page);
+ if (err < 0)
goto out;
}
+
+ /*
+ * Accumulated enough dirty pages? This doesn't apply
+ * to WB_SYNC_ALL mode. For integrity sync we have to
+ * keep going because someone may be concurrently
+ * dirtying pages, and we might have synced a lot of
+ * newly appeared dirty pages, but have not synced all
+ * of the old dirty pages.
+ */
+ if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
+ mpd->next_page - mpd->first_page >=
+ mpd->wbc->nr_to_write)
+ goto out;
}
pagevec_release(&pvec);
cond_resched();
}
return 0;
-ret_extent_tail:
- ret = MPAGE_DA_EXTENT_TAIL;
out:
pagevec_release(&pvec);
- cond_resched();
- return ret;
+ return err;
}
+static int __writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret = ext4_writepage(page, wbc);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
-static int ext4_da_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int ext4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- pgoff_t index;
+ pgoff_t writeback_index = 0;
+ long nr_to_write = wbc->nr_to_write;
int range_whole = 0;
+ int cycled = 1;
handle_t *handle = NULL;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
- int pages_written = 0;
- unsigned int max_pages;
- int range_cyclic, cycled = 1, io_done = 0;
- int needed_blocks, ret = 0;
- long desired_nr_to_write, nr_to_writebump = 0;
- loff_t range_start = wbc->range_start;
+ int needed_blocks, rsv_blocks = 0, ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
- pgoff_t done_index = 0;
- pgoff_t end;
+ bool done;
struct blk_plug plug;
+ bool give_up_on_write = false;
- trace_ext4_da_writepages(inode, wbc);
+ trace_ext4_writepages(inode, wbc);
/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -2498,170 +2391,171 @@ static int ext4_da_writepages(struct address_space *mapping,
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
+ if (ext4_should_journal_data(inode)) {
+ struct blk_plug plug;
+ int ret;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_finish_plug(&plug);
+ return ret;
+ }
+
/*
* If the filesystem has aborted, it is read-only, so return
* right away instead of dumping stack traces later on that
* will obscure the real source of the problem. We test
* EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
* the latter could be true if the filesystem is mounted
- * read-only, and in that case, ext4_da_writepages should
+ * read-only, and in that case, ext4_writepages should
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
return -EROFS;
+ if (ext4_should_dioread_nolock(inode)) {
+ /*
+ * We may need to convert up to one extent per block in
+ * the page and we may dirty the inode.
+ */
+ rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
+ }
+
+ /*
+ * If we have inline data and arrive here, it means that
+ * we will soon create the block for the 1st page, so
+ * we'd better clear the inline data here.
+ */
+ if (ext4_has_inline_data(inode)) {
+ /* Just inode will be modified... */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ BUG_ON(ext4_test_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA));
+ ext4_destroy_inline_data(handle, inode);
+ ext4_journal_stop(handle);
+ }
+
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- range_cyclic = wbc->range_cyclic;
if (wbc->range_cyclic) {
- index = mapping->writeback_index;
- if (index)
+ writeback_index = mapping->writeback_index;
+ if (writeback_index)
cycled = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = LLONG_MAX;
- wbc->range_cyclic = 0;
- end = -1;
+ mpd.first_page = writeback_index;
+ mpd.last_page = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
- }
-
- /*
- * This works around two forms of stupidity. The first is in
- * the writeback code, which caps the maximum number of pages
- * written to be 1024 pages. This is wrong on multiple
- * levels; different architectues have a different page size,
- * which changes the maximum amount of data which gets
- * written. Secondly, 4 megabytes is way too small. XFS
- * forces this value to be 16 megabytes by multiplying
- * nr_to_write parameter by four, and then relies on its
- * allocator to allocate larger extents to make them
- * contiguous. Unfortunately this brings us to the second
- * stupidity, which is that ext4's mballoc code only allocates
- * at most 2048 blocks. So we force contiguous writes up to
- * the number of dirty blocks in the inode, or
- * sbi->max_writeback_mb_bump whichever is smaller.
- */
- max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
- if (!range_cyclic && range_whole) {
- if (wbc->nr_to_write == LONG_MAX)
- desired_nr_to_write = wbc->nr_to_write;
- else
- desired_nr_to_write = wbc->nr_to_write * 8;
- } else
- desired_nr_to_write = ext4_num_dirty_pages(inode, index,
- max_pages);
- if (desired_nr_to_write > max_pages)
- desired_nr_to_write = max_pages;
-
- if (wbc->nr_to_write < desired_nr_to_write) {
- nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
- wbc->nr_to_write = desired_nr_to_write;
+ mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
+ mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
}
+ mpd.inode = inode;
+ mpd.wbc = wbc;
+ ext4_io_submit_init(&mpd.io_submit, wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, index, end);
-
+ tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+ done = false;
blk_start_plug(&plug);
- while (!ret && wbc->nr_to_write > 0) {
+ while (!done && mpd.first_page <= mpd.last_page) {
+ /* For each extent of pages we use new io_end */
+ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd.io_submit.io_end) {
+ ret = -ENOMEM;
+ break;
+ }
/*
- * we insert one extent at a time. So we need
- * credit needed for single extent allocation.
- * journalled mode is currently not supported
- * by delalloc
+ * We have two constraints: We find one extent to map and we
+ * must always write out whole page (makes a difference when
+ * blocksize < pagesize) so that we don't block on IO when we
+ * try to write out the rest of the page. Journalled mode is
+ * not supported by delalloc.
*/
BUG_ON(ext4_should_journal_data(inode));
needed_blocks = ext4_da_writepages_trans_blocks(inode);
- /* start a new transaction*/
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
- needed_blocks);
+ /* start a new transaction */
+ handle = ext4_journal_start_with_reserve(inode,
+ EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
"%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
- blk_finish_plug(&plug);
- goto out_writepages;
+ /* Release allocated io_end */
+ ext4_put_io_end(mpd.io_submit.io_end);
+ break;
}
- /*
- * Now call write_cache_pages_da() to find the next
- * contiguous region of logical blocks that need
- * blocks to be allocated by ext4 and submit them.
- */
- ret = write_cache_pages_da(handle, mapping,
- wbc, &mpd, &done_index);
- /*
- * If we have a contiguous extent of pages and we
- * haven't done the I/O yet, map the blocks and submit
- * them for I/O.
- */
- if (!mpd.io_done && mpd.next_page != mpd.first_page) {
- mpage_da_map_and_submit(&mpd);
- ret = MPAGE_DA_EXTENT_TAIL;
+ trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
+ ret = mpage_prepare_extent_to_map(&mpd);
+ if (!ret) {
+ if (mpd.map.m_len)
+ ret = mpage_map_and_submit_extent(handle, &mpd,
+ &give_up_on_write);
+ else {
+ /*
+ * We scanned the whole range (or exhausted
+ * nr_to_write), submitted what was mapped and
+ * didn't find anything needing mapping. We are
+ * done.
+ */
+ done = true;
+ }
}
- trace_ext4_da_write_pages(inode, &mpd);
- wbc->nr_to_write -= mpd.pages_written;
-
ext4_journal_stop(handle);
-
- if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
- /* commit the transaction which would
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, give_up_on_write);
+ /* Drop our io_end reference we got from init */
+ ext4_put_io_end(mpd.io_submit.io_end);
+
+ if (ret == -ENOSPC && sbi->s_journal) {
+ /*
+ * Commit the transaction which would
* free blocks released in the transaction
* and try again
*/
jbd2_journal_force_commit_nested(sbi->s_journal);
ret = 0;
- } else if (ret == MPAGE_DA_EXTENT_TAIL) {
- /*
- * Got one extent now try with rest of the pages.
- * If mpd.retval is set -EIO, journal is aborted.
- * So we don't need to write any more.
- */
- pages_written += mpd.pages_written;
- ret = mpd.retval;
- io_done = 1;
- } else if (wbc->nr_to_write)
- /*
- * There is no more writeout needed
- * or we requested for a noblocking writeout
- * and we found the device congested
- */
+ continue;
+ }
+ /* Fatal error - ENOMEM, EIO... */
+ if (ret)
break;
}
blk_finish_plug(&plug);
- if (!io_done && !cycled) {
+ if (!ret && !cycled) {
cycled = 1;
- index = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = mapping->writeback_index - 1;
+ mpd.last_page = writeback_index - 1;
+ mpd.first_page = 0;
goto retry;
}
/* Update index */
- wbc->range_cyclic = range_cyclic;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
/*
- * set the writeback_index so that range_cyclic
+ * Set the writeback_index so that range_cyclic
* mode will write it back later
*/
- mapping->writeback_index = done_index;
+ mapping->writeback_index = mpd.first_page;
out_writepages:
- wbc->nr_to_write -= nr_to_writebump;
- wbc->range_start = range_start;
- trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+ trace_ext4_writepages_result(inode, wbc, ret,
+ nr_to_write - wbc->nr_to_write);
return ret;
}
static int ext4_nonda_switch(struct super_block *sb)
{
- s64 free_blocks, dirty_blocks;
+ s64 free_clusters, dirty_clusters;
struct ext4_sb_info *sbi = EXT4_SB(sb);
/*
@@ -2672,17 +2566,18 @@ static int ext4_nonda_switch(struct super_block *sb)
* Delalloc need an accurate free block accounting. So switch
* to non delalloc when we are near to error range.
*/
- free_blocks = EXT4_C2B(sbi,
- percpu_counter_read_positive(&sbi->s_freeclusters_counter));
- dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+ free_clusters =
+ percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+ dirty_clusters =
+ percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
/*
* Start pushing delalloc when 1/2 of free blocks are dirty.
*/
- if (dirty_blocks && (free_blocks < 2 * dirty_blocks))
+ if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
- if (2 * free_blocks < 3 * dirty_blocks ||
- free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
+ if (2 * free_clusters < 3 * dirty_clusters ||
+ free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
/*
* free block count is less than 150% of dirty blocks
* or free blocks is less than watermark
@@ -2818,18 +2713,9 @@ static int ext4_da_write_end(struct file *file,
unsigned long start, end;
int write_mode = (int)(unsigned long)fsdata;
- if (write_mode == FALL_BACK_TO_NONDELALLOC) {
- switch (ext4_inode_journal_mode(inode)) {
- case EXT4_INODE_ORDERED_DATA_MODE:
- return ext4_ordered_write_end(file, mapping, pos,
- len, copied, page, fsdata);
- case EXT4_INODE_WRITEBACK_DATA_MODE:
- return ext4_writeback_write_end(file, mapping, pos,
- len, copied, page, fsdata);
- default:
- BUG();
- }
- }
+ if (write_mode == FALL_BACK_TO_NONDELALLOC)
+ return ext4_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
start = pos & (PAGE_CACHE_SIZE - 1);
@@ -2875,7 +2761,8 @@ static int ext4_da_write_end(struct file *file,
return ret ? ret : copied;
}
-static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
/*
* Drop reserved blocks
@@ -2884,10 +2771,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
if (!page_has_buffers(page))
goto out;
- ext4_da_page_release_reservation(page, offset);
+ ext4_da_page_release_reservation(page, offset, length);
out:
- ext4_invalidatepage(page, offset);
+ ext4_invalidatepage(page, offset, length);
return;
}
@@ -2910,7 +2797,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
* laptop_mode, not even desirable). However, to do otherwise
* would require replicating code paths in:
*
- * ext4_da_writepages() ->
+ * ext4_writepages() ->
* write_cache_pages() ---> (via passed in callback function)
* __mpage_da_writepage() -->
* mpage_add_bh_to_extent()
@@ -3035,37 +2922,40 @@ ext4_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
-static void ext4_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- trace_ext4_invalidatepage(page, offset);
+ trace_ext4_invalidatepage(page, offset, length);
/* No journalling happens on data buffers when this function is used */
WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
- block_invalidatepage(page, offset);
+ block_invalidatepage(page, offset, length);
}
static int __ext4_journalled_invalidatepage(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
- trace_ext4_journalled_invalidatepage(page, offset);
+ trace_ext4_journalled_invalidatepage(page, offset, length);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- return jbd2_journal_invalidatepage(journal, page, offset);
+ return jbd2_journal_invalidatepage(journal, page, offset, length);
}
/* Wrapper for aops... */
static void ext4_journalled_invalidatepage(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
- WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
+ WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3113,9 +3003,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
struct inode *inode = file_inode(iocb->ki_filp);
ext4_io_end_t *io_end = iocb->private;
- /* if not async direct IO or dio with 0 bytes write, just return */
- if (!io_end || !size)
- goto out;
+ /* if not async direct IO just return */
+ if (!io_end) {
+ inode_dio_done(inode);
+ if (is_async)
+ aio_complete(iocb, ret, 0);
+ return;
+ }
ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3123,25 +3017,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
size);
iocb->private = NULL;
-
- /* if not aio dio with unwritten extents, just free io and return */
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- ext4_free_io_end(io_end);
-out:
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
- return;
- }
-
io_end->offset = offset;
io_end->size = size;
if (is_async) {
io_end->iocb = iocb;
io_end->result = ret;
}
-
- ext4_add_complete_io(io_end);
+ ext4_put_io_end_defer(io_end);
}
/*
@@ -3175,6 +3057,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
get_block_t *get_block_func = NULL;
int dio_flags = 0;
loff_t final_size = offset + count;
+ ext4_io_end_t *io_end = NULL;
/* Use the old path for reads and writes beyond i_size. */
if (rw != WRITE || final_size > inode->i_size)
@@ -3182,11 +3065,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
BUG_ON(iocb->private == NULL);
+ /*
+ * Make all waiters for direct IO properly wait also for extent
+ * conversion. This also disallows race between truncate() and
+ * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+ */
+ if (rw == WRITE)
+ atomic_inc(&inode->i_dio_count);
+
/* If we do a overwrite dio, i_mutex locking can be released */
overwrite = *((int *)iocb->private);
if (overwrite) {
- atomic_inc(&inode->i_dio_count);
down_read(&EXT4_I(inode)->i_data_sem);
mutex_unlock(&inode->i_mutex);
}
@@ -3213,13 +3103,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
iocb->private = NULL;
ext4_inode_aio_set(inode, NULL);
if (!is_sync_kiocb(iocb)) {
- ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
+ io_end = ext4_init_io_end(inode, GFP_NOFS);
if (!io_end) {
ret = -ENOMEM;
goto retake_lock;
}
io_end->flag |= EXT4_IO_END_DIRECT;
- iocb->private = io_end;
+ /*
+ * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+ */
+ iocb->private = ext4_get_io_end(io_end);
/*
* we save the io structure for current async direct
* IO, so that later ext4_map_blocks() could flag the
@@ -3243,33 +3136,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
NULL,
dio_flags);
- if (iocb->private)
- ext4_inode_aio_set(inode, NULL);
/*
- * The io_end structure takes a reference to the inode, that
- * structure needs to be destroyed and the reference to the
- * inode need to be dropped, when IO is complete, even with 0
- * byte write, or failed.
- *
- * In the successful AIO DIO case, the io_end structure will
- * be destroyed and the reference to the inode will be dropped
- * after the end_io call back function is called.
- *
- * In the case there is 0 byte write, or error case, since VFS
- * direct IO won't invoke the end_io call back function, we
- * need to free the end_io structure here.
+ * Put our reference to io_end. This can free the io_end structure e.g.
+ * in sync IO case or in case of error. It can even perform extent
+ * conversion if all bios we submitted finished before we got here.
+ * Note that in that case iocb->private can be already set to NULL
+ * here.
*/
- if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
- ext4_free_io_end(iocb->private);
- iocb->private = NULL;
- } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+ if (io_end) {
+ ext4_inode_aio_set(inode, NULL);
+ ext4_put_io_end(io_end);
+ /*
+ * When no IO was submitted ext4_end_io_dio() was not
+ * called so we have to put iocb's reference.
+ */
+ if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
+ WARN_ON(iocb->private != io_end);
+ WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ WARN_ON(io_end->iocb);
+ /*
+ * Generic code already did inode_dio_done() so we
+ * have to clear EXT4_IO_END_DIRECT to not do it for
+ * the second time.
+ */
+ io_end->flag = 0;
+ ext4_put_io_end(io_end);
+ iocb->private = NULL;
+ }
+ }
+ if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
int err;
/*
* for non AIO case, since the IO is already
* completed, we could do the conversion right here
*/
- err = ext4_convert_unwritten_extents(inode,
+ err = ext4_convert_unwritten_extents(NULL, inode,
offset, ret);
if (err < 0)
ret = err;
@@ -3277,9 +3179,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
}
retake_lock:
+ if (rw == WRITE)
+ inode_dio_done(inode);
/* take i_mutex locking again if we do a ovewrite dio */
if (overwrite) {
- inode_dio_done(inode);
up_read(&EXT4_I(inode)->i_data_sem);
mutex_lock(&inode->i_mutex);
}
@@ -3334,27 +3237,13 @@ static int ext4_journalled_set_page_dirty(struct page *page)
return __set_page_dirty_nobuffers(page);
}
-static const struct address_space_operations ext4_ordered_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
- .writepage = ext4_writepage,
- .write_begin = ext4_write_begin,
- .write_end = ext4_ordered_write_end,
- .bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
- .migratepage = buffer_migrate_page,
- .is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
-};
-
-static const struct address_space_operations ext4_writeback_aops = {
+static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
+ .writepages = ext4_writepages,
.write_begin = ext4_write_begin,
- .write_end = ext4_writeback_write_end,
+ .write_end = ext4_write_end,
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
@@ -3368,6 +3257,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
+ .writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
.set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3383,7 +3273,7 @@ static const struct address_space_operations ext4_da_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
- .writepages = ext4_da_writepages,
+ .writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
.bmap = ext4_bmap,
@@ -3399,108 +3289,73 @@ void ext4_set_aops(struct inode *inode)
{
switch (ext4_inode_journal_mode(inode)) {
case EXT4_INODE_ORDERED_DATA_MODE:
- if (test_opt(inode->i_sb, DELALLOC))
- inode->i_mapping->a_ops = &ext4_da_aops;
- else
- inode->i_mapping->a_ops = &ext4_ordered_aops;
+ ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
break;
case EXT4_INODE_WRITEBACK_DATA_MODE:
- if (test_opt(inode->i_sb, DELALLOC))
- inode->i_mapping->a_ops = &ext4_da_aops;
- else
- inode->i_mapping->a_ops = &ext4_writeback_aops;
+ ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
break;
case EXT4_INODE_JOURNAL_DATA_MODE:
inode->i_mapping->a_ops = &ext4_journalled_aops;
- break;
+ return;
default:
BUG();
}
+ if (test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
+ else
+ inode->i_mapping->a_ops = &ext4_aops;
}
-
/*
- * ext4_discard_partial_page_buffers()
- * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
- * This function finds and locks the page containing the offset
- * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
- * Calling functions that already have the page locked should call
- * ext4_discard_partial_page_buffers_no_lock directly.
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This is required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
*/
-int ext4_discard_partial_page_buffers(handle_t *handle,
- struct address_space *mapping, loff_t from,
- loff_t length, int flags)
+int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from)
{
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned length;
+ unsigned blocksize;
struct inode *inode = mapping->host;
- struct page *page;
- int err = 0;
-
- page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
- if (!page)
- return -ENOMEM;
- err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
- from, length, flags);
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
- unlock_page(page);
- page_cache_release(page);
- return err;
+ return ext4_block_zero_page_range(handle, mapping, from, length);
}
/*
- * ext4_discard_partial_page_buffers_no_lock()
- * Zeros a page range of length 'length' starting from offset 'from'.
- * Buffer heads that correspond to the block aligned regions of the
- * zeroed range will be unmapped. Unblock aligned regions
- * will have the corresponding buffer head mapped if needed so that
- * that region of the page can be updated with the partial zero out.
- *
- * This function assumes that the page has already been locked. The
- * The range to be discarded must be contained with in the given page.
- * If the specified range exceeds the end of the page it will be shortened
- * to the end of the page that corresponds to 'from'. This function is
- * appropriate for updating a page and it buffer heads to be unmapped and
- * zeroed for blocks that have been either released, or are going to be
- * released.
- *
- * handle: The journal handle
- * inode: The files inode
- * page: A locked page that contains the offset "from"
- * from: The starting byte offset (from the beginning of the file)
- * to begin discarding
- * len: The length of bytes to discard
- * flags: Optional flags that may be used:
- *
- * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
- * Only zero the regions of the page whose buffer heads
- * have already been unmapped. This flag is appropriate
- * for updating the contents of a page whose blocks may
- * have already been released, and we only want to zero
- * out the regions that correspond to those released blocks.
- *
- * Returns zero on success or negative on failure.
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'. The range to be zeroed must
+ * be contained within one block. If the specified range exceeds
+ * the end of the block it will be shortened to the end of the block
+ * that corresponds to 'from'.
*/
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
- struct inode *inode, struct page *page, loff_t from,
- loff_t length, int flags)
+int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
- unsigned int offset = from & (PAGE_CACHE_SIZE-1);
- unsigned int blocksize, max, pos;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned blocksize, max, pos;
ext4_lblk_t iblock;
+ struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;
- blocksize = inode->i_sb->s_blocksize;
- max = PAGE_CACHE_SIZE - offset;
+ page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+ mapping_gfp_mask(mapping) & ~__GFP_FS);
+ if (!page)
+ return -ENOMEM;
- if (index != page->index)
- return -EINVAL;
+ blocksize = inode->i_sb->s_blocksize;
+ max = blocksize - (offset & (blocksize - 1));
/*
* correct length if it does not fall between
- * 'from' and the end of the page
+ * 'from' and the end of the block
*/
if (length > max || length < 0)
length = max;
@@ -3518,106 +3373,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
iblock++;
pos += blocksize;
}
-
- pos = offset;
- while (pos < offset + length) {
- unsigned int end_of_block, range_to_discard;
-
- err = 0;
-
- /* The length of space left to zero and unmap */
- range_to_discard = offset + length - pos;
-
- /* The length of space until the end of the block */
- end_of_block = blocksize - (pos & (blocksize-1));
-
- /*
- * Do not unmap or zero past end of block
- * for this buffer head
- */
- if (range_to_discard > end_of_block)
- range_to_discard = end_of_block;
-
-
- /*
- * Skip this buffer head if we are only zeroing unampped
- * regions of the page
- */
- if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
- buffer_mapped(bh))
- goto next;
-
- /* If the range is block aligned, unmap */
- if (range_to_discard == blocksize) {
- clear_buffer_dirty(bh);
- bh->b_bdev = NULL;
- clear_buffer_mapped(bh);
- clear_buffer_req(bh);
- clear_buffer_new(bh);
- clear_buffer_delay(bh);
- clear_buffer_unwritten(bh);
- clear_buffer_uptodate(bh);
- zero_user(page, pos, range_to_discard);
- BUFFER_TRACE(bh, "Buffer discarded");
- goto next;
- }
-
- /*
- * If this block is not completely contained in the range
- * to be discarded, then it is not going to be released. Because
- * we need to keep this block, we need to make sure this part
- * of the page is uptodate before we modify it by writeing
- * partial zeros on it.
- */
+ if (buffer_freed(bh)) {
+ BUFFER_TRACE(bh, "freed: skip");
+ goto unlock;
+ }
+ if (!buffer_mapped(bh)) {
+ BUFFER_TRACE(bh, "unmapped");
+ ext4_get_block(inode, iblock, bh, 0);
+ /* unmapped? It's a hole - nothing to do */
if (!buffer_mapped(bh)) {
- /*
- * Buffer head must be mapped before we can read
- * from the block
- */
- BUFFER_TRACE(bh, "unmapped");
- ext4_get_block(inode, iblock, bh, 0);
- /* unmapped? It's a hole - nothing to do */
- if (!buffer_mapped(bh)) {
- BUFFER_TRACE(bh, "still unmapped");
- goto next;
- }
+ BUFFER_TRACE(bh, "still unmapped");
+ goto unlock;
}
+ }
- /* Ok, it's mapped. Make sure it's up-to-date */
- if (PageUptodate(page))
- set_buffer_uptodate(bh);
+ /* Ok, it's mapped. Make sure it's up-to-date */
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
- if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
- /* Uhhuh. Read error. Complain and punt.*/
- if (!buffer_uptodate(bh))
- goto next;
- }
+ if (!buffer_uptodate(bh)) {
+ err = -EIO;
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ /* Uhhuh. Read error. Complain and punt. */
+ if (!buffer_uptodate(bh))
+ goto unlock;
+ }
+ if (ext4_should_journal_data(inode)) {
+ BUFFER_TRACE(bh, "get write access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ goto unlock;
+ }
+ zero_user(page, offset, length);
+ BUFFER_TRACE(bh, "zeroed end of block");
- if (ext4_should_journal_data(inode)) {
- BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, bh);
- if (err)
- goto next;
- }
+ if (ext4_should_journal_data(inode)) {
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ } else {
+ err = 0;
+ mark_buffer_dirty(bh);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+ err = ext4_jbd2_file_inode(handle, inode);
+ }
- zero_user(page, pos, range_to_discard);
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+}
- err = 0;
- if (ext4_should_journal_data(inode)) {
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- } else
- mark_buffer_dirty(bh);
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t length)
+{
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned partial_start, partial_end;
+ ext4_fsblk_t start, end;
+ loff_t byte_end = (lstart + length - 1);
+ int err = 0;
- BUFFER_TRACE(bh, "Partial buffer zeroed");
-next:
- bh = bh->b_this_page;
- iblock++;
- pos += range_to_discard;
- }
+ partial_start = lstart & (sb->s_blocksize - 1);
+ partial_end = byte_end & (sb->s_blocksize - 1);
+ start = lstart >> sb->s_blocksize_bits;
+ end = byte_end >> sb->s_blocksize_bits;
+
+ /* Handle partial zero within the single block */
+ if (start == end &&
+ (partial_start || (partial_end != sb->s_blocksize - 1))) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, length);
+ return err;
+ }
+ /* Handle partial zero out on the start of the range */
+ if (partial_start) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, sb->s_blocksize);
+ if (err)
+ return err;
+ }
+ /* Handle partial zero out on the end of the range */
+ if (partial_end != sb->s_blocksize - 1)
+ err = ext4_block_zero_page_range(handle, mapping,
+ byte_end - partial_end,
+ partial_end + 1);
return err;
}
@@ -3643,23 +3483,128 @@ int ext4_can_truncate(struct inode *inode)
* Returns: 0 on success or negative on failure
*/
-int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
+int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
{
- struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t first_block, stop_block;
+ struct address_space *mapping = inode->i_mapping;
+ loff_t first_block_offset, last_block_offset;
+ handle_t *handle;
+ unsigned int credits;
+ int ret = 0;
+
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- return ext4_ind_punch_hole(file, offset, length);
-
- if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
+ if (EXT4_SB(sb)->s_cluster_ratio > 1) {
/* TODO: Add support for bigalloc file systems */
return -EOPNOTSUPP;
}
trace_ext4_punch_hole(inode, offset, length);
- return ext4_ext_punch_hole(file, offset, length);
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ ret = filemap_write_and_wait_range(mapping, offset,
+ offset + length - 1);
+ if (ret)
+ return ret;
+ }
+
+ mutex_lock(&inode->i_mutex);
+ /* Punching a hole in an append-only or immutable file is not allowed */
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+ ret = -EPERM;
+ goto out_mutex;
+ }
+ if (IS_SWAPFILE(inode)) {
+ ret = -ETXTBSY;
+ goto out_mutex;
+ }
+
+ /* No need to punch hole beyond i_size */
+ if (offset >= inode->i_size)
+ goto out_mutex;
+
+ /*
+ * If the hole extends beyond i_size, set the hole
+ * to end after the page that contains i_size
+ */
+ if (offset + length > inode->i_size) {
+ length = inode->i_size +
+ PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+ offset;
+ }
+
+ first_block_offset = round_up(offset, sb->s_blocksize);
+ last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+
+ /* Now release the pages and zero the block aligned part of pages */
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
+
+ /* Wait for all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ credits = ext4_writepage_trans_blocks(inode);
+ else
+ credits = ext4_blocks_for_truncate(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_std_error(sb, ret);
+ goto out_dio;
+ }
+
+ ret = ext4_zero_partial_blocks(handle, inode, offset,
+ length);
+ if (ret)
+ goto out_stop;
+
+ first_block = (offset + sb->s_blocksize - 1) >>
+ EXT4_BLOCK_SIZE_BITS(sb);
+ stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ /* If there are no blocks to remove, return now */
+ if (first_block >= stop_block)
+ goto out_stop;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
+
+ ret = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ret = ext4_ext_remove_space(inode, first_block,
+ stop_block - 1);
+ else
+ ret = ext4_free_hole_blocks(handle, inode, first_block,
+ stop_block);
+
+ ext4_discard_preallocations(inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+out_stop:
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
}
/*
@@ -3692,6 +3637,18 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
*/
void ext4_truncate(struct inode *inode)
{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int credits;
+ handle_t *handle;
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * There is a possibility that we're either freeing the inode
+ * or it is a completely new inode. In those cases we might not
+ * have i_mutex locked because it's not necessary.
+ */
+ if (!(inode->i_state & (I_NEW|I_FREEING)))
+ WARN_ON(!mutex_is_locked(&inode->i_mutex));
trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
@@ -3711,9 +3668,59 @@ void ext4_truncate(struct inode *inode)
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ext4_ext_truncate(inode);
+ credits = ext4_writepage_trans_blocks(inode);
else
- ext4_ind_truncate(inode);
+ credits = ext4_blocks_for_truncate(inode);
+
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ext4_std_error(inode->i_sb, PTR_ERR(handle));
+ return;
+ }
+
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
+
+ /*
+ * We add the inode to the orphan list, so that if this
+ * truncate spans multiple transactions, and we crash, we will
+ * resume the truncate when the filesystem recovers. It also
+ * marks the inode dirty, to catch the new size.
+ *
+ * Implication: the file must always be in a sane, consistent
+ * truncatable state while each transaction commits.
+ */
+ if (ext4_orphan_add(handle, inode))
+ goto out_stop;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+
+ ext4_discard_preallocations(inode);
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ext4_ext_truncate(handle, inode);
+ else
+ ext4_ind_truncate(handle, inode);
+
+ up_write(&ei->i_data_sem);
+
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+out_stop:
+ /*
+ * If this was a simple ftruncate() and the file will remain alive,
+ * then we need to clear up the orphan record which we created above.
+ * However, if this was a real unlink then we were called by
+ * ext4_delete_inode(), and we allow that function to clean up the
+ * orphan info for us.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
trace_ext4_truncate_exit(inode);
}
@@ -3821,13 +3828,14 @@ make_io:
if (EXT4_SB(sb)->s_inode_readahead_blks) {
ext4_fsblk_t b, end, table;
unsigned num;
+ __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
table = ext4_inode_table(sb, gdp);
/* s_inode_readahead_blks is always a power of 2 */
- b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
+ b = block & ~((ext4_fsblk_t) ra_blks - 1);
if (table > b)
b = table;
- end = b + EXT4_SB(sb)->s_inode_readahead_blks;
+ end = b + ra_blks;
num = EXT4_INODES_PER_GROUP(sb);
if (ext4_has_group_desc_csum(sb))
num -= ext4_itable_unused_count(sb, gdp);
@@ -4024,8 +4032,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
* NeilBrown 1999oct15
*/
if (inode->i_nlink == 0) {
- if (inode->i_mode == 0 ||
- !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+ if ((inode->i_mode == 0 ||
+ !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
+ ino != EXT4_BOOT_LOADER_INO) {
/* this inode is deleted */
ret = -ESTALE;
goto bad_inode;
@@ -4033,7 +4042,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
/* The only unlinked inodes we let through here have
* valid i_mode and are being read by the orphan
* recovery code: that's fine, we're about to complete
- * the process of deleting those. */
+ * the process of deleting those.
+ * OR it is the EXT4_BOOT_LOADER_INO which is
+ * not initialized on a new filesystem. */
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
@@ -4153,6 +4164,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
else
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+ } else if (ino == EXT4_BOOT_LOADER_INO) {
+ make_bad_inode(inode);
} else {
ret = -EIO;
EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
@@ -4435,7 +4448,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
inode->i_size >> PAGE_CACHE_SHIFT);
if (!page)
return;
- ret = __ext4_journalled_invalidatepage(page, offset);
+ ret = __ext4_journalled_invalidatepage(page, offset,
+ PAGE_CACHE_SIZE - offset);
unlock_page(page);
page_cache_release(page);
if (ret != -EBUSY)
@@ -4617,7 +4631,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct inode *inode;
- unsigned long delalloc_blocks;
+ unsigned long long delalloc_blocks;
inode = dentry->d_inode;
generic_fillattr(inode, stat);
@@ -4635,15 +4649,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
EXT4_I(inode)->i_reserved_data_blocks);
- stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+ stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
return 0;
}
-static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
{
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return ext4_ind_trans_blocks(inode, nrblocks, chunk);
- return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+ return ext4_ind_trans_blocks(inode, lblocks);
+ return ext4_ext_index_trans_blocks(inode, pextents);
}
/*
@@ -4657,7 +4672,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
*
* Also account for superblock, inode, quota and xattr blocks
*/
-static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
{
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
int gdpblocks;
@@ -4665,14 +4681,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
int ret = 0;
/*
- * How many index blocks need to touch to modify nrblocks?
- * The "Chunk" flag indicating whether the nrblocks is
- * physically contiguous on disk
- *
- * For Direct IO and fallocate, they calls get_block to allocate
- * one single extent at a time, so they could set the "Chunk" flag
+ * How many index blocks do we need to touch to map @lblocks logical
+ * blocks to @pextents physical extents?
*/
- idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
ret = idxblocks;
@@ -4680,12 +4692,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* Now let's see how many group bitmaps and group descriptors need
* to account
*/
- groups = idxblocks;
- if (chunk)
- groups += 1;
- else
- groups += nrblocks;
-
+ groups = idxblocks + pextents;
gdpblocks = groups;
if (groups > ngroups)
groups = ngroups;
@@ -4716,7 +4723,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
int bpp = ext4_journal_blocks_per_page(inode);
int ret;
- ret = ext4_meta_trans_blocks(inode, bpp, 0);
+ ret = ext4_meta_trans_blocks(inode, bpp, bpp);
/* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 721f4d33e148..9491ac0590f7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -17,9 +17,201 @@
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
+#include "ext4_extents.h"
#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
+/**
+ * Swap memory between @a and @b for @len bytes.
+ *
+ * @a: pointer to first memory area
+ * @b: pointer to second memory area
+ * @len: number of bytes to swap
+ *
+ */
+static void memswap(void *a, void *b, size_t len)
+{
+ unsigned char *ap, *bp;
+ unsigned char tmp;
+
+ ap = (unsigned char *)a;
+ bp = (unsigned char *)b;
+ while (len-- > 0) {
+ tmp = *ap;
+ *ap = *bp;
+ *bp = tmp;
+ ap++;
+ bp++;
+ }
+}
+
+/**
+ * Swap i_data and associated attributes between @inode1 and @inode2.
+ * This function is used for the primary swap between inode1 and inode2
+ * and also to revert this primary swap in case of errors.
+ *
+ * Therefore you have to make sure that calling this method twice
+ * will revert all changes.
+ *
+ * @inode1: pointer to first inode
+ * @inode2: pointer to second inode
+ */
+static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+{
+ loff_t isize;
+ struct ext4_inode_info *ei1;
+ struct ext4_inode_info *ei2;
+
+ ei1 = EXT4_I(inode1);
+ ei2 = EXT4_I(inode2);
+
+ memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
+ memswap(&inode1->i_version, &inode2->i_version,
+ sizeof(inode1->i_version));
+ memswap(&inode1->i_blocks, &inode2->i_blocks,
+ sizeof(inode1->i_blocks));
+ memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
+ memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
+ memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
+
+ memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
+ memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
+ memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
+ memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree));
+ memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr));
+
+ isize = i_size_read(inode1);
+ i_size_write(inode1, i_size_read(inode2));
+ i_size_write(inode2, isize);
+}
+
+/**
+ * Swap the contents of @inode with the boot loader inode
+ * EXT4_BOOT_LOADER_INO: i_data and the other fields exchanged by
+ * swap_inode_data(), plus fresh ctimes and generation numbers.
+ *
+ * @sb: the super block of the filesystem
+ * @inode: the inode to swap with EXT4_BOOT_LOADER_INO
+ * Returns a negative errno on failure.
+ */
+static long swap_inode_boot_loader(struct super_block *sb,
+				   struct inode *inode)
+{
+	handle_t *handle;
+	int err;
+	struct inode *inode_bl;
+	struct ext4_inode_info *ei_bl;
+	struct ext4_sb_info *sbi;
+
+	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
+		err = -EINVAL;
+		goto swap_boot_out;
+	}
+
+	if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto swap_boot_out;
+	}
+
+	sbi = EXT4_SB(sb);
+
+	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
+	if (IS_ERR(inode_bl)) {
+		err = PTR_ERR(inode_bl);
+		goto swap_boot_out;
+	}
+	ei_bl = EXT4_I(inode_bl);
+
+	filemap_flush(inode->i_mapping);
+	filemap_flush(inode_bl->i_mapping);
+
+	/* Protect orig inodes against a truncate and make sure
+	 * that only 1 swap_inode_boot_loader is running. */
+	ext4_inode_double_lock(inode, inode_bl);
+
+	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages(&inode_bl->i_data, 0);
+
+	/* Wait for all existing dio workers */
+	ext4_inode_block_unlocked_dio(inode);
+	ext4_inode_block_unlocked_dio(inode_bl);
+	inode_dio_wait(inode);
+	inode_dio_wait(inode_bl);
+
+	handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
+	if (IS_ERR(handle)) {
+		/* Inodes are locked and unlocked dio is blocked: unwind. */
+		err = -EINVAL;
+		goto journal_err_out;
+	}
+
+	/* Protect extent tree against block allocations via delalloc */
+	ext4_double_down_write_data_sem(inode, inode_bl);
+
+	if (inode_bl->i_nlink == 0) {
+		/* this inode has never been used as a BOOT_LOADER */
+		set_nlink(inode_bl, 1);
+		i_uid_write(inode_bl, 0);
+		i_gid_write(inode_bl, 0);
+		inode_bl->i_flags = 0;
+		ei_bl->i_flags = 0;
+		inode_bl->i_version = 1;
+		i_size_write(inode_bl, 0);
+		inode_bl->i_mode = S_IFREG;
+		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+					      EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+			ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
+			ext4_ext_tree_init(handle, inode_bl);
+		} else
+			memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
+	}
+
+	swap_inode_data(inode, inode_bl);
+
+	inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
+
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	inode_bl->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+
+	ext4_discard_preallocations(inode);
+
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (err < 0) {
+		ext4_warning(inode->i_sb,
+			"couldn't mark inode #%lu dirty (err %d)",
+			inode->i_ino, err);
+		/* Revert all changes: */
+		swap_inode_data(inode, inode_bl);
+	} else {
+		err = ext4_mark_inode_dirty(handle, inode_bl);
+		if (err < 0) {
+			ext4_warning(inode_bl->i_sb,
+				"couldn't mark inode #%lu dirty (err %d)",
+				inode_bl->i_ino, err);
+			/* Revert all changes: */
+			swap_inode_data(inode, inode_bl);
+			ext4_mark_inode_dirty(handle, inode);
+		}
+	}
+
+	ext4_journal_stop(handle);
+
+	ext4_double_up_write_data_sem(inode, inode_bl);
+
+journal_err_out:
+	/* Undo the dio block, the inode locks and the iget reference */
+	ext4_inode_resume_unlocked_dio(inode);
+	ext4_inode_resume_unlocked_dio(inode_bl);
+	ext4_inode_double_unlock(inode, inode_bl);
+
+	iput(inode_bl);
+
+swap_boot_out:
+	return err;
+}
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
}
- if (oldflags & EXT4_EXTENTS_FL) {
- /* We don't support clearning extent flags */
- if (!(flags & EXT4_EXTENTS_FL)) {
- err = -EOPNOTSUPP;
- goto flags_out;
- }
- } else if (flags & EXT4_EXTENTS_FL) {
- /* migrate the file */
+ if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
migrate = 1;
- flags &= ~EXT4_EXTENTS_FL;
- }
if (flags & EXT4_EOFBLOCKS_FL) {
/* we don't support adding EOFBLOCKS flag */
@@ -137,8 +320,13 @@ flags_err:
err = ext4_change_inode_journal_flag(inode, jflag);
if (err)
goto flags_out;
- if (migrate)
- err = ext4_ext_migrate(inode);
+ if (migrate) {
+ if (flags & EXT4_EXTENTS_FL)
+ err = ext4_ext_migrate(inode);
+ else
+ err = ext4_ind_migrate(inode);
+ }
+
flags_out:
mutex_unlock(&inode->i_mutex);
mnt_drop_write_file(filp);
@@ -357,9 +545,13 @@ group_add_out:
return err;
}
+ case EXT4_IOC_SWAP_BOOT:
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+ return swap_inode_boot_loader(sb, inode);
+
case EXT4_IOC_RESIZE_FS: {
ext4_fsblk_t n_blocks_count;
- struct super_block *sb = inode->i_sb;
int err = 0, err2 = 0;
ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ee6614bdb639..a9ff5e5137ca 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -405,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr)
ext4_clear_bit(bit, addr);
}
+static inline int mb_test_and_clear_bit(int bit, void *addr)
+{
+ addr = mb_correct_addr_and_bit(&bit, addr);
+ return ext4_test_and_clear_bit(bit, addr);
+}
+
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
int fix = 0, ret, tmpmax;
@@ -764,6 +770,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
spin_unlock(&EXT4_SB(sb)->s_bal_lock);
}
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+ int count;
+ int order = 1;
+ void *buddy;
+
+ while ((buddy = mb_find_buddy(e4b, order++, &count))) {
+ ext4_set_bits(buddy, 0, count);
+ }
+ e4b->bd_info->bb_fragments = 0;
+ memset(e4b->bd_info->bb_counters, 0,
+ sizeof(*e4b->bd_info->bb_counters) *
+ (e4b->bd_sb->s_blocksize_bits + 2));
+
+ ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+ e4b->bd_bitmap, e4b->bd_group);
+}
+
/* The buddy information is attached the buddy cache inode
* for convenience. The information regarding each group
* is loaded via ext4_mb_load_buddy. The information involve
@@ -860,8 +884,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
first_block = page->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
- int group;
-
group = (first_block + i) >> 1;
if (group >= ngroups)
break;
@@ -1011,6 +1033,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
struct page *page;
int ret = 0;
+ might_sleep();
mb_debug(1, "init group %u\n", group);
this_grp = ext4_get_group_info(sb, group);
/*
@@ -1082,6 +1105,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct inode *inode = sbi->s_buddy_cache;
+ might_sleep();
mb_debug(1, "load group %u\n", group);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
@@ -1244,6 +1268,33 @@ static void mb_clear_bits(void *bm, int cur, int len)
}
}
+/* clear bits in given range
+ * will return first found zero bit if any, -1 otherwise
+ */
+static int mb_test_and_clear_bits(void *bm, int cur, int len)
+{
+ __u32 *addr;
+ int zero_bit = -1;
+
+ len = cur + len;
+ while (cur < len) {
+ if ((cur & 31) == 0 && (len - cur) >= 32) {
+ /* fast path: clear whole word at once */
+ addr = bm + (cur >> 3);
+ if (*addr != (__u32)(-1) && zero_bit == -1)
+ zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
+ *addr = 0;
+ cur += 32;
+ continue;
+ }
+ if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
+ zero_bit = cur;
+ cur++;
+ }
+
+ return zero_bit;
+}
+
void ext4_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1262,17 +1313,90 @@ void ext4_set_bits(void *bm, int cur, int len)
}
}
+/*
+ * _________________________________________________________________ */
+
+static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
+{
+ if (mb_test_bit(*bit + side, bitmap)) {
+ mb_clear_bit(*bit, bitmap);
+ (*bit) -= side;
+ return 1;
+ }
+ else {
+ (*bit) += side;
+ mb_set_bit(*bit, bitmap);
+ return -1;
+ }
+}
+
+static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
+{
+ int max;
+ int order = 1;
+ void *buddy = mb_find_buddy(e4b, order, &max);
+
+ while (buddy) {
+ void *buddy2;
+
+ /* Bits in range [first; last] are known to be set since
+ * corresponding blocks were allocated. Bits in range
+ * (first; last) will stay set because they form buddies on
+ * upper layer. We just deal with borders if they don't
+ * align with upper layer and then go up.
+ * Releasing entire group is all about clearing
+ * single bit of highest order buddy.
+ */
+
+ /* Example:
+ * ---------------------------------
+ * | 1 | 1 | 1 | 1 |
+ * ---------------------------------
+ * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
+ * ---------------------------------
+ * 0 1 2 3 4 5 6 7
+ * \_____________________/
+ *
+ * Neither [1] nor [6] is aligned to above layer.
+ * Left neighbour [0] is free, so mark it busy,
+ * decrease bb_counters and extend range to
+ * [0; 6]
+ * Right neighbour [7] is busy. It can't be coalesced with [6], so
+ * mark [6] free, increase bb_counters and shrink range to
+ * [0; 5].
+ * Then shift range to [0; 2], go up and do the same.
+ */
+
+
+ if (first & 1)
+ e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
+ if (!(last & 1))
+ e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
+ if (first > last)
+ break;
+ order++;
+
+ if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
+ mb_clear_bits(buddy, first, last - first + 1);
+ e4b->bd_info->bb_counters[order - 1] += last - first + 1;
+ break;
+ }
+ first >>= 1;
+ last >>= 1;
+ buddy = buddy2;
+ }
+}
+
static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
- int first, int count)
+ int first, int count)
{
- int block = 0;
- int max = 0;
- int order;
- void *buddy;
- void *buddy2;
+ int left_is_free = 0;
+ int right_is_free = 0;
+ int block;
+ int last = first + count - 1;
struct super_block *sb = e4b->bd_sb;
- BUG_ON(first + count > (sb->s_blocksize << 3));
+ BUG_ON(last >= (sb->s_blocksize << 3));
assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
@@ -1281,67 +1405,54 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
if (first < e4b->bd_info->bb_first_free)
e4b->bd_info->bb_first_free = first;
- /* let's maintain fragments counter */
+ /* access memory sequentially: check left neighbour,
+ * clear range and then check right neighbour
+ */
if (first != 0)
- block = !mb_test_bit(first - 1, e4b->bd_bitmap);
- if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
- max = !mb_test_bit(first + count, e4b->bd_bitmap);
- if (block && max)
- e4b->bd_info->bb_fragments--;
- else if (!block && !max)
- e4b->bd_info->bb_fragments++;
+ left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
+ block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
+ if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
+ right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
- /* let's maintain buddy itself */
- while (count-- > 0) {
- block = first++;
- order = 0;
+ if (unlikely(block != -1)) {
+ ext4_fsblk_t blocknr;
- if (!mb_test_bit(block, e4b->bd_bitmap)) {
- ext4_fsblk_t blocknr;
-
- blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
- blocknr += EXT4_C2B(EXT4_SB(sb), block);
- ext4_grp_locked_error(sb, e4b->bd_group,
- inode ? inode->i_ino : 0,
- blocknr,
- "freeing already freed block "
- "(bit %u)", block);
- }
- mb_clear_bit(block, e4b->bd_bitmap);
- e4b->bd_info->bb_counters[order]++;
-
- /* start of the buddy */
- buddy = mb_find_buddy(e4b, order, &max);
-
- do {
- block &= ~1UL;
- if (mb_test_bit(block, buddy) ||
- mb_test_bit(block + 1, buddy))
- break;
-
- /* both the buddies are free, try to coalesce them */
- buddy2 = mb_find_buddy(e4b, order + 1, &max);
+ blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
+ blocknr += EXT4_C2B(EXT4_SB(sb), block);
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0,
+ blocknr,
+ "freeing already freed block "
+ "(bit %u)", block);
+ mb_regenerate_buddy(e4b);
+ goto done;
+ }
- if (!buddy2)
- break;
+ /* let's maintain fragments counter */
+ if (left_is_free && right_is_free)
+ e4b->bd_info->bb_fragments--;
+ else if (!left_is_free && !right_is_free)
+ e4b->bd_info->bb_fragments++;
- if (order > 0) {
- /* for special purposes, we don't set
- * free bits in bitmap */
- mb_set_bit(block, buddy);
- mb_set_bit(block + 1, buddy);
- }
- e4b->bd_info->bb_counters[order]--;
- e4b->bd_info->bb_counters[order]--;
+ /* buddy[0] == bd_bitmap is a special case, so handle
+ * it right away and let mb_buddy_mark_free stay free of
+ * zero order checks.
+ * Check if neighbours are to be coalesced,
+ * adjust bitmap bb_counters and borders appropriately.
+ */
+ if (first & 1) {
+ first += !left_is_free;
+ e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
+ }
+ if (!(last & 1)) {
+ last -= !right_is_free;
+ e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
+ }
- block = block >> 1;
- order++;
- e4b->bd_info->bb_counters[order]++;
+ if (first <= last)
+ mb_buddy_mark_free(e4b, first >> 1, last >> 1);
- mb_clear_bit(block, buddy2);
- buddy = buddy2;
- } while (1);
- }
+done:
mb_set_largest_free_order(sb, e4b->bd_info);
mb_check_buddy(e4b);
}
@@ -1994,7 +2105,12 @@ repeat:
group = ac->ac_g_ex.fe_group;
for (i = 0; i < ngroups; group++, i++) {
- if (group == ngroups)
+ cond_resched();
+ /*
+ * Artificially restricted ngroups for non-extent
+ * files makes group > ngroups possible on first loop.
+ */
+ if (group >= ngroups)
group = 0;
/* This now checks without needing the buddy page */
@@ -2149,7 +2265,7 @@ static const struct seq_operations ext4_mb_seq_groups_ops = {
static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
{
- struct super_block *sb = PDE(inode)->data;
+ struct super_block *sb = PDE_DATA(inode);
int rc;
rc = seq_open(file, &ext4_mb_seq_groups_ops);
@@ -3342,7 +3458,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
if (pa->pa_type == MB_GROUP_PA)
grp_blk--;
- ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
+ grp = ext4_get_group_number(sb, grp_blk);
/*
* possible race:
@@ -3807,7 +3923,7 @@ repeat:
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
BUG_ON(pa->pa_type != MB_INODE_PA);
- ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+ group = ext4_get_group_number(sb, pa->pa_pstart);
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
@@ -4069,7 +4185,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
- ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+ group = ext4_get_group_number(sb, pa->pa_pstart);
if (ext4_mb_load_buddy(sb, group, &e4b)) {
ext4_error(sb, "Error loading buddy information for %u",
group);
@@ -4217,6 +4333,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
unsigned int inquota = 0;
unsigned int reserv_clstrs = 0;
+ might_sleep();
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
@@ -4289,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
- if (*errp) {
- ext4_discard_allocated_blocks(ac);
- goto errout;
- }
+ if (*errp)
+ goto discard_and_exit;
/* as we've just preallocated more space than
- * user requested orinally, we store allocated
+ * user requested originally, we store allocated
* space in a special descriptor */
if (ac->ac_status == AC_STATUS_FOUND &&
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
- ext4_mb_new_preallocation(ac);
+ ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ *errp = ext4_mb_new_preallocation(ac);
+ if (*errp) {
+ discard_and_exit:
+ ext4_discard_allocated_blocks(ac);
+ goto errout;
+ }
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4420,11 +4540,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
node = rb_prev(new_node);
if (node) {
entry = rb_entry(node, struct ext4_free_data, efd_node);
- if (can_merge(entry, new_entry)) {
+ if (can_merge(entry, new_entry) &&
+ ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
new_entry->efd_start_cluster = entry->efd_start_cluster;
new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- ext4_journal_callback_del(handle, &entry->efd_jce);
kmem_cache_free(ext4_free_data_cachep, entry);
}
}
@@ -4432,10 +4552,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
node = rb_next(new_node);
if (node) {
entry = rb_entry(node, struct ext4_free_data, efd_node);
- if (can_merge(new_entry, entry)) {
+ if (can_merge(new_entry, entry) &&
+ ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- ext4_journal_callback_del(handle, &entry->efd_jce);
kmem_cache_free(ext4_free_data_cachep, entry);
}
}
@@ -4470,6 +4590,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
int err = 0;
int ret;
+ might_sleep();
if (bh) {
if (block)
BUG_ON(block != bh->b_blocknr);
@@ -4495,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
BUG_ON(bh && (count > 1));
for (i = 0; i < count; i++) {
+ cond_resched();
if (!bh)
tbh = sb_find_get_block(inode->i_sb,
block + i);
- if (unlikely(!tbh))
+ if (!tbh)
continue;
ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
inode, tbh, block + i);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 480acf4a085f..49e8bdff9163 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
return retval;
}
return retval;
-
}
int ext4_ext_migrate(struct inode *inode)
@@ -606,3 +605,64 @@ out:
return retval;
}
+
+/*
+ * Migrate a simple extent-based inode to use the i_block[] array
+ */
+int ext4_ind_migrate(struct inode *inode)
+{
+ struct ext4_extent_header *eh;
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_extent *ex;
+ unsigned int i, len;
+ ext4_fsblk_t blk;
+ handle_t *handle;
+ int ret;
+
+ if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+ (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EINVAL;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+ return -EOPNOTSUPP;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_check_inode(inode);
+ if (ret)
+ goto errout;
+
+ eh = ext_inode_hdr(inode);
+ ex = EXT_FIRST_EXTENT(eh);
+ if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
+ eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
+ ret = -EOPNOTSUPP;
+ goto errout;
+ }
+ if (eh->eh_entries == 0)
+ blk = len = 0;
+ else {
+ len = le16_to_cpu(ex->ee_len);
+ blk = ext4_ext_pblock(ex);
+ if (len > EXT4_NDIR_BLOCKS) {
+ ret = -EOPNOTSUPP;
+ goto errout;
+ }
+ }
+
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+ memset(ei->i_data, 0, sizeof(ei->i_data));
+ for (i=0; i < len; i++)
+ ei->i_data[i] = cpu_to_le32(blk++);
+ ext4_mark_inode_dirty(handle, inode);
+errout:
+ ext4_journal_stop(handle);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ return ret;
+}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index f9b551561d2c..214461e42a05 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -7,7 +7,7 @@
#include "ext4.h"
/* Checksumming functions */
-static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
+static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
int offset = offsetof(struct mmp_struct, mmp_checksum);
@@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(WRITE_SYNC, bh);
+ submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
@@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
get_bh(*bh);
lock_buffer(*bh);
(*bh)->b_end_io = end_buffer_read_sync;
- submit_bh(READ_SYNC, *bh);
+ submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
brelse(*bh);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 33e1c086858b..e86dddbd8296 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
}
/**
- * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
+ * ext4_double_down_write_data_sem - Acquire two inodes' write lock
+ * of i_data_sem
*
* Acquire write lock of i_data_sem of the two inodes
*/
-static void
-double_down_write_data_sem(struct inode *first, struct inode *second)
+void
+ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
{
if (first < second) {
down_write(&EXT4_I(first)->i_data_sem);
@@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second)
}
/**
- * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
+ * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure to be released its lock first
* @donor_inode: donor inode structure to be released its lock second
* Release write lock of i_data_sem of two inodes (orig and donor).
*/
-static void
-double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+void
+ext4_double_up_write_data_sem(struct inode *orig_inode,
+ struct inode *donor_inode)
{
up_write(&EXT4_I(orig_inode)->i_data_sem);
up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -407,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,
mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
end_ext, eh, range_to_move);
- if (depth) {
- ret = ext4_handle_dirty_metadata(handle, orig_inode,
- orig_path->p_bh);
- if (ret)
- return ret;
- } else {
- ret = ext4_mark_inode_dirty(handle, orig_inode);
- if (ret < 0)
- return ret;
- }
-
- return 0;
+ return ext4_ext_dirty(handle, orig_inode, orig_path);
}
/**
@@ -737,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
donor_off += dext_alen;
orig_off += dext_alen;
+ BUG_ON(replaced_count > count);
/* Already moved the expected blocks */
if (replaced_count >= count)
break;
@@ -814,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
page_cache_release(page[0]);
return -ENOMEM;
}
-
+ /*
+ * grab_cache_page_write_begin() may not wait on page's writeback if
+ * the BDI does not demand it. Still, it is reasonable to be very
+ * conservative here and explicitly wait on page's writeback.
+ */
+ wait_on_page_writeback(page[0]);
+ wait_on_page_writeback(page[1]);
if (inode1 > inode2) {
struct page *tmp;
tmp = page[0];
@@ -856,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
if (buffer_uptodate(bh))
continue;
if (!buffer_mapped(bh)) {
- int err = 0;
err = ext4_get_block(inode, block, bh, 0);
if (err) {
SetPageError(page);
@@ -915,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
struct page *pagep[2] = {NULL, NULL};
handle_t *handle;
ext4_lblk_t orig_blk_offset;
- long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int w_flags = 0;
unsigned int tmp_data_size, data_size, replaced_size;
@@ -943,8 +939,6 @@ again:
orig_blk_offset = orig_page_offset * blocks_per_page +
data_offset_in_page;
- offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
-
/* Calculate data_size */
if ((orig_blk_offset + block_len_in_page - 1) ==
((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
@@ -976,7 +970,7 @@ again:
* necessary, just swap data blocks between orig and donor.
*/
if (uninit) {
- double_down_write_data_sem(orig_inode, donor_inode);
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* If any of extents in range became initialized we have to
* fallback to data copying */
uninit = mext_check_coverage(orig_inode, orig_blk_offset,
@@ -990,7 +984,7 @@ again:
goto drop_data_sem;
if (!uninit) {
- double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto data_copy;
}
if ((page_has_private(pagep[0]) &&
@@ -1004,7 +998,7 @@ again:
donor_inode, orig_blk_offset,
block_len_in_page, err);
drop_data_sem:
- double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto unlock_pages;
}
data_copy:
@@ -1033,7 +1027,7 @@ data_copy:
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
- *err = __block_write_begin(pagep[0], from, from + replaced_size,
+ *err = __block_write_begin(pagep[0], from, replaced_size,
ext4_get_block);
if (!*err)
*err = block_commit_write(pagep[0], from, from + replaced_size);
@@ -1065,11 +1059,11 @@ repair_branches:
* Extents are swapped already, but we are not able to copy data.
* Try to swap extents to it's original places
*/
- double_down_write_data_sem(orig_inode, donor_inode);
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
orig_blk_offset,
block_len_in_page, &err2);
- double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
if (replaced_count != block_len_in_page) {
EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
"Unable to copy data block,"
@@ -1209,15 +1203,15 @@ mext_check_arguments(struct inode *orig_inode,
}
/**
- * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
+ * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
*
* @inode1: the inode structure
* @inode2: the inode structure
*
* Lock two inodes' i_mutex
*/
-static void
-mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+void
+ext4_inode_double_lock(struct inode *inode1, struct inode *inode2)
{
BUG_ON(inode1 == inode2);
if (inode1 < inode2) {
@@ -1230,15 +1224,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
}
/**
- * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
+ * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
*
* @inode1: the inode that is released first
* @inode2: the inode that is released second
*
*/
-static void
-mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+void
+ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
mutex_unlock(&inode1->i_mutex);
mutex_unlock(&inode2->i_mutex);
@@ -1333,7 +1327,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
return -EINVAL;
}
/* Protect orig and donor inodes against a truncate */
- mext_inode_double_lock(orig_inode, donor_inode);
+ ext4_inode_double_lock(orig_inode, donor_inode);
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(orig_inode);
@@ -1342,7 +1336,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
inode_dio_wait(donor_inode);
/* Protect extent tree against block allocations via delalloc */
- double_down_write_data_sem(orig_inode, donor_inode);
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
donor_start, &len);
@@ -1466,7 +1460,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
* b. racing with ->readpage, ->write_begin, and ext4_get_block
* in move_extent_per_page
*/
- double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
while (orig_page_offset <= seq_end_page) {
@@ -1500,7 +1494,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
block_len_in_page = rest_blocks;
}
- double_down_write_data_sem(orig_inode, donor_inode);
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
if (ret < 0)
break;
@@ -1538,10 +1532,10 @@ out:
ext4_ext_drop_refs(holecheck_path);
kfree(holecheck_path);
}
- double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
ext4_inode_resume_unlocked_dio(orig_inode);
ext4_inode_resume_unlocked_dio(donor_inode);
- mext_inode_double_unlock(orig_inode, donor_inode);
+ ext4_inode_double_unlock(orig_inode, donor_inode);
return ret;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3825d6aa8336..234b834d5a97 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- __u32 csum, old_csum;
+ __u32 csum;
+ __le32 save_csum;
int size;
size = count_offset + (count * sizeof(struct dx_entry));
- old_csum = t->dt_checksum;
+ save_csum = t->dt_checksum;
t->dt_checksum = 0;
csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
- t->dt_checksum = old_csum;
+ t->dt_checksum = save_csum;
return cpu_to_le32(csum);
}
@@ -917,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
bh->b_data, bh->b_size,
(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+ ((char *)de - bh->b_data))) {
- /* On error, skip the f_pos to the next block. */
- dir_file->f_pos = (dir_file->f_pos |
- (dir->i_sb->s_blocksize - 1)) + 1;
- brelse(bh);
- return count;
+ /* silently ignore the rest of the block */
+ break;
}
ext4fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
@@ -971,6 +969,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
hinfo.hash_version +=
EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ if (ext4_has_inline_data(dir)) {
+ int has_inline_data = 1;
+ count = htree_inlinedir_to_tree(dir_file, dir, 0,
+ &hinfo, start_hash,
+ start_minor_hash,
+ &has_inline_data);
+ if (has_inline_data) {
+ *next_hash = ~0;
+ return count;
+ }
+ }
count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
start_hash, start_minor_hash);
*next_hash = ~0;
@@ -1455,24 +1464,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
}
-#define S_SHIFT 12
-static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
- [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
-};
-
-static inline void ext4_set_de_type(struct super_block *sb,
- struct ext4_dir_entry_2 *de,
- umode_t mode) {
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
- de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
-}
-
/*
* Move count entries from end of map between two memory locations.
* Returns pointer to last entry moved.
@@ -2251,8 +2242,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
NULL, EXT4_HT_DIR, credits);
@@ -2286,8 +2276,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
NULL, EXT4_HT_DIR, credits);
@@ -2307,6 +2296,45 @@ retry:
return err;
}
+static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ handle_t *handle;
+ struct inode *inode;
+ int err, retries = 0;
+
+ dquot_initialize(dir);
+
+retry:
+ inode = ext4_new_inode_start_handle(dir, mode,
+ NULL, 0, NULL,
+ EXT4_HT_DIR,
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+ 4 + EXT4_XATTR_TRANS_BLOCKS);
+ handle = ext4_journal_current_handle();
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ inode->i_op = &ext4_file_inode_operations;
+ inode->i_fop = &ext4_file_operations;
+ ext4_set_aops(inode);
+ err = ext4_orphan_add(handle, inode);
+ if (err)
+ goto err_drop_inode;
+ mark_inode_dirty(inode);
+ d_tmpfile(dentry, inode);
+ unlock_new_inode(inode);
+ }
+ if (handle)
+ ext4_journal_stop(handle);
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+ return err;
+err_drop_inode:
+ ext4_journal_stop(handle);
+ unlock_new_inode(inode);
+ iput(inode);
+ return err;
+}
+
struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
struct ext4_dir_entry_2 *de,
int blocksize, int csum_size,
@@ -2396,8 +2424,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
&dentry->d_name,
@@ -2826,8 +2853,7 @@ static int ext4_symlink(struct inode *dir,
* quota blocks, sb is already counted in previous macros).
*/
credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
}
retry:
inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
@@ -2916,7 +2942,7 @@ static int ext4_link(struct dentry *old_dentry,
retry:
handle = ext4_journal_start(dir, EXT4_HT_DIR,
(EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS));
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2930,6 +2956,11 @@ retry:
err = ext4_add_entry(handle, dentry, inode);
if (!err) {
ext4_mark_inode_dirty(handle, inode);
+ /* this can happen only for tmpfile being
+ * linked the first time
+ */
+ if (inode->i_nlink == 1)
+ ext4_orphan_del(handle, inode);
d_instantiate(dentry, inode);
} else {
drop_nlink(inode);
@@ -3182,6 +3213,7 @@ const struct inode_operations ext4_dir_inode_operations = {
.mkdir = ext4_mkdir,
.rmdir = ext4_rmdir,
.mknod = ext4_mknod,
+ .tmpfile = ext4_tmpfile,
.rename = ext4_rename,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 047a6de04a0a..48786cdb5e6c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -18,6 +18,7 @@
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
+#include <linux/aio.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
@@ -29,82 +30,137 @@
#include "xattr.h"
#include "acl.h"
-static struct kmem_cache *io_page_cachep, *io_end_cachep;
+static struct kmem_cache *io_end_cachep;
int __init ext4_init_pageio(void)
{
- io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
- if (io_page_cachep == NULL)
- return -ENOMEM;
io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
- if (io_end_cachep == NULL) {
- kmem_cache_destroy(io_page_cachep);
+ if (io_end_cachep == NULL)
return -ENOMEM;
- }
return 0;
}
void ext4_exit_pageio(void)
{
kmem_cache_destroy(io_end_cachep);
- kmem_cache_destroy(io_page_cachep);
}
/*
- * This function is called by ext4_evict_inode() to make sure there is
- * no more pending I/O completion work left to do.
+ * Print a buffer I/O error compatible with fs/buffer.c. This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message. We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
*/
-void ext4_ioend_shutdown(struct inode *inode)
+static void buffer_io_error(struct buffer_head *bh)
{
- wait_queue_head_t *wq = ext4_ioend_wq(inode);
-
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
- /*
- * We need to make sure the work structure is finished being
- * used before we let the inode get destroyed.
- */
- if (work_pending(&EXT4_I(inode)->i_unwritten_work))
- cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
+ char b[BDEVNAME_SIZE];
+ printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+ bdevname(bh->b_bdev, b),
+ (unsigned long long)bh->b_blocknr);
}
-static void put_io_page(struct ext4_io_page *io_page)
+static void ext4_finish_bio(struct bio *bio)
{
- if (atomic_dec_and_test(&io_page->p_count)) {
- end_page_writeback(io_page->p_page);
- put_page(io_page->p_page);
- kmem_cache_free(io_page_cachep, io_page);
+ int i;
+ int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ struct bio_vec *bvec = &bio->bi_io_vec[i];
+ struct page *page = bvec->bv_page;
+ struct buffer_head *bh, *head;
+ unsigned bio_start = bvec->bv_offset;
+ unsigned bio_end = bio_start + bvec->bv_len;
+ unsigned under_io = 0;
+ unsigned long flags;
+
+ if (!page)
+ continue;
+
+ if (error) {
+ SetPageError(page);
+ set_bit(AS_EIO, &page->mapping->flags);
+ }
+ bh = head = page_buffers(page);
+ /*
+ * We check all buffers in the page under BH_Uptodate_Lock
+ * to avoid races with other end io clearing async_write flags
+ */
+ local_irq_save(flags);
+ bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ do {
+ if (bh_offset(bh) < bio_start ||
+ bh_offset(bh) + bh->b_size > bio_end) {
+ if (buffer_async_write(bh))
+ under_io++;
+ continue;
+ }
+ clear_buffer_async_write(bh);
+ if (error)
+ buffer_io_error(bh);
+ } while ((bh = bh->b_this_page) != head);
+ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+ local_irq_restore(flags);
+ if (!under_io)
+ end_page_writeback(page);
}
}
-void ext4_free_io_end(ext4_io_end_t *io)
+static void ext4_release_io_end(ext4_io_end_t *io_end)
{
- int i;
+ struct bio *bio, *next_bio;
+
+ BUG_ON(!list_empty(&io_end->list));
+ BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ WARN_ON(io_end->handle);
- BUG_ON(!io);
- BUG_ON(!list_empty(&io->list));
- BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
+ if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
+ wake_up_all(ext4_ioend_wq(io_end->inode));
- for (i = 0; i < io->num_io_pages; i++)
- put_io_page(io->pages[i]);
- io->num_io_pages = 0;
- if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
- wake_up_all(ext4_ioend_wq(io->inode));
- kmem_cache_free(io_end_cachep, io);
+ for (bio = io_end->bio; bio; bio = next_bio) {
+ next_bio = bio->bi_private;
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ }
+ if (io_end->flag & EXT4_IO_END_DIRECT)
+ inode_dio_done(io_end->inode);
+ if (io_end->iocb)
+ aio_complete(io_end->iocb, io_end->result, 0);
+ kmem_cache_free(io_end_cachep, io_end);
}
-/* check a range of space and convert unwritten extents to written. */
+static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
+{
+ struct inode *inode = io_end->inode;
+
+ io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
+ /* Wake up anyone waiting on unwritten extent conversion */
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+ wake_up_all(ext4_ioend_wq(inode));
+}
+
+/*
+ * Check a range of space and convert unwritten extents to written. Note that
+ * we are protected from truncate touching same part of extent tree by the
+ * fact that truncate code waits for all DIO to finish (thus exclusion from
+ * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
+ * completed (happens from ext4_release_io_end()).
+ */
static int ext4_end_io(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
ssize_t size = io->size;
+ handle_t *handle = io->handle;
int ret = 0;
ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
- ret = ext4_convert_unwritten_extents(inode, offset, size);
+ io->handle = NULL; /* Following call will use up the handle */
+ ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
if (ret < 0) {
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
@@ -112,30 +168,22 @@ static int ext4_end_io(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
- /* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
- wake_up_all(ext4_ioend_wq(inode));
- if (io->flag & EXT4_IO_END_DIRECT)
- inode_dio_done(inode);
- if (io->iocb)
- aio_complete(io->iocb, io->result, 0);
+ ext4_clear_io_unwritten_flag(io);
+ ext4_release_io_end(io);
return ret;
}
-static void dump_completed_IO(struct inode *inode)
+static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
- if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
- ext4_debug("inode %lu completed_io list is empty\n",
- inode->i_ino);
+ if (list_empty(head))
return;
- }
- ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
- list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
+ ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
+ list_for_each_entry(io, head, list) {
cur = &io->list;
before = cur->prev;
io0 = container_of(before, ext4_io_end_t, list);
@@ -149,23 +197,30 @@ static void dump_completed_IO(struct inode *inode)
}
/* Add the io_end to per-inode completed end_io list. */
-void ext4_add_complete_io(ext4_io_end_t *io_end)
+static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
struct ext4_inode_info *ei = EXT4_I(io_end->inode);
struct workqueue_struct *wq;
unsigned long flags;
BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (list_empty(&ei->i_completed_io_list))
- queue_work(wq, &ei->i_unwritten_work);
- list_add_tail(&io_end->list, &ei->i_completed_io_list);
+ if (io_end->handle) {
+ wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+ if (list_empty(&ei->i_rsv_conversion_list))
+ queue_work(wq, &ei->i_rsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
+ } else {
+ wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
+ if (list_empty(&ei->i_unrsv_conversion_list))
+ queue_work(wq, &ei->i_unrsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
+ }
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
-static int ext4_do_flush_completed_IO(struct inode *inode)
+static int ext4_do_flush_completed_IO(struct inode *inode,
+ struct list_head *head)
{
ext4_io_end_t *io;
struct list_head unwritten;
@@ -174,8 +229,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
int err, ret = 0;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- dump_completed_IO(inode);
- list_replace_init(&ei->i_completed_io_list, &unwritten);
+ dump_completed_IO(inode, head);
+ list_replace_init(head, &unwritten);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
while (!list_empty(&unwritten)) {
@@ -186,30 +241,25 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
err = ext4_end_io(io);
if (unlikely(!ret && err))
ret = err;
- io->flag &= ~EXT4_IO_END_UNWRITTEN;
- ext4_free_io_end(io);
}
return ret;
}
/*
- * work on completed aio dio IO, to convert unwritten extents to extents
+ * work on completed IO, to convert unwritten extents to extents
*/
-void ext4_end_io_work(struct work_struct *work)
+void ext4_end_io_rsv_work(struct work_struct *work)
{
struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
- i_unwritten_work);
- ext4_do_flush_completed_IO(&ei->vfs_inode);
+ i_rsv_conversion_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
}
-int ext4_flush_unwritten_io(struct inode *inode)
+void ext4_end_io_unrsv_work(struct work_struct *work)
{
- int ret;
- WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
- !(inode->i_state & I_FREEING));
- ret = ext4_do_flush_completed_IO(inode);
- ext4_unwritten_wait(inode);
- return ret;
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_unrsv_conversion_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
}
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -219,72 +269,70 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode;
INIT_LIST_HEAD(&io->list);
+ atomic_set(&io->count, 1);
}
return io;
}
-/*
- * Print an buffer I/O error compatible with the fs/buffer.c. This
- * provides compatibility with dmesg scrapers that look for a specific
- * buffer I/O error message. We really need a unified error reporting
- * structure to userspace ala Digital Unix's uerf system, but it's
- * probably not going to happen in my lifetime, due to LKML politics...
- */
-static void buffer_io_error(struct buffer_head *bh)
+void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
- char b[BDEVNAME_SIZE];
- printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
- bdevname(bh->b_bdev, b),
- (unsigned long long)bh->b_blocknr);
+ if (atomic_dec_and_test(&io_end->count)) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+ ext4_release_io_end(io_end);
+ return;
+ }
+ ext4_add_complete_io(io_end);
+ }
+}
+
+int ext4_put_io_end(ext4_io_end_t *io_end)
+{
+ int err = 0;
+
+ if (atomic_dec_and_test(&io_end->count)) {
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ err = ext4_convert_unwritten_extents(io_end->handle,
+ io_end->inode, io_end->offset,
+ io_end->size);
+ io_end->handle = NULL;
+ ext4_clear_io_unwritten_flag(io_end);
+ }
+ ext4_release_io_end(io_end);
+ }
+ return err;
+}
+
+ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
+{
+ atomic_inc(&io_end->count);
+ return io_end;
}
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
- struct inode *inode;
- int i;
sector_t bi_sector = bio->bi_sector;
BUG_ON(!io_end);
- bio->bi_private = NULL;
bio->bi_end_io = NULL;
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
error = 0;
- bio_put(bio);
- for (i = 0; i < io_end->num_io_pages; i++) {
- struct page *page = io_end->pages[i]->p_page;
- struct buffer_head *bh, *head;
- loff_t offset;
- loff_t io_end_offset;
-
- if (error) {
- SetPageError(page);
- set_bit(AS_EIO, &page->mapping->flags);
- head = page_buffers(page);
- BUG_ON(!head);
-
- io_end_offset = io_end->offset + io_end->size;
-
- offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
- bh = head;
- do {
- if ((offset >= io_end->offset) &&
- (offset+bh->b_size <= io_end_offset))
- buffer_io_error(bh);
-
- offset += bh->b_size;
- bh = bh->b_this_page;
- } while (bh != head);
- }
-
- put_io_page(io_end->pages[i]);
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ /*
+ * Link bio into list hanging from io_end. We have to do it
+ * atomically as bio completions can be racing against each
+ * other.
+ */
+ bio->bi_private = xchg(&io_end->bio, bio);
+ } else {
+ ext4_finish_bio(bio);
+ bio_put(bio);
}
- io_end->num_io_pages = 0;
- inode = io_end->inode;
if (error) {
- io_end->flag |= EXT4_IO_END_ERROR;
+ struct inode *inode = io_end->inode;
+
ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
"(offset %llu size %ld starting block %llu)",
inode->i_ino,
@@ -293,13 +341,7 @@ static void ext4_end_bio(struct bio *bio, int error)
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
}
-
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- ext4_free_io_end(io_end);
- return;
- }
-
- ext4_add_complete_io(io_end);
+ ext4_put_io_end_defer(io_end);
}
void ext4_io_submit(struct ext4_io_submit *io)
@@ -313,76 +355,53 @@ void ext4_io_submit(struct ext4_io_submit *io)
bio_put(io->io_bio);
}
io->io_bio = NULL;
- io->io_op = 0;
+}
+
+void ext4_io_submit_init(struct ext4_io_submit *io,
+ struct writeback_control *wbc)
+{
+ io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+ io->io_bio = NULL;
io->io_end = NULL;
}
-static int io_submit_init(struct ext4_io_submit *io,
- struct inode *inode,
- struct writeback_control *wbc,
- struct buffer_head *bh)
+static int io_submit_init_bio(struct ext4_io_submit *io,
+ struct buffer_head *bh)
{
- ext4_io_end_t *io_end;
- struct page *page = bh->b_page;
int nvecs = bio_get_nr_vecs(bh->b_bdev);
struct bio *bio;
- io_end = ext4_init_io_end(inode, GFP_NOFS);
- if (!io_end)
- return -ENOMEM;
bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+ if (!bio)
+ return -ENOMEM;
bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
- bio->bi_private = io->io_end = io_end;
bio->bi_end_io = ext4_end_bio;
-
- io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
-
+ bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
- io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
io->io_next_block = bh->b_blocknr;
return 0;
}
static int io_submit_add_bh(struct ext4_io_submit *io,
- struct ext4_io_page *io_page,
struct inode *inode,
- struct writeback_control *wbc,
struct buffer_head *bh)
{
- ext4_io_end_t *io_end;
int ret;
- if (buffer_new(bh)) {
- clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
- }
-
if (io->io_bio && bh->b_blocknr != io->io_next_block) {
submit_and_retry:
ext4_io_submit(io);
}
if (io->io_bio == NULL) {
- ret = io_submit_init(io, inode, wbc, bh);
+ ret = io_submit_init_bio(io, bh);
if (ret)
return ret;
}
- io_end = io->io_end;
- if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
- (io_end->pages[io_end->num_io_pages-1] != io_page))
- goto submit_and_retry;
- if (buffer_uninit(bh))
- ext4_set_io_unwritten_flag(inode, io_end);
- io->io_end->size += bh->b_size;
- io->io_next_block++;
ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
- if ((io_end->num_io_pages == 0) ||
- (io_end->pages[io_end->num_io_pages-1] != io_page)) {
- io_end->pages[io_end->num_io_pages++] = io_page;
- atomic_inc(&io_page->p_count);
- }
+ io->io_next_block++;
return 0;
}
@@ -392,33 +411,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- unsigned block_start, block_end, blocksize;
- struct ext4_io_page *io_page;
+ unsigned block_start, blocksize;
struct buffer_head *bh, *head;
int ret = 0;
+ int nr_submitted = 0;
blocksize = 1 << inode->i_blkbits;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
- if (!io_page) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return -ENOMEM;
- }
- io_page->p_page = page;
- atomic_set(&io_page->p_count, 1);
- get_page(page);
set_page_writeback(page);
ClearPageError(page);
- for (bh = head = page_buffers(page), block_start = 0;
- bh != head || !block_start;
- block_start = block_end, bh = bh->b_this_page) {
-
- block_end = block_start + blocksize;
+ /*
+ * In the first loop we prepare and mark buffers to submit. We have to
+ * mark all buffers in the page before submitting so that
+ * end_page_writeback() cannot be called from ext4_bio_end_io() when IO
+ * on the first buffer finishes and we are still working on submitting
+ * the second buffer.
+ */
+ bh = head = page_buffers(page);
+ do {
+ block_start = bh_offset(bh);
if (block_start >= len) {
/*
* Comments copied from block_write_full_page_endio:
@@ -431,7 +446,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* mapped, and writes to that region are not written
* out to the file."
*/
- zero_user_segment(page, block_start, block_end);
+ zero_user_segment(page, block_start,
+ block_start + blocksize);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
continue;
@@ -445,7 +461,19 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
ext4_io_submit(io);
continue;
}
- ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+ if (buffer_new(bh)) {
+ clear_buffer_new(bh);
+ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ }
+ set_buffer_async_write(bh);
+ } while ((bh = bh->b_this_page) != head);
+
+ /* Now submit buffers to write */
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_async_write(bh))
+ continue;
+ ret = io_submit_add_bh(io, inode, bh);
if (ret) {
/*
* We only get here on ENOMEM. Not much else
@@ -455,17 +483,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
redirty_page_for_writepage(wbc, page);
break;
}
+ nr_submitted++;
clear_buffer_dirty(bh);
+ } while ((bh = bh->b_this_page) != head);
+
+ /* Error stopped previous loop? Clean up buffers... */
+ if (ret) {
+ do {
+ clear_buffer_async_write(bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
}
unlock_page(page);
- /*
- * If the page was truncated before we could do the writeback,
- * or we had a memory allocation error while trying to write
- * the first buffer head, we won't have submitted any pages for
- * I/O. In that case we need to make sure we've cleared the
- * PageWriteback bit from the page to prevent the system from
- * wedging later on.
- */
- put_io_page(io_page);
+ /* Nothing submitted - we have to end page writeback */
+ if (!nr_submitted)
+ end_page_writeback(page);
return ret;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c169477a62c9..c5adbb318a90 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb,
ext4_fsblk_t end = start + input->blocks_count;
ext4_group_t group = input->group;
ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
- unsigned overhead = ext4_group_overhead_blocks(sb, group);
- ext4_fsblk_t metaend = start + overhead;
+ unsigned overhead;
+ ext4_fsblk_t metaend;
struct buffer_head *bh = NULL;
ext4_grpblk_t free_blocks_count, offset;
int err = -EINVAL;
+ if (group != sbi->s_groups_count) {
+ ext4_warning(sb, "Cannot add at group %u (only %u groups)",
+ input->group, sbi->s_groups_count);
+ return -EINVAL;
+ }
+
+ overhead = ext4_group_overhead_blocks(sb, group);
+ metaend = start + overhead;
input->free_blocks_count = free_blocks_count =
input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
@@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb,
free_blocks_count, input->reserved_blocks);
ext4_get_group_no_and_offset(sb, start, NULL, &offset);
- if (group != sbi->s_groups_count)
- ext4_warning(sb, "Cannot add at group %u (only %u groups)",
- input->group, sbi->s_groups_count);
- else if (offset != 0)
+ if (offset != 0)
ext4_warning(sb, "Last group not full");
else if (input->reserved_blocks > input->blocks_count / 5)
ext4_warning(sb, "Reserved blocks too high (%u)",
@@ -272,7 +277,7 @@ next_group:
if (start_blk >= last_blk)
goto next_group;
group_data[bb_index].block_bitmap = start_blk++;
- ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+ group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count--;
if (flexbg_size > 1)
@@ -284,7 +289,7 @@ next_group:
if (start_blk >= last_blk)
goto next_group;
group_data[ib_index].inode_bitmap = start_blk++;
- ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+ group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count--;
if (flexbg_size > 1)
@@ -296,7 +301,7 @@ next_group:
if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
goto next_group;
group_data[it_index].inode_table = start_blk;
- ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
+ group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count -=
EXT4_SB(sb)->s_itb_per_group;
@@ -392,7 +397,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
ext4_group_t group;
int err;
- ext4_get_group_no_and_offset(sb, block, &group, NULL);
+ group = ext4_get_group_number(sb, block);
start = ext4_group_first_block_no(sb, group);
group -= flex_gd->groups[0].group;
@@ -1341,6 +1346,8 @@ static void ext4_update_super(struct super_block *sb,
/* Update the global fs size fields */
sbi->s_groups_count += flex_gd->count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
/* Update the reserved block counts only once the new group is
* active. */
@@ -1549,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
struct inode *inode = NULL;
- int gdb_off, gdb_num;
+ int gdb_off;
int err;
__u16 bg_flags = 0;
- gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -1654,12 +1660,10 @@ errout:
err = err2;
if (!err) {
- ext4_fsblk_t first_block;
- first_block = ext4_group_first_block_no(sb, 0);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
"blocks\n", ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block,
+ update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
(char *)es, sizeof(struct ext4_super_block), 0);
}
return err;
@@ -1879,7 +1883,11 @@ retry:
/* Nothing need to do */
return 0;
- ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
+ n_group = ext4_get_group_number(sb, n_blocks_count - 1);
+ if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
+ ext4_warning(sb, "resize would cause inodes_count overflow");
+ return -EINVAL;
+ }
ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
n_desc_blocks = num_desc_blocks(sb, n_group + 1);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5d6d53578124..85b3dd60169b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
static void ext4_clear_journal_err(struct super_block *sb,
struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
@@ -81,6 +82,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
+static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext2_fs_type = {
@@ -353,10 +355,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int error = is_journal_aborted(journal);
- struct ext4_journal_cb_entry *jce, *tmp;
+ struct ext4_journal_cb_entry *jce;
+ BUG_ON(txn->t_state == T_FINISHED);
spin_lock(&sbi->s_md_lock);
- list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
+ while (!list_empty(&txn->t_private_list)) {
+ jce = list_entry(txn->t_private_list.next,
+ struct ext4_journal_cb_entry, jce_list);
list_del_init(&jce->jce_list);
spin_unlock(&sbi->s_md_lock);
jce->jce_func(sb, jce, error);
@@ -394,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb)
}
if (test_opt(sb, ERRORS_RO)) {
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+ /*
+ * Make sure updated value of ->s_mount_flags will be visible
+ * before ->s_flags update
+ */
+ smp_wmb();
sb->s_flags |= MS_RDONLY;
}
if (test_opt(sb, ERRORS_PANIC))
@@ -418,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function,
ext4_handle_error(sb);
}
-void ext4_error_inode(struct inode *inode, const char *function,
- unsigned int line, ext4_fsblk_t block,
- const char *fmt, ...)
+void __ext4_error_inode(struct inode *inode, const char *function,
+ unsigned int line, ext4_fsblk_t block,
+ const char *fmt, ...)
{
va_list args;
struct va_format vaf;
@@ -447,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function,
ext4_handle_error(inode->i_sb);
}
-void ext4_error_file(struct file *file, const char *function,
- unsigned int line, ext4_fsblk_t block,
- const char *fmt, ...)
+void __ext4_error_file(struct file *file, const char *function,
+ unsigned int line, ext4_fsblk_t block,
+ const char *fmt, ...)
{
va_list args;
struct va_format vaf;
@@ -566,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function,
if ((sb->s_flags & MS_RDONLY) == 0) {
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
- sb->s_flags |= MS_RDONLY;
EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ /*
+ * Make sure updated value of ->s_mount_flags will be visible
+ * before ->s_flags update
+ */
+ smp_wmb();
+ sb->s_flags |= MS_RDONLY;
if (EXT4_SB(sb)->s_journal)
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
save_error_info(sb, function, line);
@@ -576,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
panic("EXT4-fs panic from previous error\n");
}
-void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+void __ext4_msg(struct super_block *sb,
+ const char *prefix, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -699,22 +715,19 @@ fail:
/*
* Release the journal device
*/
-static int ext4_blkdev_put(struct block_device *bdev)
+static void ext4_blkdev_put(struct block_device *bdev)
{
- return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
-static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
+static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
{
struct block_device *bdev;
- int ret = -ENODEV;
-
bdev = sbi->journal_bdev;
if (bdev) {
- ret = ext4_blkdev_put(bdev);
+ ext4_blkdev_put(bdev);
sbi->journal_bdev = NULL;
}
- return ret;
}
static inline struct inode *orphan_list_entry(struct list_head *l)
@@ -749,8 +762,10 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- flush_workqueue(sbi->dio_unwritten_wq);
- destroy_workqueue(sbi->dio_unwritten_wq);
+ flush_workqueue(sbi->unrsv_conversion_wq);
+ flush_workqueue(sbi->rsv_conversion_wq);
+ destroy_workqueue(sbi->unrsv_conversion_wq);
+ destroy_workqueue(sbi->rsv_conversion_wq);
if (sbi->s_journal) {
err = jbd2_journal_destroy(sbi->s_journal);
@@ -759,7 +774,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, "Couldn't clean up the journal");
}
- ext4_es_unregister_shrinker(sb);
+ ext4_es_unregister_shrinker(sbi);
del_timer(&sbi->s_err_report);
ext4_release_system_zone(sb);
ext4_mb_release(sb);
@@ -848,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
rwlock_init(&ei->i_es_lock);
INIT_LIST_HEAD(&ei->i_es_lru);
ei->i_es_lru_nr = 0;
+ ei->i_touch_when = 0;
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
@@ -858,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_reserved_quota = 0;
#endif
ei->jinode = NULL;
- INIT_LIST_HEAD(&ei->i_completed_io_list);
+ INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+ INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
- INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work);
+ INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+ INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
return &ei->vfs_inode;
}
@@ -1092,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = {
.dirty_inode = ext4_dirty_inode,
.drop_inode = ext4_drop_inode,
.evict_inode = ext4_evict_inode,
+ .sync_fs = ext4_sync_fs_nojournal,
.put_super = ext4_put_super,
.statfs = ext4_statfs,
.remount_fs = ext4_remount,
@@ -1802,7 +1821,7 @@ static int options_seq_show(struct seq_file *seq, void *offset)
static int options_open_fs(struct inode *inode, struct file *file)
{
- return single_open(file, options_seq_show, PDE(inode)->data);
+ return single_open(file, options_seq_show, PDE_DATA(inode));
}
static const struct file_operations ext4_seq_options_fops = {
@@ -1907,7 +1926,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = NULL;
ext4_group_t flex_group;
- unsigned int groups_per_flex = 0;
int i, err;
sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
@@ -1915,7 +1933,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
sbi->s_log_groups_per_flex = 0;
return 1;
}
- groups_per_flex = 1U << sbi->s_log_groups_per_flex;
err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
if (err)
@@ -1948,16 +1965,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
if ((sbi->s_es->s_feature_ro_compat &
cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
/* Use new metadata_csum algorithm */
- __u16 old_csum;
+ __le16 save_csum;
__u32 csum32;
- old_csum = gdp->bg_checksum;
+ save_csum = gdp->bg_checksum;
gdp->bg_checksum = 0;
csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
sizeof(le_group));
csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
sbi->s_desc_size);
- gdp->bg_checksum = old_csum;
+ gdp->bg_checksum = save_csum;
crc = csum32 & 0xFFFF;
goto out;
@@ -2163,19 +2180,22 @@ static void ext4_orphan_cleanup(struct super_block *sb,
list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
dquot_initialize(inode);
if (inode->i_nlink) {
- ext4_msg(sb, KERN_DEBUG,
- "%s: truncating inode %lu to %lld bytes",
- __func__, inode->i_ino, inode->i_size);
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: truncating inode %lu to %lld bytes",
+ __func__, inode->i_ino, inode->i_size);
jbd_debug(2, "truncating inode %lu to %lld bytes\n",
inode->i_ino, inode->i_size);
mutex_lock(&inode->i_mutex);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
mutex_unlock(&inode->i_mutex);
nr_truncates++;
} else {
- ext4_msg(sb, KERN_DEBUG,
- "%s: deleting unreferenced inode %lu",
- __func__, inode->i_ino);
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: deleting unreferenced inode %lu",
+ __func__, inode->i_ino);
jbd_debug(2, "deleting unreferenced inode %lu\n",
inode->i_ino);
nr_orphans++;
@@ -2376,20 +2396,21 @@ struct ext4_attr {
ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
const char *, size_t);
- int offset;
+ union {
+ int offset;
+ int deprecated_val;
+ } u;
};
-static int parse_strtoul(const char *buf,
- unsigned long max, unsigned long *value)
+static int parse_strtoull(const char *buf,
+ unsigned long long max, unsigned long long *value)
{
- char *endp;
-
- *value = simple_strtoul(skip_spaces(buf), &endp, 0);
- endp = skip_spaces(endp);
- if (*endp || *value > max)
- return -EINVAL;
+ int ret;
- return 0;
+ ret = kstrtoull(skip_spaces(buf), 0, value);
+ if (!ret && *value > max)
+ ret = -EINVAL;
+ return ret;
}
static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
@@ -2431,11 +2452,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
const char *buf, size_t count)
{
unsigned long t;
+ int ret;
- if (parse_strtoul(buf, 0x40000000, &t))
- return -EINVAL;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
- if (t && !is_power_of_2(t))
+ if (t && (!is_power_of_2(t) || t > 0x40000000))
return -EINVAL;
sbi->s_inode_readahead_blks = t;
@@ -2445,7 +2468,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
static ssize_t sbi_ui_show(struct ext4_attr *a,
struct ext4_sb_info *sbi, char *buf)
{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
}
@@ -2454,15 +2477,38 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
unsigned long t;
+ int ret;
- if (parse_strtoul(buf, 0xffffffff, &t))
- return -EINVAL;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
*ui = t;
return count;
}
+static ssize_t reserved_clusters_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long) atomic64_read(&sbi->s_resv_clusters));
+}
+
+static ssize_t reserved_clusters_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned long long val;
+ int ret;
+
+ if (parse_strtoull(buf, -1ULL, &val))
+ return -EINVAL;
+ ret = ext4_reserve_clusters(sbi, val);
+
+ return ret ? ret : count;
+}
+
static ssize_t trigger_test_error(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
@@ -2480,12 +2526,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a,
return count;
}
+static ssize_t sbi_deprecated_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+}
+
#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
.show = _show, \
.store = _store, \
- .offset = offsetof(struct ext4_sb_info, _elname), \
+ .u = { \
+ .offset = offsetof(struct ext4_sb_info, _elname),\
+ }, \
}
#define EXT4_ATTR(name, mode, show, store) \
static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
@@ -2496,10 +2550,19 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
#define EXT4_RW_ATTR_SBI_UI(name, elname) \
EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
#define ATTR_LIST(name) &ext4_attr_##name.attr
+#define EXT4_DEPRECATED_ATTR(_name, _val) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = 0444 }, \
+ .show = sbi_deprecated_show, \
+ .u = { \
+ .deprecated_val = _val, \
+ }, \
+}
EXT4_RO_ATTR(delayed_allocation_blocks);
EXT4_RO_ATTR(session_write_kbytes);
EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RW_ATTR(reserved_clusters);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
inode_readahead_blks_store, s_inode_readahead_blks);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2509,7 +2572,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
@@ -2517,6 +2580,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(reserved_clusters),
ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
@@ -3192,6 +3256,40 @@ int ext4_calculate_overhead(struct super_block *sb)
return 0;
}
+
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
+{
+ ext4_fsblk_t resv_clusters;
+
+ /*
+ * By default we reserve 2% or 4096 clusters, whichever is smaller.
+ * This should cover the situations where we can not afford to run
+ * out of space like for example punch hole, or converting
+ * uninitialized extents in delalloc path. In most cases such
+ * allocation would require 1, or 2 blocks, higher numbers are
+ * very rare.
+ */
+ resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+
+ do_div(resv_clusters, 50);
+ resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
+
+ return resv_clusters;
+}
+
+
+static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
+{
+ ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
+ sbi->s_cluster_bits;
+
+ if (count >= clusters)
+ return -EINVAL;
+
+ atomic64_set(&sbi->s_resv_clusters, count);
+ return 0;
+}
+
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -3526,6 +3624,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
+ /* Do we have standard group size of blocksize * 8 blocks ? */
+ if (sbi->s_blocks_per_group == blocksize << 3)
+ set_opt2(sb, STD_GROUP_SIZE);
+
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
@@ -3698,6 +3800,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_err_report.function = print_daily_error_info;
sbi->s_err_report.data = (unsigned long) sb;
+ /* Register extent status tree shrinker */
+ ext4_es_register_shrinker(sbi);
+
err = percpu_counter_init(&sbi->s_freeclusters_counter,
ext4_count_free_clusters(sb));
if (!err) {
@@ -3720,12 +3825,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_stripe = ext4_get_stripe_size(sbi);
- sbi->s_max_writeback_mb_bump = 128;
sbi->s_extent_max_zeroout_kb = 32;
- /* Register extent status tree shrinker */
- ext4_es_register_shrinker(sb);
-
/*
* set up enough so that it can read an inode
*/
@@ -3851,12 +3952,20 @@ no_journal:
* The maximum number of concurrent works can be high and
* concurrency isn't really necessary. Limit it to 1.
*/
- EXT4_SB(sb)->dio_unwritten_wq =
- alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
- if (!EXT4_SB(sb)->dio_unwritten_wq) {
- printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+ EXT4_SB(sb)->rsv_conversion_wq =
+ alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!EXT4_SB(sb)->rsv_conversion_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
ret = -ENOMEM;
- goto failed_mount_wq;
+ goto failed_mount4;
+ }
+
+ EXT4_SB(sb)->unrsv_conversion_wq =
+ alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!EXT4_SB(sb)->unrsv_conversion_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+ ret = -ENOMEM;
+ goto failed_mount4;
}
/*
@@ -3911,6 +4020,13 @@ no_journal:
"available");
}
+ err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
+ "reserved pool", ext4_calculate_resv_clusters(sbi));
+ goto failed_mount4a;
+ }
+
err = ext4_setup_system_zone(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -4003,13 +4119,17 @@ failed_mount4a:
sb->s_root = NULL;
failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
- destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+ if (EXT4_SB(sb)->rsv_conversion_wq)
+ destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+ if (EXT4_SB(sb)->unrsv_conversion_wq)
+ destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
failed_mount_wq:
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
failed_mount3:
+ ext4_es_unregister_shrinker(sbi);
del_timer(&sbi->s_err_report);
if (sbi->s_flex_groups)
ext4_kvfree(sbi->s_flex_groups);
@@ -4177,7 +4297,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
goto out_bdev;
}
journal->j_private = sb;
- ll_rw_block(READ, 1, &journal->j_sb_buffer);
+ ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
wait_on_buffer(journal->j_sb_buffer);
if (!buffer_uptodate(journal->j_sb_buffer)) {
ext4_msg(sb, KERN_ERR, "I/O error on journal device");
@@ -4445,19 +4565,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
{
int ret = 0;
tid_t target;
+ bool needs_barrier = false;
struct ext4_sb_info *sbi = EXT4_SB(sb);
trace_ext4_sync_fs(sb, wait);
- flush_workqueue(sbi->dio_unwritten_wq);
+ flush_workqueue(sbi->rsv_conversion_wq);
+ flush_workqueue(sbi->unrsv_conversion_wq);
/*
* Writeback quota in non-journalled quota case - journalled quota has
* no dirty dquots
*/
dquot_writeback_dquots(sb, -1);
+ /*
+ * Data writeback is possible w/o journal transaction, so barrier must
+ * being sent at the end of the function. But we can skip it if
+ * transaction_commit will do it for us.
+ */
+ target = jbd2_get_latest_transaction(sbi->s_journal);
+ if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+ needs_barrier = true;
+
if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
- jbd2_log_wait_commit(sbi->s_journal, target);
+ ret = jbd2_log_wait_commit(sbi->s_journal, target);
}
+ if (needs_barrier) {
+ int err;
+ err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+ if (!ret)
+ ret = err;
+ }
+
+ return ret;
+}
+
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
+{
+ int ret = 0;
+
+ trace_ext4_sync_fs(sb, wait);
+ flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+ flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
+ dquot_writeback_dquots(sb, -1);
+ if (wait && test_opt(sb, BARRIER))
+ ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
return ret;
}
@@ -4742,9 +4895,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
struct super_block *sb = dentry->d_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
- ext4_fsblk_t overhead = 0;
+ ext4_fsblk_t overhead = 0, resv_blocks;
u64 fsid;
s64 bfree;
+ resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
if (!test_opt(sb, MINIX_DF))
overhead = sbi->s_overhead;
@@ -4756,8 +4910,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
/* prevent underflow in case that few free space is available */
buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
- buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
- if (buf->f_bfree < ext4_r_blocks_count(es))
+ buf->f_bavail = buf->f_bfree -
+ (ext4_r_blocks_count(es) + resv_blocks);
+ if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
@@ -4945,6 +5100,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
return PTR_ERR(qf_inode);
}
+ /* Don't account quota for quota files to avoid recursion */
+ qf_inode->i_flags |= S_NOQUOTA;
err = dquot_enable(qf_inode, type, format_id, flags);
iput(qf_inode);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a120b277240..c081e34f717f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
struct ext4_xattr_header *hdr)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- __u32 csum, old;
+ __u32 csum;
+ __le32 save_csum;
+ __le64 dsk_block_nr = cpu_to_le64(block_nr);
- old = hdr->h_checksum;
+ save_csum = hdr->h_checksum;
hdr->h_checksum = 0;
- block_nr = cpu_to_le64(block_nr);
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr,
- sizeof(block_nr));
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
EXT4_BLOCK_SIZE(inode->i_sb));
- hdr->h_checksum = old;
+ hdr->h_checksum = save_csum;
return cpu_to_le32(csum);
}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index aa25deb5c6cd..c767dbdd7fc4 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -22,6 +22,7 @@
#define EXT4_XATTR_INDEX_LUSTRE 5
#define EXT4_XATTR_INDEX_SECURITY 6
#define EXT4_XATTR_INDEX_SYSTEM 7
+#define EXT4_XATTR_INDEX_RICHACL 8
struct ext4_xattr_header {
__le32 h_magic; /* magic number for identification */
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index fd27e7e6326e..e06e0995e00f 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -51,3 +51,15 @@ config F2FS_FS_POSIX_ACL
Linux website <http://acl.bestbits.at/>.
If you don't know what Access Control Lists are, say N
+
+config F2FS_FS_SECURITY
+ bool "F2FS Security Labels"
+ depends on F2FS_FS_XATTR
+ help
+ Security labels provide an access control facility to support Linux
+ Security Modules (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO
+ Linux. This option enables an extended attribute handler for file
+ security labels in the f2fs filesystem, so that it requires enabling
+ the extended attribute support in advance.
+
+ If you are not using a security module, say N.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 137af4255da6..b7826ec1b470 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -250,7 +250,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
}
}
- error = f2fs_setxattr(inode, name_index, "", value, size);
+ error = f2fs_setxattr(inode, name_index, "", value, size, NULL);
kfree(value);
if (!error)
@@ -299,7 +299,7 @@ int f2fs_acl_chmod(struct inode *inode)
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct posix_acl *acl;
int error;
- mode_t mode = get_inode_mode(inode);
+ umode_t mode = get_inode_mode(inode);
if (!test_opt(sbi, POSIX_ACL))
return 0;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 2b6fc131e2ce..66a6b85a51d8 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -20,6 +20,7 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include <trace/events/f2fs.h>
static struct kmem_cache *orphan_entry_slab;
static struct kmem_cache *inode_entry_slab;
@@ -57,13 +58,19 @@ repeat:
cond_resched();
goto repeat;
}
- if (f2fs_readpage(sbi, page, index, READ_SYNC)) {
+ if (PageUptodate(page))
+ goto out;
+
+ if (f2fs_readpage(sbi, page, index, READ_SYNC))
+ goto repeat;
+
+ lock_page(page);
+ if (page->mapping != mapping) {
f2fs_put_page(page, 1);
goto repeat;
}
+out:
mark_page_accessed(page);
-
- /* We do not allow returning an errorneous page */
return page;
}
@@ -350,8 +357,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
unsigned long blk_size = sbi->blocksize;
struct f2fs_checkpoint *cp_block;
unsigned long long cur_version = 0, pre_version = 0;
- unsigned int crc = 0;
size_t crc_offset;
+ __u32 crc = 0;
/* Read the 1st cp block in this CP pack */
cp_page_1 = get_meta_page(sbi, cp_addr);
@@ -362,7 +369,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (crc_offset >= blk_size)
goto invalid_cp1;
- crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+ crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
if (!f2fs_crc_valid(crc, cp_block, crc_offset))
goto invalid_cp1;
@@ -377,7 +384,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (crc_offset >= blk_size)
goto invalid_cp2;
- crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+ crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
if (!f2fs_crc_valid(crc, cp_block, crc_offset))
goto invalid_cp2;
@@ -443,13 +450,30 @@ fail_no_cp:
return -EINVAL;
}
-void set_dirty_dir_page(struct inode *inode, struct page *page)
+static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct list_head *head = &sbi->dir_inode_list;
- struct dir_inode_entry *new;
struct list_head *this;
+ list_for_each(this, head) {
+ struct dir_inode_entry *entry;
+ entry = list_entry(this, struct dir_inode_entry, list);
+ if (entry->inode == inode)
+ return -EEXIST;
+ }
+ list_add_tail(&new->list, head);
+#ifdef CONFIG_F2FS_STAT_FS
+ sbi->n_dirty_dirs++;
+#endif
+ return 0;
+}
+
+void set_dirty_dir_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct dir_inode_entry *new;
+
if (!S_ISDIR(inode->i_mode))
return;
retry:
@@ -462,23 +486,31 @@ retry:
INIT_LIST_HEAD(&new->list);
spin_lock(&sbi->dir_inode_lock);
- list_for_each(this, head) {
- struct dir_inode_entry *entry;
- entry = list_entry(this, struct dir_inode_entry, list);
- if (entry->inode == inode) {
- kmem_cache_free(inode_entry_slab, new);
- goto out;
- }
- }
- list_add_tail(&new->list, head);
- sbi->n_dirty_dirs++;
+ if (__add_dirty_inode(inode, new))
+ kmem_cache_free(inode_entry_slab, new);
- BUG_ON(!S_ISDIR(inode->i_mode));
-out:
inc_page_count(sbi, F2FS_DIRTY_DENTS);
inode_inc_dirty_dents(inode);
SetPagePrivate(page);
+ spin_unlock(&sbi->dir_inode_lock);
+}
+
+void add_dirty_dir_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct dir_inode_entry *new;
+retry:
+ new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+ if (!new) {
+ cond_resched();
+ goto retry;
+ }
+ new->inode = inode;
+ INIT_LIST_HEAD(&new->list);
+ spin_lock(&sbi->dir_inode_lock);
+ if (__add_dirty_inode(inode, new))
+ kmem_cache_free(inode_entry_slab, new);
spin_unlock(&sbi->dir_inode_lock);
}
@@ -492,8 +524,10 @@ void remove_dirty_dir_inode(struct inode *inode)
return;
spin_lock(&sbi->dir_inode_lock);
- if (atomic_read(&F2FS_I(inode)->dirty_dents))
- goto out;
+ if (atomic_read(&F2FS_I(inode)->dirty_dents)) {
+ spin_unlock(&sbi->dir_inode_lock);
+ return;
+ }
list_for_each(this, head) {
struct dir_inode_entry *entry;
@@ -501,12 +535,38 @@ void remove_dirty_dir_inode(struct inode *inode)
if (entry->inode == inode) {
list_del(&entry->list);
kmem_cache_free(inode_entry_slab, entry);
+#ifdef CONFIG_F2FS_STAT_FS
sbi->n_dirty_dirs--;
+#endif
+ break;
+ }
+ }
+ spin_unlock(&sbi->dir_inode_lock);
+
+ /* Only from the recovery routine */
+ if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
+ clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
+ iput(inode);
+ }
+}
+
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct list_head *head = &sbi->dir_inode_list;
+ struct list_head *this;
+ struct inode *inode = NULL;
+
+ spin_lock(&sbi->dir_inode_lock);
+ list_for_each(this, head) {
+ struct dir_inode_entry *entry;
+ entry = list_entry(this, struct dir_inode_entry, list);
+ if (entry->inode->i_ino == ino) {
+ inode = entry->inode;
break;
}
}
-out:
spin_unlock(&sbi->dir_inode_lock);
+ return inode;
}
void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
@@ -541,54 +601,44 @@ retry:
*/
static void block_operations(struct f2fs_sb_info *sbi)
{
- int t;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
.for_reclaim = 0,
};
+ struct blk_plug plug;
- /* Stop renaming operation */
- mutex_lock_op(sbi, RENAME);
- mutex_lock_op(sbi, DENTRY_OPS);
+ blk_start_plug(&plug);
-retry_dents:
- /* write all the dirty dentry pages */
- sync_dirty_dir_inodes(sbi);
+retry_flush_dents:
+ mutex_lock_all(sbi);
- mutex_lock_op(sbi, DATA_WRITE);
+ /* write all the dirty dentry pages */
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
- mutex_unlock_op(sbi, DATA_WRITE);
- goto retry_dents;
+ mutex_unlock_all(sbi);
+ sync_dirty_dir_inodes(sbi);
+ goto retry_flush_dents;
}
- /* block all the operations */
- for (t = DATA_NEW; t <= NODE_TRUNC; t++)
- mutex_lock_op(sbi, t);
-
- mutex_lock(&sbi->write_inode);
-
/*
* POR: we should ensure that there is no dirty node pages
* until finishing nat/sit flush.
*/
-retry:
- sync_node_pages(sbi, 0, &wbc);
-
- mutex_lock_op(sbi, NODE_WRITE);
+retry_flush_nodes:
+ mutex_lock(&sbi->node_write);
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
- mutex_unlock_op(sbi, NODE_WRITE);
- goto retry;
+ mutex_unlock(&sbi->node_write);
+ sync_node_pages(sbi, 0, &wbc);
+ goto retry_flush_nodes;
}
- mutex_unlock(&sbi->write_inode);
+ blk_finish_plug(&plug);
}
static void unblock_operations(struct f2fs_sb_info *sbi)
{
- int t;
- for (t = NODE_WRITE; t >= RENAME; t--)
- mutex_unlock_op(sbi, t);
+ mutex_unlock(&sbi->node_write);
+ mutex_unlock_all(sbi);
}
static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
@@ -598,7 +648,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
block_t start_blk;
struct page *cp_page;
unsigned int data_sum_blocks, orphan_blocks;
- unsigned int crc32 = 0;
+ __u32 crc32 = 0;
void *kaddr;
int i;
@@ -667,8 +717,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
- *(__le32 *)((unsigned char *)ckpt +
- le32_to_cpu(ckpt->checksum_offset))
+ *((__le32 *)((unsigned char *)ckpt +
+ le32_to_cpu(ckpt->checksum_offset)))
= cpu_to_le32(crc32);
start_blk = __start_cp_addr(sbi);
@@ -727,9 +777,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long long ckpt_ver;
+ trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops");
+
mutex_lock(&sbi->cp_mutex);
block_operations(sbi);
+ trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
+
f2fs_submit_bio(sbi, DATA, true);
f2fs_submit_bio(sbi, NODE, true);
f2fs_submit_bio(sbi, META, true);
@@ -746,13 +800,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
flush_nat_entries(sbi);
flush_sit_entries(sbi);
- reset_victim_segmap(sbi);
-
/* unlock all the fs_lock[] in do_checkpoint() */
do_checkpoint(sbi, is_umount);
unblock_operations(sbi);
mutex_unlock(&sbi->cp_mutex);
+
+ trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
}
void init_orphan_info(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7bd22a201125..035f9a345cdf 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -12,6 +12,7 @@
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
+#include <linux/aio.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
@@ -21,6 +22,7 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include <trace/events/f2fs.h>
/*
* Lock ordering for the change of data block address:
@@ -54,6 +56,8 @@ int reserve_new_block(struct dnode_of_data *dn)
if (!inc_valid_block_count(sbi, dn->inode, 1))
return -ENOSPC;
+ trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
+
__set_data_blkaddr(dn, NEW_ADDR);
dn->data_blkaddr = NEW_ADDR;
sync_inode_page(dn);
@@ -64,7 +68,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
struct buffer_head *bh_result)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
+#ifdef CONFIG_F2FS_STAT_FS
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+#endif
pgoff_t start_fofs, end_fofs;
block_t start_blkaddr;
@@ -74,7 +80,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
return 0;
}
+#ifdef CONFIG_F2FS_STAT_FS
sbi->total_hit_ext++;
+#endif
start_fofs = fi->ext.fofs;
end_fofs = fi->ext.fofs + fi->ext.len - 1;
start_blkaddr = fi->ext.blk_addr;
@@ -92,7 +100,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
else
bh_result->b_size = UINT_MAX;
+#ifdef CONFIG_F2FS_STAT_FS
sbi->read_hit_ext++;
+#endif
read_unlock(&fi->ext.ext_lock);
return 1;
}
@@ -133,7 +143,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
goto end_update;
}
- /* Frone merge */
+ /* Front merge */
if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
fi->ext.fofs--;
fi->ext.blk_addr--;
@@ -169,7 +179,7 @@ end_update:
return;
}
-struct page *find_data_page(struct inode *inode, pgoff_t index)
+struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
@@ -183,7 +193,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
f2fs_put_page(page, 0);
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+ err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
if (err)
return ERR_PTR(err);
f2fs_put_dnode(&dn);
@@ -195,16 +205,24 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
if (dn.data_blkaddr == NEW_ADDR)
return ERR_PTR(-EINVAL);
- page = grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
if (!page)
return ERR_PTR(-ENOMEM);
- err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
- if (err) {
- f2fs_put_page(page, 1);
- return ERR_PTR(err);
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ return page;
+ }
+
+ err = f2fs_readpage(sbi, page, dn.data_blkaddr,
+ sync ? READ_SYNC : READA);
+ if (sync) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ f2fs_put_page(page, 0);
+ return ERR_PTR(-EIO);
+ }
}
- unlock_page(page);
return page;
}
@@ -221,18 +239,23 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
struct page *page;
int err;
+repeat:
+ page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, RDONLY_NODE);
- if (err)
+ err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ if (err) {
+ f2fs_put_page(page, 1);
return ERR_PTR(err);
+ }
f2fs_put_dnode(&dn);
- if (dn.data_blkaddr == NULL_ADDR)
+ if (dn.data_blkaddr == NULL_ADDR) {
+ f2fs_put_page(page, 1);
return ERR_PTR(-ENOENT);
-
- page = grab_cache_page(mapping, index);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ }
if (PageUptodate(page))
return page;
@@ -241,9 +264,17 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
BUG_ON(dn.data_blkaddr == NULL_ADDR);
err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
- if (err) {
- f2fs_put_page(page, 1);
+ if (err)
return ERR_PTR(err);
+
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
+ if (page->mapping != mapping) {
+ f2fs_put_page(page, 1);
+ goto repeat;
}
return page;
}
@@ -251,9 +282,13 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
/*
* Caller ensures that this data page is never allocated.
* A new zero-filled data page is allocated in the page cache.
+ *
+ * Also, caller should grab and release a mutex by calling mutex_lock_op() and
+ * mutex_unlock_op().
+ * Note that, npage is set only by make_empty_dir.
*/
-struct page *get_new_data_page(struct inode *inode, pgoff_t index,
- bool new_i_size)
+struct page *get_new_data_page(struct inode *inode,
+ struct page *npage, pgoff_t index, bool new_i_size)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
@@ -261,19 +296,21 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index,
struct dnode_of_data dn;
int err;
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, 0);
+ set_new_dnode(&dn, inode, npage, npage, 0);
+ err = get_dnode_of_data(&dn, index, ALLOC_NODE);
if (err)
return ERR_PTR(err);
if (dn.data_blkaddr == NULL_ADDR) {
if (reserve_new_block(&dn)) {
- f2fs_put_dnode(&dn);
+ if (!npage)
+ f2fs_put_dnode(&dn);
return ERR_PTR(-ENOSPC);
}
}
- f2fs_put_dnode(&dn);
-
+ if (!npage)
+ f2fs_put_dnode(&dn);
+repeat:
page = grab_cache_page(mapping, index);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -283,18 +320,27 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index,
if (dn.data_blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
} else {
err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
- if (err) {
- f2fs_put_page(page, 1);
+ if (err)
return ERR_PTR(err);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
+ if (page->mapping != mapping) {
+ f2fs_put_page(page, 1);
+ goto repeat;
}
}
- SetPageUptodate(page);
if (new_i_size &&
i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
+ /* Only the directory inode sets new_i_size */
+ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
mark_inode_dirty_sync(inode);
}
return page;
@@ -325,21 +371,15 @@ static void read_end_io(struct bio *bio, int err)
/*
* Fill the locked page with data located in the block address.
- * Read operation is synchronous, and caller must unlock the page.
+ * Return unlocked page.
*/
int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
block_t blk_addr, int type)
{
struct block_device *bdev = sbi->sb->s_bdev;
- bool sync = (type == READ_SYNC);
struct bio *bio;
- /* This page can be already read by other threads */
- if (PageUptodate(page)) {
- if (!sync)
- unlock_page(page);
- return 0;
- }
+ trace_f2fs_readpage(page, blk_addr, type);
down_read(&sbi->bio_sem);
@@ -354,18 +394,12 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
kfree(bio->bi_private);
bio_put(bio);
up_read(&sbi->bio_sem);
+ f2fs_put_page(page, 1);
return -EFAULT;
}
submit_bio(type, bio);
up_read(&sbi->bio_sem);
-
- /* wait for read completion if sync */
- if (sync) {
- lock_page(page);
- if (PageError(page))
- return -EIO;
- }
return 0;
}
@@ -387,14 +421,18 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock,
/* Get the page offset from the block offset(iblock) */
pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
- if (check_extent_cache(inode, pgofs, bh_result))
+ if (check_extent_cache(inode, pgofs, bh_result)) {
+ trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
return 0;
+ }
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE);
- if (err)
+ err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
+ if (err) {
+ trace_f2fs_get_data_block(inode, iblock, bh_result, err);
return (err == -ENOENT) ? 0 : err;
+ }
/* It does not support data allocation */
BUG_ON(create);
@@ -419,6 +457,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock,
bh_result->b_size = (i << blkbits);
}
f2fs_put_dnode(&dn);
+ trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
return 0;
}
@@ -437,13 +476,12 @@ static int f2fs_read_data_pages(struct file *file,
int do_write_data_page(struct page *page)
{
struct inode *inode = page->mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
block_t old_blk_addr, new_blk_addr;
struct dnode_of_data dn;
int err = 0;
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, page->index, RDONLY_NODE);
+ err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
if (err)
return err;
@@ -459,16 +497,15 @@ int do_write_data_page(struct page *page)
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
- if (old_blk_addr != NEW_ADDR && !is_cold_data(page) &&
- need_inplace_update(inode)) {
+ if (unlikely(old_blk_addr != NEW_ADDR &&
+ !is_cold_data(page) &&
+ need_inplace_update(inode))) {
rewrite_data_page(F2FS_SB(inode->i_sb), page,
old_blk_addr);
} else {
write_data_page(inode, page, &dn,
old_blk_addr, &new_blk_addr);
update_extent_cache(new_blk_addr, &dn);
- F2FS_I(inode)->data_version =
- le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
}
out_writepage:
f2fs_put_dnode(&dn);
@@ -484,10 +521,11 @@ static int f2fs_write_data_page(struct page *page,
const pgoff_t end_index = ((unsigned long long) i_size)
>> PAGE_CACHE_SHIFT;
unsigned offset;
+ bool need_balance_fs = false;
int err = 0;
if (page->index < end_index)
- goto out;
+ goto write;
/*
* If the offset is out-of-range of file size,
@@ -499,50 +537,46 @@ static int f2fs_write_data_page(struct page *page,
dec_page_count(sbi, F2FS_DIRTY_DENTS);
inode_dec_dirty_dents(inode);
}
- goto unlock_out;
+ goto out;
}
zero_user_segment(page, offset, PAGE_CACHE_SIZE);
-out:
- if (sbi->por_doing)
- goto redirty_out;
-
- if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page))
+write:
+ if (sbi->por_doing) {
+ err = AOP_WRITEPAGE_ACTIVATE;
goto redirty_out;
+ }
- mutex_lock_op(sbi, DATA_WRITE);
+ /* Dentry blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode)) {
dec_page_count(sbi, F2FS_DIRTY_DENTS);
inode_dec_dirty_dents(inode);
+ err = do_write_data_page(page);
+ } else {
+ int ilock = mutex_lock_op(sbi);
+ err = do_write_data_page(page);
+ mutex_unlock_op(sbi, ilock);
+ need_balance_fs = true;
}
- err = do_write_data_page(page);
- if (err && err != -ENOENT) {
- wbc->pages_skipped++;
- set_page_dirty(page);
- }
- mutex_unlock_op(sbi, DATA_WRITE);
+ if (err == -ENOENT)
+ goto out;
+ else if (err)
+ goto redirty_out;
if (wbc->for_reclaim)
f2fs_submit_bio(sbi, DATA, true);
- if (err == -ENOENT)
- goto unlock_out;
-
clear_cold_data(page);
+out:
unlock_page(page);
-
- if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode))
+ if (need_balance_fs)
f2fs_balance_fs(sbi);
return 0;
-unlock_out:
- unlock_page(page);
- return (err == -ENOENT) ? 0 : err;
-
redirty_out:
wbc->pages_skipped++;
set_page_dirty(page);
- return AOP_WRITEPAGE_ACTIVATE;
+ return err;
}
#define MAX_DESIRED_PAGES_WP 4096
@@ -561,19 +595,26 @@ static int f2fs_write_data_pages(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ bool locked = false;
int ret;
long excess_nrtw = 0, desired_nrtw;
+ /* deal with chardevs and other special files */
+ if (!mapping->a_ops->writepage)
+ return 0;
+
if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) {
desired_nrtw = MAX_DESIRED_PAGES_WP;
excess_nrtw = desired_nrtw - wbc->nr_to_write;
wbc->nr_to_write = desired_nrtw;
}
- if (!S_ISDIR(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode)) {
mutex_lock(&sbi->writepages);
+ locked = true;
+ }
ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
- if (!S_ISDIR(inode->i_mode))
+ if (locked)
mutex_unlock(&sbi->writepages);
f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL));
@@ -593,39 +634,33 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
struct dnode_of_data dn;
int err = 0;
+ int ilock;
/* for nobh_write_end */
*fsdata = NULL;
f2fs_balance_fs(sbi);
-
+repeat:
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
- mutex_lock_op(sbi, DATA_NEW);
+ ilock = mutex_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, 0);
- if (err) {
- mutex_unlock_op(sbi, DATA_NEW);
- f2fs_put_page(page, 1);
- return err;
- }
+ err = get_dnode_of_data(&dn, index, ALLOC_NODE);
+ if (err)
+ goto err;
- if (dn.data_blkaddr == NULL_ADDR) {
+ if (dn.data_blkaddr == NULL_ADDR)
err = reserve_new_block(&dn);
- if (err) {
- f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, DATA_NEW);
- f2fs_put_page(page, 1);
- return err;
- }
- }
+
f2fs_put_dnode(&dn);
+ if (err)
+ goto err;
- mutex_unlock_op(sbi, DATA_NEW);
+ mutex_unlock_op(sbi, ilock);
if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
return 0;
@@ -636,21 +671,55 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
/* Reading beyond i_size is simple: memset to zero */
zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
- return 0;
+ goto out;
}
if (dn.data_blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
} else {
err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
- if (err) {
- f2fs_put_page(page, 1);
+ if (err)
return err;
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ f2fs_put_page(page, 1);
+ return -EIO;
+ }
+ if (page->mapping != mapping) {
+ f2fs_put_page(page, 1);
+ goto repeat;
}
}
+out:
SetPageUptodate(page);
clear_cold_data(page);
return 0;
+
+err:
+ mutex_unlock_op(sbi, ilock);
+ f2fs_put_page(page, 1);
+ return err;
+}
+
+static int f2fs_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = page->mapping->host;
+
+ SetPageUptodate(page);
+ set_page_dirty(page);
+
+ if (pos + copied > i_size_read(inode)) {
+ i_size_write(inode, pos + copied);
+ mark_inode_dirty(inode);
+ update_inode_page(inode);
+ }
+
+ unlock_page(page);
+ page_cache_release(page);
+ return copied;
}
static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
@@ -667,7 +736,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
get_data_block_ro);
}
-static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -681,7 +751,7 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
static int f2fs_release_data_page(struct page *page, gfp_t wait)
{
ClearPagePrivate(page);
- return 0;
+ return 1;
}
static int f2fs_set_data_page_dirty(struct page *page)
@@ -709,7 +779,7 @@ const struct address_space_operations f2fs_dblock_aops = {
.writepage = f2fs_write_data_page,
.writepages = f2fs_write_data_pages,
.write_begin = f2fs_write_begin,
- .write_end = nobh_write_end,
+ .write_end = f2fs_write_end,
.set_page_dirty = f2fs_set_data_page_dirty,
.invalidatepage = f2fs_invalidate_data_page,
.releasepage = f2fs_release_data_page,
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 025b9e2f935d..0d6c6aafb235 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -13,7 +13,6 @@
#include <linux/fs.h>
#include <linux/backing-dev.h>
-#include <linux/proc_fs.h>
#include <linux/f2fs_fs.h>
#include <linux/blkdev.h>
#include <linux/debugfs.h>
@@ -106,7 +105,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
}
}
mutex_unlock(&sit_i->sentry_lock);
- dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100;
+ dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
si->bimodal = bimodal / dist;
if (si->dirty_count)
si->avg_vblocks = total_vblocks / ndirty;
@@ -138,14 +137,13 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
if (sbi->segs_per_sec > 1)
- si->base_mem += sbi->total_sections *
- sizeof(struct sec_entry);
+ si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry);
si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
/* build free segmap */
si->base_mem += sizeof(struct free_segmap_info);
si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
- si->base_mem += f2fs_bitmap_size(sbi->total_sections);
+ si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
/* build curseg */
si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
@@ -154,7 +152,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
/* build dirty segmap */
si->base_mem += sizeof(struct dirty_seglist_info);
si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
- si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
/* buld nm */
si->base_mem += sizeof(struct f2fs_nm_info);
@@ -177,12 +175,12 @@ get_cache:
static int stat_show(struct seq_file *s, void *v)
{
- struct f2fs_stat_info *si, *next;
+ struct f2fs_stat_info *si;
int i = 0;
int j;
mutex_lock(&f2fs_stat_mutex);
- list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
+ list_for_each_entry(si, &f2fs_stat_list, stat_list) {
char devname[BDEVNAME_SIZE];
update_general_status(si->sbi);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index a1f38443ecee..62f0d5977c64 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -13,6 +13,7 @@
#include "f2fs.h"
#include "node.h"
#include "acl.h"
+#include "xattr.h"
static unsigned long dir_blocks(struct inode *inode)
{
@@ -60,7 +61,7 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
{
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}
@@ -148,7 +149,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
for (; bidx < end_block; bidx++) {
/* no need to allocate new dentry pages to all the indices */
- dentry_page = find_data_page(dir, bidx);
+ dentry_page = find_data_page(dir, bidx, true);
if (IS_ERR(dentry_page)) {
room = true;
continue;
@@ -189,6 +190,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
unsigned int max_depth;
unsigned int level;
+ if (namelen > F2FS_NAME_LEN)
+ return NULL;
+
if (npages == 0)
return NULL;
@@ -212,9 +216,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
{
- struct page *page = NULL;
- struct f2fs_dir_entry *de = NULL;
- struct f2fs_dentry_block *dentry_blk = NULL;
+ struct page *page;
+ struct f2fs_dir_entry *de;
+ struct f2fs_dentry_block *dentry_blk;
page = get_lock_data_page(dir, 0);
if (IS_ERR(page))
@@ -246,9 +250,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
struct page *page, struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
-
- mutex_lock_op(sbi, DENTRY_OPS);
lock_page(page);
wait_on_page_writeback(page);
de->ino = cpu_to_le32(inode->i_ino);
@@ -262,18 +263,12 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
F2FS_I(inode)->i_pino = dir->i_ino;
f2fs_put_page(page, 1);
- mutex_unlock_op(sbi, DENTRY_OPS);
}
-void init_dent_inode(const struct qstr *name, struct page *ipage)
+static void init_dent_inode(const struct qstr *name, struct page *ipage)
{
struct f2fs_node *rn;
- if (IS_ERR(ipage))
- return;
-
- wait_on_page_writeback(ipage);
-
/* copy name info. to this inode page */
rn = (struct f2fs_node *)page_address(ipage);
rn->i.i_namelen = cpu_to_le32(name->len);
@@ -281,64 +276,115 @@ void init_dent_inode(const struct qstr *name, struct page *ipage)
set_page_dirty(ipage);
}
-static int init_inode_metadata(struct inode *inode,
+static int make_empty_dir(struct inode *inode,
+ struct inode *parent, struct page *page)
+{
+ struct page *dentry_page;
+ struct f2fs_dentry_block *dentry_blk;
+ struct f2fs_dir_entry *de;
+ void *kaddr;
+
+ dentry_page = get_new_data_page(inode, page, 0, true);
+ if (IS_ERR(dentry_page))
+ return PTR_ERR(dentry_page);
+
+ kaddr = kmap_atomic(dentry_page);
+ dentry_blk = (struct f2fs_dentry_block *)kaddr;
+
+ de = &dentry_blk->dentry[0];
+ de->name_len = cpu_to_le16(1);
+ de->hash_code = 0;
+ de->ino = cpu_to_le32(inode->i_ino);
+ memcpy(dentry_blk->filename[0], ".", 1);
+ set_de_type(de, inode);
+
+ de = &dentry_blk->dentry[1];
+ de->hash_code = 0;
+ de->name_len = cpu_to_le16(2);
+ de->ino = cpu_to_le32(parent->i_ino);
+ memcpy(dentry_blk->filename[1], "..", 2);
+ set_de_type(de, inode);
+
+ test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
+ test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
+ kunmap_atomic(kaddr);
+
+ set_page_dirty(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ return 0;
+}
+
+static struct page *init_inode_metadata(struct inode *inode,
struct inode *dir, const struct qstr *name)
{
+ struct page *page;
+ int err;
+
if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
- int err;
- err = new_inode_page(inode, name);
- if (err)
- return err;
+ page = new_inode_page(inode, name);
+ if (IS_ERR(page))
+ return page;
if (S_ISDIR(inode->i_mode)) {
- err = f2fs_make_empty(inode, dir);
- if (err) {
- remove_inode_page(inode);
- return err;
- }
+ err = make_empty_dir(inode, dir, page);
+ if (err)
+ goto error;
}
err = f2fs_init_acl(inode, dir);
- if (err) {
- remove_inode_page(inode);
- return err;
- }
+ if (err)
+ goto error;
+
+ err = f2fs_init_security(inode, dir, name, page);
+ if (err)
+ goto error;
+
+ wait_on_page_writeback(page);
} else {
- struct page *ipage;
- ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
- set_cold_node(inode, ipage);
- init_dent_inode(name, ipage);
- f2fs_put_page(ipage, 1);
+ page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+ if (IS_ERR(page))
+ return page;
+
+ wait_on_page_writeback(page);
+ set_cold_node(inode, page);
}
+
+ init_dent_inode(name, page);
+
+ /*
+ * This file should be checkpointed during fsync.
+ * We lost i_pino from now on.
+ */
if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+ file_lost_pino(inode);
inc_nlink(inode);
- f2fs_write_inode(inode, NULL);
}
- return 0;
+ return page;
+
+error:
+ f2fs_put_page(page, 1);
+ remove_inode_page(inode);
+ return ERR_PTR(err);
}
static void update_parent_metadata(struct inode *dir, struct inode *inode,
unsigned int current_depth)
{
- bool need_dir_update = false;
-
if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
if (S_ISDIR(inode->i_mode)) {
inc_nlink(dir);
- need_dir_update = true;
+ set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
}
clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
}
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
if (F2FS_I(dir)->i_current_depth != current_depth) {
F2FS_I(dir)->i_current_depth = current_depth;
- need_dir_update = true;
+ set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
}
- if (need_dir_update)
- f2fs_write_inode(dir, NULL);
+ if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
+ update_inode_page(dir);
else
mark_inode_dirty(dir);
@@ -370,6 +416,10 @@ next:
goto next;
}
+/*
+ * Caller should grab and release a mutex by calling mutex_lock_op() and
+ * mutex_unlock_op().
+ */
int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode)
{
unsigned int bit_pos;
@@ -379,11 +429,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in
f2fs_hash_t dentry_hash;
struct f2fs_dir_entry *de;
unsigned int nbucket, nblock;
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
size_t namelen = name->len;
struct page *dentry_page = NULL;
struct f2fs_dentry_block *dentry_blk = NULL;
int slots = GET_DENTRY_SLOTS(namelen);
+ struct page *page;
int err = 0;
int i;
@@ -409,12 +459,9 @@ start:
bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
for (block = bidx; block <= (bidx + nblock - 1); block++) {
- mutex_lock_op(sbi, DENTRY_OPS);
- dentry_page = get_new_data_page(dir, block, true);
- if (IS_ERR(dentry_page)) {
- mutex_unlock_op(sbi, DENTRY_OPS);
+ dentry_page = get_new_data_page(dir, NULL, block, true);
+ if (IS_ERR(dentry_page))
return PTR_ERR(dentry_page);
- }
dentry_blk = kmap(dentry_page);
bit_pos = room_for_filename(dentry_blk, slots);
@@ -423,19 +470,19 @@ start:
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
- mutex_unlock_op(sbi, DENTRY_OPS);
}
/* Move to next level to find the empty slot for new dentry */
++level;
goto start;
add_dentry:
- err = init_inode_metadata(inode, dir, name);
- if (err)
- goto fail;
-
wait_on_page_writeback(dentry_page);
+ page = init_inode_metadata(inode, dir, name);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto fail;
+ }
de = &dentry_blk->dentry[bit_pos];
de->hash_code = dentry_hash;
de->name_len = cpu_to_le16(namelen);
@@ -446,14 +493,16 @@ add_dentry:
test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
set_page_dirty(dentry_page);
- update_parent_metadata(dir, inode, current_depth);
-
- /* update parent inode number before releasing dentry page */
+ /* we don't need to mark_inode_dirty now */
F2FS_I(inode)->i_pino = dir->i_ino;
+ update_inode(inode, page);
+ f2fs_put_page(page, 1);
+
+ update_parent_metadata(dir, inode, current_depth);
fail:
+ clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
- mutex_unlock_op(sbi, DENTRY_OPS);
return err;
}
@@ -473,8 +522,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
void *kaddr = page_address(page);
int i;
- mutex_lock_op(sbi, DENTRY_OPS);
-
lock_page(page);
wait_on_page_writeback(page);
@@ -494,7 +541,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
if (inode && S_ISDIR(inode->i_mode)) {
drop_nlink(dir);
- f2fs_write_inode(dir, NULL);
+ update_inode_page(dir);
} else {
mark_inode_dirty(dir);
}
@@ -506,7 +553,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
drop_nlink(inode);
i_size_write(inode, 0);
}
- f2fs_write_inode(inode, NULL);
+ update_inode_page(inode);
+
if (inode->i_nlink == 0)
add_orphan_inode(sbi, inode->i_ino);
}
@@ -519,45 +567,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
inode_dec_dirty_dents(dir);
}
f2fs_put_page(page, 1);
-
- mutex_unlock_op(sbi, DENTRY_OPS);
-}
-
-int f2fs_make_empty(struct inode *inode, struct inode *parent)
-{
- struct page *dentry_page;
- struct f2fs_dentry_block *dentry_blk;
- struct f2fs_dir_entry *de;
- void *kaddr;
-
- dentry_page = get_new_data_page(inode, 0, true);
- if (IS_ERR(dentry_page))
- return PTR_ERR(dentry_page);
-
- kaddr = kmap_atomic(dentry_page);
- dentry_blk = (struct f2fs_dentry_block *)kaddr;
-
- de = &dentry_blk->dentry[0];
- de->name_len = cpu_to_le16(1);
- de->hash_code = f2fs_dentry_hash(".", 1);
- de->ino = cpu_to_le32(inode->i_ino);
- memcpy(dentry_blk->filename[0], ".", 1);
- set_de_type(de, inode);
-
- de = &dentry_blk->dentry[1];
- de->hash_code = f2fs_dentry_hash("..", 2);
- de->name_len = cpu_to_le16(2);
- de->ino = cpu_to_le32(parent->i_ino);
- memcpy(dentry_blk->filename[1], "..", 2);
- set_de_type(de, inode);
-
- test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
- test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
- kunmap_atomic(kaddr);
-
- set_page_dirty(dentry_page);
- f2fs_put_page(dentry_page, 1);
- return 0;
}
bool f2fs_empty_dir(struct inode *dir)
@@ -597,34 +606,26 @@ bool f2fs_empty_dir(struct inode *dir)
return true;
}
-static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int f2fs_readdir(struct file *file, struct dir_context *ctx)
{
- unsigned long pos = file->f_pos;
struct inode *inode = file_inode(file);
unsigned long npages = dir_blocks(inode);
- unsigned char *types = NULL;
- unsigned int bit_pos = 0, start_bit_pos = 0;
- int over = 0;
+ unsigned int bit_pos = 0;
struct f2fs_dentry_block *dentry_blk = NULL;
struct f2fs_dir_entry *de = NULL;
struct page *dentry_page = NULL;
- unsigned int n = 0;
+ unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
unsigned char d_type = DT_UNKNOWN;
- int slots;
- types = f2fs_filetype_table;
- bit_pos = (pos % NR_DENTRY_IN_BLOCK);
- n = (pos / NR_DENTRY_IN_BLOCK);
+ bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
for ( ; n < npages; n++) {
dentry_page = get_lock_data_page(inode, n);
if (IS_ERR(dentry_page))
continue;
- start_bit_pos = bit_pos;
dentry_blk = kmap(dentry_page);
while (bit_pos < NR_DENTRY_IN_BLOCK) {
- d_type = DT_UNKNOWN;
bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
NR_DENTRY_IN_BLOCK,
bit_pos);
@@ -632,28 +633,26 @@ static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
break;
de = &dentry_blk->dentry[bit_pos];
- if (types && de->file_type < F2FS_FT_MAX)
- d_type = types[de->file_type];
-
- over = filldir(dirent,
+ if (de->file_type < F2FS_FT_MAX)
+ d_type = f2fs_filetype_table[de->file_type];
+ else
+ d_type = DT_UNKNOWN;
+ if (!dir_emit(ctx,
dentry_blk->filename[bit_pos],
le16_to_cpu(de->name_len),
- (n * NR_DENTRY_IN_BLOCK) + bit_pos,
- le32_to_cpu(de->ino), d_type);
- if (over) {
- file->f_pos += bit_pos - start_bit_pos;
- goto success;
- }
- slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
- bit_pos += slots;
+ le32_to_cpu(de->ino), d_type))
+ goto stop;
+
+ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+ ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos;
}
bit_pos = 0;
- file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK;
+ ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
dentry_page = NULL;
}
-success:
+stop:
if (dentry_page && !IS_ERR(dentry_page)) {
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
@@ -665,7 +664,7 @@ success:
const struct file_operations f2fs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = f2fs_readdir,
+ .iterate = f2fs_readdir,
.fsync = f2fs_sync_file,
.unlocked_ioctl = f2fs_ioctl,
};
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index cc2213afdcc7..467d42d65c48 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -37,21 +37,35 @@
typecheck(unsigned long long, b) && \
((long long)((a) - (b)) > 0))
-typedef u64 block_t;
+typedef u32 block_t; /*
+ * should not change u32, since it is the on-disk block
+ * address format, __le32.
+ */
typedef u32 nid_t;
struct f2fs_mount_info {
unsigned int opt;
};
-static inline __u32 f2fs_crc32(void *buff, size_t len)
+#define CRCPOLY_LE 0xedb88320
+
+static inline __u32 f2fs_crc32(void *buf, size_t len)
{
- return crc32_le(F2FS_SUPER_MAGIC, buff, len);
+ unsigned char *p = (unsigned char *)buf;
+ __u32 crc = F2FS_SUPER_MAGIC;
+ int i;
+
+ while (len--) {
+ crc ^= *p++;
+ for (i = 0; i < 8; i++)
+ crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
+ }
+ return crc;
}
-static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size)
+static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
{
- return f2fs_crc32(buff, buff_size) == blk_crc;
+ return f2fs_crc32(buf, buf_size) == blk_crc;
}
/*
@@ -125,11 +139,15 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
* file keeping -1 as its node offset to
* distinguish from index node blocks.
*/
-#define RDONLY_NODE 1 /*
- * specify a read-only mode when getting
- * a node block. 0 is read-write mode.
- * used by get_dnode_of_data().
+enum {
+ ALLOC_NODE, /* allocate a new node page if needed */
+ LOOKUP_NODE, /* look up a node without readahead */
+ LOOKUP_NODE_RA, /*
+ * look up a node with readahead called
+ * by get_datablock_ro.
*/
+};
+
#define F2FS_LINK_MAX 32000 /* maximum link count per file */
/* for in-memory extent cache entry */
@@ -137,13 +155,14 @@ struct extent_info {
rwlock_t ext_lock; /* rwlock for consistency */
unsigned int fofs; /* start offset in a file */
u32 blk_addr; /* start block address of the extent */
- unsigned int len; /* lenth of the extent */
+ unsigned int len; /* length of the extent */
};
/*
* i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
*/
#define FADVISE_COLD_BIT 0x01
+#define FADVISE_LOST_PINO_BIT 0x02
struct f2fs_inode_info {
struct inode vfs_inode; /* serve a vfs inode */
@@ -155,7 +174,6 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags; /* use to pass per-file flags */
- unsigned long long data_version;/* latest version of data for fsync */
atomic_t dirty_dents; /* # of dirty dentry pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
@@ -186,7 +204,6 @@ static inline void set_raw_extent(struct extent_info *ext,
struct f2fs_nm_info {
block_t nat_blkaddr; /* base disk address of NAT */
nid_t max_nid; /* maximum possible node ids */
- nid_t init_scan_nid; /* the first nid to be scanned */
nid_t next_scan_nid; /* the next nid to be scanned */
/* NAT cache management */
@@ -305,23 +322,12 @@ enum count_type {
};
/*
- * FS_LOCK nesting subclasses for the lock validator:
- *
- * The locking order between these classes is
- * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW
- * -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC
+ * Uses as sbi->fs_lock[NR_GLOBAL_LOCKS].
+ * The checkpoint procedure blocks all the locks in this fs_lock array.
+ * Some FS operations grab free locks, and if there is no free lock,
+ * then wait to grab a lock in a round-robin manner.
*/
-enum lock_type {
- RENAME, /* for renaming operations */
- DENTRY_OPS, /* for directory operations */
- DATA_WRITE, /* for data write */
- DATA_NEW, /* for data allocation */
- DATA_TRUNC, /* for data truncate */
- NODE_NEW, /* for node allocation */
- NODE_TRUNC, /* for node truncate */
- NODE_WRITE, /* for node write */
- NR_LOCK_TYPE,
-};
+#define NR_GLOBAL_LOCKS 8
/*
* The below are the page types of bios used in submti_bio().
@@ -361,11 +367,13 @@ struct f2fs_sb_info {
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
struct inode *meta_inode; /* cache meta blocks */
- struct mutex cp_mutex; /* for checkpoint procedure */
- struct mutex fs_lock[NR_LOCK_TYPE]; /* for blocking FS operations */
- struct mutex write_inode; /* mutex for write inode */
+ struct mutex cp_mutex; /* checkpoint procedure lock */
+ struct mutex fs_lock[NR_GLOBAL_LOCKS]; /* blocking FS operations */
+ struct mutex node_write; /* locking node writes */
struct mutex writepages; /* mutex for writepages() */
+ unsigned char next_lock_num; /* round-robin global locks */
int por_doing; /* recovery is doing or not */
+ int on_build_free_nids; /* build_free_nids is doing */
/* for orphan inode management */
struct list_head orphan_inode_list; /* orphan inode list */
@@ -375,7 +383,6 @@ struct f2fs_sb_info {
/* for directory inode management */
struct list_head dir_inode_list; /* dir inode list */
spinlock_t dir_inode_lock; /* for dir inode list lock */
- unsigned int n_dirty_dirs; /* # of dir inodes */
/* basic file system units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
@@ -406,17 +413,21 @@ struct f2fs_sb_info {
/* for cleaning operations */
struct mutex gc_mutex; /* mutex for GC */
struct f2fs_gc_kthread *gc_thread; /* GC thread */
+ unsigned int cur_victim_sec; /* current victim section num */
/*
* for stat information.
* one is for the LFS mode, and the other is for the SSR mode.
*/
+#ifdef CONFIG_F2FS_STAT_FS
struct f2fs_stat_info *stat_info; /* FS status information */
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
- unsigned int last_victim[2]; /* last victim segment # */
int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
int bg_gc; /* background gc calls */
+ unsigned int n_dirty_dirs; /* # of dir inodes */
+#endif
+ unsigned int last_victim[2]; /* last victim segment # */
spinlock_t stat_lock; /* lock for stat operations */
};
@@ -498,22 +509,59 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
cp->ckpt_flags = cpu_to_le32(ckpt_flags);
}
-static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t)
+static inline void mutex_lock_all(struct f2fs_sb_info *sbi)
{
- mutex_lock_nested(&sbi->fs_lock[t], t);
+ int i;
+
+ for (i = 0; i < NR_GLOBAL_LOCKS; i++) {
+ /*
+ * This is the only time we take multiple fs_lock[]
+ * instances; the order is immaterial since we
+ * always hold cp_mutex, which serializes multiple
+ * such operations.
+ */
+ mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex);
+ }
}
-static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t)
+static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
{
- mutex_unlock(&sbi->fs_lock[t]);
+ int i = 0;
+ for (; i < NR_GLOBAL_LOCKS; i++)
+ mutex_unlock(&sbi->fs_lock[i]);
+}
+
+static inline int mutex_lock_op(struct f2fs_sb_info *sbi)
+{
+ unsigned char next_lock = sbi->next_lock_num % NR_GLOBAL_LOCKS;
+ int i = 0;
+
+ for (; i < NR_GLOBAL_LOCKS; i++)
+ if (mutex_trylock(&sbi->fs_lock[i]))
+ return i;
+
+ mutex_lock(&sbi->fs_lock[next_lock]);
+ sbi->next_lock_num++;
+ return next_lock;
+}
+
+static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock)
+{
+ if (ilock < 0)
+ return;
+ BUG_ON(ilock >= NR_GLOBAL_LOCKS);
+ mutex_unlock(&sbi->fs_lock[ilock]);
}
/*
* Check whether the given nid is within node id range.
*/
-static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
+static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
{
- BUG_ON((nid >= NM_I(sbi)->max_nid));
+ WARN_ON((nid >= NM_I(sbi)->max_nid));
+ if (nid >= NM_I(sbi)->max_nid)
+ return -EINVAL;
+ return 0;
}
#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1
@@ -819,10 +867,12 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr)
/* used for f2fs_inode_info->flags */
enum {
FI_NEW_INODE, /* indicate newly allocated inode */
- FI_NEED_CP, /* need to do checkpoint during fsync */
+ FI_DIRTY_INODE, /* indicate inode is dirty or not */
FI_INC_LINK, /* need to increment i_nlink */
FI_ACL_MODE, /* indicate acl mode */
FI_NO_ALLOC, /* should not allocate any blocks */
+ FI_UPDATE_DIR, /* should update inode block for consistency */
+ FI_DELAY_IPUT, /* used for the recovery */
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -855,14 +905,21 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
return 0;
}
+static inline int f2fs_readonly(struct super_block *sb)
+{
+ return sb->s_flags & MS_RDONLY;
+}
+
/*
* file.c
*/
int f2fs_sync_file(struct file *, loff_t, loff_t, int);
void truncate_data_blocks(struct dnode_of_data *);
void f2fs_truncate(struct inode *);
+int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
int f2fs_setattr(struct dentry *, struct iattr *);
int truncate_hole(struct inode *, pgoff_t, pgoff_t);
+int truncate_data_blocks_range(struct dnode_of_data *, int);
long f2fs_ioctl(struct file *, unsigned int, unsigned long);
long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -872,6 +929,7 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
void f2fs_set_inode_flags(struct inode *);
struct inode *f2fs_iget(struct super_block *, unsigned long);
void update_inode(struct inode *, struct page *);
+int update_inode_page(struct inode *);
int f2fs_write_inode(struct inode *, struct writeback_control *);
void f2fs_evict_inode(struct inode *);
@@ -889,7 +947,6 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
struct page *, struct inode *);
-void init_dent_inode(const struct qstr *, struct page *);
int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
int f2fs_make_empty(struct inode *, struct inode *);
@@ -924,8 +981,8 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
int remove_inode_page(struct inode *);
-int new_inode_page(struct inode *, const struct qstr *);
-struct page *new_node_page(struct dnode_of_data *, unsigned int);
+struct page *new_inode_page(struct inode *, const struct qstr *);
+struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
void ra_node_page(struct f2fs_sb_info *, nid_t);
struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_node_page_ra(struct page *, int);
@@ -950,7 +1007,6 @@ void destroy_node_manager_caches(void);
*/
void f2fs_balance_fs(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
-void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
void clear_prefree_segments(struct f2fs_sb_info *);
int npages_for_summary_flush(struct f2fs_sb_info *);
void allocate_new_segments(struct f2fs_sb_info *);
@@ -973,7 +1029,6 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *,
int, unsigned int, int);
void flush_sit_entries(struct f2fs_sb_info *);
int build_segment_manager(struct f2fs_sb_info *);
-void reset_victim_segmap(struct f2fs_sb_info *);
void destroy_segment_manager(struct f2fs_sb_info *);
/*
@@ -988,7 +1043,9 @@ void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
int recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
void set_dirty_dir_page(struct inode *, struct page *);
+void add_dirty_dir_inode(struct inode *);
void remove_dirty_dir_inode(struct inode *);
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t);
void sync_dirty_dir_inodes(struct f2fs_sb_info *);
void write_checkpoint(struct f2fs_sb_info *, bool);
void init_orphan_info(struct f2fs_sb_info *);
@@ -1000,9 +1057,9 @@ void destroy_checkpoint_caches(void);
*/
int reserve_new_block(struct dnode_of_data *);
void update_extent_cache(block_t, struct dnode_of_data *);
-struct page *find_data_page(struct inode *, pgoff_t);
+struct page *find_data_page(struct inode *, pgoff_t, bool);
struct page *get_lock_data_page(struct inode *, pgoff_t);
-struct page *get_new_data_page(struct inode *, pgoff_t, bool);
+struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
int do_write_data_page(struct page *);
@@ -1020,7 +1077,7 @@ void destroy_gc_caches(void);
/*
* recovery.c
*/
-void recover_fsync_data(struct f2fs_sb_info *);
+int recover_fsync_data(struct f2fs_sb_info *);
bool space_for_roll_forward(struct f2fs_sb_info *);
/*
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 958a46da19ae..d2d2b7dbdcc1 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -13,6 +13,7 @@
#include <linux/stat.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/blkdev.h>
#include <linux/falloc.h>
#include <linux/types.h>
#include <linux/compat.h>
@@ -24,6 +25,7 @@
#include "segment.h"
#include "xattr.h"
#include "acl.h"
+#include <trace/events/f2fs.h>
static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
struct vm_fault *vmf)
@@ -33,19 +35,18 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
block_t old_blk_addr;
struct dnode_of_data dn;
- int err;
+ int err, ilock;
f2fs_balance_fs(sbi);
sb_start_pagefault(inode->i_sb);
- mutex_lock_op(sbi, DATA_NEW);
-
/* block allocation */
+ ilock = mutex_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, page->index, 0);
+ err = get_dnode_of_data(&dn, page->index, ALLOC_NODE);
if (err) {
- mutex_unlock_op(sbi, DATA_NEW);
+ mutex_unlock_op(sbi, ilock);
goto out;
}
@@ -55,17 +56,17 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
err = reserve_new_block(&dn);
if (err) {
f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, DATA_NEW);
+ mutex_unlock_op(sbi, ilock);
goto out;
}
}
f2fs_put_dnode(&dn);
+ mutex_unlock_op(sbi, ilock);
- mutex_unlock_op(sbi, DATA_NEW);
-
+ file_update_time(vma->vm_file);
lock_page(page);
if (page->mapping != inode->i_mapping ||
- page_offset(page) >= i_size_read(inode) ||
+ page_offset(page) > i_size_read(inode) ||
!PageUptodate(page)) {
unlock_page(page);
err = -EFAULT;
@@ -76,10 +77,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
* check to see if the page is mapped already (no holes)
*/
if (PageMappedToDisk(page))
- goto out;
-
- /* fill the page */
- wait_on_page_writeback(page);
+ goto mapped;
/* page is wholly or partially inside EOF */
if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
@@ -90,7 +88,9 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
set_page_dirty(page);
SetPageUptodate(page);
- file_update_time(vma->vm_file);
+mapped:
+ /* fill the page */
+ wait_on_page_writeback(page);
out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(err);
@@ -102,28 +102,28 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
.remap_pages = generic_file_remap_pages,
};
-static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode)
+static int get_parent_ino(struct inode *inode, nid_t *pino)
{
struct dentry *dentry;
- nid_t pino;
inode = igrab(inode);
dentry = d_find_any_alias(inode);
- if (!dentry) {
- iput(inode);
+ iput(inode);
+ if (!dentry)
return 0;
- }
- pino = dentry->d_parent->d_inode->i_ino;
+
+ inode = igrab(dentry->d_parent->d_inode);
dput(dentry);
+
+ *pino = inode->i_ino;
iput(inode);
- return !is_checkpointed_node(sbi, pino);
+ return 1;
}
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- unsigned long long cur_version;
int ret = 0;
bool need_cp = false;
struct writeback_control wbc = {
@@ -132,12 +132,15 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
.for_reclaim = 0,
};
- if (inode->i_sb->s_flags & MS_RDONLY)
+ if (f2fs_readonly(inode->i_sb))
return 0;
+ trace_f2fs_sync_file_enter(inode);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
+ if (ret) {
+ trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
return ret;
+ }
/* guarantee free sections for fsync */
f2fs_balance_fs(sbi);
@@ -147,40 +150,44 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
goto out;
- mutex_lock(&sbi->cp_mutex);
- cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
- mutex_unlock(&sbi->cp_mutex);
-
- if (F2FS_I(inode)->data_version != cur_version &&
- !(inode->i_state & I_DIRTY))
- goto out;
- F2FS_I(inode)->data_version--;
-
if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
need_cp = true;
- else if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP))
+ else if (file_wrong_pino(inode))
need_cp = true;
else if (!space_for_roll_forward(sbi))
need_cp = true;
- else if (need_to_sync_dir(sbi, inode))
+ else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
need_cp = true;
if (need_cp) {
+ nid_t pino;
+
/* all the dirty node pages should be flushed for POR */
ret = f2fs_sync_fs(inode->i_sb, 1);
- clear_inode_flag(F2FS_I(inode), FI_NEED_CP);
+ if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
+ get_parent_ino(inode, &pino)) {
+ F2FS_I(inode)->i_pino = pino;
+ file_got_pino(inode);
+ mark_inode_dirty_sync(inode);
+ ret = f2fs_write_inode(inode, NULL);
+ if (ret)
+ goto out;
+ }
} else {
/* if there is no written node page, write its inode page */
while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
+ mark_inode_dirty_sync(inode);
ret = f2fs_write_inode(inode, NULL);
if (ret)
goto out;
}
filemap_fdatawait_range(sbi->node_inode->i_mapping,
0, LONG_MAX);
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
}
out:
mutex_unlock(&inode->i_mutex);
+ trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
return ret;
}
@@ -191,7 +198,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
+int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
{
int nr_free = 0, ofs = dn->ofs_in_node;
struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
@@ -208,14 +215,17 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
update_extent_cache(NULL_ADDR, dn);
invalidate_blocks(sbi, blkaddr);
- dec_valid_block_count(sbi, dn->inode, 1);
nr_free++;
}
if (nr_free) {
+ dec_valid_block_count(sbi, dn->inode, nr_free);
set_page_dirty(dn->node_page);
sync_inode_page(dn);
}
dn->ofs_in_node = ofs;
+
+ trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid,
+ dn->ofs_in_node, nr_free);
return nr_free;
}
@@ -232,11 +242,15 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
if (!offset)
return;
- page = find_data_page(inode, from >> PAGE_CACHE_SHIFT);
+ page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false);
if (IS_ERR(page))
return;
lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ f2fs_put_page(page, 1);
+ return;
+ }
wait_on_page_writeback(page);
zero_user(page, offset, PAGE_CACHE_SIZE - offset);
set_page_dirty(page);
@@ -249,20 +263,22 @@ static int truncate_blocks(struct inode *inode, u64 from)
unsigned int blocksize = inode->i_sb->s_blocksize;
struct dnode_of_data dn;
pgoff_t free_from;
- int count = 0;
+ int count = 0, ilock = -1;
int err;
+ trace_f2fs_truncate_blocks_enter(inode, from);
+
free_from = (pgoff_t)
((from + blocksize - 1) >> (sbi->log_blocksize));
- mutex_lock_op(sbi, DATA_TRUNC);
-
+ ilock = mutex_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, free_from, RDONLY_NODE);
+ err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
if (err) {
if (err == -ENOENT)
goto free_next;
- mutex_unlock_op(sbi, DATA_TRUNC);
+ mutex_unlock_op(sbi, ilock);
+ trace_f2fs_truncate_blocks_exit(inode, err);
return err;
}
@@ -273,6 +289,7 @@ static int truncate_blocks(struct inode *inode, u64 from)
count -= dn.ofs_in_node;
BUG_ON(count < 0);
+
if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
truncate_data_blocks_range(&dn, count);
free_from += count;
@@ -281,11 +298,12 @@ static int truncate_blocks(struct inode *inode, u64 from)
f2fs_put_dnode(&dn);
free_next:
err = truncate_inode_blocks(inode, free_from);
- mutex_unlock_op(sbi, DATA_TRUNC);
+ mutex_unlock_op(sbi, ilock);
/* lastly zero out the first data page */
truncate_partial_data_page(inode, from);
+ trace_f2fs_truncate_blocks_exit(inode, err);
return err;
}
@@ -295,13 +313,15 @@ void f2fs_truncate(struct inode *inode)
S_ISLNK(inode->i_mode)))
return;
+ trace_f2fs_truncate(inode);
+
if (!truncate_blocks(inode, i_size_read(inode))) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
}
}
-static int f2fs_getattr(struct vfsmount *mnt,
+int f2fs_getattr(struct vfsmount *mnt,
struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
@@ -389,15 +409,16 @@ static void fill_zero(struct inode *inode, pgoff_t index,
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *page;
+ int ilock;
if (!len)
return;
f2fs_balance_fs(sbi);
- mutex_lock_op(sbi, DATA_NEW);
- page = get_new_data_page(inode, index, false);
- mutex_unlock_op(sbi, DATA_NEW);
+ ilock = mutex_lock_op(sbi);
+ page = get_new_data_page(inode, NULL, index, false);
+ mutex_unlock_op(sbi, ilock);
if (!IS_ERR(page)) {
wait_on_page_writeback(page);
@@ -414,15 +435,10 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
for (index = pg_start; index < pg_end; index++) {
struct dnode_of_data dn;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-
- f2fs_balance_fs(sbi);
- mutex_lock_op(sbi, DATA_TRUNC);
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+ err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
if (err) {
- mutex_unlock_op(sbi, DATA_TRUNC);
if (err == -ENOENT)
continue;
return err;
@@ -431,7 +447,6 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
if (dn.data_blkaddr != NULL_ADDR)
truncate_data_blocks_range(&dn, 1);
f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, DATA_TRUNC);
}
return 0;
}
@@ -461,12 +476,19 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
if (pg_start < pg_end) {
struct address_space *mapping = inode->i_mapping;
loff_t blk_start, blk_end;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ int ilock;
+
+ f2fs_balance_fs(sbi);
blk_start = pg_start << PAGE_CACHE_SHIFT;
blk_end = pg_end << PAGE_CACHE_SHIFT;
truncate_inode_pages_range(mapping, blk_start,
blk_end - 1);
+
+ ilock = mutex_lock_op(sbi);
ret = truncate_hole(inode, pg_start, pg_end);
+ mutex_unlock_op(sbi, ilock);
}
}
@@ -500,13 +522,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
for (index = pg_start; index <= pg_end; index++) {
struct dnode_of_data dn;
+ int ilock;
- mutex_lock_op(sbi, DATA_NEW);
-
+ ilock = mutex_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, index, 0);
+ ret = get_dnode_of_data(&dn, index, ALLOC_NODE);
if (ret) {
- mutex_unlock_op(sbi, DATA_NEW);
+ mutex_unlock_op(sbi, ilock);
break;
}
@@ -514,13 +536,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
ret = reserve_new_block(&dn);
if (ret) {
f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, DATA_NEW);
+ mutex_unlock_op(sbi, ilock);
break;
}
}
f2fs_put_dnode(&dn);
-
- mutex_unlock_op(sbi, DATA_NEW);
+ mutex_unlock_op(sbi, ilock);
if (pg_start == pg_end)
new_size = offset + len;
@@ -559,6 +580,7 @@ static long f2fs_fallocate(struct file *file, int mode,
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
}
+ trace_f2fs_fallocate(inode, mode, offset, len, ret);
return ret;
}
@@ -583,14 +605,14 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
int ret;
switch (cmd) {
- case FS_IOC_GETFLAGS:
+ case F2FS_IOC_GETFLAGS:
flags = fi->i_flags & FS_FL_USER_VISIBLE;
return put_user(flags, (int __user *) arg);
- case FS_IOC_SETFLAGS:
+ case F2FS_IOC_SETFLAGS:
{
unsigned int oldflags;
- ret = mnt_want_write(filp->f_path.mnt);
+ ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -627,7 +649,7 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return ret;
}
default:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 94b8a0c48453..35f9b1a196aa 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -11,7 +11,6 @@
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
-#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/f2fs_fs.h>
#include <linux/kthread.h>
@@ -23,6 +22,7 @@
#include "node.h"
#include "segment.h"
#include "gc.h"
+#include <trace/events/f2fs.h>
static struct kmem_cache *winode_slab;
@@ -76,14 +76,13 @@ static int gc_thread_func(void *data)
else
wait_ms = increase_sleep_time(wait_ms);
+#ifdef CONFIG_F2FS_STAT_FS
sbi->bg_gc++;
+#endif
/* if return value is not zero, no victim was selected */
if (f2fs_gc(sbi))
wait_ms = GC_THREAD_NOGC_SLEEP_TIME;
- else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME)
- wait_ms = GC_THREAD_MAX_SLEEP_TIME;
-
} while (!kthread_should_stop());
return 0;
}
@@ -92,23 +91,28 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th;
dev_t dev = sbi->sb->s_bdev->bd_dev;
+ int err = 0;
if (!test_opt(sbi, BG_GC))
- return 0;
+ goto out;
gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
- if (!gc_th)
- return -ENOMEM;
+ if (!gc_th) {
+ err = -ENOMEM;
+ goto out;
+ }
sbi->gc_thread = gc_th;
init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
+ err = PTR_ERR(gc_th->f2fs_gc_task);
kfree(gc_th);
sbi->gc_thread = NULL;
- return -ENOMEM;
}
- return 0;
+
+out:
+ return err;
}
void stop_gc_thread(struct f2fs_sb_info *sbi)
@@ -131,7 +135,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- if (p->alloc_mode) {
+ if (p->alloc_mode == SSR) {
p->gc_mode = GC_GREEDY;
p->dirty_segmap = dirty_i->dirty_segmap[type];
p->ofs_unit = 1;
@@ -160,18 +164,21 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned int segno;
+ unsigned int hint = 0;
+ unsigned int secno;
/*
* If the gc_type is FG_GC, we can select victim segments
* selected by background GC before.
* Those segments guarantee they have small valid blocks.
*/
- segno = find_next_bit(dirty_i->victim_segmap[BG_GC],
- TOTAL_SEGS(sbi), 0);
- if (segno < TOTAL_SEGS(sbi)) {
- clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
- return segno;
+next:
+ secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++);
+ if (secno < TOTAL_SECS(sbi)) {
+ if (sec_usage_check(sbi, secno))
+ goto next;
+ clear_bit(secno, dirty_i->victim_secmap);
+ return secno * sbi->segs_per_sec;
}
return NULL_SEGNO;
}
@@ -222,7 +229,7 @@ static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno,
}
/*
- * This function is called from two pathes.
+ * This function is called from two paths.
* One is garbage collection and the other is SSR segment selection.
* When it is called during GC, it just gets a victim segment
* and it does not remove it from dirty seglist.
@@ -234,14 +241,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct victim_sel_policy p;
- unsigned int segno;
+ unsigned int secno, max_cost;
int nsearched = 0;
p.alloc_mode = alloc_mode;
select_policy(sbi, gc_type, type, &p);
p.min_segno = NULL_SEGNO;
- p.min_cost = get_max_cost(sbi, &p);
+ p.min_cost = max_cost = get_max_cost(sbi, &p);
mutex_lock(&dirty_i->seglist_lock);
@@ -253,6 +260,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
while (1) {
unsigned long cost;
+ unsigned int segno;
segno = find_next_bit(p.dirty_segmap,
TOTAL_SEGS(sbi), p.offset);
@@ -265,13 +273,11 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
break;
}
p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit;
+ secno = GET_SECNO(sbi, segno);
- if (test_bit(segno, dirty_i->victim_segmap[FG_GC]))
- continue;
- if (gc_type == BG_GC &&
- test_bit(segno, dirty_i->victim_segmap[BG_GC]))
+ if (sec_usage_check(sbi, secno))
continue;
- if (IS_CURSEC(sbi, GET_SECNO(sbi, segno)))
+ if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
continue;
cost = get_gc_cost(sbi, segno, &p);
@@ -281,7 +287,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
p.min_cost = cost;
}
- if (cost == get_max_cost(sbi, &p))
+ if (cost == max_cost)
continue;
if (nsearched++ >= MAX_VICTIM_SEARCH) {
@@ -289,15 +295,20 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
break;
}
}
-got_it:
if (p.min_segno != NULL_SEGNO) {
- *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+got_it:
if (p.alloc_mode == LFS) {
- int i;
- for (i = 0; i < p.ofs_unit; i++)
- set_bit(*result + i,
- dirty_i->victim_segmap[gc_type]);
+ secno = GET_SECNO(sbi, p.min_segno);
+ if (gc_type == FG_GC)
+ sbi->cur_victim_sec = secno;
+ else
+ set_bit(secno, dirty_i->victim_secmap);
}
+ *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+
+ trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
+ sbi->cur_victim_sec,
+ prefree_segments(sbi), free_segments(sbi));
}
mutex_unlock(&dirty_i->seglist_lock);
@@ -310,28 +321,21 @@ static const struct victim_selection default_v_ops = {
static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
{
- struct list_head *this;
struct inode_entry *ie;
- list_for_each(this, ilist) {
- ie = list_entry(this, struct inode_entry, list);
+ list_for_each_entry(ie, ilist, list)
if (ie->inode->i_ino == ino)
return ie->inode;
- }
return NULL;
}
static void add_gc_inode(struct inode *inode, struct list_head *ilist)
{
- struct list_head *this;
- struct inode_entry *new_ie, *ie;
+ struct inode_entry *new_ie;
- list_for_each(this, ilist) {
- ie = list_entry(this, struct inode_entry, list);
- if (ie->inode == inode) {
- iput(inode);
- return;
- }
+ if (inode == find_gc_inode(inode->i_ino, ilist)) {
+ iput(inode);
+ return;
}
repeat:
new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
@@ -381,6 +385,7 @@ static void gc_node_segment(struct f2fs_sb_info *sbi,
next_step:
entry = sum;
+
for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
nid_t nid = le32_to_cpu(entry->nid);
struct page *node_page;
@@ -401,11 +406,18 @@ next_step:
continue;
/* set page dirty and write it */
- if (!PageWriteback(node_page))
+ if (gc_type == FG_GC) {
+ f2fs_submit_bio(sbi, NODE, true);
+ wait_on_page_writeback(node_page);
set_page_dirty(node_page);
+ } else {
+ if (!PageWriteback(node_page))
+ set_page_dirty(node_page);
+ }
f2fs_put_page(node_page, 1);
stat_inc_node_blk_count(sbi, 1);
}
+
if (initial) {
initial = false;
goto next_step;
@@ -418,6 +430,13 @@ next_step:
.for_reclaim = 0,
};
sync_node_pages(sbi, 0, &wbc);
+
+ /*
+ * In the case of FG_GC, it'd be better to reclaim this victim
+ * completely.
+ */
+ if (get_valid_blocks(sbi, segno, 1) != 0)
+ goto next_step;
}
}
@@ -481,21 +500,19 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
static void move_data_page(struct inode *inode, struct page *page, int gc_type)
{
- if (page->mapping != inode->i_mapping)
- goto out;
-
- if (inode != page->mapping->host)
- goto out;
-
- if (PageWriteback(page))
- goto out;
-
if (gc_type == BG_GC) {
+ if (PageWriteback(page))
+ goto out;
set_page_dirty(page);
set_cold_data(page);
} else {
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- mutex_lock_op(sbi, DATA_WRITE);
+
+ if (PageWriteback(page)) {
+ f2fs_submit_bio(sbi, DATA, true);
+ wait_on_page_writeback(page);
+ }
+
if (clear_page_dirty_for_io(page) &&
S_ISDIR(inode->i_mode)) {
dec_page_count(sbi, F2FS_DIRTY_DENTS);
@@ -503,7 +520,6 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
}
set_cold_data(page);
do_write_data_page(page);
- mutex_unlock_op(sbi, DATA_WRITE);
clear_cold_data(page);
}
out:
@@ -530,6 +546,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
next_step:
entry = sum;
+
for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
struct page *data_page;
struct inode *inode;
@@ -567,7 +584,7 @@ next_step:
continue;
data_page = find_data_page(inode,
- start_bidx + ofs_in_node);
+ start_bidx + ofs_in_node, false);
if (IS_ERR(data_page))
goto next_iput;
@@ -588,11 +605,22 @@ next_step:
next_iput:
iput(inode);
}
+
if (++phase < 4)
goto next_step;
- if (gc_type == FG_GC)
+ if (gc_type == FG_GC) {
f2fs_submit_bio(sbi, DATA, true);
+
+ /*
+ * In the case of FG_GC, it'd be better to reclaim this victim
+ * completely.
+ */
+ if (get_valid_blocks(sbi, segno, 1) != 0) {
+ phase = 2;
+ goto next_step;
+ }
+ }
}
static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
@@ -611,18 +639,15 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
{
struct page *sum_page;
struct f2fs_summary_block *sum;
+ struct blk_plug plug;
/* read segment summary of victim */
sum_page = get_sum_page(sbi, segno);
if (IS_ERR(sum_page))
return;
- /*
- * CP needs to lock sum_page. In this time, we don't need
- * to lock this page, because this summary page is not gone anywhere.
- * Also, this page is not gonna be updated before GC is done.
- */
- unlock_page(sum_page);
+ blk_start_plug(&plug);
+
sum = page_address(sum_page);
switch (GET_SUM_TYPE((&sum->footer))) {
@@ -633,10 +658,12 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
gc_data_segment(sbi, sum->entries, ilist, segno, gc_type);
break;
}
+ blk_finish_plug(&plug);
+
stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
stat_inc_call_count(sbi->stat_info);
- f2fs_put_page(sum_page, 0);
+ f2fs_put_page(sum_page, 1);
}
int f2fs_gc(struct f2fs_sb_info *sbi)
@@ -652,8 +679,10 @@ gc_more:
if (!(sbi->sb->s_flags & MS_ACTIVE))
goto stop;
- if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree))
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
gc_type = FG_GC;
+ write_checkpoint(sbi, false);
+ }
if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
goto stop;
@@ -662,9 +691,11 @@ gc_more:
for (i = 0; i < sbi->segs_per_sec; i++)
do_garbage_collect(sbi, segno + i, &ilist, gc_type);
- if (gc_type == FG_GC &&
- get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
+ if (gc_type == FG_GC) {
+ sbi->cur_victim_sec = NULL_SEGNO;
nfree++;
+ WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec));
+ }
if (has_not_enough_free_secs(sbi, nfree))
goto gc_more;
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 30b2db003acd..2c6a6bd08322 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -13,9 +13,9 @@
* whether IO subsystem is idle
* or not
*/
-#define GC_THREAD_MIN_SLEEP_TIME 10000 /* milliseconds */
-#define GC_THREAD_MAX_SLEEP_TIME 30000
-#define GC_THREAD_NOGC_SLEEP_TIME 10000
+#define GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */
+#define GC_THREAD_MAX_SLEEP_TIME 60000
+#define GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */
#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
@@ -58,6 +58,9 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
static inline long increase_sleep_time(long wait)
{
+ if (wait == GC_THREAD_NOGC_SLEEP_TIME)
+ return wait;
+
wait += GC_THREAD_MIN_SLEEP_TIME;
if (wait > GC_THREAD_MAX_SLEEP_TIME)
wait = GC_THREAD_MAX_SLEEP_TIME;
@@ -66,6 +69,9 @@ static inline long increase_sleep_time(long wait)
static inline long decrease_sleep_time(long wait)
{
+ if (wait == GC_THREAD_NOGC_SLEEP_TIME)
+ wait = GC_THREAD_MAX_SLEEP_TIME;
+
wait -= GC_THREAD_MIN_SLEEP_TIME;
if (wait <= GC_THREAD_MIN_SLEEP_TIME)
wait = GC_THREAD_MIN_SLEEP_TIME;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index ddae412d30c8..2b2d45d19e3e 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -16,6 +16,8 @@
#include "f2fs.h"
#include "node.h"
+#include <trace/events/f2fs.h>
+
void f2fs_set_inode_flags(struct inode *inode)
{
unsigned int flags = F2FS_I(inode)->i_flags;
@@ -44,7 +46,11 @@ static int do_read_inode(struct inode *inode)
struct f2fs_inode *ri;
/* Check if ino is within scope */
- check_nid_range(sbi, inode->i_ino);
+ if (check_nid_range(sbi, inode->i_ino)) {
+ f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu",
+ (unsigned long) inode->i_ino);
+ return -EINVAL;
+ }
node_page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(node_page))
@@ -76,7 +82,6 @@ static int do_read_inode(struct inode *inode)
fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
fi->i_flags = le32_to_cpu(ri->i_flags);
fi->flags = 0;
- fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1;
fi->i_advise = ri->i_advise;
fi->i_pino = le32_to_cpu(ri->i_pino);
get_extent_info(&fi->ext, ri->i_ext);
@@ -88,25 +93,22 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct inode *inode;
- int ret;
+ int ret = 0;
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+
+ if (!(inode->i_state & I_NEW)) {
+ trace_f2fs_iget(inode);
return inode;
+ }
if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
goto make_now;
ret = do_read_inode(inode);
if (ret)
goto bad_inode;
-
- if (!sbi->por_doing && inode->i_nlink == 0) {
- ret = -ENOENT;
- goto bad_inode;
- }
-
make_now:
if (ino == F2FS_NODE_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_node_aops;
@@ -122,8 +124,7 @@ make_now:
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE |
- __GFP_ZERO);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &f2fs_symlink_inode_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -136,11 +137,12 @@ make_now:
goto bad_inode;
}
unlock_new_inode(inode);
-
+ trace_f2fs_iget(inode);
return inode;
bad_inode:
iget_failed(inode);
+ trace_f2fs_iget_exit(inode, ret);
return ERR_PTR(ret);
}
@@ -190,49 +192,57 @@ void update_inode(struct inode *inode, struct page *node_page)
set_cold_node(inode, node_page);
set_page_dirty(node_page);
+ clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
}
-int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
+int update_inode_page(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *node_page;
- bool need_lock = false;
-
- if (inode->i_ino == F2FS_NODE_INO(sbi) ||
- inode->i_ino == F2FS_META_INO(sbi))
- return 0;
-
- if (wbc)
- f2fs_balance_fs(sbi);
node_page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(node_page))
return PTR_ERR(node_page);
- if (!PageDirty(node_page)) {
- need_lock = true;
- f2fs_put_page(node_page, 1);
- mutex_lock(&sbi->write_inode);
- node_page = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(node_page)) {
- mutex_unlock(&sbi->write_inode);
- return PTR_ERR(node_page);
- }
- }
update_inode(inode, node_page);
f2fs_put_page(node_page, 1);
- if (need_lock)
- mutex_unlock(&sbi->write_inode);
return 0;
}
+int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ int ret, ilock;
+
+ if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+ inode->i_ino == F2FS_META_INO(sbi))
+ return 0;
+
+ if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
+ return 0;
+
+ if (wbc)
+ f2fs_balance_fs(sbi);
+
+ /*
+ * We need to lock here to prevent dirty node pages from being produced
+ * during the urgent cleaning time when running out of free sections.
+ */
+ ilock = mutex_lock_op(sbi);
+ ret = update_inode_page(inode);
+ mutex_unlock_op(sbi, ilock);
+ return ret;
+}
+
/*
* Called at the last iput() if i_nlink is zero
*/
void f2fs_evict_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ int ilock;
+ trace_f2fs_evict_inode(inode);
truncate_inode_pages(&inode->i_data, 0);
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
@@ -252,7 +262,10 @@ void f2fs_evict_inode(struct inode *inode)
if (F2FS_HAS_BLOCKS(inode))
f2fs_truncate(inode);
+ ilock = mutex_lock_op(sbi);
remove_inode_page(inode);
+ mutex_unlock_op(sbi, ilock);
+
sb_end_intwrite(inode->i_sb);
no_delete:
clear_inode(inode);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 1a49b881bac0..64c07169df05 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -15,8 +15,10 @@
#include <linux/ctype.h>
#include "f2fs.h"
+#include "node.h"
#include "xattr.h"
#include "acl.h"
+#include <trace/events/f2fs.h>
static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
{
@@ -25,19 +27,19 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
nid_t ino;
struct inode *inode;
bool nid_free = false;
- int err;
+ int err, ilock;
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
- mutex_lock_op(sbi, NODE_NEW);
+ ilock = mutex_lock_op(sbi);
if (!alloc_nid(sbi, &ino)) {
- mutex_unlock_op(sbi, NODE_NEW);
+ mutex_unlock_op(sbi, ilock);
err = -ENOSPC;
goto fail;
}
- mutex_unlock_op(sbi, NODE_NEW);
+ mutex_unlock_op(sbi, ilock);
inode->i_uid = current_fsuid();
@@ -61,7 +63,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
nid_free = true;
goto out;
}
-
+ trace_f2fs_new_inode(inode, 0);
mark_inode_dirty(inode);
return inode;
@@ -69,6 +71,8 @@ out:
clear_nlink(inode);
unlock_new_inode(inode);
fail:
+ trace_f2fs_new_inode(inode, err);
+ make_bad_inode(inode);
iput(inode);
if (nid_free)
alloc_nid_failed(sbi, ino);
@@ -82,7 +86,7 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
int ret;
if (sublen > slen)
- return 1;
+ return 0;
ret = memcmp(s + slen - sublen, sub, sublen);
if (ret) { /* compare upper case */
@@ -90,16 +94,16 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
char upper_sub[8];
for (i = 0; i < sublen && i < sizeof(upper_sub); i++)
upper_sub[i] = toupper(sub[i]);
- return memcmp(s + slen - sublen, upper_sub, sublen);
+ return !memcmp(s + slen - sublen, upper_sub, sublen);
}
- return ret;
+ return !ret;
}
/*
* Set multimedia files as cold files for hot/cold data separation
*/
-static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode,
+static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
const unsigned char *name)
{
int i;
@@ -107,8 +111,8 @@ static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode,
int count = le32_to_cpu(sbi->raw_super->extension_count);
for (i = 0; i < count; i++) {
- if (!is_multimedia_file(name, extlist[i])) {
- F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
+ if (is_multimedia_file(name, extlist[i])) {
+ file_set_cold(inode);
break;
}
}
@@ -121,7 +125,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct inode *inode;
nid_t ino = 0;
- int err;
+ int err, ilock;
f2fs_balance_fs(sbi);
@@ -130,26 +134,28 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
return PTR_ERR(inode);
if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
- set_cold_file(sbi, inode, dentry->d_name.name);
+ set_cold_files(sbi, inode, dentry->d_name.name);
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
ino = inode->i_ino;
+ ilock = mutex_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
+ mutex_unlock_op(sbi, ilock);
if (err)
goto out;
alloc_nid_done(sbi, ino);
- if (!sbi->por_doing)
- d_instantiate(dentry, inode);
+ d_instantiate(dentry, inode);
unlock_new_inode(inode);
return 0;
out:
clear_nlink(inode);
unlock_new_inode(inode);
+ make_bad_inode(inode);
iput(inode);
alloc_nid_failed(sbi, ino);
return err;
@@ -161,15 +167,17 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
struct inode *inode = old_dentry->d_inode;
struct super_block *sb = dir->i_sb;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
- int err;
+ int err, ilock;
f2fs_balance_fs(sbi);
inode->i_ctime = CURRENT_TIME;
- atomic_inc(&inode->i_count);
+ ihold(inode);
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ ilock = mutex_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
+ mutex_unlock_op(sbi, ilock);
if (err)
goto out;
@@ -197,7 +205,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
struct f2fs_dir_entry *de;
struct page *page;
- if (dentry->d_name.len > F2FS_MAX_NAME_LEN)
+ if (dentry->d_name.len > F2FS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
de = f2fs_find_entry(dir, &dentry->d_name, &page);
@@ -222,7 +230,9 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
struct f2fs_dir_entry *de;
struct page *page;
int err = -ENOENT;
+ int ilock;
+ trace_f2fs_unlink_enter(dir, dentry);
f2fs_balance_fs(sbi);
de = f2fs_find_entry(dir, &dentry->d_name, &page);
@@ -236,11 +246,14 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
goto fail;
}
+ ilock = mutex_lock_op(sbi);
f2fs_delete_entry(de, page, inode);
+ mutex_unlock_op(sbi, ilock);
/* In order to evict this inode, we set it dirty */
mark_inode_dirty(inode);
fail:
+ trace_f2fs_unlink_exit(inode, err);
return err;
}
@@ -251,7 +264,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct inode *inode;
size_t symlen = strlen(symname) + 1;
- int err;
+ int err, ilock;
f2fs_balance_fs(sbi);
@@ -262,7 +275,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &f2fs_symlink_inode_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ ilock = mutex_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
+ mutex_unlock_op(sbi, ilock);
if (err)
goto out;
@@ -275,6 +290,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
out:
clear_nlink(inode);
unlock_new_inode(inode);
+ make_bad_inode(inode);
iput(inode);
alloc_nid_failed(sbi, inode->i_ino);
return err;
@@ -284,7 +300,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
struct inode *inode;
- int err;
+ int err, ilock;
f2fs_balance_fs(sbi);
@@ -298,7 +314,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ ilock = mutex_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
+ mutex_unlock_op(sbi, ilock);
if (err)
goto out_fail;
@@ -313,6 +331,7 @@ out_fail:
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
clear_nlink(inode);
unlock_new_inode(inode);
+ make_bad_inode(inode);
iput(inode);
alloc_nid_failed(sbi, inode->i_ino);
return err;
@@ -333,6 +352,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct inode *inode;
int err = 0;
+ int ilock;
if (!new_valid_dev(rdev))
return -EINVAL;
@@ -346,7 +366,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &f2fs_special_inode_operations;
+ ilock = mutex_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
+ mutex_unlock_op(sbi, ilock);
if (err)
goto out;
@@ -357,6 +379,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
out:
clear_nlink(inode);
unlock_new_inode(inode);
+ make_bad_inode(inode);
iput(inode);
alloc_nid_failed(sbi, inode->i_ino);
return err;
@@ -374,7 +397,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
- int err = -ENOENT;
+ int err = -ENOENT, ilock = -1;
f2fs_balance_fs(sbi);
@@ -389,7 +412,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_old;
}
- mutex_lock_op(sbi, RENAME);
+ ilock = mutex_lock_op(sbi);
if (new_inode) {
struct page *new_page;
@@ -412,7 +435,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
drop_nlink(new_inode);
if (!new_inode->i_nlink)
add_orphan_inode(sbi, new_inode->i_ino);
- f2fs_write_inode(new_inode, NULL);
+ update_inode_page(new_inode);
} else {
err = f2fs_add_link(new_dentry, old_inode);
if (err)
@@ -420,12 +443,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dir_entry) {
inc_nlink(new_dir);
- f2fs_write_inode(new_dir, NULL);
+ update_inode_page(new_dir);
}
}
old_inode->i_ctime = CURRENT_TIME;
- set_inode_flag(F2FS_I(old_inode), FI_NEED_CP);
mark_inode_dirty(old_inode);
f2fs_delete_entry(old_entry, old_page, NULL);
@@ -439,10 +461,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_put_page(old_dir_page, 0);
}
drop_nlink(old_dir);
- f2fs_write_inode(old_dir, NULL);
+ update_inode_page(old_dir);
}
- mutex_unlock_op(sbi, RENAME);
+ mutex_unlock_op(sbi, ilock);
return 0;
out_dir:
@@ -450,7 +472,7 @@ out_dir:
kunmap(old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
- mutex_unlock_op(sbi, RENAME);
+ mutex_unlock_op(sbi, ilock);
out_old:
kunmap(old_page);
f2fs_put_page(old_page, 0);
@@ -468,6 +490,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
.rmdir = f2fs_rmdir,
.mknod = f2fs_mknod,
.rename = f2fs_rename,
+ .getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
#ifdef CONFIG_F2FS_FS_XATTR
@@ -482,6 +505,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
+ .getattr = f2fs_getattr,
.setattr = f2fs_setattr,
#ifdef CONFIG_F2FS_FS_XATTR
.setxattr = generic_setxattr,
@@ -492,6 +516,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
};
const struct inode_operations f2fs_special_inode_operations = {
+ .getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
#ifdef CONFIG_F2FS_FS_XATTR
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index e275218904ed..b418aee09573 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -19,6 +19,7 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include <trace/events/f2fs.h>
static struct kmem_cache *nat_entry_slab;
static struct kmem_cache *free_nid_slab;
@@ -88,10 +89,13 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
{
struct address_space *mapping = sbi->meta_inode->i_mapping;
struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct blk_plug plug;
struct page *page;
pgoff_t index;
int i;
+ blk_start_plug(&plug);
+
for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
if (nid >= nm_i->max_nid)
nid = 0;
@@ -100,12 +104,16 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
page = grab_cache_page(mapping, index);
if (!page)
continue;
- if (f2fs_readpage(sbi, page, index, READ)) {
+ if (PageUptodate(page)) {
f2fs_put_page(page, 1);
continue;
}
+ if (f2fs_readpage(sbi, page, index, READ))
+ continue;
+
f2fs_put_page(page, 0);
}
+ blk_finish_plug(&plug);
}
static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
@@ -236,7 +244,7 @@ static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
- if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD)
+ if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD)
return 0;
write_lock(&nm_i->nat_tree_lock);
@@ -320,15 +328,14 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
noffset[0] = 0;
if (block < direct_index) {
- offset[n++] = block;
- level = 0;
+ offset[n] = block;
goto got;
}
block -= direct_index;
if (block < direct_blks) {
offset[n++] = NODE_DIR1_BLOCK;
noffset[n] = 1;
- offset[n++] = block;
+ offset[n] = block;
level = 1;
goto got;
}
@@ -336,7 +343,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
if (block < direct_blks) {
offset[n++] = NODE_DIR2_BLOCK;
noffset[n] = 2;
- offset[n++] = block;
+ offset[n] = block;
level = 1;
goto got;
}
@@ -346,7 +353,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
noffset[n] = 3;
offset[n++] = block / direct_blks;
noffset[n] = 4 + offset[n - 1];
- offset[n++] = block % direct_blks;
+ offset[n] = block % direct_blks;
level = 2;
goto got;
}
@@ -356,7 +363,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
noffset[n] = 4 + dptrs_per_blk;
offset[n++] = block / direct_blks;
noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
- offset[n++] = block % direct_blks;
+ offset[n] = block % direct_blks;
level = 2;
goto got;
}
@@ -371,7 +378,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
noffset[n] = 7 + (dptrs_per_blk * 2) +
offset[n - 2] * (dptrs_per_blk + 1) +
offset[n - 1];
- offset[n++] = block % direct_blks;
+ offset[n] = block % direct_blks;
level = 3;
goto got;
} else {
@@ -383,8 +390,11 @@ got:
/*
* Caller should call f2fs_put_dnode(dn).
+ * Also, it should grab and release a mutex by calling mutex_lock_op() and
+ * mutex_unlock_op() only if mode is not set to RDONLY_NODE.
+ * In the case of RDONLY_NODE, we don't need to care about mutex.
*/
-int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
+int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
{
struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct page *npage[4];
@@ -398,12 +408,16 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
level = get_node_path(index, offset, noffset);
nids[0] = dn->inode->i_ino;
- npage[0] = get_node_page(sbi, nids[0]);
- if (IS_ERR(npage[0]))
- return PTR_ERR(npage[0]);
+ npage[0] = dn->inode_page;
+ if (!npage[0]) {
+ npage[0] = get_node_page(sbi, nids[0]);
+ if (IS_ERR(npage[0]))
+ return PTR_ERR(npage[0]);
+ }
parent = npage[0];
- nids[1] = get_nid(parent, offset[0], true);
+ if (level != 0)
+ nids[1] = get_nid(parent, offset[0], true);
dn->inode_page = npage[0];
dn->inode_page_locked = true;
@@ -411,30 +425,25 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
for (i = 1; i <= level; i++) {
bool done = false;
- if (!nids[i] && !ro) {
- mutex_lock_op(sbi, NODE_NEW);
-
+ if (!nids[i] && mode == ALLOC_NODE) {
/* alloc new node */
if (!alloc_nid(sbi, &(nids[i]))) {
- mutex_unlock_op(sbi, NODE_NEW);
err = -ENOSPC;
goto release_pages;
}
dn->nid = nids[i];
- npage[i] = new_node_page(dn, noffset[i]);
+ npage[i] = new_node_page(dn, noffset[i], NULL);
if (IS_ERR(npage[i])) {
alloc_nid_failed(sbi, nids[i]);
- mutex_unlock_op(sbi, NODE_NEW);
err = PTR_ERR(npage[i]);
goto release_pages;
}
set_nid(parent, offset[i - 1], nids[i], i == 1);
alloc_nid_done(sbi, nids[i]);
- mutex_unlock_op(sbi, NODE_NEW);
done = true;
- } else if (ro && i == level && level > 1) {
+ } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
npage[i] = get_node_page_ra(parent, offset[i - 1]);
if (IS_ERR(npage[i])) {
err = PTR_ERR(npage[i]);
@@ -507,6 +516,7 @@ invalidate:
f2fs_put_page(dn->node_page, 1);
dn->node_page = NULL;
+ trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
}
static int truncate_dnode(struct dnode_of_data *dn)
@@ -547,9 +557,13 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
if (dn->nid == 0)
return NIDS_PER_BLOCK + 1;
+ trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
+
page = get_node_page(sbi, dn->nid);
- if (IS_ERR(page))
+ if (IS_ERR(page)) {
+ trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
return PTR_ERR(page);
+ }
rn = (struct f2fs_node *)page_address(page);
if (depth < 3) {
@@ -591,10 +605,12 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
} else {
f2fs_put_page(page, 1);
}
+ trace_f2fs_truncate_nodes_exit(dn->inode, freed);
return freed;
out_err:
f2fs_put_page(page, 1);
+ trace_f2fs_truncate_nodes_exit(dn->inode, ret);
return ret;
}
@@ -649,6 +665,9 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
fail:
for (i = depth - 3; i >= 0; i--)
f2fs_put_page(pages[i], 1);
+
+ trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
+
return err;
}
@@ -658,6 +677,7 @@ fail:
int truncate_inode_blocks(struct inode *inode, pgoff_t from)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct address_space *node_mapping = sbi->node_inode->i_mapping;
int err = 0, cont = 1;
int level, offset[4], noffset[4];
unsigned int nofs = 0;
@@ -665,11 +685,15 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
struct dnode_of_data dn;
struct page *page;
- level = get_node_path(from, offset, noffset);
+ trace_f2fs_truncate_inode_blocks_enter(inode, from);
+ level = get_node_path(from, offset, noffset);
+restart:
page = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(page))
+ if (IS_ERR(page)) {
+ trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
return PTR_ERR(page);
+ }
set_new_dnode(&dn, inode, page, NULL, 0);
unlock_page(page);
@@ -728,6 +752,10 @@ skip_partial:
if (offset[1] == 0 &&
rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) {
lock_page(page);
+ if (page->mapping != node_mapping) {
+ f2fs_put_page(page, 1);
+ goto restart;
+ }
wait_on_page_writeback(page);
rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
set_page_dirty(page);
@@ -739,9 +767,14 @@ skip_partial:
}
fail:
f2fs_put_page(page, 0);
+ trace_f2fs_truncate_inode_blocks_exit(inode, err);
return err > 0 ? 0 : err;
}
+/*
+ * Caller should grab and release a mutex by calling mutex_lock_op() and
+ * mutex_unlock_op().
+ */
int remove_inode_page(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -749,21 +782,16 @@ int remove_inode_page(struct inode *inode)
nid_t ino = inode->i_ino;
struct dnode_of_data dn;
- mutex_lock_op(sbi, NODE_TRUNC);
page = get_node_page(sbi, ino);
- if (IS_ERR(page)) {
- mutex_unlock_op(sbi, NODE_TRUNC);
+ if (IS_ERR(page))
return PTR_ERR(page);
- }
if (F2FS_I(inode)->i_xattr_nid) {
nid_t nid = F2FS_I(inode)->i_xattr_nid;
struct page *npage = get_node_page(sbi, nid);
- if (IS_ERR(npage)) {
- mutex_unlock_op(sbi, NODE_TRUNC);
+ if (IS_ERR(npage))
return PTR_ERR(npage);
- }
F2FS_I(inode)->i_xattr_nid = 0;
set_new_dnode(&dn, inode, page, npage, nid);
@@ -775,30 +803,22 @@ int remove_inode_page(struct inode *inode)
BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1);
set_new_dnode(&dn, inode, page, page, ino);
truncate_node(&dn);
-
- mutex_unlock_op(sbi, NODE_TRUNC);
return 0;
}
-int new_inode_page(struct inode *inode, const struct qstr *name)
+struct page *new_inode_page(struct inode *inode, const struct qstr *name)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- struct page *page;
struct dnode_of_data dn;
/* allocate inode page for new inode */
set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
- mutex_lock_op(sbi, NODE_NEW);
- page = new_node_page(&dn, 0);
- init_dent_inode(name, page);
- mutex_unlock_op(sbi, NODE_NEW);
- if (IS_ERR(page))
- return PTR_ERR(page);
- f2fs_put_page(page, 1);
- return 0;
+
+ /* caller should f2fs_put_page(page, 1); */
+ return new_node_page(&dn, 0, NULL);
}
-struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
+struct page *new_node_page(struct dnode_of_data *dn,
+ unsigned int ofs, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct address_space *mapping = sbi->node_inode->i_mapping;
@@ -831,7 +851,10 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
set_cold_node(dn->inode, page);
dn->node_page = page;
- sync_inode_page(dn);
+ if (ipage)
+ update_inode(dn->inode, ipage);
+ else
+ sync_inode_page(dn);
set_page_dirty(page);
if (ofs == 0)
inc_valid_inode_count(sbi);
@@ -844,6 +867,12 @@ fail:
return ERR_PTR(err);
}
+/*
+ * Caller should do the following, depending on the return value:
+ * 0: f2fs_put_page(page, 0)
+ * LOCKED_PAGE: f2fs_put_page(page, 1)
+ * error: nothing
+ */
static int read_node_page(struct page *page, int type)
{
struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
@@ -851,8 +880,14 @@ static int read_node_page(struct page *page, int type)
get_node_info(sbi, page->index, &ni);
- if (ni.blk_addr == NULL_ADDR)
+ if (ni.blk_addr == NULL_ADDR) {
+ f2fs_put_page(page, 1);
return -ENOENT;
+ }
+
+ if (PageUptodate(page))
+ return LOCKED_PAGE;
+
return f2fs_readpage(sbi, page, ni.blk_addr, type);
}
@@ -863,40 +898,53 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
{
struct address_space *mapping = sbi->node_inode->i_mapping;
struct page *apage;
+ int err;
apage = find_get_page(mapping, nid);
- if (apage && PageUptodate(apage))
- goto release_out;
+ if (apage && PageUptodate(apage)) {
+ f2fs_put_page(apage, 0);
+ return;
+ }
f2fs_put_page(apage, 0);
apage = grab_cache_page(mapping, nid);
if (!apage)
return;
- if (read_node_page(apage, READA))
- unlock_page(apage);
-
-release_out:
- f2fs_put_page(apage, 0);
+ err = read_node_page(apage, READA);
+ if (err == 0)
+ f2fs_put_page(apage, 0);
+ else if (err == LOCKED_PAGE)
+ f2fs_put_page(apage, 1);
return;
}
struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
{
- int err;
- struct page *page;
struct address_space *mapping = sbi->node_inode->i_mapping;
-
+ struct page *page;
+ int err;
+repeat:
page = grab_cache_page(mapping, nid);
if (!page)
return ERR_PTR(-ENOMEM);
err = read_node_page(page, READ_SYNC);
- if (err) {
- f2fs_put_page(page, 1);
+ if (err < 0)
return ERR_PTR(err);
- }
+ else if (err == LOCKED_PAGE)
+ goto got_it;
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
+ if (page->mapping != mapping) {
+ f2fs_put_page(page, 1);
+ goto repeat;
+ }
+got_it:
BUG_ON(nid != nid_of_node(page));
mark_page_accessed(page);
return page;
@@ -910,31 +958,27 @@ struct page *get_node_page_ra(struct page *parent, int start)
{
struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
struct address_space *mapping = sbi->node_inode->i_mapping;
- int i, end;
- int err = 0;
- nid_t nid;
+ struct blk_plug plug;
struct page *page;
+ int err, i, end;
+ nid_t nid;
/* First, try getting the desired direct node. */
nid = get_nid(parent, start, false);
if (!nid)
return ERR_PTR(-ENOENT);
-
- page = find_get_page(mapping, nid);
- if (page && PageUptodate(page))
- goto page_hit;
- f2fs_put_page(page, 0);
-
repeat:
page = grab_cache_page(mapping, nid);
if (!page)
return ERR_PTR(-ENOMEM);
- err = read_node_page(page, READA);
- if (err) {
- f2fs_put_page(page, 1);
+ err = read_node_page(page, READ_SYNC);
+ if (err < 0)
return ERR_PTR(err);
- }
+ else if (err == LOCKED_PAGE)
+ goto page_hit;
+
+ blk_start_plug(&plug);
/* Then, try readahead for siblings of the desired node */
end = start + MAX_RA_NODE;
@@ -946,18 +990,19 @@ repeat:
ra_node_page(sbi, nid);
}
-page_hit:
- lock_page(page);
- if (PageError(page)) {
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
- }
+ blk_finish_plug(&plug);
- /* Has the page been truncated? */
+ lock_page(page);
if (page->mapping != mapping) {
f2fs_put_page(page, 1);
goto repeat;
}
+page_hit:
+ if (!PageUptodate(page)) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
+ mark_page_accessed(page);
return page;
}
@@ -972,7 +1017,7 @@ void sync_inode_page(struct dnode_of_data *dn)
if (!dn->inode_page_locked)
unlock_page(dn->inode_page);
} else {
- f2fs_write_inode(dn->inode, NULL);
+ update_inode_page(dn->inode);
}
}
@@ -1087,17 +1132,8 @@ static int f2fs_write_node_page(struct page *page,
block_t new_addr;
struct node_info ni;
- if (wbc->for_reclaim) {
- dec_page_count(sbi, F2FS_DIRTY_NODES);
- wbc->pages_skipped++;
- set_page_dirty(page);
- return AOP_WRITEPAGE_ACTIVATE;
- }
-
wait_on_page_writeback(page);
- mutex_lock_op(sbi, NODE_WRITE);
-
/* get old block addr of this node page */
nid = nid_of_node(page);
BUG_ON(page->index != nid);
@@ -1105,17 +1141,25 @@ static int f2fs_write_node_page(struct page *page,
get_node_info(sbi, nid, &ni);
/* This page is already truncated */
- if (ni.blk_addr == NULL_ADDR)
+ if (ni.blk_addr == NULL_ADDR) {
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+ unlock_page(page);
return 0;
+ }
- set_page_writeback(page);
+ if (wbc->for_reclaim) {
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+ wbc->pages_skipped++;
+ set_page_dirty(page);
+ return AOP_WRITEPAGE_ACTIVATE;
+ }
- /* insert node offset */
+ mutex_lock(&sbi->node_write);
+ set_page_writeback(page);
write_node_page(sbi, page, nid, ni.blk_addr, &new_addr);
set_node_addr(sbi, &ni, new_addr);
dec_page_count(sbi, F2FS_DIRTY_NODES);
-
- mutex_unlock_op(sbi, NODE_WRITE);
+ mutex_unlock(&sbi->node_write);
unlock_page(page);
return 0;
}
@@ -1130,12 +1174,11 @@ static int f2fs_write_node_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
- struct block_device *bdev = sbi->sb->s_bdev;
long nr_to_write = wbc->nr_to_write;
/* First check balancing cached NAT entries */
if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) {
- write_checkpoint(sbi, false);
+ f2fs_sync_fs(sbi->sb, true);
return 0;
}
@@ -1144,10 +1187,9 @@ static int f2fs_write_node_pages(struct address_space *mapping,
return 0;
/* if mounting is failed, skip writing node pages */
- wbc->nr_to_write = bio_get_nr_vecs(bdev);
+ wbc->nr_to_write = max_hw_blocks(sbi);
sync_node_pages(sbi, 0, wbc);
- wbc->nr_to_write = nr_to_write -
- (bio_get_nr_vecs(bdev) - wbc->nr_to_write);
+ wbc->nr_to_write = nr_to_write - (max_hw_blocks(sbi) - wbc->nr_to_write);
return 0;
}
@@ -1166,7 +1208,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
return 0;
}
-static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -1178,7 +1221,7 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
static int f2fs_release_node_page(struct page *page, gfp_t wait)
{
ClearPagePrivate(page);
- return 0;
+ return 1;
}
/*
@@ -1195,14 +1238,13 @@ const struct address_space_operations f2fs_node_aops = {
static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head)
{
struct list_head *this;
- struct free_nid *i = NULL;
+ struct free_nid *i;
list_for_each(this, head) {
i = list_entry(this, struct free_nid, list);
if (i->nid == n)
- break;
- i = NULL;
+ return i;
}
- return i;
+ return NULL;
}
static void __del_from_free_nid_list(struct free_nid *i)
@@ -1211,11 +1253,29 @@ static void __del_from_free_nid_list(struct free_nid *i)
kmem_cache_free(free_nid_slab, i);
}
-static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
{
struct free_nid *i;
+ struct nat_entry *ne;
+ bool allocated = false;
if (nm_i->fcnt > 2 * MAX_FREE_NIDS)
+ return -1;
+
+ /* 0 nid should not be used */
+ if (nid == 0)
+ return 0;
+
+ if (!build)
+ goto retry;
+
+ /* do not add allocated nids */
+ read_lock(&nm_i->nat_tree_lock);
+ ne = __lookup_nat_cache(nm_i, nid);
+ if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
+ allocated = true;
+ read_unlock(&nm_i->nat_tree_lock);
+ if (allocated)
return 0;
retry:
i = kmem_cache_alloc(free_nid_slab, GFP_NOFS);
@@ -1250,63 +1310,59 @@ static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
spin_unlock(&nm_i->free_nid_list_lock);
}
-static int scan_nat_page(struct f2fs_nm_info *nm_i,
+static void scan_nat_page(struct f2fs_nm_info *nm_i,
struct page *nat_page, nid_t start_nid)
{
struct f2fs_nat_block *nat_blk = page_address(nat_page);
block_t blk_addr;
- int fcnt = 0;
int i;
- /* 0 nid should not be used */
- if (start_nid == 0)
- ++start_nid;
-
i = start_nid % NAT_ENTRY_PER_BLOCK;
for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
- blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
+
+ if (start_nid >= nm_i->max_nid)
+ break;
+
+ blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
BUG_ON(blk_addr == NEW_ADDR);
- if (blk_addr == NULL_ADDR)
- fcnt += add_free_nid(nm_i, start_nid);
+ if (blk_addr == NULL_ADDR) {
+ if (add_free_nid(nm_i, start_nid, true) < 0)
+ break;
+ }
}
- return fcnt;
}
static void build_free_nids(struct f2fs_sb_info *sbi)
{
- struct free_nid *fnid, *next_fnid;
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
- nid_t nid = 0;
- bool is_cycled = false;
- int fcnt = 0;
- int i;
+ int i = 0;
+ nid_t nid = nm_i->next_scan_nid;
- nid = nm_i->next_scan_nid;
- nm_i->init_scan_nid = nid;
+ /* Enough entries */
+ if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK)
+ return;
+ /* readahead nat pages to be scanned */
ra_nat_pages(sbi, nid);
while (1) {
struct page *page = get_current_nat_page(sbi, nid);
- fcnt += scan_nat_page(nm_i, page, nid);
+ scan_nat_page(nm_i, page, nid);
f2fs_put_page(page, 1);
nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
-
- if (nid >= nm_i->max_nid) {
+ if (nid >= nm_i->max_nid)
nid = 0;
- is_cycled = true;
- }
- if (fcnt > MAX_FREE_NIDS)
- break;
- if (is_cycled && nm_i->init_scan_nid <= nid)
+
+ if (i++ == FREE_NID_PAGES)
break;
}
+ /* remember the next nat page to scan, so later scans keep finding free nids */
nm_i->next_scan_nid = nid;
/* find free nids from current sum_pages */
@@ -1315,22 +1371,11 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
nid = le32_to_cpu(nid_in_journal(sum, i));
if (addr == NULL_ADDR)
- add_free_nid(nm_i, nid);
+ add_free_nid(nm_i, nid, true);
else
remove_free_nid(nm_i, nid);
}
mutex_unlock(&curseg->curseg_mutex);
-
- /* remove the free nids from current allocated nids */
- list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) {
- struct nat_entry *ne;
-
- read_lock(&nm_i->nat_tree_lock);
- ne = __lookup_nat_cache(nm_i, fnid->nid);
- if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
- remove_free_nid(nm_i, fnid->nid);
- read_unlock(&nm_i->nat_tree_lock);
- }
}
/*
@@ -1344,41 +1389,36 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
struct free_nid *i = NULL;
struct list_head *this;
retry:
- mutex_lock(&nm_i->build_lock);
- if (!nm_i->fcnt) {
- /* scan NAT in order to build free nid list */
- build_free_nids(sbi);
- if (!nm_i->fcnt) {
- mutex_unlock(&nm_i->build_lock);
- return false;
- }
- }
- mutex_unlock(&nm_i->build_lock);
+ if (sbi->total_valid_node_count + 1 >= nm_i->max_nid)
+ return false;
- /*
- * We check fcnt again since previous check is racy as
- * we didn't hold free_nid_list_lock. So other thread
- * could consume all of free nids.
- */
spin_lock(&nm_i->free_nid_list_lock);
- if (!nm_i->fcnt) {
- spin_unlock(&nm_i->free_nid_list_lock);
- goto retry;
- }
- BUG_ON(list_empty(&nm_i->free_nid_list));
- list_for_each(this, &nm_i->free_nid_list) {
- i = list_entry(this, struct free_nid, list);
- if (i->state == NID_NEW)
- break;
- }
+ /* We should not use stale free nids created by build_free_nids */
+ if (nm_i->fcnt && !sbi->on_build_free_nids) {
+ BUG_ON(list_empty(&nm_i->free_nid_list));
+ list_for_each(this, &nm_i->free_nid_list) {
+ i = list_entry(this, struct free_nid, list);
+ if (i->state == NID_NEW)
+ break;
+ }
- BUG_ON(i->state != NID_NEW);
- *nid = i->nid;
- i->state = NID_ALLOC;
- nm_i->fcnt--;
+ BUG_ON(i->state != NID_NEW);
+ *nid = i->nid;
+ i->state = NID_ALLOC;
+ nm_i->fcnt--;
+ spin_unlock(&nm_i->free_nid_list_lock);
+ return true;
+ }
spin_unlock(&nm_i->free_nid_list_lock);
- return true;
+
+ /* Let's scan nat pages and its caches to get free nids */
+ mutex_lock(&nm_i->build_lock);
+ sbi->on_build_free_nids = 1;
+ build_free_nids(sbi);
+ sbi->on_build_free_nids = 0;
+ mutex_unlock(&nm_i->build_lock);
+ goto retry;
}
/*
@@ -1391,10 +1431,8 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
spin_lock(&nm_i->free_nid_list_lock);
i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
- if (i) {
- BUG_ON(i->state != NID_ALLOC);
- __del_from_free_nid_list(i);
- }
+ BUG_ON(!i || i->state != NID_ALLOC);
+ __del_from_free_nid_list(i);
spin_unlock(&nm_i->free_nid_list_lock);
}
@@ -1403,8 +1441,19 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
*/
void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
{
- alloc_nid_done(sbi, nid);
- add_free_nid(NM_I(sbi), nid);
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct free_nid *i;
+
+ spin_lock(&nm_i->free_nid_list_lock);
+ i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+ BUG_ON(!i || i->state != NID_ALLOC);
+ if (nm_i->fcnt > 2 * MAX_FREE_NIDS) {
+ __del_from_free_nid_list(i);
+ } else {
+ i->state = NID_NEW;
+ nm_i->fcnt++;
+ }
+ spin_unlock(&nm_i->free_nid_list_lock);
}
void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
@@ -1447,9 +1496,10 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
new_ni = old_ni;
new_ni.ino = ino;
+ if (!inc_valid_node_count(sbi, NULL, 1))
+ WARN_ON(1);
set_node_addr(sbi, &new_ni, NEW_ADDR);
inc_valid_inode_count(sbi);
-
f2fs_put_page(ipage, 1);
return 0;
}
@@ -1475,23 +1525,24 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
sum_entry = &sum->entries[0];
for (i = 0; i < last_offset; i++, sum_entry++) {
+ /*
+ * In order to read the next node page,
+ * we must clear the PageUptodate flag.
+ */
+ ClearPageUptodate(page);
+
if (f2fs_readpage(sbi, page, addr, READ_SYNC))
goto out;
+ lock_page(page);
rn = (struct f2fs_node *)page_address(page);
sum_entry->nid = rn->footer.nid;
sum_entry->version = 0;
sum_entry->ofs_in_node = 0;
addr++;
-
- /*
- * In order to read next node page,
- * we must clear PageUptodate flag.
- */
- ClearPageUptodate(page);
}
-out:
unlock_page(page);
+out:
__free_pages(page, 0);
return 0;
}
@@ -1614,13 +1665,11 @@ flush_now:
nid_in_journal(sum, offset) = cpu_to_le32(nid);
}
- if (nat_get_blkaddr(ne) == NULL_ADDR) {
+ if (nat_get_blkaddr(ne) == NULL_ADDR &&
+ add_free_nid(NM_I(sbi), nid, false) <= 0) {
write_lock(&nm_i->nat_tree_lock);
__del_from_nat_cache(nm_i, ne);
write_unlock(&nm_i->nat_tree_lock);
-
- /* We can reuse this freed nid at this point */
- add_free_nid(NM_I(sbi), nid);
} else {
write_lock(&nm_i->nat_tree_lock);
__clear_nat_cache_dirty(nm_i, ne);
@@ -1661,19 +1710,16 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
spin_lock_init(&nm_i->free_nid_list_lock);
rwlock_init(&nm_i->nat_tree_lock);
- nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
- nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
-
- nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL);
- if (!nm_i->nat_bitmap)
- return -ENOMEM;
+ nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
if (!version_bitmap)
return -EFAULT;
- /* copy version bitmap */
- memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size);
+ nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
+ GFP_KERNEL);
+ if (!nm_i->nat_bitmap)
+ return -ENOMEM;
return 0;
}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index afdb130f782e..c65fb4f4230f 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -29,6 +29,9 @@
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
+/* return value for read_node_page */
+#define LOCKED_PAGE 1
+
/*
* For node information
*/
@@ -239,7 +242,7 @@ static inline bool IS_DNODE(struct page *node_page)
return false;
if (ofs >= 6 + 2 * NIDS_PER_BLOCK) {
ofs -= 6 + 2 * NIDS_PER_BLOCK;
- if ((long int)ofs % (NIDS_PER_BLOCK + 1))
+ if (!((long int)ofs % (NIDS_PER_BLOCK + 1)))
return false;
}
return true;
@@ -272,11 +275,28 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
* - Mark cold node blocks in their node footer
* - Mark cold data pages in page cache
*/
-static inline int is_cold_file(struct inode *inode)
+static inline int is_file(struct inode *inode, int type)
+{
+ return F2FS_I(inode)->i_advise & type;
+}
+
+static inline void set_file(struct inode *inode, int type)
{
- return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT;
+ F2FS_I(inode)->i_advise |= type;
}
+static inline void clear_file(struct inode *inode, int type)
+{
+ F2FS_I(inode)->i_advise &= ~type;
+}
+
+#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
+#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT)
+#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT)
+#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT)
+
static inline int is_cold_data(struct page *page)
{
return PageChecked(page);
@@ -292,29 +312,16 @@ static inline void clear_cold_data(struct page *page)
ClearPageChecked(page);
}
-static inline int is_cold_node(struct page *page)
+static inline int is_node(struct page *page, int type)
{
void *kaddr = page_address(page);
struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- return flag & (0x1 << COLD_BIT_SHIFT);
-}
-
-static inline unsigned char is_fsync_dnode(struct page *page)
-{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- return flag & (0x1 << FSYNC_BIT_SHIFT);
+ return le32_to_cpu(rn->footer.flag) & (1 << type);
}
-static inline unsigned char is_dent_dnode(struct page *page)
-{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- return flag & (0x1 << DENT_BIT_SHIFT);
-}
+#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT)
+#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
+#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
static inline void set_cold_node(struct inode *inode, struct page *page)
{
@@ -328,26 +335,15 @@ static inline void set_cold_node(struct inode *inode, struct page *page)
rn->footer.flag = cpu_to_le32(flag);
}
-static inline void set_fsync_mark(struct page *page, int mark)
-{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- if (mark)
- flag |= (0x1 << FSYNC_BIT_SHIFT);
- else
- flag &= ~(0x1 << FSYNC_BIT_SHIFT);
- rn->footer.flag = cpu_to_le32(flag);
-}
-
-static inline void set_dentry_mark(struct page *page, int mark)
+static inline void set_mark(struct page *page, int mark, int type)
{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
unsigned int flag = le32_to_cpu(rn->footer.flag);
if (mark)
- flag |= (0x1 << DENT_BIT_SHIFT);
+ flag |= (0x1 << type);
else
- flag &= ~(0x1 << DENT_BIT_SHIFT);
+ flag &= ~(0x1 << type);
rn->footer.flag = cpu_to_le32(flag);
}
+#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT)
+#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index b235215ac138..d56d951c2253 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,36 +40,54 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
static int recover_dentry(struct page *ipage, struct inode *inode)
{
- struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage);
+ void *kaddr = page_address(ipage);
+ struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
struct f2fs_inode *raw_inode = &(raw_node->i);
- struct qstr name;
+ nid_t pino = le32_to_cpu(raw_inode->i_pino);
struct f2fs_dir_entry *de;
+ struct qstr name;
struct page *page;
- struct inode *dir;
+ struct inode *dir, *einode;
int err = 0;
- if (!is_dent_dnode(ipage))
- goto out;
-
- dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino));
- if (IS_ERR(dir)) {
- err = -EINVAL;
- goto out;
+ dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino);
+ if (!dir) {
+ dir = f2fs_iget(inode->i_sb, pino);
+ if (IS_ERR(dir)) {
+ err = PTR_ERR(dir);
+ goto out;
+ }
+ set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
+ add_dirty_dir_inode(dir);
}
name.len = le32_to_cpu(raw_inode->i_namelen);
name.name = raw_inode->i_name;
-
+retry:
de = f2fs_find_entry(dir, &name, &page);
- if (de) {
+ if (de && inode->i_ino == le32_to_cpu(de->ino)) {
kunmap(page);
f2fs_put_page(page, 0);
- } else {
- err = __f2fs_add_link(dir, &name, inode);
+ goto out;
+ }
+ if (de) {
+ einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
+ if (IS_ERR(einode)) {
+ WARN_ON(1);
+ if (PTR_ERR(einode) == -ENOENT)
+ err = -EEXIST;
+ goto out;
+ }
+ f2fs_delete_entry(de, page, einode);
+ iput(einode);
+ goto retry;
}
- iput(dir);
+ err = __f2fs_add_link(dir, &name, inode);
out:
- kunmap(ipage);
+ f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: "
+ "ino = %x, name = %s, dir = %lx, err = %d",
+ ino_of_node(ipage), raw_inode->i_name,
+ IS_ERR(dir) ? 0 : dir->i_ino, err);
return err;
}
@@ -79,6 +97,9 @@ static int recover_inode(struct inode *inode, struct page *node_page)
struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
struct f2fs_inode *raw_inode = &(raw_node->i);
+ if (!IS_INODE(node_page))
+ return 0;
+
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
i_size_write(inode, le64_to_cpu(raw_inode->i_size));
inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
@@ -88,7 +109,12 @@ static int recover_inode(struct inode *inode, struct page *node_page)
inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
- return recover_dentry(node_page, inode);
+ if (is_dent_dnode(node_page))
+ return recover_dentry(node_page, inode);
+
+ f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
+ ino_of_node(node_page), raw_inode->i_name);
+ return 0;
}
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
@@ -112,64 +138,61 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
while (1) {
struct fsync_inode_entry *entry;
- if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
+ err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC);
+ if (err)
goto out;
+ lock_page(page);
+
if (cp_ver != cpver_of_node(page))
- goto out;
+ break;
if (!is_fsync_dnode(page))
goto next;
entry = get_fsync_inode(head, ino_of_node(page));
if (entry) {
- entry->blkaddr = blkaddr;
if (IS_INODE(page) && is_dent_dnode(page))
set_inode_flag(F2FS_I(entry->inode),
FI_INC_LINK);
} else {
if (IS_INODE(page) && is_dent_dnode(page)) {
- if (recover_inode_page(sbi, page)) {
- err = -ENOMEM;
- goto out;
- }
+ err = recover_inode_page(sbi, page);
+ if (err)
+ break;
}
/* add this fsync inode to the list */
entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
if (!entry) {
err = -ENOMEM;
- goto out;
+ break;
}
entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
if (IS_ERR(entry->inode)) {
err = PTR_ERR(entry->inode);
kmem_cache_free(fsync_entry_slab, entry);
- goto out;
+ break;
}
-
list_add_tail(&entry->list, head);
- entry->blkaddr = blkaddr;
- }
- if (IS_INODE(page)) {
- err = recover_inode(entry->inode, page);
- if (err)
- goto out;
}
+ entry->blkaddr = blkaddr;
+
+ err = recover_inode(entry->inode, page);
+ if (err && err != -ENOENT)
+ break;
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
- ClearPageUptodate(page);
}
-out:
unlock_page(page);
+out:
__free_pages(page, 0);
return err;
}
-static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
- struct list_head *head)
+static void destroy_fsync_dnodes(struct list_head *head)
{
struct fsync_inode_entry *entry, *tmp;
@@ -180,15 +203,15 @@ static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
}
}
-static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
- block_t blkaddr)
+static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
+ block_t blkaddr, struct dnode_of_data *dn)
{
struct seg_entry *sentry;
unsigned int segno = GET_SEGNO(sbi, blkaddr);
unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
(sbi->blocks_per_seg - 1);
struct f2fs_summary sum;
- nid_t ino;
+ nid_t ino, nid;
void *kaddr;
struct inode *inode;
struct page *node_page;
@@ -197,7 +220,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
sentry = get_seg_entry(sbi, segno);
if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
- return;
+ return 0;
/* Get the previous summary */
for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
@@ -216,29 +239,50 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
f2fs_put_page(sum_page, 1);
}
+ /* Use the locked dnode page and inode */
+ nid = le32_to_cpu(sum.nid);
+ if (dn->inode->i_ino == nid) {
+ struct dnode_of_data tdn = *dn;
+ tdn.nid = nid;
+ tdn.node_page = dn->inode_page;
+ tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ truncate_data_blocks_range(&tdn, 1);
+ return 0;
+ } else if (dn->nid == nid) {
+ struct dnode_of_data tdn = *dn;
+ tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ truncate_data_blocks_range(&tdn, 1);
+ return 0;
+ }
+
/* Get the node page */
- node_page = get_node_page(sbi, le32_to_cpu(sum.nid));
+ node_page = get_node_page(sbi, nid);
+ if (IS_ERR(node_page))
+ return PTR_ERR(node_page);
bidx = start_bidx_of_node(ofs_of_node(node_page)) +
- le16_to_cpu(sum.ofs_in_node);
+ le16_to_cpu(sum.ofs_in_node);
ino = ino_of_node(node_page);
f2fs_put_page(node_page, 1);
/* Deallocate previous index in the node page */
inode = f2fs_iget(sbi->sb, ino);
if (IS_ERR(inode))
- return;
+ return PTR_ERR(inode);
truncate_hole(inode, bidx, bidx + 1);
iput(inode);
+ return 0;
}
-static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
+static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct page *page, block_t blkaddr)
{
unsigned int start, end;
struct dnode_of_data dn;
struct f2fs_summary sum;
struct node_info ni;
+ int err = 0, recovered = 0;
+ int ilock;
start = start_bidx_of_node(ofs_of_node(page));
if (IS_INODE(page))
@@ -246,9 +290,14 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
else
end = start + ADDRS_PER_BLOCK;
+ ilock = mutex_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
- if (get_dnode_of_data(&dn, start, 0))
- return;
+
+ err = get_dnode_of_data(&dn, start, ALLOC_NODE);
+ if (err) {
+ mutex_unlock_op(sbi, ilock);
+ return err;
+ }
wait_on_page_writeback(dn.node_page);
@@ -270,13 +319,16 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
}
/* Check the previous node page having this index */
- check_index_in_prev_nodes(sbi, dest);
+ err = check_index_in_prev_nodes(sbi, dest, &dn);
+ if (err)
+ goto err;
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
/* write dummy data page */
recover_data_page(sbi, NULL, &sum, src, dest);
update_extent_cache(dest, &dn);
+ recovered++;
}
dn.ofs_in_node++;
}
@@ -292,15 +344,23 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
set_page_dirty(dn.node_page);
recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
+err:
f2fs_put_dnode(&dn);
+ mutex_unlock_op(sbi, ilock);
+
+ f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, "
+ "recovered_data = %d blocks, err = %d",
+ inode->i_ino, recovered, err);
+ return err;
}
-static void recover_data(struct f2fs_sb_info *sbi,
+static int recover_data(struct f2fs_sb_info *sbi,
struct list_head *head, int type)
{
unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
struct curseg_info *curseg;
struct page *page;
+ int err = 0;
block_t blkaddr;
/* get node pages in the current segment */
@@ -310,23 +370,29 @@ static void recover_data(struct f2fs_sb_info *sbi,
/* read node page */
page = alloc_page(GFP_NOFS | __GFP_ZERO);
if (IS_ERR(page))
- return;
+ return -ENOMEM;
+
lock_page(page);
while (1) {
struct fsync_inode_entry *entry;
- if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
+ err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC);
+ if (err)
goto out;
+ lock_page(page);
+
if (cp_ver != cpver_of_node(page))
- goto out;
+ break;
entry = get_fsync_inode(head, ino_of_node(page));
if (!entry)
goto next;
- do_recover_data(sbi, entry->inode, page, blkaddr);
+ err = do_recover_data(sbi, entry->inode, page, blkaddr);
+ if (err)
+ break;
if (entry->blkaddr == blkaddr) {
iput(entry->inode);
@@ -336,40 +402,45 @@ static void recover_data(struct f2fs_sb_info *sbi,
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
- ClearPageUptodate(page);
}
-out:
unlock_page(page);
+out:
__free_pages(page, 0);
- allocate_new_segments(sbi);
+ if (!err)
+ allocate_new_segments(sbi);
+ return err;
}
-void recover_fsync_data(struct f2fs_sb_info *sbi)
+int recover_fsync_data(struct f2fs_sb_info *sbi)
{
struct list_head inode_list;
+ int err;
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
sizeof(struct fsync_inode_entry), NULL);
if (unlikely(!fsync_entry_slab))
- return;
+ return -ENOMEM;
INIT_LIST_HEAD(&inode_list);
/* step #1: find fsynced inode numbers */
- if (find_fsync_dnodes(sbi, &inode_list))
+ sbi->por_doing = 1;
+ err = find_fsync_dnodes(sbi, &inode_list);
+ if (err)
goto out;
if (list_empty(&inode_list))
goto out;
/* step #2: recover data */
- sbi->por_doing = 1;
- recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
- sbi->por_doing = 0;
+ err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
BUG_ON(!list_empty(&inode_list));
out:
- destroy_fsync_dnodes(sbi, &inode_list);
+ destroy_fsync_dnodes(&inode_list);
kmem_cache_destroy(fsync_entry_slab);
- write_checkpoint(sbi, false);
+ sbi->por_doing = 0;
+ if (!err)
+ write_checkpoint(sbi, false);
+ return err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 777f17e496e6..a86d125a9885 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -18,6 +18,7 @@
#include "f2fs.h"
#include "segment.h"
#include "node.h"
+#include <trace/events/f2fs.h>
/*
* This function balances dirty node and dentry pages.
@@ -49,9 +50,20 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
if (dirty_type == DIRTY) {
struct seg_entry *sentry = get_seg_entry(sbi, segno);
+ enum dirty_type t = DIRTY_HOT_DATA;
+
dirty_type = sentry->type;
+
if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
dirty_i->nr_dirty[dirty_type]++;
+
+ /* Only one bitmap should be set */
+ for (; t <= DIRTY_COLD_NODE; t++) {
+ if (t == dirty_type)
+ continue;
+ if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
+ dirty_i->nr_dirty[t]--;
+ }
}
}
@@ -64,13 +76,16 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
dirty_i->nr_dirty[dirty_type]--;
if (dirty_type == DIRTY) {
- struct seg_entry *sentry = get_seg_entry(sbi, segno);
- dirty_type = sentry->type;
- if (test_and_clear_bit(segno,
- dirty_i->dirty_segmap[dirty_type]))
- dirty_i->nr_dirty[dirty_type]--;
- clear_bit(segno, dirty_i->victim_segmap[FG_GC]);
- clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
+ enum dirty_type t = DIRTY_HOT_DATA;
+
+ /* clear all the bitmaps */
+ for (; t <= DIRTY_COLD_NODE; t++)
+ if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
+ dirty_i->nr_dirty[t]--;
+
+ if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
+ clear_bit(GET_SECNO(sbi, segno),
+ dirty_i->victim_secmap);
}
}
@@ -79,7 +94,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
* Adding dirty entry into seglist is not critical operation.
* If a given segment is one of current working segments, it won't be added.
*/
-void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
+static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned short valid_blocks;
@@ -111,17 +126,16 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned int segno, offset = 0;
+ unsigned int segno = -1;
unsigned int total_segs = TOTAL_SEGS(sbi);
mutex_lock(&dirty_i->seglist_lock);
while (1) {
segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
- offset);
+ segno + 1);
if (segno >= total_segs)
break;
__set_test_and_free(sbi, segno);
- offset = segno + 1;
}
mutex_unlock(&dirty_i->seglist_lock);
}
@@ -129,17 +143,16 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
void clear_prefree_segments(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned int segno, offset = 0;
+ unsigned int segno = -1;
unsigned int total_segs = TOTAL_SEGS(sbi);
mutex_lock(&dirty_i->seglist_lock);
while (1) {
segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
- offset);
+ segno + 1);
if (segno >= total_segs)
break;
- offset = segno + 1;
if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
dirty_i->nr_dirty[PRE]--;
@@ -242,11 +255,11 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
* This function should be resided under the curseg_mutex lock
*/
static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
- struct f2fs_summary *sum, unsigned short offset)
+ struct f2fs_summary *sum)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
void *addr = curseg->sum_blk;
- addr += offset * sizeof(struct f2fs_summary);
+ addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
memcpy(addr, sum, sizeof(struct f2fs_summary));
return;
}
@@ -296,48 +309,15 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
f2fs_put_page(page, 1);
}
-static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi,
- int ofs_unit, int type)
+static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
{
- struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
- unsigned int segno, next_segno, i;
- int ofs = 0;
-
- /*
- * If there is not enough reserved sections,
- * we should not reuse prefree segments.
- */
- if (has_not_enough_free_secs(sbi, 0))
- return NULL_SEGNO;
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ unsigned int segno = curseg->segno + 1;
+ struct free_segmap_info *free_i = FREE_I(sbi);
- /*
- * NODE page should not reuse prefree segment,
- * since those information is used for SPOR.
- */
- if (IS_NODESEG(type))
- return NULL_SEGNO;
-next:
- segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs++);
- ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit;
- if (segno < TOTAL_SEGS(sbi)) {
- /* skip intermediate segments in a section */
- if (segno % ofs_unit)
- goto next;
-
- /* skip if whole section is not prefree */
- next_segno = find_next_zero_bit(prefree_segmap,
- TOTAL_SEGS(sbi), segno + 1);
- if (next_segno - segno < ofs_unit)
- goto next;
-
- /* skip if whole section was not free at the last checkpoint */
- for (i = 0; i < ofs_unit; i++)
- if (get_seg_entry(sbi, segno)->ckpt_valid_blocks)
- goto next;
- return segno;
- }
- return NULL_SEGNO;
+ if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec)
+ return !test_bit(segno, free_i->free_segmap);
+ return 0;
}
/*
@@ -348,9 +328,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
unsigned int *newseg, bool new_sec, int dir)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int total_secs = sbi->total_sections;
unsigned int segno, secno, zoneno;
- unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone;
+ unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone;
unsigned int hint = *newseg / sbi->segs_per_sec;
unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
unsigned int left_start = hint;
@@ -363,16 +342,17 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
segno = find_next_zero_bit(free_i->free_segmap,
TOTAL_SEGS(sbi), *newseg + 1);
- if (segno < TOTAL_SEGS(sbi))
+ if (segno - *newseg < sbi->segs_per_sec -
+ (*newseg % sbi->segs_per_sec))
goto got_it;
}
find_other_zone:
- secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint);
- if (secno >= total_secs) {
+ secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint);
+ if (secno >= TOTAL_SECS(sbi)) {
if (dir == ALLOC_RIGHT) {
secno = find_next_zero_bit(free_i->free_secmap,
- total_secs, 0);
- BUG_ON(secno >= total_secs);
+ TOTAL_SECS(sbi), 0);
+ BUG_ON(secno >= TOTAL_SECS(sbi));
} else {
go_left = 1;
left_start = hint - 1;
@@ -387,8 +367,8 @@ find_other_zone:
continue;
}
left_start = find_next_zero_bit(free_i->free_secmap,
- total_secs, 0);
- BUG_ON(left_start >= total_secs);
+ TOTAL_SECS(sbi), 0);
+ BUG_ON(left_start >= TOTAL_SECS(sbi));
break;
}
secno = left_start;
@@ -463,7 +443,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
int dir = ALLOC_LEFT;
write_sum_page(sbi, curseg->sum_blk,
- GET_SUM_BLOCK(sbi, curseg->segno));
+ GET_SUM_BLOCK(sbi, segno));
if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
dir = ALLOC_RIGHT;
@@ -561,26 +541,25 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
int type, bool force)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
- unsigned int ofs_unit;
if (force) {
new_curseg(sbi, type, true);
goto out;
}
- ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec;
- curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type);
-
- if (curseg->next_segno != NULL_SEGNO)
- change_curseg(sbi, type, false);
- else if (type == CURSEG_WARM_NODE)
+ if (type == CURSEG_WARM_NODE)
+ new_curseg(sbi, type, false);
+ else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
new_curseg(sbi, type, false);
else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
change_curseg(sbi, type, true);
else
new_curseg(sbi, type, false);
out:
+#ifdef CONFIG_F2FS_STAT_FS
sbi->segment_count[curseg->alloc_type]++;
+#endif
+ return;
}
void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -656,10 +635,16 @@ static void do_submit_bio(struct f2fs_sb_info *sbi,
if (type >= META_FLUSH)
rw = WRITE_FLUSH_FUA;
+ if (btype == META)
+ rw |= REQ_META;
+
if (sbi->bio[btype]) {
struct bio_private *p = sbi->bio[btype]->bi_private;
p->sbi = sbi;
sbi->bio[btype]->bi_end_io = f2fs_end_io_write;
+
+ trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]);
+
if (type == META_FLUSH) {
DECLARE_COMPLETION_ONSTACK(wait);
p->is_sync = true;
@@ -696,7 +681,7 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
do_submit_bio(sbi, type, false);
alloc_new:
if (sbi->bio[type] == NULL) {
- sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev));
+ sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi));
sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
/*
* The end_io will be assigned at the sumbission phase.
@@ -714,6 +699,7 @@ alloc_new:
sbi->last_block_in_bio[type] = blk_addr;
up_write(&sbi->bio_sem);
+ trace_f2fs_submit_write_page(page, blk_addr, type);
}
static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
@@ -756,7 +742,7 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
if (S_ISDIR(inode->i_mode))
return CURSEG_HOT_DATA;
- else if (is_cold_data(page) || is_cold_file(inode))
+ else if (is_cold_data(page) || file_is_cold(inode))
return CURSEG_COLD_DATA;
else
return CURSEG_WARM_DATA;
@@ -805,11 +791,13 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
* because, this function updates a summary entry in the
* current summary block.
*/
- __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+ __add_sum_entry(sbi, type, sum);
mutex_lock(&sit_i->sentry_lock);
__refresh_next_blkoff(sbi, curseg);
+#ifdef CONFIG_F2FS_STAT_FS
sbi->block_count[curseg->alloc_type]++;
+#endif
/*
* SIT information should be updated before segment allocation,
@@ -904,7 +892,7 @@ void recover_data_page(struct f2fs_sb_info *sbi,
curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
(sbi->blocks_per_seg - 1);
- __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+ __add_sum_entry(sbi, type, sum);
refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
@@ -941,7 +929,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
}
curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
(sbi->blocks_per_seg - 1);
- __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+ __add_sum_entry(sbi, type, sum);
/* change the current log to the next block addr in advance */
if (next_segno != segno) {
@@ -1390,7 +1378,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
}
if (sbi->segs_per_sec > 1) {
- sit_i->sec_entries = vzalloc(sbi->total_sections *
+ sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) *
sizeof(struct sec_entry));
if (!sit_i->sec_entries)
return -ENOMEM;
@@ -1403,10 +1391,9 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
- dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL);
if (!dst_bitmap)
return -ENOMEM;
- memcpy(dst_bitmap, src_bitmap, bitmap_size);
/* init SIT information */
sit_i->s_ops = &default_salloc_ops;
@@ -1442,7 +1429,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
if (!free_i->free_segmap)
return -ENOMEM;
- sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections);
+ sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi));
free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
if (!free_i->free_secmap)
return -ENOMEM;
@@ -1541,13 +1528,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int segno = 0, offset = 0;
+ unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi);
unsigned short valid_blocks;
- while (segno < TOTAL_SEGS(sbi)) {
+ while (1) {
/* find dirty segment based on free segmap */
- segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset);
- if (segno >= TOTAL_SEGS(sbi))
+ segno = find_next_inuse(free_i, total_segs, offset);
+ if (segno >= total_segs)
break;
offset = segno + 1;
valid_blocks = get_valid_blocks(sbi, segno, 0);
@@ -1559,14 +1546,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
}
}
-static int init_victim_segmap(struct f2fs_sb_info *sbi)
+static int init_victim_secmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi));
- dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
- dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
- if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC])
+ dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
+ if (!dirty_i->victim_secmap)
return -ENOMEM;
return 0;
}
@@ -1593,7 +1579,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
}
init_dirty_segmap(sbi);
- return init_victim_segmap(sbi);
+ return init_victim_secmap(sbi);
}
/*
@@ -1680,18 +1666,10 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
mutex_unlock(&dirty_i->seglist_lock);
}
-void reset_victim_segmap(struct f2fs_sb_info *sbi)
-{
- unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
- memset(DIRTY_I(sbi)->victim_segmap[FG_GC], 0, bitmap_size);
-}
-
-static void destroy_victim_segmap(struct f2fs_sb_info *sbi)
+static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-
- kfree(dirty_i->victim_segmap[FG_GC]);
- kfree(dirty_i->victim_segmap[BG_GC]);
+ kfree(dirty_i->victim_secmap);
}
static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
@@ -1706,7 +1684,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
for (i = 0; i < NR_DIRTY_TYPE; i++)
discard_dirty_segmap(sbi, i);
- destroy_victim_segmap(sbi);
+ destroy_victim_secmap(sbi);
SM_I(sbi)->dirty_info = NULL;
kfree(dirty_i);
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 552dadbb2327..062424a0e4c3 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -8,10 +8,13 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
+#include <linux/blkdev.h>
+
/* constant macro */
#define NULL_SEGNO ((unsigned int)(~0))
+#define NULL_SECNO ((unsigned int)(~0))
-/* V: Logical segment # in volume, R: Relative segment # in main area */
+/* L: Logical segment # in volume, R: Relative segment # in main area */
#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
@@ -23,13 +26,13 @@
((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \
(t == CURSEG_WARM_NODE))
-#define IS_CURSEG(sbi, segno) \
- ((segno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
- (segno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
- (segno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
- (segno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
- (segno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
- (segno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
+#define IS_CURSEG(sbi, seg) \
+ ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
+ (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
+ (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
+ (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
+ (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
+ (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
#define IS_CURSEC(sbi, secno) \
((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
@@ -81,9 +84,12 @@
#define f2fs_bitmap_size(nr) \
(BITS_TO_LONGS(nr) * sizeof(unsigned long))
#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
+#define TOTAL_SECS(sbi) (sbi->total_sections)
#define SECTOR_FROM_BLOCK(sbi, blk_addr) \
(blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
+#define SECTOR_TO_BLOCK(sbi, sectors) \
+ (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
/* during checkpoint, bio_private is used to synchronize the last bio */
struct bio_private {
@@ -213,7 +219,7 @@ struct dirty_seglist_info {
unsigned long *dirty_segmap[NR_DIRTY_TYPE];
struct mutex seglist_lock; /* lock for segment bitmaps */
int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */
- unsigned long *victim_segmap[2]; /* BG_GC, FG_GC */
+ unsigned long *victim_secmap; /* background GC victims */
};
/* victim selection function for cleaning and SSR */
@@ -464,8 +470,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
static inline int utilization(struct f2fs_sb_info *sbi)
{
- return (long int)valid_user_blocks(sbi) * 100 /
- (long int)sbi->user_block_count;
+ return div_u64(valid_user_blocks(sbi) * 100, sbi->user_block_count);
}
/*
@@ -616,3 +621,17 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count)
- (base + 1) + type;
}
+
+static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno)
+{
+ if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno))
+ return true;
+ return false;
+}
+
+static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q));
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index fea6e582a2ed..75c7dc363e92 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -12,7 +12,6 @@
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/statfs.h>
-#include <linux/proc_fs.h>
#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
@@ -21,16 +20,21 @@
#include <linux/seq_file.h>
#include <linux/random.h>
#include <linux/exportfs.h>
+#include <linux/blkdev.h>
#include <linux/f2fs_fs.h>
#include "f2fs.h"
#include "node.h"
+#include "segment.h"
#include "xattr.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/f2fs.h>
+
static struct kmem_cache *f2fs_inode_cachep;
enum {
- Opt_gc_background_off,
+ Opt_gc_background,
Opt_disable_roll_forward,
Opt_discard,
Opt_noheap,
@@ -42,7 +46,7 @@ enum {
};
static match_table_t f2fs_tokens = {
- {Opt_gc_background_off, "background_gc_off"},
+ {Opt_gc_background, "background_gc=%s"},
{Opt_disable_roll_forward, "disable_roll_forward"},
{Opt_discard, "discard"},
{Opt_noheap, "no_heap"},
@@ -72,6 +76,91 @@ static void init_once(void *foo)
inode_init_once(&fi->vfs_inode);
}
+static int parse_options(struct super_block *sb, char *options)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ substring_t args[MAX_OPT_ARGS];
+ char *p, *name;
+ int arg = 0;
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+ /*
+ * Initialize args struct so we know whether arg was
+ * found; some options take optional arguments.
+ */
+ args[0].to = args[0].from = NULL;
+ token = match_token(p, f2fs_tokens, args);
+
+ switch (token) {
+ case Opt_gc_background:
+ name = match_strdup(&args[0]);
+
+ if (!name)
+ return -ENOMEM;
+ if (!strncmp(name, "on", 2))
+ set_opt(sbi, BG_GC);
+ else if (!strncmp(name, "off", 3))
+ clear_opt(sbi, BG_GC);
+ else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
+ case Opt_disable_roll_forward:
+ set_opt(sbi, DISABLE_ROLL_FORWARD);
+ break;
+ case Opt_discard:
+ set_opt(sbi, DISCARD);
+ break;
+ case Opt_noheap:
+ set_opt(sbi, NOHEAP);
+ break;
+#ifdef CONFIG_F2FS_FS_XATTR
+ case Opt_nouser_xattr:
+ clear_opt(sbi, XATTR_USER);
+ break;
+#else
+ case Opt_nouser_xattr:
+ f2fs_msg(sb, KERN_INFO,
+ "nouser_xattr options not supported");
+ break;
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+ case Opt_noacl:
+ clear_opt(sbi, POSIX_ACL);
+ break;
+#else
+ case Opt_noacl:
+ f2fs_msg(sb, KERN_INFO, "noacl options not supported");
+ break;
+#endif
+ case Opt_active_logs:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
+ return -EINVAL;
+ sbi->active_logs = arg;
+ break;
+ case Opt_disable_ext_identify:
+ set_opt(sbi, DISABLE_EXT_IDENTIFY);
+ break;
+ default:
+ f2fs_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" or missing value",
+ p);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
static struct inode *f2fs_alloc_inode(struct super_block *sb)
{
struct f2fs_inode_info *fi;
@@ -82,7 +171,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
init_once((void *) fi);
- /* Initilize f2fs-specific inode info */
+ /* Initialize f2fs-specific inode info */
fi->vfs_inode.i_version = 1;
atomic_set(&fi->dirty_dents, 0);
fi->i_current_depth = 1;
@@ -94,6 +183,31 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
return &fi->vfs_inode;
}
+static int f2fs_drop_inode(struct inode *inode)
+{
+ /*
+ * This is to avoid a deadlock condition like below.
+ * writeback_single_inode(inode)
+ * - f2fs_write_data_page
+ * - f2fs_gc -> iput -> evict
+ * - inode_wait_for_writeback(inode)
+ */
+ if (!inode_unhashed(inode) && inode->i_state & I_SYNC)
+ return 0;
+ return generic_drop_inode(inode);
+}
+
+/*
+ * f2fs_dirty_inode() is called from __mark_inode_dirty()
+ *
+ * We should call set_dirty_inode to write the dirty inode through write_inode.
+ */
+static void f2fs_dirty_inode(struct inode *inode, int flags)
+{
+ set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+ return;
+}
+
static void f2fs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -132,13 +246,18 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ trace_f2fs_sync_fs(sb, sync);
+
if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
return 0;
- if (sync)
+ if (sync) {
+ mutex_lock(&sbi->gc_mutex);
write_checkpoint(sbi, false);
- else
+ mutex_unlock(&sbi->gc_mutex);
+ } else {
f2fs_balance_fs(sbi);
+ }
return 0;
}
@@ -147,7 +266,7 @@ static int f2fs_freeze(struct super_block *sb)
{
int err;
- if (sb->s_flags & MS_RDONLY)
+ if (f2fs_readonly(sb))
return 0;
err = f2fs_sync_fs(sb, 1);
@@ -180,7 +299,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = sbi->total_node_count;
buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi);
- buf->f_namelen = F2FS_MAX_NAME_LEN;
+ buf->f_namelen = F2FS_NAME_LEN;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
@@ -191,10 +310,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
- if (test_opt(sbi, BG_GC))
- seq_puts(seq, ",background_gc_on");
+ if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC))
+ seq_printf(seq, ",background_gc=%s", "on");
else
- seq_puts(seq, ",background_gc_off");
+ seq_printf(seq, ",background_gc=%s", "off");
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
if (test_opt(sbi, DISCARD))
@@ -221,10 +340,64 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
+static int f2fs_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct f2fs_mount_info org_mount_opt;
+ int err, active_logs;
+
+ /*
+ * Save the old mount options in case we
+ * need to restore them.
+ */
+ org_mount_opt = sbi->mount_opt;
+ active_logs = sbi->active_logs;
+
+ /* parse mount options */
+ err = parse_options(sb, data);
+ if (err)
+ goto restore_opts;
+
+ /*
+ * Previous and new state of filesystem is RO,
+ * so no point in checking GC conditions.
+ */
+ if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
+ goto skip;
+
+ /*
+ * We stop the GC thread if FS is mounted as RO
+ * or if background_gc = off is passed in mount
+ * option. Also sync the filesystem.
+ */
+ if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
+ if (sbi->gc_thread) {
+ stop_gc_thread(sbi);
+ f2fs_sync_fs(sb, 1);
+ }
+ } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) {
+ err = start_gc_thread(sbi);
+ if (err)
+ goto restore_opts;
+ }
+skip:
+ /* Update the POSIXACL Flag */
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+ return 0;
+
+restore_opts:
+ sbi->mount_opt = org_mount_opt;
+ sbi->active_logs = active_logs;
+ return err;
+}
+
static struct super_operations f2fs_sops = {
.alloc_inode = f2fs_alloc_inode,
+ .drop_inode = f2fs_drop_inode,
.destroy_inode = f2fs_destroy_inode,
.write_inode = f2fs_write_inode,
+ .dirty_inode = f2fs_dirty_inode,
.show_options = f2fs_show_options,
.evict_inode = f2fs_evict_inode,
.put_super = f2fs_put_super,
@@ -232,6 +405,7 @@ static struct super_operations f2fs_sops = {
.freeze_fs = f2fs_freeze,
.unfreeze_fs = f2fs_unfreeze,
.statfs = f2fs_statfs,
+ .remount_fs = f2fs_remount,
};
static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
@@ -279,79 +453,6 @@ static const struct export_operations f2fs_export_ops = {
.get_parent = f2fs_get_parent,
};
-static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi,
- char *options)
-{
- substring_t args[MAX_OPT_ARGS];
- char *p;
- int arg = 0;
-
- if (!options)
- return 0;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
- /*
- * Initialize args struct so we know whether arg was
- * found; some options take optional arguments.
- */
- args[0].to = args[0].from = NULL;
- token = match_token(p, f2fs_tokens, args);
-
- switch (token) {
- case Opt_gc_background_off:
- clear_opt(sbi, BG_GC);
- break;
- case Opt_disable_roll_forward:
- set_opt(sbi, DISABLE_ROLL_FORWARD);
- break;
- case Opt_discard:
- set_opt(sbi, DISCARD);
- break;
- case Opt_noheap:
- set_opt(sbi, NOHEAP);
- break;
-#ifdef CONFIG_F2FS_FS_XATTR
- case Opt_nouser_xattr:
- clear_opt(sbi, XATTR_USER);
- break;
-#else
- case Opt_nouser_xattr:
- f2fs_msg(sb, KERN_INFO,
- "nouser_xattr options not supported");
- break;
-#endif
-#ifdef CONFIG_F2FS_FS_POSIX_ACL
- case Opt_noacl:
- clear_opt(sbi, POSIX_ACL);
- break;
-#else
- case Opt_noacl:
- f2fs_msg(sb, KERN_INFO, "noacl options not supported");
- break;
-#endif
- case Opt_active_logs:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
- return -EINVAL;
- sbi->active_logs = arg;
- break;
- case Opt_disable_ext_identify:
- set_opt(sbi, DISABLE_EXT_IDENTIFY);
- break;
- default:
- f2fs_msg(sb, KERN_ERR,
- "Unrecognized mount option \"%s\" or missing value",
- p);
- return -EINVAL;
- }
- }
- return 0;
-}
-
static loff_t max_file_size(unsigned bits)
{
loff_t result = ADDRS_PER_INODE;
@@ -457,6 +558,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->root_ino_num = le32_to_cpu(raw_super->root_ino);
sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
+ sbi->cur_victim_sec = NULL_SECNO;
for (i = 0; i < NR_COUNT_TYPE; i++)
atomic_set(&sbi->nr_pages[i], 0);
@@ -473,7 +575,7 @@ static int validate_superblock(struct super_block *sb,
if (!*raw_super_buf) {
f2fs_msg(sb, KERN_ERR, "unable to read %s superblock",
super);
- return 1;
+ return -EIO;
}
*raw_super = (struct f2fs_super_block *)
@@ -485,7 +587,7 @@ static int validate_superblock(struct super_block *sb,
f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem "
"in %s superblock", super);
- return 1;
+ return -EINVAL;
}
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
@@ -508,11 +610,15 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
}
- if (validate_superblock(sb, &raw_super, &raw_super_buf, 0)) {
+ err = validate_superblock(sb, &raw_super, &raw_super_buf, 0);
+ if (err) {
brelse(raw_super_buf);
- if (validate_superblock(sb, &raw_super, &raw_super_buf, 1))
+ /* check secondary superblock when primary failed */
+ err = validate_superblock(sb, &raw_super, &raw_super_buf, 1);
+ if (err)
goto free_sb_buf;
}
+ sb->s_fs_info = sbi;
/* init some FS parameters */
sbi->active_logs = NR_CURSEG_TYPE;
@@ -525,7 +631,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sbi, POSIX_ACL);
#endif
/* parse mount options */
- if (parse_options(sb, sbi, (char *)data))
+ err = parse_options(sb, (char *)data);
+ if (err)
goto free_sb_buf;
sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
@@ -536,7 +643,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = f2fs_xattr_handlers;
sb->s_export_op = &f2fs_export_ops;
sb->s_magic = F2FS_SUPER_MAGIC;
- sb->s_fs_info = sbi;
sb->s_time_gran = 1;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -547,11 +653,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
sbi->raw_super = raw_super;
sbi->raw_super_buf = raw_super_buf;
mutex_init(&sbi->gc_mutex);
- mutex_init(&sbi->write_inode);
mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
- for (i = 0; i < NR_LOCK_TYPE; i++)
+ for (i = 0; i < NR_GLOBAL_LOCKS; i++)
mutex_init(&sbi->fs_lock[i]);
+ mutex_init(&sbi->node_write);
sbi->por_doing = 0;
spin_lock_init(&sbi->stat_lock);
init_rwsem(&sbi->bio_sem);
@@ -638,18 +744,36 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
}
/* recover fsynced data */
- if (!test_opt(sbi, DISABLE_ROLL_FORWARD))
- recover_fsync_data(sbi);
+ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
+ err = recover_fsync_data(sbi);
+ if (err)
+ f2fs_msg(sb, KERN_ERR,
+ "Cannot recover all fsync data errno=%ld", err);
+ }
- /* After POR, we can run background GC thread */
- err = start_gc_thread(sbi);
- if (err)
- goto fail;
+ /*
+ * If filesystem is not mounted as read-only then
+ * do start the gc_thread.
+ */
+ if (!(sb->s_flags & MS_RDONLY)) {
+ /* After POR, we can run background GC thread.*/
+ err = start_gc_thread(sbi);
+ if (err)
+ goto fail;
+ }
err = f2fs_build_stats(sbi);
if (err)
goto fail;
+ if (test_opt(sbi, DISCARD)) {
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ if (!blk_queue_discard(q))
+ f2fs_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but "
+ "the device does not support discard");
+ }
+
return 0;
fail:
stop_gc_thread(sbi);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 8038c0496504..3ab07ecd86ca 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -20,6 +20,7 @@
*/
#include <linux/rwsem.h>
#include <linux/f2fs_fs.h>
+#include <linux/security.h>
#include "f2fs.h"
#include "xattr.h"
@@ -43,6 +44,10 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
prefix = XATTR_TRUSTED_PREFIX;
prefix_len = XATTR_TRUSTED_PREFIX_LEN;
break;
+ case F2FS_XATTR_INDEX_SECURITY:
+ prefix = XATTR_SECURITY_PREFIX;
+ prefix_len = XATTR_SECURITY_PREFIX_LEN;
+ break;
default:
return -EINVAL;
}
@@ -50,7 +55,7 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
total_len = prefix_len + name_len + 1;
if (list && total_len <= list_size) {
memcpy(list, prefix, prefix_len);
- memcpy(list+prefix_len, name, name_len);
+ memcpy(list + prefix_len, name, name_len);
list[prefix_len + name_len] = '\0';
}
return total_len;
@@ -70,13 +75,14 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
break;
+ case F2FS_XATTR_INDEX_SECURITY:
+ break;
default:
return -EINVAL;
}
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_getxattr(dentry->d_inode, type, name,
- buffer, size);
+ return f2fs_getxattr(dentry->d_inode, type, name, buffer, size);
}
static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
@@ -93,13 +99,15 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
break;
+ case F2FS_XATTR_INDEX_SECURITY:
+ break;
default:
return -EINVAL;
}
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_setxattr(dentry->d_inode, type, name, value, size);
+ return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL);
}
static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
@@ -145,6 +153,31 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
return 0;
}
+#ifdef CONFIG_F2FS_FS_SECURITY
+static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *page)
+{
+ const struct xattr *xattr;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, (struct page *)page);
+ if (err < 0)
+ break;
+ }
+ return err;
+}
+
+int f2fs_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr, struct page *ipage)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &f2fs_initxattrs, ipage);
+}
+#endif
+
const struct xattr_handler f2fs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.flags = F2FS_XATTR_INDEX_USER,
@@ -169,6 +202,14 @@ const struct xattr_handler f2fs_xattr_advise_handler = {
.set = f2fs_xattr_advise_set,
};
+const struct xattr_handler f2fs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .flags = F2FS_XATTR_INDEX_SECURITY,
+ .list = f2fs_xattr_generic_list,
+ .get = f2fs_xattr_generic_get,
+ .set = f2fs_xattr_generic_set,
+};
+
static const struct xattr_handler *f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
#ifdef CONFIG_F2FS_FS_POSIX_ACL
@@ -176,6 +217,9 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
#endif
[F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+ [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler,
+#endif
[F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
};
@@ -186,6 +230,9 @@ const struct xattr_handler *f2fs_xattr_handlers[] = {
&f2fs_xattr_acl_default_handler,
#endif
&f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+ &f2fs_xattr_security_handler,
+#endif
&f2fs_xattr_advise_handler,
NULL,
};
@@ -218,6 +265,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
return -ENODATA;
page = get_node_page(sbi, fi->i_xattr_nid);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
base_addr = page_address(page);
list_for_each_xattr(entry, base_addr) {
@@ -268,6 +317,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
return 0;
page = get_node_page(sbi, fi->i_xattr_nid);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
base_addr = page_address(page);
list_for_each_xattr(entry, base_addr) {
@@ -296,7 +347,7 @@ cleanup:
}
int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
- const void *value, size_t value_len)
+ const void *value, size_t value_len, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -307,37 +358,40 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
int error, found, free, newsize;
size_t name_len;
char *pval;
+ int ilock;
if (name == NULL)
return -EINVAL;
- name_len = strlen(name);
if (value == NULL)
value_len = 0;
- if (name_len > 255 || value_len > MAX_VALUE_LEN)
+ name_len = strlen(name);
+
+ if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN)
return -ERANGE;
f2fs_balance_fs(sbi);
- mutex_lock_op(sbi, NODE_NEW);
+ ilock = mutex_lock_op(sbi);
+
if (!fi->i_xattr_nid) {
/* Allocate new attribute block */
struct dnode_of_data dn;
if (!alloc_nid(sbi, &fi->i_xattr_nid)) {
- mutex_unlock_op(sbi, NODE_NEW);
- return -ENOSPC;
+ error = -ENOSPC;
+ goto exit;
}
set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
mark_inode_dirty(inode);
- page = new_node_page(&dn, XATTR_NODE_OFFSET);
+ page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
if (IS_ERR(page)) {
alloc_nid_failed(sbi, fi->i_xattr_nid);
fi->i_xattr_nid = 0;
- mutex_unlock_op(sbi, NODE_NEW);
- return PTR_ERR(page);
+ error = PTR_ERR(page);
+ goto exit;
}
alloc_nid_done(sbi, fi->i_xattr_nid);
@@ -349,8 +403,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
/* The inode already has an extended attribute block. */
page = get_node_page(sbi, fi->i_xattr_nid);
if (IS_ERR(page)) {
- mutex_unlock_op(sbi, NODE_NEW);
- return PTR_ERR(page);
+ error = PTR_ERR(page);
+ goto exit;
}
base_addr = page_address(page);
@@ -432,12 +486,16 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
inode->i_ctime = CURRENT_TIME;
clear_inode_flag(fi, FI_ACL_MODE);
}
- f2fs_write_inode(inode, NULL);
- mutex_unlock_op(sbi, NODE_NEW);
+ if (ipage)
+ update_inode(inode, ipage);
+ else
+ update_inode_page(inode);
+ mutex_unlock_op(sbi, ilock);
return 0;
cleanup:
f2fs_put_page(page, 1);
- mutex_unlock_op(sbi, NODE_NEW);
+exit:
+ mutex_unlock_op(sbi, ilock);
return error;
}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 49c9558305e3..3c0817bef25d 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -112,21 +112,19 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler;
extern const struct xattr_handler f2fs_xattr_acl_access_handler;
extern const struct xattr_handler f2fs_xattr_acl_default_handler;
extern const struct xattr_handler f2fs_xattr_advise_handler;
+extern const struct xattr_handler f2fs_xattr_security_handler;
extern const struct xattr_handler *f2fs_xattr_handlers[];
-extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
- const void *value, size_t value_len);
-extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
- void *buffer, size_t buffer_size);
-extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
- size_t buffer_size);
-
+extern int f2fs_setxattr(struct inode *, int, const char *,
+ const void *, size_t, struct page *);
+extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t);
+extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
#else
#define f2fs_xattr_handlers NULL
static inline int f2fs_setxattr(struct inode *inode, int name_index,
- const char *name, const void *value, size_t value_len)
+ const char *name, const void *value, size_t value_len)
{
return -EOPNOTSUPP;
}
@@ -142,4 +140,14 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
}
#endif
+#ifdef CONFIG_F2FS_FS_SECURITY
+extern int f2fs_init_security(struct inode *, struct inode *,
+ const struct qstr *, struct page *);
+#else
+static inline int f2fs_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr, struct page *ipage)
+{
+ return 0;
+}
+#endif
#endif /* __F2FS_XATTR_H__ */
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 165012ef363a..3963ede84eb0 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -543,6 +543,7 @@ end_of_dir:
EXPORT_SYMBOL_GPL(fat_search_long);
struct fat_ioctl_filldir_callback {
+ struct dir_context ctx;
void __user *dirent;
int result;
/* for dir ioctl */
@@ -552,8 +553,9 @@ struct fat_ioctl_filldir_callback {
int short_len;
};
-static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
- filldir_t filldir, int short_only, int both)
+static int __fat_readdir(struct inode *inode, struct file *file,
+ struct dir_context *ctx, int short_only,
+ struct fat_ioctl_filldir_callback *both)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -564,27 +566,20 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
unsigned char bufname[FAT_MAX_SHORT_SIZE];
int isvfat = sbi->options.isvfat;
const char *fill_name = NULL;
- unsigned long inum;
- unsigned long lpos, dummy, *furrfu = &lpos;
+ int fake_offset = 0;
loff_t cpos;
int short_len = 0, fill_len = 0;
int ret = 0;
mutex_lock(&sbi->s_lock);
- cpos = filp->f_pos;
+ cpos = ctx->pos;
/* Fake . and .. for the root directory. */
if (inode->i_ino == MSDOS_ROOT_INO) {
- while (cpos < 2) {
- if (filldir(dirent, "..", cpos+1, cpos,
- MSDOS_ROOT_INO, DT_DIR) < 0)
- goto out;
- cpos++;
- filp->f_pos++;
- }
- if (cpos == 2) {
- dummy = 2;
- furrfu = &dummy;
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+ if (ctx->pos == 2) {
+ fake_offset = 1;
cpos = 0;
}
}
@@ -619,7 +614,7 @@ parse_record:
int status = fat_parse_long(inode, &cpos, &bh, &de,
&unicode, &nr_slots);
if (status < 0) {
- filp->f_pos = cpos;
+ ctx->pos = cpos;
ret = status;
goto out;
} else if (status == PARSE_INVALID)
@@ -639,6 +634,19 @@ parse_record:
/* !both && !short_only, so we don't need shortname. */
if (!both)
goto start_filldir;
+
+ short_len = fat_parse_short(sb, de, bufname,
+ sbi->options.dotsOK);
+ if (short_len == 0)
+ goto record_end;
+ /* hack for fat_ioctl_filldir() */
+ both->longname = fill_name;
+ both->long_len = fill_len;
+ both->shortname = bufname;
+ both->short_len = short_len;
+ fill_name = NULL;
+ fill_len = 0;
+ goto start_filldir;
}
}
@@ -646,28 +654,21 @@ parse_record:
if (short_len == 0)
goto record_end;
- if (nr_slots) {
- /* hack for fat_ioctl_filldir() */
- struct fat_ioctl_filldir_callback *p = dirent;
-
- p->longname = fill_name;
- p->long_len = fill_len;
- p->shortname = bufname;
- p->short_len = short_len;
- fill_name = NULL;
- fill_len = 0;
- } else {
- fill_name = bufname;
- fill_len = short_len;
- }
+ fill_name = bufname;
+ fill_len = short_len;
start_filldir:
- lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
- if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME))
- inum = inode->i_ino;
- else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
- inum = parent_ino(filp->f_path.dentry);
+ if (!fake_offset)
+ ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
+
+ if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
+ if (!dir_emit_dot(file, ctx))
+ goto fill_failed;
+ } else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
+ if (!dir_emit_dotdot(file, ctx))
+ goto fill_failed;
} else {
+ unsigned long inum;
loff_t i_pos = fat_make_i_pos(sb, bh, de);
struct inode *tmp = fat_iget(sb, i_pos);
if (tmp) {
@@ -675,18 +676,17 @@ start_filldir:
iput(tmp);
} else
inum = iunique(sb, MSDOS_ROOT_INO);
+ if (!dir_emit(ctx, fill_name, fill_len, inum,
+ (de->attr & ATTR_DIR) ? DT_DIR : DT_REG))
+ goto fill_failed;
}
- if (filldir(dirent, fill_name, fill_len, *furrfu, inum,
- (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0)
- goto fill_failed;
-
record_end:
- furrfu = &lpos;
- filp->f_pos = cpos;
+ fake_offset = 0;
+ ctx->pos = cpos;
goto get_new;
end_of_dir:
- filp->f_pos = cpos;
+ ctx->pos = cpos;
fill_failed:
brelse(bh);
if (unicode)
@@ -696,10 +696,9 @@ out:
return ret;
}
-static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int fat_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
- return __fat_readdir(inode, filp, dirent, filldir, 0, 0);
+ return __fat_readdir(file_inode(file), file, ctx, 0, NULL);
}
#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \
@@ -755,20 +754,25 @@ efault: \
FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)
-static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
+static int fat_ioctl_readdir(struct inode *inode, struct file *file,
void __user *dirent, filldir_t filldir,
int short_only, int both)
{
- struct fat_ioctl_filldir_callback buf;
+ struct fat_ioctl_filldir_callback buf = {
+ .ctx.actor = filldir,
+ .dirent = dirent
+ };
int ret;
buf.dirent = dirent;
buf.result = 0;
mutex_lock(&inode->i_mutex);
+ buf.ctx.pos = file->f_pos;
ret = -ENOENT;
if (!IS_DEADDIR(inode)) {
- ret = __fat_readdir(inode, filp, &buf, filldir,
- short_only, both);
+ ret = __fat_readdir(inode, file, &buf.ctx,
+ short_only, both ? &buf : NULL);
+ file->f_pos = buf.ctx.pos;
}
mutex_unlock(&inode->i_mutex);
if (ret >= 0)
@@ -854,7 +858,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
const struct file_operations fat_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = fat_readdir,
+ .iterate = fat_readdir,
.unlocked_ioctl = fat_dir_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = fat_compat_dir_ioctl,
@@ -964,6 +968,29 @@ int fat_scan(struct inode *dir, const unsigned char *name,
}
EXPORT_SYMBOL_GPL(fat_scan);
+/*
+ * Scans a directory for a given logstart.
+ * Returns an error code or zero.
+ */
+int fat_scan_logstart(struct inode *dir, int i_logstart,
+ struct fat_slot_info *sinfo)
+{
+ struct super_block *sb = dir->i_sb;
+
+ sinfo->slot_off = 0;
+ sinfo->bh = NULL;
+ while (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh,
+ &sinfo->de) >= 0) {
+ if (fat_get_start(MSDOS_SB(sb), sinfo->de) == i_logstart) {
+ sinfo->slot_off -= sizeof(*sinfo->de);
+ sinfo->nr_slots = 1;
+ sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
{
struct super_block *sb = dir->i_sb;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e9cc3f0d58e2..4241e6f39e86 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -23,6 +23,9 @@
#define FAT_ERRORS_PANIC 2 /* panic on error */
#define FAT_ERRORS_RO 3 /* remount r/o on error */
+#define FAT_NFS_STALE_RW 1 /* NFS RW support, can cause ESTALE */
+#define FAT_NFS_NOSTALE_RO 2 /* NFS RO support, no ESTALE issue */
+
struct fat_mount_options {
kuid_t fs_uid;
kgid_t fs_gid;
@@ -34,6 +37,7 @@ struct fat_mount_options {
unsigned short shortname; /* flags for shortname display/create rule */
unsigned char name_check; /* r = relaxed, n = normal, s = strict */
unsigned char errors; /* On error: continue, panic, remount-ro */
+ unsigned char nfs; /* NFS support: nostale_ro, stale_rw */
unsigned short allow_utime;/* permission for setting the [am]time */
unsigned quiet:1, /* set = fake successful chmods and chowns */
showexec:1, /* set = only set x bit for com/exe/bat */
@@ -48,8 +52,7 @@ struct fat_mount_options {
usefree:1, /* Use free_clusters for FAT32 */
tz_set:1, /* Filesystem timestamps' offset set */
rodir:1, /* allow ATTR_RO for directory */
- discard:1, /* Issue discard requests on deletions */
- nfs:1; /* Do extra work needed for NFS export */
+ discard:1; /* Issue discard requests on deletions */
};
#define FAT_HASH_BITS 8
@@ -72,6 +75,7 @@ struct msdos_sb_info {
unsigned long root_cluster; /* first cluster of the root directory */
unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
struct mutex fat_lock;
+ struct mutex nfs_build_inode_lock;
struct mutex s_lock;
unsigned int prev_free; /* previously allocated cluster number */
unsigned int free_clusters; /* -1 if undefined */
@@ -82,6 +86,7 @@ struct msdos_sb_info {
const void *dir_ops; /* Opaque; default directory operations */
int dir_per_block; /* dir entries per block */
int dir_per_block_bits; /* log2(dir_per_block) */
+ unsigned int vol_id; /*volume ID*/
int fatent_shift;
struct fatent_operations *fatent_ops;
@@ -215,6 +220,27 @@ static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus)
+ sbi->data_start;
}
+static inline void fat_get_blknr_offset(struct msdos_sb_info *sbi,
+ loff_t i_pos, sector_t *blknr, int *offset)
+{
+ *blknr = i_pos >> sbi->dir_per_block_bits;
+ *offset = i_pos & (sbi->dir_per_block - 1);
+}
+
+static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
+ struct inode *inode)
+{
+ loff_t i_pos;
+#if BITS_PER_LONG == 32
+ spin_lock(&sbi->inode_hash_lock);
+#endif
+ i_pos = MSDOS_I(inode)->i_pos;
+#if BITS_PER_LONG == 32
+ spin_unlock(&sbi->inode_hash_lock);
+#endif
+ return i_pos;
+}
+
static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len)
{
#ifdef __BIG_ENDIAN
@@ -271,6 +297,8 @@ extern int fat_dir_empty(struct inode *dir);
extern int fat_subdirs(struct inode *dir);
extern int fat_scan(struct inode *dir, const unsigned char *name,
struct fat_slot_info *sinfo);
+extern int fat_scan_logstart(struct inode *dir, int i_logstart,
+ struct fat_slot_info *sinfo);
extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
struct msdos_dir_entry **de);
extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
@@ -348,6 +376,7 @@ extern struct inode *fat_build_inode(struct super_block *sb,
extern int fat_sync_inode(struct inode *inode);
extern int fat_fill_super(struct super_block *sb, void *data, int silent,
int isvfat, void (*setup)(struct super_block *));
+extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de);
extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
struct inode *i2);
@@ -382,12 +411,8 @@ int fat_cache_init(void);
void fat_cache_destroy(void);
/* fat/nfs.c */
-struct fid;
-extern struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type);
-extern struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type);
-extern struct dentry *fat_get_parent(struct dentry *child_dir);
+extern const struct export_operations fat_export_ops;
+extern const struct export_operations fat_export_ops_nostale;
/* helper for printk */
typedef unsigned long long llu;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 3978f8ca1823..9b104f543056 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -114,6 +114,12 @@ out:
return err;
}
+static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ return put_user(sbi->vol_id, user_attr);
+}
+
long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return fat_ioctl_get_attributes(inode, user_attr);
case FAT_IOCTL_SET_ATTRIBUTES:
return fat_ioctl_set_attributes(filp, user_attr);
+ case FAT_IOCTL_GET_VOLUME_ID:
+ return fat_ioctl_get_volume_id(inode, user_attr);
default:
return -ENOTTY; /* Inappropriate ioctl for device */
}
@@ -306,6 +314,11 @@ int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
struct inode *inode = dentry->d_inode;
generic_fillattr(inode, stat);
stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
+
+ if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
+ /* Use i_pos for ino. This is used as fileid of nfs. */
+ stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode);
+ }
return 0;
}
EXPORT_SYMBOL_GPL(fat_getattr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index acf6e479b443..11b51bb55b42 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -18,8 +18,8 @@
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/buffer_head.h>
-#include <linux/exportfs.h>
#include <linux/mount.h>
+#include <linux/aio.h>
#include <linux/vfs.h>
#include <linux/parser.h>
#include <linux/uio.h>
@@ -385,7 +385,7 @@ static int fat_calc_dir_size(struct inode *inode)
}
/* doesn't deal with root inode */
-static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
+int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
{
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
int error;
@@ -444,12 +444,25 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
return 0;
}
+static inline void fat_lock_build_inode(struct msdos_sb_info *sbi)
+{
+ if (sbi->options.nfs == FAT_NFS_NOSTALE_RO)
+ mutex_lock(&sbi->nfs_build_inode_lock);
+}
+
+static inline void fat_unlock_build_inode(struct msdos_sb_info *sbi)
+{
+ if (sbi->options.nfs == FAT_NFS_NOSTALE_RO)
+ mutex_unlock(&sbi->nfs_build_inode_lock);
+}
+
struct inode *fat_build_inode(struct super_block *sb,
struct msdos_dir_entry *de, loff_t i_pos)
{
struct inode *inode;
int err;
+ fat_lock_build_inode(MSDOS_SB(sb));
inode = fat_iget(sb, i_pos);
if (inode)
goto out;
@@ -469,6 +482,7 @@ struct inode *fat_build_inode(struct super_block *sb,
fat_attach(inode, i_pos);
insert_inode_hash(inode);
out:
+ fat_unlock_build_inode(MSDOS_SB(sb));
return inode;
}
@@ -655,20 +669,6 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
- struct inode *inode)
-{
- loff_t i_pos;
-#if BITS_PER_LONG == 32
- spin_lock(&sbi->inode_hash_lock);
-#endif
- i_pos = MSDOS_I(inode)->i_pos;
-#if BITS_PER_LONG == 32
- spin_unlock(&sbi->inode_hash_lock);
-#endif
- return i_pos;
-}
-
static int __fat_write_inode(struct inode *inode, int wait)
{
struct super_block *sb = inode->i_sb;
@@ -676,7 +676,8 @@ static int __fat_write_inode(struct inode *inode, int wait)
struct buffer_head *bh;
struct msdos_dir_entry *raw_entry;
loff_t i_pos;
- int err;
+ sector_t blocknr;
+ int err, offset;
if (inode->i_ino == MSDOS_ROOT_INO)
return 0;
@@ -686,7 +687,8 @@ retry:
if (!i_pos)
return 0;
- bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
+ fat_get_blknr_offset(sbi, i_pos, &blocknr, &offset);
+ bh = sb_bread(sb, blocknr);
if (!bh) {
fat_msg(sb, KERN_ERR, "unable to read inode block "
"for updating (i_pos %lld)", i_pos);
@@ -699,8 +701,7 @@ retry:
goto retry;
}
- raw_entry = &((struct msdos_dir_entry *) (bh->b_data))
- [i_pos & (sbi->dir_per_block - 1)];
+ raw_entry = &((struct msdos_dir_entry *) (bh->b_data))[offset];
if (S_ISDIR(inode->i_mode))
raw_entry->size = 0;
else
@@ -761,12 +762,6 @@ static const struct super_operations fat_sops = {
.show_options = fat_show_options,
};
-static const struct export_operations fat_export_ops = {
- .fh_to_dentry = fat_fh_to_dentry,
- .fh_to_parent = fat_fh_to_parent,
- .get_parent = fat_get_parent,
-};
-
static int fat_show_options(struct seq_file *m, struct dentry *root)
{
struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb);
@@ -814,8 +809,6 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",usefree");
if (opts->quiet)
seq_puts(m, ",quiet");
- if (opts->nfs)
- seq_puts(m, ",nfs");
if (opts->showexec)
seq_puts(m, ",showexec");
if (opts->sys_immutable)
@@ -849,6 +842,10 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",errors=panic");
else
seq_puts(m, ",errors=remount-ro");
+ if (opts->nfs == FAT_NFS_NOSTALE_RO)
+ seq_puts(m, ",nfs=nostale_ro");
+ else if (opts->nfs)
+ seq_puts(m, ",nfs=stale_rw");
if (opts->discard)
seq_puts(m, ",discard");
@@ -865,7 +862,7 @@ enum {
Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
- Opt_err,
+ Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err,
};
static const match_table_t fat_tokens = {
@@ -895,7 +892,9 @@ static const match_table_t fat_tokens = {
{Opt_err_panic, "errors=panic"},
{Opt_err_ro, "errors=remount-ro"},
{Opt_discard, "discard"},
- {Opt_nfs, "nfs"},
+ {Opt_nfs_stale_rw, "nfs"},
+ {Opt_nfs_stale_rw, "nfs=stale_rw"},
+ {Opt_nfs_nostale_ro, "nfs=nostale_ro"},
{Opt_obsolete, "conv=binary"},
{Opt_obsolete, "conv=text"},
{Opt_obsolete, "conv=auto"},
@@ -1092,6 +1091,12 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
case Opt_err_ro:
opts->errors = FAT_ERRORS_RO;
break;
+ case Opt_nfs_stale_rw:
+ opts->nfs = FAT_NFS_STALE_RW;
+ break;
+ case Opt_nfs_nostale_ro:
+ opts->nfs = FAT_NFS_NOSTALE_RO;
+ break;
/* msdos specific */
case Opt_dots:
@@ -1150,9 +1155,6 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
case Opt_discard:
opts->discard = 1;
break;
- case Opt_nfs:
- opts->nfs = 1;
- break;
/* obsolete mount options */
case Opt_obsolete:
@@ -1183,6 +1185,10 @@ out:
opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH);
if (opts->unicode_xlate)
opts->utf8 = 0;
+ if (opts->nfs == FAT_NFS_NOSTALE_RO) {
+ sb->s_flags |= MS_RDONLY;
+ sb->s_export_op = &fat_export_ops_nostale;
+ }
return 0;
}
@@ -1193,7 +1199,7 @@ static int fat_read_root(struct inode *inode)
struct msdos_sb_info *sbi = MSDOS_SB(sb);
int error;
- MSDOS_I(inode)->i_pos = 0;
+ MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO;
inode->i_uid = sbi->options.fs_uid;
inode->i_gid = sbi->options.fs_gid;
inode->i_version++;
@@ -1223,6 +1229,19 @@ static int fat_read_root(struct inode *inode)
return 0;
}
+static unsigned long calc_fat_clusters(struct super_block *sb)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+
+ /* Divide first to avoid overflow */
+ if (sbi->fat_bits != 12) {
+ unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits;
+ return ent_per_sec * sbi->fat_length;
+ }
+
+ return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
+}
+
/*
* Read the super block of an MS-DOS FS.
*/
@@ -1256,6 +1275,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
sb->s_magic = MSDOS_SUPER_MAGIC;
sb->s_op = &fat_sops;
sb->s_export_op = &fat_export_ops;
+ mutex_init(&sbi->nfs_build_inode_lock);
ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -1395,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
brelse(fsinfo_bh);
}
+ /* interpret volume ID as a little endian 32 bit integer */
+ if (sbi->fat_bits == 32)
+ sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
+ ((u32)b->fat32.vol_id[1] << 8) |
+ ((u32)b->fat32.vol_id[2] << 16) |
+ ((u32)b->fat32.vol_id[3] << 24));
+ else /* fat 16 or 12 */
+ sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
+ ((u32)b->fat16.vol_id[1] << 8) |
+ ((u32)b->fat16.vol_id[2] << 16) |
+ ((u32)b->fat16.vol_id[3] << 24));
+
sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
@@ -1427,7 +1459,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
sbi->dirty = b->fat16.state & FAT_STATE_DIRTY;
/* check that FAT table does not overflow */
- fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
+ fat_clusters = calc_fat_clusters(sb);
total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
if (total_clusters > MAX_FAT(sb)) {
if (!silent)
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 359d307b5507..628e22a5a543 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf);
+ fat_msg(sb, KERN_ERR, "error, %pV", &vaf);
va_end(args);
}
@@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
sb->s_flags |= MS_RDONLY;
- printk(KERN_ERR "FAT-fs (%s): Filesystem has been "
- "set read-only\n", sb->s_id);
+ fat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
}
}
EXPORT_SYMBOL_GPL(__fat_fs_error);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 081b759cff83..a783b0e1272a 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,8 +148,7 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
* that the existing dentry can be used. The msdos fs routines will
* return ENOENT or EINVAL as appropriate.
*/
-static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
- struct qstr *qstr)
+static int msdos_hash(const struct dentry *dentry, struct qstr *qstr)
{
struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
unsigned char msdos_name[MSDOS_NAME];
@@ -165,8 +164,7 @@ static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
* Compare two msdos names. If either of the names are invalid,
* we fall back to doing the standard name comparison.
*/
-static int msdos_cmp(const struct dentry *parent, const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+static int msdos_cmp(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 2da952036a3d..6df8d3d885e5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -107,8 +107,7 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr)
* that the existing dentry can be used. The vfat fs routines will
* return ENOENT or EINVAL as appropriate.
*/
-static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
- struct qstr *qstr)
+static int vfat_hash(const struct dentry *dentry, struct qstr *qstr)
{
qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
return 0;
@@ -120,8 +119,7 @@ static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
* that the existing dentry can be used. The vfat fs routines will
* return ENOENT or EINVAL as appropriate.
*/
-static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
- struct qstr *qstr)
+static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr)
{
struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
const unsigned char *name;
@@ -142,8 +140,7 @@ static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
/*
* Case insensitive compare of two vfat names.
*/
-static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
@@ -162,8 +159,7 @@ static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
/*
* Case sensitive compare of two vfat names.
*/
-static int vfat_cmp(const struct dentry *parent, const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+static int vfat_cmp(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
unsigned int alen, blen;
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index 499c10438ca2..93e14933dcb6 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -14,6 +14,18 @@
#include <linux/exportfs.h>
#include "fat.h"
+struct fat_fid {
+ u32 i_gen;
+ u32 i_pos_low;
+ u16 i_pos_hi;
+ u16 parent_i_pos_hi;
+ u32 parent_i_pos_low;
+ u32 parent_i_gen;
+};
+
+#define FAT_FID_SIZE_WITHOUT_PARENT 3
+#define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32))
+
/**
* Look up a directory inode given its starting cluster.
*/
@@ -38,63 +50,252 @@ static struct inode *fat_dget(struct super_block *sb, int i_logstart)
return inode;
}
-static struct inode *fat_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
+static struct inode *fat_ilookup(struct super_block *sb, u64 ino, loff_t i_pos)
{
- struct inode *inode;
+ if (MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO)
+ return fat_iget(sb, i_pos);
- if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO))
- return NULL;
+ else {
+ if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO))
+ return NULL;
+ return ilookup(sb, ino);
+ }
+}
+
+static struct inode *__fat_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation, loff_t i_pos)
+{
+ struct inode *inode = fat_ilookup(sb, ino, i_pos);
- inode = ilookup(sb, ino);
if (inode && generation && (inode->i_generation != generation)) {
iput(inode);
inode = NULL;
}
+ if (inode == NULL && MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
+ struct buffer_head *bh = NULL;
+ struct msdos_dir_entry *de ;
+ sector_t blocknr;
+ int offset;
+ fat_get_blknr_offset(MSDOS_SB(sb), i_pos, &blocknr, &offset);
+ bh = sb_bread(sb, blocknr);
+ if (!bh) {
+ fat_msg(sb, KERN_ERR,
+ "unable to read block(%llu) for building NFS inode",
+ (llu)blocknr);
+ return inode;
+ }
+ de = (struct msdos_dir_entry *)bh->b_data;
+ /* If a file is deleted on server and client is not updated
+ * yet, we must not build the inode upon a lookup call.
+ */
+ if (IS_FREE(de[offset].name))
+ inode = NULL;
+ else
+ inode = fat_build_inode(sb, &de[offset], i_pos);
+ brelse(bh);
+ }
return inode;
}
+static struct inode *fat_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation)
+{
+
+ return __fat_nfs_get_inode(sb, ino, generation, 0);
+}
+
+static int
+fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp,
+ struct inode *parent)
+{
+ int len = *lenp;
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct fat_fid *fid = (struct fat_fid *) fh;
+ loff_t i_pos;
+ int type = FILEID_FAT_WITHOUT_PARENT;
+
+ if (parent) {
+ if (len < FAT_FID_SIZE_WITH_PARENT) {
+ *lenp = FAT_FID_SIZE_WITH_PARENT;
+ return FILEID_INVALID;
+ }
+ } else {
+ if (len < FAT_FID_SIZE_WITHOUT_PARENT) {
+ *lenp = FAT_FID_SIZE_WITHOUT_PARENT;
+ return FILEID_INVALID;
+ }
+ }
+
+ i_pos = fat_i_pos_read(sbi, inode);
+ *lenp = FAT_FID_SIZE_WITHOUT_PARENT;
+ fid->i_gen = inode->i_generation;
+ fid->i_pos_low = i_pos & 0xFFFFFFFF;
+ fid->i_pos_hi = (i_pos >> 32) & 0xFFFF;
+ if (parent) {
+ i_pos = fat_i_pos_read(sbi, parent);
+ fid->parent_i_pos_hi = (i_pos >> 32) & 0xFFFF;
+ fid->parent_i_pos_low = i_pos & 0xFFFFFFFF;
+ fid->parent_i_gen = parent->i_generation;
+ type = FILEID_FAT_WITH_PARENT;
+ *lenp = FAT_FID_SIZE_WITH_PARENT;
+ }
+
+ return type;
+}
+
/**
* Map a NFS file handle to a corresponding dentry.
* The dentry may or may not be connected to the filesystem root.
*/
-struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid,
+static struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
fat_nfs_get_inode);
}
+static struct dentry *fat_fh_to_dentry_nostale(struct super_block *sb,
+ struct fid *fh, int fh_len,
+ int fh_type)
+{
+ struct inode *inode = NULL;
+ struct fat_fid *fid = (struct fat_fid *)fh;
+ loff_t i_pos;
+
+ switch (fh_type) {
+ case FILEID_FAT_WITHOUT_PARENT:
+ if (fh_len < FAT_FID_SIZE_WITHOUT_PARENT)
+ return NULL;
+ break;
+ case FILEID_FAT_WITH_PARENT:
+ if (fh_len < FAT_FID_SIZE_WITH_PARENT)
+ return NULL;
+ break;
+ default:
+ return NULL;
+ }
+ i_pos = fid->i_pos_hi;
+ i_pos = (i_pos << 32) | (fid->i_pos_low);
+ inode = __fat_nfs_get_inode(sb, 0, fid->i_gen, i_pos);
+
+ return d_obtain_alias(inode);
+}
+
/*
* Find the parent for a file specified by NFS handle.
* This requires that the handle contain the i_ino of the parent.
*/
-struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
+static struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
return generic_fh_to_parent(sb, fid, fh_len, fh_type,
fat_nfs_get_inode);
}
+static struct dentry *fat_fh_to_parent_nostale(struct super_block *sb,
+ struct fid *fh, int fh_len,
+ int fh_type)
+{
+ struct inode *inode = NULL;
+ struct fat_fid *fid = (struct fat_fid *)fh;
+ loff_t i_pos;
+
+ if (fh_len < FAT_FID_SIZE_WITH_PARENT)
+ return NULL;
+
+ switch (fh_type) {
+ case FILEID_FAT_WITH_PARENT:
+ i_pos = fid->parent_i_pos_hi;
+ i_pos = (i_pos << 32) | (fid->parent_i_pos_low);
+ inode = __fat_nfs_get_inode(sb, 0, fid->parent_i_gen, i_pos);
+ break;
+ }
+
+ return d_obtain_alias(inode);
+}
+
+/*
+ * Rebuild the parent for a directory that is not connected
+ * to the filesystem root
+ */
+static
+struct inode *fat_rebuild_parent(struct super_block *sb, int parent_logstart)
+{
+ int search_clus, clus_to_match;
+ struct msdos_dir_entry *de;
+ struct inode *parent = NULL;
+ struct inode *dummy_grand_parent = NULL;
+ struct fat_slot_info sinfo;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ sector_t blknr = fat_clus_to_blknr(sbi, parent_logstart);
+ struct buffer_head *parent_bh = sb_bread(sb, blknr);
+ if (!parent_bh) {
+ fat_msg(sb, KERN_ERR,
+ "unable to read cluster of parent directory");
+ return NULL;
+ }
+
+ de = (struct msdos_dir_entry *) parent_bh->b_data;
+ clus_to_match = fat_get_start(sbi, &de[0]);
+ search_clus = fat_get_start(sbi, &de[1]);
+
+ dummy_grand_parent = fat_dget(sb, search_clus);
+ if (!dummy_grand_parent) {
+ dummy_grand_parent = new_inode(sb);
+ if (!dummy_grand_parent) {
+ brelse(parent_bh);
+ return parent;
+ }
+
+ dummy_grand_parent->i_ino = iunique(sb, MSDOS_ROOT_INO);
+ fat_fill_inode(dummy_grand_parent, &de[1]);
+ MSDOS_I(dummy_grand_parent)->i_pos = -1;
+ }
+
+ if (!fat_scan_logstart(dummy_grand_parent, clus_to_match, &sinfo))
+ parent = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+
+ brelse(parent_bh);
+ iput(dummy_grand_parent);
+
+ return parent;
+}
+
/*
* Find the parent for a directory that is not currently connected to
* the filesystem root.
*
* On entry, the caller holds child_dir->d_inode->i_mutex.
*/
-struct dentry *fat_get_parent(struct dentry *child_dir)
+static struct dentry *fat_get_parent(struct dentry *child_dir)
{
struct super_block *sb = child_dir->d_sb;
struct buffer_head *bh = NULL;
struct msdos_dir_entry *de;
struct inode *parent_inode = NULL;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
if (!fat_get_dotdot_entry(child_dir->d_inode, &bh, &de)) {
- int parent_logstart = fat_get_start(MSDOS_SB(sb), de);
+ int parent_logstart = fat_get_start(sbi, de);
parent_inode = fat_dget(sb, parent_logstart);
+ if (!parent_inode && sbi->options.nfs == FAT_NFS_NOSTALE_RO)
+ parent_inode = fat_rebuild_parent(sb, parent_logstart);
}
brelse(bh);
return d_obtain_alias(parent_inode);
}
+
+const struct export_operations fat_export_ops = {
+ .fh_to_dentry = fat_fh_to_dentry,
+ .fh_to_parent = fat_fh_to_parent,
+ .get_parent = fat_get_parent,
+};
+
+const struct export_operations fat_export_ops_nostale = {
+ .encode_fh = fat_encode_fh_nostale,
+ .fh_to_dentry = fat_fh_to_dentry_nostale,
+ .fh_to_parent = fat_fh_to_parent_nostale,
+ .get_parent = fat_get_parent,
+};
diff --git a/fs/fifo.c b/fs/fifo.c
deleted file mode 100644
index cf6f4345ceb0..000000000000
--- a/fs/fifo.c
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * linux/fs/fifo.c
- *
- * written by Paul H. Hargrove
- *
- * Fixes:
- * 10-06-1999, AV: fixed OOM handling in fifo_open(), moved
- * initialization there, switched to external
- * allocation of pipe_inode_info.
- */
-
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/pipe_fs_i.h>
-
-static int wait_for_partner(struct inode* inode, unsigned int *cnt)
-{
- int cur = *cnt;
-
- while (cur == *cnt) {
- pipe_wait(inode->i_pipe);
- if (signal_pending(current))
- break;
- }
- return cur == *cnt ? -ERESTARTSYS : 0;
-}
-
-static void wake_up_partner(struct inode* inode)
-{
- wake_up_interruptible(&inode->i_pipe->wait);
-}
-
-static int fifo_open(struct inode *inode, struct file *filp)
-{
- struct pipe_inode_info *pipe;
- int ret;
-
- mutex_lock(&inode->i_mutex);
- pipe = inode->i_pipe;
- if (!pipe) {
- ret = -ENOMEM;
- pipe = alloc_pipe_info(inode);
- if (!pipe)
- goto err_nocleanup;
- inode->i_pipe = pipe;
- }
- filp->f_version = 0;
-
- /* We can only do regular read/write on fifos */
- filp->f_mode &= (FMODE_READ | FMODE_WRITE);
-
- switch (filp->f_mode) {
- case FMODE_READ:
- /*
- * O_RDONLY
- * POSIX.1 says that O_NONBLOCK means return with the FIFO
- * opened, even when there is no process writing the FIFO.
- */
- filp->f_op = &read_pipefifo_fops;
- pipe->r_counter++;
- if (pipe->readers++ == 0)
- wake_up_partner(inode);
-
- if (!pipe->writers) {
- if ((filp->f_flags & O_NONBLOCK)) {
- /* suppress POLLHUP until we have
- * seen a writer */
- filp->f_version = pipe->w_counter;
- } else {
- if (wait_for_partner(inode, &pipe->w_counter))
- goto err_rd;
- }
- }
- break;
-
- case FMODE_WRITE:
- /*
- * O_WRONLY
- * POSIX.1 says that O_NONBLOCK means return -1 with
- * errno=ENXIO when there is no process reading the FIFO.
- */
- ret = -ENXIO;
- if ((filp->f_flags & O_NONBLOCK) && !pipe->readers)
- goto err;
-
- filp->f_op = &write_pipefifo_fops;
- pipe->w_counter++;
- if (!pipe->writers++)
- wake_up_partner(inode);
-
- if (!pipe->readers) {
- if (wait_for_partner(inode, &pipe->r_counter))
- goto err_wr;
- }
- break;
-
- case FMODE_READ | FMODE_WRITE:
- /*
- * O_RDWR
- * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
- * This implementation will NEVER block on a O_RDWR open, since
- * the process can at least talk to itself.
- */
- filp->f_op = &rdwr_pipefifo_fops;
-
- pipe->readers++;
- pipe->writers++;
- pipe->r_counter++;
- pipe->w_counter++;
- if (pipe->readers == 1 || pipe->writers == 1)
- wake_up_partner(inode);
- break;
-
- default:
- ret = -EINVAL;
- goto err;
- }
-
- /* Ok! */
- mutex_unlock(&inode->i_mutex);
- return 0;
-
-err_rd:
- if (!--pipe->readers)
- wake_up_interruptible(&pipe->wait);
- ret = -ERESTARTSYS;
- goto err;
-
-err_wr:
- if (!--pipe->writers)
- wake_up_interruptible(&pipe->wait);
- ret = -ERESTARTSYS;
- goto err;
-
-err:
- if (!pipe->readers && !pipe->writers)
- free_pipe_info(inode);
-
-err_nocleanup:
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
-/*
- * Dummy default file-operations: the only thing this does
- * is contain the open that then fills in the correct operations
- * depending on the access mode of the file...
- */
-const struct file_operations def_fifo_fops = {
- .open = fifo_open, /* will set read_ or write_pipefifo_fops */
- .llseek = noop_llseek,
-};
diff --git a/fs/file.c b/fs/file.c
index 3906d9577a18..4a78f981557a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -23,24 +23,10 @@
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
-struct fdtable_defer {
- spinlock_t lock;
- struct work_struct wq;
- struct fdtable *next;
-};
-
int sysctl_nr_open __read_mostly = 1024*1024;
int sysctl_nr_open_min = BITS_PER_LONG;
int sysctl_nr_open_max = 1024 * 1024; /* raised later */
-/*
- * We use this list to defer free fdtables that have vmalloced
- * sets/arrays. By keeping a per-cpu list, we avoid having to embed
- * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
- * this per-task structure.
- */
-static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
-
static void *alloc_fdmem(size_t size)
{
/*
@@ -67,46 +53,9 @@ static void __free_fdtable(struct fdtable *fdt)
kfree(fdt);
}
-static void free_fdtable_work(struct work_struct *work)
-{
- struct fdtable_defer *f =
- container_of(work, struct fdtable_defer, wq);
- struct fdtable *fdt;
-
- spin_lock_bh(&f->lock);
- fdt = f->next;
- f->next = NULL;
- spin_unlock_bh(&f->lock);
- while(fdt) {
- struct fdtable *next = fdt->next;
-
- __free_fdtable(fdt);
- fdt = next;
- }
-}
-
static void free_fdtable_rcu(struct rcu_head *rcu)
{
- struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
- struct fdtable_defer *fddef;
-
- BUG_ON(!fdt);
- BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT);
-
- if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
- kfree(fdt->fd);
- kfree(fdt->open_fds);
- kfree(fdt);
- } else {
- fddef = &get_cpu_var(fdtable_defer_list);
- spin_lock(&fddef->lock);
- fdt->next = fddef->next;
- fddef->next = fdt;
- /* vmallocs are handled from the workqueue context */
- schedule_work(&fddef->wq);
- spin_unlock(&fddef->lock);
- put_cpu_var(fdtable_defer_list);
- }
+ __free_fdtable(container_of(rcu, struct fdtable, rcu));
}
/*
@@ -174,7 +123,6 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
fdt->open_fds = data;
data += nr / BITS_PER_BYTE;
fdt->close_on_exec = data;
- fdt->next = NULL;
return fdt;
@@ -221,7 +169,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
/* Continue as planned */
copy_fdtable(new_fdt, cur_fdt);
rcu_assign_pointer(files->fdt, new_fdt);
- if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
+ if (cur_fdt != &files->fdtab)
call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
} else {
/* Somebody else expanded, so undo our attempt */
@@ -316,7 +264,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
new_fdt->close_on_exec = newf->close_on_exec_init;
new_fdt->open_fds = newf->open_fds_init;
new_fdt->fd = &newf->fd_array[0];
- new_fdt->next = NULL;
spin_lock(&oldf->file_lock);
old_fdt = files_fdtable(oldf);
@@ -490,19 +437,8 @@ void exit_files(struct task_struct *tsk)
}
}
-static void fdtable_defer_list_init(int cpu)
-{
- struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
- spin_lock_init(&fddef->lock);
- INIT_WORK(&fddef->wq, free_fdtable_work);
- fddef->next = NULL;
-}
-
void __init files_defer_init(void)
{
- int i;
- for_each_possible_cpu(i)
- fdtable_defer_list_init(i);
sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
-BITS_PER_LONG;
}
diff --git a/fs/file_table.c b/fs/file_table.c
index cd4d87a82951..08e719b884ca 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -227,7 +227,7 @@ static void __fput(struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
struct vfsmount *mnt = file->f_path.mnt;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file->f_inode;
might_sleep();
@@ -306,17 +306,18 @@ void fput(struct file *file)
{
if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
+ unsigned long flags;
+
file_sb_list_del(file);
- if (unlikely(in_interrupt() || task->flags & PF_KTHREAD)) {
- unsigned long flags;
- spin_lock_irqsave(&delayed_fput_lock, flags);
- list_add(&file->f_u.fu_list, &delayed_fput_list);
- schedule_work(&delayed_fput_work);
- spin_unlock_irqrestore(&delayed_fput_lock, flags);
- return;
+ if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
+ init_task_work(&file->f_u.fu_rcuhead, ____fput);
+ if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
+ return;
}
- init_task_work(&file->f_u.fu_rcuhead, ____fput);
- task_work_add(task, &file->f_u.fu_rcuhead, true);
+ spin_lock_irqsave(&delayed_fput_lock, flags);
+ list_add(&file->f_u.fu_list, &delayed_fput_list);
+ schedule_work(&delayed_fput_work);
+ spin_unlock_irqrestore(&delayed_fput_lock, flags);
}
}
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 664b07a53870..25d4099a4aea 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -49,7 +49,7 @@
static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int);
-static int vxfs_readdir(struct file *, void *, filldir_t);
+static int vxfs_readdir(struct file *, struct dir_context *);
const struct inode_operations vxfs_dir_inode_ops = {
.lookup = vxfs_lookup,
@@ -58,7 +58,7 @@ const struct inode_operations vxfs_dir_inode_ops = {
const struct file_operations vxfs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = vxfs_readdir,
+ .iterate = vxfs_readdir,
};
@@ -235,7 +235,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)
* Zero.
*/
static int
-vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
+vxfs_readdir(struct file *fp, struct dir_context *ctx)
{
struct inode *ip = file_inode(fp);
struct super_block *sbp = ip->i_sb;
@@ -243,20 +243,17 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
u_long page, npages, block, pblocks, nblocks, offset;
loff_t pos;
- switch ((long)fp->f_pos) {
- case 0:
- if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
- goto out;
- fp->f_pos++;
- /* fallthrough */
- case 1:
- if (filler(retp, "..", 2, fp->f_pos, VXFS_INO(ip)->vii_dotdot, DT_DIR) < 0)
- goto out;
- fp->f_pos++;
- /* fallthrough */
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(fp, ctx))
+ return 0;
+ ctx->pos = 1;
}
-
- pos = fp->f_pos - 2;
+ if (ctx->pos == 1) {
+ if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR))
+ return 0;
+ ctx->pos = 2;
+ }
+ pos = ctx->pos - 2;
if (pos > VXFS_DIRROUND(ip->i_size))
return 0;
@@ -270,16 +267,16 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks;
for (; page < npages; page++, block = 0) {
- caddr_t kaddr;
+ char *kaddr;
struct page *pp;
pp = vxfs_get_page(ip->i_mapping, page);
if (IS_ERR(pp))
continue;
- kaddr = (caddr_t)page_address(pp);
+ kaddr = (char *)page_address(pp);
for (; block <= nblocks && block <= pblocks; block++) {
- caddr_t baddr, limit;
+ char *baddr, *limit;
struct vxfs_dirblk *dbp;
struct vxfs_direct *de;
@@ -292,21 +289,18 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
(kaddr + offset) :
(baddr + VXFS_DIRBLKOV(dbp)));
- for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) {
- int over;
-
+ for (; (char *)de <= limit; de = vxfs_next_entry(de)) {
if (!de->d_reclen)
break;
if (!de->d_ino)
continue;
- offset = (caddr_t)de - kaddr;
- over = filler(retp, de->d_name, de->d_namelen,
- ((page << PAGE_CACHE_SHIFT) | offset) + 2,
- de->d_ino, DT_UNKNOWN);
- if (over) {
+ offset = (char *)de - kaddr;
+ ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
+ if (!dir_emit(ctx, de->d_name, de->d_namelen,
+ de->d_ino, DT_UNKNOWN)) {
vxfs_put_page(pp);
- goto done;
+ return 0;
}
}
offset = 0;
@@ -314,9 +308,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
vxfs_put_page(pp);
offset = 0;
}
-
-done:
- fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
-out:
+ ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
return 0;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 21f46fb3a101..68851ff2fd41 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -22,7 +22,6 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
-#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -46,6 +45,7 @@ struct wb_writeback_work {
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
+ unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
@@ -88,20 +88,6 @@ static inline struct inode *wb_inode(struct list_head *head)
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>
-/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
-static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
-{
- if (bdi->wb.task) {
- wake_up_process(bdi->wb.task);
- } else {
- /*
- * The bdi thread isn't there, wake up the forker thread which
- * will create and run it.
- */
- wake_up_process(default_backing_dev_info.wb.task);
- }
-}
-
static void bdi_queue_work(struct backing_dev_info *bdi,
struct wb_writeback_work *work)
{
@@ -109,10 +95,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
spin_lock_bh(&bdi->wb_lock);
list_add_tail(&work->list, &bdi->work_list);
- if (!bdi->wb.task)
- trace_writeback_nothread(bdi, work);
- bdi_wakeup_flusher(bdi);
spin_unlock_bh(&bdi->wb_lock);
+
+ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
}
static void
@@ -127,10 +112,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
*/
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
- if (bdi->wb.task) {
- trace_writeback_nowork(bdi);
- wake_up_process(bdi->wb.task);
- }
+ trace_writeback_nowork(bdi);
+ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
return;
}
@@ -177,9 +160,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
* writeback as soon as there is no other work to do.
*/
trace_writeback_wake_background(bdi);
- spin_lock_bh(&bdi->wb_lock);
- bdi_wakeup_flusher(bdi);
- spin_unlock_bh(&bdi->wb_lock);
+ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
}
/*
@@ -463,9 +444,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
/*
* Make sure to wait on the data before writing out the metadata.
* This is important for filesystems that modify metadata on data
- * I/O completion.
+ * I/O completion. We don't do it for sync(2) writeback because it has a
+ * separate, external IO completion path and ->sync_fs for guaranteeing
+ * inode metadata is written back correctly.
*/
- if (wbc->sync_mode == WB_SYNC_ALL) {
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
int err = filemap_fdatawait(mapping);
if (ret == 0)
ret = err;
@@ -598,6 +581,7 @@ static long writeback_sb_inodes(struct super_block *sb,
.tagged_writepages = work->tagged_writepages,
.for_kupdate = work->for_kupdate,
.for_background = work->for_background,
+ .for_sync = work->for_sync,
.range_cyclic = work->range_cyclic,
.range_start = 0,
.range_end = LLONG_MAX,
@@ -979,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
/*
* Retrieve work items and do the writeback they describe
*/
-long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+static long wb_do_writeback(struct bdi_writeback *wb)
{
struct backing_dev_info *bdi = wb->bdi;
struct wb_writeback_work *work;
@@ -987,12 +971,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
set_bit(BDI_writeback_running, &wb->bdi->state);
while ((work = get_next_work_item(bdi)) != NULL) {
- /*
- * Override sync mode, in case we must wait for completion
- * because this thread is exiting now.
- */
- if (force_wait)
- work->sync_mode = WB_SYNC_ALL;
trace_writeback_exec(bdi, work);
@@ -1020,66 +998,49 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
/*
* Handle writeback of dirty data for the device backed by this bdi. Also
- * wakes up periodically and does kupdated style flushing.
+ * reschedules periodically and does kupdated style flushing.
*/
-int bdi_writeback_thread(void *data)
+void bdi_writeback_workfn(struct work_struct *work)
{
- struct bdi_writeback *wb = data;
+ struct bdi_writeback *wb = container_of(to_delayed_work(work),
+ struct bdi_writeback, dwork);
struct backing_dev_info *bdi = wb->bdi;
long pages_written;
+ set_worker_desc("flush-%s", dev_name(bdi->dev));
current->flags |= PF_SWAPWRITE;
- set_freezable();
- wb->last_active = jiffies;
-
- /*
- * Our parent may run at a different priority, just set us to normal
- */
- set_user_nice(current, 0);
- trace_writeback_thread_start(bdi);
-
- while (!kthread_freezable_should_stop(NULL)) {
+ if (likely(!current_is_workqueue_rescuer() ||
+ list_empty(&bdi->bdi_list))) {
/*
- * Remove own delayed wake-up timer, since we are already awake
- * and we'll take care of the periodic write-back.
+ * The normal path. Keep writing back @bdi until its
+ * work_list is empty. Note that this path is also taken
+ * if @bdi is shutting down even when we're running off the
+ * rescuer as work_list needs to be drained.
*/
- del_timer(&wb->wakeup_timer);
-
- pages_written = wb_do_writeback(wb, 0);
-
+ do {
+ pages_written = wb_do_writeback(wb);
+ trace_writeback_pages_written(pages_written);
+ } while (!list_empty(&bdi->work_list));
+ } else {
+ /*
+ * bdi_wq can't get enough workers and we're running off
+ * the emergency worker. Don't hog it. Hopefully, 1024 is
+ * enough for efficient IO.
+ */
+ pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+ WB_REASON_FORKER_THREAD);
trace_writeback_pages_written(pages_written);
-
- if (pages_written)
- wb->last_active = jiffies;
-
- set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- continue;
- }
-
- if (wb_has_dirty_io(wb) && dirty_writeback_interval)
- schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
- else {
- /*
- * We have nothing to do, so can go sleep without any
- * timeout and save power. When a work is queued or
- * something is made dirty - we will be woken up.
- */
- schedule();
- }
}
- /* Flush any work that raced with us exiting */
- if (!list_empty(&bdi->work_list))
- wb_do_writeback(wb, 1);
+ if (!list_empty(&bdi->work_list) ||
+ (wb_has_dirty_io(wb) && dirty_writeback_interval))
+ queue_delayed_work(bdi_wq, &wb->dwork,
+ msecs_to_jiffies(dirty_writeback_interval * 10));
- trace_writeback_thread_stop(bdi);
- return 0;
+ current->flags &= ~PF_SWAPWRITE;
}
-
/*
* Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
* the whole world.
@@ -1399,6 +1360,7 @@ void sync_inodes_sb(struct super_block *sb)
.range_cyclic = 0,
.done = &done,
.reason = WB_REASON_SYNC,
+ .for_sync = 1,
};
/* Nothing to do? */
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index b52aed1dca97..f7cff367db7f 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -115,7 +115,7 @@ struct fscache_cache *fscache_select_cache_for_object(
struct fscache_object, cookie_link);
cache = object->cache;
- if (object->state >= FSCACHE_OBJECT_DYING ||
+ if (fscache_object_is_dying(object) ||
test_bit(FSCACHE_IOERROR, &cache->flags))
cache = NULL;
@@ -224,8 +224,10 @@ int fscache_add_cache(struct fscache_cache *cache,
BUG_ON(!ifsdef);
cache->flags = 0;
- ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
- ifsdef->state = FSCACHE_OBJECT_ACTIVE;
+ ifsdef->event_mask =
+ ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) &
+ ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+ __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags);
if (!tagname)
tagname = cache->identifier;
@@ -330,25 +332,25 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache,
{
struct fscache_object *object;
- spin_lock(&cache->object_list_lock);
-
while (!list_empty(&cache->object_list)) {
- object = list_entry(cache->object_list.next,
- struct fscache_object, cache_link);
- list_move_tail(&object->cache_link, dying_objects);
+ spin_lock(&cache->object_list_lock);
- _debug("withdraw %p", object->cookie);
+ if (!list_empty(&cache->object_list)) {
+ object = list_entry(cache->object_list.next,
+ struct fscache_object, cache_link);
+ list_move_tail(&object->cache_link, dying_objects);
- spin_lock(&object->lock);
- spin_unlock(&cache->object_list_lock);
- fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
- spin_unlock(&object->lock);
+ _debug("withdraw %p", object->cookie);
+
+ /* This must be done under object_list_lock to prevent
+ * a race with fscache_drop_object().
+ */
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+ }
+ spin_unlock(&cache->object_list_lock);
cond_resched();
- spin_lock(&cache->object_list_lock);
}
-
- spin_unlock(&cache->object_list_lock);
}
/**
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index e2cba1f60c21..0e91a3c9fdb2 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -95,6 +95,11 @@ struct fscache_cookie *__fscache_acquire_cookie(
atomic_set(&cookie->usage, 1);
atomic_set(&cookie->n_children, 0);
+ /* We keep the active count elevated until relinquishment to prevent an
+ * attempt to wake up every time the object operations queue quiesces.
+ */
+ atomic_set(&cookie->n_active, 1);
+
atomic_inc(&parent->usage);
atomic_inc(&parent->n_children);
@@ -177,7 +182,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
cookie->flags =
(1 << FSCACHE_COOKIE_LOOKING_UP) |
- (1 << FSCACHE_COOKIE_CREATING) |
(1 << FSCACHE_COOKIE_NO_DATA_YET);
/* ask the cache to allocate objects for this cookie and its parent
@@ -205,7 +209,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
/* initiate the process of looking up all the objects in the chain
* (done by fscache_initialise_object()) */
- fscache_enqueue_object(object);
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD);
spin_unlock(&cookie->lock);
@@ -285,7 +289,7 @@ static int fscache_alloc_object(struct fscache_cache *cache,
object_already_extant:
ret = -ENOBUFS;
- if (object->state >= FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_dead(object)) {
spin_unlock(&cookie->lock);
goto error;
}
@@ -321,7 +325,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
ret = -EEXIST;
hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) {
if (p->cache == object->cache) {
- if (p->state >= FSCACHE_OBJECT_DYING)
+ if (fscache_object_is_dying(p))
ret = -ENOBUFS;
goto cant_attach_object;
}
@@ -332,7 +336,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
hlist_for_each_entry(p, &cookie->parent->backing_objects,
cookie_link) {
if (p->cache == object->cache) {
- if (p->state >= FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_dying(p)) {
ret = -ENOBUFS;
spin_unlock(&cookie->parent->lock);
goto cant_attach_object;
@@ -400,7 +404,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object,
cookie_link);
- if (object->state < FSCACHE_OBJECT_DYING)
+ if (fscache_object_is_live(object))
fscache_raise_event(
object, FSCACHE_OBJECT_EV_INVALIDATE);
}
@@ -467,9 +471,7 @@ EXPORT_SYMBOL(__fscache_update_cookie);
*/
void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
{
- struct fscache_cache *cache;
struct fscache_object *object;
- unsigned long event;
fscache_stat(&fscache_n_relinquishes);
if (retire)
@@ -481,8 +483,11 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
return;
}
- _enter("%p{%s,%p},%d",
- cookie, cookie->def->name, cookie->netfs_data, retire);
+ _enter("%p{%s,%p,%d},%d",
+ cookie, cookie->def->name, cookie->netfs_data,
+ atomic_read(&cookie->n_active), retire);
+
+ ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
if (atomic_read(&cookie->n_children) != 0) {
printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
@@ -490,62 +495,28 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
BUG();
}
- /* wait for the cookie to finish being instantiated (or to fail) */
- if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
- fscache_stat(&fscache_n_relinquishes_waitcrt);
- wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
- fscache_wait_bit, TASK_UNINTERRUPTIBLE);
- }
-
- event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
+ /* No further netfs-accessing operations on this cookie permitted */
+ set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
+ if (retire)
+ set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
-try_again:
spin_lock(&cookie->lock);
-
- /* break links with all the active objects */
- while (!hlist_empty(&cookie->backing_objects)) {
- int n_reads;
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object,
- cookie_link);
-
- _debug("RELEASE OBJ%x", object->debug_id);
-
- set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
- n_reads = atomic_read(&object->n_reads);
- if (n_reads) {
- int n_ops = object->n_ops;
- int n_in_progress = object->n_in_progress;
- spin_unlock(&cookie->lock);
- printk(KERN_ERR "FS-Cache:"
- " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
- cookie->def->name,
- n_reads, n_ops, n_in_progress);
- wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
- fscache_wait_bit, TASK_UNINTERRUPTIBLE);
- printk("Wait finished\n");
- goto try_again;
- }
-
- /* detach each cache object from the object cookie */
- spin_lock(&object->lock);
- hlist_del_init(&object->cookie_link);
-
- cache = object->cache;
- object->cookie = NULL;
- fscache_raise_event(object, event);
- spin_unlock(&object->lock);
-
- if (atomic_dec_and_test(&cookie->usage))
- /* the cookie refcount shouldn't be reduced to 0 yet */
- BUG();
+ hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
}
+ spin_unlock(&cookie->lock);
- /* detach pointers back to the netfs */
+ /* Wait for cessation of activity requiring access to the netfs (when
+ * n_active reaches 0).
+ */
+ if (!atomic_dec_and_test(&cookie->n_active))
+ wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+
+ /* Clear pointers back to the netfs */
cookie->netfs_data = NULL;
cookie->def = NULL;
-
- spin_unlock(&cookie->lock);
+ BUG_ON(cookie->stores.rnode);
if (cookie->parent) {
ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
@@ -553,7 +524,7 @@ try_again:
atomic_dec(&cookie->parent->n_children);
}
- /* finally dispose of the cookie */
+ /* Dispose of the netfs's link to the cookie */
ASSERTCMP(atomic_read(&cookie->usage), >, 0);
fscache_cookie_put(cookie);
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index f5b4baee7352..10a2ade0bdf8 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -55,6 +55,7 @@ static struct fscache_cookie_def fscache_fsdef_index_def = {
struct fscache_cookie fscache_fsdef_index = {
.usage = ATOMIC_INIT(1),
+ .n_active = ATOMIC_INIT(1),
.lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
.backing_objects = HLIST_HEAD_INIT,
.def = &fscache_fsdef_index_def,
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index ee38fef4be51..12d505bedb5c 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -93,14 +93,11 @@ static inline bool fscache_object_congested(void)
extern int fscache_wait_bit(void *);
extern int fscache_wait_bit_interruptible(void *);
+extern int fscache_wait_atomic_t(atomic_t *);
/*
* object.c
*/
-extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
-
-extern void fscache_withdrawing_object(struct fscache_cache *,
- struct fscache_object *);
extern void fscache_enqueue_object(struct fscache_object *);
/*
@@ -110,8 +107,10 @@ extern void fscache_enqueue_object(struct fscache_object *);
extern const struct file_operations fscache_objlist_fops;
extern void fscache_objlist_add(struct fscache_object *);
+extern void fscache_objlist_remove(struct fscache_object *);
#else
#define fscache_objlist_add(object) do {} while(0)
+#define fscache_objlist_remove(object) do {} while(0)
#endif
/*
@@ -291,6 +290,10 @@ static inline void fscache_raise_event(struct fscache_object *object,
unsigned event)
{
BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
+#if 0
+ printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n",
+ object->debug_id, object->event_mask, (1 << event));
+#endif
if (!test_and_set_bit(event, &object->events) &&
test_bit(event, &object->event_mask))
fscache_enqueue_object(object);
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index f9d856773f79..7c27907e650c 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -205,7 +205,6 @@ int fscache_wait_bit(void *flags)
schedule();
return 0;
}
-EXPORT_SYMBOL(fscache_wait_bit);
/*
* wait_on_bit() sleep function for interruptible waiting
@@ -215,4 +214,12 @@ int fscache_wait_bit_interruptible(void *flags)
schedule();
return signal_pending(current);
}
-EXPORT_SYMBOL(fscache_wait_bit_interruptible);
+
+/*
+ * wait_on_atomic_t() sleep function for uninterruptible waiting
+ */
+int fscache_wait_atomic_t(atomic_t *p)
+{
+ schedule();
+ return 0;
+}
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index e028b8eb1c40..b1bb6117473a 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -40,6 +40,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
/* initialise the primary index cookie */
atomic_set(&netfs->primary_index->usage, 1);
atomic_set(&netfs->primary_index->n_children, 0);
+ atomic_set(&netfs->primary_index->n_active, 1);
netfs->primary_index->def = &fscache_fsdef_netfs_def;
netfs->primary_index->parent = &fscache_fsdef_index;
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index f27c89d17885..e1959efad64f 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -70,13 +70,10 @@ void fscache_objlist_add(struct fscache_object *obj)
write_unlock(&fscache_object_list_lock);
}
-/**
- * fscache_object_destroy - Note that a cache object is about to be destroyed
- * @object: The object to be destroyed
- *
- * Note the imminent destruction and deallocation of a cache object record.
+/*
+ * Remove an object from the object list.
*/
-void fscache_object_destroy(struct fscache_object *obj)
+void fscache_objlist_remove(struct fscache_object *obj)
{
write_lock(&fscache_object_list_lock);
@@ -85,7 +82,6 @@ void fscache_object_destroy(struct fscache_object *obj)
write_unlock(&fscache_object_list_lock);
}
-EXPORT_SYMBOL(fscache_object_destroy);
/*
* find the object in the tree on or after the specified index
@@ -166,15 +162,14 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
{
struct fscache_objlist_data *data = m->private;
struct fscache_object *obj = v;
+ struct fscache_cookie *cookie;
unsigned long config = data->config;
- uint16_t keylen, auxlen;
char _type[3], *type;
- bool no_cookie;
u8 *buf = data->buf, *p;
if ((unsigned long) v == 1) {
seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS"
- " EM EV F S"
+ " EM EV FL S"
" | NETFS_COOKIE_DEF TY FL NETFS_DATA");
if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
FSCACHE_OBJLIST_CONFIG_AUX))
@@ -193,7 +188,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
if ((unsigned long) v == 2) {
seq_puts(m, "======== ======== ==== ===== === === === == ====="
- " == == = ="
+ " == == == ="
" | ================ == == ================");
if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
FSCACHE_OBJLIST_CONFIG_AUX))
@@ -216,10 +211,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
} \
} while(0)
+ cookie = obj->cookie;
if (~config) {
- FILTER(obj->cookie,
+ FILTER(cookie->def,
COOKIE, NOCOOKIE);
- FILTER(obj->state != FSCACHE_OBJECT_ACTIVE ||
+ FILTER(fscache_object_is_active(obj) ||
obj->n_ops != 0 ||
obj->n_obj_ops != 0 ||
obj->flags ||
@@ -235,10 +231,10 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
}
seq_printf(m,
- "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
+ "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ",
obj->debug_id,
obj->parent ? obj->parent->debug_id : -1,
- fscache_object_states_short[obj->state],
+ obj->state->short_name,
obj->n_children,
obj->n_ops,
obj->n_obj_ops,
@@ -250,48 +246,40 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
obj->flags,
work_busy(&obj->work));
- no_cookie = true;
- keylen = auxlen = 0;
- if (obj->cookie) {
- spin_lock(&obj->lock);
- if (obj->cookie) {
- switch (obj->cookie->def->type) {
- case 0:
- type = "IX";
- break;
- case 1:
- type = "DT";
- break;
- default:
- sprintf(_type, "%02u",
- obj->cookie->def->type);
- type = _type;
- break;
- }
+ if (fscache_use_cookie(obj)) {
+ uint16_t keylen = 0, auxlen = 0;
- seq_printf(m, "%-16s %s %2lx %16p",
- obj->cookie->def->name,
- type,
- obj->cookie->flags,
- obj->cookie->netfs_data);
-
- if (obj->cookie->def->get_key &&
- config & FSCACHE_OBJLIST_CONFIG_KEY)
- keylen = obj->cookie->def->get_key(
- obj->cookie->netfs_data,
- buf, 400);
-
- if (obj->cookie->def->get_aux &&
- config & FSCACHE_OBJLIST_CONFIG_AUX)
- auxlen = obj->cookie->def->get_aux(
- obj->cookie->netfs_data,
- buf + keylen, 512 - keylen);
-
- no_cookie = false;
+ switch (cookie->def->type) {
+ case 0:
+ type = "IX";
+ break;
+ case 1:
+ type = "DT";
+ break;
+ default:
+ sprintf(_type, "%02u", cookie->def->type);
+ type = _type;
+ break;
}
- spin_unlock(&obj->lock);
- if (!no_cookie && (keylen > 0 || auxlen > 0)) {
+ seq_printf(m, "%-16s %s %2lx %16p",
+ cookie->def->name,
+ type,
+ cookie->flags,
+ cookie->netfs_data);
+
+ if (cookie->def->get_key &&
+ config & FSCACHE_OBJLIST_CONFIG_KEY)
+ keylen = cookie->def->get_key(cookie->netfs_data,
+ buf, 400);
+
+ if (cookie->def->get_aux &&
+ config & FSCACHE_OBJLIST_CONFIG_AUX)
+ auxlen = cookie->def->get_aux(cookie->netfs_data,
+ buf + keylen, 512 - keylen);
+ fscache_unuse_cookie(obj);
+
+ if (keylen > 0 || auxlen > 0) {
seq_printf(m, " ");
for (p = buf; keylen > 0; keylen--)
seq_printf(m, "%02x", *p++);
@@ -302,12 +290,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
seq_printf(m, "%02x", *p++);
}
}
- }
- if (no_cookie)
- seq_printf(m, "<no_cookie>\n");
- else
seq_printf(m, "\n");
+ } else {
+ seq_printf(m, "<no_netfs>\n");
+ }
return 0;
}
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 50d41c180211..86d75a60b20c 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -15,52 +15,131 @@
#define FSCACHE_DEBUG_LEVEL COOKIE
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/prefetch.h>
#include "internal.h"
-const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
- [FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
- [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
- [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
- [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
- [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
- [FSCACHE_OBJECT_INVALIDATING] = "OBJECT_INVALIDATING",
- [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
- [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
- [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
- [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT",
- [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING",
- [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING",
- [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING",
- [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD",
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_drop_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int);
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_object_available(struct fscache_object *, int);
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int);
+static const struct fscache_state *fscache_update_object(struct fscache_object *, int);
+
+#define __STATE_NAME(n) fscache_osm_##n
+#define STATE(n) (&__STATE_NAME(n))
+
+/*
+ * Define a work state. Work states are execution states. No event processing
+ * is performed by them. The function attached to a work state returns a
+ * pointer indicating the next state to which the state machine should
+ * transition. Returning NO_TRANSIT repeats the current state, but goes back
+ * to the scheduler first.
+ */
+#define WORK_STATE(n, sn, f) \
+ const struct fscache_state __STATE_NAME(n) = { \
+ .name = #n, \
+ .short_name = sn, \
+ .work = f \
+ }
+
+/*
+ * Returns from work states.
+ */
+#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); })
+
+#define NO_TRANSIT ((struct fscache_state *)NULL)
+
+/*
+ * Define a wait state. Wait states are event processing states. No execution
+ * is performed by them. Wait states are just tables of "if event X occurs,
+ * clear it and transition to state Y". The dispatcher returns to the
+ * scheduler if none of the events in which the wait state has an interest are
+ * currently pending.
+ */
+#define WAIT_STATE(n, sn, ...) \
+ const struct fscache_state __STATE_NAME(n) = { \
+ .name = #n, \
+ .short_name = sn, \
+ .work = NULL, \
+ .transitions = { __VA_ARGS__, { 0, NULL } } \
+ }
+
+#define TRANSIT_TO(state, emask) \
+ { .events = (emask), .transit_to = STATE(state) }
+
+/*
+ * The object state machine.
+ */
+static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object);
+static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready);
+static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation);
+static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object);
+static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object);
+static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available);
+static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents);
+
+static WORK_STATE(INVALIDATE_OBJECT, "INVL", fscache_invalidate_object);
+static WORK_STATE(UPDATE_OBJECT, "UPDT", fscache_update_object);
+
+static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure);
+static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object);
+static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents);
+static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object);
+static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL);
+
+static WAIT_STATE(WAIT_FOR_INIT, "?INI",
+ TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_PARENT, "?PRN",
+ TRANSIT_TO(PARENT_READY, 1 << FSCACHE_OBJECT_EV_PARENT_READY));
+
+static WAIT_STATE(WAIT_FOR_CMD, "?CMD",
+ TRANSIT_TO(INVALIDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_INVALIDATE),
+ TRANSIT_TO(UPDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_UPDATE),
+ TRANSIT_TO(JUMPSTART_DEPS, 1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_CLEARANCE, "?CLR",
+ TRANSIT_TO(KILL_OBJECT, 1 << FSCACHE_OBJECT_EV_CLEARED));
+
+/*
+ * Out-of-band event transition tables. These are for handling unexpected
+ * events, such as an I/O error. If an OOB event occurs, the state machine
+ * clears and disables the event and forces a transition to the nominated work
+ * state (a currently executing work state will complete first).
+ *
+ * In such a situation, object->state remembers the state the machine should
+ * have been in/gone to and returning NO_TRANSIT returns to that.
+ */
+static const struct fscache_transition fscache_osm_init_oob[] = {
+ TRANSIT_TO(ABORT_INIT,
+ (1 << FSCACHE_OBJECT_EV_ERROR) |
+ (1 << FSCACHE_OBJECT_EV_KILL)),
+ { 0, NULL }
+};
+
+static const struct fscache_transition fscache_osm_lookup_oob[] = {
+ TRANSIT_TO(LOOKUP_FAILURE,
+ (1 << FSCACHE_OBJECT_EV_ERROR) |
+ (1 << FSCACHE_OBJECT_EV_KILL)),
+ { 0, NULL }
};
-EXPORT_SYMBOL(fscache_object_states);
-
-const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
- [FSCACHE_OBJECT_INIT] = "INIT",
- [FSCACHE_OBJECT_LOOKING_UP] = "LOOK",
- [FSCACHE_OBJECT_CREATING] = "CRTN",
- [FSCACHE_OBJECT_AVAILABLE] = "AVBL",
- [FSCACHE_OBJECT_ACTIVE] = "ACTV",
- [FSCACHE_OBJECT_INVALIDATING] = "INVL",
- [FSCACHE_OBJECT_UPDATING] = "UPDT",
- [FSCACHE_OBJECT_DYING] = "DYNG",
- [FSCACHE_OBJECT_LC_DYING] = "LCDY",
- [FSCACHE_OBJECT_ABORT_INIT] = "ABTI",
- [FSCACHE_OBJECT_RELEASING] = "RELS",
- [FSCACHE_OBJECT_RECYCLING] = "RCYC",
- [FSCACHE_OBJECT_WITHDRAWING] = "WTHD",
- [FSCACHE_OBJECT_DEAD] = "DEAD",
+
+static const struct fscache_transition fscache_osm_run_oob[] = {
+ TRANSIT_TO(KILL_OBJECT,
+ (1 << FSCACHE_OBJECT_EV_ERROR) |
+ (1 << FSCACHE_OBJECT_EV_KILL)),
+ { 0, NULL }
};
static int fscache_get_object(struct fscache_object *);
static void fscache_put_object(struct fscache_object *);
-static void fscache_initialise_object(struct fscache_object *);
-static void fscache_lookup_object(struct fscache_object *);
-static void fscache_object_available(struct fscache_object *);
-static void fscache_invalidate_object(struct fscache_object *);
-static void fscache_release_object(struct fscache_object *);
-static void fscache_withdraw_object(struct fscache_object *);
-static void fscache_enqueue_dependents(struct fscache_object *);
+static bool fscache_enqueue_dependents(struct fscache_object *, int);
static void fscache_dequeue_object(struct fscache_object *);
/*
@@ -75,295 +154,116 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
object->debug_id, parent->debug_id, parent->n_ops);
spin_lock_nested(&parent->lock, 1);
- parent->n_ops--;
parent->n_obj_ops--;
+ parent->n_ops--;
if (parent->n_ops == 0)
fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
spin_unlock(&parent->lock);
}
/*
- * Notify netfs of invalidation completion.
+ * Object state machine dispatcher.
*/
-static inline void fscache_invalidation_complete(struct fscache_cookie *cookie)
+static void fscache_object_sm_dispatcher(struct fscache_object *object)
{
- if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
-}
-
-/*
- * process events that have been sent to an object's state machine
- * - initiates parent lookup
- * - does object lookup
- * - does object creation
- * - does object recycling and retirement
- * - does object withdrawal
- */
-static void fscache_object_state_machine(struct fscache_object *object)
-{
- enum fscache_object_state new_state;
- struct fscache_cookie *cookie;
- int event;
+ const struct fscache_transition *t;
+ const struct fscache_state *state, *new_state;
+ unsigned long events, event_mask;
+ int event = -1;
ASSERT(object != NULL);
_enter("{OBJ%x,%s,%lx}",
- object->debug_id, fscache_object_states[object->state],
- object->events);
-
- switch (object->state) {
- /* wait for the parent object to become ready */
- case FSCACHE_OBJECT_INIT:
- object->event_mask =
- FSCACHE_OBJECT_EVENTS_MASK &
- ~(1 << FSCACHE_OBJECT_EV_CLEARED);
- fscache_initialise_object(object);
- goto done;
-
- /* look up the object metadata on disk */
- case FSCACHE_OBJECT_LOOKING_UP:
- fscache_lookup_object(object);
- goto lookup_transit;
-
- /* create the object metadata on disk */
- case FSCACHE_OBJECT_CREATING:
- fscache_lookup_object(object);
- goto lookup_transit;
-
- /* handle an object becoming available; start pending
- * operations and queue dependent operations for processing */
- case FSCACHE_OBJECT_AVAILABLE:
- fscache_object_available(object);
- goto active_transit;
-
- /* normal running state */
- case FSCACHE_OBJECT_ACTIVE:
- goto active_transit;
-
- /* Invalidate an object on disk */
- case FSCACHE_OBJECT_INVALIDATING:
- clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
- fscache_stat(&fscache_n_invalidates_run);
- fscache_stat(&fscache_n_cop_invalidate_object);
- fscache_invalidate_object(object);
- fscache_stat_d(&fscache_n_cop_invalidate_object);
- fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
- goto active_transit;
-
- /* update the object metadata on disk */
- case FSCACHE_OBJECT_UPDATING:
- clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
- fscache_stat(&fscache_n_updates_run);
- fscache_stat(&fscache_n_cop_update_object);
- object->cache->ops->update_object(object);
- fscache_stat_d(&fscache_n_cop_update_object);
- goto active_transit;
-
- /* handle an object dying during lookup or creation */
- case FSCACHE_OBJECT_LC_DYING:
- object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
- fscache_stat(&fscache_n_cop_lookup_complete);
- object->cache->ops->lookup_complete(object);
- fscache_stat_d(&fscache_n_cop_lookup_complete);
-
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_DYING;
- cookie = object->cookie;
- if (cookie) {
- if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP,
- &cookie->flags))
- wake_up_bit(&cookie->flags,
- FSCACHE_COOKIE_LOOKING_UP);
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
- &cookie->flags))
- wake_up_bit(&cookie->flags,
- FSCACHE_COOKIE_CREATING);
+ object->debug_id, object->state->name, object->events);
+
+ event_mask = object->event_mask;
+restart:
+ object->event_mask = 0; /* Mask normal event handling */
+ state = object->state;
+restart_masked:
+ events = object->events;
+
+ /* Handle any out-of-band events (typically an error) */
+ if (events & object->oob_event_mask) {
+ _debug("{OBJ%x} oob %lx",
+ object->debug_id, events & object->oob_event_mask);
+ for (t = object->oob_table; t->events; t++) {
+ if (events & t->events) {
+ state = t->transit_to;
+ ASSERT(state->work != NULL);
+ event = fls(events & t->events) - 1;
+ __clear_bit(event, &object->oob_event_mask);
+ clear_bit(event, &object->events);
+ goto execute_work_state;
+ }
}
- spin_unlock(&object->lock);
+ }
- fscache_done_parent_op(object);
+ /* Wait states are just transition tables */
+ if (!state->work) {
+ if (events & event_mask) {
+ for (t = state->transitions; t->events; t++) {
+ if (events & t->events) {
+ new_state = t->transit_to;
+ event = fls(events & t->events) - 1;
+ clear_bit(event, &object->events);
+ _debug("{OBJ%x} ev %d: %s -> %s",
+ object->debug_id, event,
+ state->name, new_state->name);
+ object->state = state = new_state;
+ goto execute_work_state;
+ }
+ }
- /* wait for completion of all active operations on this object
- * and the death of all child objects of this object */
- case FSCACHE_OBJECT_DYING:
- dying:
- clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
- spin_lock(&object->lock);
- _debug("dying OBJ%x {%d,%d}",
- object->debug_id, object->n_ops, object->n_children);
- if (object->n_ops == 0 && object->n_children == 0) {
- object->event_mask &=
- ~(1 << FSCACHE_OBJECT_EV_CLEARED);
- object->event_mask |=
- (1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR);
- } else {
- object->event_mask &=
- ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR));
- object->event_mask |=
- 1 << FSCACHE_OBJECT_EV_CLEARED;
+ /* The event mask didn't include all the tabled bits */
+ BUG();
}
- spin_unlock(&object->lock);
- fscache_enqueue_dependents(object);
- fscache_start_operations(object);
- goto terminal_transit;
-
- /* handle an abort during initialisation */
- case FSCACHE_OBJECT_ABORT_INIT:
- _debug("handle abort init %lx", object->events);
- object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
-
- spin_lock(&object->lock);
- fscache_dequeue_object(object);
-
- object->state = FSCACHE_OBJECT_DYING;
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
- &object->cookie->flags))
- wake_up_bit(&object->cookie->flags,
- FSCACHE_COOKIE_CREATING);
- spin_unlock(&object->lock);
- goto dying;
-
- /* handle the netfs releasing an object and possibly marking it
- * obsolete too */
- case FSCACHE_OBJECT_RELEASING:
- case FSCACHE_OBJECT_RECYCLING:
- object->event_mask &=
- ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR));
- fscache_release_object(object);
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_DEAD;
- spin_unlock(&object->lock);
- fscache_stat(&fscache_n_object_dead);
- goto terminal_transit;
-
- /* handle the parent cache of this object being withdrawn from
- * active service */
- case FSCACHE_OBJECT_WITHDRAWING:
- object->event_mask &=
- ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR));
- fscache_withdraw_object(object);
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_DEAD;
- spin_unlock(&object->lock);
- fscache_stat(&fscache_n_object_dead);
- goto terminal_transit;
-
- /* complain about the object being woken up once it is
- * deceased */
- case FSCACHE_OBJECT_DEAD:
- printk(KERN_ERR "FS-Cache:"
- " Unexpected event in dead state %lx\n",
- object->events & object->event_mask);
- BUG();
-
- default:
- printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
- object->state);
- BUG();
- }
-
- /* determine the transition from a lookup state */
-lookup_transit:
- event = fls(object->events & object->event_mask) - 1;
- switch (event) {
- case FSCACHE_OBJECT_EV_WITHDRAW:
- case FSCACHE_OBJECT_EV_RETIRE:
- case FSCACHE_OBJECT_EV_RELEASE:
- case FSCACHE_OBJECT_EV_ERROR:
- new_state = FSCACHE_OBJECT_LC_DYING;
- goto change_state;
- case FSCACHE_OBJECT_EV_INVALIDATE:
- new_state = FSCACHE_OBJECT_INVALIDATING;
- goto change_state;
- case FSCACHE_OBJECT_EV_REQUEUE:
- goto done;
- case -1:
- goto done; /* sleep until event */
- default:
- goto unsupported_event;
+ /* Randomly woke up */
+ goto unmask_events;
}
- /* determine the transition from an active state */
-active_transit:
- event = fls(object->events & object->event_mask) - 1;
- switch (event) {
- case FSCACHE_OBJECT_EV_WITHDRAW:
- case FSCACHE_OBJECT_EV_RETIRE:
- case FSCACHE_OBJECT_EV_RELEASE:
- case FSCACHE_OBJECT_EV_ERROR:
- new_state = FSCACHE_OBJECT_DYING;
- goto change_state;
- case FSCACHE_OBJECT_EV_INVALIDATE:
- new_state = FSCACHE_OBJECT_INVALIDATING;
- goto change_state;
- case FSCACHE_OBJECT_EV_UPDATE:
- new_state = FSCACHE_OBJECT_UPDATING;
- goto change_state;
- case -1:
- new_state = FSCACHE_OBJECT_ACTIVE;
- goto change_state; /* sleep until event */
- default:
- goto unsupported_event;
- }
+execute_work_state:
+ _debug("{OBJ%x} exec %s", object->debug_id, state->name);
- /* determine the transition from a terminal state */
-terminal_transit:
- event = fls(object->events & object->event_mask) - 1;
- switch (event) {
- case FSCACHE_OBJECT_EV_WITHDRAW:
- new_state = FSCACHE_OBJECT_WITHDRAWING;
- goto change_state;
- case FSCACHE_OBJECT_EV_RETIRE:
- new_state = FSCACHE_OBJECT_RECYCLING;
- goto change_state;
- case FSCACHE_OBJECT_EV_RELEASE:
- new_state = FSCACHE_OBJECT_RELEASING;
- goto change_state;
- case FSCACHE_OBJECT_EV_ERROR:
- new_state = FSCACHE_OBJECT_WITHDRAWING;
- goto change_state;
- case FSCACHE_OBJECT_EV_CLEARED:
- new_state = FSCACHE_OBJECT_DYING;
- goto change_state;
- case -1:
- goto done; /* sleep until event */
- default:
- goto unsupported_event;
+ new_state = state->work(object, event);
+ event = -1;
+ if (new_state == NO_TRANSIT) {
+ _debug("{OBJ%x} %s notrans", object->debug_id, state->name);
+ fscache_enqueue_object(object);
+ event_mask = object->oob_event_mask;
+ goto unmask_events;
}
-change_state:
- spin_lock(&object->lock);
- object->state = new_state;
- spin_unlock(&object->lock);
+ _debug("{OBJ%x} %s -> %s",
+ object->debug_id, state->name, new_state->name);
+ object->state = state = new_state;
-done:
- _leave(" [->%s]", fscache_object_states[object->state]);
- return;
+ if (state->work) {
+ if (unlikely(state->work == ((void *)2UL))) {
+ _leave(" [dead]");
+ return;
+ }
+ goto restart_masked;
+ }
-unsupported_event:
- printk(KERN_ERR "FS-Cache:"
- " Unsupported event %d [%lx/%lx] in state %s\n",
- event, object->events, object->event_mask,
- fscache_object_states[object->state]);
- BUG();
+ /* Transited to wait state */
+ event_mask = object->oob_event_mask;
+ for (t = state->transitions; t->events; t++)
+ event_mask |= t->events;
+
+unmask_events:
+ object->event_mask = event_mask;
+ smp_mb();
+ events = object->events;
+ if (events & event_mask)
+ goto restart;
+ _leave(" [msk %lx]", event_mask);
}
/*
* execute an object
*/
-void fscache_object_work_func(struct work_struct *work)
+static void fscache_object_work_func(struct work_struct *work)
{
struct fscache_object *object =
container_of(work, struct fscache_object, work);
@@ -372,14 +272,70 @@ void fscache_object_work_func(struct work_struct *work)
_enter("{OBJ%x}", object->debug_id);
start = jiffies;
- fscache_object_state_machine(object);
+ fscache_object_sm_dispatcher(object);
fscache_hist(fscache_objs_histogram, start);
- if (object->events & object->event_mask)
- fscache_enqueue_object(object);
- clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
fscache_put_object(object);
}
-EXPORT_SYMBOL(fscache_object_work_func);
+
+/**
+ * fscache_object_init - Initialise a cache object description
+ * @object: Object description
+ * @cookie: Cookie object will be attached to
+ * @cache: Cache in which backing object will be found
+ *
+ * Initialise a cache object description to its basic values.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_object_init(struct fscache_object *object,
+ struct fscache_cookie *cookie,
+ struct fscache_cache *cache)
+{
+ const struct fscache_transition *t;
+
+ atomic_inc(&cache->object_count);
+
+ object->state = STATE(WAIT_FOR_INIT);
+ object->oob_table = fscache_osm_init_oob;
+ object->flags = 1 << FSCACHE_OBJECT_IS_LIVE;
+ spin_lock_init(&object->lock);
+ INIT_LIST_HEAD(&object->cache_link);
+ INIT_HLIST_NODE(&object->cookie_link);
+ INIT_WORK(&object->work, fscache_object_work_func);
+ INIT_LIST_HEAD(&object->dependents);
+ INIT_LIST_HEAD(&object->dep_link);
+ INIT_LIST_HEAD(&object->pending_ops);
+ object->n_children = 0;
+ object->n_ops = object->n_in_progress = object->n_exclusive = 0;
+ object->events = 0;
+ object->store_limit = 0;
+ object->store_limit_l = 0;
+ object->cache = cache;
+ object->cookie = cookie;
+ object->parent = NULL;
+
+ object->oob_event_mask = 0;
+ for (t = object->oob_table; t->events; t++)
+ object->oob_event_mask |= t->events;
+ object->event_mask = object->oob_event_mask;
+ for (t = object->state->transitions; t->events; t++)
+ object->event_mask |= t->events;
+}
+EXPORT_SYMBOL(fscache_object_init);
+
+/*
+ * Abort object initialisation before we start it.
+ */
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object,
+ int event)
+{
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ object->oob_event_mask = 0;
+ fscache_dequeue_object(object);
+ return transit_to(KILL_OBJECT);
+}
/*
* initialise an object
@@ -387,130 +343,136 @@ EXPORT_SYMBOL(fscache_object_work_func);
* immediately to do a creation
* - we may need to start the process of creating a parent and we need to wait
* for the parent's lookup and creation to complete if it's not there yet
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- * leaf-most cookies of the object and all its children
*/
-static void fscache_initialise_object(struct fscache_object *object)
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *object,
+ int event)
{
struct fscache_object *parent;
+ bool success;
- _enter("");
- ASSERT(object->cookie != NULL);
- ASSERT(object->cookie->parent != NULL);
-
- if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
- _debug("abort init %lx", object->events);
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_ABORT_INIT;
- spin_unlock(&object->lock);
- return;
- }
+ _enter("{OBJ%x},%d", object->debug_id, event);
- spin_lock(&object->cookie->lock);
- spin_lock_nested(&object->cookie->parent->lock, 1);
+ ASSERT(list_empty(&object->dep_link));
parent = object->parent;
if (!parent) {
- _debug("no parent");
- set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
- } else {
- spin_lock(&object->lock);
- spin_lock_nested(&parent->lock, 1);
- _debug("parent %s", fscache_object_states[parent->state]);
-
- if (parent->state >= FSCACHE_OBJECT_DYING) {
- _debug("bad parent");
- set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
- } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
- _debug("wait");
-
- /* we may get woken up in this state by child objects
- * binding on to us, so we need to make sure we don't
- * add ourself to the list multiple times */
- if (list_empty(&object->dep_link)) {
- fscache_stat(&fscache_n_cop_grab_object);
- object->cache->ops->grab_object(object);
- fscache_stat_d(&fscache_n_cop_grab_object);
- list_add(&object->dep_link,
- &parent->dependents);
-
- /* fscache_acquire_non_index_cookie() uses this
- * to wake the chain up */
- if (parent->state == FSCACHE_OBJECT_INIT)
- fscache_enqueue_object(parent);
- }
- } else {
- _debug("go");
- parent->n_ops++;
- parent->n_obj_ops++;
- object->lookup_jif = jiffies;
- object->state = FSCACHE_OBJECT_LOOKING_UP;
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
- }
+ _leave(" [no parent]");
+ return transit_to(DROP_OBJECT);
+ }
- spin_unlock(&parent->lock);
- spin_unlock(&object->lock);
+ _debug("parent: %s of:%lx", parent->state->name, parent->flags);
+
+ if (fscache_object_is_dying(parent)) {
+ _leave(" [bad parent]");
+ return transit_to(DROP_OBJECT);
}
- spin_unlock(&object->cookie->parent->lock);
- spin_unlock(&object->cookie->lock);
+ if (fscache_object_is_available(parent)) {
+ _leave(" [ready]");
+ return transit_to(PARENT_READY);
+ }
+
+ _debug("wait");
+
+ spin_lock(&parent->lock);
+ fscache_stat(&fscache_n_cop_grab_object);
+ success = false;
+ if (fscache_object_is_live(parent) &&
+ object->cache->ops->grab_object(object)) {
+ list_add(&object->dep_link, &parent->dependents);
+ success = true;
+ }
+ fscache_stat_d(&fscache_n_cop_grab_object);
+ spin_unlock(&parent->lock);
+ if (!success) {
+ _leave(" [grab failed]");
+ return transit_to(DROP_OBJECT);
+ }
+
+ /* fscache_acquire_non_index_cookie() uses this
+ * to wake the chain up */
+ fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD);
+ _leave(" [wait]");
+ return transit_to(WAIT_FOR_PARENT);
+}
+
+/*
+ * Once the parent object is ready, we should kick off our lookup op.
+ */
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *object,
+ int event)
+{
+ struct fscache_object *parent = object->parent;
+
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ ASSERT(parent != NULL);
+
+ spin_lock(&parent->lock);
+ parent->n_ops++;
+ parent->n_obj_ops++;
+ object->lookup_jif = jiffies;
+ spin_unlock(&parent->lock);
+
_leave("");
+ return transit_to(LOOK_UP_OBJECT);
}
/*
* look an object up in the cache from which it was allocated
* - we hold an "access lock" on the parent object, so the parent object cannot
* be withdrawn by either party till we've finished
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- * leaf-most cookies of the object and all its children
*/
-static void fscache_lookup_object(struct fscache_object *object)
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *object,
+ int event)
{
struct fscache_cookie *cookie = object->cookie;
- struct fscache_object *parent;
+ struct fscache_object *parent = object->parent;
int ret;
- _enter("");
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ object->oob_table = fscache_osm_lookup_oob;
- parent = object->parent;
ASSERT(parent != NULL);
ASSERTCMP(parent->n_ops, >, 0);
ASSERTCMP(parent->n_obj_ops, >, 0);
/* make sure the parent is still available */
- ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
-
- if (parent->state >= FSCACHE_OBJECT_DYING ||
- test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
- _debug("unavailable");
- set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
- _leave("");
- return;
+ ASSERT(fscache_object_is_available(parent));
+
+ if (fscache_object_is_dying(parent) ||
+ test_bit(FSCACHE_IOERROR, &object->cache->flags) ||
+ !fscache_use_cookie(object)) {
+ _leave(" [unavailable]");
+ return transit_to(LOOKUP_FAILURE);
}
- _debug("LOOKUP \"%s/%s\" in \"%s\"",
- parent->cookie->def->name, cookie->def->name,
- object->cache->tag->name);
+ _debug("LOOKUP \"%s\" in \"%s\"",
+ cookie->def->name, object->cache->tag->name);
fscache_stat(&fscache_n_object_lookups);
fscache_stat(&fscache_n_cop_lookup_object);
ret = object->cache->ops->lookup_object(object);
fscache_stat_d(&fscache_n_cop_lookup_object);
- if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
- set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+ fscache_unuse_cookie(object);
if (ret == -ETIMEDOUT) {
/* probably stuck behind another object, so move this one to
* the back of the queue */
fscache_stat(&fscache_n_object_lookups_timed_out);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+ _leave(" [timeout]");
+ return NO_TRANSIT;
}
- _leave("");
+ if (ret < 0) {
+ _leave(" [error]");
+ return transit_to(LOOKUP_FAILURE);
+ }
+
+ _leave(" [ok]");
+ return transit_to(OBJECT_AVAILABLE);
}
/**
@@ -524,32 +486,20 @@ void fscache_object_lookup_negative(struct fscache_object *object)
{
struct fscache_cookie *cookie = object->cookie;
- _enter("{OBJ%x,%s}",
- object->debug_id, fscache_object_states[object->state]);
+ _enter("{OBJ%x,%s}", object->debug_id, object->state->name);
- spin_lock(&object->lock);
- if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+ if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
fscache_stat(&fscache_n_object_lookups_negative);
- /* transit here to allow write requests to begin stacking up
- * and read requests to begin returning ENODATA */
- object->state = FSCACHE_OBJECT_CREATING;
- spin_unlock(&object->lock);
-
- set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
+ /* Allow write requests to begin stacking up and read requests to begin
+ * returning ENODATA.
+ */
set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
_debug("wake up lookup %p", &cookie->flags);
- smp_mb__before_clear_bit();
- clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
- smp_mb__after_clear_bit();
+ clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
- } else {
- ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
- spin_unlock(&object->lock);
}
-
_leave("");
}
EXPORT_SYMBOL(fscache_object_lookup_negative);
@@ -568,38 +518,26 @@ void fscache_obtained_object(struct fscache_object *object)
{
struct fscache_cookie *cookie = object->cookie;
- _enter("{OBJ%x,%s}",
- object->debug_id, fscache_object_states[object->state]);
+ _enter("{OBJ%x,%s}", object->debug_id, object->state->name);
/* if we were still looking up, then we must have a positive lookup
* result, in which case there may be data available */
- spin_lock(&object->lock);
- if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+ if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
fscache_stat(&fscache_n_object_lookups_positive);
- clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+ /* We do (presumably) have data */
+ clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
- object->state = FSCACHE_OBJECT_AVAILABLE;
- spin_unlock(&object->lock);
-
- smp_mb__before_clear_bit();
- clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
- smp_mb__after_clear_bit();
+ /* Allow write requests to begin stacking up and read requests
+ * to begin shovelling data.
+ */
+ clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
} else {
- ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
fscache_stat(&fscache_n_object_created);
-
- object->state = FSCACHE_OBJECT_AVAILABLE;
- spin_unlock(&object->lock);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
- smp_wmb();
}
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
-
+ set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
_leave("");
}
EXPORT_SYMBOL(fscache_obtained_object);
@@ -607,15 +545,14 @@ EXPORT_SYMBOL(fscache_obtained_object);
/*
* handle an object that has just become available
*/
-static void fscache_object_available(struct fscache_object *object)
+static const struct fscache_state *fscache_object_available(struct fscache_object *object,
+ int event)
{
- _enter("{OBJ%x}", object->debug_id);
+ _enter("{OBJ%x},%d", object->debug_id, event);
- spin_lock(&object->lock);
+ object->oob_table = fscache_osm_run_oob;
- if (object->cookie &&
- test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
- wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
+ spin_lock(&object->lock);
fscache_done_parent_op(object);
if (object->n_in_progress == 0) {
@@ -631,130 +568,158 @@ static void fscache_object_available(struct fscache_object *object)
fscache_stat(&fscache_n_cop_lookup_complete);
object->cache->ops->lookup_complete(object);
fscache_stat_d(&fscache_n_cop_lookup_complete);
- fscache_enqueue_dependents(object);
fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
fscache_stat(&fscache_n_object_avail);
_leave("");
+ return transit_to(JUMPSTART_DEPS);
}
/*
- * drop an object's attachments
+ * Wake up this object's dependent objects now that we've become available.
*/
-static void fscache_drop_object(struct fscache_object *object)
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object,
+ int event)
{
- struct fscache_object *parent = object->parent;
- struct fscache_cache *cache = object->cache;
+ _enter("{OBJ%x},%d", object->debug_id, event);
- _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
+ if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY))
+ return NO_TRANSIT; /* Not finished; requeue */
+ return transit_to(WAIT_FOR_CMD);
+}
- ASSERTCMP(object->cookie, ==, NULL);
- ASSERT(hlist_unhashed(&object->cookie_link));
+/*
+ * Handle lookup or creation failure.
+ */
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object,
+ int event)
+{
+ struct fscache_cookie *cookie;
- spin_lock(&cache->object_list_lock);
- list_del_init(&object->cache_link);
- spin_unlock(&cache->object_list_lock);
+ _enter("{OBJ%x},%d", object->debug_id, event);
- fscache_stat(&fscache_n_cop_drop_object);
- cache->ops->drop_object(object);
- fscache_stat_d(&fscache_n_cop_drop_object);
+ object->oob_event_mask = 0;
- if (parent) {
- _debug("release parent OBJ%x {%d}",
- parent->debug_id, parent->n_children);
+ fscache_stat(&fscache_n_cop_lookup_complete);
+ object->cache->ops->lookup_complete(object);
+ fscache_stat_d(&fscache_n_cop_lookup_complete);
- spin_lock(&parent->lock);
- parent->n_children--;
- if (parent->n_children == 0)
- fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
- spin_unlock(&parent->lock);
- object->parent = NULL;
+ cookie = object->cookie;
+ set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+ if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+
+ fscache_done_parent_op(object);
+ return transit_to(KILL_OBJECT);
+}
+
+/*
+ * Wait for completion of all active operations on this object and the death of
+ * all child objects of this object.
+ */
+static const struct fscache_state *fscache_kill_object(struct fscache_object *object,
+ int event)
+{
+ _enter("{OBJ%x,%d,%d},%d",
+ object->debug_id, object->n_ops, object->n_children, event);
+
+ clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+ object->oob_event_mask = 0;
+
+ if (list_empty(&object->dependents) &&
+ object->n_ops == 0 &&
+ object->n_children == 0)
+ return transit_to(DROP_OBJECT);
+
+ if (object->n_in_progress == 0) {
+ spin_lock(&object->lock);
+ if (object->n_ops > 0 && object->n_in_progress == 0)
+ fscache_start_operations(object);
+ spin_unlock(&object->lock);
}
- /* this just shifts the object release to the work processor */
- fscache_put_object(object);
+ if (!list_empty(&object->dependents))
+ return transit_to(KILL_DEPENDENTS);
- _leave("");
+ return transit_to(WAIT_FOR_CLEARANCE);
}
/*
- * release or recycle an object that the netfs has discarded
+ * Kill dependent objects.
*/
-static void fscache_release_object(struct fscache_object *object)
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object,
+ int event)
{
- _enter("");
+ _enter("{OBJ%x},%d", object->debug_id, event);
- fscache_drop_object(object);
+ if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL))
+ return NO_TRANSIT; /* Not finished */
+ return transit_to(WAIT_FOR_CLEARANCE);
}
/*
- * withdraw an object from active service
+ * Drop an object's attachments
*/
-static void fscache_withdraw_object(struct fscache_object *object)
+static const struct fscache_state *fscache_drop_object(struct fscache_object *object,
+ int event)
{
- struct fscache_cookie *cookie;
- bool detached;
+ struct fscache_object *parent = object->parent;
+ struct fscache_cookie *cookie = object->cookie;
+ struct fscache_cache *cache = object->cache;
+ bool awaken = false;
- _enter("");
+ _enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event);
- spin_lock(&object->lock);
- cookie = object->cookie;
- if (cookie) {
- /* need to get the cookie lock before the object lock, starting
- * from the object pointer */
- atomic_inc(&cookie->usage);
- spin_unlock(&object->lock);
+ ASSERT(cookie != NULL);
+ ASSERT(!hlist_unhashed(&object->cookie_link));
- detached = false;
- spin_lock(&cookie->lock);
- spin_lock(&object->lock);
+ /* Make sure the cookie no longer points here and that the netfs isn't
+ * waiting for us.
+ */
+ spin_lock(&cookie->lock);
+ hlist_del_init(&object->cookie_link);
+ if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+ awaken = true;
+ spin_unlock(&cookie->lock);
- if (object->cookie == cookie) {
- hlist_del_init(&object->cookie_link);
- object->cookie = NULL;
- fscache_invalidation_complete(cookie);
- detached = true;
- }
- spin_unlock(&cookie->lock);
- fscache_cookie_put(cookie);
- if (detached)
- fscache_cookie_put(cookie);
- }
+ if (awaken)
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+ /* Prevent a race with our last child, which has to signal EV_CLEARED
+ * before dropping our spinlock.
+ */
+ spin_lock(&object->lock);
spin_unlock(&object->lock);
- fscache_drop_object(object);
-}
+ /* Discard from the cache's collection of objects */
+ spin_lock(&cache->object_list_lock);
+ list_del_init(&object->cache_link);
+ spin_unlock(&cache->object_list_lock);
-/*
- * withdraw an object from active service at the behest of the cache
- * - need break the links to a cached object cookie
- * - called under two situations:
- * (1) recycler decides to reclaim an in-use object
- * (2) a cache is unmounted
- * - have to take care as the cookie can be being relinquished by the netfs
- * simultaneously
- * - the object is pinned by the caller holding a refcount on it
- */
-void fscache_withdrawing_object(struct fscache_cache *cache,
- struct fscache_object *object)
-{
- bool enqueue = false;
+ fscache_stat(&fscache_n_cop_drop_object);
+ cache->ops->drop_object(object);
+ fscache_stat_d(&fscache_n_cop_drop_object);
- _enter(",OBJ%x", object->debug_id);
+ /* The parent object wants to know when all its dependents have gone */
+ if (parent) {
+ _debug("release parent OBJ%x {%d}",
+ parent->debug_id, parent->n_children);
- spin_lock(&object->lock);
- if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
- object->state = FSCACHE_OBJECT_WITHDRAWING;
- enqueue = true;
+ spin_lock(&parent->lock);
+ parent->n_children--;
+ if (parent->n_children == 0)
+ fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+ spin_unlock(&parent->lock);
+ object->parent = NULL;
}
- spin_unlock(&object->lock);
- if (enqueue)
- fscache_enqueue_object(object);
+ /* this just shifts the object release to the work processor */
+ fscache_put_object(object);
+ fscache_stat(&fscache_n_object_dead);
_leave("");
+ return transit_to(OBJECT_DEAD);
}
/*
@@ -771,7 +736,7 @@ static int fscache_get_object(struct fscache_object *object)
}
/*
- * discard a ref on a work item
+ * Discard a ref on an object
*/
static void fscache_put_object(struct fscache_object *object)
{
@@ -780,6 +745,22 @@ static void fscache_put_object(struct fscache_object *object)
fscache_stat_d(&fscache_n_cop_put_object);
}
+/**
+ * fscache_object_destroy - Note that a cache object is about to be destroyed
+ * @object: The object to be destroyed
+ *
+ * Note the imminent destruction and deallocation of a cache object record.
+ */
+void fscache_object_destroy(struct fscache_object *object)
+{
+ fscache_objlist_remove(object);
+
+ /* We can get rid of the cookie now */
+ fscache_cookie_put(object->cookie);
+ object->cookie = NULL;
+}
+EXPORT_SYMBOL(fscache_object_destroy);
+
/*
* enqueue an object for metadata-type processing
*/
@@ -803,7 +784,7 @@ void fscache_enqueue_object(struct fscache_object *object)
/**
* fscache_object_sleep_till_congested - Sleep until object wq is congested
- * @timoutp: Scheduler sleep timeout
+ * @timeoutp: Scheduler sleep timeout
*
* Allow an object handler to sleep until the object workqueue is congested.
*
@@ -831,18 +812,21 @@ bool fscache_object_sleep_till_congested(signed long *timeoutp)
EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
/*
- * enqueue the dependents of an object for metadata-type processing
- * - the caller must hold the object's lock
- * - this may cause an already locked object to wind up being processed again
+ * Enqueue the dependents of an object for metadata-type processing.
+ *
+ * If we don't manage to finish the list before the scheduler wants to run
+ * again then return false immediately. We return true if the list was
+ * cleared.
*/
-static void fscache_enqueue_dependents(struct fscache_object *object)
+static bool fscache_enqueue_dependents(struct fscache_object *object, int event)
{
struct fscache_object *dep;
+ bool ret = true;
_enter("{OBJ%x}", object->debug_id);
if (list_empty(&object->dependents))
- return;
+ return true;
spin_lock(&object->lock);
@@ -851,23 +835,23 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
struct fscache_object, dep_link);
list_del_init(&dep->dep_link);
-
- /* sort onto appropriate lists */
- fscache_enqueue_object(dep);
+ fscache_raise_event(dep, event);
fscache_put_object(dep);
- if (!list_empty(&object->dependents))
- cond_resched_lock(&object->lock);
+ if (!list_empty(&object->dependents) && need_resched()) {
+ ret = false;
+ break;
+ }
}
spin_unlock(&object->lock);
+ return ret;
}
/*
* remove an object from whatever queue it's waiting on
- * - the caller must hold object->lock
*/
-void fscache_dequeue_object(struct fscache_object *object)
+static void fscache_dequeue_object(struct fscache_object *object)
{
_enter("{OBJ%x}", object->debug_id);
@@ -886,7 +870,10 @@ void fscache_dequeue_object(struct fscache_object *object)
* @data: The auxiliary data for the object
* @datalen: The size of the auxiliary data
*
- * This function consults the netfs about the coherency state of an object
+ * This function consults the netfs about the coherency state of an object.
+ * The caller must be holding a ref on cookie->n_active (held by
+ * fscache_look_up_object() on behalf of the cache backend during object lookup
+ * and creation).
*/
enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
const void *data, uint16_t datalen)
@@ -927,12 +914,23 @@ EXPORT_SYMBOL(fscache_check_aux);
/*
* Asynchronously invalidate an object.
*/
-static void fscache_invalidate_object(struct fscache_object *object)
+static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object,
+ int event)
{
struct fscache_operation *op;
struct fscache_cookie *cookie = object->cookie;
- _enter("{OBJ%x}", object->debug_id);
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ /* We're going to need the cookie. If the cookie is not available then
+ * retire the object instead.
+ */
+ if (!fscache_use_cookie(object)) {
+ ASSERT(object->cookie->stores.rnode == NULL);
+ set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+ _leave(" [no cookie]");
+ return transit_to(KILL_OBJECT);
+ }
/* Reject any new read/write ops and abort any that are pending. */
fscache_invalidate_writes(cookie);
@@ -941,14 +939,13 @@ static void fscache_invalidate_object(struct fscache_object *object)
/* Now we have to wait for in-progress reads and writes */
op = kzalloc(sizeof(*op), GFP_KERNEL);
- if (!op) {
- fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
- _leave(" [ENOMEM]");
- return;
- }
+ if (!op)
+ goto nomem;
fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
- op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
+ op->flags = FSCACHE_OP_ASYNC |
+ (1 << FSCACHE_OP_EXCLUSIVE) |
+ (1 << FSCACHE_OP_UNUSE_COOKIE);
spin_lock(&cookie->lock);
if (fscache_submit_exclusive_op(object, op) < 0)
@@ -965,13 +962,50 @@ static void fscache_invalidate_object(struct fscache_object *object)
/* We can allow read and write requests to come in once again. They'll
* queue up behind our exclusive invalidation operation.
*/
- fscache_invalidation_complete(cookie);
- _leave("");
- return;
+ if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+ _leave(" [ok]");
+ return transit_to(UPDATE_OBJECT);
+
+nomem:
+ clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+ fscache_unuse_cookie(object);
+ _leave(" [ENOMEM]");
+ return transit_to(KILL_OBJECT);
submit_op_failed:
+ clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
spin_unlock(&cookie->lock);
kfree(op);
- fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
_leave(" [EIO]");
+ return transit_to(KILL_OBJECT);
+}
+
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object,
+ int event)
+{
+ const struct fscache_state *s;
+
+ fscache_stat(&fscache_n_invalidates_run);
+ fscache_stat(&fscache_n_cop_invalidate_object);
+ s = _fscache_invalidate_object(object, event);
+ fscache_stat_d(&fscache_n_cop_invalidate_object);
+ return s;
+}
+
+/*
+ * Asynchronously update an object.
+ */
+static const struct fscache_state *fscache_update_object(struct fscache_object *object,
+ int event)
+{
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ fscache_stat(&fscache_n_updates_run);
+ fscache_stat(&fscache_n_cop_update_object);
+ object->cache->ops->update_object(object);
+ fscache_stat_d(&fscache_n_cop_update_object);
+
+ _leave("");
+ return transit_to(WAIT_FOR_CMD);
}
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 762a9ec4ffa4..318071aca217 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -35,7 +35,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
ASSERT(list_empty(&op->pend_link));
ASSERT(op->processor != NULL);
- ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
+ ASSERT(fscache_object_is_available(op->object));
ASSERTCMP(atomic_read(&op->usage), >, 0);
ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
@@ -119,7 +119,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
/* need to issue a new write op after this */
clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
ret = 0;
- } else if (object->state == FSCACHE_OBJECT_CREATING) {
+ } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
op->object = object;
object->n_ops++;
object->n_exclusive++; /* reads and writes must wait */
@@ -144,7 +144,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
*/
static void fscache_report_unexpected_submission(struct fscache_object *object,
struct fscache_operation *op,
- unsigned long ostate)
+ const struct fscache_state *ostate)
{
static bool once_only;
struct fscache_operation *p;
@@ -155,11 +155,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
once_only = true;
kdebug("unexpected submission OP%x [OBJ%x %s]",
- op->debug_id, object->debug_id,
- fscache_object_states[object->state]);
- kdebug("objstate=%s [%s]",
- fscache_object_states[object->state],
- fscache_object_states[ostate]);
+ op->debug_id, object->debug_id, object->state->name);
+ kdebug("objstate=%s [%s]", object->state->name, ostate->name);
kdebug("objflags=%lx", object->flags);
kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
kdebug("ops=%u inp=%u exc=%u",
@@ -190,7 +187,7 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
int fscache_submit_op(struct fscache_object *object,
struct fscache_operation *op)
{
- unsigned long ostate;
+ const struct fscache_state *ostate;
int ret;
_enter("{OBJ%x OP%x},{%u}",
@@ -226,16 +223,14 @@ int fscache_submit_op(struct fscache_object *object,
fscache_run_op(object, op);
}
ret = 0;
- } else if (object->state == FSCACHE_OBJECT_CREATING) {
+ } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
op->object = object;
object->n_ops++;
atomic_inc(&op->usage);
list_add_tail(&op->pend_link, &object->pending_ops);
fscache_stat(&fscache_n_op_pend);
ret = 0;
- } else if (object->state == FSCACHE_OBJECT_DYING ||
- object->state == FSCACHE_OBJECT_LC_DYING ||
- object->state == FSCACHE_OBJECT_WITHDRAWING) {
+ } else if (fscache_object_is_dying(object)) {
fscache_stat(&fscache_n_op_rejected);
op->state = FSCACHE_OP_ST_CANCELLED;
ret = -ENOBUFS;
@@ -265,8 +260,8 @@ void fscache_abort_object(struct fscache_object *object)
}
/*
- * jump start the operation processing on an object
- * - caller must hold object->lock
+ * Jump start the operation processing on an object. The caller must hold
+ * object->lock.
*/
void fscache_start_operations(struct fscache_object *object)
{
@@ -428,14 +423,10 @@ void fscache_put_operation(struct fscache_operation *op)
object = op->object;
- if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) {
- if (atomic_dec_and_test(&object->n_reads)) {
- clear_bit(FSCACHE_COOKIE_WAITING_ON_READS,
- &object->cookie->flags);
- wake_up_bit(&object->cookie->flags,
- FSCACHE_COOKIE_WAITING_ON_READS);
- }
- }
+ if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+ atomic_dec(&object->n_reads);
+ if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
+ fscache_unuse_cookie(object);
/* now... we may get called with the object spinlock held, so we
* complete the cleanup here only if we can immediately acquire the
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index ff000e52072d..d479ab3c63e4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -109,7 +109,7 @@ page_busy:
* allocator as the work threads writing to the cache may all end up
* sleeping on memory allocation, so we may need to impose a timeout
* too. */
- if (!(gfp & __GFP_WAIT)) {
+ if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
fscache_stat(&fscache_n_store_vmscan_busy);
return false;
}
@@ -163,10 +163,12 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
fscache_stat(&fscache_n_attr_changed_calls);
- if (fscache_object_is_active(object)) {
+ if (fscache_object_is_active(object) &&
+ fscache_use_cookie(object)) {
fscache_stat(&fscache_n_cop_attr_changed);
ret = object->cache->ops->attr_changed(object);
fscache_stat_d(&fscache_n_cop_attr_changed);
+ fscache_unuse_cookie(object);
if (ret < 0)
fscache_abort_object(object);
}
@@ -233,7 +235,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
_enter("{OP%x}", op->op.debug_id);
- ASSERTCMP(op->n_pages, ==, 0);
+ ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
fscache_hist(fscache_retrieval_histogram, op->start_time);
if (op->context)
@@ -246,6 +248,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
* allocate a retrieval op
*/
static struct fscache_retrieval *fscache_alloc_retrieval(
+ struct fscache_cookie *cookie,
struct address_space *mapping,
fscache_rw_complete_t end_io_func,
void *context)
@@ -260,7 +263,10 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
}
fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
- op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
+ atomic_inc(&cookie->n_active);
+ op->op.flags = FSCACHE_OP_MYTHREAD |
+ (1UL << FSCACHE_OP_WAITING) |
+ (1UL << FSCACHE_OP_UNUSE_COOKIE);
op->mapping = mapping;
op->end_io_func = end_io_func;
op->context = context;
@@ -310,7 +316,7 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
struct fscache_retrieval *op =
container_of(_op, struct fscache_retrieval, op);
- op->n_pages = 0;
+ atomic_set(&op->n_pages, 0);
}
/*
@@ -394,12 +400,13 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
- op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
+ op = fscache_alloc_retrieval(cookie, page->mapping,
+ end_io_func,context);
if (!op) {
_leave(" = -ENOMEM");
return -ENOMEM;
}
- op->n_pages = 1;
+ atomic_set(&op->n_pages, 1);
spin_lock(&cookie->lock);
@@ -408,7 +415,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object, cookie_link);
- ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
+ ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
atomic_inc(&object->n_reads);
__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
@@ -465,6 +472,7 @@ nobufs_unlock_dec:
atomic_dec(&object->n_reads);
nobufs_unlock:
spin_unlock(&cookie->lock);
+ atomic_dec(&cookie->n_active);
kfree(op);
nobufs:
fscache_stat(&fscache_n_retrievals_nobufs);
@@ -522,10 +530,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
- op = fscache_alloc_retrieval(mapping, end_io_func, context);
+ op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context);
if (!op)
return -ENOMEM;
- op->n_pages = *nr_pages;
+ atomic_set(&op->n_pages, *nr_pages);
spin_lock(&cookie->lock);
@@ -589,6 +597,7 @@ nobufs_unlock_dec:
atomic_dec(&object->n_reads);
nobufs_unlock:
spin_unlock(&cookie->lock);
+ atomic_dec(&cookie->n_active);
kfree(op);
nobufs:
fscache_stat(&fscache_n_retrievals_nobufs);
@@ -631,10 +640,10 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
- op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
+ op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL);
if (!op)
return -ENOMEM;
- op->n_pages = 1;
+ atomic_set(&op->n_pages, 1);
spin_lock(&cookie->lock);
@@ -675,6 +684,7 @@ error:
nobufs_unlock:
spin_unlock(&cookie->lock);
+ atomic_dec(&cookie->n_active);
kfree(op);
nobufs:
fscache_stat(&fscache_n_allocs_nobufs);
@@ -729,8 +739,9 @@ static void fscache_write_op(struct fscache_operation *_op)
*/
spin_unlock(&object->lock);
fscache_op_complete(&op->op, false);
- _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}",
- _op->flags, _op->state, object->state, object->flags);
+ _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}",
+ _op->flags, _op->state, object->state->short_name,
+ object->flags);
return;
}
@@ -796,11 +807,16 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
_enter("");
- while (spin_lock(&cookie->stores_lock),
- n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
- ARRAY_SIZE(results),
- FSCACHE_COOKIE_PENDING_TAG),
- n > 0) {
+ for (;;) {
+ spin_lock(&cookie->stores_lock);
+ n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
+ ARRAY_SIZE(results),
+ FSCACHE_COOKIE_PENDING_TAG);
+ if (n == 0) {
+ spin_unlock(&cookie->stores_lock);
+ break;
+ }
+
for (i = n - 1; i >= 0; i--) {
page = results[i];
radix_tree_delete(&cookie->stores, page->index);
@@ -812,7 +828,6 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
page_cache_release(results[i]);
}
- spin_unlock(&cookie->stores_lock);
_leave("");
}
@@ -829,14 +844,12 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
* (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
* set)
*
- * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
- * fill op)
+ * (a) no writes yet
*
* (b) writes deferred till post-creation (mark page for writing and
* return immediately)
*
* (2) negative lookup, object created, initial fill being made from netfs
- * (FSCACHE_COOKIE_INITIAL_FILL is set)
*
* (a) fill point not yet reached this page (mark page for writing and
* return)
@@ -873,7 +886,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
fscache_operation_init(&op->op, fscache_write_op,
fscache_release_write_op);
- op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
+ op->op.flags = FSCACHE_OP_ASYNC |
+ (1 << FSCACHE_OP_WAITING) |
+ (1 << FSCACHE_OP_UNUSE_COOKIE);
ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
if (ret < 0)
@@ -919,6 +934,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
op->store_limit = object->store_limit;
+ atomic_inc(&cookie->n_active);
if (fscache_submit_op(object, &op->op) < 0)
goto submit_failed;
@@ -945,6 +961,7 @@ already_pending:
return 0;
submit_failed:
+ atomic_dec(&cookie->n_active);
spin_lock(&cookie->stores_lock);
radix_tree_delete(&cookie->stores, page->index);
spin_unlock(&cookie->stores_lock);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 8179e8bc4a3d..40d13c70ef51 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -287,5 +287,5 @@ const struct file_operations fscache_stats_fops = {
.open = fscache_stats_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = single_release,
};
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 6f96a8def147..aef34b1e635e 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -38,6 +38,7 @@
#include <linux/device.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/aio.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/list.h>
@@ -92,8 +93,9 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
{
loff_t pos = 0;
struct iovec iov = { .iov_base = buf, .iov_len = count };
+ struct fuse_io_priv io = { .async = 0, .file = file };
- return fuse_direct_io(file, &iov, 1, count, &pos, 0);
+ return fuse_direct_io(&io, &iov, 1, count, &pos, 0);
}
static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -101,12 +103,13 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
{
loff_t pos = 0;
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
+ struct fuse_io_priv io = { .async = 0, .file = file };
/*
* No locking or generic_write_checks(), the server is
* responsible for locking and sanity checks.
*/
- return fuse_direct_io(file, &iov, 1, count, &pos, 1);
+ return fuse_direct_io(&io, &iov, 1, count, &pos, 1);
}
static int cuse_open(struct inode *inode, struct file *file)
@@ -422,7 +425,7 @@ static int cuse_send_init(struct cuse_conn *cc)
BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
- req = fuse_get_req(fc, 1);
+ req = fuse_get_req_for_background(fc, 1);
if (IS_ERR(req)) {
rc = PTR_ERR(req);
goto err;
@@ -504,7 +507,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
cc->fc.release = cuse_fc_release;
cc->fc.connected = 1;
- cc->fc.blocked = 0;
+ cc->fc.initialized = 1;
rc = cuse_send_init(cc);
if (rc) {
fuse_conn_put(&cc->fc);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 11dfa0c3fb46..1d55f9465400 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -19,6 +19,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/swap.h>
#include <linux/splice.h>
+#include <linux/aio.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
@@ -111,7 +112,7 @@ static void restore_sigs(sigset_t *oldset)
sigprocmask(SIG_SETMASK, oldset, NULL);
}
-static void __fuse_get_request(struct fuse_req *req)
+void __fuse_get_request(struct fuse_req *req)
{
atomic_inc(&req->count);
}
@@ -130,20 +131,30 @@ static void fuse_req_init_context(struct fuse_req *req)
req->in.h.pid = current->pid;
}
-struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages)
+static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
+{
+ return !fc->initialized || (for_background && fc->blocked);
+}
+
+static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
+ bool for_background)
{
struct fuse_req *req;
- sigset_t oldset;
- int intr;
int err;
-
atomic_inc(&fc->num_waiting);
- block_sigs(&oldset);
- intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked);
- restore_sigs(&oldset);
- err = -EINTR;
- if (intr)
- goto out;
+
+ if (fuse_block_alloc(fc, for_background)) {
+ sigset_t oldset;
+ int intr;
+
+ block_sigs(&oldset);
+ intr = wait_event_interruptible_exclusive(fc->blocked_waitq,
+ !fuse_block_alloc(fc, for_background));
+ restore_sigs(&oldset);
+ err = -EINTR;
+ if (intr)
+ goto out;
+ }
err = -ENOTCONN;
if (!fc->connected)
@@ -151,19 +162,35 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages)
req = fuse_request_alloc(npages);
err = -ENOMEM;
- if (!req)
+ if (!req) {
+ if (for_background)
+ wake_up(&fc->blocked_waitq);
goto out;
+ }
fuse_req_init_context(req);
req->waiting = 1;
+ req->background = for_background;
return req;
out:
atomic_dec(&fc->num_waiting);
return ERR_PTR(err);
}
+
+struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages)
+{
+ return __fuse_get_req(fc, npages, false);
+}
EXPORT_SYMBOL_GPL(fuse_get_req);
+struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
+ unsigned npages)
+{
+ return __fuse_get_req(fc, npages, true);
+}
+EXPORT_SYMBOL_GPL(fuse_get_req_for_background);
+
/*
* Return request in fuse_file->reserved_req. However that may
* currently be in use. If that is the case, wait for it to become
@@ -225,19 +252,31 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
struct fuse_req *req;
atomic_inc(&fc->num_waiting);
- wait_event(fc->blocked_waitq, !fc->blocked);
+ wait_event(fc->blocked_waitq, fc->initialized);
req = fuse_request_alloc(0);
if (!req)
req = get_reserved_req(fc, file);
fuse_req_init_context(req);
req->waiting = 1;
+ req->background = 0;
return req;
}
void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
{
if (atomic_dec_and_test(&req->count)) {
+ if (unlikely(req->background)) {
+ /*
+ * We get here in the unlikely case that a background
+ * request was allocated but not sent
+ */
+ spin_lock(&fc->lock);
+ if (!fc->blocked)
+ wake_up(&fc->blocked_waitq);
+ spin_unlock(&fc->lock);
+ }
+
if (req->waiting)
atomic_dec(&fc->num_waiting);
@@ -335,10 +374,15 @@ __releases(fc->lock)
list_del(&req->intr_entry);
req->state = FUSE_REQ_FINISHED;
if (req->background) {
- if (fc->num_background == fc->max_background) {
+ req->background = 0;
+
+ if (fc->num_background == fc->max_background)
fc->blocked = 0;
- wake_up_all(&fc->blocked_waitq);
- }
+
+ /* Wake up next waiter, if any */
+ if (!fc->blocked && waitqueue_active(&fc->blocked_waitq))
+ wake_up(&fc->blocked_waitq);
+
if (fc->num_background == fc->congestion_threshold &&
fc->connected && fc->bdi_initialized) {
clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
@@ -442,6 +486,7 @@ __acquires(fc->lock)
static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
{
+ BUG_ON(req->background);
spin_lock(&fc->lock);
if (!fc->connected)
req->out.h.error = -ENOTCONN;
@@ -469,7 +514,7 @@ EXPORT_SYMBOL_GPL(fuse_request_send);
static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
struct fuse_req *req)
{
- req->background = 1;
+ BUG_ON(!req->background);
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
@@ -1319,7 +1364,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
page_nr++;
ret += buf->len;
- if (pipe->inode)
+ if (pipe->files)
do_wakeup = 1;
}
@@ -2071,6 +2116,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
if (fc->connected) {
fc->connected = 0;
fc->blocked = 0;
+ fc->initialized = 1;
end_io_requests(fc);
end_queued_requests(fc);
end_polls(fc);
@@ -2089,6 +2135,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
spin_lock(&fc->lock);
fc->connected = 0;
fc->blocked = 0;
+ fc->initialized = 1;
end_queued_requests(fc);
end_polls(fc);
wake_up_all(&fc->blocked_waitq);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index ff15522481d4..0eda52738ec4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -14,7 +14,7 @@
#include <linux/namei.h>
#include <linux/slab.h>
-static bool fuse_use_readdirplus(struct inode *dir, struct file *filp)
+static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
{
struct fuse_conn *fc = get_fuse_conn(dir);
struct fuse_inode *fi = get_fuse_inode(dir);
@@ -25,7 +25,7 @@ static bool fuse_use_readdirplus(struct inode *dir, struct file *filp)
return true;
if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
return true;
- if (filp->f_pos == 0)
+ if (ctx->pos == 0)
return true;
return false;
}
@@ -180,6 +180,8 @@ u64 fuse_get_attr_version(struct fuse_conn *fc)
static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
{
struct inode *inode;
+ struct dentry *parent;
+ struct fuse_conn *fc;
inode = ACCESS_ONCE(entry->d_inode);
if (inode && is_bad_inode(inode))
@@ -187,10 +189,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
else if (fuse_dentry_time(entry) < get_jiffies_64()) {
int err;
struct fuse_entry_out outarg;
- struct fuse_conn *fc;
struct fuse_req *req;
struct fuse_forget_link *forget;
- struct dentry *parent;
u64 attr_version;
/* For negative dentries, always do a fresh lookup */
@@ -241,8 +241,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
entry_attr_timeout(&outarg),
attr_version);
fuse_change_entry_timeout(entry, &outarg);
+ } else if (inode) {
+ fc = get_fuse_conn(inode);
+ if (fc->readdirplus_auto) {
+ parent = dget_parent(entry);
+ fuse_advise_use_readdirplus(parent->d_inode);
+ dput(parent);
+ }
}
- fuse_advise_use_readdirplus(inode);
return 1;
}
@@ -1159,25 +1165,23 @@ static int fuse_permission(struct inode *inode, int mask)
}
static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
- void *dstbuf, filldir_t filldir)
+ struct dir_context *ctx)
{
while (nbytes >= FUSE_NAME_OFFSET) {
struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
size_t reclen = FUSE_DIRENT_SIZE(dirent);
- int over;
if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
return -EIO;
if (reclen > nbytes)
break;
- over = filldir(dstbuf, dirent->name, dirent->namelen,
- file->f_pos, dirent->ino, dirent->type);
- if (over)
+ if (!dir_emit(ctx, dirent->name, dirent->namelen,
+ dirent->ino, dirent->type))
break;
buf += reclen;
nbytes -= reclen;
- file->f_pos = dirent->off;
+ ctx->pos = dirent->off;
}
return 0;
@@ -1278,7 +1282,7 @@ out:
}
static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
- void *dstbuf, filldir_t filldir, u64 attr_version)
+ struct dir_context *ctx, u64 attr_version)
{
struct fuse_direntplus *direntplus;
struct fuse_dirent *dirent;
@@ -1303,10 +1307,9 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
we need to send a FORGET for each of those
which we did not link.
*/
- over = filldir(dstbuf, dirent->name, dirent->namelen,
- file->f_pos, dirent->ino,
- dirent->type);
- file->f_pos = dirent->off;
+ over = !dir_emit(ctx, dirent->name, dirent->namelen,
+ dirent->ino, dirent->type);
+ ctx->pos = dirent->off;
}
buf += reclen;
@@ -1320,7 +1323,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
return 0;
}
-static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
+static int fuse_readdir(struct file *file, struct dir_context *ctx)
{
int plus, err;
size_t nbytes;
@@ -1343,17 +1346,17 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
return -ENOMEM;
}
- plus = fuse_use_readdirplus(inode, file);
+ plus = fuse_use_readdirplus(inode, ctx);
req->out.argpages = 1;
req->num_pages = 1;
req->pages[0] = page;
req->page_descs[0].length = PAGE_SIZE;
if (plus) {
attr_version = fuse_get_attr_version(fc);
- fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
+ fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
FUSE_READDIRPLUS);
} else {
- fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
+ fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
FUSE_READDIR);
}
fuse_request_send(fc, req);
@@ -1363,11 +1366,11 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
if (!err) {
if (plus) {
err = parse_dirplusfile(page_address(page), nbytes,
- file, dstbuf, filldir,
+ file, ctx,
attr_version);
} else {
err = parse_dirfile(page_address(page), nbytes, file,
- dstbuf, filldir);
+ ctx);
}
}
@@ -1562,10 +1565,9 @@ void fuse_release_nowrite(struct inode *inode)
* vmtruncate() doesn't allow for this case, so do the rlimit checking
* and the actual truncation by hand.
*/
-static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
- struct file *file)
+int fuse_do_setattr(struct inode *inode, struct iattr *attr,
+ struct file *file)
{
- struct inode *inode = entry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
struct fuse_setattr_in inarg;
@@ -1574,9 +1576,6 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
loff_t oldsize;
int err;
- if (!fuse_allow_current_process(fc))
- return -EACCES;
-
if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
attr->ia_valid |= ATTR_FORCE;
@@ -1671,10 +1670,15 @@ error:
static int fuse_setattr(struct dentry *entry, struct iattr *attr)
{
+ struct inode *inode = entry->d_inode;
+
+ if (!fuse_allow_current_process(get_fuse_conn(inode)))
+ return -EACCES;
+
if (attr->ia_valid & ATTR_FILE)
- return fuse_do_setattr(entry, attr, attr->ia_file);
+ return fuse_do_setattr(inode, attr, attr->ia_file);
else
- return fuse_do_setattr(entry, attr, NULL);
+ return fuse_do_setattr(inode, attr, NULL);
}
static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
@@ -1879,7 +1883,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
static const struct file_operations fuse_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = fuse_readdir,
+ .iterate = fuse_readdir,
.open = fuse_dir_open,
.release = fuse_dir_release,
.fsync = fuse_dir_fsync,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 34b80ba95bad..5c121fe19c5f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -15,6 +15,8 @@
#include <linux/module.h>
#include <linux/compat.h>
#include <linux/swap.h>
+#include <linux/aio.h>
+#include <linux/falloc.h>
static const struct file_operations fuse_direct_io_file_operations;
@@ -126,11 +128,13 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
struct fuse_req *req = ff->reserved_req;
if (sync) {
+ req->background = 0;
fuse_request_send(ff->fc, req);
path_put(&req->misc.release.path);
fuse_put_request(ff->fc, req);
} else {
req->end = fuse_release_end;
+ req->background = 1;
fuse_request_send_background(ff->fc, req);
}
kfree(ff);
@@ -282,6 +286,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags)
WARN_ON(atomic_read(&ff->count) > 1);
fuse_prepare_release(ff, flags, FUSE_RELEASE);
ff->reserved_req->force = 1;
+ ff->reserved_req->background = 0;
fuse_request_send(ff->fc, ff->reserved_req);
fuse_put_request(ff->fc, ff->reserved_req);
kfree(ff);
@@ -491,9 +496,114 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
req->out.args[0].size = count;
}
-static size_t fuse_send_read(struct fuse_req *req, struct file *file,
+static void fuse_release_user_pages(struct fuse_req *req, int write)
+{
+ unsigned i;
+
+ for (i = 0; i < req->num_pages; i++) {
+ struct page *page = req->pages[i];
+ if (write)
+ set_page_dirty_lock(page);
+ put_page(page);
+ }
+}
+
+/**
+ * In case of short read, the caller sets 'pos' to the position of
+ * actual end of fuse request in IO request. Otherwise, if bytes_requested
+ * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
+ *
+ * An example:
+ * User requested DIO read of 64K. It was split into two 32K fuse requests,
+ * both submitted asynchronously. The first of them was ACKed by userspace as
+ * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
+ * second request was ACKed as short, e.g. only 1K was read, resulting in
+ * pos == 33K.
+ *
+ * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
+ * will be equal to the length of the longest contiguous fragment of
+ * transferred data starting from the beginning of IO request.
+ */
+static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
+{
+ int left;
+
+ spin_lock(&io->lock);
+ if (err)
+ io->err = io->err ? : err;
+ else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
+ io->bytes = pos;
+
+ left = --io->reqs;
+ spin_unlock(&io->lock);
+
+ if (!left) {
+ long res;
+
+ if (io->err)
+ res = io->err;
+ else if (io->bytes >= 0 && io->write)
+ res = -EIO;
+ else {
+ res = io->bytes < 0 ? io->size : io->bytes;
+
+ if (!is_sync_kiocb(io->iocb)) {
+ struct inode *inode = file_inode(io->iocb->ki_filp);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fc->lock);
+ fi->attr_version = ++fc->attr_version;
+ spin_unlock(&fc->lock);
+ }
+ }
+
+ aio_complete(io->iocb, res, 0);
+ kfree(io);
+ }
+}
+
+static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
+{
+ struct fuse_io_priv *io = req->io;
+ ssize_t pos = -1;
+
+ fuse_release_user_pages(req, !io->write);
+
+ if (io->write) {
+ if (req->misc.write.in.size != req->misc.write.out.size)
+ pos = req->misc.write.in.offset - io->offset +
+ req->misc.write.out.size;
+ } else {
+ if (req->misc.read.in.size != req->out.args[0].size)
+ pos = req->misc.read.in.offset - io->offset +
+ req->out.args[0].size;
+ }
+
+ fuse_aio_complete(io, req->out.h.error, pos);
+}
+
+static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
+ size_t num_bytes, struct fuse_io_priv *io)
+{
+ spin_lock(&io->lock);
+ io->size += num_bytes;
+ io->reqs++;
+ spin_unlock(&io->lock);
+
+ req->io = io;
+ req->end = fuse_aio_complete_req;
+
+ __fuse_get_request(req);
+ fuse_request_send_background(fc, req);
+
+ return num_bytes;
+}
+
+static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
loff_t pos, size_t count, fl_owner_t owner)
{
+ struct file *file = io->file;
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
@@ -504,6 +614,10 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,
inarg->read_flags |= FUSE_READ_LOCKOWNER;
inarg->lock_owner = fuse_lock_owner_id(fc, owner);
}
+
+ if (io->async)
+ return fuse_async_req_send(fc, req, count, io);
+
fuse_request_send(fc, req);
return req->out.args[0].size;
}
@@ -524,6 +638,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
static int fuse_readpage(struct file *file, struct page *page)
{
+ struct fuse_io_priv io = { .async = 0, .file = file };
struct inode *inode = page->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
@@ -556,7 +671,7 @@ static int fuse_readpage(struct file *file, struct page *page)
req->num_pages = 1;
req->pages[0] = page;
req->page_descs[0].length = count;
- num_read = fuse_send_read(req, file, pos, count, NULL);
+ num_read = fuse_send_read(req, &io, pos, count, NULL);
err = req->out.h.error;
fuse_put_request(fc, req);
@@ -661,7 +776,12 @@ static int fuse_readpages_fill(void *_data, struct page *page)
int nr_alloc = min_t(unsigned, data->nr_pages,
FUSE_MAX_PAGES_PER_REQ);
fuse_send_readpages(req, data->file);
- data->req = req = fuse_get_req(fc, nr_alloc);
+ if (fc->async_read)
+ req = fuse_get_req_for_background(fc, nr_alloc);
+ else
+ req = fuse_get_req(fc, nr_alloc);
+
+ data->req = req;
if (IS_ERR(req)) {
unlock_page(page);
return PTR_ERR(req);
@@ -696,7 +816,10 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
data.file = file;
data.inode = inode;
- data.req = fuse_get_req(fc, nr_alloc);
+ if (fc->async_read)
+ data.req = fuse_get_req_for_background(fc, nr_alloc);
+ else
+ data.req = fuse_get_req(fc, nr_alloc);
data.nr_pages = nr_pages;
err = PTR_ERR(data.req);
if (IS_ERR(data.req))
@@ -758,9 +881,10 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
req->out.args[0].value = outarg;
}
-static size_t fuse_send_write(struct fuse_req *req, struct file *file,
+static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
loff_t pos, size_t count, fl_owner_t owner)
{
+ struct file *file = io->file;
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
struct fuse_write_in *inarg = &req->misc.write.in;
@@ -771,6 +895,10 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
inarg->lock_owner = fuse_lock_owner_id(fc, owner);
}
+
+ if (io->async)
+ return fuse_async_req_send(fc, req, count, io);
+
fuse_request_send(fc, req);
return req->misc.write.out.size;
}
@@ -794,11 +922,12 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
size_t res;
unsigned offset;
unsigned i;
+ struct fuse_io_priv io = { .async = 0, .file = file };
for (i = 0; i < req->num_pages; i++)
fuse_wait_on_page_writeback(inode, req->pages[i]->index);
- res = fuse_send_write(req, file, pos, count, NULL);
+ res = fuse_send_write(req, &io, pos, count, NULL);
offset = req->page_descs[0].offset;
count = res;
@@ -971,7 +1100,6 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return err;
count = ocount;
- sb_start_write(inode->i_sb);
mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */
@@ -1030,23 +1158,10 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
out:
current->backing_dev_info = NULL;
mutex_unlock(&inode->i_mutex);
- sb_end_write(inode->i_sb);
return written ? written : err;
}
-static void fuse_release_user_pages(struct fuse_req *req, int write)
-{
- unsigned i;
-
- for (i = 0; i < req->num_pages; i++) {
- struct page *page = req->pages[i];
- if (write)
- set_page_dirty_lock(page);
- put_page(page);
- }
-}
-
static inline void fuse_page_descs_length_init(struct fuse_req *req,
unsigned index, unsigned nr_pages)
{
@@ -1148,10 +1263,11 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p)
return min(npages, FUSE_MAX_PAGES_PER_REQ);
}
-ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
+ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
unsigned long nr_segs, size_t count, loff_t *ppos,
int write)
{
+ struct file *file = io->file;
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
size_t nmax = write ? fc->max_write : fc->max_read;
@@ -1162,7 +1278,10 @@ ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
iov_iter_init(&ii, iov, nr_segs, count, 0);
- req = fuse_get_req(fc, fuse_iter_npages(&ii));
+ if (io->async)
+ req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii));
+ else
+ req = fuse_get_req(fc, fuse_iter_npages(&ii));
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1177,11 +1296,12 @@ ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
}
if (write)
- nres = fuse_send_write(req, file, pos, nbytes, owner);
+ nres = fuse_send_write(req, io, pos, nbytes, owner);
else
- nres = fuse_send_read(req, file, pos, nbytes, owner);
+ nres = fuse_send_read(req, io, pos, nbytes, owner);
- fuse_release_user_pages(req, !write);
+ if (!io->async)
+ fuse_release_user_pages(req, !write);
if (req->out.h.error) {
if (!res)
res = req->out.h.error;
@@ -1197,7 +1317,11 @@ ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
break;
if (count) {
fuse_put_request(fc, req);
- req = fuse_get_req(fc, fuse_iter_npages(&ii));
+ if (io->async)
+ req = fuse_get_req_for_background(fc,
+ fuse_iter_npages(&ii));
+ else
+ req = fuse_get_req(fc, fuse_iter_npages(&ii));
if (IS_ERR(req))
break;
}
@@ -1211,17 +1335,19 @@ ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
}
EXPORT_SYMBOL_GPL(fuse_direct_io);
-static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
+static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
+ const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos,
+ size_t count)
{
ssize_t res;
+ struct file *file = io->file;
struct inode *inode = file_inode(file);
if (is_bad_inode(inode))
return -EIO;
- res = fuse_direct_io(file, iov, nr_segs, iov_length(iov, nr_segs),
- ppos, 0);
+ res = fuse_direct_io(io, iov, nr_segs, count, ppos, 0);
fuse_invalidate_attr(inode);
@@ -1231,23 +1357,23 @@ static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov,
static ssize_t fuse_direct_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ struct fuse_io_priv io = { .async = 0, .file = file };
struct iovec iov = { .iov_base = buf, .iov_len = count };
- return __fuse_direct_read(file, &iov, 1, ppos);
+ return __fuse_direct_read(&io, &iov, 1, ppos, count);
}
-static ssize_t __fuse_direct_write(struct file *file, const struct iovec *iov,
+static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
+ const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
+ struct file *file = io->file;
struct inode *inode = file_inode(file);
size_t count = iov_length(iov, nr_segs);
ssize_t res;
res = generic_write_checks(file, ppos, &count, 0);
- if (!res) {
- res = fuse_direct_io(file, iov, nr_segs, count, ppos, 1);
- if (res > 0)
- fuse_write_update_size(inode, *ppos);
- }
+ if (!res)
+ res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1);
fuse_invalidate_attr(inode);
@@ -1260,13 +1386,16 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
struct inode *inode = file_inode(file);
ssize_t res;
+ struct fuse_io_priv io = { .async = 0, .file = file };
if (is_bad_inode(inode))
return -EIO;
/* Don't allow parallel writes to the same file */
mutex_lock(&inode->i_mutex);
- res = __fuse_direct_write(file, &iov, 1, ppos);
+ res = __fuse_direct_write(&io, &iov, 1, ppos);
+ if (res > 0)
+ fuse_write_update_size(inode, *ppos);
mutex_unlock(&inode->i_mutex);
return res;
@@ -1375,6 +1504,7 @@ static int fuse_writepage_locked(struct page *page)
if (!req)
goto err;
+ req->background = 1; /* writeback always goes to bg_queue */
tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (!tmp_page)
goto err_free;
@@ -2228,21 +2358,99 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc,
return 0;
}
+static void fuse_do_truncate(struct file *file)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct iattr attr;
+
+ attr.ia_valid = ATTR_SIZE;
+ attr.ia_size = i_size_read(inode);
+
+ attr.ia_file = file;
+ attr.ia_valid |= ATTR_FILE;
+
+ fuse_do_setattr(inode, &attr, file);
+}
+
+static inline loff_t fuse_round_up(loff_t off)
+{
+ return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+}
+
static ssize_t
fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
{
ssize_t ret = 0;
- struct file *file = NULL;
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+ bool async_dio = ff->fc->async_dio;
loff_t pos = 0;
+ struct inode *inode;
+ loff_t i_size;
+ size_t count = iov_length(iov, nr_segs);
+ struct fuse_io_priv *io;
- file = iocb->ki_filp;
pos = offset;
+ inode = file->f_mapping->host;
+ i_size = i_size_read(inode);
+
+ /* optimization for short read */
+ if (async_dio && rw != WRITE && offset + count > i_size) {
+ if (offset >= i_size)
+ return 0;
+ count = min_t(loff_t, count, fuse_round_up(i_size - offset));
+ }
+
+ io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
+ if (!io)
+ return -ENOMEM;
+ spin_lock_init(&io->lock);
+ io->reqs = 1;
+ io->bytes = -1;
+ io->size = 0;
+ io->offset = offset;
+ io->write = (rw == WRITE);
+ io->err = 0;
+ io->file = file;
+ /*
+ * By default, we want to optimize all I/Os with async request
+ * submission to the client filesystem if supported.
+ */
+ io->async = async_dio;
+ io->iocb = iocb;
+
+ /*
+ * We cannot asynchronously extend the size of a file. We have no method
+ * to wait on real async I/O requests, so we must submit this request
+ * synchronously.
+ */
+ if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE)
+ io->async = false;
if (rw == WRITE)
- ret = __fuse_direct_write(file, iov, nr_segs, &pos);
+ ret = __fuse_direct_write(io, iov, nr_segs, &pos);
else
- ret = __fuse_direct_read(file, iov, nr_segs, &pos);
+ ret = __fuse_direct_read(io, iov, nr_segs, &pos, count);
+
+ if (io->async) {
+ fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
+
+ /* we have a non-extending, async request, so return */
+ if (!is_sync_kiocb(iocb))
+ return -EIOCBQUEUED;
+
+ ret = wait_on_sync_kiocb(iocb);
+ } else {
+ kfree(io);
+ }
+
+ if (rw == WRITE) {
+ if (ret > 0)
+ fuse_write_update_size(inode, pos);
+ else if (ret < 0 && offset + count > i_size)
+ fuse_do_truncate(file);
+ }
return ret;
}
@@ -2251,6 +2459,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
loff_t length)
{
struct fuse_file *ff = file->private_data;
+ struct inode *inode = file->f_inode;
struct fuse_conn *fc = ff->fc;
struct fuse_req *req;
struct fuse_fallocate_in inarg = {
@@ -2260,13 +2469,23 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
.mode = mode
};
int err;
+ bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
+ (mode & FALLOC_FL_PUNCH_HOLE);
if (fc->no_fallocate)
return -EOPNOTSUPP;
+ if (lock_inode) {
+ mutex_lock(&inode->i_mutex);
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ fuse_set_nowrite(inode);
+ }
+
req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
req->in.h.opcode = FUSE_FALLOCATE;
req->in.h.nodeid = ff->nodeid;
@@ -2281,6 +2500,25 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
}
fuse_put_request(fc, req);
+ if (err)
+ goto out;
+
+ /* we could have extended the file */
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ fuse_write_update_size(inode, offset + length);
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ truncate_pagecache_range(inode, offset, offset + length - 1);
+
+ fuse_invalidate_attr(inode);
+
+out:
+ if (lock_inode) {
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ fuse_release_nowrite(inode);
+ mutex_unlock(&inode->i_mutex);
+ }
+
return err;
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6aeba864f070..fde7249a3a96 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -228,6 +228,20 @@ enum fuse_req_state {
FUSE_REQ_FINISHED
};
+/** The request IO state (for asynchronous processing) */
+struct fuse_io_priv {
+ int async;
+ spinlock_t lock;
+ unsigned reqs;
+ ssize_t bytes;
+ size_t size;
+ __u64 offset;
+ bool write;
+ int err;
+ struct kiocb *iocb;
+ struct file *file;
+};
+
/**
* A request to the client
*/
@@ -332,6 +346,9 @@ struct fuse_req {
/** Inode used in the request or NULL */
struct inode *inode;
+ /** AIO control block */
+ struct fuse_io_priv *io;
+
/** Link on fi->writepages */
struct list_head writepages_entry;
@@ -417,6 +434,10 @@ struct fuse_conn {
/** Batching of FORGET requests (positive indicates FORGET batch) */
int forget_batch;
+ /** Flag indicating that INIT reply has been received. Allocating
+ * any fuse request will be suspended until the flag is set */
+ int initialized;
+
/** Flag indicating if connection is blocked. This will be
the case before the INIT reply is received, and if there
are too many outstading backgrounds requests */
@@ -520,6 +541,9 @@ struct fuse_conn {
/** Does the filesystem want adaptive readdirplus? */
unsigned readdirplus_auto:1;
+ /** Does the filesystem support asynchronous direct-IO submission? */
+ unsigned async_dio:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -708,6 +732,13 @@ void fuse_request_free(struct fuse_req *req);
* caller should specify # elements in req->pages[] explicitly
*/
struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages);
+struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
+ unsigned npages);
+
+/*
+ * Increment reference count on request
+ */
+void __fuse_get_request(struct fuse_req *req);
/**
* Get a request, may fail with -ENOMEM,
@@ -823,7 +854,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
bool isdir);
-ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
+ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
unsigned long nr_segs, size_t count, loff_t *ppos,
int write);
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
@@ -835,4 +866,7 @@ int fuse_dev_release(struct inode *inode, struct file *file);
void fuse_write_update_size(struct inode *inode, loff_t pos);
+int fuse_do_setattr(struct inode *inode, struct iattr *attr,
+ struct file *file);
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 137185c3884f..0b578598c6ac 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -346,6 +346,7 @@ static void fuse_send_destroy(struct fuse_conn *fc)
fc->destroy_req = NULL;
req->in.h.opcode = FUSE_DESTROY;
req->force = 1;
+ req->background = 0;
fuse_request_send(fc, req);
fuse_put_request(fc, req);
}
@@ -362,6 +363,7 @@ void fuse_conn_kill(struct fuse_conn *fc)
spin_lock(&fc->lock);
fc->connected = 0;
fc->blocked = 0;
+ fc->initialized = 1;
spin_unlock(&fc->lock);
/* Flush all readers on this fs */
kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -581,7 +583,8 @@ void fuse_conn_init(struct fuse_conn *fc)
fc->khctr = 0;
fc->polled_files = RB_ROOT;
fc->reqctr = 0;
- fc->blocked = 1;
+ fc->blocked = 0;
+ fc->initialized = 0;
fc->attr_version = 1;
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
}
@@ -782,7 +785,7 @@ static const struct super_operations fuse_super_operations = {
static void sanitize_global_limit(unsigned *limit)
{
if (*limit == 0)
- *limit = ((num_physpages << PAGE_SHIFT) >> 13) /
+ *limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
sizeof(struct fuse_req);
if (*limit >= 1 << 16)
@@ -864,10 +867,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->dont_mask = 1;
if (arg->flags & FUSE_AUTO_INVAL_DATA)
fc->auto_inval_data = 1;
- if (arg->flags & FUSE_DO_READDIRPLUS)
+ if (arg->flags & FUSE_DO_READDIRPLUS) {
fc->do_readdirplus = 1;
- if (arg->flags & FUSE_READDIRPLUS_AUTO)
- fc->readdirplus_auto = 1;
+ if (arg->flags & FUSE_READDIRPLUS_AUTO)
+ fc->readdirplus_auto = 1;
+ }
+ if (arg->flags & FUSE_ASYNC_DIO)
+ fc->async_dio = 1;
} else {
ra_pages = fc->max_read / PAGE_CACHE_SIZE;
fc->no_lock = 1;
@@ -880,7 +886,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->max_write = max_t(unsigned, 4096, fc->max_write);
fc->conn_init = 1;
}
- fc->blocked = 0;
+ fc->initialized = 1;
wake_up_all(&fc->blocked_waitq);
}
@@ -895,7 +901,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
- FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO;
+ FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -1043,6 +1049,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
init_req = fuse_request_alloc(0);
if (!init_req)
goto err_put_root;
+ init_req->background = 1;
if (is_bdev) {
fc->destroy_req = fuse_request_alloc(0);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index eb08c9e43c2a..90c6a8faaecb 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -20,13 +20,12 @@ config GFS2_FS
be found here: http://sources.redhat.com/cluster
The "nolock" lock module is now built in to GFS2 by default. If
- you want to use the DLM, be sure to enable HOTPLUG and IPv4/6
- networking.
+ you want to use the DLM, be sure to enable IPv4/6 networking.
config GFS2_FS_LOCKING_DLM
bool "GFS2 DLM locking"
depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \
- HOTPLUG && DLM && CONFIGFS_FS && SYSFS
+ CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS)
help
Multiple node locking module for GFS2
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 24f414f0ce61..ee48ad37d9c0 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -20,6 +20,7 @@
#include <linux/swap.h>
#include <linux/gfs2_ondisk.h>
#include <linux/backing-dev.h>
+#include <linux/aio.h>
#include "gfs2.h"
#include "incore.h"
@@ -109,7 +110,7 @@ static int gfs2_writepage_common(struct page *page,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_CACHE_SIZE-1);
if (page->index > end_index || (page->index == end_index && !offset)) {
- page->mapping->a_ops->invalidatepage(page, 0);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
goto out;
}
return 1;
@@ -298,7 +299,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
/* Is the page fully outside i_size? (truncate in progress) */
if (page->index > end_index || (page->index == end_index && !offset)) {
- page->mapping->a_ops->invalidatepage(page, 0);
+ page->mapping->a_ops->invalidatepage(page, 0,
+ PAGE_CACHE_SIZE);
unlock_page(page);
continue;
}
@@ -942,27 +944,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
unlock_buffer(bh);
}
-static void gfs2_invalidatepage(struct page *page, unsigned long offset)
+static void gfs2_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+ unsigned int stop = offset + length;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
struct buffer_head *bh, *head;
unsigned long pos = 0;
BUG_ON(!PageLocked(page));
- if (offset == 0)
+ if (!partial_page)
ClearPageChecked(page);
if (!page_has_buffers(page))
goto out;
bh = head = page_buffers(page);
do {
+ if (pos + bh->b_size > stop)
+ return;
+
if (offset <= pos)
gfs2_discard(sdp, bh);
pos += bh->b_size;
bh = bh->b_this_page;
} while (bh != head);
out:
- if (offset == 0)
+ if (!partial_page)
try_to_release_page(page, 0);
}
@@ -1055,7 +1063,7 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
if (atomic_read(&bh->b_count))
goto cannot_release;
bd = bh->b_private;
- if (bd && bd->bd_ail)
+ if (bd && bd->bd_tr)
goto cannot_release;
if (buffer_pinned(bh) || buffer_dirty(bh))
goto not_possible;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5e83657f046e..5e2f56fccf6b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -787,7 +787,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
goto out_rlist;
if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
- gfs2_rs_deltree(ip, ip->i_res);
+ gfs2_rs_deltree(ip->i_res);
error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
RES_INDIRECT + RES_STATFS + RES_QUOTA,
@@ -1232,7 +1232,9 @@ static int do_grow(struct inode *inode, u64 size)
unstuff = 1;
}
- error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
+ error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
+ (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
+ 0 : RES_QUOTA), 0);
if (error)
goto do_grow_release;
@@ -1286,17 +1288,26 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
if (ret)
return ret;
+ ret = get_write_access(inode);
+ if (ret)
+ return ret;
+
inode_dio_wait(inode);
ret = gfs2_rs_alloc(GFS2_I(inode));
if (ret)
- return ret;
+ goto out;
oldsize = inode->i_size;
- if (newsize >= oldsize)
- return do_grow(inode, newsize);
+ if (newsize >= oldsize) {
+ ret = do_grow(inode, newsize);
+ goto out;
+ }
- return do_shrink(inode, oldsize, newsize);
+ ret = do_shrink(inode, oldsize, newsize);
+out:
+ put_write_access(inode);
+ return ret;
}
int gfs2_truncatei_resume(struct gfs2_inode *ip)
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4fddb3c22d25..f2448ab2aac5 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -109,8 +109,7 @@ fail:
return 0;
}
-static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode,
- struct qstr *str)
+static int gfs2_dhash(const struct dentry *dentry, struct qstr *str)
{
str->hash = gfs2_disk_hash(str->name, str->len);
return 0;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c3e82bd23179..0cb4c1557f20 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -354,22 +354,31 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
return ERR_PTR(-EIO);
}
- hc = kmalloc(hsize, GFP_NOFS);
- ret = -ENOMEM;
+ hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN);
+ if (hc == NULL)
+ hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL);
+
if (hc == NULL)
return ERR_PTR(-ENOMEM);
ret = gfs2_dir_read_data(ip, hc, hsize);
if (ret < 0) {
- kfree(hc);
+ if (is_vmalloc_addr(hc))
+ vfree(hc);
+ else
+ kfree(hc);
return ERR_PTR(ret);
}
spin_lock(&inode->i_lock);
- if (ip->i_hash_cache)
- kfree(hc);
- else
+ if (ip->i_hash_cache) {
+ if (is_vmalloc_addr(hc))
+ vfree(hc);
+ else
+ kfree(hc);
+ } else {
ip->i_hash_cache = hc;
+ }
spin_unlock(&inode->i_lock);
return ip->i_hash_cache;
@@ -385,7 +394,10 @@ void gfs2_dir_hash_inval(struct gfs2_inode *ip)
{
__be64 *hc = ip->i_hash_cache;
ip->i_hash_cache = NULL;
- kfree(hc);
+ if (is_vmalloc_addr(hc))
+ vfree(hc);
+ else
+ kfree(hc);
}
static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
@@ -1113,10 +1125,14 @@ static int dir_double_exhash(struct gfs2_inode *dip)
if (IS_ERR(hc))
return PTR_ERR(hc);
- h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS);
+ hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN);
+ if (hc2 == NULL)
+ hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL);
+
if (!hc2)
return -ENOMEM;
+ h = hc2;
error = gfs2_meta_inode_buffer(dip, &dibh);
if (error)
goto out_kfree;
@@ -1145,7 +1161,10 @@ fail:
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
out_kfree:
- kfree(hc2);
+ if (is_vmalloc_addr(hc2))
+ vfree(hc2);
+ else
+ kfree(hc2);
return error;
}
@@ -1194,9 +1213,7 @@ static int compare_dents(const void *a, const void *b)
/**
* do_filldir_main - read out directory entries
* @dip: The GFS2 inode
- * @offset: The offset in the file to read from
- * @opaque: opaque data to pass to filldir
- * @filldir: The function to pass entries to
+ * @ctx: what to feed the entries to
* @darr: an array of struct gfs2_dirent pointers to read
* @entries: the number of entries in darr
* @copied: pointer to int that's non-zero if a entry has been copied out
@@ -1206,11 +1223,10 @@ static int compare_dents(const void *a, const void *b)
* the possibility that they will fall into different readdir buffers or
* that someone will want to seek to that location.
*
- * Returns: errno, >0 on exception from filldir
+ * Returns: errno, >0 if the actor tells you to stop
*/
-static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
- void *opaque, filldir_t filldir,
+static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
const struct gfs2_dirent **darr, u32 entries,
int *copied)
{
@@ -1218,7 +1234,6 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
u64 off, off_next;
unsigned int x, y;
int run = 0;
- int error = 0;
sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
@@ -1235,9 +1250,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
off_next = be32_to_cpu(dent_next->de_hash);
off_next = gfs2_disk_hash2offset(off_next);
- if (off < *offset)
+ if (off < ctx->pos)
continue;
- *offset = off;
+ ctx->pos = off;
if (off_next == off) {
if (*copied && !run)
@@ -1246,26 +1261,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
} else
run = 0;
} else {
- if (off < *offset)
+ if (off < ctx->pos)
continue;
- *offset = off;
+ ctx->pos = off;
}
- error = filldir(opaque, (const char *)(dent + 1),
+ if (!dir_emit(ctx, (const char *)(dent + 1),
be16_to_cpu(dent->de_name_len),
- off, be64_to_cpu(dent->de_inum.no_addr),
- be16_to_cpu(dent->de_type));
- if (error)
+ be64_to_cpu(dent->de_inum.no_addr),
+ be16_to_cpu(dent->de_type)))
return 1;
*copied = 1;
}
- /* Increment the *offset by one, so the next time we come into the
+ /* Increment the ctx->pos by one, so the next time we come into the
do_filldir fxn, we get the next entry instead of the last one in the
current leaf */
- (*offset)++;
+ ctx->pos++;
return 0;
}
@@ -1289,8 +1303,8 @@ static void gfs2_free_sort_buffer(void *ptr)
kfree(ptr);
}
-static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir, int *copied, unsigned *depth,
+static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
+ int *copied, unsigned *depth,
u64 leaf_no)
{
struct gfs2_inode *ip = GFS2_I(inode);
@@ -1368,8 +1382,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
} while(lfn);
BUG_ON(entries2 != entries);
- error = do_filldir_main(ip, offset, opaque, filldir, darr,
- entries, copied);
+ error = do_filldir_main(ip, ctx, darr, entries, copied);
out_free:
for(i = 0; i < leaf; i++)
brelse(larr[i]);
@@ -1428,15 +1441,13 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
* @dip: dinode pointer
- * @offset: the hash of the last entry read shifted to the right once
- * @opaque: buffer for the filldir function to fill
- * @filldir: points to the filldir function to use
+ * @ctx: actor to feed the entries to
*
* Returns: errno
*/
-static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir, struct file_ra_state *f_ra)
+static int dir_e_read(struct inode *inode, struct dir_context *ctx,
+ struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
u32 hsize, len = 0;
@@ -1447,7 +1458,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
unsigned depth = 0;
hsize = 1 << dip->i_depth;
- hash = gfs2_dir_offset2hash(*offset);
+ hash = gfs2_dir_offset2hash(ctx->pos);
index = hash >> (32 - dip->i_depth);
if (dip->i_hash_cache == NULL)
@@ -1459,7 +1470,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
gfs2_dir_readahead(inode, hsize, index, f_ra);
while (index < hsize) {
- error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
+ error = gfs2_dir_read_leaf(inode, ctx,
&copied, &depth,
be64_to_cpu(lp[index]));
if (error)
@@ -1474,8 +1485,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
return error;
}
-int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir, struct file_ra_state *f_ra)
+int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
+ struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1489,7 +1500,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
return 0;
if (dip->i_diskflags & GFS2_DIF_EXHASH)
- return dir_e_read(inode, offset, opaque, filldir, f_ra);
+ return dir_e_read(inode, ctx, f_ra);
if (!gfs2_is_stuffed(dip)) {
gfs2_consist_inode(dip);
@@ -1521,7 +1532,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
error = -EIO;
goto out;
}
- error = do_filldir_main(dip, offset, opaque, filldir, darr,
+ error = do_filldir_main(dip, ctx, darr,
dip->i_entries, &copied);
out:
kfree(darr);
@@ -1537,9 +1548,9 @@ out:
/**
* gfs2_dir_search - Search a directory
- * @dip: The GFS2 inode
- * @filename:
- * @inode:
+ * @dir: The GFS2 dir inode
+ * @name: The name we are looking up
+ * @fail_on_exist: Fail if the name exists rather than looking it up
*
* This routine searches a directory for a file or another directory.
* Assumes a glock is held on dip.
@@ -1547,22 +1558,25 @@ out:
* Returns: errno
*/
-struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
+struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
+ bool fail_on_exist)
{
struct buffer_head *bh;
struct gfs2_dirent *dent;
- struct inode *inode;
+ u64 addr, formal_ino;
+ u16 dtype;
dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
if (dent) {
if (IS_ERR(dent))
return ERR_CAST(dent);
- inode = gfs2_inode_lookup(dir->i_sb,
- be16_to_cpu(dent->de_type),
- be64_to_cpu(dent->de_inum.no_addr),
- be64_to_cpu(dent->de_inum.no_formal_ino), 0);
+ dtype = be16_to_cpu(dent->de_type);
+ addr = be64_to_cpu(dent->de_inum.no_addr);
+ formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
brelse(bh);
- return inode;
+ if (fail_on_exist)
+ return ERR_PTR(-EEXIST);
+ return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
}
return ERR_PTR(-ENOENT);
}
@@ -1846,6 +1860,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
ht = kzalloc(size, GFP_NOFS);
+ if (ht == NULL)
+ ht = vzalloc(size);
if (!ht)
return -ENOMEM;
@@ -1933,7 +1949,10 @@ out_rlist:
gfs2_rlist_free(&rlist);
gfs2_quota_unhold(dip);
out:
- kfree(ht);
+ if (is_vmalloc_addr(ht))
+ vfree(ht);
+ else
+ kfree(ht);
return error;
}
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 98c960beab35..4f03bbd1873f 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -18,14 +18,15 @@ struct gfs2_inode;
struct gfs2_inum;
extern struct inode *gfs2_dir_search(struct inode *dir,
- const struct qstr *filename);
+ const struct qstr *filename,
+ bool fail_on_exist);
extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
-extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir, struct file_ra_state *f_ra);
+extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
+ struct file_ra_state *f_ra);
extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
const struct gfs2_inode *nip, unsigned int new_type);
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9973df4ff565..8b9b3775e2e7 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -64,6 +64,7 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
}
struct get_name_filldir {
+ struct dir_context ctx;
struct gfs2_inum_host inum;
char *name;
};
@@ -88,9 +89,11 @@ static int gfs2_get_name(struct dentry *parent, char *name,
struct inode *dir = parent->d_inode;
struct inode *inode = child->d_inode;
struct gfs2_inode *dip, *ip;
- struct get_name_filldir gnfd;
+ struct get_name_filldir gnfd = {
+ .ctx.actor = get_name_filldir,
+ .name = name
+ };
struct gfs2_holder gh;
- u64 offset = 0;
int error;
struct file_ra_state f_ra = { .start = 0 };
@@ -106,13 +109,12 @@ static int gfs2_get_name(struct dentry *parent, char *name,
*name = 0;
gnfd.inum.no_addr = ip->i_no_addr;
gnfd.inum.no_formal_ino = ip->i_no_formal_ino;
- gnfd.name = name;
error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
if (error)
return error;
- error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra);
+ error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra);
gfs2_glock_dq_uninit(&gh);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index d79c2dadc536..72c3866a7320 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -25,6 +25,7 @@
#include <asm/uaccess.h>
#include <linux/dlm.h>
#include <linux/dlm_plock.h>
+#include <linux/aio.h>
#include "gfs2.h"
#include "incore.h"
@@ -81,35 +82,28 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
}
/**
- * gfs2_readdir - Read directory entries from a directory
+ * gfs2_readdir - Iterator for a directory
* @file: The directory to read from
- * @dirent: Buffer for dirents
- * @filldir: Function used to do the copying
+ * @ctx: What to feed directory entries to
*
* Returns: errno
*/
-static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int gfs2_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *dir = file->f_mapping->host;
struct gfs2_inode *dip = GFS2_I(dir);
struct gfs2_holder d_gh;
- u64 offset = file->f_pos;
int error;
- gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
- error = gfs2_glock_nq(&d_gh);
- if (error) {
- gfs2_holder_uninit(&d_gh);
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+ if (error)
return error;
- }
- error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);
+ error = gfs2_dir_read(dir, ctx, &file->f_ra);
gfs2_glock_dq_uninit(&d_gh);
- file->f_pos = offset;
-
return error;
}
@@ -401,16 +395,20 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
/* Update file times before taking page lock */
file_update_time(vma->vm_file);
+ ret = get_write_access(inode);
+ if (ret)
+ goto out;
+
ret = gfs2_rs_alloc(ip);
if (ret)
- return ret;
+ goto out_write_access;
gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret)
- goto out;
+ goto out_uninit;
set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
set_bit(GIF_SW_PAGED, &ip->i_flags);
@@ -479,12 +477,15 @@ out_quota_unlock:
gfs2_quota_unlock(ip);
out_unlock:
gfs2_glock_dq(&gh);
-out:
+out_uninit:
gfs2_holder_uninit(&gh);
if (ret == 0) {
set_page_dirty(page);
wait_for_stable_page(page);
}
+out_write_access:
+ put_write_access(inode);
+out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(ret);
}
@@ -530,21 +531,30 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
}
/**
- * gfs2_open - open a file
- * @inode: the inode to open
- * @file: the struct file for this opening
+ * gfs2_open_common - This is common to open and atomic_open
+ * @inode: The inode being opened
+ * @file: The file being opened
*
- * Returns: errno
+ * This may be called under a glock or not depending upon how it has
+ * been called. We must always be called under a glock for regular
+ * files, however. For other file types, it does not matter whether
+ * we hold the glock or not.
+ *
+ * Returns: Error code or 0 for success
*/
-static int gfs2_open(struct inode *inode, struct file *file)
+int gfs2_open_common(struct inode *inode, struct file *file)
{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder i_gh;
struct gfs2_file *fp;
- int error;
+ int ret;
+
+ if (S_ISREG(inode->i_mode)) {
+ ret = generic_file_open(inode, file);
+ if (ret)
+ return ret;
+ }
- fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
+ fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS);
if (!fp)
return -ENOMEM;
@@ -552,29 +562,43 @@ static int gfs2_open(struct inode *inode, struct file *file)
gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
file->private_data = fp;
+ return 0;
+}
+
+/**
+ * gfs2_open - open a file
+ * @inode: the inode to open
+ * @file: the struct file for this opening
+ *
+ * After atomic_open, this function is only used for opening files
+ * which are already cached. We must still get the glock for regular
+ * files to ensure that we have the file size up to date for the large
+ * file check which is in the common code. That is only an issue for
+ * regular files though.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_open(struct inode *inode, struct file *file)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder i_gh;
+ int error;
+ bool need_unlock = false;
if (S_ISREG(ip->i_inode.i_mode)) {
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
&i_gh);
if (error)
- goto fail;
+ return error;
+ need_unlock = true;
+ }
- if (!(file->f_flags & O_LARGEFILE) &&
- i_size_read(inode) > MAX_NON_LFS) {
- error = -EOVERFLOW;
- goto fail_gunlock;
- }
+ error = gfs2_open_common(inode, file);
+ if (need_unlock)
gfs2_glock_dq_uninit(&i_gh);
- }
-
- return 0;
-fail_gunlock:
- gfs2_glock_dq_uninit(&i_gh);
-fail:
- file->private_data = NULL;
- kfree(fp);
return error;
}
@@ -593,10 +617,10 @@ static int gfs2_release(struct inode *inode, struct file *file)
kfree(file->private_data);
file->private_data = NULL;
- if ((file->f_mode & FMODE_WRITE) &&
- (atomic_read(&inode->i_writecount) == 1))
- gfs2_rs_delete(ip);
+ if (!(file->f_mode & FMODE_WRITE))
+ return 0;
+ gfs2_rs_delete(ip);
return 0;
}
@@ -888,7 +912,7 @@ out_uninit:
* cluster; until we do, disable leases (by just returning -EINVAL),
* unless the administrator has requested purely local locking.
*
- * Locking: called under lock_flocks
+ * Locking: called under i_lock
*
* Returns: errno
*/
@@ -1040,7 +1064,7 @@ const struct file_operations gfs2_file_fops = {
};
const struct file_operations gfs2_dir_fops = {
- .readdir = gfs2_readdir,
+ .iterate = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
.open = gfs2_open,
.release = gfs2_release,
@@ -1070,7 +1094,7 @@ const struct file_operations gfs2_file_fops_nolock = {
};
const struct file_operations gfs2_dir_fops_nolock = {
- .readdir = gfs2_readdir,
+ .iterate = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
.open = gfs2_open,
.release = gfs2_release,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index cf3515546739..9435384562a2 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -912,7 +912,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
*/
static void handle_callback(struct gfs2_glock *gl, unsigned int state,
- unsigned long delay)
+ unsigned long delay, bool remote)
{
int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
@@ -925,8 +925,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
gl->gl_demote_state = LM_ST_UNLOCKED;
}
if (gl->gl_ops->go_callback)
- gl->gl_ops->go_callback(gl);
- trace_gfs2_demote_rq(gl);
+ gl->gl_ops->go_callback(gl, remote);
+ trace_gfs2_demote_rq(gl, remote);
}
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
@@ -1017,11 +1017,11 @@ do_cancel:
return;
trap_recursive:
- print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip);
+ printk(KERN_ERR "original: %pSR\n", (void *)gh2->gh_ip);
printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
printk(KERN_ERR "lock type: %d req lock state : %d\n",
gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
- print_symbol(KERN_ERR "new: %s\n", gh->gh_ip);
+ printk(KERN_ERR "new: %pSR\n", (void *)gh->gh_ip);
printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
printk(KERN_ERR "lock type: %d req lock state : %d\n",
gh->gh_gl->gl_name.ln_type, gh->gh_state);
@@ -1091,7 +1091,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
spin_lock(&gl->gl_spin);
if (gh->gh_flags & GL_NOCACHE)
- handle_callback(gl, LM_ST_UNLOCKED, 0);
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
list_del_init(&gh->gh_list);
if (find_first_holder(gl) == NULL) {
@@ -1279,19 +1279,6 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
gfs2_glock_dq(&ghs[num_gh]);
}
-/**
- * gfs2_glock_dq_uninit_m - release multiple glocks
- * @num_gh: the number of structures
- * @ghs: an array of struct gfs2_holder structures
- *
- */
-
-void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
-{
- while (num_gh--)
- gfs2_glock_dq_uninit(&ghs[num_gh]);
-}
-
void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
{
unsigned long delay = 0;
@@ -1309,7 +1296,7 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
}
spin_lock(&gl->gl_spin);
- handle_callback(gl, state, delay);
+ handle_callback(gl, state, delay, true);
spin_unlock(&gl->gl_spin);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
gfs2_glock_put(gl);
@@ -1422,7 +1409,7 @@ __acquires(&lru_lock)
spin_unlock(&lru_lock);
spin_lock(&gl->gl_spin);
if (demote_ok(gl))
- handle_callback(gl, LM_ST_UNLOCKED, 0);
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
smp_mb__after_clear_bit();
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -1547,7 +1534,7 @@ static void clear_glock(struct gfs2_glock *gl)
spin_lock(&gl->gl_spin);
if (gl->gl_state != LM_ST_UNLOCKED)
- handle_callback(gl, LM_ST_UNLOCKED, 0);
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
spin_unlock(&gl->gl_spin);
gfs2_glock_hold(gl);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -1590,6 +1577,7 @@ static void dump_glock_func(struct gfs2_glock *gl)
void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
{
set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
+ flush_workqueue(glock_workqueue);
glock_hash_walk(clear_glock, sdp);
flush_workqueue(glock_workqueue);
wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index fd580b7861d5..69f66e3d22bf 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -201,7 +201,6 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
struct gfs2_holder *gh);
extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-extern void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
extern __printf(2, 3)
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 444b6503ebc4..5f2e5224c51c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -54,7 +54,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
struct gfs2_bufdata *bd, *tmp;
struct buffer_head *bh;
const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
- sector_t blocknr;
gfs2_log_lock(sdp);
spin_lock(&sdp->sd_ail_lock);
@@ -65,13 +64,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
continue;
gfs2_ail_error(gl, bh);
}
- blocknr = bh->b_blocknr;
- bh->b_private = NULL;
- gfs2_remove_from_ail(bd); /* drops ref on bh */
-
- bd->bd_bh = NULL;
- bd->bd_blkno = blocknr;
-
gfs2_trans_add_revoke(sdp, bd);
}
GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));
@@ -515,12 +507,12 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
*
* gl_spin lock is held while calling this
*/
-static void iopen_go_callback(struct gfs2_glock *gl)
+static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
{
struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
struct gfs2_sbd *sdp = gl->gl_sbd;
- if (sdp->sd_vfs->s_flags & MS_RDONLY)
+ if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY))
return;
if (gl->gl_demote_state == LM_ST_UNLOCKED &&
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 5c29216e9cc1..26aabd7caba7 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -31,7 +31,6 @@ struct gfs2_holder;
struct gfs2_glock;
struct gfs2_quota_data;
struct gfs2_trans;
-struct gfs2_ail;
struct gfs2_jdesc;
struct gfs2_sbd;
struct lm_lockops;
@@ -53,7 +52,7 @@ struct gfs2_log_header_host {
struct gfs2_log_operations {
void (*lo_before_commit) (struct gfs2_sbd *sdp);
- void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
+ void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
void (*lo_before_scan) (struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, int pass);
int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
@@ -139,7 +138,7 @@ struct gfs2_bufdata {
struct list_head bd_list;
const struct gfs2_log_operations *bd_ops;
- struct gfs2_ail *bd_ail;
+ struct gfs2_trans *bd_tr;
struct list_head bd_ail_st_list;
struct list_head bd_ail_gl_list;
};
@@ -211,7 +210,7 @@ struct gfs2_glock_operations {
int (*go_lock) (struct gfs2_holder *gh);
void (*go_unlock) (struct gfs2_holder *gh);
int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
- void (*go_callback) (struct gfs2_glock *gl);
+ void (*go_callback)(struct gfs2_glock *gl, bool remote);
const int go_type;
const unsigned long go_flags;
#define GLOF_ASPACE 1
@@ -433,6 +432,7 @@ struct gfs2_trans {
struct gfs2_holder tr_t_gh;
int tr_touched;
+ int tr_attached;
unsigned int tr_num_buf_new;
unsigned int tr_num_databuf_new;
@@ -440,14 +440,12 @@ struct gfs2_trans {
unsigned int tr_num_databuf_rm;
unsigned int tr_num_revoke;
unsigned int tr_num_revoke_rm;
-};
-struct gfs2_ail {
- struct list_head ai_list;
+ struct list_head tr_list;
- unsigned int ai_first;
- struct list_head ai_ail1_list;
- struct list_head ai_ail2_list;
+ unsigned int tr_first;
+ struct list_head tr_ail1_list;
+ struct list_head tr_ail2_list;
};
struct gfs2_journal_extent {
@@ -710,6 +708,7 @@ struct gfs2_sbd {
spinlock_t sd_log_lock;
+ struct gfs2_trans *sd_log_tr;
unsigned int sd_log_blks_reserved;
unsigned int sd_log_commited_buf;
unsigned int sd_log_commited_databuf;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index cc00bd1d1f87..bbb2715171cd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -189,6 +189,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
return inode;
fail_refresh:
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
ip->i_iopen_gh.gh_gl->gl_object = NULL;
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
fail_iopen:
@@ -312,7 +313,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
goto out;
}
- inode = gfs2_dir_search(dir, name);
+ inode = gfs2_dir_search(dir, name, false);
if (IS_ERR(inode))
error = PTR_ERR(inode);
out:
@@ -345,17 +346,6 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
if (!dip->i_inode.i_nlink)
return -ENOENT;
- error = gfs2_dir_check(&dip->i_inode, name, NULL);
- switch (error) {
- case -ENOENT:
- error = 0;
- break;
- case 0:
- return -EEXIST;
- default:
- return error;
- }
-
if (dip->i_entries == (u32)-1)
return -EFBIG;
if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
@@ -392,11 +382,15 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
int error;
int dblocks = 1;
- error = gfs2_inplace_reserve(ip, RES_DINODE, flags);
+ error = gfs2_quota_lock_check(ip);
if (error)
goto out;
- error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0);
+ error = gfs2_inplace_reserve(ip, RES_DINODE, flags);
+ if (error)
+ goto out_quota;
+
+ error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 0);
if (error)
goto out_ipreserv;
@@ -409,6 +403,8 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
out_ipreserv:
gfs2_inplace_release(ip);
+out_quota:
+ gfs2_quota_unlock(ip);
out:
return error;
}
@@ -440,59 +436,27 @@ static void gfs2_init_dir(struct buffer_head *dibh,
*/
static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
- const char *symname, struct buffer_head **bhp)
+ const char *symname)
{
- struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_dinode *di;
struct buffer_head *dibh;
- struct timespec tv = CURRENT_TIME;
dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr);
gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
- gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
di = (struct gfs2_dinode *)dibh->b_data;
+ gfs2_dinode_out(ip, di);
- di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
- di->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
- di->di_mode = cpu_to_be32(ip->i_inode.i_mode);
- di->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode));
- di->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode));
- di->di_nlink = 0;
- di->di_size = cpu_to_be64(ip->i_inode.i_size);
- di->di_blocks = cpu_to_be64(1);
- di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec);
di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev));
di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev));
- di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_no_addr);
- di->di_generation = cpu_to_be64(ip->i_generation);
- di->di_flags = 0;
di->__pad1 = 0;
- di->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) ? GFS2_FORMAT_DE : 0);
- di->di_height = 0;
di->__pad2 = 0;
di->__pad3 = 0;
- di->di_depth = 0;
- di->di_entries = 0;
memset(&di->__pad4, 0, sizeof(di->__pad4));
- di->di_eattr = 0;
- di->di_atime_nsec = cpu_to_be32(tv.tv_nsec);
- di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec);
- di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec);
memset(&di->di_reserved, 0, sizeof(di->di_reserved));
+ gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
switch(ip->i_inode.i_mode & S_IFMT) {
- case S_IFREG:
- if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
- gfs2_tune_get(sdp, gt_new_files_jdata))
- di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
- break;
case S_IFDIR:
- di->di_flags |= cpu_to_be32(dip->i_diskflags &
- GFS2_DIF_INHERIT_JDATA);
- di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
- di->di_size = cpu_to_be64(sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
- di->di_entries = cpu_to_be32(2);
gfs2_init_dir(dibh, dip);
break;
case S_IFLNK:
@@ -501,63 +465,17 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
}
set_buffer_uptodate(dibh);
-
- *bhp = dibh;
-}
-
-static int make_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
- const char *symname, struct buffer_head **bhp)
-{
- struct inode *inode = &ip->i_inode;
- struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
- int error;
-
- error = gfs2_rindex_update(sdp);
- if (error)
- return error;
-
- error = gfs2_quota_lock(dip, inode->i_uid, inode->i_gid);
- if (error)
- return error;
-
- error = gfs2_quota_check(dip, inode->i_uid, inode->i_gid);
- if (error)
- goto out_quota;
-
- error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0);
- if (error)
- goto out_quota;
-
- init_dinode(dip, ip, symname, bhp);
- gfs2_quota_change(dip, +1, inode->i_uid, inode->i_gid);
- gfs2_trans_end(sdp);
-
-out_quota:
- gfs2_quota_unlock(dip);
- return error;
+ brelse(dibh);
}
static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
- struct gfs2_inode *ip)
+ struct gfs2_inode *ip, int arq)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
- int alloc_required;
- struct buffer_head *dibh;
int error;
- error = gfs2_rindex_update(sdp);
- if (error)
- return error;
-
- error = gfs2_quota_lock(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
- if (error)
- goto fail;
-
- error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
- if (alloc_required < 0)
- goto fail_quota_locks;
- if (alloc_required) {
- error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid);
+ if (arq) {
+ error = gfs2_quota_lock_check(dip);
if (error)
goto fail_quota_locks;
@@ -581,26 +499,12 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
if (error)
goto fail_end_trans;
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- goto fail_end_trans;
- set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1);
- gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
- return 0;
-
fail_end_trans:
gfs2_trans_end(sdp);
-
fail_ipreserv:
- if (alloc_required)
- gfs2_inplace_release(dip);
-
+ gfs2_inplace_release(dip);
fail_quota_locks:
gfs2_quota_unlock(dip);
-
-fail:
return error;
}
@@ -631,6 +535,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
* gfs2_create_inode - Create a new inode
* @dir: The parent directory
* @dentry: The new dentry
+ * @file: If non-NULL, the file which is being opened
* @mode: The permissions on the new inode
* @dev: For device nodes, this is the device number
* @symname: For symlinks, this is the link destination
@@ -640,8 +545,9 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
*/
static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
+ struct file *file,
umode_t mode, dev_t dev, const char *symname,
- unsigned int size, int excl)
+ unsigned int size, int excl, int *opened)
{
const struct qstr *name = &dentry->d_name;
struct gfs2_holder ghs[2];
@@ -649,9 +555,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_glock *io_gl;
+ struct dentry *d;
int error;
- struct buffer_head *bh = NULL;
u32 aflags = 0;
+ int arq;
if (!name->len || name->len > GFS2_FNAMESIZE)
return -ENAMETOOLONG;
@@ -660,36 +567,81 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
return error;
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
+
error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
if (error)
goto fail;
error = create_ok(dip, name, mode);
- if ((error == -EEXIST) && S_ISREG(mode) && !excl) {
- inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- gfs2_glock_dq_uninit(ghs);
- d_instantiate(dentry, inode);
- return IS_ERR(inode) ? PTR_ERR(inode) : 0;
- }
if (error)
goto fail_gunlock;
- inode = new_inode(sdp->sd_vfs);
- if (!inode) {
+ inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
+ error = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ d = d_splice_alias(inode, dentry);
+ error = 0;
+ if (file && !IS_ERR(d)) {
+ if (d == NULL)
+ d = dentry;
+ if (S_ISREG(inode->i_mode))
+ error = finish_open(file, d, gfs2_open_common, opened);
+ else
+ error = finish_no_open(file, d);
+ }
gfs2_glock_dq_uninit(ghs);
- return -ENOMEM;
+ if (IS_ERR(d))
+ return PTR_RET(d);
+ return error;
+ } else if (error != -ENOENT) {
+ goto fail_gunlock;
}
+
+ arq = error = gfs2_diradd_alloc_required(dir, name);
+ if (error < 0)
+ goto fail_gunlock;
+
+ inode = new_inode(sdp->sd_vfs);
+ error = -ENOMEM;
+ if (!inode)
+ goto fail_gunlock;
+
ip = GFS2_I(inode);
error = gfs2_rs_alloc(ip);
if (error)
goto fail_free_inode;
- set_bit(GIF_INVALID, &ip->i_flags);
inode->i_mode = mode;
+ set_nlink(inode, S_ISDIR(mode) ? 2 : 1);
inode->i_rdev = dev;
inode->i_size = size;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ gfs2_set_inode_blocks(inode, 1);
munge_mode_uid_gid(dip, inode);
ip->i_goal = dip->i_goal;
+ ip->i_diskflags = 0;
+ ip->i_eattr = 0;
+ ip->i_height = 0;
+ ip->i_depth = 0;
+ ip->i_entries = 0;
+
+ switch(mode & S_IFMT) {
+ case S_IFREG:
+ if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
+ gfs2_tune_get(sdp, gt_new_files_jdata))
+ ip->i_diskflags |= GFS2_DIF_JDATA;
+ gfs2_set_aops(inode);
+ break;
+ case S_IFDIR:
+ ip->i_diskflags |= (dip->i_diskflags & GFS2_DIF_INHERIT_JDATA);
+ ip->i_diskflags |= GFS2_DIF_JDATA;
+ ip->i_entries = 2;
+ break;
+ }
+ gfs2_set_inode_flags(inode);
if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) ||
(dip->i_diskflags & GFS2_DIF_TOPDIR))
@@ -708,10 +660,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_free_inode;
- error = make_dinode(dip, ip, symname, &bh);
+ error = gfs2_trans_begin(sdp, RES_DINODE, 0);
if (error)
goto fail_gunlock2;
+ init_dinode(dip, ip, symname);
+ gfs2_trans_end(sdp);
+
error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
if (error)
goto fail_gunlock2;
@@ -725,10 +680,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
gfs2_set_iop(inode);
insert_inode_hash(inode);
- error = gfs2_inode_refresh(ip);
- if (error)
- goto fail_gunlock3;
-
error = gfs2_acl_create(dip, inode);
if (error)
goto fail_gunlock3;
@@ -737,20 +688,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_gunlock3;
- error = link_dinode(dip, name, ip);
+ error = link_dinode(dip, name, ip, arq);
if (error)
goto fail_gunlock3;
- if (bh)
- brelse(bh);
-
- gfs2_trans_end(sdp);
- gfs2_inplace_release(dip);
- gfs2_quota_unlock(dip);
mark_inode_dirty(inode);
- gfs2_glock_dq_uninit_m(2, ghs);
d_instantiate(dentry, inode);
- return 0;
+ if (file)
+ error = finish_open(file, dentry, gfs2_open_common, opened);
+ gfs2_glock_dq_uninit(ghs);
+ gfs2_glock_dq_uninit(ghs + 1);
+ return error;
fail_gunlock3:
gfs2_glock_dq_uninit(ghs + 1);
@@ -769,12 +717,12 @@ fail_free_inode:
fail_gunlock:
gfs2_glock_dq_uninit(ghs);
if (inode && !IS_ERR(inode)) {
+ clear_nlink(inode);
+ mark_inode_dirty(inode);
set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags);
iput(inode);
}
fail:
- if (bh)
- brelse(bh);
return error;
}
@@ -790,36 +738,56 @@ fail:
static int gfs2_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
- return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl);
+ return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL);
}
/**
- * gfs2_lookup - Look up a filename in a directory and return its inode
+ * __gfs2_lookup - Look up a filename in a directory and return its inode
* @dir: The directory inode
* @dentry: The dentry of the new inode
- * @nd: passed from Linux VFS, ignored by us
+ * @file: File to be opened
+ * @opened: atomic_open flags
*
- * Called by the VFS layer. Lock dir and call gfs2_lookupi()
*
* Returns: errno
*/
-static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
+static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
+ struct file *file, int *opened)
{
- struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- if (inode && !IS_ERR(inode)) {
- struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
- struct gfs2_holder gh;
- int error;
- error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
- if (error) {
- iput(inode);
- return ERR_PTR(error);
- }
- gfs2_glock_dq_uninit(&gh);
+ struct inode *inode;
+ struct dentry *d;
+ struct gfs2_holder gh;
+ struct gfs2_glock *gl;
+ int error;
+
+ inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+ if (!inode)
+ return NULL;
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ gl = GFS2_I(inode)->i_gl;
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+ if (error) {
+ iput(inode);
+ return ERR_PTR(error);
}
- return d_splice_alias(inode, dentry);
+
+ d = d_splice_alias(inode, dentry);
+ if (file && S_ISREG(inode->i_mode))
+ error = finish_open(file, dentry, gfs2_open_common, opened);
+
+ gfs2_glock_dq_uninit(&gh);
+ if (error)
+ return ERR_PTR(error);
+ return d;
+}
+
+static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned flags)
+{
+ return __gfs2_lookup(dir, dentry, NULL, NULL);
}
/**
@@ -1137,7 +1105,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
return -ENAMETOOLONG;
- return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0);
+ return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL);
}
/**
@@ -1151,7 +1119,9 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
- return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0);
+ struct gfs2_sbd *sdp = GFS2_SB(dir);
+ unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+ return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL);
}
/**
@@ -1166,7 +1136,43 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t dev)
{
- return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0);
+ return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL);
+}
+
+/**
+ * gfs2_atomic_open - Atomically open a file
+ * @dir: The directory
+ * @dentry: The proposed new entry
+ * @file: The proposed new struct file
+ * @flags: open flags
+ * @mode: File mode
+ * @opened: Flag to say whether the file has been opened or not
+ *
+ * Returns: error code or 0 for success
+ */
+
+static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned flags,
+ umode_t mode, int *opened)
+{
+ struct dentry *d;
+ bool excl = !!(flags & O_EXCL);
+
+ d = __gfs2_lookup(dir, dentry, file, opened);
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+ if (d == NULL)
+ d = dentry;
+ if (d->d_inode) {
+ if (!(*opened & FILE_OPENED))
+ return finish_no_open(file, d);
+ return 0;
+ }
+
+ if (!(flags & O_CREAT))
+ return -ENOENT;
+
+ return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened);
}
/*
@@ -1846,6 +1852,7 @@ const struct inode_operations gfs2_dir_iops = {
.removexattr = gfs2_removexattr,
.fiemap = gfs2_fiemap,
.get_acl = gfs2_get_acl,
+ .atomic_open = gfs2_atomic_open,
};
const struct inode_operations gfs2_symlink_iops = {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c53c7477f6da..ba4d9492d422 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,6 +109,7 @@ extern int gfs2_permission(struct inode *inode, int mask);
extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
+extern int gfs2_open_common(struct inode *inode, struct file *file);
extern const struct inode_operations gfs2_file_iops;
extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 9a2ca8be7647..610613fb65b5 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -73,7 +73,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
{
- bd->bd_ail = NULL;
+ bd->bd_tr = NULL;
list_del_init(&bd->bd_ail_st_list);
list_del_init(&bd->bd_ail_gl_list);
atomic_dec(&bd->bd_gl->gl_ail_count);
@@ -90,7 +90,7 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
static int gfs2_ail1_start_one(struct gfs2_sbd *sdp,
struct writeback_control *wbc,
- struct gfs2_ail *ai)
+ struct gfs2_trans *tr)
__releases(&sdp->sd_ail_lock)
__acquires(&sdp->sd_ail_lock)
{
@@ -99,15 +99,15 @@ __acquires(&sdp->sd_ail_lock)
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
- list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, bd_ail_st_list) {
+ list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) {
bh = bd->bd_bh;
- gfs2_assert(sdp, bd->bd_ail == ai);
+ gfs2_assert(sdp, bd->bd_tr == tr);
if (!buffer_busy(bh)) {
if (!buffer_uptodate(bh))
gfs2_io_error_bh(sdp, bh);
- list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
+ list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
continue;
}
@@ -116,7 +116,7 @@ __acquires(&sdp->sd_ail_lock)
if (gl == bd->bd_gl)
continue;
gl = bd->bd_gl;
- list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+ list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list);
mapping = bh->b_page->mapping;
if (!mapping)
continue;
@@ -144,15 +144,15 @@ __acquires(&sdp->sd_ail_lock)
void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
{
struct list_head *head = &sdp->sd_ail1_list;
- struct gfs2_ail *ai;
+ struct gfs2_trans *tr;
trace_gfs2_ail_flush(sdp, wbc, 1);
spin_lock(&sdp->sd_ail_lock);
restart:
- list_for_each_entry_reverse(ai, head, ai_list) {
+ list_for_each_entry_reverse(tr, head, tr_list) {
if (wbc->nr_to_write <= 0)
break;
- if (gfs2_ail1_start_one(sdp, wbc, ai))
+ if (gfs2_ail1_start_one(sdp, wbc, tr))
goto restart;
}
spin_unlock(&sdp->sd_ail_lock);
@@ -183,20 +183,20 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
*
*/
-static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
- list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
+ list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list,
bd_ail_st_list) {
bh = bd->bd_bh;
- gfs2_assert(sdp, bd->bd_ail == ai);
+ gfs2_assert(sdp, bd->bd_tr == tr);
if (buffer_busy(bh))
continue;
if (!buffer_uptodate(bh))
gfs2_io_error_bh(sdp, bh);
- list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
+ list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
}
}
@@ -210,16 +210,17 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
{
- struct gfs2_ail *ai, *s;
+ struct gfs2_trans *tr, *s;
+ int oldest_tr = 1;
int ret;
spin_lock(&sdp->sd_ail_lock);
- list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
- gfs2_ail1_empty_one(sdp, ai);
- if (list_empty(&ai->ai_ail1_list))
- list_move(&ai->ai_list, &sdp->sd_ail2_list);
+ list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
+ gfs2_ail1_empty_one(sdp, tr);
+ if (list_empty(&tr->tr_ail1_list) && oldest_tr)
+ list_move(&tr->tr_list, &sdp->sd_ail2_list);
else
- break;
+ oldest_tr = 0;
}
ret = list_empty(&sdp->sd_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
@@ -229,13 +230,13 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
static void gfs2_ail1_wait(struct gfs2_sbd *sdp)
{
- struct gfs2_ail *ai;
+ struct gfs2_trans *tr;
struct gfs2_bufdata *bd;
struct buffer_head *bh;
spin_lock(&sdp->sd_ail_lock);
- list_for_each_entry_reverse(ai, &sdp->sd_ail1_list, ai_list) {
- list_for_each_entry(bd, &ai->ai_ail1_list, bd_ail_st_list) {
+ list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry(bd, &tr->tr_ail1_list, bd_ail_st_list) {
bh = bd->bd_bh;
if (!buffer_locked(bh))
continue;
@@ -256,40 +257,40 @@ static void gfs2_ail1_wait(struct gfs2_sbd *sdp)
*
*/
-static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
- struct list_head *head = &ai->ai_ail2_list;
+ struct list_head *head = &tr->tr_ail2_list;
struct gfs2_bufdata *bd;
while (!list_empty(head)) {
bd = list_entry(head->prev, struct gfs2_bufdata,
bd_ail_st_list);
- gfs2_assert(sdp, bd->bd_ail == ai);
+ gfs2_assert(sdp, bd->bd_tr == tr);
gfs2_remove_from_ail(bd);
}
}
static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
{
- struct gfs2_ail *ai, *safe;
+ struct gfs2_trans *tr, *safe;
unsigned int old_tail = sdp->sd_log_tail;
int wrap = (new_tail < old_tail);
int a, b, rm;
spin_lock(&sdp->sd_ail_lock);
- list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
- a = (old_tail <= ai->ai_first);
- b = (ai->ai_first < new_tail);
+ list_for_each_entry_safe(tr, safe, &sdp->sd_ail2_list, tr_list) {
+ a = (old_tail <= tr->tr_first);
+ b = (tr->tr_first < new_tail);
rm = (wrap) ? (a || b) : (a && b);
if (!rm)
continue;
- gfs2_ail2_empty_one(sdp, ai);
- list_del(&ai->ai_list);
- gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
- gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
- kfree(ai);
+ gfs2_ail2_empty_one(sdp, tr);
+ list_del(&tr->tr_list);
+ gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list));
+ gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list));
+ kfree(tr);
}
spin_unlock(&sdp->sd_ail_lock);
@@ -317,7 +318,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
{
- unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
+ unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize);
unsigned wanted = blks + reserved_blks;
DEFINE_WAIT(wait);
int did_wait = 0;
@@ -435,7 +436,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp)
static unsigned int current_tail(struct gfs2_sbd *sdp)
{
- struct gfs2_ail *ai;
+ struct gfs2_trans *tr;
unsigned int tail;
spin_lock(&sdp->sd_ail_lock);
@@ -443,8 +444,9 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
if (list_empty(&sdp->sd_ail1_list)) {
tail = sdp->sd_log_head;
} else {
- ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list);
- tail = ai->ai_first;
+ tr = list_entry(sdp->sd_ail1_list.prev, struct gfs2_trans,
+ tr_list);
+ tail = tr->tr_first;
}
spin_unlock(&sdp->sd_ail_lock);
@@ -544,6 +546,76 @@ void gfs2_ordered_del_inode(struct gfs2_inode *ip)
spin_unlock(&sdp->sd_ordered_lock);
}
+void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
+{
+ struct buffer_head *bh = bd->bd_bh;
+ struct gfs2_glock *gl = bd->bd_gl;
+
+ gfs2_remove_from_ail(bd);
+ bd->bd_bh = NULL;
+ bh->b_private = NULL;
+ bd->bd_blkno = bh->b_blocknr;
+ bd->bd_ops = &gfs2_revoke_lops;
+ sdp->sd_log_num_revoke++;
+ atomic_inc(&gl->gl_revokes);
+ set_bit(GLF_LFLUSH, &gl->gl_flags);
+ list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
+}
+
+void gfs2_write_revokes(struct gfs2_sbd *sdp)
+{
+ struct gfs2_trans *tr;
+ struct gfs2_bufdata *bd, *tmp;
+ int have_revokes = 0;
+ int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
+
+ gfs2_ail1_empty(sdp);
+ spin_lock(&sdp->sd_ail_lock);
+ list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
+ if (list_empty(&bd->bd_list)) {
+ have_revokes = 1;
+ goto done;
+ }
+ }
+ }
+done:
+ spin_unlock(&sdp->sd_ail_lock);
+ if (have_revokes == 0)
+ return;
+ while (sdp->sd_log_num_revoke > max_revokes)
+ max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
+ max_revokes -= sdp->sd_log_num_revoke;
+ if (!sdp->sd_log_num_revoke) {
+ atomic_dec(&sdp->sd_log_blks_free);
+ /* If no blocks have been reserved, we need to also
+ * reserve a block for the header */
+ if (!sdp->sd_log_blks_reserved)
+ atomic_dec(&sdp->sd_log_blks_free);
+ }
+ gfs2_log_lock(sdp);
+ spin_lock(&sdp->sd_ail_lock);
+ list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
+ if (max_revokes == 0)
+ goto out_of_blocks;
+ if (!list_empty(&bd->bd_list))
+ continue;
+ gfs2_add_revoke(sdp, bd);
+ max_revokes--;
+ }
+ }
+out_of_blocks:
+ spin_unlock(&sdp->sd_ail_lock);
+ gfs2_log_unlock(sdp);
+
+ if (!sdp->sd_log_num_revoke) {
+ atomic_inc(&sdp->sd_log_blks_free);
+ if (!sdp->sd_log_blks_reserved)
+ atomic_inc(&sdp->sd_log_blks_free);
+ }
+}
+
/**
* log_write_header - Get and initialize a journal header buffer
* @sdp: The GFS2 superblock
@@ -561,7 +633,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
lh = page_address(page);
clear_page(lh);
- gfs2_ail1_empty(sdp);
tail = current_tail(sdp);
lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -600,7 +671,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
{
- struct gfs2_ail *ai;
+ struct gfs2_trans *tr;
down_write(&sdp->sd_log_flush_lock);
@@ -611,9 +682,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
}
trace_gfs2_log_flush(sdp, 1);
- ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
- INIT_LIST_HEAD(&ai->ai_ail1_list);
- INIT_LIST_HEAD(&ai->ai_ail2_list);
+ tr = sdp->sd_log_tr;
+ if (tr) {
+ sdp->sd_log_tr = NULL;
+ INIT_LIST_HEAD(&tr->tr_ail1_list);
+ INIT_LIST_HEAD(&tr->tr_ail2_list);
+ }
if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) {
printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf,
@@ -630,7 +704,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
sdp->sd_log_flush_head = sdp->sd_log_head;
sdp->sd_log_flush_wrapped = 0;
- ai->ai_first = sdp->sd_log_flush_head;
+ if (tr)
+ tr->tr_first = sdp->sd_log_flush_head;
gfs2_ordered_write(sdp);
lops_before_commit(sdp);
@@ -643,7 +718,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
trace_gfs2_log_blocks(sdp, -1);
log_write_header(sdp, 0);
}
- lops_after_commit(sdp, ai);
+ lops_after_commit(sdp, tr);
gfs2_log_lock(sdp);
sdp->sd_log_head = sdp->sd_log_flush_head;
@@ -653,16 +728,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
sdp->sd_log_commited_revoke = 0;
spin_lock(&sdp->sd_ail_lock);
- if (!list_empty(&ai->ai_ail1_list)) {
- list_add(&ai->ai_list, &sdp->sd_ail1_list);
- ai = NULL;
+ if (tr && !list_empty(&tr->tr_ail1_list)) {
+ list_add(&tr->tr_list, &sdp->sd_ail1_list);
+ tr = NULL;
}
spin_unlock(&sdp->sd_ail_lock);
gfs2_log_unlock(sdp);
trace_gfs2_log_flush(sdp, 0);
up_write(&sdp->sd_log_flush_lock);
- kfree(ai);
+ kfree(tr);
}
static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
@@ -687,6 +762,12 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
sdp->sd_jdesc->jd_blocks);
sdp->sd_log_blks_reserved = reserved;
+ if (sdp->sd_log_tr == NULL &&
+ (tr->tr_num_buf_new || tr->tr_num_databuf_new)) {
+ gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl);
+ sdp->sd_log_tr = tr;
+ tr->tr_attached = 1;
+ }
gfs2_log_unlock(sdp);
}
@@ -708,7 +789,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
log_refund(sdp, tr);
- up_read(&sdp->sd_log_flush_lock);
if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 3566f35915e0..37216634f0aa 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -72,5 +72,7 @@ extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
extern int gfs2_logd(void *data);
+extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index a5055977a214..17c5b5d7dc88 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -16,6 +16,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/bio.h>
#include <linux/fs.h>
+#include <linux/list_sort.h>
#include "gfs2.h"
#include "incore.h"
@@ -53,8 +54,8 @@ void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
* to in-place disk block, remove it from the AIL.
*/
spin_lock(&sdp->sd_ail_lock);
- if (bd->bd_ail)
- list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
+ if (bd->bd_tr)
+ list_move(&bd->bd_ail_st_list, &bd->bd_tr->tr_ail2_list);
spin_unlock(&sdp->sd_ail_lock);
get_bh(bh);
atomic_inc(&sdp->sd_log_pinned);
@@ -94,7 +95,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
*/
static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
- struct gfs2_ail *ai)
+ struct gfs2_trans *tr)
{
struct gfs2_bufdata *bd = bh->b_private;
@@ -109,7 +110,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
maybe_release_space(bd);
spin_lock(&sdp->sd_ail_lock);
- if (bd->bd_ail) {
+ if (bd->bd_tr) {
list_del(&bd->bd_ail_st_list);
brelse(bh);
} else {
@@ -117,8 +118,8 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
atomic_inc(&gl->gl_ail_count);
}
- bd->bd_ail = ai;
- list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+ bd->bd_tr = tr;
+ list_add(&bd->bd_ail_st_list, &tr->tr_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
@@ -212,7 +213,7 @@ static void gfs2_end_log_write(struct bio *bio, int error)
fs_err(sdp, "Error %d writing to log\n", error);
}
- bio_for_each_segment(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i) {
page = bvec->bv_page;
if (page_has_buffers(page))
gfs2_end_log_write_bh(sdp, bvec, error);
@@ -300,7 +301,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno)
u64 nblk;
if (bio) {
- nblk = bio->bi_sector + bio_sectors(bio);
+ nblk = bio_end_sector(bio);
nblk >>= sdp->sd_fsb2bb_shift;
if (blkno == nblk)
return bio;
@@ -401,6 +402,20 @@ static void gfs2_check_magic(struct buffer_head *bh)
kunmap_atomic(kaddr);
}
+static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct gfs2_bufdata *bda, *bdb;
+ /* list_sort() comparator for bd_list entries; priv is not used here. */
+ bda = list_entry(a, struct gfs2_bufdata, bd_list);
+ bdb = list_entry(b, struct gfs2_bufdata, bd_list);
+ /* Compare bd_bh->b_blocknr: -1 when a's is lower, 1 when higher, 0 when equal. */
+ if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
+ return -1;
+ if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
+ return 1;
+ return 0;
+}
+
static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
unsigned int total, struct list_head *blist,
bool is_databuf)
@@ -413,13 +428,16 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
__be64 *ptr;
gfs2_log_lock(sdp);
+ list_sort(NULL, blist, blocknr_cmp);
bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list);
while(total) {
num = total;
if (total > limit)
num = limit;
gfs2_log_unlock(sdp);
- page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA, num + 1, num);
+ page = gfs2_get_log_desc(sdp,
+ is_databuf ? GFS2_LOG_DESC_JDATA :
+ GFS2_LOG_DESC_METADATA, num + 1, num);
ld = page_address(page);
gfs2_log_lock(sdp);
ptr = (__be64 *)(ld + 1);
@@ -480,17 +498,22 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
&sdp->sd_log_le_buf, 0);
}
-static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
struct list_head *head = &sdp->sd_log_le_buf;
struct gfs2_bufdata *bd;
+ if (tr == NULL) { /* no transaction: only check that sd_log_le_buf is empty */
+ gfs2_assert(sdp, list_empty(head));
+ return;
+ }
+ /* Otherwise drain sd_log_le_buf, unpinning each entry's buffer against tr. */
while (!list_empty(head)) {
bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
sdp->sd_log_num_buf--;
- gfs2_unpin(sdp, bd->bd_bh, ai);
+ gfs2_unpin(sdp, bd->bd_bh, tr);
}
gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
}
@@ -583,6 +606,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
struct page *page;
unsigned int length;
+ gfs2_write_revokes(sdp);
if (!sdp->sd_log_num_revoke)
return;
@@ -613,7 +637,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
gfs2_log_write_page(sdp, page);
}
-static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
struct list_head *head = &sdp->sd_log_le_revoke;
struct gfs2_bufdata *bd;
@@ -791,16 +815,21 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
}
-static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
struct list_head *head = &sdp->sd_log_le_databuf;
struct gfs2_bufdata *bd;
+ if (tr == NULL) { /* no transaction: only check that sd_log_le_databuf is empty */
+ gfs2_assert(sdp, list_empty(head));
+ return;
+ }
+ /* Otherwise drain sd_log_le_databuf, unpinning each entry's buffer against tr. */
while (!list_empty(head)) {
bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
sdp->sd_log_num_databuf--;
- gfs2_unpin(sdp, bd->bd_bh, ai);
+ gfs2_unpin(sdp, bd->bd_bh, tr);
}
gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
}
@@ -824,10 +853,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = {
.lo_name = "revoke",
};
-const struct gfs2_log_operations gfs2_rg_lops = {
- .lo_name = "rg",
-};
-
const struct gfs2_log_operations gfs2_databuf_lops = {
.lo_before_commit = databuf_lo_before_commit,
.lo_after_commit = databuf_lo_after_commit,
@@ -839,7 +864,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
const struct gfs2_log_operations *gfs2_log_ops[] = {
&gfs2_databuf_lops,
&gfs2_buf_lops,
- &gfs2_rg_lops,
&gfs2_revoke_lops,
NULL,
};
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index ba77b7da8325..9ca2e6438419 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -23,7 +23,6 @@
extern const struct gfs2_log_operations gfs2_glock_lops;
extern const struct gfs2_log_operations gfs2_buf_lops;
extern const struct gfs2_log_operations gfs2_revoke_lops;
-extern const struct gfs2_log_operations gfs2_rg_lops;
extern const struct gfs2_log_operations gfs2_databuf_lops;
extern const struct gfs2_log_operations *gfs2_log_ops[];
@@ -55,12 +54,13 @@ static inline void lops_before_commit(struct gfs2_sbd *sdp)
gfs2_log_ops[x]->lo_before_commit(sdp);
}
-static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static inline void lops_after_commit(struct gfs2_sbd *sdp,
+ struct gfs2_trans *tr)
{
int x;
for (x = 0; gfs2_log_ops[x]; x++)
if (gfs2_log_ops[x]->lo_after_commit)
- gfs2_log_ops[x]->lo_after_commit(sdp, ai);
+ gfs2_log_ops[x]->lo_after_commit(sdp, tr);
}
static inline void lops_before_scan(struct gfs2_jdesc *jd,
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index b059bbb5059e..0da390686c08 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -295,11 +295,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
}
if (bd) {
spin_lock(&sdp->sd_ail_lock);
- if (bd->bd_ail) {
- gfs2_remove_from_ail(bd);
- bh->b_private = NULL;
- bd->bd_bh = NULL;
- bd->bd_blkno = bh->b_blocknr;
+ if (bd->bd_tr) {
gfs2_trans_add_revoke(sdp, bd);
}
spin_unlock(&sdp->sd_ail_lock);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 60ede2a0f43f..0262c190b6f9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -916,16 +916,16 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
goto fail_quotad;
p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
- error = IS_ERR(p);
- if (error) {
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
fs_err(sdp, "can't start logd thread: %d\n", error);
return error;
}
sdp->sd_logd_process = p;
p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
- error = IS_ERR(p);
- if (error) {
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
fs_err(sdp, "can't start quotad thread: %d\n", error);
goto fail;
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c7c840e916f8..3768c2f40e43 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -121,7 +121,7 @@ static u64 qd2index(struct gfs2_quota_data *qd)
{
struct kqid qid = qd->qd_id;
return (2 * (u64)from_kqid(&init_user_ns, qid)) +
- (qid.type == USRQUOTA) ? 0 : 1;
+ ((qid.type == USRQUOTA) ? 0 : 1);
}
static u64 qd2offset(struct gfs2_quota_data *qd)
@@ -721,7 +721,7 @@ get_a_page:
goto unlock_out;
}
- gfs2_trans_add_meta(ip->i_gl, bh);
+ gfs2_trans_add_data(ip->i_gl, bh);
kaddr = kmap_atomic(page);
if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
@@ -1154,11 +1154,6 @@ int gfs2_quota_sync(struct super_block *sb, int type)
return error;
}
-static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
-{
- return gfs2_quota_sync(sb, type);
-}
-
int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
{
struct gfs2_quota_data *qd;
@@ -1414,7 +1409,7 @@ int gfs2_quotad(void *data)
&tune->gt_statfs_quantum);
/* Update quota file */
- quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
+ quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
&quotad_timeo, &tune->gt_quota_quantum);
/* Check for & recover partially truncated inodes */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 5a51265a4341..69317435faa7 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -592,7 +592,7 @@ static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
* @rs: The reservation to remove
*
*/
-static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
+static void __rs_deltree(struct gfs2_blkreserv *rs)
{
struct gfs2_rgrpd *rgd;
@@ -605,7 +605,7 @@ static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
RB_CLEAR_NODE(&rs->rs_node);
if (rs->rs_free) {
- /* return reserved blocks to the rgrp and the ip */
+ /* return reserved blocks to the rgrp */
BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
rs->rs_free = 0;
@@ -619,14 +619,14 @@ static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
* @rs: The reservation to remove
*
*/
-void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
+void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
{
struct gfs2_rgrpd *rgd;
rgd = rs->rs_rbm.rgd;
if (rgd) {
spin_lock(&rgd->rd_rsspin);
- __rs_deltree(ip, rs);
+ __rs_deltree(rs);
spin_unlock(&rgd->rd_rsspin);
}
}
@@ -638,9 +638,11 @@ void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
*/
void gfs2_rs_delete(struct gfs2_inode *ip)
{
+ struct inode *inode = &ip->i_inode;
+
down_write(&ip->i_rw_mutex);
- if (ip->i_res) {
- gfs2_rs_deltree(ip, ip->i_res);
+ if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) {
+ gfs2_rs_deltree(ip->i_res);
BUG_ON(ip->i_res->rs_free);
kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
ip->i_res = NULL;
@@ -664,7 +666,7 @@ static void return_all_reservations(struct gfs2_rgrpd *rgd)
spin_lock(&rgd->rd_rsspin);
while ((n = rb_first(&rgd->rd_rstree))) {
rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
- __rs_deltree(NULL, rs);
+ __rs_deltree(rs);
}
spin_unlock(&rgd->rd_rsspin);
}
@@ -1286,13 +1288,15 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
minlen = max_t(u64, r.minlen,
q->limits.discard_granularity) >> bs_shift;
+ if (end <= start || minlen > sdp->sd_max_rg_data)
+ return -EINVAL;
+
rgd = gfs2_blk2rgrpd(sdp, start, 0);
- rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0);
+ rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
- if (end <= start ||
- minlen > sdp->sd_max_rg_data ||
- start > rgd_end->rd_data0 + rgd_end->rd_data)
- return -EINVAL;
+ if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
+ && (start > rgd_end->rd_data0 + rgd_end->rd_data))
+ return -EINVAL; /* start is beyond the end of the fs */
while (1) {
@@ -1334,7 +1338,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
}
out:
- r.len = trimmed << 9;
+ r.len = trimmed << bs_shift;
if (copy_to_user(argp, &r, sizeof(r)))
return -EFAULT;
@@ -1401,9 +1405,14 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
u32 extlen;
u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
int ret;
+ struct inode *inode = &ip->i_inode;
- extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested);
- extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
+ if (S_ISDIR(inode->i_mode))
+ extlen = 1;
+ else {
+ extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested);
+ extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
+ }
if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
return;
@@ -1874,7 +1883,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
/* Drop reservation, if we couldn't use reserved rgrp */
if (gfs2_rs_active(rs))
- gfs2_rs_deltree(ip, rs);
+ gfs2_rs_deltree(rs);
check_rgrp:
/* Check for unlinked inodes which can be reclaimed */
if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
@@ -2087,7 +2096,7 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip,
if (rs->rs_free && !ret)
goto out;
}
- __rs_deltree(ip, rs);
+ __rs_deltree(rs);
}
out:
spin_unlock(&rgd->rd_rsspin);
@@ -2180,13 +2189,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
if (dinode)
gfs2_trans_add_unrevoke(sdp, block, 1);
- /*
- * This needs reviewing to see why we cannot do the quota change
- * at this point in the dinode case.
- */
- if (ndata)
- gfs2_quota_change(ip, ndata, ip->i_inode.i_uid,
- ip->i_inode.i_gid);
+ gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
rbm.rgd->rd_free_clone -= *nblocks;
trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 842185853f6b..5b3f4a896e6c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -47,7 +47,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
bool dinode, u64 *generation);
extern int gfs2_rs_alloc(struct gfs2_inode *ip);
-extern void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs);
+extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
extern void gfs2_rs_delete(struct gfs2_inode *ip);
extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index cab77b8ba84f..e5639dec66c4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1444,6 +1444,7 @@ static void gfs2_evict_inode(struct inode *inode)
/* Must not read inode block until block type has been verified */
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (unlikely(error)) {
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
goto out;
}
@@ -1512,10 +1513,12 @@ out_truncate:
out_unlock:
/* Error path for case 1 */
if (gfs2_rs_active(ip->i_res))
- gfs2_rs_deltree(ip, ip->i_res);
+ gfs2_rs_deltree(ip->i_res);
- if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
+ if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq(&ip->i_iopen_gh);
+ }
gfs2_holder_uninit(&ip->i_iopen_gh);
gfs2_glock_dq_uninit(&gh);
if (error && error != GLR_TRYFAILED && error != -EROFS)
@@ -1534,6 +1537,7 @@ out:
ip->i_gl = NULL;
if (ip->i_iopen_gh.gh_gl) {
ip->i_iopen_gh.gh_gl->gl_object = NULL;
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
}
}
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 2ee13e841e9f..20c007d747ab 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -159,9 +159,9 @@ TRACE_EVENT(gfs2_glock_put,
/* Callback (local or remote) requesting lock demotion */
TRACE_EVENT(gfs2_demote_rq,
- TP_PROTO(const struct gfs2_glock *gl),
+ TP_PROTO(const struct gfs2_glock *gl, bool remote),
- TP_ARGS(gl),
+ TP_ARGS(gl, remote),
TP_STRUCT__entry(
__field( dev_t, dev )
@@ -170,6 +170,7 @@ TRACE_EVENT(gfs2_demote_rq,
__field( u8, cur_state )
__field( u8, dmt_state )
__field( unsigned long, flags )
+ __field( bool, remote )
),
TP_fast_assign(
@@ -179,14 +180,16 @@ TRACE_EVENT(gfs2_demote_rq,
__entry->cur_state = glock_trace_state(gl->gl_state);
__entry->dmt_state = glock_trace_state(gl->gl_demote_state);
__entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
+ __entry->remote = remote;
),
- TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s",
+ TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s %s",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
(unsigned long long)__entry->glnum,
glock_trace_name(__entry->cur_state),
glock_trace_name(__entry->dmt_state),
- show_glock_flags(__entry->flags))
+ show_glock_flags(__entry->flags),
+ __entry->remote ? "remote" : "local")
);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 88162fae27a5..2b20d7046bf3 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -96,7 +96,8 @@ static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
static void gfs2_print_trans(const struct gfs2_trans *tr)
{
- print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
+ printk(KERN_WARNING "GFS2: Transaction created at: %pSR\n",
+ (void *)tr->tr_ip);
printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n",
tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched);
printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
@@ -135,8 +136,10 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
if (tr->tr_t_gh.gh_gl) {
gfs2_glock_dq(&tr->tr_t_gh);
gfs2_holder_uninit(&tr->tr_t_gh);
- kfree(tr);
+ if (!tr->tr_attached)
+ kfree(tr);
}
+ up_read(&sdp->sd_log_flush_lock);
if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
gfs2_log_flush(sdp, NULL);
@@ -267,19 +270,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
- struct gfs2_glock *gl = bd->bd_gl;
struct gfs2_trans *tr = current->journal_info;
BUG_ON(!list_empty(&bd->bd_list));
- BUG_ON(!list_empty(&bd->bd_ail_st_list));
- BUG_ON(!list_empty(&bd->bd_ail_gl_list));
- bd->bd_ops = &gfs2_revoke_lops;
+ gfs2_add_revoke(sdp, bd);
tr->tr_touched = 1;
tr->tr_num_revoke++;
- sdp->sd_log_num_revoke++;
- atomic_inc(&gl->gl_revokes);
- set_bit(GLF_LFLUSH, &gl->gl_flags);
- list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
}
void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 571abe97b42a..de69d8a24f6d 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -22,7 +22,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
return -ENOMEM;
fd->search_key = ptr;
fd->key = ptr + tree->max_key_len + 2;
- dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
+ hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
+ tree->cnid, __builtin_return_address(0));
mutex_lock(&tree->tree_lock);
return 0;
}
@@ -31,7 +32,8 @@ void hfs_find_exit(struct hfs_find_data *fd)
{
hfs_bnode_put(fd->bnode);
kfree(fd->search_key);
- dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
+ hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n",
+ fd->tree->cnid, __builtin_return_address(0));
mutex_unlock(&fd->tree->tree_lock);
fd->tree = NULL;
}
@@ -135,8 +137,8 @@ int hfs_brec_find(struct hfs_find_data *fd)
return res;
invalid:
- printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
- height, bnode->height, bnode->type, nidx, parent);
+ pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
+ height, bnode->height, bnode->type, nidx, parent);
res = -EIO;
release:
hfs_bnode_put(bnode);
diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c
index c6e97366e8ac..28307bc9ec1e 100644
--- a/fs/hfs/bitmap.c
+++ b/fs/hfs/bitmap.c
@@ -158,7 +158,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits)
}
}
- dprint(DBG_BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits);
+ hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits);
HFS_SB(sb)->free_ablocks -= *num_bits;
hfs_bitmap_dirty(sb);
out:
@@ -200,7 +200,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count)
if (!count)
return 0;
- dprint(DBG_BITMAP, "clear_bits: %u,%u\n", start, count);
+ hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count);
/* are all of the bits in range? */
if ((start + count) > HFS_SB(sb)->fs_ablocks)
return -2;
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index cdb41a1f6a64..d3fa6bd9503e 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -100,7 +100,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
struct hfs_btree *tree;
struct page *src_page, *dst_page;
- dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
+ hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
tree = src_node->tree;
@@ -120,7 +120,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
struct page *page;
void *ptr;
- dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
+ hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
src += node->page_offset;
@@ -138,16 +138,16 @@ void hfs_bnode_dump(struct hfs_bnode *node)
__be32 cnid;
int i, off, key_off;
- dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this);
+ hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this);
hfs_bnode_read(node, &desc, 0, sizeof(desc));
- dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n",
+ hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n",
be32_to_cpu(desc.next), be32_to_cpu(desc.prev),
desc.type, desc.height, be16_to_cpu(desc.num_recs));
off = node->tree->node_size - 2;
for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) {
key_off = hfs_bnode_read_u16(node, off);
- dprint(DBG_BNODE_MOD, " %d", key_off);
+ hfs_dbg_cont(BNODE_MOD, " %d", key_off);
if (i && node->type == HFS_NODE_INDEX) {
int tmp;
@@ -155,17 +155,18 @@ void hfs_bnode_dump(struct hfs_bnode *node)
tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1;
else
tmp = node->tree->max_key_len + 1;
- dprint(DBG_BNODE_MOD, " (%d,%d", tmp, hfs_bnode_read_u8(node, key_off));
+ hfs_dbg_cont(BNODE_MOD, " (%d,%d",
+ tmp, hfs_bnode_read_u8(node, key_off));
hfs_bnode_read(node, &cnid, key_off + tmp, 4);
- dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid));
+ hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid));
} else if (i && node->type == HFS_NODE_LEAF) {
int tmp;
tmp = hfs_bnode_read_u8(node, key_off);
- dprint(DBG_BNODE_MOD, " (%d)", tmp);
+ hfs_dbg_cont(BNODE_MOD, " (%d)", tmp);
}
}
- dprint(DBG_BNODE_MOD, "\n");
+ hfs_dbg_cont(BNODE_MOD, "\n");
}
void hfs_bnode_unlink(struct hfs_bnode *node)
@@ -220,7 +221,7 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
struct hfs_bnode *node;
if (cnid >= tree->node_count) {
- printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+ pr_err("request for non-existent node %d in B*Tree\n", cnid);
return NULL;
}
@@ -243,7 +244,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
loff_t off;
if (cnid >= tree->node_count) {
- printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+ pr_err("request for non-existent node %d in B*Tree\n", cnid);
return NULL;
}
@@ -257,8 +258,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
node->this = cnid;
set_bit(HFS_BNODE_NEW, &node->flags);
atomic_set(&node->refcnt, 1);
- dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n",
- node->tree->cnid, node->this);
+ hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n",
+ node->tree->cnid, node->this);
init_waitqueue_head(&node->lock_wq);
spin_lock(&tree->hash_lock);
node2 = hfs_bnode_findhash(tree, cnid);
@@ -301,7 +302,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node)
{
struct hfs_bnode **p;
- dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n",
+ hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n",
node->tree->cnid, node->this, atomic_read(&node->refcnt));
for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)];
*p && *p != node; p = &(*p)->next_hash)
@@ -414,7 +415,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
spin_lock(&tree->hash_lock);
node = hfs_bnode_findhash(tree, num);
spin_unlock(&tree->hash_lock);
- BUG_ON(node);
+ if (node) {
+ pr_crit("new node %u already hashed?\n", num);
+ WARN_ON(1);
+ return node;
+ }
node = __hfs_bnode_create(tree, num);
if (!node)
return ERR_PTR(-ENOMEM);
@@ -443,8 +448,9 @@ void hfs_bnode_get(struct hfs_bnode *node)
{
if (node) {
atomic_inc(&node->refcnt);
- dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
- node->tree->cnid, node->this, atomic_read(&node->refcnt));
+ hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n",
+ node->tree->cnid, node->this,
+ atomic_read(&node->refcnt));
}
}
@@ -455,8 +461,9 @@ void hfs_bnode_put(struct hfs_bnode *node)
struct hfs_btree *tree = node->tree;
int i;
- dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n",
- node->tree->cnid, node->this, atomic_read(&node->refcnt));
+ hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n",
+ node->tree->cnid, node->this,
+ atomic_read(&node->refcnt));
BUG_ON(!atomic_read(&node->refcnt));
if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
return;
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 92fb358ce824..9f4ee7f52026 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -47,15 +47,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
if (node->tree->attributes & HFS_TREE_BIGKEYS) {
retval = hfs_bnode_read_u16(node, recoff) + 2;
if (retval > node->tree->max_key_len + 2) {
- printk(KERN_ERR "hfs: keylen %d too large\n",
- retval);
+ pr_err("keylen %d too large\n", retval);
retval = 0;
}
} else {
retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
if (retval > node->tree->max_key_len + 1) {
- printk(KERN_ERR "hfs: keylen %d too large\n",
- retval);
+ pr_err("keylen %d too large\n", retval);
retval = 0;
}
}
@@ -94,7 +92,8 @@ again:
end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
end_off = hfs_bnode_read_u16(node, end_rec_off);
end_rec_off -= 2;
- dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off);
+ hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+ rec, size, end_off, end_rec_off);
if (size > end_rec_off - end_off) {
if (new_node)
panic("not enough room!\n");
@@ -190,7 +189,8 @@ again:
mark_inode_dirty(tree->inode);
}
hfs_bnode_dump(node);
- dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength);
+ hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n",
+ fd->record, fd->keylength + fd->entrylength);
if (!--node->num_recs) {
hfs_bnode_unlink(node);
if (!node->parent)
@@ -240,7 +240,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
if (IS_ERR(new_node))
return new_node;
hfs_bnode_get(node);
- dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n",
+ hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n",
node->this, new_node->this, node->next);
new_node->next = node->next;
new_node->prev = node->this;
@@ -374,7 +374,8 @@ again:
newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1;
else
fd->keylength = newkeylen = tree->max_key_len + 1;
- dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen);
+ hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n",
+ rec, fd->keylength, newkeylen);
rec_off = tree->node_size - (rec + 2) * 2;
end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
@@ -385,7 +386,7 @@ again:
end_off = hfs_bnode_read_u16(parent, end_rec_off);
if (end_rec_off - end_off < diff) {
- printk(KERN_DEBUG "hfs: splitting index node...\n");
+ printk(KERN_DEBUG "splitting index node...\n");
fd->bnode = parent;
new_node = hfs_bnode_split(fd);
if (IS_ERR(new_node))
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 1cbdeea1db44..1ab19e660e69 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -48,7 +48,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz));
if (HFS_I(tree->inode)->alloc_blocks >
HFS_I(tree->inode)->first_blocks) {
- printk(KERN_ERR "hfs: invalid btree extent records\n");
+ pr_err("invalid btree extent records\n");
unlock_new_inode(tree->inode);
goto free_inode;
}
@@ -60,8 +60,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz));
if (!HFS_I(tree->inode)->first_blocks) {
- printk(KERN_ERR "hfs: invalid btree extent records "
- "(0 size).\n");
+ pr_err("invalid btree extent records (0 size)\n");
unlock_new_inode(tree->inode);
goto free_inode;
}
@@ -100,15 +99,15 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
switch (id) {
case HFS_EXT_CNID:
if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) {
- printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
- tree->max_key_len);
+ pr_err("invalid extent max_key_len %d\n",
+ tree->max_key_len);
goto fail_page;
}
break;
case HFS_CAT_CNID:
if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) {
- printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
- tree->max_key_len);
+ pr_err("invalid catalog max_key_len %d\n",
+ tree->max_key_len);
goto fail_page;
}
break;
@@ -146,8 +145,9 @@ void hfs_btree_close(struct hfs_btree *tree)
while ((node = tree->node_hash[i])) {
tree->node_hash[i] = node->next_hash;
if (atomic_read(&node->refcnt))
- printk(KERN_ERR "hfs: node %d:%d still has %d user(s)!\n",
- node->tree->cnid, node->this, atomic_read(&node->refcnt));
+ pr_err("node %d:%d still has %d user(s)!\n",
+ node->tree->cnid, node->this,
+ atomic_read(&node->refcnt));
hfs_bnode_free(node);
tree->node_hash_cnt--;
}
@@ -290,7 +290,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
kunmap(*pagep);
nidx = node->next;
if (!nidx) {
- printk(KERN_DEBUG "hfs: create new bmap node...\n");
+ printk(KERN_DEBUG "create new bmap node...\n");
next_node = hfs_bmap_new_bmap(node, idx);
} else
next_node = hfs_bnode_find(tree, nidx);
@@ -316,7 +316,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
u32 nidx;
u8 *data, byte, m;
- dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this);
+ hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this);
tree = node->tree;
nidx = node->this;
node = hfs_bnode_find(tree, 0);
@@ -331,7 +331,8 @@ void hfs_bmap_free(struct hfs_bnode *node)
hfs_bnode_put(node);
if (!i) {
/* panic */;
- printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this);
+ pr_crit("unable to free bnode %u. bmap not found!\n",
+ node->this);
return;
}
node = hfs_bnode_find(tree, i);
@@ -339,7 +340,8 @@ void hfs_bmap_free(struct hfs_bnode *node)
return;
if (node->type != HFS_NODE_MAP) {
/* panic */;
- printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type);
+ pr_crit("invalid bmap found! (%u,%d)\n",
+ node->this, node->type);
hfs_bnode_put(node);
return;
}
@@ -352,7 +354,8 @@ void hfs_bmap_free(struct hfs_bnode *node)
m = 1 << (~nidx & 7);
byte = data[off];
if (!(byte & m)) {
- printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type);
+ pr_crit("trying to free free bnode %u(%d)\n",
+ node->this, node->type);
kunmap(page);
hfs_bnode_put(node);
return;
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 424b0337f524..ff0316b925a5 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -87,12 +87,15 @@ int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *
int entry_size;
int err;
- dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
+ hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n",
+ str->name, cnid, inode->i_nlink);
if (dir->i_size >= HFS_MAX_VALENCE)
return -ENOSPC;
sb = dir->i_sb;
- hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+ err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+ if (err)
+ return err;
hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
@@ -184,14 +187,14 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid,
type = rec.type;
if (type != HFS_CDR_THD && type != HFS_CDR_FTH) {
- printk(KERN_ERR "hfs: found bad thread record in catalog\n");
+ pr_err("found bad thread record in catalog\n");
return -EIO;
}
fd->search_key->cat.ParID = rec.thread.ParID;
len = fd->search_key->cat.CName.len = rec.thread.CName.len;
if (len > HFS_NAMELEN) {
- printk(KERN_ERR "hfs: bad catalog namelength\n");
+ pr_err("bad catalog namelength\n");
return -EIO;
}
memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len);
@@ -212,9 +215,11 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
struct list_head *pos;
int res, type;
- dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
+ hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
sb = dir->i_sb;
- hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+ res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+ if (res)
+ return res;
hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str);
res = hfs_brec_find(&fd);
@@ -278,10 +283,13 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
int entry_size, type;
int err;
- dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
+ hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
+ cnid, src_dir->i_ino, src_name->name,
dst_dir->i_ino, dst_name->name);
sb = src_dir->i_sb;
- hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd);
+ err = hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd);
+ if (err)
+ return err;
dst_fd = src_fd;
/* find the old dir entry and read the data */
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 5f7f1abd5f6d..145566851e7a 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -25,7 +25,9 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
struct inode *inode = NULL;
int res;
- hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
+ res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
+ if (res)
+ return ERR_PTR(res);
hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
res = hfs_brec_read(&fd, &rec, sizeof(rec));
if (res) {
@@ -49,9 +51,9 @@ done:
/*
* hfs_readdir
*/
-static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
int len, err;
char strbuf[HFS_MAX_NAMELEN];
@@ -60,23 +62,24 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
struct hfs_readdir_data *rd;
u16 type;
- if (filp->f_pos >= inode->i_size)
+ if (ctx->pos >= inode->i_size)
return 0;
- hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+ err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+ if (err)
+ return err;
hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
err = hfs_brec_find(&fd);
if (err)
goto out;
- switch ((u32)filp->f_pos) {
- case 0:
+ if (ctx->pos == 0) {
/* This is completely artificial... */
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
+ if (!dir_emit_dot(file, ctx))
goto out;
- filp->f_pos++;
- /* fall through */
- case 1:
+ ctx->pos = 1;
+ }
+ if (ctx->pos == 1) {
if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
err = -EIO;
goto out;
@@ -84,31 +87,29 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
if (entry.type != HFS_CDR_THD) {
- printk(KERN_ERR "hfs: bad catalog folder thread\n");
+ pr_err("bad catalog folder thread\n");
err = -EIO;
goto out;
}
//if (fd.entrylength < HFS_MIN_THREAD_SZ) {
- // printk(KERN_ERR "hfs: truncated catalog thread\n");
+ // pr_err("truncated catalog thread\n");
// err = -EIO;
// goto out;
//}
- if (filldir(dirent, "..", 2, 1,
+ if (!dir_emit(ctx, "..", 2,
be32_to_cpu(entry.thread.ParID), DT_DIR))
goto out;
- filp->f_pos++;
- /* fall through */
- default:
- if (filp->f_pos >= inode->i_size)
- goto out;
- err = hfs_brec_goto(&fd, filp->f_pos - 1);
- if (err)
- goto out;
+ ctx->pos = 2;
}
+ if (ctx->pos >= inode->i_size)
+ goto out;
+ err = hfs_brec_goto(&fd, ctx->pos - 1);
+ if (err)
+ goto out;
for (;;) {
if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) {
- printk(KERN_ERR "hfs: walked past end of dir\n");
+ pr_err("walked past end of dir\n");
err = -EIO;
goto out;
}
@@ -123,43 +124,43 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName);
if (type == HFS_CDR_DIR) {
if (fd.entrylength < sizeof(struct hfs_cat_dir)) {
- printk(KERN_ERR "hfs: small dir entry\n");
+ pr_err("small dir entry\n");
err = -EIO;
goto out;
}
- if (filldir(dirent, strbuf, len, filp->f_pos,
+ if (!dir_emit(ctx, strbuf, len,
be32_to_cpu(entry.dir.DirID), DT_DIR))
break;
} else if (type == HFS_CDR_FIL) {
if (fd.entrylength < sizeof(struct hfs_cat_file)) {
- printk(KERN_ERR "hfs: small file entry\n");
+ pr_err("small file entry\n");
err = -EIO;
goto out;
}
- if (filldir(dirent, strbuf, len, filp->f_pos,
+ if (!dir_emit(ctx, strbuf, len,
be32_to_cpu(entry.file.FlNum), DT_REG))
break;
} else {
- printk(KERN_ERR "hfs: bad catalog entry type %d\n", type);
+ pr_err("bad catalog entry type %d\n", type);
err = -EIO;
goto out;
}
- filp->f_pos++;
- if (filp->f_pos >= inode->i_size)
+ ctx->pos++;
+ if (ctx->pos >= inode->i_size)
goto out;
err = hfs_brec_goto(&fd, 1);
if (err)
goto out;
}
- rd = filp->private_data;
+ rd = file->private_data;
if (!rd) {
rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL);
if (!rd) {
err = -ENOMEM;
goto out;
}
- filp->private_data = rd;
- rd->file = filp;
+ file->private_data = rd;
+ rd->file = file;
list_add(&rd->list, &HFS_I(inode)->open_dir_list);
}
memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key));
@@ -172,7 +173,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file)
{
struct hfs_readdir_data *rd = file->private_data;
if (rd) {
+ mutex_lock(&inode->i_mutex);
list_del(&rd->list);
+ mutex_unlock(&inode->i_mutex);
kfree(rd);
}
return 0;
@@ -300,7 +303,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
const struct file_operations hfs_dir_operations = {
.read = generic_read_dir,
- .readdir = hfs_readdir,
+ .iterate = hfs_readdir,
.llseek = generic_file_llseek,
.release = hfs_dir_release,
};
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index a67955a0c36f..e33a0d36a93e 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -107,7 +107,7 @@ static u16 hfs_ext_lastblock(struct hfs_extent *ext)
return be16_to_cpu(ext->block) + be16_to_cpu(ext->count);
}
-static void __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
+static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
{
int res;
@@ -116,26 +116,31 @@ static void __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd
res = hfs_brec_find(fd);
if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) {
if (res != -ENOENT)
- return;
+ return res;
hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec));
HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW);
} else {
if (res)
- return;
+ return res;
hfs_bnode_write(fd->bnode, HFS_I(inode)->cached_extents, fd->entryoffset, fd->entrylength);
HFS_I(inode)->flags &= ~HFS_FLG_EXT_DIRTY;
}
+ return 0;
}
-void hfs_ext_write_extent(struct inode *inode)
+int hfs_ext_write_extent(struct inode *inode)
{
struct hfs_find_data fd;
+ int res = 0;
if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) {
- hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd);
- __hfs_ext_write_extent(inode, &fd);
+ res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd);
+ if (res)
+ return res;
+ res = __hfs_ext_write_extent(inode, &fd);
hfs_find_exit(&fd);
}
+ return res;
}
static inline int __hfs_ext_read_extent(struct hfs_find_data *fd, struct hfs_extent *extent,
@@ -161,8 +166,11 @@ static inline int __hfs_ext_cache_extent(struct hfs_find_data *fd, struct inode
{
int res;
- if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY)
- __hfs_ext_write_extent(inode, fd);
+ if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) {
+ res = __hfs_ext_write_extent(inode, fd);
+ if (res)
+ return res;
+ }
res = __hfs_ext_read_extent(fd, HFS_I(inode)->cached_extents, inode->i_ino,
block, HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA);
@@ -185,9 +193,11 @@ static int hfs_ext_read_extent(struct inode *inode, u16 block)
block < HFS_I(inode)->cached_start + HFS_I(inode)->cached_blocks)
return 0;
- hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd);
- res = __hfs_ext_cache_extent(&fd, inode, block);
- hfs_find_exit(&fd);
+ res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd);
+ if (!res) {
+ res = __hfs_ext_cache_extent(&fd, inode, block);
+ hfs_find_exit(&fd);
+ }
return res;
}
@@ -195,11 +205,12 @@ static void hfs_dump_extent(struct hfs_extent *extent)
{
int i;
- dprint(DBG_EXTENT, " ");
+ hfs_dbg(EXTENT, " ");
for (i = 0; i < 3; i++)
- dprint(DBG_EXTENT, " %u:%u", be16_to_cpu(extent[i].block),
- be16_to_cpu(extent[i].count));
- dprint(DBG_EXTENT, "\n");
+ hfs_dbg_cont(EXTENT, " %u:%u",
+ be16_to_cpu(extent[i].block),
+ be16_to_cpu(extent[i].count));
+ hfs_dbg_cont(EXTENT, "\n");
}
static int hfs_add_extent(struct hfs_extent *extent, u16 offset,
@@ -298,7 +309,9 @@ int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type)
if (total_blocks == blocks)
return 0;
- hfs_find_init(HFS_SB(sb)->ext_tree, &fd);
+ res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd);
+ if (res)
+ return res;
do {
res = __hfs_ext_read_extent(&fd, extent, cnid, total_blocks, type);
if (res)
@@ -392,10 +405,10 @@ int hfs_extend_file(struct inode *inode)
goto out;
}
- dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
+ hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) {
if (!HFS_I(inode)->first_blocks) {
- dprint(DBG_EXTENT, "first extents\n");
+ hfs_dbg(EXTENT, "first extents\n");
/* no extents yet */
HFS_I(inode)->first_extents[0].block = cpu_to_be16(start);
HFS_I(inode)->first_extents[0].count = cpu_to_be16(len);
@@ -437,8 +450,10 @@ out:
return res;
insert_extent:
- dprint(DBG_EXTENT, "insert new extent\n");
- hfs_ext_write_extent(inode);
+ hfs_dbg(EXTENT, "insert new extent\n");
+ res = hfs_ext_write_extent(inode);
+ if (res)
+ goto out;
memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec));
HFS_I(inode)->cached_extents[0].block = cpu_to_be16(start);
@@ -460,13 +475,13 @@ void hfs_file_truncate(struct inode *inode)
u32 size;
int res;
- dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino,
- (long long)HFS_I(inode)->phys_size, inode->i_size);
+ hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n",
+ inode->i_ino, (long long)HFS_I(inode)->phys_size,
+ inode->i_size);
if (inode->i_size > HFS_I(inode)->phys_size) {
struct address_space *mapping = inode->i_mapping;
void *fsdata;
struct page *page;
- int res;
/* XXX: Can use generic_cont_expand? */
size = inode->i_size - 1;
@@ -488,7 +503,12 @@ void hfs_file_truncate(struct inode *inode)
goto out;
mutex_lock(&HFS_I(inode)->extents_lock);
- hfs_find_init(HFS_SB(sb)->ext_tree, &fd);
+ res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd);
+ if (res) {
+ mutex_unlock(&HFS_I(inode)->extents_lock);
+ /* XXX: We lack error handling of hfs_file_truncate() */
+ return;
+ }
while (1) {
if (alloc_cnt == HFS_I(inode)->first_blocks) {
hfs_free_extents(sb, HFS_I(inode)->first_extents,
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 693df9fe52b2..0524cda47a6e 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -9,6 +9,12 @@
#ifndef _LINUX_HFS_FS_H
#define _LINUX_HFS_FS_H
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/mutex.h>
@@ -34,8 +40,18 @@
//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
#define DBG_MASK (0)
-#define dprint(flg, fmt, args...) \
- if (flg & DBG_MASK) printk(fmt , ## args)
+#define hfs_dbg(flg, fmt, ...) \
+do { \
+ if (DBG_##flg & DBG_MASK) \
+ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
+} while (0)
+
+#define hfs_dbg_cont(flg, fmt, ...) \
+do { \
+ if (DBG_##flg & DBG_MASK) \
+ pr_cont(fmt, ##__VA_ARGS__); \
+} while (0)
+
/*
* struct hfs_inode_info
@@ -174,7 +190,7 @@ extern const struct inode_operations hfs_dir_inode_operations;
/* extent.c */
extern int hfs_ext_keycmp(const btree_key *, const btree_key *);
extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int);
-extern void hfs_ext_write_extent(struct inode *);
+extern int hfs_ext_write_extent(struct inode *);
extern int hfs_extend_file(struct inode *);
extern void hfs_file_truncate(struct inode *);
@@ -213,13 +229,10 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
/* string.c */
extern const struct dentry_operations hfs_dentry_operations;
-extern int hfs_hash_dentry(const struct dentry *, const struct inode *,
- struct qstr *);
+extern int hfs_hash_dentry(const struct dentry *, struct qstr *);
extern int hfs_strcmp(const unsigned char *, unsigned int,
const unsigned char *, unsigned int);
-extern int hfs_compare_dentry(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+extern int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
/* trans.c */
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 3031dfdd2358..f9299d8a64e3 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
+#include <linux/aio.h>
#include "hfs_fs.h"
#include "btree.h"
@@ -237,7 +238,7 @@ void hfs_delete_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
- dprint(DBG_INODE, "delete_inode: %lu\n", inode->i_ino);
+ hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino);
if (S_ISDIR(inode->i_mode)) {
HFS_SB(sb)->folder_count--;
if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID))
@@ -416,9 +417,12 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
struct inode *main_inode = inode;
struct hfs_find_data fd;
hfs_cat_rec rec;
+ int res;
- dprint(DBG_INODE, "hfs_write_inode: %lu\n", inode->i_ino);
- hfs_ext_write_extent(inode);
+ hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino);
+ res = hfs_ext_write_extent(inode);
+ if (res)
+ return res;
if (inode->i_ino < HFS_FIRSTUSER_CNID) {
switch (inode->i_ino) {
@@ -515,7 +519,11 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
if (!inode)
return ERR_PTR(-ENOMEM);
- hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
+ res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
+ if (res) {
+ iput(inode);
+ return ERR_PTR(res);
+ }
fd.search_key->cat = HFS_I(dir)->cat_key;
res = hfs_brec_read(&fd, &rec, sizeof(rec));
if (!res) {
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index b7ec224910c5..aa3f0d6d043c 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -48,7 +48,7 @@ static int hfs_get_last_session(struct super_block *sb,
*start = (sector_t)te.cdte_addr.lba << 2;
return 0;
}
- printk(KERN_ERR "hfs: invalid session number or type of track\n");
+ pr_err("invalid session number or type of track\n");
return -EINVAL;
}
ms_info.addr_format = CDROM_LBA;
@@ -101,7 +101,7 @@ int hfs_mdb_get(struct super_block *sb)
HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz);
if (!size || (size & (HFS_SECTOR_SIZE - 1))) {
- printk(KERN_ERR "hfs: bad allocation block size %d\n", size);
+ pr_err("bad allocation block size %d\n", size);
goto out_bh;
}
@@ -118,7 +118,7 @@ int hfs_mdb_get(struct super_block *sb)
size >>= 1;
brelse(bh);
if (!sb_set_blocksize(sb, size)) {
- printk(KERN_ERR "hfs: unable to set blocksize to %u\n", size);
+ pr_err("unable to set blocksize to %u\n", size);
goto out;
}
@@ -162,8 +162,8 @@ int hfs_mdb_get(struct super_block *sb)
}
if (!HFS_SB(sb)->alt_mdb) {
- printk(KERN_WARNING "hfs: unable to locate alternate MDB\n");
- printk(KERN_WARNING "hfs: continuing without an alternate MDB\n");
+ pr_warn("unable to locate alternate MDB\n");
+ pr_warn("continuing without an alternate MDB\n");
}
HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0);
@@ -178,7 +178,7 @@ int hfs_mdb_get(struct super_block *sb)
while (size) {
bh = sb_bread(sb, off >> sb->s_blocksize_bits);
if (!bh) {
- printk(KERN_ERR "hfs: unable to read volume bitmap\n");
+ pr_err("unable to read volume bitmap\n");
goto out;
}
off2 = off & (sb->s_blocksize - 1);
@@ -192,23 +192,22 @@ int hfs_mdb_get(struct super_block *sb)
HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp);
if (!HFS_SB(sb)->ext_tree) {
- printk(KERN_ERR "hfs: unable to open extent tree\n");
+ pr_err("unable to open extent tree\n");
goto out;
}
HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp);
if (!HFS_SB(sb)->cat_tree) {
- printk(KERN_ERR "hfs: unable to open catalog tree\n");
+ pr_err("unable to open catalog tree\n");
goto out;
}
attrib = mdb->drAtrb;
if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
- printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, "
- "running fsck.hfs is recommended. mounting read-only.\n");
+ pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. mounting read-only.\n");
sb->s_flags |= MS_RDONLY;
}
if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) {
- printk(KERN_WARNING "hfs: filesystem is marked locked, mounting read-only.\n");
+ pr_warn("filesystem is marked locked, mounting read-only.\n");
sb->s_flags |= MS_RDONLY;
}
if (!(sb->s_flags & MS_RDONLY)) {
@@ -312,7 +311,7 @@ void hfs_mdb_commit(struct super_block *sb)
while (size) {
bh = sb_bread(sb, block);
if (!bh) {
- printk(KERN_ERR "hfs: unable to read volume bitmap\n");
+ pr_err("unable to read volume bitmap\n");
break;
}
len = min((int)sb->s_blocksize - off, size);
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 495a976a3cc9..85b610c3909f 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,8 +51,7 @@ static unsigned char caseorder[256] = {
/*
* Hash a string to an integer in a case-independent way
*/
-int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
- struct qstr *this)
+int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this)
{
const unsigned char *name = this->name;
unsigned int hash, len = this->len;
@@ -93,8 +92,7 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
* Test for equality of two strings in the HFS filename character ordering.
* return 1 on failure and 0 on success
*/
-int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
const unsigned char *n1, *n2;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index bbaaa8a4ee64..2d2039e754cd 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -117,12 +117,11 @@ static int hfs_remount(struct super_block *sb, int *flags, char *data)
return 0;
if (!(*flags & MS_RDONLY)) {
if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
- printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, "
- "running fsck.hfs is recommended. leaving read-only.\n");
+ pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n");
sb->s_flags |= MS_RDONLY;
*flags |= MS_RDONLY;
} else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) {
- printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
+ pr_warn("filesystem is marked locked, leaving read-only.\n");
sb->s_flags |= MS_RDONLY;
*flags |= MS_RDONLY;
}
@@ -253,29 +252,29 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
switch (token) {
case opt_uid:
if (match_int(&args[0], &tmp)) {
- printk(KERN_ERR "hfs: uid requires an argument\n");
+ pr_err("uid requires an argument\n");
return 0;
}
hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp);
if (!uid_valid(hsb->s_uid)) {
- printk(KERN_ERR "hfs: invalid uid %d\n", tmp);
+ pr_err("invalid uid %d\n", tmp);
return 0;
}
break;
case opt_gid:
if (match_int(&args[0], &tmp)) {
- printk(KERN_ERR "hfs: gid requires an argument\n");
+ pr_err("gid requires an argument\n");
return 0;
}
hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp);
if (!gid_valid(hsb->s_gid)) {
- printk(KERN_ERR "hfs: invalid gid %d\n", tmp);
+ pr_err("invalid gid %d\n", tmp);
return 0;
}
break;
case opt_umask:
if (match_octal(&args[0], &tmp)) {
- printk(KERN_ERR "hfs: umask requires a value\n");
+ pr_err("umask requires a value\n");
return 0;
}
hsb->s_file_umask = (umode_t)tmp;
@@ -283,39 +282,39 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
break;
case opt_file_umask:
if (match_octal(&args[0], &tmp)) {
- printk(KERN_ERR "hfs: file_umask requires a value\n");
+ pr_err("file_umask requires a value\n");
return 0;
}
hsb->s_file_umask = (umode_t)tmp;
break;
case opt_dir_umask:
if (match_octal(&args[0], &tmp)) {
- printk(KERN_ERR "hfs: dir_umask requires a value\n");
+ pr_err("dir_umask requires a value\n");
return 0;
}
hsb->s_dir_umask = (umode_t)tmp;
break;
case opt_part:
if (match_int(&args[0], &hsb->part)) {
- printk(KERN_ERR "hfs: part requires an argument\n");
+ pr_err("part requires an argument\n");
return 0;
}
break;
case opt_session:
if (match_int(&args[0], &hsb->session)) {
- printk(KERN_ERR "hfs: session requires an argument\n");
+ pr_err("session requires an argument\n");
return 0;
}
break;
case opt_type:
if (match_fourchar(&args[0], &hsb->s_type)) {
- printk(KERN_ERR "hfs: type requires a 4 character value\n");
+ pr_err("type requires a 4 character value\n");
return 0;
}
break;
case opt_creator:
if (match_fourchar(&args[0], &hsb->s_creator)) {
- printk(KERN_ERR "hfs: creator requires a 4 character value\n");
+ pr_err("creator requires a 4 character value\n");
return 0;
}
break;
@@ -324,14 +323,14 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
break;
case opt_codepage:
if (hsb->nls_disk) {
- printk(KERN_ERR "hfs: unable to change codepage\n");
+ pr_err("unable to change codepage\n");
return 0;
}
p = match_strdup(&args[0]);
if (p)
hsb->nls_disk = load_nls(p);
if (!hsb->nls_disk) {
- printk(KERN_ERR "hfs: unable to load codepage \"%s\"\n", p);
+ pr_err("unable to load codepage \"%s\"\n", p);
kfree(p);
return 0;
}
@@ -339,14 +338,14 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
break;
case opt_iocharset:
if (hsb->nls_io) {
- printk(KERN_ERR "hfs: unable to change iocharset\n");
+ pr_err("unable to change iocharset\n");
return 0;
}
p = match_strdup(&args[0]);
if (p)
hsb->nls_io = load_nls(p);
if (!hsb->nls_io) {
- printk(KERN_ERR "hfs: unable to load iocharset \"%s\"\n", p);
+ pr_err("unable to load iocharset \"%s\"\n", p);
kfree(p);
return 0;
}
@@ -360,7 +359,7 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
if (hsb->nls_disk && !hsb->nls_io) {
hsb->nls_io = load_nls_default();
if (!hsb->nls_io) {
- printk(KERN_ERR "hfs: unable to load default iocharset\n");
+ pr_err("unable to load default iocharset\n");
return 0;
}
}
@@ -400,7 +399,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
res = -EINVAL;
if (!parse_options((char *)data, sbi)) {
- printk(KERN_ERR "hfs: unable to parse mount options.\n");
+ pr_err("unable to parse mount options\n");
goto bail;
}
@@ -411,14 +410,16 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
res = hfs_mdb_get(sb);
if (res) {
if (!silent)
- printk(KERN_WARNING "hfs: can't find a HFS filesystem on dev %s.\n",
+ pr_warn("can't find a HFS filesystem on dev %s\n",
hfs_mdb_name(sb));
res = -EINVAL;
goto bail;
}
/* try to get the root inode */
- hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+ res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+ if (res)
+ goto bail_no_root;
res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd);
if (!res) {
if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) {
@@ -447,7 +448,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
return 0;
bail_no_root:
- printk(KERN_ERR "hfs: get root inode failed.\n");
+ pr_err("get root inode failed\n");
bail:
hfs_mdb_put(sb);
return res;
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index 8d691f124714..0f47890299c4 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -56,7 +56,7 @@ int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key,
if (name) {
len = strlen(name);
if (len > HFSPLUS_ATTR_MAX_STRLEN) {
- printk(KERN_ERR "hfs: invalid xattr name's length\n");
+ pr_err("invalid xattr name's length\n");
return -EINVAL;
}
hfsplus_asc2uni(sb,
@@ -166,10 +166,10 @@ int hfsplus_find_attr(struct super_block *sb, u32 cnid,
{
int err = 0;
- dprint(DBG_ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid);
+ hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid);
if (!HFSPLUS_SB(sb)->attr_tree) {
- printk(KERN_ERR "hfs: attributes file doesn't exist\n");
+ pr_err("attributes file doesn't exist\n");
return -EINVAL;
}
@@ -228,11 +228,11 @@ int hfsplus_create_attr(struct inode *inode,
int entry_size;
int err;
- dprint(DBG_ATTR_MOD, "create_attr: %s,%ld\n",
+ hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n",
name ? name : NULL, inode->i_ino);
if (!HFSPLUS_SB(sb)->attr_tree) {
- printk(KERN_ERR "hfs: attributes file doesn't exist\n");
+ pr_err("attributes file doesn't exist\n");
return -EINVAL;
}
@@ -307,10 +307,10 @@ static int __hfsplus_delete_attr(struct inode *inode, u32 cnid,
break;
case HFSPLUS_ATTR_FORK_DATA:
case HFSPLUS_ATTR_EXTENTS:
- printk(KERN_ERR "hfs: only inline data xattr are supported\n");
+ pr_err("only inline data xattr are supported\n");
return -EOPNOTSUPP;
default:
- printk(KERN_ERR "hfs: invalid extended attribute record\n");
+ pr_err("invalid extended attribute record\n");
return -ENOENT;
}
@@ -328,11 +328,11 @@ int hfsplus_delete_attr(struct inode *inode, const char *name)
struct super_block *sb = inode->i_sb;
struct hfs_find_data fd;
- dprint(DBG_ATTR_MOD, "delete_attr: %s,%ld\n",
+ hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n",
name ? name : NULL, inode->i_ino);
if (!HFSPLUS_SB(sb)->attr_tree) {
- printk(KERN_ERR "hfs: attributes file doesn't exist\n");
+ pr_err("attributes file doesn't exist\n");
return -EINVAL;
}
@@ -346,7 +346,7 @@ int hfsplus_delete_attr(struct inode *inode, const char *name)
if (err)
goto out;
} else {
- printk(KERN_ERR "hfs: invalid extended attribute name\n");
+ pr_err("invalid extended attribute name\n");
err = -EINVAL;
goto out;
}
@@ -369,10 +369,10 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid)
int err = 0;
struct hfs_find_data fd;
- dprint(DBG_ATTR_MOD, "delete_all_attrs: %d\n", cnid);
+ hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid);
if (!HFSPLUS_SB(dir->i_sb)->attr_tree) {
- printk(KERN_ERR "hfs: attributes file doesn't exist\n");
+ pr_err("attributes file doesn't exist\n");
return -EINVAL;
}
@@ -384,7 +384,7 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid)
err = hfsplus_find_attr(dir->i_sb, cnid, NULL, &fd);
if (err) {
if (err != -ENOENT)
- printk(KERN_ERR "hfs: xattr search failed.\n");
+ pr_err("xattr search failed\n");
goto end_delete_all;
}
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index d73c98d1ee99..c1422d91cd36 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -22,7 +22,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
return -ENOMEM;
fd->search_key = ptr;
fd->key = ptr + tree->max_key_len + 2;
- dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n",
+ hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
tree->cnid, __builtin_return_address(0));
switch (tree->cnid) {
case HFSPLUS_CAT_CNID:
@@ -44,7 +44,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
{
hfs_bnode_put(fd->bnode);
kfree(fd->search_key);
- dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n",
+ hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n",
fd->tree->cnid, __builtin_return_address(0));
mutex_unlock(&fd->tree->tree_lock);
fd->tree = NULL;
@@ -56,7 +56,8 @@ int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode,
int *end,
int *cur_rec)
{
- __be32 cur_cnid, search_cnid;
+ __be32 cur_cnid;
+ __be32 search_cnid;
if (bnode->tree->cnid == HFSPLUS_EXT_CNID) {
cur_cnid = fd->key->ext.cnid;
@@ -67,8 +68,11 @@ int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode,
} else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) {
cur_cnid = fd->key->attr.cnid;
search_cnid = fd->search_key->attr.cnid;
- } else
+ } else {
+ cur_cnid = 0; /* used-uninitialized warning */
+ search_cnid = 0;
BUG();
+ }
if (cur_cnid == search_cnid) {
(*end) = (*cur_rec);
@@ -204,7 +208,7 @@ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare)
return res;
invalid:
- printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
+ pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
height, bnode->height, bnode->type, nidx, parent);
res = -EIO;
release:
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index 6feefc0cb48a..d2954451519e 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -30,7 +30,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
if (!len)
return size;
- dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
+ hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
mutex_lock(&sbi->alloc_mutex);
mapping = sbi->alloc_file->i_mapping;
page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
@@ -89,14 +89,14 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
else
end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32;
}
- dprint(DBG_BITMAP, "bitmap full\n");
+ hfs_dbg(BITMAP, "bitmap full\n");
start = size;
goto out;
found:
start = offset + (curr - pptr) * 32 + i;
if (start >= size) {
- dprint(DBG_BITMAP, "bitmap full\n");
+ hfs_dbg(BITMAP, "bitmap full\n");
goto out;
}
/* do any partial u32 at the start */
@@ -154,7 +154,7 @@ done:
*max = offset + (curr - pptr) * 32 + i - start;
sbi->free_blocks -= *max;
hfsplus_mark_mdb_dirty(sb);
- dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
+ hfs_dbg(BITMAP, "-> %u,%u\n", start, *max);
out:
mutex_unlock(&sbi->alloc_mutex);
return start;
@@ -173,7 +173,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
if (!count)
return 0;
- dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
+ hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count);
/* are all of the bits in range? */
if ((offset + count) > sbi->total_blocks)
return -ENOENT;
@@ -238,8 +238,7 @@ out:
return 0;
kaboom:
- printk(KERN_CRIT "hfsplus: unable to mark blocks free: error %ld\n",
- PTR_ERR(page));
+ pr_crit("unable to mark blocks free: error %ld\n", PTR_ERR(page));
mutex_unlock(&sbi->alloc_mutex);
return -EIO;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index f31ac6f404f1..11c860204520 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -130,7 +130,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
struct page **src_page, **dst_page;
int l;
- dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
+ hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
tree = src_node->tree;
@@ -188,7 +188,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
struct page **src_page, **dst_page;
int l;
- dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
+ hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
src += node->page_offset;
@@ -302,16 +302,16 @@ void hfs_bnode_dump(struct hfs_bnode *node)
__be32 cnid;
int i, off, key_off;
- dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this);
+ hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this);
hfs_bnode_read(node, &desc, 0, sizeof(desc));
- dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n",
+ hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n",
be32_to_cpu(desc.next), be32_to_cpu(desc.prev),
desc.type, desc.height, be16_to_cpu(desc.num_recs));
off = node->tree->node_size - 2;
for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) {
key_off = hfs_bnode_read_u16(node, off);
- dprint(DBG_BNODE_MOD, " %d", key_off);
+ hfs_dbg(BNODE_MOD, " %d", key_off);
if (i && node->type == HFS_NODE_INDEX) {
int tmp;
@@ -320,17 +320,17 @@ void hfs_bnode_dump(struct hfs_bnode *node)
tmp = hfs_bnode_read_u16(node, key_off) + 2;
else
tmp = node->tree->max_key_len + 2;
- dprint(DBG_BNODE_MOD, " (%d", tmp);
+ hfs_dbg_cont(BNODE_MOD, " (%d", tmp);
hfs_bnode_read(node, &cnid, key_off + tmp, 4);
- dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid));
+ hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid));
} else if (i && node->type == HFS_NODE_LEAF) {
int tmp;
tmp = hfs_bnode_read_u16(node, key_off);
- dprint(DBG_BNODE_MOD, " (%d)", tmp);
+ hfs_dbg_cont(BNODE_MOD, " (%d)", tmp);
}
}
- dprint(DBG_BNODE_MOD, "\n");
+ hfs_dbg_cont(BNODE_MOD, "\n");
}
void hfs_bnode_unlink(struct hfs_bnode *node)
@@ -366,7 +366,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
/* move down? */
if (!node->prev && !node->next)
- dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n");
+ hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n");
if (!node->parent) {
tree->root = 0;
tree->depth = 0;
@@ -386,7 +386,7 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
struct hfs_bnode *node;
if (cnid >= tree->node_count) {
- printk(KERN_ERR "hfs: request for non-existent node "
+ pr_err("request for non-existent node "
"%d in B*Tree\n",
cnid);
return NULL;
@@ -409,7 +409,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
loff_t off;
if (cnid >= tree->node_count) {
- printk(KERN_ERR "hfs: request for non-existent node "
+ pr_err("request for non-existent node "
"%d in B*Tree\n",
cnid);
return NULL;
@@ -425,8 +425,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
node->this = cnid;
set_bit(HFS_BNODE_NEW, &node->flags);
atomic_set(&node->refcnt, 1);
- dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n",
- node->tree->cnid, node->this);
+ hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n",
+ node->tree->cnid, node->this);
init_waitqueue_head(&node->lock_wq);
spin_lock(&tree->hash_lock);
node2 = hfs_bnode_findhash(tree, cnid);
@@ -470,7 +470,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node)
{
struct hfs_bnode **p;
- dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n",
+ hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n",
node->tree->cnid, node->this, atomic_read(&node->refcnt));
for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)];
*p && *p != node; p = &(*p)->next_hash)
@@ -588,7 +588,7 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
node = hfs_bnode_findhash(tree, num);
spin_unlock(&tree->hash_lock);
if (node) {
- printk(KERN_CRIT "new node %u already hashed?\n", num);
+ pr_crit("new node %u already hashed?\n", num);
WARN_ON(1);
return node;
}
@@ -620,7 +620,7 @@ void hfs_bnode_get(struct hfs_bnode *node)
{
if (node) {
atomic_inc(&node->refcnt);
- dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
+ hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n",
node->tree->cnid, node->this,
atomic_read(&node->refcnt));
}
@@ -633,7 +633,7 @@ void hfs_bnode_put(struct hfs_bnode *node)
struct hfs_btree *tree = node->tree;
int i;
- dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n",
+ hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n",
node->tree->cnid, node->this,
atomic_read(&node->refcnt));
BUG_ON(!atomic_read(&node->refcnt));
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 298d4e45604b..6e560d56094b 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -45,13 +45,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
if (!recoff)
return 0;
if (recoff > node->tree->node_size - 2) {
- printk(KERN_ERR "hfs: recoff %d too large\n", recoff);
+ pr_err("recoff %d too large\n", recoff);
return 0;
}
retval = hfs_bnode_read_u16(node, recoff) + 2;
if (retval > node->tree->max_key_len + 2) {
- printk(KERN_ERR "hfs: keylen %d too large\n",
+ pr_err("keylen %d too large\n",
retval);
retval = 0;
}
@@ -90,7 +90,7 @@ again:
end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
end_off = hfs_bnode_read_u16(node, end_rec_off);
end_rec_off -= 2;
- dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+ hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
rec, size, end_off, end_rec_off);
if (size > end_rec_off - end_off) {
if (new_node)
@@ -191,7 +191,7 @@ again:
mark_inode_dirty(tree->inode);
}
hfs_bnode_dump(node);
- dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n",
+ hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n",
fd->record, fd->keylength + fd->entrylength);
if (!--node->num_recs) {
hfs_bnode_unlink(node);
@@ -244,7 +244,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
if (IS_ERR(new_node))
return new_node;
hfs_bnode_get(node);
- dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n",
+ hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n",
node->this, new_node->this, node->next);
new_node->next = node->next;
new_node->prev = node->this;
@@ -379,7 +379,7 @@ again:
newkeylen = hfs_bnode_read_u16(node, 14) + 2;
else
fd->keylength = newkeylen = tree->max_key_len + 2;
- dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n",
+ hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n",
rec, fd->keylength, newkeylen);
rec_off = tree->node_size - (rec + 2) * 2;
@@ -391,7 +391,7 @@ again:
end_off = hfs_bnode_read_u16(parent, end_rec_off);
if (end_rec_off - end_off < diff) {
- dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n");
+ hfs_dbg(BNODE_MOD, "splitting index node\n");
fd->bnode = parent;
new_node = hfs_bnode_split(fd);
if (IS_ERR(new_node))
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index efb689c21a95..0c6540c91167 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -40,8 +40,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
tree->inode = inode;
if (!HFSPLUS_I(tree->inode)->first_blocks) {
- printk(KERN_ERR
- "hfs: invalid btree extent records (0 size).\n");
+ pr_err("invalid btree extent records (0 size)\n");
goto free_inode;
}
@@ -68,12 +67,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
switch (id) {
case HFSPLUS_EXT_CNID:
if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
- printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
+ pr_err("invalid extent max_key_len %d\n",
tree->max_key_len);
goto fail_page;
}
if (tree->attributes & HFS_TREE_VARIDXKEYS) {
- printk(KERN_ERR "hfs: invalid extent btree flag\n");
+ pr_err("invalid extent btree flag\n");
goto fail_page;
}
@@ -81,12 +80,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
break;
case HFSPLUS_CAT_CNID:
if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
- printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
+ pr_err("invalid catalog max_key_len %d\n",
tree->max_key_len);
goto fail_page;
}
if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
- printk(KERN_ERR "hfs: invalid catalog btree flag\n");
+ pr_err("invalid catalog btree flag\n");
goto fail_page;
}
@@ -100,19 +99,19 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
break;
case HFSPLUS_ATTR_CNID:
if (tree->max_key_len != HFSPLUS_ATTR_KEYLEN - sizeof(u16)) {
- printk(KERN_ERR "hfs: invalid attributes max_key_len %d\n",
+ pr_err("invalid attributes max_key_len %d\n",
tree->max_key_len);
goto fail_page;
}
tree->keycmp = hfsplus_attr_bin_cmp_key;
break;
default:
- printk(KERN_ERR "hfs: unknown B*Tree requested\n");
+ pr_err("unknown B*Tree requested\n");
goto fail_page;
}
if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
- printk(KERN_ERR "hfs: invalid btree flag\n");
+ pr_err("invalid btree flag\n");
goto fail_page;
}
@@ -155,7 +154,7 @@ void hfs_btree_close(struct hfs_btree *tree)
while ((node = tree->node_hash[i])) {
tree->node_hash[i] = node->next_hash;
if (atomic_read(&node->refcnt))
- printk(KERN_CRIT "hfs: node %d:%d "
+ pr_crit("node %d:%d "
"still has %d user(s)!\n",
node->tree->cnid, node->this,
atomic_read(&node->refcnt));
@@ -303,7 +302,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
kunmap(*pagep);
nidx = node->next;
if (!nidx) {
- dprint(DBG_BNODE_MOD, "hfs: create new bmap node.\n");
+ hfs_dbg(BNODE_MOD, "create new bmap node\n");
next_node = hfs_bmap_new_bmap(node, idx);
} else
next_node = hfs_bnode_find(tree, nidx);
@@ -329,7 +328,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
u32 nidx;
u8 *data, byte, m;
- dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this);
+ hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this);
BUG_ON(!node->this);
tree = node->tree;
nidx = node->this;
@@ -345,7 +344,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
hfs_bnode_put(node);
if (!i) {
/* panic */;
- printk(KERN_CRIT "hfs: unable to free bnode %u. "
+ pr_crit("unable to free bnode %u. "
"bmap not found!\n",
node->this);
return;
@@ -355,7 +354,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
return;
if (node->type != HFS_NODE_MAP) {
/* panic */;
- printk(KERN_CRIT "hfs: invalid bmap found! "
+ pr_crit("invalid bmap found! "
"(%u,%d)\n",
node->this, node->type);
hfs_bnode_put(node);
@@ -370,7 +369,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
m = 1 << (~nidx & 7);
byte = data[off];
if (!(byte & m)) {
- printk(KERN_CRIT "hfs: trying to free free bnode "
+ pr_crit("trying to free free bnode "
"%u(%d)\n",
node->this, node->type);
kunmap(page);
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 840d71edd193..968ce411db53 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -188,12 +188,12 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
type = be16_to_cpu(tmp.type);
if (type != HFSPLUS_FOLDER_THREAD && type != HFSPLUS_FILE_THREAD) {
- printk(KERN_ERR "hfs: found bad thread record in catalog\n");
+ pr_err("found bad thread record in catalog\n");
return -EIO;
}
if (be16_to_cpu(tmp.thread.nodeName.length) > 255) {
- printk(KERN_ERR "hfs: catalog name length corrupted\n");
+ pr_err("catalog name length corrupted\n");
return -EIO;
}
@@ -212,7 +212,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
int entry_size;
int err;
- dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n",
+ hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n",
str->name, cnid, inode->i_nlink);
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
if (err)
@@ -271,8 +271,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
int err, off;
u16 type;
- dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n",
- str ? str->name : NULL, cnid);
+ hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
if (err)
return err;
@@ -361,7 +360,7 @@ int hfsplus_rename_cat(u32 cnid,
int entry_size, type;
int err;
- dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
+ hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
cnid, src_dir->i_ino, src_name->name,
dst_dir->i_ino, dst_name->name);
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 031c24e50521..d8ce4bd17fc5 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -103,7 +103,7 @@ again:
} else if (!dentry->d_fsdata)
dentry->d_fsdata = (void *)(unsigned long)cnid;
} else {
- printk(KERN_ERR "hfs: invalid catalog entry type in lookup\n");
+ pr_err("invalid catalog entry type in lookup\n");
err = -EIO;
goto fail;
}
@@ -121,9 +121,9 @@ fail:
return ERR_PTR(err);
}
-static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
int len, err;
char strbuf[HFSPLUS_MAX_STRLEN + 1];
@@ -132,7 +132,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
struct hfsplus_readdir_data *rd;
u16 type;
- if (filp->f_pos >= inode->i_size)
+ if (file->f_pos >= inode->i_size)
return 0;
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
@@ -143,14 +143,13 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (err)
goto out;
- switch ((u32)filp->f_pos) {
- case 0:
+ if (ctx->pos == 0) {
/* This is completely artificial... */
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
+ if (!dir_emit_dot(file, ctx))
goto out;
- filp->f_pos++;
- /* fall through */
- case 1:
+ ctx->pos = 1;
+ }
+ if (ctx->pos == 1) {
if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
err = -EIO;
goto out;
@@ -159,31 +158,28 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
fd.entrylength);
if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) {
- printk(KERN_ERR "hfs: bad catalog folder thread\n");
+ pr_err("bad catalog folder thread\n");
err = -EIO;
goto out;
}
if (fd.entrylength < HFSPLUS_MIN_THREAD_SZ) {
- printk(KERN_ERR "hfs: truncated catalog thread\n");
+ pr_err("truncated catalog thread\n");
err = -EIO;
goto out;
}
- if (filldir(dirent, "..", 2, 1,
+ if (!dir_emit(ctx, "..", 2,
be32_to_cpu(entry.thread.parentID), DT_DIR))
goto out;
- filp->f_pos++;
- /* fall through */
- default:
- if (filp->f_pos >= inode->i_size)
- goto out;
- err = hfs_brec_goto(&fd, filp->f_pos - 1);
- if (err)
- goto out;
+ ctx->pos = 2;
}
-
+ if (ctx->pos >= inode->i_size)
+ goto out;
+ err = hfs_brec_goto(&fd, ctx->pos - 1);
+ if (err)
+ goto out;
for (;;) {
if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) {
- printk(KERN_ERR "hfs: walked past end of dir\n");
+ pr_err("walked past end of dir\n");
err = -EIO;
goto out;
}
@@ -203,7 +199,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (type == HFSPLUS_FOLDER) {
if (fd.entrylength <
sizeof(struct hfsplus_cat_folder)) {
- printk(KERN_ERR "hfs: small dir entry\n");
+ pr_err("small dir entry\n");
err = -EIO;
goto out;
}
@@ -211,40 +207,40 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
HFSPLUS_SB(sb)->hidden_dir->i_ino ==
be32_to_cpu(entry.folder.id))
goto next;
- if (filldir(dirent, strbuf, len, filp->f_pos,
+ if (!dir_emit(ctx, strbuf, len,
be32_to_cpu(entry.folder.id), DT_DIR))
break;
} else if (type == HFSPLUS_FILE) {
if (fd.entrylength < sizeof(struct hfsplus_cat_file)) {
- printk(KERN_ERR "hfs: small file entry\n");
+ pr_err("small file entry\n");
err = -EIO;
goto out;
}
- if (filldir(dirent, strbuf, len, filp->f_pos,
+ if (!dir_emit(ctx, strbuf, len,
be32_to_cpu(entry.file.id), DT_REG))
break;
} else {
- printk(KERN_ERR "hfs: bad catalog entry type\n");
+ pr_err("bad catalog entry type\n");
err = -EIO;
goto out;
}
next:
- filp->f_pos++;
- if (filp->f_pos >= inode->i_size)
+ ctx->pos++;
+ if (ctx->pos >= inode->i_size)
goto out;
err = hfs_brec_goto(&fd, 1);
if (err)
goto out;
}
- rd = filp->private_data;
+ rd = file->private_data;
if (!rd) {
rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL);
if (!rd) {
err = -ENOMEM;
goto out;
}
- filp->private_data = rd;
- rd->file = filp;
+ file->private_data = rd;
+ rd->file = file;
list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
}
memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
@@ -538,7 +534,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
const struct file_operations hfsplus_dir_operations = {
.fsync = hfsplus_file_fsync,
.read = generic_read_dir,
- .readdir = hfsplus_readdir,
+ .iterate = hfsplus_readdir,
.unlocked_ioctl = hfsplus_ioctl,
.llseek = generic_file_llseek,
.release = hfsplus_dir_release,
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index fe0a76213d9e..fbb212fbb1ef 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -83,7 +83,7 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count);
}
-static void __hfsplus_ext_write_extent(struct inode *inode,
+static int __hfsplus_ext_write_extent(struct inode *inode,
struct hfs_find_data *fd)
{
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
@@ -98,13 +98,13 @@ static void __hfsplus_ext_write_extent(struct inode *inode,
res = hfs_brec_find(fd, hfs_find_rec_by_key);
if (hip->extent_state & HFSPLUS_EXT_NEW) {
if (res != -ENOENT)
- return;
+ return res;
hfs_brec_insert(fd, hip->cached_extents,
sizeof(hfsplus_extent_rec));
hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
} else {
if (res)
- return;
+ return res;
hfs_bnode_write(fd->bnode, hip->cached_extents,
fd->entryoffset, fd->entrylength);
hip->extent_state &= ~HFSPLUS_EXT_DIRTY;
@@ -117,11 +117,13 @@ static void __hfsplus_ext_write_extent(struct inode *inode,
* to explicily mark the inode dirty, too.
*/
set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags);
+
+ return 0;
}
static int hfsplus_ext_write_extent_locked(struct inode *inode)
{
- int res;
+ int res = 0;
if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) {
struct hfs_find_data fd;
@@ -129,10 +131,10 @@ static int hfsplus_ext_write_extent_locked(struct inode *inode)
res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
if (res)
return res;
- __hfsplus_ext_write_extent(inode, &fd);
+ res = __hfsplus_ext_write_extent(inode, &fd);
hfs_find_exit(&fd);
}
- return 0;
+ return res;
}
int hfsplus_ext_write_extent(struct inode *inode)
@@ -175,8 +177,11 @@ static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd,
WARN_ON(!mutex_is_locked(&hip->extents_lock));
- if (hip->extent_state & HFSPLUS_EXT_DIRTY)
- __hfsplus_ext_write_extent(inode, fd);
+ if (hip->extent_state & HFSPLUS_EXT_DIRTY) {
+ res = __hfsplus_ext_write_extent(inode, fd);
+ if (res)
+ return res;
+ }
res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
block, HFSPLUS_IS_RSRC(inode) ?
@@ -265,7 +270,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
mutex_unlock(&hip->extents_lock);
done:
- dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n",
+ hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n",
inode->i_ino, (long long)iblock, dblock);
mask = (1 << sbi->fs_shift) - 1;
@@ -288,11 +293,12 @@ static void hfsplus_dump_extent(struct hfsplus_extent *extent)
{
int i;
- dprint(DBG_EXTENT, " ");
+ hfs_dbg(EXTENT, " ");
for (i = 0; i < 8; i++)
- dprint(DBG_EXTENT, " %u:%u", be32_to_cpu(extent[i].start_block),
- be32_to_cpu(extent[i].block_count));
- dprint(DBG_EXTENT, "\n");
+ hfs_dbg_cont(EXTENT, " %u:%u",
+ be32_to_cpu(extent[i].start_block),
+ be32_to_cpu(extent[i].block_count));
+ hfs_dbg_cont(EXTENT, "\n");
}
static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset,
@@ -348,8 +354,8 @@ found:
if (count <= block_nr) {
err = hfsplus_block_free(sb, start, count);
if (err) {
- printk(KERN_ERR "hfs: can't free extent\n");
- dprint(DBG_EXTENT, " start: %u count: %u\n",
+ pr_err("can't free extent\n");
+ hfs_dbg(EXTENT, " start: %u count: %u\n",
start, count);
}
extent->block_count = 0;
@@ -359,8 +365,8 @@ found:
count -= block_nr;
err = hfsplus_block_free(sb, start + count, block_nr);
if (err) {
- printk(KERN_ERR "hfs: can't free extent\n");
- dprint(DBG_EXTENT, " start: %u count: %u\n",
+ pr_err("can't free extent\n");
+ hfs_dbg(EXTENT, " start: %u count: %u\n",
start, count);
}
extent->block_count = cpu_to_be32(count);
@@ -432,7 +438,7 @@ int hfsplus_file_extend(struct inode *inode)
if (sbi->alloc_file->i_size * 8 <
sbi->total_blocks - sbi->free_blocks + 8) {
/* extend alloc file */
- printk(KERN_ERR "hfs: extend alloc file! "
+ pr_err("extend alloc file! "
"(%llu,%u,%u)\n",
sbi->alloc_file->i_size * 8,
sbi->total_blocks, sbi->free_blocks);
@@ -459,11 +465,11 @@ int hfsplus_file_extend(struct inode *inode)
}
}
- dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
+ hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
if (hip->alloc_blocks <= hip->first_blocks) {
if (!hip->first_blocks) {
- dprint(DBG_EXTENT, "first extents\n");
+ hfs_dbg(EXTENT, "first extents\n");
/* no extents yet */
hip->first_extents[0].start_block = cpu_to_be32(start);
hip->first_extents[0].block_count = cpu_to_be32(len);
@@ -500,7 +506,7 @@ out:
return res;
insert_extent:
- dprint(DBG_EXTENT, "insert new extent\n");
+ hfs_dbg(EXTENT, "insert new extent\n");
res = hfsplus_ext_write_extent_locked(inode);
if (res)
goto out;
@@ -525,9 +531,8 @@ void hfsplus_file_truncate(struct inode *inode)
u32 alloc_cnt, blk_cnt, start;
int res;
- dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n",
- inode->i_ino, (long long)hip->phys_size,
- inode->i_size);
+ hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n",
+ inode->i_ino, (long long)hip->phys_size, inode->i_size);
if (inode->i_size > hip->phys_size) {
struct address_space *mapping = inode->i_mapping;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 05b11f36024c..ede79317cfb8 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -10,6 +10,12 @@
#ifndef _LINUX_HFSPLUS_FS_H
#define _LINUX_HFSPLUS_FS_H
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/buffer_head.h>
@@ -32,9 +38,17 @@
#endif
#define DBG_MASK (0)
-#define dprint(flg, fmt, args...) \
- if (flg & DBG_MASK) \
- printk(fmt , ## args)
+#define hfs_dbg(flg, fmt, ...) \
+do { \
+ if (DBG_##flg & DBG_MASK) \
+ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
+} while (0)
+
+#define hfs_dbg_cont(flg, fmt, ...) \
+do { \
+ if (DBG_##flg & DBG_MASK) \
+ pr_cont(fmt, ##__VA_ARGS__); \
+} while (0)
/* Runtime config options */
#define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */
@@ -481,11 +495,8 @@ int hfsplus_uni2asc(struct super_block *,
const struct hfsplus_unistr *, char *, int *);
int hfsplus_asc2uni(struct super_block *,
struct hfsplus_unistr *, int, const char *, int);
-int hfsplus_hash_dentry(const struct dentry *dentry,
- const struct inode *inode, struct qstr *str);
-int hfsplus_compare_dentry(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str);
+int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
/* wrapper.c */
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 160ccc9cdb4b..f833d35630ab 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
+#include <linux/aio.h>
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
@@ -357,7 +358,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
if (!error)
error = error2;
} else {
- printk(KERN_ERR "hfs: sync non-existent attributes tree\n");
+ pr_err("sync non-existent attributes tree\n");
}
}
@@ -573,7 +574,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
HFSPLUS_I(inode)->create_date = file->create_date;
} else {
- printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
+ pr_err("bad catalog entry used to create inode\n");
res = -EIO;
}
return res;
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index ed257c671615..968eab5bc1f5 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -113,67 +113,67 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
switch (token) {
case opt_creator:
if (match_fourchar(&args[0], &sbi->creator)) {
- printk(KERN_ERR "hfs: creator requires a 4 character value\n");
+ pr_err("creator requires a 4 character value\n");
return 0;
}
break;
case opt_type:
if (match_fourchar(&args[0], &sbi->type)) {
- printk(KERN_ERR "hfs: type requires a 4 character value\n");
+ pr_err("type requires a 4 character value\n");
return 0;
}
break;
case opt_umask:
if (match_octal(&args[0], &tmp)) {
- printk(KERN_ERR "hfs: umask requires a value\n");
+ pr_err("umask requires a value\n");
return 0;
}
sbi->umask = (umode_t)tmp;
break;
case opt_uid:
if (match_int(&args[0], &tmp)) {
- printk(KERN_ERR "hfs: uid requires an argument\n");
+ pr_err("uid requires an argument\n");
return 0;
}
sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp);
if (!uid_valid(sbi->uid)) {
- printk(KERN_ERR "hfs: invalid uid specified\n");
+ pr_err("invalid uid specified\n");
return 0;
}
break;
case opt_gid:
if (match_int(&args[0], &tmp)) {
- printk(KERN_ERR "hfs: gid requires an argument\n");
+ pr_err("gid requires an argument\n");
return 0;
}
sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp);
if (!gid_valid(sbi->gid)) {
- printk(KERN_ERR "hfs: invalid gid specified\n");
+ pr_err("invalid gid specified\n");
return 0;
}
break;
case opt_part:
if (match_int(&args[0], &sbi->part)) {
- printk(KERN_ERR "hfs: part requires an argument\n");
+ pr_err("part requires an argument\n");
return 0;
}
break;
case opt_session:
if (match_int(&args[0], &sbi->session)) {
- printk(KERN_ERR "hfs: session requires an argument\n");
+ pr_err("session requires an argument\n");
return 0;
}
break;
case opt_nls:
if (sbi->nls) {
- printk(KERN_ERR "hfs: unable to change nls mapping\n");
+ pr_err("unable to change nls mapping\n");
return 0;
}
p = match_strdup(&args[0]);
if (p)
sbi->nls = load_nls(p);
if (!sbi->nls) {
- printk(KERN_ERR "hfs: unable to load "
+ pr_err("unable to load "
"nls mapping \"%s\"\n",
p);
kfree(p);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 7b87284e46dc..4c4d142cf890 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -132,7 +132,7 @@ static int hfsplus_system_write_inode(struct inode *inode)
if (tree) {
int err = hfs_btree_write(tree);
if (err) {
- printk(KERN_ERR "hfs: b-tree write err: %d, ino %lu\n",
+ pr_err("b-tree write err: %d, ino %lu\n",
err, inode->i_ino);
return err;
}
@@ -145,7 +145,7 @@ static int hfsplus_write_inode(struct inode *inode,
{
int err;
- dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
+ hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
err = hfsplus_ext_write_extent(inode);
if (err)
@@ -160,7 +160,7 @@ static int hfsplus_write_inode(struct inode *inode,
static void hfsplus_evict_inode(struct inode *inode)
{
- dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
+ hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
truncate_inode_pages(&inode->i_data, 0);
clear_inode(inode);
if (HFSPLUS_IS_RSRC(inode)) {
@@ -179,7 +179,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
if (!wait)
return 0;
- dprint(DBG_SUPER, "hfsplus_sync_fs\n");
+ hfs_dbg(SUPER, "hfsplus_sync_fs\n");
/*
* Explicitly write out the special metadata inodes.
@@ -251,7 +251,7 @@ static void delayed_sync_fs(struct work_struct *work)
err = hfsplus_sync_fs(sbi->alloc_file->i_sb, 1);
if (err)
- printk(KERN_ERR "hfs: delayed sync fs err %d\n", err);
+ pr_err("delayed sync fs err %d\n", err);
}
void hfsplus_mark_mdb_dirty(struct super_block *sb)
@@ -275,7 +275,7 @@ static void hfsplus_put_super(struct super_block *sb)
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
- dprint(DBG_SUPER, "hfsplus_put_super\n");
+ hfs_dbg(SUPER, "hfsplus_put_super\n");
cancel_delayed_work_sync(&sbi->sync_work);
@@ -333,25 +333,19 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
return -EINVAL;
if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
- printk(KERN_WARNING "hfs: filesystem was "
- "not cleanly unmounted, "
- "running fsck.hfsplus is recommended. "
- "leaving read-only.\n");
+ pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n");
sb->s_flags |= MS_RDONLY;
*flags |= MS_RDONLY;
} else if (force) {
/* nothing */
} else if (vhdr->attributes &
cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
- printk(KERN_WARNING "hfs: filesystem is marked locked, "
- "leaving read-only.\n");
+ pr_warn("filesystem is marked locked, leaving read-only.\n");
sb->s_flags |= MS_RDONLY;
*flags |= MS_RDONLY;
} else if (vhdr->attributes &
cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
- printk(KERN_WARNING "hfs: filesystem is "
- "marked journaled, "
- "leaving read-only.\n");
+ pr_warn("filesystem is marked journaled, leaving read-only.\n");
sb->s_flags |= MS_RDONLY;
*flags |= MS_RDONLY;
}
@@ -397,7 +391,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
err = -EINVAL;
if (!hfsplus_parse_options(data, sbi)) {
- printk(KERN_ERR "hfs: unable to parse mount options\n");
+ pr_err("unable to parse mount options\n");
goto out_unload_nls;
}
@@ -405,14 +399,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
nls = sbi->nls;
sbi->nls = load_nls("utf8");
if (!sbi->nls) {
- printk(KERN_ERR "hfs: unable to load nls for utf8\n");
+ pr_err("unable to load nls for utf8\n");
goto out_unload_nls;
}
/* Grab the volume header */
if (hfsplus_read_wrapper(sb)) {
if (!silent)
- printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n");
+ pr_warn("unable to find HFS+ superblock\n");
goto out_unload_nls;
}
vhdr = sbi->s_vhdr;
@@ -421,7 +415,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = HFSPLUS_VOLHEAD_SIG;
if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION ||
be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) {
- printk(KERN_ERR "hfs: wrong filesystem version\n");
+ pr_err("wrong filesystem version\n");
goto out_free_vhdr;
}
sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
@@ -445,7 +439,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) ||
(last_fs_page > (pgoff_t)(~0ULL))) {
- printk(KERN_ERR "hfs: filesystem size too large.\n");
+ pr_err("filesystem size too large\n");
goto out_free_vhdr;
}
@@ -454,22 +448,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
sb->s_maxbytes = MAX_LFS_FILESIZE;
if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
- printk(KERN_WARNING "hfs: Filesystem was "
- "not cleanly unmounted, "
- "running fsck.hfsplus is recommended. "
- "mounting read-only.\n");
+ pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. mounting read-only.\n");
sb->s_flags |= MS_RDONLY;
} else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
/* nothing */
} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
- printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
+ pr_warn("Filesystem is marked locked, mounting read-only.\n");
sb->s_flags |= MS_RDONLY;
} else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) &&
!(sb->s_flags & MS_RDONLY)) {
- printk(KERN_WARNING "hfs: write access to "
- "a journaled filesystem is not supported, "
- "use the force option at your own risk, "
- "mounting read-only.\n");
+ pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n");
sb->s_flags |= MS_RDONLY;
}
@@ -478,18 +466,18 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
/* Load metadata objects (B*Trees) */
sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
if (!sbi->ext_tree) {
- printk(KERN_ERR "hfs: failed to load extents file\n");
+ pr_err("failed to load extents file\n");
goto out_free_vhdr;
}
sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
if (!sbi->cat_tree) {
- printk(KERN_ERR "hfs: failed to load catalog file\n");
+ pr_err("failed to load catalog file\n");
goto out_close_ext_tree;
}
if (vhdr->attr_file.total_blocks != 0) {
sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID);
if (!sbi->attr_tree) {
- printk(KERN_ERR "hfs: failed to load attributes file\n");
+ pr_err("failed to load attributes file\n");
goto out_close_cat_tree;
}
}
@@ -497,7 +485,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
if (IS_ERR(inode)) {
- printk(KERN_ERR "hfs: failed to load allocation file\n");
+ pr_err("failed to load allocation file\n");
err = PTR_ERR(inode);
goto out_close_attr_tree;
}
@@ -506,7 +494,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
/* Load the root directory */
root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
if (IS_ERR(root)) {
- printk(KERN_ERR "hfs: failed to load root directory\n");
+ pr_err("failed to load root directory\n");
err = PTR_ERR(root);
goto out_put_alloc_file;
}
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 2c2e47dcfdd8..e8ef121a4d8b 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -334,8 +334,7 @@ int hfsplus_asc2uni(struct super_block *sb,
* Composed unicode characters are decomposed and case-folding is performed
* if the appropriate bits are (un)set on the superblock.
*/
-int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
- struct qstr *str)
+int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
{
struct super_block *sb = dentry->d_sb;
const char *astr;
@@ -386,9 +385,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
* Composed unicode characters are decomposed and case-folding is performed
* if the appropriate bits are (un)set on the superblock.
*/
-int hfsplus_compare_dentry(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
struct super_block *sb = parent->d_sb;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 90effcccca9a..b51a6079108d 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -156,7 +156,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
*start = (sector_t)te.cdte_addr.lba << 2;
return 0;
}
- printk(KERN_ERR "hfs: invalid session number or type of track\n");
+ pr_err("invalid session number or type of track\n");
return -EINVAL;
}
ms_info.addr_format = CDROM_LBA;
@@ -234,8 +234,7 @@ reread:
error = -EINVAL;
if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) {
- printk(KERN_WARNING
- "hfs: invalid secondary volume header\n");
+ pr_warn("invalid secondary volume header\n");
goto out_free_backup_vhdr;
}
@@ -259,8 +258,7 @@ reread:
blocksize >>= 1;
if (sb_set_blocksize(sb, blocksize) != blocksize) {
- printk(KERN_ERR "hfs: unable to set blocksize to %u!\n",
- blocksize);
+ pr_err("unable to set blocksize to %u!\n", blocksize);
goto out_free_backup_vhdr;
}
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index e8a4b0815c61..f66346155df5 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -107,19 +107,19 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd);
if (err) {
- printk(KERN_ERR "hfs: can't init xattr find struct\n");
+ pr_err("can't init xattr find struct\n");
return err;
}
err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd);
if (err) {
- printk(KERN_ERR "hfs: catalog searching failed\n");
+ pr_err("catalog searching failed\n");
goto end_setxattr;
}
if (!strcmp_xattr_finder_info(name)) {
if (flags & XATTR_CREATE) {
- printk(KERN_ERR "hfs: xattr exists yet\n");
+ pr_err("xattr exists yet\n");
err = -EOPNOTSUPP;
goto end_setxattr;
}
@@ -165,7 +165,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
if (hfsplus_attr_exists(inode, name)) {
if (flags & XATTR_CREATE) {
- printk(KERN_ERR "hfs: xattr exists yet\n");
+ pr_err("xattr exists yet\n");
err = -EOPNOTSUPP;
goto end_setxattr;
}
@@ -177,7 +177,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
goto end_setxattr;
} else {
if (flags & XATTR_REPLACE) {
- printk(KERN_ERR "hfs: cannot replace xattr\n");
+ pr_err("cannot replace xattr\n");
err = -EOPNOTSUPP;
goto end_setxattr;
}
@@ -210,7 +210,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
cat_entry_flags);
hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
} else {
- printk(KERN_ERR "hfs: invalid catalog entry type\n");
+ pr_err("invalid catalog entry type\n");
err = -EIO;
goto end_setxattr;
}
@@ -269,7 +269,7 @@ static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry,
if (size >= record_len) {
res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
if (res) {
- printk(KERN_ERR "hfs: can't init xattr find struct\n");
+ pr_err("can't init xattr find struct\n");
return res;
}
res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -340,13 +340,13 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
entry = hfsplus_alloc_attr_entry();
if (!entry) {
- printk(KERN_ERR "hfs: can't allocate xattr entry\n");
+ pr_err("can't allocate xattr entry\n");
return -ENOMEM;
}
res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd);
if (res) {
- printk(KERN_ERR "hfs: can't init xattr find struct\n");
+ pr_err("can't init xattr find struct\n");
goto failed_getxattr_init;
}
@@ -355,7 +355,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
if (res == -ENOENT)
res = -ENODATA;
else
- printk(KERN_ERR "hfs: xattr searching failed\n");
+ pr_err("xattr searching failed\n");
goto out;
}
@@ -368,17 +368,17 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
offsetof(struct hfsplus_attr_inline_data,
length));
if (record_length > HFSPLUS_MAX_INLINE_DATA_SIZE) {
- printk(KERN_ERR "hfs: invalid xattr record size\n");
+ pr_err("invalid xattr record size\n");
res = -EIO;
goto out;
}
} else if (record_type == HFSPLUS_ATTR_FORK_DATA ||
record_type == HFSPLUS_ATTR_EXTENTS) {
- printk(KERN_ERR "hfs: only inline data xattr are supported\n");
+ pr_err("only inline data xattr are supported\n");
res = -EOPNOTSUPP;
goto out;
} else {
- printk(KERN_ERR "hfs: invalid xattr record\n");
+ pr_err("invalid xattr record\n");
res = -EIO;
goto out;
}
@@ -427,7 +427,7 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry,
res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
if (res) {
- printk(KERN_ERR "hfs: can't init xattr find struct\n");
+ pr_err("can't init xattr find struct\n");
return res;
}
@@ -506,7 +506,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd);
if (err) {
- printk(KERN_ERR "hfs: can't init xattr find struct\n");
+ pr_err("can't init xattr find struct\n");
return err;
}
@@ -525,8 +525,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
for (;;) {
key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset);
if (key_len == 0 || key_len > fd.tree->max_key_len) {
- printk(KERN_ERR "hfs: invalid xattr key length: %d\n",
- key_len);
+ pr_err("invalid xattr key length: %d\n", key_len);
res = -EIO;
goto end_listxattr;
}
@@ -541,7 +540,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
if (hfsplus_uni2asc(inode->i_sb,
(const struct hfsplus_unistr *)&fd.key->attr.key_name,
strbuf, &xattr_name_len)) {
- printk(KERN_ERR "hfs: unicode conversion failed\n");
+ pr_err("unicode conversion failed\n");
res = -EIO;
goto end_listxattr;
}
@@ -598,13 +597,13 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name)
err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd);
if (err) {
- printk(KERN_ERR "hfs: can't init xattr find struct\n");
+ pr_err("can't init xattr find struct\n");
return err;
}
err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd);
if (err) {
- printk(KERN_ERR "hfs: catalog searching failed\n");
+ pr_err("catalog searching failed\n");
goto end_removexattr;
}
@@ -643,7 +642,7 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name)
flags);
hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
} else {
- printk(KERN_ERR "hfs: invalid catalog entry type\n");
+ pr_err("invalid catalog entry type\n");
err = -EIO;
goto end_removexattr;
}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 0f6e52d22b84..cddb05217512 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -7,6 +7,7 @@
*/
#include <linux/fs.h>
+#include <linux/magic.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
@@ -45,8 +46,6 @@ static const struct dentry_operations hostfs_dentry_ops = {
static char *root_ino = "";
static int append = 0;
-#define HOSTFS_SUPER_MAGIC 0x00c0ffee
-
static const struct inode_operations hostfs_iops;
static const struct inode_operations hostfs_dir_iops;
static const struct inode_operations hostfs_link_iops;
@@ -121,7 +120,7 @@ static char *dentry_name(struct dentry *dentry)
if (!name)
return NULL;
- return __dentry_name(dentry, name); /* will unlock */
+ return __dentry_name(dentry, name);
}
static char *inode_name(struct inode *ino)
@@ -229,10 +228,11 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
{
struct hostfs_inode_info *hi;
- hi = kzalloc(sizeof(*hi), GFP_KERNEL);
+ hi = kmalloc(sizeof(*hi), GFP_KERNEL);
if (hi == NULL)
return NULL;
hi->fd = -1;
+ hi->mode = 0;
inode_init_once(&hi->vfs_inode);
return &hi->vfs_inode;
}
@@ -277,7 +277,7 @@ static const struct super_operations hostfs_sbops = {
.show_options = hostfs_show_options,
};
-int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
+int hostfs_readdir(struct file *file, struct dir_context *ctx)
{
void *dir;
char *name;
@@ -292,12 +292,11 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
__putname(name);
if (dir == NULL)
return -error;
- next = file->f_pos;
+ next = ctx->pos;
while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) {
- error = (*filldir)(ent, name, len, file->f_pos,
- ino, type);
- if (error) break;
- file->f_pos = next;
+ if (!dir_emit(ctx, name, len, ino, type))
+ break;
+ ctx->pos = next;
}
close_dir(dir);
return 0;
@@ -393,7 +392,7 @@ static const struct file_operations hostfs_file_fops = {
static const struct file_operations hostfs_dir_fops = {
.llseek = generic_file_llseek,
- .readdir = hostfs_readdir,
+ .iterate = hostfs_readdir,
.read = generic_read_dir,
};
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index f49d1498aa2e..4d0a1afa058c 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -7,8 +7,37 @@
*/
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/blkdev.h>
#include "hpfs_fn.h"
+void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n)
+{
+ struct buffer_head *bh;
+ struct blk_plug plug;
+
+ if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size))
+ return;
+
+ bh = sb_find_get_block(s, secno);
+ if (bh) {
+ if (buffer_uptodate(bh)) {
+ brelse(bh);
+ return;
+ }
+ brelse(bh);
+ };
+
+ blk_start_plug(&plug);
+ while (n > 0) {
+ if (unlikely(secno >= hpfs_sb(s)->sb_fs_size))
+ break;
+ sb_breadahead(s, secno);
+ secno++;
+ n--;
+ }
+ blk_finish_plug(&plug);
+}
+
/* Map a sector into a buffer and return pointers to it and to the buffer. */
void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp,
@@ -18,6 +47,8 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head
hpfs_lock_assert(s);
+ hpfs_prefetch_sectors(s, secno, ahead);
+
cond_resched();
*bhp = bh = sb_bread(s, secno);
@@ -67,6 +98,8 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
return NULL;
}
+ hpfs_prefetch_sectors(s, secno, 4 + ahead);
+
qbh->data = data = kmalloc(2048, GFP_NOFS);
if (!data) {
printk("HPFS: hpfs_map_4sectors: out of memory\n");
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 05d4816e4e77..fa27980f2229 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,8 +12,7 @@
* Note: the dentry argument is the parent dentry.
*/
-static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
- struct qstr *qstr)
+static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
{
unsigned long hash;
int i;
@@ -35,9 +34,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *ino
return 0;
}
-static int hpfs_compare_dentry(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
unsigned al = len;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 546f6d39713a..292b1acb9b81 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -33,36 +33,38 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
if (whence == SEEK_DATA || whence == SEEK_HOLE)
return -EINVAL;
+ mutex_lock(&i->i_mutex);
hpfs_lock(s);
/*printk("dir lseek\n");*/
if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
- mutex_lock(&i->i_mutex);
pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1;
while (pos != new_off) {
if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh);
else goto fail;
if (pos == 12) goto fail;
}
- mutex_unlock(&i->i_mutex);
+ hpfs_add_pos(i, &filp->f_pos);
ok:
+ filp->f_pos = new_off;
hpfs_unlock(s);
- return filp->f_pos = new_off;
-fail:
mutex_unlock(&i->i_mutex);
+ return new_off;
+fail:
/*printk("illegal lseek: %016llx\n", new_off);*/
hpfs_unlock(s);
+ mutex_unlock(&i->i_mutex);
return -ESPIPE;
}
-static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hpfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
struct quad_buffer_head qbh;
struct hpfs_dirent *de;
int lc;
- long old_pos;
+ loff_t next_pos;
unsigned char *tempname;
int c1, c2 = 0;
int ret = 0;
@@ -103,11 +105,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
}
lc = hpfs_sb(inode->i_sb)->sb_lowercase;
- if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */
- filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */
+ if (ctx->pos == 12) { /* diff -r requires this (note, that diff -r */
+ ctx->pos = 13; /* also fails on msdos filesystem in 2.0) */
goto out;
}
- if (filp->f_pos == 13) {
+ if (ctx->pos == 13) {
ret = -ENOENT;
goto out;
}
@@ -118,33 +120,34 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
accepted by filldir, but what can I do?
maybe killall -9 ls helps */
if (hpfs_sb(inode->i_sb)->sb_chk)
- if (hpfs_stop_cycles(inode->i_sb, filp->f_pos, &c1, &c2, "hpfs_readdir")) {
+ if (hpfs_stop_cycles(inode->i_sb, ctx->pos, &c1, &c2, "hpfs_readdir")) {
ret = -EFSERROR;
goto out;
}
- if (filp->f_pos == 12)
+ if (ctx->pos == 12)
goto out;
- if (filp->f_pos == 3 || filp->f_pos == 4 || filp->f_pos == 5) {
- printk("HPFS: warning: pos==%d\n",(int)filp->f_pos);
+ if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) {
+ printk("HPFS: warning: pos==%d\n",(int)ctx->pos);
goto out;
}
- if (filp->f_pos == 0) {
- if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0)
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(file, ctx))
goto out;
- filp->f_pos = 11;
+ ctx->pos = 11;
}
- if (filp->f_pos == 11) {
- if (filldir(dirent, "..", 2, filp->f_pos, hpfs_inode->i_parent_dir, DT_DIR) < 0)
+ if (ctx->pos == 11) {
+ if (!dir_emit(ctx, "..", 2, hpfs_inode->i_parent_dir, DT_DIR))
goto out;
- filp->f_pos = 1;
+ ctx->pos = 1;
}
- if (filp->f_pos == 1) {
- filp->f_pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
- hpfs_add_pos(inode, &filp->f_pos);
- filp->f_version = inode->i_version;
+ if (ctx->pos == 1) {
+ ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
+ hpfs_add_pos(inode, &file->f_pos);
+ file->f_version = inode->i_version;
}
- old_pos = filp->f_pos;
- if (!(de = map_pos_dirent(inode, &filp->f_pos, &qbh))) {
+ next_pos = ctx->pos;
+ if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) {
+ ctx->pos = next_pos;
ret = -EIOERROR;
goto out;
}
@@ -152,20 +155,21 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (hpfs_sb(inode->i_sb)->sb_chk) {
if (de->first && !de->last && (de->namelen != 2
|| de ->name[0] != 1 || de->name[1] != 1))
- hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", old_pos);
+ hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", (unsigned long)ctx->pos);
if (de->last && (de->namelen != 1 || de ->name[0] != 255))
- hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", old_pos);
+ hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", (unsigned long)ctx->pos);
}
hpfs_brelse4(&qbh);
+ ctx->pos = next_pos;
goto again;
}
tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
- if (filldir(dirent, tempname, de->namelen, old_pos, le32_to_cpu(de->fnode), DT_UNKNOWN) < 0) {
- filp->f_pos = old_pos;
+ if (!dir_emit(ctx, tempname, de->namelen, le32_to_cpu(de->fnode), DT_UNKNOWN)) {
if (tempname != de->name) kfree(tempname);
hpfs_brelse4(&qbh);
goto out;
}
+ ctx->pos = next_pos;
if (tempname != de->name) kfree(tempname);
hpfs_brelse4(&qbh);
}
@@ -320,7 +324,7 @@ const struct file_operations hpfs_dir_ops =
{
.llseek = hpfs_dir_lseek,
.read = generic_read_dir,
- .readdir = hpfs_readdir,
+ .iterate = hpfs_readdir,
.release = hpfs_dir_release,
.fsync = hpfs_file_fsync,
};
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 9f9dbeceeee7..4e9dabcf1f4c 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -7,6 +7,7 @@
*/
#include "hpfs_fn.h"
+#include <linux/mpage.h>
#define BLOCKS(size) (((size) + 511) >> 9)
@@ -34,7 +35,7 @@ int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
* so we must ignore such errors.
*/
-static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
+static secno hpfs_bmap(struct inode *inode, unsigned file_secno, unsigned *n_secs)
{
struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
unsigned n, disk_secno;
@@ -42,11 +43,20 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
struct buffer_head *bh;
if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0;
n = file_secno - hpfs_inode->i_file_sec;
- if (n < hpfs_inode->i_n_secs) return hpfs_inode->i_disk_sec + n;
+ if (n < hpfs_inode->i_n_secs) {
+ *n_secs = hpfs_inode->i_n_secs - n;
+ return hpfs_inode->i_disk_sec + n;
+ }
if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0;
disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh);
if (disk_secno == -1) return 0;
if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0;
+ n = file_secno - hpfs_inode->i_file_sec;
+ if (n < hpfs_inode->i_n_secs) {
+ *n_secs = hpfs_inode->i_n_secs - n;
+ return hpfs_inode->i_disk_sec + n;
+ }
+ *n_secs = 1;
return disk_secno;
}
@@ -67,10 +77,14 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
{
int r;
secno s;
+ unsigned n_secs;
hpfs_lock(inode->i_sb);
- s = hpfs_bmap(inode, iblock);
+ s = hpfs_bmap(inode, iblock, &n_secs);
if (s) {
+ if (bh_result->b_size >> 9 < n_secs)
+ n_secs = bh_result->b_size >> 9;
map_bh(bh_result, inode->i_sb, s);
+ bh_result->b_size = n_secs << 9;
goto ret_0;
}
if (!create) goto ret_0;
@@ -95,24 +109,40 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
return r;
}
+static int hpfs_readpage(struct file *file, struct page *page)
+{
+ return mpage_readpage(page, hpfs_get_block);
+}
+
static int hpfs_writepage(struct page *page, struct writeback_control *wbc)
{
- return block_write_full_page(page,hpfs_get_block, wbc);
+ return block_write_full_page(page, hpfs_get_block, wbc);
}
-static int hpfs_readpage(struct file *file, struct page *page)
+static int hpfs_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block);
+}
+
+static int hpfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_read_full_page(page,hpfs_get_block);
+ return mpage_writepages(mapping, wbc, hpfs_get_block);
}
static void hpfs_write_failed(struct address_space *mapping, loff_t to)
{
struct inode *inode = mapping->host;
+ hpfs_lock(inode->i_sb);
+
if (to > inode->i_size) {
truncate_pagecache(inode, to, inode->i_size);
hpfs_truncate(inode);
}
+
+ hpfs_unlock(inode->i_sb);
}
static int hpfs_write_begin(struct file *file, struct address_space *mapping,
@@ -131,6 +161,24 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
+static int hpfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *pagep, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int err;
+ err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+ if (err < len)
+ hpfs_write_failed(mapping, pos + len);
+ if (!(err < 0)) {
+ /* make sure we write it on close, if not earlier */
+ hpfs_lock(inode->i_sb);
+ hpfs_i(inode)->i_dirty = 1;
+ hpfs_unlock(inode->i_sb);
+ }
+ return err;
+}
+
static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,hpfs_get_block);
@@ -139,31 +187,19 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations hpfs_aops = {
.readpage = hpfs_readpage,
.writepage = hpfs_writepage,
+ .readpages = hpfs_readpages,
+ .writepages = hpfs_writepages,
.write_begin = hpfs_write_begin,
- .write_end = generic_write_end,
+ .write_end = hpfs_write_end,
.bmap = _hpfs_bmap
};
-static ssize_t hpfs_file_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- ssize_t retval;
-
- retval = do_sync_write(file, buf, count, ppos);
- if (retval > 0) {
- hpfs_lock(file->f_path.dentry->d_sb);
- hpfs_i(file_inode(file))->i_dirty = 1;
- hpfs_unlock(file->f_path.dentry->d_sb);
- }
- return retval;
-}
-
const struct file_operations hpfs_file_ops =
{
.llseek = generic_file_llseek,
.read = do_sync_read,
.aio_read = generic_file_aio_read,
- .write = hpfs_file_write,
+ .write = do_sync_write,
.aio_write = generic_file_aio_write,
.mmap = generic_file_mmap,
.release = hpfs_file_release,
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b7ae286646b5..1b398636e990 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -27,8 +27,9 @@
#define ALLOC_FWD_MAX 128
#define ALLOC_M 1
#define FNODE_RD_AHEAD 16
-#define ANODE_RD_AHEAD 16
-#define DNODE_RD_AHEAD 4
+#define ANODE_RD_AHEAD 0
+#define DNODE_RD_AHEAD 72
+#define COUNT_RD_AHEAD 62
#define FREE_DNODES_ADD 58
#define FREE_DNODES_DEL 29
@@ -207,6 +208,7 @@ void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
/* buffer.c */
+void hpfs_prefetch_sectors(struct super_block *, unsigned, int);
void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int);
void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **);
void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int);
@@ -271,6 +273,7 @@ void hpfs_evict_inode(struct inode *);
__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
+void hpfs_prefetch_bitmap(struct super_block *, unsigned);
unsigned char *hpfs_load_code_page(struct super_block *, secno);
__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index 4acb19d78359..3aa66ae1031e 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -17,7 +17,9 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
struct quad_buffer_head *qbh, char *id)
{
secno sec;
- if (hpfs_sb(s)->sb_chk) if (bmp_block * 16384 > hpfs_sb(s)->sb_fs_size) {
+ __le32 *ret;
+ unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
+ if (hpfs_sb(s)->sb_chk) if (bmp_block >= n_bands) {
hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id);
return NULL;
}
@@ -26,7 +28,23 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id);
return NULL;
}
- return hpfs_map_4sectors(s, sec, qbh, 4);
+ ret = hpfs_map_4sectors(s, sec, qbh, 4);
+ if (ret) hpfs_prefetch_bitmap(s, bmp_block + 1);
+ return ret;
+}
+
+void hpfs_prefetch_bitmap(struct super_block *s, unsigned bmp_block)
+{
+ unsigned to_prefetch, next_prefetch;
+ unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
+ if (unlikely(bmp_block >= n_bands))
+ return;
+ to_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]);
+ if (unlikely(bmp_block + 1 >= n_bands))
+ next_prefetch = 0;
+ else
+ next_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block + 1]);
+ hpfs_prefetch_sectors(s, to_prefetch, 4 + 4 * (to_prefetch + 4 == next_prefetch));
}
/*
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a0617e706957..4334cda8dba1 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -121,7 +121,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
unsigned long *bits;
unsigned count;
- bits = hpfs_map_4sectors(s, secno, &qbh, 4);
+ bits = hpfs_map_4sectors(s, secno, &qbh, 0);
if (!bits)
return 0;
count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
@@ -134,8 +134,13 @@ static unsigned count_bitmaps(struct super_block *s)
unsigned n, count, n_bands;
n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
count = 0;
- for (n = 0; n < n_bands; n++)
+ for (n = 0; n < COUNT_RD_AHEAD; n++) {
+ hpfs_prefetch_bitmap(s, n);
+ }
+ for (n = 0; n < n_bands; n++) {
+ hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD);
count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n]));
+ }
return count;
}
@@ -558,7 +563,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
sbi->sb_cp_table = NULL;
sbi->sb_c_bitmap = -1;
sbi->sb_max_fwd_alloc = 0xffffff;
-
+
+ if (sbi->sb_fs_size >= 0x80000000) {
+ hpfs_error(s, "invalid size in superblock: %08x",
+ (unsigned)sbi->sb_fs_size);
+ goto bail4;
+ }
+
/* Load bitmap directory */
if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps))))
goto bail4;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 126d3c2e2dee..4338ff32959d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -69,7 +69,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
struct dentry *parent;
char *root, *name;
const char *seg_name;
- int len, seg_len;
+ int len, seg_len, root_len;
len = 0;
parent = dentry;
@@ -81,7 +81,8 @@ static char *dentry_name(struct dentry *dentry, int extra)
}
root = "proc";
- len += strlen(root);
+ root_len = strlen(root);
+ len += root_len;
name = kmalloc(len + extra + 1, GFP_KERNEL);
if (name == NULL)
return NULL;
@@ -91,7 +92,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
while (parent->d_parent != parent) {
if (is_pid(parent)) {
seg_name = "pid";
- seg_len = strlen("pid");
+ seg_len = strlen(seg_name);
}
else {
seg_name = parent->d_name.name;
@@ -100,10 +101,10 @@ static char *dentry_name(struct dentry *dentry, int extra)
len -= seg_len + 1;
name[len] = '/';
- strncpy(&name[len + 1], seg_name, seg_len);
+ memcpy(&name[len + 1], seg_name, seg_len);
parent = parent->d_parent;
}
- strncpy(name, root, strlen(root));
+ memcpy(name, root, root_len);
return name;
}
@@ -436,7 +437,6 @@ static int hppfs_open(struct inode *inode, struct file *file)
path.mnt = inode->i_sb->s_fs_info;
path.dentry = HPPFS_I(inode)->proc_dentry;
- /* XXX This isn't closed anywhere */
data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred);
err = PTR_ERR(data->proc_file);
if (IS_ERR(data->proc_file))
@@ -523,17 +523,28 @@ static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
return default_llseek(file, off, where);
}
+static int hppfs_release(struct inode *inode, struct file *file)
+{
+ struct hppfs_private *data = file->private_data;
+ struct file *proc_file = data->proc_file;
+ if (proc_file)
+ fput(proc_file);
+ kfree(data);
+ return 0;
+}
+
static const struct file_operations hppfs_file_fops = {
.owner = NULL,
.llseek = hppfs_llseek,
.read = hppfs_read,
.write = hppfs_write,
.open = hppfs_open,
+ .release = hppfs_release,
};
struct hppfs_dirent {
- void *vfs_dirent;
- filldir_t filldir;
+ struct dir_context ctx;
+ struct dir_context *caller;
struct dentry *dentry;
};
@@ -545,43 +556,32 @@ static int hppfs_filldir(void *d, const char *name, int size,
if (file_removed(dirent->dentry, name))
return 0;
- return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset,
- inode, type);
+ dirent->caller->pos = dirent->ctx.pos;
+ return !dir_emit(dirent->caller, name, size, inode, type);
}
-static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
+static int hppfs_readdir(struct file *file, struct dir_context *ctx)
{
struct hppfs_private *data = file->private_data;
struct file *proc_file = data->proc_file;
- int (*readdir)(struct file *, void *, filldir_t);
- struct hppfs_dirent dirent = ((struct hppfs_dirent)
- { .vfs_dirent = ent,
- .filldir = filldir,
- .dentry = file->f_path.dentry
- });
+ struct hppfs_dirent d = {
+ .ctx.actor = hppfs_filldir,
+ .caller = ctx,
+ .dentry = file->f_path.dentry
+ };
int err;
-
- readdir = file_inode(proc_file)->i_fop->readdir;
-
- proc_file->f_pos = file->f_pos;
- err = (*readdir)(proc_file, &dirent, hppfs_filldir);
- file->f_pos = proc_file->f_pos;
-
+ proc_file->f_pos = ctx->pos;
+ err = iterate_dir(proc_file, &d.ctx);
+ ctx->pos = d.ctx.pos;
return err;
}
-static int hppfs_fsync(struct file *file, loff_t start, loff_t end,
- int datasync)
-{
- return filemap_write_and_wait_range(file->f_mapping, start, end);
-}
-
static const struct file_operations hppfs_dir_fops = {
.owner = NULL,
- .readdir = hppfs_readdir,
+ .iterate = hppfs_readdir,
.open = hppfs_dir_open,
- .fsync = hppfs_fsync,
.llseek = default_llseek,
+ .release = hppfs_release,
};
static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 523464e62849..a3f868ae3fd4 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -909,11 +909,8 @@ static int can_do_hugetlb_shm(void)
static int get_hstate_idx(int page_size_log)
{
- struct hstate *h;
+ struct hstate *h = hstate_sizelog(page_size_log);
- if (!page_size_log)
- return default_hstate_idx;
- h = size_to_hstate(1 << page_size_log);
if (!h)
return -1;
return h - hstates;
@@ -929,9 +926,12 @@ static struct dentry_operations anon_ops = {
.d_dname = hugetlb_dname
};
-struct file *hugetlb_file_setup(const char *name, unsigned long addr,
- size_t size, vm_flags_t acctflag,
- struct user_struct **user,
+/*
+ * Note that size should be aligned to proper hugepage size in caller side,
+ * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
+ */
+struct file *hugetlb_file_setup(const char *name, size_t size,
+ vm_flags_t acctflag, struct user_struct **user,
int creat_flags, int page_size_log)
{
struct file *file = ERR_PTR(-ENOMEM);
@@ -939,8 +939,6 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
struct path path;
struct super_block *sb;
struct qstr quick_string;
- struct hstate *hstate;
- unsigned long num_pages;
int hstate_idx;
hstate_idx = get_hstate_idx(page_size_log);
@@ -980,12 +978,10 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
if (!inode)
goto out_dentry;
- hstate = hstate_inode(inode);
- size += addr & ~huge_page_mask(hstate);
- num_pages = ALIGN(size, huge_page_size(hstate)) >>
- huge_page_shift(hstate);
file = ERR_PTR(-ENOMEM);
- if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag))
+ if (hugetlb_reserve_pages(inode, 0,
+ size >> huge_page_shift(hstate_inode(inode)), NULL,
+ acctflag))
goto out_inode;
d_instantiate(path.dentry, inode);
diff --git a/fs/inode.c b/fs/inode.c
index a898b3d43ccf..d6dfb09c8280 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -333,8 +333,10 @@ EXPORT_SYMBOL(set_nlink);
*/
void inc_nlink(struct inode *inode)
{
- if (WARN_ON(inode->i_nlink == 0))
+ if (unlikely(inode->i_nlink == 0)) {
+ WARN_ON(!(inode->i_state & I_LINKABLE));
atomic_long_dec(&inode->i_sb->s_remove_count);
+ }
inode->__i_nlink++;
}
@@ -1803,7 +1805,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
} else if (S_ISFIFO(mode))
- inode->i_fop = &def_fifo_fops;
+ inode->i_fop = &pipefifo_fops;
else if (S_ISSOCK(mode))
inode->i_fop = &bad_sock_fops;
else
diff --git a/fs/internal.h b/fs/internal.h
index 4be78237d896..7c5f01cf619d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -96,11 +96,12 @@ struct open_flags {
umode_t mode;
int acc_mode;
int intent;
+ int lookup_flags;
};
extern struct file *do_filp_open(int dfd, struct filename *pathname,
- const struct open_flags *op, int flags);
+ const struct open_flags *op);
extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
- const char *, const struct open_flags *, int lookup_flags);
+ const char *, const struct open_flags *);
extern long do_handle_open(int mountdirfd,
struct file_handle __user *ufh, int open_flag);
@@ -130,3 +131,15 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
* read_write.c
*/
extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
+extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
+
+/*
+ * splice.c
+ */
+extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+ loff_t *opos, size_t len, unsigned int flags);
+
+/*
+ * pipe.c
+ */
+extern const struct file_operations pipefifo_fops;
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index a7d5c3c3d4e6..b943cbd963bb 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -78,8 +78,8 @@ int get_acorn_filename(struct iso_directory_record *de,
/*
* This should _really_ be cleaned up some day..
*/
-static int do_isofs_readdir(struct inode *inode, struct file *filp,
- void *dirent, filldir_t filldir,
+static int do_isofs_readdir(struct inode *inode, struct file *file,
+ struct dir_context *ctx,
char *tmpname, struct iso_directory_record *tmpde)
{
unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
@@ -94,10 +94,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
struct iso_directory_record *de;
struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
- offset = filp->f_pos & (bufsize - 1);
- block = filp->f_pos >> bufbits;
+ offset = ctx->pos & (bufsize - 1);
+ block = ctx->pos >> bufbits;
- while (filp->f_pos < inode->i_size) {
+ while (ctx->pos < inode->i_size) {
int de_len;
if (!bh) {
@@ -108,7 +108,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
de = (struct iso_directory_record *) (bh->b_data + offset);
- de_len = *(unsigned char *) de;
+ de_len = *(unsigned char *)de;
/*
* If the length byte is zero, we should move on to the next
@@ -119,8 +119,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
if (de_len == 0) {
brelse(bh);
bh = NULL;
- filp->f_pos = (filp->f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1);
- block = filp->f_pos >> bufbits;
+ ctx->pos = (ctx->pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1);
+ block = ctx->pos >> bufbits;
offset = 0;
continue;
}
@@ -164,16 +164,16 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
if (de->flags[-sbi->s_high_sierra] & 0x80) {
first_de = 0;
- filp->f_pos += de_len;
+ ctx->pos += de_len;
continue;
}
first_de = 1;
/* Handle the case of the '.' directory */
if (de->name_len[0] == 1 && de->name[0] == 0) {
- if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0)
+ if (!dir_emit_dot(file, ctx))
break;
- filp->f_pos += de_len;
+ ctx->pos += de_len;
continue;
}
@@ -181,10 +181,9 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
/* Handle the case of the '..' directory */
if (de->name_len[0] == 1 && de->name[0] == 1) {
- inode_number = parent_ino(filp->f_path.dentry);
- if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0)
+ if (!dir_emit_dotdot(file, ctx))
break;
- filp->f_pos += de_len;
+ ctx->pos += de_len;
continue;
}
@@ -198,7 +197,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) ||
(!sbi->s_showassoc &&
(de->flags[-sbi->s_high_sierra] & 4))) {
- filp->f_pos += de_len;
+ ctx->pos += de_len;
continue;
}
@@ -230,10 +229,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
}
}
if (len > 0) {
- if (filldir(dirent, p, len, filp->f_pos, inode_number, DT_UNKNOWN) < 0)
+ if (!dir_emit(ctx, p, len, inode_number, DT_UNKNOWN))
break;
}
- filp->f_pos += de_len;
+ ctx->pos += de_len;
continue;
}
@@ -247,13 +246,12 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
* handling split directory entries.. The real work is done by
* "do_isofs_readdir()".
*/
-static int isofs_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
+static int isofs_readdir(struct file *file, struct dir_context *ctx)
{
int result;
char *tmpname;
struct iso_directory_record *tmpde;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
tmpname = (char *)__get_free_page(GFP_KERNEL);
if (tmpname == NULL)
@@ -261,7 +259,7 @@ static int isofs_readdir(struct file *filp,
tmpde = (struct iso_directory_record *) (tmpname+1024);
- result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde);
+ result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde);
free_page((unsigned long) tmpname);
return result;
@@ -271,7 +269,7 @@ const struct file_operations isofs_dir_operations =
{
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = isofs_readdir,
+ .iterate = isofs_readdir,
};
/*
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d9b8aebdeb22..c348d6d88624 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -28,31 +28,23 @@
#define BEQUIET
-static int isofs_hashi(const struct dentry *parent, const struct inode *inode,
- struct qstr *qstr);
-static int isofs_hash(const struct dentry *parent, const struct inode *inode,
- struct qstr *qstr);
+static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
+static int isofs_hash(const struct dentry *parent, struct qstr *qstr);
static int isofs_dentry_cmpi(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+ const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
static int isofs_dentry_cmp(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+ const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
#ifdef CONFIG_JOLIET
-static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode,
- struct qstr *qstr);
-static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
- struct qstr *qstr);
+static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
+static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr);
static int isofs_dentry_cmpi_ms(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+ const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
static int isofs_dentry_cmp_ms(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+ const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
#endif
@@ -265,30 +257,26 @@ static int isofs_dentry_cmp_common(
}
static int
-isofs_hash(const struct dentry *dentry, const struct inode *inode,
- struct qstr *qstr)
+isofs_hash(const struct dentry *dentry, struct qstr *qstr)
{
return isofs_hash_common(dentry, qstr, 0);
}
static int
-isofs_hashi(const struct dentry *dentry, const struct inode *inode,
- struct qstr *qstr)
+isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
{
return isofs_hashi_common(dentry, qstr, 0);
}
static int
-isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
return isofs_dentry_cmp_common(len, str, name, 0, 0);
}
static int
-isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
return isofs_dentry_cmp_common(len, str, name, 0, 1);
@@ -296,30 +284,26 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
#ifdef CONFIG_JOLIET
static int
-isofs_hash_ms(const struct dentry *dentry, const struct inode *inode,
- struct qstr *qstr)
+isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
{
return isofs_hash_common(dentry, qstr, 1);
}
static int
-isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode,
- struct qstr *qstr)
+isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr)
{
return isofs_hashi_common(dentry, qstr, 1);
}
static int
-isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+isofs_dentry_cmp_ms(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
return isofs_dentry_cmp_common(len, str, name, 1, 0);
}
static int
-isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+isofs_dentry_cmpi_ms(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
return isofs_dentry_cmp_common(len, str, name, 1, 1);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index c167028844ed..95295640d9c8 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -37,8 +37,7 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
qstr.name = compare;
qstr.len = dlen;
- return dentry->d_op->d_compare(NULL, NULL, NULL, NULL,
- dentry->d_name.len, dentry->d_name.name, &qstr);
+ return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
}
/*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 86b39b167c23..11bb11f48b3a 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -162,8 +162,17 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
for (i = 0; i < bufs; i++) {
wbuf[i]->b_end_io = end_buffer_write_sync;
- /* We use-up our safety reference in submit_bh() */
- submit_bh(write_op, wbuf[i]);
+ /*
+ * Here we write back pagecache data that may be mmapped. Since
+ * we cannot afford to clean the page and set PageWriteback
+ * here due to lock ordering (page lock ranks above transaction
+ * start), the data can change while IO is in flight. Tell the
+ * block layer it should bounce the bio pages if stable data
+ * during write is required.
+ *
+ * We use up our safety reference in submit_bh().
+ */
+ _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE);
}
}
@@ -667,7 +676,17 @@ start_journal_io:
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(write_op, bh);
+ /*
+ * In data=journal mode, here we can end up
+ * writing pagecache data that might be
+ * mmapped. Since we can't afford to clean the
+ * page and set PageWriteback (see the comment
+ * near the other use of _submit_bh()), the
+ * data can change while the write is in
+ * flight. Tell the block layer to bounce the
+ * bio pages if stable pages are required.
+ */
+ _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE);
}
cond_resched();
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 81cc7eaff863..6510d6355729 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -310,8 +310,6 @@ int journal_write_metadata_buffer(transaction_t *transaction,
new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
/* keep subsequent assertions sane */
- new_bh->b_state = 0;
- init_buffer(new_bh, NULL, NULL);
atomic_set(&new_bh->b_count, 1);
new_jh = journal_add_journal_head(new_bh); /* This sleeps */
@@ -564,6 +562,16 @@ int log_wait_commit(journal_t *journal, tid_t tid)
spin_unlock(&journal->j_state_lock);
#endif
spin_lock(&journal->j_state_lock);
+ /*
+ * Not running or committing trans? Must be already committed. This
+ * saves us from waiting for a *long* time when tid overflows.
+ */
+ if (!((journal->j_running_transaction &&
+ journal->j_running_transaction->t_tid == tid) ||
+ (journal->j_committing_transaction &&
+ journal->j_committing_transaction->t_tid == tid)))
+ goto out_unlock;
+
if (!tid_geq(journal->j_commit_waited, tid))
journal->j_commit_waited = tid;
while (tid_gt(tid, journal->j_commit_sequence)) {
@@ -575,6 +583,7 @@ int log_wait_commit(journal_t *journal, tid_t tid)
!tid_gt(tid, journal->j_commit_sequence));
spin_lock(&journal->j_state_lock);
}
+out_unlock:
spin_unlock(&journal->j_state_lock);
if (unlikely(is_journal_aborted(journal))) {
@@ -1845,7 +1854,7 @@ static struct journal_head *journal_alloc_journal_head(void)
#ifdef CONFIG_JBD_DEBUG
atomic_inc(&nr_journal_heads);
#endif
- ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
+ ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
if (ret == NULL) {
jbd_debug(1, "out of memory for journal_head\n");
printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
@@ -1853,7 +1862,7 @@ static struct journal_head *journal_alloc_journal_head(void)
while (ret == NULL) {
yield();
- ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
+ ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
}
}
return ret;
@@ -1915,10 +1924,8 @@ struct journal_head *journal_add_journal_head(struct buffer_head *bh)
struct journal_head *new_jh = NULL;
repeat:
- if (!buffer_jbd(bh)) {
+ if (!buffer_jbd(bh))
new_jh = journal_alloc_journal_head();
- memset(new_jh, 0, sizeof(*new_jh));
- }
jbd_lock_bh_journal_head(bh);
if (buffer_jbd(bh)) {
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 071d6905f0dd..be0c39b66fe0 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -245,7 +245,6 @@ static handle_t *new_handle(int nblocks)
handle_t *handle = jbd_alloc_handle(GFP_NOFS);
if (!handle)
return NULL;
- memset(handle, 0, sizeof(*handle));
handle->h_buffer_credits = nblocks;
handle->h_ref = 1;
@@ -2020,16 +2019,20 @@ zap_buffer_unlocked:
* void journal_invalidatepage() - invalidate a journal page
* @journal: journal to use for flush
* @page: page to flush
- * @offset: length of page to invalidate.
+ * @offset: offset of the range to invalidate
+ * @length: length of the range to invalidate
*
- * Reap page buffers containing data after offset in page.
+ * Reap page buffers containing data in specified range in page.
*/
void journal_invalidatepage(journal_t *journal,
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
+ unsigned int stop = offset + length;
unsigned int curr_off = 0;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int may_free = 1;
if (!PageLocked(page))
@@ -2037,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal,
if (!page_has_buffers(page))
return;
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
@@ -2046,11 +2051,14 @@ void journal_invalidatepage(journal_t *journal,
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ return;
+
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
may_free &= journal_unmap_buffer(journal, bh,
- offset > 0);
+ partial_page);
unlock_buffer(bh);
}
curr_off = next_off;
@@ -2058,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal,
} while (bh != head);
- if (!offset) {
+ if (!partial_page) {
if (may_free && try_to_free_buffers(page))
J_ASSERT(!page_has_buffers(page));
}
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index 69a48c2944da..5a9f5534d57b 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -20,7 +20,7 @@ config JBD2
config JBD2_DEBUG
bool "JBD2 (ext4) debugging support"
- depends on JBD2 && DEBUG_FS
+ depends on JBD2
help
If you are using the ext4 journaled file system (or
potentially any other filesystem/device using JBD2), this option
@@ -29,7 +29,7 @@ config JBD2_DEBUG
By default, the debugging output will be turned off.
If you select Y here, then you will be able to turn on debugging
- with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
+ with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
number between 1 and 5. The higher the number, the more debugging
output is generated. To turn debugging off again, do
- "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
+ "echo 0 > /sys/module/jbd2/parameters/jbd2_debug".
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index c78841ee81cf..7f34f4716165 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -120,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
int nblocks, space_left;
/* assert_spin_locked(&journal->j_state_lock); */
- nblocks = jbd_space_needed(journal);
- while (__jbd2_log_space_left(journal) < nblocks) {
+ nblocks = jbd2_space_needed(journal);
+ while (jbd2_log_space_left(journal) < nblocks) {
if (journal->j_flags & JBD2_ABORT)
return;
write_unlock(&journal->j_state_lock);
@@ -140,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
*/
write_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
- nblocks = jbd_space_needed(journal);
- space_left = __jbd2_log_space_left(journal);
+ nblocks = jbd2_space_needed(journal);
+ space_left = jbd2_log_space_left(journal);
if (space_left < nblocks) {
int chkpt = journal->j_checkpoint_transactions != NULL;
tid_t tid = 0;
@@ -156,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal)
/* We were able to recover space; yay! */
;
} else if (tid) {
+ /*
+ * jbd2_journal_commit_transaction() may want
+ * to take the checkpoint_mutex if JBD2_FLUSHED
+ * is set. So we need to temporarily drop it.
+ */
+ mutex_unlock(&journal->j_checkpoint_mutex);
jbd2_log_wait_commit(journal, tid);
+ write_lock(&journal->j_state_lock);
+ continue;
} else {
printk(KERN_ERR "%s: needed %d blocks and "
"only had %d space available\n",
@@ -625,10 +633,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
__jbd2_journal_drop_transaction(journal, transaction);
jbd2_journal_free_transaction(transaction);
-
- /* Just in case anybody was waiting for more transactions to be
- checkpointed... */
- wake_up(&journal->j_wait_logspace);
ret = 1;
out:
return ret;
@@ -690,9 +694,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
J_ASSERT(transaction->t_forget == NULL);
- J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
- J_ASSERT(transaction->t_log_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(atomic_read(&transaction->t_updates) == 0);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 750c70148eff..559bec1a37b4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -30,15 +30,22 @@
#include <trace/events/jbd2.h>
/*
- * Default IO end handler for temporary BJ_IO buffer_heads.
+ * IO end handler for temporary buffer_heads handling writes to the journal.
*/
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
+ struct buffer_head *orig_bh = bh->b_private;
+
BUFFER_TRACE(bh, "");
if (uptodate)
set_buffer_uptodate(bh);
else
clear_buffer_uptodate(bh);
+ if (orig_bh) {
+ clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&orig_bh->b_state, BH_Shadow);
+ }
unlock_buffer(bh);
}
@@ -85,8 +92,7 @@ nope:
__brelse(bh);
}
-static void jbd2_commit_block_csum_set(journal_t *j,
- struct journal_head *descriptor)
+static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
struct commit_header *h;
__u32 csum;
@@ -94,12 +100,11 @@ static void jbd2_commit_block_csum_set(journal_t *j,
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
- h = (struct commit_header *)(jh2bh(descriptor)->b_data);
+ h = (struct commit_header *)(bh->b_data);
h->h_chksum_type = 0;
h->h_chksum_size = 0;
h->h_chksum[0] = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
- j->j_blocksize);
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
h->h_chksum[0] = cpu_to_be32(csum);
}
@@ -116,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal,
struct buffer_head **cbh,
__u32 crc32_sum)
{
- struct journal_head *descriptor;
struct commit_header *tmp;
struct buffer_head *bh;
int ret;
@@ -127,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal,
if (is_journal_aborted(journal))
return 0;
- descriptor = jbd2_journal_get_descriptor_buffer(journal);
- if (!descriptor)
+ bh = jbd2_journal_get_descriptor_buffer(journal);
+ if (!bh)
return 1;
- bh = jh2bh(descriptor);
-
tmp = (struct commit_header *)bh->b_data;
tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
@@ -146,9 +148,9 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
}
- jbd2_commit_block_csum_set(journal, descriptor);
+ jbd2_commit_block_csum_set(journal, bh);
- JBUFFER_TRACE(descriptor, "submit commit block");
+ BUFFER_TRACE(bh, "submit commit block");
lock_buffer(bh);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
@@ -180,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal,
if (unlikely(!buffer_uptodate(bh)))
ret = -EIO;
put_bh(bh); /* One for getblk() */
- jbd2_journal_put_journal_head(bh2jh(bh));
return ret;
}
@@ -321,7 +322,7 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
}
static void jbd2_descr_block_csum_set(journal_t *j,
- struct journal_head *descriptor)
+ struct buffer_head *bh)
{
struct jbd2_journal_block_tail *tail;
__u32 csum;
@@ -329,12 +330,10 @@ static void jbd2_descr_block_csum_set(journal_t *j,
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
- tail = (struct jbd2_journal_block_tail *)
- (jh2bh(descriptor)->b_data + j->j_blocksize -
+ tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
sizeof(struct jbd2_journal_block_tail));
tail->t_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
- j->j_blocksize);
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
tail->t_checksum = cpu_to_be32(csum);
}
@@ -343,20 +342,21 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
{
struct page *page = bh->b_page;
__u8 *addr;
- __u32 csum;
+ __u32 csum32;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
sequence = cpu_to_be32(sequence);
addr = kmap_atomic(page);
- csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
- sizeof(sequence));
- csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data),
- bh->b_size);
+ csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+ sizeof(sequence));
+ csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
+ bh->b_size);
kunmap_atomic(addr);
- tag->t_checksum = cpu_to_be32(csum);
+ /* We only have space to store the lower 16 bits of the crc32c. */
+ tag->t_checksum = cpu_to_be16(csum32);
}
/*
* jbd2_journal_commit_transaction
@@ -368,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
{
struct transaction_stats_s stats;
transaction_t *commit_transaction;
- struct journal_head *jh, *new_jh, *descriptor;
+ struct journal_head *jh;
+ struct buffer_head *descriptor;
struct buffer_head **wbuf = journal->j_wbuf;
int bufs;
int flags;
@@ -382,7 +383,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
int space_left = 0;
int first_tag = 0;
int tag_flag;
- int i, to_free = 0;
+ int i;
int tag_bytes = journal_tag_bytes(journal);
struct buffer_head *cbh = NULL; /* For transactional checksums */
__u32 crc32_sum = ~0;
@@ -392,6 +393,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tid_t first_tid;
int update_tail;
int csum_size = 0;
+ LIST_HEAD(io_bufs);
+ LIST_HEAD(log_bufs);
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
csum_size = sizeof(struct jbd2_journal_block_tail);
@@ -424,13 +427,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(journal->j_committing_transaction == NULL);
commit_transaction = journal->j_running_transaction;
- J_ASSERT(commit_transaction->t_state == T_RUNNING);
trace_jbd2_start_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: starting commit of transaction %d\n",
commit_transaction->t_tid);
write_lock(&journal->j_state_lock);
+ J_ASSERT(commit_transaction->t_state == T_RUNNING);
commit_transaction->t_state = T_LOCKED;
trace_jbd2_commit_locking(journal, commit_transaction);
@@ -520,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
jbd2_journal_switch_revoke_table(journal);
+ /*
+ * Reserved credits cannot be claimed anymore, free them
+ */
+ atomic_sub(atomic_read(&journal->j_reserved_credits),
+ &commit_transaction->t_outstanding_credits);
+
trace_jbd2_commit_flushing(journal, commit_transaction);
stats.run.rs_flushing = jiffies;
stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -533,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
wake_up(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
- jbd_debug(3, "JBD2: commit phase 2\n");
+ jbd_debug(3, "JBD2: commit phase 2a\n");
/*
* Now start flushing things to disk, in the order they appear
@@ -545,10 +554,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
blk_start_plug(&plug);
jbd2_journal_write_revoke_records(journal, commit_transaction,
- WRITE_SYNC);
+ &log_bufs, WRITE_SYNC);
blk_finish_plug(&plug);
- jbd_debug(3, "JBD2: commit phase 2\n");
+ jbd_debug(3, "JBD2: commit phase 2b\n");
/*
* Way to go: we have now written out all of the data for a
@@ -571,8 +580,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
atomic_read(&commit_transaction->t_outstanding_credits));
err = 0;
- descriptor = NULL;
bufs = 0;
+ descriptor = NULL;
blk_start_plug(&plug);
while (commit_transaction->t_buffers) {
@@ -604,8 +613,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
record the metadata buffer. */
if (!descriptor) {
- struct buffer_head *bh;
-
J_ASSERT (bufs == 0);
jbd_debug(4, "JBD2: get descriptor\n");
@@ -616,26 +623,26 @@ void jbd2_journal_commit_transaction(journal_t *journal)
continue;
}
- bh = jh2bh(descriptor);
jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
- (unsigned long long)bh->b_blocknr, bh->b_data);
- header = (journal_header_t *)&bh->b_data[0];
+ (unsigned long long)descriptor->b_blocknr,
+ descriptor->b_data);
+ header = (journal_header_t *)descriptor->b_data;
header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
- tagp = &bh->b_data[sizeof(journal_header_t)];
- space_left = bh->b_size - sizeof(journal_header_t);
+ tagp = &descriptor->b_data[sizeof(journal_header_t)];
+ space_left = descriptor->b_size -
+ sizeof(journal_header_t);
first_tag = 1;
- set_buffer_jwrite(bh);
- set_buffer_dirty(bh);
- wbuf[bufs++] = bh;
+ set_buffer_jwrite(descriptor);
+ set_buffer_dirty(descriptor);
+ wbuf[bufs++] = descriptor;
/* Record it so that we can wait for IO
completion later */
- BUFFER_TRACE(bh, "ph3: file as descriptor");
- jbd2_journal_file_buffer(descriptor, commit_transaction,
- BJ_LogCtl);
+ BUFFER_TRACE(descriptor, "ph3: file as descriptor");
+ jbd2_file_log_bh(&log_bufs, descriptor);
}
/* Where is the buffer to be written? */
@@ -658,29 +665,22 @@ void jbd2_journal_commit_transaction(journal_t *journal)
/* Bump b_count to prevent truncate from stumbling over
the shadowed buffer! @@@ This can go if we ever get
- rid of the BJ_IO/BJ_Shadow pairing of buffers. */
+ rid of the shadow pairing of buffers. */
atomic_inc(&jh2bh(jh)->b_count);
- /* Make a temporary IO buffer with which to write it out
- (this will requeue both the metadata buffer and the
- temporary IO buffer). new_bh goes on BJ_IO*/
-
- set_bit(BH_JWrite, &jh2bh(jh)->b_state);
/*
- * akpm: jbd2_journal_write_metadata_buffer() sets
- * new_bh->b_transaction to commit_transaction.
- * We need to clean this up before we release new_bh
- * (which is of type BJ_IO)
+ * Make a temporary IO buffer with which to write it out
+ * (this will requeue the metadata buffer to BJ_Shadow).
*/
+ set_bit(BH_JWrite, &jh2bh(jh)->b_state);
JBUFFER_TRACE(jh, "ph3: write metadata");
flags = jbd2_journal_write_metadata_buffer(commit_transaction,
- jh, &new_jh, blocknr);
+ jh, &wbuf[bufs], blocknr);
if (flags < 0) {
jbd2_journal_abort(journal, flags);
continue;
}
- set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
- wbuf[bufs++] = jh2bh(new_jh);
+ jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
/* Record the new block's tag in the current descriptor
buffer */
@@ -694,10 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tag = (journal_block_tag_t *) tagp;
write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
tag->t_flags = cpu_to_be16(tag_flag);
- jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh),
+ jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
commit_transaction->t_tid);
tagp += tag_bytes;
space_left -= tag_bytes;
+ bufs++;
if (first_tag) {
memcpy (tagp, journal->j_uuid, 16);
@@ -809,7 +810,7 @@ start_journal_io:
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
transaction's t_log_list queue, and metadata buffers are on
- the t_iobuf_list queue.
+ the io_bufs list.
Wait for the buffers in reverse order. That way we are
less likely to be woken up until all IOs have completed, and
@@ -818,47 +819,33 @@ start_journal_io:
jbd_debug(3, "JBD2: commit phase 3\n");
- /*
- * akpm: these are BJ_IO, and j_list_lock is not needed.
- * See __journal_try_to_free_buffer.
- */
-wait_for_iobuf:
- while (commit_transaction->t_iobuf_list != NULL) {
- struct buffer_head *bh;
+ while (!list_empty(&io_bufs)) {
+ struct buffer_head *bh = list_entry(io_bufs.prev,
+ struct buffer_head,
+ b_assoc_buffers);
- jh = commit_transaction->t_iobuf_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_iobuf;
- }
- if (cond_resched())
- goto wait_for_iobuf;
+ wait_on_buffer(bh);
+ cond_resched();
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
-
- clear_buffer_jwrite(bh);
-
- JBUFFER_TRACE(jh, "ph4: unfile after journal write");
- jbd2_journal_unfile_buffer(journal, jh);
+ jbd2_unfile_log_bh(bh);
/*
- * ->t_iobuf_list should contain only dummy buffer_heads
- * which were created by jbd2_journal_write_metadata_buffer().
+ * The list contains temporary buffer heads created by
+ * jbd2_journal_write_metadata_buffer().
*/
BUFFER_TRACE(bh, "dumping temporary bh");
- jbd2_journal_put_journal_head(jh);
__brelse(bh);
J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
free_buffer_head(bh);
- /* We also have to unlock and free the corresponding
- shadowed buffer */
+ /* We also have to refile the corresponding shadowed buffer */
jh = commit_transaction->t_shadow_list->b_tprev;
bh = jh2bh(jh);
- clear_bit(BH_JWrite, &bh->b_state);
+ clear_buffer_jwrite(bh);
J_ASSERT_BH(bh, buffer_jbddirty(bh));
+ J_ASSERT_BH(bh, !buffer_shadow(bh));
/* The metadata is now released for reuse, but we need
to remember it against this transaction so that when
@@ -866,14 +853,6 @@ wait_for_iobuf:
required. */
JBUFFER_TRACE(jh, "file as BJ_Forget");
jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
- /*
- * Wake up any transactions which were waiting for this IO to
- * complete. The barrier must be here so that changes by
- * jbd2_journal_file_buffer() take effect before wake_up_bit()
- * does the waitqueue check.
- */
- smp_mb();
- wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh);
}
@@ -883,26 +862,19 @@ wait_for_iobuf:
jbd_debug(3, "JBD2: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
- wait_for_ctlbuf:
- while (commit_transaction->t_log_list != NULL) {
+ while (!list_empty(&log_bufs)) {
struct buffer_head *bh;
- jh = commit_transaction->t_log_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_ctlbuf;
- }
- if (cond_resched())
- goto wait_for_ctlbuf;
+ bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
+ wait_on_buffer(bh);
+ cond_resched();
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
clear_buffer_jwrite(bh);
- jbd2_journal_unfile_buffer(journal, jh);
- jbd2_journal_put_journal_head(jh);
+ jbd2_unfile_log_bh(bh);
__brelse(bh); /* One for getblk */
/* AKPM: bforget here */
}
@@ -952,9 +924,7 @@ wait_for_iobuf:
J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
- J_ASSERT(commit_transaction->t_iobuf_list == NULL);
J_ASSERT(commit_transaction->t_shadow_list == NULL);
- J_ASSERT(commit_transaction->t_log_list == NULL);
restart_loop:
/*
@@ -1134,7 +1104,7 @@ restart_loop:
journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
spin_unlock(&journal->j_history_lock);
- commit_transaction->t_state = T_FINISHED;
+ commit_transaction->t_state = T_COMMIT_CALLBACK;
J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid;
journal->j_committing_transaction = NULL;
@@ -1149,38 +1119,44 @@ restart_loop:
journal->j_average_commit_time*3) / 4;
else
journal->j_average_commit_time = commit_time;
+
write_unlock(&journal->j_state_lock);
- if (commit_transaction->t_checkpoint_list == NULL &&
- commit_transaction->t_checkpoint_io_list == NULL) {
- __jbd2_journal_drop_transaction(journal, commit_transaction);
- to_free = 1;
+ if (journal->j_checkpoint_transactions == NULL) {
+ journal->j_checkpoint_transactions = commit_transaction;
+ commit_transaction->t_cpnext = commit_transaction;
+ commit_transaction->t_cpprev = commit_transaction;
} else {
- if (journal->j_checkpoint_transactions == NULL) {
- journal->j_checkpoint_transactions = commit_transaction;
- commit_transaction->t_cpnext = commit_transaction;
- commit_transaction->t_cpprev = commit_transaction;
- } else {
- commit_transaction->t_cpnext =
- journal->j_checkpoint_transactions;
- commit_transaction->t_cpprev =
- commit_transaction->t_cpnext->t_cpprev;
- commit_transaction->t_cpnext->t_cpprev =
- commit_transaction;
- commit_transaction->t_cpprev->t_cpnext =
+ commit_transaction->t_cpnext =
+ journal->j_checkpoint_transactions;
+ commit_transaction->t_cpprev =
+ commit_transaction->t_cpnext->t_cpprev;
+ commit_transaction->t_cpnext->t_cpprev =
+ commit_transaction;
+ commit_transaction->t_cpprev->t_cpnext =
commit_transaction;
- }
}
spin_unlock(&journal->j_list_lock);
-
+ /* Drop all spin_locks because commit_callback may block.
+ * __journal_remove_checkpoint() cannot destroy the transaction
+ * under us because it is not marked as T_FINISHED yet */
if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction);
trace_jbd2_end_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
- if (to_free)
- jbd2_journal_free_transaction(commit_transaction);
+ write_lock(&journal->j_state_lock);
+ spin_lock(&journal->j_list_lock);
+ commit_transaction->t_state = T_FINISHED;
+ /* Recheck checkpoint lists after j_list_lock was dropped */
+ if (commit_transaction->t_checkpoint_list == NULL &&
+ commit_transaction->t_checkpoint_io_list == NULL) {
+ __jbd2_journal_drop_transaction(journal, commit_transaction);
+ jbd2_journal_free_transaction(commit_transaction);
+ }
+ spin_unlock(&journal->j_list_lock);
+ write_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_done_commit);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ed10991ab006..02c7ad9d7a41 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -103,6 +103,24 @@ EXPORT_SYMBOL(jbd2_inode_cache);
static void __journal_abort_soft (journal_t *journal, int errno);
static int jbd2_journal_create_slab(size_t slab_size);
+#ifdef CONFIG_JBD2_DEBUG
+void __jbd2_debug(int level, const char *file, const char *func,
+ unsigned int line, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (level > jbd2_journal_enable_debug)
+ return;
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
+ va_end(args);
+}
+EXPORT_SYMBOL(__jbd2_debug);
+#endif
+
/* Checksumming functions */
int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
{
@@ -310,14 +328,12 @@ static void journal_kill_thread(journal_t *journal)
*
* If the source buffer has already been modified by a new transaction
* since we took the last commit snapshot, we use the frozen copy of
- * that data for IO. If we end up using the existing buffer_head's data
- * for the write, then we *have* to lock the buffer to prevent anyone
- * else from using and possibly modifying it while the IO is in
- * progress.
- *
- * The function returns a pointer to the buffer_heads to be used for IO.
+ * that data for IO. If we end up using the existing buffer_head's data
+ * for the write, then we have to make sure nobody modifies it while the
+ * IO is in progress. do_get_write_access() handles this.
*
- * We assume that the journal has already been locked in this function.
+ * The function returns a pointer to the buffer_head to be used for IO.
+ *
*
* Return value:
* <0: Error
@@ -330,15 +346,14 @@ static void journal_kill_thread(journal_t *journal)
int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct journal_head *jh_in,
- struct journal_head **jh_out,
- unsigned long long blocknr)
+ struct buffer_head **bh_out,
+ sector_t blocknr)
{
int need_copy_out = 0;
int done_copy_out = 0;
int do_escape = 0;
char *mapped_data;
struct buffer_head *new_bh;
- struct journal_head *new_jh;
struct page *new_page;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
@@ -367,17 +382,14 @@ retry_alloc:
}
/* keep subsequent assertions sane */
- new_bh->b_state = 0;
- init_buffer(new_bh, NULL, NULL);
atomic_set(&new_bh->b_count, 1);
- new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
+ jbd_lock_bh_state(bh_in);
+repeat:
/*
* If a new transaction has already done a buffer copy-out, then
* we use that version of the data for the commit.
*/
- jbd_lock_bh_state(bh_in);
-repeat:
if (jh_in->b_frozen_data) {
done_copy_out = 1;
new_page = virt_to_page(jh_in->b_frozen_data);
@@ -417,7 +429,7 @@ repeat:
jbd_unlock_bh_state(bh_in);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
if (!tmp) {
- jbd2_journal_put_journal_head(new_jh);
+ brelse(new_bh);
return -ENOMEM;
}
jbd_lock_bh_state(bh_in);
@@ -428,7 +440,7 @@ repeat:
jh_in->b_frozen_data = tmp;
mapped_data = kmap_atomic(new_page);
- memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
+ memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
kunmap_atomic(mapped_data);
new_page = virt_to_page(tmp);
@@ -454,14 +466,14 @@ repeat:
}
set_bh_page(new_bh, new_page, new_offset);
- new_jh->b_transaction = NULL;
- new_bh->b_size = jh2bh(jh_in)->b_size;
- new_bh->b_bdev = transaction->t_journal->j_dev;
+ new_bh->b_size = bh_in->b_size;
+ new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr;
+ new_bh->b_private = bh_in;
set_buffer_mapped(new_bh);
set_buffer_dirty(new_bh);
- *jh_out = new_jh;
+ *bh_out = new_bh;
/*
* The to-be-written buffer needs to get moved to the io queue,
@@ -472,11 +484,9 @@ repeat:
spin_lock(&journal->j_list_lock);
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock);
+ set_buffer_shadow(bh_in);
jbd_unlock_bh_state(bh_in);
- JBUFFER_TRACE(new_jh, "file as BJ_IO");
- jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
-
return do_escape | (done_copy_out << 1);
}
@@ -486,35 +496,6 @@ repeat:
*/
/*
- * __jbd2_log_space_left: Return the number of free blocks left in the journal.
- *
- * Called with the journal already locked.
- *
- * Called under j_state_lock
- */
-
-int __jbd2_log_space_left(journal_t *journal)
-{
- int left = journal->j_free;
-
- /* assert_spin_locked(&journal->j_state_lock); */
-
- /*
- * Be pessimistic here about the number of those free blocks which
- * might be required for log descriptor control blocks.
- */
-
-#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
-
- left -= MIN_LOG_RESERVED_BLOCKS;
-
- if (left <= 0)
- return 0;
- left -= (left >> 3);
- return left;
-}
-
-/*
* Called with j_state_lock locked for writing.
* Returns true if a transaction commit was started.
*/
@@ -566,20 +547,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
}
/*
- * Force and wait upon a commit if the calling process is not within
- * transaction. This is used for forcing out undo-protected data which contains
- * bitmaps, when the fs is running out of space.
- *
- * We can only force the running transaction if we don't have an active handle;
- * otherwise, we will deadlock.
- *
- * Returns true if a transaction was started.
+ * Force and wait any uncommitted transactions. We can only force the running
+ * transaction if we don't have an active handle, otherwise, we will deadlock.
+ * Returns: <0 in case of error,
+ * 0 if nothing to commit,
+ * 1 if transaction was successfully committed.
*/
-int jbd2_journal_force_commit_nested(journal_t *journal)
+static int __jbd2_journal_force_commit(journal_t *journal)
{
transaction_t *transaction = NULL;
tid_t tid;
- int need_to_start = 0;
+ int need_to_start = 0, ret = 0;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction && !current->journal_info) {
@@ -590,16 +568,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
transaction = journal->j_committing_transaction;
if (!transaction) {
+ /* Nothing to commit */
read_unlock(&journal->j_state_lock);
- return 0; /* Nothing to retry */
+ return 0;
}
-
tid = transaction->t_tid;
read_unlock(&journal->j_state_lock);
if (need_to_start)
jbd2_log_start_commit(journal, tid);
- jbd2_log_wait_commit(journal, tid);
- return 1;
+ ret = jbd2_log_wait_commit(journal, tid);
+ if (!ret)
+ ret = 1;
+
+ return ret;
+}
+
+/**
+ * Force and wait upon a commit if the calling process is not within
+ * transaction. This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
+ *
+ * @journal: journal to force
+ * Returns true if progress was made.
+ */
+int jbd2_journal_force_commit_nested(journal_t *journal)
+{
+ int ret;
+
+ ret = __jbd2_journal_force_commit(journal);
+ return ret > 0;
+}
+
+/**
+ * int jbd2_journal_force_commit() - force any uncommitted transactions
+ * @journal: journal to force
+ *
+ * Caller wants an unconditional commit. We can only force the running
+ * transaction if we don't have an active handle; otherwise we will deadlock.
+ */
+int jbd2_journal_force_commit(journal_t *journal)
+{
+ int ret;
+
+ J_ASSERT(!current->journal_info);
+ ret = __jbd2_journal_force_commit(journal);
+ if (ret > 0)
+ ret = 0;
+ return ret;
}
/*
@@ -710,6 +725,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
}
/*
+ * When this function returns the transaction corresponding to tid
+ * will be completed. If the transaction is currently running, start
+ * committing that transaction before waiting for it to complete. If
+ * the transaction id is stale, it is by definition already completed,
+ * so just return SUCCESS.
+ */
+int jbd2_complete_transaction(journal_t *journal, tid_t tid)
+{
+ int need_to_wait = 1;
+
+ read_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction &&
+ journal->j_running_transaction->t_tid == tid) {
+ if (journal->j_commit_request != tid) {
+ /* transaction not yet started, so request it */
+ read_unlock(&journal->j_state_lock);
+ jbd2_log_start_commit(journal, tid);
+ goto wait_commit;
+ }
+ } else if (!(journal->j_committing_transaction &&
+ journal->j_committing_transaction->t_tid == tid))
+ need_to_wait = 0;
+ read_unlock(&journal->j_state_lock);
+ if (!need_to_wait)
+ return 0;
+wait_commit:
+ return jbd2_log_wait_commit(journal, tid);
+}
+EXPORT_SYMBOL(jbd2_complete_transaction);
+
+/*
* Log buffer allocation routines:
*/
@@ -769,7 +815,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
* But we don't bother doing that, so there will be coherency problems with
* mmaps of blockdevs which hold live JBD-controlled filesystems.
*/
-struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
+struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
{
struct buffer_head *bh;
unsigned long long blocknr;
@@ -788,7 +834,7 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
set_buffer_uptodate(bh);
unlock_buffer(bh);
BUFFER_TRACE(bh, "return this buffer");
- return jbd2_journal_add_journal_head(bh);
+ return bh;
}
/*
@@ -950,7 +996,7 @@ static const struct seq_operations jbd2_seq_info_ops = {
static int jbd2_seq_info_open(struct inode *inode, struct file *file)
{
- journal_t *journal = PDE(inode)->data;
+ journal_t *journal = PDE_DATA(inode);
struct jbd2_stats_proc_session *s;
int rc, size;
@@ -1033,11 +1079,10 @@ static journal_t * journal_init_common (void)
return NULL;
init_waitqueue_head(&journal->j_wait_transaction_locked);
- init_waitqueue_head(&journal->j_wait_logspace);
init_waitqueue_head(&journal->j_wait_done_commit);
- init_waitqueue_head(&journal->j_wait_checkpoint);
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
+ init_waitqueue_head(&journal->j_wait_reserved);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
@@ -1047,6 +1092,7 @@ static journal_t * journal_init_common (void)
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
journal->j_min_batch_time = 0;
journal->j_max_batch_time = 15000; /* 15ms */
+ atomic_set(&journal->j_reserved_credits, 0);
/* The journal is marked for error until we succeed with recovery! */
journal->j_flags = JBD2_ABORT;
@@ -1289,6 +1335,7 @@ static int journal_reset(journal_t *journal)
static void jbd2_write_superblock(journal_t *journal, int write_op)
{
struct buffer_head *bh = journal->j_sb_buffer;
+ journal_superblock_t *sb = journal->j_superblock;
int ret;
trace_jbd2_write_superblock(journal, write_op);
@@ -1310,6 +1357,7 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
clear_buffer_write_io_error(bh);
set_buffer_uptodate(bh);
}
+ jbd2_superblock_csum_set(journal, sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
ret = submit_bh(write_op, bh);
@@ -1406,7 +1454,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
journal->j_errno);
sb->s_errno = cpu_to_be32(journal->j_errno);
- jbd2_superblock_csum_set(journal, sb);
read_unlock(&journal->j_state_lock);
jbd2_write_superblock(journal, WRITE_SYNC);
@@ -2296,13 +2343,13 @@ static struct journal_head *journal_alloc_journal_head(void)
#ifdef CONFIG_JBD2_DEBUG
atomic_inc(&nr_journal_heads);
#endif
- ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+ ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
if (!ret) {
jbd_debug(1, "out of memory for journal_head\n");
pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
while (!ret) {
yield();
- ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+ ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
}
}
return ret;
@@ -2364,10 +2411,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
struct journal_head *new_jh = NULL;
repeat:
- if (!buffer_jbd(bh)) {
+ if (!buffer_jbd(bh))
new_jh = journal_alloc_journal_head();
- memset(new_jh, 0, sizeof(*new_jh));
- }
jbd_lock_bh_journal_head(bh);
if (buffer_jbd(bh)) {
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 626846bac32f..d4851464b57e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -399,18 +399,17 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
void *buf, __u32 sequence)
{
- __u32 provided, calculated;
+ __u32 csum32;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return 1;
sequence = cpu_to_be32(sequence);
- calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
- sizeof(sequence));
- calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize);
- provided = be32_to_cpu(tag->t_checksum);
+ csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+ sizeof(sequence));
+ csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
- return provided == cpu_to_be32(calculated);
+ return tag->t_checksum == cpu_to_be16(csum32);
}
static int do_one_pass(journal_t *journal,
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f30b80b4ce8b..198c9c10276d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,9 +122,10 @@ struct jbd2_revoke_table_s
#ifdef __KERNEL__
static void write_one_revoke_record(journal_t *, transaction_t *,
- struct journal_head **, int *,
+ struct list_head *,
+ struct buffer_head **, int *,
struct jbd2_revoke_record_s *, int);
-static void flush_descriptor(journal_t *, struct journal_head *, int, int);
+static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
#endif
/* Utility functions to maintain the revoke table */
@@ -531,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
*/
void jbd2_journal_write_revoke_records(journal_t *journal,
transaction_t *transaction,
+ struct list_head *log_bufs,
int write_op)
{
- struct journal_head *descriptor;
+ struct buffer_head *descriptor;
struct jbd2_revoke_record_s *record;
struct jbd2_revoke_table_s *revoke;
struct list_head *hash_list;
@@ -553,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
while (!list_empty(hash_list)) {
record = (struct jbd2_revoke_record_s *)
hash_list->next;
- write_one_revoke_record(journal, transaction,
+ write_one_revoke_record(journal, transaction, log_bufs,
&descriptor, &offset,
record, write_op);
count++;
@@ -573,13 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
static void write_one_revoke_record(journal_t *journal,
transaction_t *transaction,
- struct journal_head **descriptorp,
+ struct list_head *log_bufs,
+ struct buffer_head **descriptorp,
int *offsetp,
struct jbd2_revoke_record_s *record,
int write_op)
{
int csum_size = 0;
- struct journal_head *descriptor;
+ struct buffer_head *descriptor;
int offset;
journal_header_t *header;
@@ -609,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal,
descriptor = jbd2_journal_get_descriptor_buffer(journal);
if (!descriptor)
return;
- header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
+ header = (journal_header_t *)descriptor->b_data;
header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
header->h_sequence = cpu_to_be32(transaction->t_tid);
/* Record it so that we can wait for IO completion later */
- JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
- jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl);
+ BUFFER_TRACE(descriptor, "file in log_bufs");
+ jbd2_file_log_bh(log_bufs, descriptor);
offset = sizeof(jbd2_journal_revoke_header_t);
*descriptorp = descriptor;
}
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
- * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) =
+ * ((__be64 *)(&descriptor->b_data[offset])) =
cpu_to_be64(record->blocknr);
offset += 8;
} else {
- * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
+ * ((__be32 *)(&descriptor->b_data[offset])) =
cpu_to_be32(record->blocknr);
offset += 4;
}
@@ -636,8 +639,7 @@ static void write_one_revoke_record(journal_t *journal,
*offsetp = offset;
}
-static void jbd2_revoke_csum_set(journal_t *j,
- struct journal_head *descriptor)
+static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
{
struct jbd2_journal_revoke_tail *tail;
__u32 csum;
@@ -645,12 +647,10 @@ static void jbd2_revoke_csum_set(journal_t *j,
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
- tail = (struct jbd2_journal_revoke_tail *)
- (jh2bh(descriptor)->b_data + j->j_blocksize -
+ tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
sizeof(struct jbd2_journal_revoke_tail));
tail->r_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
- j->j_blocksize);
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
tail->r_checksum = cpu_to_be32(csum);
}
@@ -662,25 +662,24 @@ static void jbd2_revoke_csum_set(journal_t *j,
*/
static void flush_descriptor(journal_t *journal,
- struct journal_head *descriptor,
+ struct buffer_head *descriptor,
int offset, int write_op)
{
jbd2_journal_revoke_header_t *header;
- struct buffer_head *bh = jh2bh(descriptor);
if (is_journal_aborted(journal)) {
- put_bh(bh);
+ put_bh(descriptor);
return;
}
- header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
+ header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
header->r_count = cpu_to_be32(offset);
jbd2_revoke_csum_set(journal, descriptor);
- set_buffer_jwrite(bh);
- BUFFER_TRACE(bh, "write");
- set_buffer_dirty(bh);
- write_dirty_buffer(bh, write_op);
+ set_buffer_jwrite(descriptor);
+ BUFFER_TRACE(descriptor, "write");
+ set_buffer_dirty(descriptor);
+ write_dirty_buffer(descriptor, write_op);
}
#endif
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 325bc019ed88..7aa9a32573bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -89,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
atomic_set(&transaction->t_updates, 0);
- atomic_set(&transaction->t_outstanding_credits, 0);
+ atomic_set(&transaction->t_outstanding_credits,
+ atomic_read(&journal->j_reserved_credits));
atomic_set(&transaction->t_handle_count, 0);
INIT_LIST_HEAD(&transaction->t_inode_list);
INIT_LIST_HEAD(&transaction->t_private_list);
@@ -141,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction,
}
/*
+ * Wait until running transaction passes T_LOCKED state. Also starts the commit
+ * if needed. The function expects running transaction to exist and releases
+ * j_state_lock.
+ */
+static void wait_transaction_locked(journal_t *journal)
+ __releases(journal->j_state_lock)
+{
+ DEFINE_WAIT(wait);
+ int need_to_start;
+ tid_t tid = journal->j_running_transaction->t_tid;
+
+ prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ TASK_UNINTERRUPTIBLE);
+ need_to_start = !tid_geq(journal->j_commit_request, tid);
+ read_unlock(&journal->j_state_lock);
+ if (need_to_start)
+ jbd2_log_start_commit(journal, tid);
+ schedule();
+ finish_wait(&journal->j_wait_transaction_locked, &wait);
+}
+
+static void sub_reserved_credits(journal_t *journal, int blocks)
+{
+ atomic_sub(blocks, &journal->j_reserved_credits);
+ wake_up(&journal->j_wait_reserved);
+}
+
+/*
+ * Wait until we can add credits for handle to the running transaction. Called
+ * with j_state_lock held for reading. Returns 0 if handle joined the running
+ * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
+ * caller must retry.
+ */
+static int add_transaction_credits(journal_t *journal, int blocks,
+ int rsv_blocks)
+{
+ transaction_t *t = journal->j_running_transaction;
+ int needed;
+ int total = blocks + rsv_blocks;
+
+ /*
+ * If the current transaction is locked down for commit, wait
+ * for the lock to be released.
+ */
+ if (t->t_state == T_LOCKED) {
+ wait_transaction_locked(journal);
+ return 1;
+ }
+
+ /*
+ * If there is not enough space left in the log to write all
+ * potential buffers requested by this operation, we need to
+ * stall pending a log checkpoint to free some more log space.
+ */
+ needed = atomic_add_return(total, &t->t_outstanding_credits);
+ if (needed > journal->j_max_transaction_buffers) {
+ /*
+ * If the current transaction is already too large,
+ * then start to commit it: we can then go back and
+ * attach this handle to a new transaction.
+ */
+ atomic_sub(total, &t->t_outstanding_credits);
+ wait_transaction_locked(journal);
+ return 1;
+ }
+
+ /*
+ * The commit code assumes that it can get enough log space
+ * without forcing a checkpoint. This is *critical* for
+ * correctness: a checkpoint of a buffer which is also
+ * associated with a committing transaction creates a deadlock,
+ * so commit simply cannot force through checkpoints.
+ *
+ * We must therefore ensure the necessary space in the journal
+ * *before* starting to dirty potentially checkpointed buffers
+ * in the new transaction.
+ */
+ if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+ atomic_sub(total, &t->t_outstanding_credits);
+ read_unlock(&journal->j_state_lock);
+ write_lock(&journal->j_state_lock);
+ if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+ __jbd2_log_wait_for_space(journal);
+ write_unlock(&journal->j_state_lock);
+ return 1;
+ }
+
+ /* No reservation? We are done... */
+ if (!rsv_blocks)
+ return 0;
+
+ needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
+ /* We allow at most half of a transaction to be reserved */
+ if (needed > journal->j_max_transaction_buffers / 2) {
+ sub_reserved_credits(journal, rsv_blocks);
+ atomic_sub(total, &t->t_outstanding_credits);
+ read_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_reserved,
+ atomic_read(&journal->j_reserved_credits) + rsv_blocks
+ <= journal->j_max_transaction_buffers / 2);
+ return 1;
+ }
+ return 0;
+}
+
+/*
* start_this_handle: Given a handle, deal with any locking or stalling
* needed to make sure that there is enough journal space for the handle
* to begin. Attach the handle to a transaction and set up the
@@ -151,18 +258,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
gfp_t gfp_mask)
{
transaction_t *transaction, *new_transaction = NULL;
- tid_t tid;
- int needed, need_to_start;
- int nblocks = handle->h_buffer_credits;
+ int blocks = handle->h_buffer_credits;
+ int rsv_blocks = 0;
unsigned long ts = jiffies;
- if (nblocks > journal->j_max_transaction_buffers) {
+ /*
+ * 1/2 of transaction can be reserved so we can practically handle
+ * only 1/2 of maximum transaction size per operation
+ */
+ if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
- current->comm, nblocks,
- journal->j_max_transaction_buffers);
+ current->comm, blocks,
+ journal->j_max_transaction_buffers / 2);
return -ENOSPC;
}
+ if (handle->h_rsv_handle)
+ rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+
alloc_transaction:
if (!journal->j_running_transaction) {
new_transaction = kmem_cache_zalloc(transaction_cache,
@@ -199,8 +312,12 @@ repeat:
return -EROFS;
}
- /* Wait on the journal's transaction barrier if necessary */
- if (journal->j_barrier_count) {
+ /*
+ * Wait on the journal's transaction barrier if necessary. Specifically
+ * we allow reserved handles to proceed because otherwise commit could
+ * deadlock on page writeback not being able to complete.
+ */
+ if (!handle->h_reserved && journal->j_barrier_count) {
read_unlock(&journal->j_state_lock);
wait_event(journal->j_wait_transaction_locked,
journal->j_barrier_count == 0);
@@ -213,7 +330,7 @@ repeat:
goto alloc_transaction;
write_lock(&journal->j_state_lock);
if (!journal->j_running_transaction &&
- !journal->j_barrier_count) {
+ (handle->h_reserved || !journal->j_barrier_count)) {
jbd2_get_transaction(journal, new_transaction);
new_transaction = NULL;
}
@@ -223,85 +340,18 @@ repeat:
transaction = journal->j_running_transaction;
- /*
- * If the current transaction is locked down for commit, wait for the
- * lock to be released.
- */
- if (transaction->t_state == T_LOCKED) {
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&journal->j_wait_transaction_locked,
- &wait, TASK_UNINTERRUPTIBLE);
- read_unlock(&journal->j_state_lock);
- schedule();
- finish_wait(&journal->j_wait_transaction_locked, &wait);
- goto repeat;
- }
-
- /*
- * If there is not enough space left in the log to write all potential
- * buffers requested by this operation, we need to stall pending a log
- * checkpoint to free some more log space.
- */
- needed = atomic_add_return(nblocks,
- &transaction->t_outstanding_credits);
-
- if (needed > journal->j_max_transaction_buffers) {
+ if (!handle->h_reserved) {
+ /* We may have dropped j_state_lock - restart in that case */
+ if (add_transaction_credits(journal, blocks, rsv_blocks))
+ goto repeat;
+ } else {
/*
- * If the current transaction is already too large, then start
- * to commit it: we can then go back and attach this handle to
- * a new transaction.
+ * We have handle reserved so we are allowed to join T_LOCKED
+ * transaction and we don't have to check for transaction size
+ * and journal space.
*/
- DEFINE_WAIT(wait);
-
- jbd_debug(2, "Handle %p starting new commit...\n", handle);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
- TASK_UNINTERRUPTIBLE);
- tid = transaction->t_tid;
- need_to_start = !tid_geq(journal->j_commit_request, tid);
- read_unlock(&journal->j_state_lock);
- if (need_to_start)
- jbd2_log_start_commit(journal, tid);
- schedule();
- finish_wait(&journal->j_wait_transaction_locked, &wait);
- goto repeat;
- }
-
- /*
- * The commit code assumes that it can get enough log space
- * without forcing a checkpoint. This is *critical* for
- * correctness: a checkpoint of a buffer which is also
- * associated with a committing transaction creates a deadlock,
- * so commit simply cannot force through checkpoints.
- *
- * We must therefore ensure the necessary space in the journal
- * *before* starting to dirty potentially checkpointed buffers
- * in the new transaction.
- *
- * The worst part is, any transaction currently committing can
- * reduce the free space arbitrarily. Be careful to account for
- * those buffers when checkpointing.
- */
-
- /*
- * @@@ AKPM: This seems rather over-defensive. We're giving commit
- * a _lot_ of headroom: 1/4 of the journal plus the size of
- * the committing transaction. Really, we only need to give it
- * committing_transaction->t_outstanding_credits plus "enough" for
- * the log control blocks.
- * Also, this test is inconsistent with the matching one in
- * jbd2_journal_extend().
- */
- if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
- jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- read_unlock(&journal->j_state_lock);
- write_lock(&journal->j_state_lock);
- if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
- __jbd2_log_wait_for_space(journal);
- write_unlock(&journal->j_state_lock);
- goto repeat;
+ sub_reserved_credits(journal, blocks);
+ handle->h_reserved = 0;
}
/* OK, account for the buffers that this operation expects to
@@ -309,15 +359,16 @@ repeat:
*/
update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
- handle->h_requested_credits = nblocks;
+ handle->h_requested_credits = blocks;
handle->h_start_jiffies = jiffies;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
- jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
- handle, nblocks,
+ jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+ handle, blocks,
atomic_read(&transaction->t_outstanding_credits),
- __jbd2_log_space_left(journal));
+ jbd2_log_space_left(journal));
read_unlock(&journal->j_state_lock);
+ current->journal_info = handle;
lock_map_acquire(&handle->h_lockdep_map);
jbd2_journal_free_transaction(new_transaction);
@@ -332,7 +383,6 @@ static handle_t *new_handle(int nblocks)
handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
if (!handle)
return NULL;
- memset(handle, 0, sizeof(*handle));
handle->h_buffer_credits = nblocks;
handle->h_ref = 1;
@@ -349,16 +399,21 @@ static handle_t *new_handle(int nblocks)
*
* We make sure that the transaction can guarantee at least nblocks of
* modified buffers in the log. We block until the log can guarantee
- * that much space.
- *
- * This function is visible to journal users (like ext3fs), so is not
- * called with the journal already locked.
+ * that much space. Additionally, if rsv_blocks > 0, we also create another
+ * handle with rsv_blocks reserved blocks in the journal. This handle is
+ * stored in h_rsv_handle. It is not attached to any particular transaction
+ * and thus doesn't block transaction commit. If the caller uses this reserved
+ * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
+ * on the parent handle will dispose the reserved one. Reserved handle has to
+ * be converted to a normal handle using jbd2_journal_start_reserved() before
+ * it can be used.
*
* Return a pointer to a newly allocated handle, or an ERR_PTR() value
* on failure.
*/
-handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
- unsigned int type, unsigned int line_no)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
+ gfp_t gfp_mask, unsigned int type,
+ unsigned int line_no)
{
handle_t *handle = journal_current_handle();
int err;
@@ -375,13 +430,24 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
handle = new_handle(nblocks);
if (!handle)
return ERR_PTR(-ENOMEM);
+ if (rsv_blocks) {
+ handle_t *rsv_handle;
- current->journal_info = handle;
+ rsv_handle = new_handle(rsv_blocks);
+ if (!rsv_handle) {
+ jbd2_free_handle(handle);
+ return ERR_PTR(-ENOMEM);
+ }
+ rsv_handle->h_reserved = 1;
+ rsv_handle->h_journal = journal;
+ handle->h_rsv_handle = rsv_handle;
+ }
err = start_this_handle(journal, handle, gfp_mask);
if (err < 0) {
+ if (handle->h_rsv_handle)
+ jbd2_free_handle(handle->h_rsv_handle);
jbd2_free_handle(handle);
- current->journal_info = NULL;
return ERR_PTR(err);
}
handle->h_type = type;
@@ -396,10 +462,65 @@ EXPORT_SYMBOL(jbd2__journal_start);
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
- return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0);
+ return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);
+/*
+ * Dispose of a reserved handle without starting it.
+ *
+ * Warns if @handle is not marked h_reserved, passes the handle's
+ * h_buffer_credits to sub_reserved_credits() for the journal recorded in
+ * handle->h_journal, and then frees the handle. @handle must not be used
+ * after this call.
+ *
+ * NOTE(review): handle->h_journal is read here, so it presumably has to
+ * still point at the owning journal when this is called - confirm for
+ * callers that clear h_journal first.
+ */
+void jbd2_journal_free_reserved(handle_t *handle)
+{
+ journal_t *journal = handle->h_journal;
+
+ WARN_ON(!handle->h_reserved);
+ sub_reserved_credits(journal, handle->h_buffer_credits);
+ jbd2_free_handle(handle);
+}
+EXPORT_SYMBOL(jbd2_journal_free_reserved);
+
+/**
+ * int jbd2_journal_start_reserved() - start reserved handle
+ * @handle: handle to start
+ * @type: value stored in @handle->h_type once the handle has been started
+ * @line_no: value stored in @handle->h_line_no once the handle has been started
+ *
+ * Start a handle that has been previously reserved by passing a non-zero
+ * rsv_blocks to jbd2__journal_start() (the h_rsv_handle of the parent handle).
+ * This attaches @handle to the running transaction (or creates one if there's
+ * no transaction running). Unlike jbd2_journal_start() this function cannot
+ * block on journal commit, checkpointing, or similar stuff. It can block on
+ * memory allocation or frozen journal though.
+ *
+ * Return 0 on success, non-zero on error - handle is disposed of in that
+ * case and must not be touched by the caller anymore.
+ */
+int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
+				unsigned int line_no)
+{
+	journal_t *journal = handle->h_journal;
+	int ret = -EIO;
+
+	if (WARN_ON(!handle->h_reserved)) {
+		/* Someone passed in normal handle? Just stop it. */
+		jbd2_journal_stop(handle);
+		return ret;
+	}
+	/*
+	 * Usefulness of mixing of reserved and unreserved handles is
+	 * questionable. So far nobody seems to need it so just error out.
+	 */
+	if (WARN_ON(current->journal_info)) {
+		jbd2_journal_free_reserved(handle);
+		return ret;
+	}
+
+	handle->h_journal = NULL;
+	/*
+	 * GFP_NOFS is here because callers are likely from writeback or
+	 * similarly constrained call sites
+	 */
+	ret = start_this_handle(journal, handle, GFP_NOFS);
+	if (ret < 0) {
+		/*
+		 * jbd2_journal_free_reserved() reads handle->h_journal, so
+		 * put the journal pointer back before disposing of the handle.
+		 */
+		handle->h_journal = journal;
+		jbd2_journal_free_reserved(handle);
+		/* The handle is freed now; return without touching it again. */
+		return ret;
+	}
+	handle->h_type = type;
+	handle->h_line_no = line_no;
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_journal_start_reserved);
/**
* int jbd2_journal_extend() - extend buffer credits.
@@ -424,49 +545,53 @@ EXPORT_SYMBOL(jbd2_journal_start);
int jbd2_journal_extend(handle_t *handle, int nblocks)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
int result;
int wanted;
- result = -EIO;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- goto out;
+ return -EROFS;
+ journal = transaction->t_journal;
result = 1;
read_lock(&journal->j_state_lock);
/* Don't extend a locked-down transaction! */
- if (handle->h_transaction->t_state != T_RUNNING) {
+ if (transaction->t_state != T_RUNNING) {
jbd_debug(3, "denied handle %p %d blocks: "
"transaction not running\n", handle, nblocks);
goto error_out;
}
spin_lock(&transaction->t_handle_lock);
- wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
+ wanted = atomic_add_return(nblocks,
+ &transaction->t_outstanding_credits);
if (wanted > journal->j_max_transaction_buffers) {
jbd_debug(3, "denied handle %p %d blocks: "
"transaction too large\n", handle, nblocks);
+ atomic_sub(nblocks, &transaction->t_outstanding_credits);
goto unlock;
}
- if (wanted > __jbd2_log_space_left(journal)) {
+ if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
+ jbd2_log_space_left(journal)) {
jbd_debug(3, "denied handle %p %d blocks: "
"insufficient log space\n", handle, nblocks);
+ atomic_sub(nblocks, &transaction->t_outstanding_credits);
goto unlock;
}
trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
- handle->h_transaction->t_tid,
+ transaction->t_tid,
handle->h_type, handle->h_line_no,
handle->h_buffer_credits,
nblocks);
handle->h_buffer_credits += nblocks;
handle->h_requested_credits += nblocks;
- atomic_add(nblocks, &transaction->t_outstanding_credits);
result = 0;
jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -474,7 +599,6 @@ unlock:
spin_unlock(&transaction->t_handle_lock);
error_out:
read_unlock(&journal->j_state_lock);
-out:
return result;
}
@@ -491,19 +615,22 @@ out:
* to a running handle, a call to jbd2_journal_restart will commit the
* handle's transaction so far and reattach the handle to a new
* transaction capabable of guaranteeing the requested number of
- * credits.
+ * credits. We preserve reserved handle if there's any attached to the
+ * passed in handle.
*/
int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
tid_t tid;
int need_to_start, ret;
+ WARN_ON(!transaction);
/* If we've had an abort of any type, don't even think about
* actually doing the restart! */
if (is_handle_aborted(handle))
return 0;
+ journal = transaction->t_journal;
/*
* First unlink the handle from its current transaction, and start the
@@ -516,12 +643,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
spin_lock(&transaction->t_handle_lock);
atomic_sub(handle->h_buffer_credits,
&transaction->t_outstanding_credits);
+ if (handle->h_rsv_handle) {
+ sub_reserved_credits(journal,
+ handle->h_rsv_handle->h_buffer_credits);
+ }
if (atomic_dec_and_test(&transaction->t_updates))
wake_up(&journal->j_wait_updates);
+ tid = transaction->t_tid;
spin_unlock(&transaction->t_handle_lock);
+ handle->h_transaction = NULL;
+ current->journal_info = NULL;
jbd_debug(2, "restarting handle %p\n", handle);
- tid = transaction->t_tid;
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
if (need_to_start)
@@ -558,6 +691,14 @@ void jbd2_journal_lock_updates(journal_t *journal)
write_lock(&journal->j_state_lock);
++journal->j_barrier_count;
+ /* Wait until there are no reserved handles */
+ if (atomic_read(&journal->j_reserved_credits)) {
+ write_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_reserved,
+ atomic_read(&journal->j_reserved_credits) == 0);
+ write_lock(&journal->j_state_lock);
+ }
+
/* Wait until there are no running updates */
while (1) {
transaction_t *transaction = journal->j_running_transaction;
@@ -620,6 +761,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
}
+/*
+ * Wait action passed to wait_on_bit() when do_get_write_access() waits for
+ * BH_Shadow: sleeps via io_schedule() and always returns 0. @word is unused.
+ */
+static int sleep_on_shadow_bh(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
/*
* If the buffer is already part of the current transaction, then there
* is nothing we need to do. If it is already part of a prior
@@ -635,16 +782,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
int force_copy)
{
struct buffer_head *bh;
- transaction_t *transaction;
+ transaction_t *transaction = handle->h_transaction;
journal_t *journal;
int error;
char *frozen_buffer = NULL;
int need_copy = 0;
+ unsigned long start_lock, time_lock;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
return -EROFS;
-
- transaction = handle->h_transaction;
journal = transaction->t_journal;
jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -655,9 +802,16 @@ repeat:
/* @@@ Need to check for errors here at some point. */
+ start_lock = jiffies;
lock_buffer(bh);
jbd_lock_bh_state(bh);
+ /* If it takes too long to lock the buffer, trace it */
+ time_lock = jbd2_time_diff(start_lock, jiffies);
+ if (time_lock > HZ/10)
+ trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
+ jiffies_to_msecs(time_lock));
+
/* We now hold the buffer lock so it is safe to query the buffer
* state. Is the buffer dirty?
*
@@ -747,41 +901,29 @@ repeat:
* journaled. If the primary copy is already going to
* disk then we cannot do copy-out here. */
- if (jh->b_jlist == BJ_Shadow) {
- DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
- wait_queue_head_t *wqh;
-
- wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
-
+ if (buffer_shadow(bh)) {
JBUFFER_TRACE(jh, "on shadow: sleep");
jbd_unlock_bh_state(bh);
- /* commit wakes up all shadow buffers after IO */
- for ( ; ; ) {
- prepare_to_wait(wqh, &wait.wait,
- TASK_UNINTERRUPTIBLE);
- if (jh->b_jlist != BJ_Shadow)
- break;
- schedule();
- }
- finish_wait(wqh, &wait.wait);
+ wait_on_bit(&bh->b_state, BH_Shadow,
+ sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
goto repeat;
}
- /* Only do the copy if the currently-owning transaction
- * still needs it. If it is on the Forget list, the
- * committing transaction is past that stage. The
- * buffer had better remain locked during the kmalloc,
- * but that should be true --- we hold the journal lock
- * still and the buffer is already on the BUF_JOURNAL
- * list so won't be flushed.
+ /*
+ * Only do the copy if the currently-owning transaction still
+ * needs it. If buffer isn't on BJ_Metadata list, the
+ * committing transaction is past that stage (here we use the
+ * fact that BH_Shadow is set under bh_state lock together with
+ * refiling to BJ_Shadow list and at this point we know the
+ * buffer doesn't have BH_Shadow set).
*
* Subtle point, though: if this is a get_undo_access,
* then we will be relying on the frozen_data to contain
* the new value of the committed_data record after the
* transaction, so we HAVE to force the frozen_data copy
- * in that case. */
-
- if (jh->b_jlist != BJ_Forget || force_copy) {
+ * in that case.
+ */
+ if (jh->b_jlist == BJ_Metadata || force_copy) {
JBUFFER_TRACE(jh, "generate frozen data");
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
@@ -908,14 +1050,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh = jbd2_journal_add_journal_head(bh);
int err;
jbd_debug(5, "journal_head %p\n", jh);
+ WARN_ON(!transaction);
err = -EROFS;
if (is_handle_aborted(handle))
goto out;
+ journal = transaction->t_journal;
err = 0;
JBUFFER_TRACE(jh, "entry");
@@ -1121,12 +1265,14 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh;
int ret = 0;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- goto out;
+ return -EROFS;
+ journal = transaction->t_journal;
jh = jbd2_journal_grab_journal_head(bh);
if (!jh) {
ret = -EUCLEAN;
@@ -1220,7 +1366,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
JBUFFER_TRACE(jh, "file as BJ_Metadata");
spin_lock(&journal->j_list_lock);
- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
spin_unlock(&journal->j_list_lock);
out_unlock_bh:
jbd_unlock_bh_state(bh);
@@ -1251,12 +1397,17 @@ out:
int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh;
int drop_reserve = 0;
int err = 0;
int was_modified = 0;
+ WARN_ON(!transaction);
+ if (is_handle_aborted(handle))
+ return -EROFS;
+ journal = transaction->t_journal;
+
BUFFER_TRACE(bh, "entry");
jbd_lock_bh_state(bh);
@@ -1283,7 +1434,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
*/
jh->b_modified = 0;
- if (jh->b_transaction == handle->h_transaction) {
+ if (jh->b_transaction == transaction) {
J_ASSERT_JH(jh, !jh->b_frozen_data);
/* If we are forgetting a buffer which is already part
@@ -1378,19 +1529,21 @@ drop:
int jbd2_journal_stop(handle_t *handle)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
- int err, wait_for_commit = 0;
+ journal_t *journal;
+ int err = 0, wait_for_commit = 0;
tid_t tid;
pid_t pid;
+ if (!transaction)
+ goto free_and_exit;
+ journal = transaction->t_journal;
+
J_ASSERT(journal_current_handle() == handle);
if (is_handle_aborted(handle))
err = -EIO;
- else {
+ else
J_ASSERT(atomic_read(&transaction->t_updates) > 0);
- err = 0;
- }
if (--handle->h_ref > 0) {
jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
@@ -1400,7 +1553,7 @@ int jbd2_journal_stop(handle_t *handle)
jbd_debug(4, "Handle %p going down\n", handle);
trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
- handle->h_transaction->t_tid,
+ transaction->t_tid,
handle->h_type, handle->h_line_no,
jiffies - handle->h_start_jiffies,
handle->h_sync, handle->h_requested_credits,
@@ -1511,33 +1664,13 @@ int jbd2_journal_stop(handle_t *handle)
lock_map_release(&handle->h_lockdep_map);
+ if (handle->h_rsv_handle)
+ jbd2_journal_free_reserved(handle->h_rsv_handle);
+free_and_exit:
jbd2_free_handle(handle);
return err;
}
-/**
- * int jbd2_journal_force_commit() - force any uncommitted transactions
- * @journal: journal to force
- *
- * For synchronous operations: force any uncommitted transactions
- * to disk. May seem kludgy, but it reuses all the handle batching
- * code in a very simple manner.
- */
-int jbd2_journal_force_commit(journal_t *journal)
-{
- handle_t *handle;
- int ret;
-
- handle = jbd2_journal_start(journal, 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- } else {
- handle->h_sync = 1;
- ret = jbd2_journal_stop(handle);
- }
- return ret;
-}
-
/*
*
* List management code snippets: various functions for manipulating the
@@ -1594,10 +1727,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
- * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
- * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
- * of these pointers, it could go bad. Generally the caller needs to re-read
- * the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
+ * t_reserved_list. If the caller is holding onto a copy of one of these
+ * pointers, it could go bad. Generally the caller needs to re-read the
+ * pointer from the transaction_t.
*
* Called under j_list_lock.
*/
@@ -1627,15 +1760,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
case BJ_Forget:
list = &transaction->t_forget;
break;
- case BJ_IO:
- list = &transaction->t_iobuf_list;
- break;
case BJ_Shadow:
list = &transaction->t_shadow_list;
break;
- case BJ_LogCtl:
- list = &transaction->t_log_list;
- break;
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
@@ -2027,18 +2154,23 @@ zap_buffer_unlocked:
* void jbd2_journal_invalidatepage()
* @journal: journal to use for flush...
* @page: page to flush
- * @offset: length of page to invalidate.
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
*
- * Reap page buffers containing data after offset in page. Can return -EBUSY
- * if buffers are part of the committing transaction and the page is straddling
- * i_size. Caller then has to wait for current commit and try again.
+ * Reap page buffers containing data in the specified range in page.
+ * Can return -EBUSY if buffers are part of the committing transaction and
+ * the page is straddling i_size. Caller then has to wait for current commit
+ * and try again.
*/
int jbd2_journal_invalidatepage(journal_t *journal,
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
+ unsigned int stop = offset + length;
unsigned int curr_off = 0;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int may_free = 1;
int ret = 0;
@@ -2047,6 +2179,8 @@ int jbd2_journal_invalidatepage(journal_t *journal,
if (!page_has_buffers(page))
return 0;
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
@@ -2056,10 +2190,13 @@ int jbd2_journal_invalidatepage(journal_t *journal,
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ return 0;
+
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
- ret = journal_unmap_buffer(journal, bh, offset > 0);
+ ret = journal_unmap_buffer(journal, bh, partial_page);
unlock_buffer(bh);
if (ret < 0)
return ret;
@@ -2070,7 +2207,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
} while (bh != head);
- if (!offset) {
+ if (!partial_page) {
if (may_free && try_to_free_buffers(page))
J_ASSERT(!page_has_buffers(page));
}
@@ -2131,15 +2268,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
case BJ_Forget:
list = &transaction->t_forget;
break;
- case BJ_IO:
- list = &transaction->t_iobuf_list;
- break;
case BJ_Shadow:
list = &transaction->t_shadow_list;
break;
- case BJ_LogCtl:
- list = &transaction->t_log_list;
- break;
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
@@ -2241,10 +2372,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- return -EIO;
+ return -EROFS;
+ journal = transaction->t_journal;
jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
transaction->t_tid);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index acd46a4160cb..e3aac222472e 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -22,7 +22,7 @@
#include <linux/time.h>
#include "nodelist.h"
-static int jffs2_readdir (struct file *, void *, filldir_t);
+static int jffs2_readdir (struct file *, struct dir_context *);
static int jffs2_create (struct inode *,struct dentry *,umode_t,
bool);
@@ -40,7 +40,7 @@ static int jffs2_rename (struct inode *, struct dentry *,
const struct file_operations jffs2_dir_operations =
{
.read = generic_read_dir,
- .readdir = jffs2_readdir,
+ .iterate = jffs2_readdir,
.unlocked_ioctl=jffs2_ioctl,
.fsync = jffs2_fsync,
.llseek = generic_file_llseek,
@@ -114,60 +114,40 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
/***********************************************************************/
-static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int jffs2_readdir(struct file *file, struct dir_context *ctx)
{
- struct jffs2_inode_info *f;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
+ struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
struct jffs2_full_dirent *fd;
- unsigned long offset, curofs;
+ unsigned long curofs = 1;
- jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n",
- file_inode(filp)->i_ino);
+ jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino);
- f = JFFS2_INODE_INFO(inode);
-
- offset = filp->f_pos;
-
- if (offset == 0) {
- jffs2_dbg(1, "Dirent 0: \".\", ino #%lu\n", inode->i_ino);
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
- goto out;
- offset++;
- }
- if (offset == 1) {
- unsigned long pino = parent_ino(filp->f_path.dentry);
- jffs2_dbg(1, "Dirent 1: \"..\", ino #%lu\n", pino);
- if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0)
- goto out;
- offset++;
- }
+ if (!dir_emit_dots(file, ctx))
+ return 0;
- curofs=1;
mutex_lock(&f->sem);
for (fd = f->dents; fd; fd = fd->next) {
-
curofs++;
- /* First loop: curofs = 2; offset = 2 */
- if (curofs < offset) {
+ /* First loop: curofs = 2; pos = 2 */
+ if (curofs < ctx->pos) {
jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n",
- fd->name, fd->ino, fd->type, curofs, offset);
+ fd->name, fd->ino, fd->type, curofs, (unsigned long)ctx->pos);
continue;
}
if (!fd->ino) {
jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n",
fd->name);
- offset++;
+ ctx->pos++;
continue;
}
jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n",
- offset, fd->name, fd->ino, fd->type);
- if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0)
+ (unsigned long)ctx->pos, fd->name, fd->ino, fd->type);
+ if (!dir_emit(ctx, fd->name, strlen(fd->name), fd->ino, fd->type))
break;
- offset++;
+ ctx->pos++;
}
mutex_unlock(&f->sem);
- out:
- filp->f_pos = offset;
return 0;
}
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b7dc47ba675e..730f24e282a6 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -23,6 +23,7 @@
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
+#include <linux/aio.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_filsys.h"
@@ -125,7 +126,7 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int wait = wbc->sync_mode == WB_SYNC_ALL;
- if (test_cflag(COMMIT_Nolink, inode))
+ if (inode->i_nlink == 0)
return 0;
/*
* If COMMIT_DIRTY is not set, the inode isn't really dirty.
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 9a55f53be5ff..370d7b6c5942 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -346,8 +346,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
(unsigned long long) blkno,
(unsigned long long) nblocks);
- jfs_error(ip->i_sb,
- "dbFree: block to be freed is outside the map");
+ jfs_error(ip->i_sb, "block to be freed is outside the map\n");
return -EIO;
}
@@ -384,7 +383,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
/* free the blocks. */
if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
- jfs_error(ip->i_sb, "dbFree: error in block map\n");
+ jfs_error(ip->i_sb, "error in block map\n");
release_metapage(mp);
IREAD_UNLOCK(ipbmap);
return (rc);
@@ -441,8 +440,7 @@ dbUpdatePMap(struct inode *ipbmap,
printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
(unsigned long long) blkno,
(unsigned long long) nblocks);
- jfs_error(ipbmap->i_sb,
- "dbUpdatePMap: blocks are outside the map");
+ jfs_error(ipbmap->i_sb, "blocks are outside the map\n");
return -EIO;
}
@@ -726,7 +724,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
/* the hint should be within the map */
if (hint >= mapSize) {
- jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map");
+ jfs_error(ip->i_sb, "the hint is outside the map\n");
return -EIO;
}
@@ -1057,8 +1055,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
bmp = sbi->bmap;
if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
IREAD_UNLOCK(ipbmap);
- jfs_error(ip->i_sb,
- "dbExtend: the block is outside the filesystem");
+ jfs_error(ip->i_sb, "the block is outside the filesystem\n");
return -EIO;
}
@@ -1134,8 +1131,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
u32 mask;
if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocNext: Corrupt dmap page");
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
return -EIO;
}
@@ -1265,8 +1261,7 @@ dbAllocNear(struct bmap * bmp,
s8 *leaf;
if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocNear: Corrupt dmap page");
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
return -EIO;
}
@@ -1381,8 +1376,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
*/
if (l2nb > bmp->db_agl2size) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: allocation request is larger than the "
- "allocation group size");
+ "allocation request is larger than the allocation group size\n");
return -EIO;
}
@@ -1417,7 +1411,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
(unsigned long long) blkno,
(unsigned long long) nblocks);
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: dbAllocCtl failed in free AG");
+ "dbAllocCtl failed in free AG\n");
}
return (rc);
}
@@ -1433,8 +1427,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
budmin = dcp->budmin;
if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: Corrupt dmapctl page");
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
release_metapage(mp);
return -EIO;
}
@@ -1475,7 +1468,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
}
if (n == 4) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: failed descending stree");
+ "failed descending stree\n");
release_metapage(mp);
return -EIO;
}
@@ -1515,8 +1508,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
&blkno))) {
if (rc == -ENOSPC) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: control page "
- "inconsistent");
+ "control page inconsistent\n");
return -EIO;
}
return (rc);
@@ -1528,7 +1520,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
if (rc == -ENOSPC) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: unable to allocate blocks");
+ "unable to allocate blocks\n");
rc = -EIO;
}
return (rc);
@@ -1587,8 +1579,7 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
*/
rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
if (rc == -ENOSPC) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAny: unable to allocate blocks");
+ jfs_error(bmp->db_ipbmap->i_sb, "unable to allocate blocks\n");
return -EIO;
}
return (rc);
@@ -1652,8 +1643,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
range_cnt = min_t(u64, max_ranges + 1, 32 * 1024);
totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS);
if (totrim == NULL) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbDiscardAG: no memory for trim array");
+ jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n");
IWRITE_UNLOCK(ipbmap);
return 0;
}
@@ -1682,8 +1672,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
nblocks = 1 << l2nb;
} else {
/* Trim any already allocated blocks */
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbDiscardAG: -EIO");
+ jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n");
break;
}
@@ -1761,7 +1750,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbFindCtl: Corrupt dmapctl page");
+ "Corrupt dmapctl page\n");
release_metapage(mp);
return -EIO;
}
@@ -1782,7 +1771,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
if (rc) {
if (lev != level) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbFindCtl: dmap inconsistent");
+ "dmap inconsistent\n");
return -EIO;
}
return -ENOSPC;
@@ -1906,7 +1895,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
if (dp->tree.stree[ROOT] != L2BPERDMAP) {
release_metapage(mp);
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocCtl: the dmap is not all free");
+ "the dmap is not all free\n");
rc = -EIO;
goto backout;
}
@@ -1953,7 +1942,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
* to indicate that we have leaked blocks.
*/
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocCtl: I/O Error: Block Leakage.");
+ "I/O Error: Block Leakage\n");
continue;
}
dp = (struct dmap *) mp->data;
@@ -1965,8 +1954,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
* to indicate that we have leaked blocks.
*/
release_metapage(mp);
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocCtl: Block Leakage.");
+ jfs_error(bmp->db_ipbmap->i_sb, "Block Leakage\n");
continue;
}
@@ -2263,8 +2251,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
for (; nwords > 0; nwords -= nw) {
if (leaf[word] < BUDMIN) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocBits: leaf page "
- "corrupt");
+ "leaf page corrupt\n");
break;
}
@@ -2536,8 +2523,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
dcp = (struct dmapctl *) mp->data;
if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAdjCtl: Corrupt dmapctl page");
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
release_metapage(mp);
return -EIO;
}
@@ -2638,8 +2624,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
assert(level == bmp->db_maxlevel);
if (bmp->db_maxfreebud != oldroot) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAdjCtl: the maximum free buddy is "
- "not the old root");
+ "the maximum free buddy is not the old root\n");
}
bmp->db_maxfreebud = dcp->stree[ROOT];
}
@@ -3481,7 +3466,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
p = BMAPBLKNO + nbperpage; /* L2 page */
l2mp = read_metapage(ipbmap, p, PSIZE, 0);
if (!l2mp) {
- jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read");
+ jfs_error(ipbmap->i_sb, "L2 page could not be read\n");
return -EIO;
}
l2dcp = (struct dmapctl *) l2mp->data;
@@ -3646,8 +3631,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
}
} /* for each L1 in a L2 */
- jfs_error(ipbmap->i_sb,
- "dbExtendFS: function has not returned as expected");
+ jfs_error(ipbmap->i_sb, "function has not returned as expected\n");
errout:
if (l0mp)
release_metapage(l0mp);
@@ -3717,7 +3701,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
}
if (bmp->db_agpref >= bmp->db_numag) {
jfs_error(ipbmap->i_sb,
- "cannot find ag with average freespace");
+ "cannot find ag with average freespace\n");
}
}
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0ddbeceafc62..8743ba9c6742 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -124,21 +124,21 @@ struct dtsplit {
#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
/* get page buffer for specified block address */
-#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
-{\
- BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\
- if (!(RC))\
- {\
- if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\
- ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\
- {\
- BT_PUTPAGE(MP);\
- jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\
- MP = NULL;\
- RC = -EIO;\
- }\
- }\
-}
+#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
+do { \
+ BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \
+ if (!(RC)) { \
+ if (((P)->header.nextindex > \
+ (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
+ ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \
+ BT_PUTPAGE(MP); \
+ jfs_error((IP)->i_sb, \
+ "DT_GETPAGE: dtree page corrupt\n"); \
+ MP = NULL; \
+ RC = -EIO; \
+ } \
+ } \
+} while (0)
/* for consistency */
#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -776,7 +776,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
/* Something's corrupted, mark filesystem dirty so
* chkdsk will fix it.
*/
- jfs_error(sb, "stack overrun in dtSearch!");
+ jfs_error(sb, "stack overrun!\n");
BT_STACK_DUMP(btstack);
rc = -EIO;
goto out;
@@ -3002,9 +3002,9 @@ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
* return: offset = (pn, index) of start entry
* of next jfs_readdir()/dtRead()
*/
-int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+int jfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *ip = file_inode(filp);
+ struct inode *ip = file_inode(file);
struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
int rc = 0;
loff_t dtpos; /* legacy OS/2 style position */
@@ -3033,7 +3033,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
int overflow, fix_page, page_fixed = 0;
static int unique_pos = 2; /* If we can't fix broken index */
- if (filp->f_pos == DIREND)
+ if (ctx->pos == DIREND)
return 0;
if (DO_INDEX(ip)) {
@@ -3045,7 +3045,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
*/
do_index = 1;
- dir_index = (u32) filp->f_pos;
+ dir_index = (u32) ctx->pos;
if (dir_index > 1) {
struct dir_table_slot dirtab_slot;
@@ -3053,25 +3053,25 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (dtEmpty(ip) ||
(dir_index >= JFS_IP(ip)->next_index)) {
/* Stale position. Directory has shrunk */
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
repeat:
rc = read_index(ip, dir_index, &dirtab_slot);
if (rc) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return rc;
}
if (dirtab_slot.flag == DIR_INDEX_FREE) {
if (loop_count++ > JFS_IP(ip)->next_index) {
jfs_err("jfs_readdir detected "
"infinite loop!");
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
dir_index = le32_to_cpu(dirtab_slot.addr2);
if (dir_index == -1) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
goto repeat;
@@ -3080,13 +3080,13 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
index = dirtab_slot.slot;
DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
if (rc) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
if (p->header.flag & BT_INTERNAL) {
jfs_err("jfs_readdir: bad index table");
DT_PUTPAGE(mp);
- filp->f_pos = -1;
+ ctx->pos = -1;
return 0;
}
} else {
@@ -3094,23 +3094,22 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
/*
* self "."
*/
- filp->f_pos = 0;
- if (filldir(dirent, ".", 1, 0, ip->i_ino,
- DT_DIR))
+ ctx->pos = 0;
+ if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
return 0;
}
/*
* parent ".."
*/
- filp->f_pos = 1;
- if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR))
+ ctx->pos = 1;
+ if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
return 0;
/*
* Find first entry of left-most leaf
*/
if (dtEmpty(ip)) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
@@ -3128,23 +3127,19 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
* pn > 0: Real entries, pn=1 -> leftmost page
* pn = index = -1: No more entries
*/
- dtpos = filp->f_pos;
+ dtpos = ctx->pos;
if (dtpos == 0) {
/* build "." entry */
-
- if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino,
- DT_DIR))
+ if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
return 0;
dtoffset->index = 1;
- filp->f_pos = dtpos;
+ ctx->pos = dtpos;
}
if (dtoffset->pn == 0) {
if (dtoffset->index == 1) {
/* build ".." entry */
-
- if (filldir(dirent, "..", 2, filp->f_pos,
- PARENT(ip), DT_DIR))
+ if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
return 0;
} else {
jfs_err("jfs_readdir called with "
@@ -3152,18 +3147,18 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
dtoffset->pn = 1;
dtoffset->index = 0;
- filp->f_pos = dtpos;
+ ctx->pos = dtpos;
}
if (dtEmpty(ip)) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
- if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) {
+ if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) {
jfs_err("jfs_readdir: unexpected rc = %d "
"from dtReadNext", rc);
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
/* get start leaf page and index */
@@ -3171,7 +3166,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* offset beyond directory eof ? */
if (bn < 0) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return 0;
}
}
@@ -3180,7 +3175,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (dirent_buf == 0) {
DT_PUTPAGE(mp);
jfs_warn("jfs_readdir: __get_free_page failed!");
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
return -ENOMEM;
}
@@ -3252,8 +3247,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* Sanity Check */
if (d_namleft == 0) {
jfs_error(ip->i_sb,
- "JFS:Dtree error: ino = "
- "%ld, bn=%Ld, index = %d",
+ "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n",
(long)ip->i_ino,
(long long)bn,
i);
@@ -3295,9 +3289,9 @@ skip_one:
jfs_dirent = (struct jfs_dirent *) dirent_buf;
while (jfs_dirents--) {
- filp->f_pos = jfs_dirent->position;
- if (filldir(dirent, jfs_dirent->name,
- jfs_dirent->name_len, filp->f_pos,
+ ctx->pos = jfs_dirent->position;
+ if (!dir_emit(ctx, jfs_dirent->name,
+ jfs_dirent->name_len,
jfs_dirent->ino, DT_UNKNOWN))
goto out;
jfs_dirent = next_jfs_dirent(jfs_dirent);
@@ -3309,7 +3303,7 @@ skip_one:
}
if (!overflow && (bn == 0)) {
- filp->f_pos = DIREND;
+ ctx->pos = DIREND;
break;
}
@@ -3373,7 +3367,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
*/
if (BT_STACK_FULL(btstack)) {
DT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "dtReadFirst: btstack overrun");
+ jfs_error(ip->i_sb, "btstack overrun\n");
BT_STACK_DUMP(btstack);
return -EIO;
}
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index 2545bb317235..fd4169e6e698 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -265,5 +265,5 @@ extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key,
extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key,
ino_t * orig_ino, ino_t new_ino, int flag);
-extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir);
+extern int jfs_readdir(struct file *file, struct dir_context *ctx);
#endif /* !_H_JFS_DTREE */
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index e5fe8506ed16..2ae7d59ab10a 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -388,7 +388,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
if ((rc == 0) && xlen) {
if (xlen != nbperpage) {
- jfs_error(ip->i_sb, "extHint: corrupt xtree");
+ jfs_error(ip->i_sb, "corrupt xtree\n");
rc = -EIO;
}
XADaddress(xp, xaddr);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 6ba4006e011b..f321986e73d2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -386,7 +386,7 @@ int diRead(struct inode *ip)
dp += rel_inode;
if (ip->i_ino != le32_to_cpu(dp->di_number)) {
- jfs_error(ip->i_sb, "diRead: i_ino != di_number");
+ jfs_error(ip->i_sb, "i_ino != di_number\n");
rc = -EIO;
} else if (le32_to_cpu(dp->di_nlink) == 0)
rc = -ESTALE;
@@ -625,7 +625,7 @@ int diWrite(tid_t tid, struct inode *ip)
if (!addressPXD(&(jfs_ip->ixpxd)) ||
(lengthPXD(&(jfs_ip->ixpxd)) !=
JFS_IP(ipimap)->i_imap->im_nbperiext)) {
- jfs_error(ip->i_sb, "diWrite: ixpxd invalid");
+ jfs_error(ip->i_sb, "ixpxd invalid\n");
return -EIO;
}
@@ -893,8 +893,7 @@ int diFree(struct inode *ip)
if (iagno >= imap->im_nextiag) {
print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
imap, 32, 0);
- jfs_error(ip->i_sb,
- "diFree: inum = %d, iagno = %d, nextiag = %d",
+ jfs_error(ip->i_sb, "inum = %d, iagno = %d, nextiag = %d\n",
(uint) inum, iagno, imap->im_nextiag);
return -EIO;
}
@@ -930,15 +929,14 @@ int diFree(struct inode *ip)
mask = HIGHORDER >> bitno;
if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
- jfs_error(ip->i_sb,
- "diFree: wmap shows inode already free");
+ jfs_error(ip->i_sb, "wmap shows inode already free\n");
}
if (!addressPXD(&iagp->inoext[extno])) {
release_metapage(mp);
IREAD_UNLOCK(ipimap);
AG_UNLOCK(imap, agno);
- jfs_error(ip->i_sb, "diFree: invalid inoext");
+ jfs_error(ip->i_sb, "invalid inoext\n");
return -EIO;
}
@@ -950,7 +948,7 @@ int diFree(struct inode *ip)
release_metapage(mp);
IREAD_UNLOCK(ipimap);
AG_UNLOCK(imap, agno);
- jfs_error(ip->i_sb, "diFree: numfree > numinos");
+ jfs_error(ip->i_sb, "numfree > numinos\n");
return -EIO;
}
/*
@@ -1199,7 +1197,7 @@ int diFree(struct inode *ip)
* for the inode being freed.
*/
if (iagp->pmap[extno] != 0) {
- jfs_error(ip->i_sb, "diFree: the pmap does not show inode free");
+ jfs_error(ip->i_sb, "the pmap does not show inode free\n");
}
iagp->wmap[extno] = 0;
PXDlength(&iagp->inoext[extno], 0);
@@ -1493,7 +1491,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
/* mask any prior bits for the starting words of the
* summary map.
*/
- mask = ONES << (EXTSPERSUM - bitno);
+ mask = (bitno == 0) ? 0 : (ONES << (EXTSPERSUM - bitno));
inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask;
extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask;
@@ -1518,8 +1516,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
release_metapage(mp);
AG_UNLOCK(imap, agno);
jfs_error(ip->i_sb,
- "diAlloc: can't find free bit "
- "in wmap");
+ "can't find free bit in wmap\n");
return -EIO;
}
@@ -1660,7 +1657,7 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
numinos = imap->im_agctl[agno].numinos;
if (numfree > numinos) {
- jfs_error(ip->i_sb, "diAllocAG: numfree > numinos");
+ jfs_error(ip->i_sb, "numfree > numinos\n");
return -EIO;
}
@@ -1811,8 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
if (!iagp->nfreeinos) {
IREAD_UNLOCK(imap->im_ipimap);
release_metapage(mp);
- jfs_error(ip->i_sb,
- "diAllocIno: nfreeinos = 0, but iag on freelist");
+ jfs_error(ip->i_sb, "nfreeinos = 0, but iag on freelist\n");
return -EIO;
}
@@ -1824,7 +1820,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
IREAD_UNLOCK(imap->im_ipimap);
release_metapage(mp);
jfs_error(ip->i_sb,
- "diAllocIno: free inode not found in summary map");
+ "free inode not found in summary map\n");
return -EIO;
}
@@ -1839,7 +1835,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
if (rem >= EXTSPERSUM) {
IREAD_UNLOCK(imap->im_ipimap);
release_metapage(mp);
- jfs_error(ip->i_sb, "diAllocIno: no free extent found");
+ jfs_error(ip->i_sb, "no free extent found\n");
return -EIO;
}
extno = (sword << L2EXTSPERSUM) + rem;
@@ -1850,7 +1846,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
if (rem >= INOSPEREXT) {
IREAD_UNLOCK(imap->im_ipimap);
release_metapage(mp);
- jfs_error(ip->i_sb, "diAllocIno: free inode not found");
+ jfs_error(ip->i_sb, "free inode not found\n");
return -EIO;
}
@@ -1936,7 +1932,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
if ((rc = diIAGRead(imap, iagno, &mp))) {
IREAD_UNLOCK(imap->im_ipimap);
- jfs_error(ip->i_sb, "diAllocExt: error reading iag");
+ jfs_error(ip->i_sb, "error reading iag\n");
return rc;
}
iagp = (struct iag *) mp->data;
@@ -1948,8 +1944,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
if (sword >= SMAPSZ) {
release_metapage(mp);
IREAD_UNLOCK(imap->im_ipimap);
- jfs_error(ip->i_sb,
- "diAllocExt: free ext summary map not found");
+ jfs_error(ip->i_sb, "free ext summary map not found\n");
return -EIO;
}
if (~iagp->extsmap[sword])
@@ -1962,7 +1957,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
if (rem >= EXTSPERSUM) {
release_metapage(mp);
IREAD_UNLOCK(imap->im_ipimap);
- jfs_error(ip->i_sb, "diAllocExt: free extent not found");
+ jfs_error(ip->i_sb, "free extent not found\n");
return -EIO;
}
extno = (sword << L2EXTSPERSUM) + rem;
@@ -2081,8 +2076,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
if (bmp)
release_metapage(bmp);
- jfs_error(imap->im_ipimap->i_sb,
- "diAllocBit: iag inconsistent");
+ jfs_error(imap->im_ipimap->i_sb, "iag inconsistent\n");
return -EIO;
}
@@ -2189,7 +2183,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
/* better have free extents.
*/
if (!iagp->nfreeexts) {
- jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents");
+ jfs_error(imap->im_ipimap->i_sb, "no free extents\n");
return -EIO;
}
@@ -2261,7 +2255,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
}
if (ciagp == NULL) {
jfs_error(imap->im_ipimap->i_sb,
- "diNewExt: ciagp == NULL");
+ "ciagp == NULL\n");
rc = -EIO;
goto error_out;
}
@@ -2498,7 +2492,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
IWRITE_UNLOCK(ipimap);
IAGFREE_UNLOCK(imap);
jfs_error(imap->im_ipimap->i_sb,
- "diNewIAG: ipimap->i_size is wrong");
+ "ipimap->i_size is wrong\n");
return -EIO;
}
@@ -2758,8 +2752,7 @@ diUpdatePMap(struct inode *ipimap,
iagno = INOTOIAG(inum);
/* make sure that the iag is contained within the map */
if (iagno >= imap->im_nextiag) {
- jfs_error(ipimap->i_sb,
- "diUpdatePMap: the iag is outside the map");
+ jfs_error(ipimap->i_sb, "the iag is outside the map\n");
return -EIO;
}
/* read the iag */
@@ -2788,13 +2781,13 @@ diUpdatePMap(struct inode *ipimap,
*/
if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
jfs_error(ipimap->i_sb,
- "diUpdatePMap: inode %ld not marked as "
- "allocated in wmap!", inum);
+ "inode %ld not marked as allocated in wmap!\n",
+ inum);
}
if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
jfs_error(ipimap->i_sb,
- "diUpdatePMap: inode %ld not marked as "
- "allocated in pmap!", inum);
+ "inode %ld not marked as allocated in pmap!\n",
+ inum);
}
/* update the bitmap for the extent of the freed inode */
iagp->pmap[extno] &= cpu_to_le32(~mask);
@@ -2809,15 +2802,13 @@ diUpdatePMap(struct inode *ipimap,
if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
release_metapage(mp);
jfs_error(ipimap->i_sb,
- "diUpdatePMap: the inode is not allocated in "
- "the working map");
+ "the inode is not allocated in the working map\n");
return -EIO;
}
if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) {
release_metapage(mp);
jfs_error(ipimap->i_sb,
- "diUpdatePMap: the inode is not free in the "
- "persistent map");
+ "the inode is not free in the persistent map\n");
return -EIO;
}
/* update the bitmap for the extent of the allocated inode */
@@ -2909,8 +2900,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
iagp = (struct iag *) bp->data;
if (le32_to_cpu(iagp->iagnum) != i) {
release_metapage(bp);
- jfs_error(ipimap->i_sb,
- "diExtendFs: unexpected value of iagnum");
+ jfs_error(ipimap->i_sb, "unexpected value of iagnum\n");
return -EIO;
}
@@ -2986,8 +2976,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
if (xnuminos != atomic_read(&imap->im_numinos) ||
xnumfree != atomic_read(&imap->im_numfree)) {
- jfs_error(ipimap->i_sb,
- "diExtendFs: numinos or numfree incorrect");
+ jfs_error(ipimap->i_sb, "numinos or numfree incorrect\n");
return -EIO;
}
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 2eb952c41a69..360d27c48887 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1058,7 +1058,8 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
*/
void jfs_syncpt(struct jfs_log *log, int hard_sync)
{ LOG_LOCK(log);
- lmLogSync(log, hard_sync);
+ if (!test_bit(log_QUIESCE, &log->flag))
+ lmLogSync(log, hard_sync);
LOG_UNLOCK(log);
}
@@ -2004,12 +2005,17 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bio->bi_io_vec[0].bv_offset = bp->l_offset;
bio->bi_vcnt = 1;
- bio->bi_idx = 0;
bio->bi_size = LOGPSIZE;
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- submit_bio(READ_SYNC, bio);
+ /*check if journaling to disk has been disabled*/
+ if (log->no_integrity) {
+ bio->bi_size = 0;
+ lbmIODone(bio, 0);
+ } else {
+ submit_bio(READ_SYNC, bio);
+ }
wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
@@ -2145,7 +2151,6 @@ static void lbmStartIO(struct lbuf * bp)
bio->bi_io_vec[0].bv_offset = bp->l_offset;
bio->bi_vcnt = 1;
- bio->bi_idx = 0;
bio->bi_size = LOGPSIZE;
bio->bi_end_io = lbmIODone;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 6740d34cd82b..d165cde0c68d 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -571,9 +571,10 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
return ret;
}
-static void metapage_invalidatepage(struct page *page, unsigned long offset)
+static void metapage_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- BUG_ON(offset);
+ BUG_ON(offset || length < PAGE_CACHE_SIZE);
BUG_ON(PageWriteback(page));
@@ -646,7 +647,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
if (mp) {
if (mp->logical_size != size) {
jfs_error(inode->i_sb,
- "__get_metapage: mp->logical_size != size");
+ "get_mp->logical_size != size\n");
jfs_err("logical_size = %d, size = %d",
mp->logical_size, size);
dump_stack();
@@ -657,8 +658,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
if (test_bit(META_discard, &mp->flag)) {
if (!new) {
jfs_error(inode->i_sb,
- "__get_metapage: using a "
- "discarded metapage");
+ "using a discarded metapage\n");
discard_metapage(mp);
goto unlock;
}
diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h
index 884fc21ab8ee..04847b8d3070 100644
--- a/fs/jfs/jfs_superblock.h
+++ b/fs/jfs/jfs_superblock.h
@@ -108,6 +108,7 @@ struct jfs_superblock {
extern int readSuper(struct super_block *, struct buffer_head **);
extern int updateSuper(struct super_block *, uint);
+__printf(2, 3)
extern void jfs_error(struct super_block *, const char *, ...);
extern int jfs_mount(struct super_block *);
extern int jfs_mount_rw(struct super_block *, int);
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 5fcc02eaa64c..564c4f279ac6 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2684,7 +2684,7 @@ void txAbort(tid_t tid, int dirty)
* mark filesystem dirty
*/
if (dirty)
- jfs_error(tblk->sb, "txAbort");
+ jfs_error(tblk->sb, "\n");
return;
}
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 6c50871e6220..5ad7748860ce 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -64,22 +64,23 @@
/* get page buffer for specified block address */
/* ToDo: Replace this ugly macro with a function */
-#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
-{\
- BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\
- if (!(RC))\
- {\
- if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\
- (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\
- (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\
- {\
- jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\
- BT_PUTPAGE(MP);\
- MP = NULL;\
- RC = -EIO;\
- }\
- }\
-}
+#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
+do { \
+ BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \
+ if (!(RC)) { \
+ if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \
+ (le16_to_cpu((P)->header.nextindex) > \
+ le16_to_cpu((P)->header.maxentry)) || \
+ (le16_to_cpu((P)->header.maxentry) > \
+ (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \
+ jfs_error((IP)->i_sb, \
+ "XT_GETPAGE: xtree page corrupt\n"); \
+ BT_PUTPAGE(MP); \
+ MP = NULL; \
+ RC = -EIO; \
+ } \
+ } \
+} while (0)
/* for consistency */
#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -499,7 +500,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
/* push (bn, index) of the parent page/entry */
if (BT_STACK_FULL(btstack)) {
- jfs_error(ip->i_sb, "stack overrun in xtSearch!");
+ jfs_error(ip->i_sb, "stack overrun!\n");
XT_PUTPAGE(mp);
return -EIO;
}
@@ -1385,7 +1386,7 @@ int xtExtend(tid_t tid, /* transaction id */
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent");
+ jfs_error(ip->i_sb, "xtSearch did not find extent\n");
return -EIO;
}
@@ -1393,7 +1394,7 @@ int xtExtend(tid_t tid, /* transaction id */
xad = &p->xad[index];
if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtExtend: extension is not contiguous");
+ jfs_error(ip->i_sb, "extension is not contiguous\n");
return -EIO;
}
@@ -1552,7 +1553,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtTailgate: couldn't find extent");
+ jfs_error(ip->i_sb, "couldn't find extent\n");
return -EIO;
}
@@ -1560,8 +1561,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
nextindex = le16_to_cpu(p->header.nextindex);
if (index != nextindex - 1) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb,
- "xtTailgate: the entry found is not the last entry");
+ jfs_error(ip->i_sb, "the entry found is not the last entry\n");
return -EIO;
}
@@ -1734,7 +1734,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtUpdate: Could not find extent");
+ jfs_error(ip->i_sb, "Could not find extent\n");
return -EIO;
}
@@ -1758,7 +1758,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
(nxoff + nxlen > xoff + xlen)) {
XT_PUTPAGE(mp);
jfs_error(ip->i_sb,
- "xtUpdate: nXAD in not completely contained within XAD");
+ "nXAD in not completely contained within XAD\n");
return -EIO;
}
@@ -1907,7 +1907,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
if (xoff >= nxoff) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff");
+ jfs_error(ip->i_sb, "xoff >= nxoff\n");
return -EIO;
}
/* #endif _JFS_WIP_COALESCE */
@@ -2048,14 +2048,13 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtUpdate: xtSearch failed");
+ jfs_error(ip->i_sb, "xtSearch failed\n");
return -EIO;
}
if (index0 != index) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb,
- "xtUpdate: unexpected value of index");
+ jfs_error(ip->i_sb, "unexpected value of index\n");
return -EIO;
}
}
@@ -3650,7 +3649,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
getChild:
/* save current parent entry for the child page */
if (BT_STACK_FULL(&btstack)) {
- jfs_error(ip->i_sb, "stack overrun in xtTruncate!");
+ jfs_error(ip->i_sb, "stack overrun!\n");
XT_PUTPAGE(mp);
return -EIO;
}
@@ -3751,8 +3750,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb,
- "xtTruncate_pmap: did not find extent");
+ jfs_error(ip->i_sb, "did not find extent\n");
return -EIO;
}
} else {
@@ -3851,7 +3849,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
getChild:
/* save current parent entry for the child page */
if (BT_STACK_FULL(&btstack)) {
- jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!");
+ jfs_error(ip->i_sb, "stack overrun!\n");
XT_PUTPAGE(mp);
return -EIO;
}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 3b91a7ad6086..aa8a3370631b 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1176,7 +1176,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!S_ISDIR(old_ip->i_mode) && new_ip)
IWRITE_UNLOCK(new_ip);
jfs_error(new_ip->i_sb,
- "jfs_rename: new_ip->i_nlink != 0");
+ "new_ip->i_nlink != 0\n");
return -EIO;
}
tblk = tid_to_tblock(tid);
@@ -1529,7 +1529,7 @@ const struct inode_operations jfs_dir_inode_operations = {
const struct file_operations jfs_dir_operations = {
.read = generic_read_dir,
- .readdir = jfs_readdir,
+ .iterate = jfs_readdir,
.fsync = jfs_fsync,
.unlocked_ioctl = jfs_ioctl,
#ifdef CONFIG_COMPAT
@@ -1538,8 +1538,7 @@ const struct file_operations jfs_dir_operations = {
.llseek = generic_file_llseek,
};
-static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
- struct qstr *this)
+static int jfs_ci_hash(const struct dentry *dir, struct qstr *this)
{
unsigned long hash;
int i;
@@ -1552,9 +1551,7 @@ static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
return 0;
}
-static int jfs_ci_compare(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+static int jfs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
int i, result = 1;
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 8d0c1c7c0820..90b3bc21e9b0 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
goto resume;
error_out:
- jfs_error(sb, "jfs_extendfs");
+ jfs_error(sb, "\n");
resume:
/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2003e830ed1c..6669aa2042c3 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -92,16 +92,20 @@ static void jfs_handle_error(struct super_block *sb)
/* nothing is done for continue beyond marking the superblock dirty */
}
-void jfs_error(struct super_block *sb, const char * function, ...)
+void jfs_error(struct super_block *sb, const char *fmt, ...)
{
- static char error_buf[256];
+ struct va_format vaf;
va_list args;
- va_start(args, function);
- vsnprintf(error_buf, sizeof(error_buf), function, args);
- va_end(args);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_err("ERROR: (device %s): %pf: %pV\n",
+ sb->s_id, __builtin_return_address(0), &vaf);
- pr_err("ERROR: (device %s): %s\n", sb->s_id, error_buf);
+ va_end(args);
jfs_handle_error(sb);
}
@@ -611,11 +615,28 @@ static int jfs_freeze(struct super_block *sb)
{
struct jfs_sb_info *sbi = JFS_SBI(sb);
struct jfs_log *log = sbi->log;
+ int rc = 0;
if (!(sb->s_flags & MS_RDONLY)) {
txQuiesce(sb);
- lmLogShutdown(log);
- updateSuper(sb, FM_CLEAN);
+ rc = lmLogShutdown(log);
+ if (rc) {
+ jfs_error(sb, "lmLogShutdown failed\n");
+
+ /* let operations fail rather than hang */
+ txResume(sb);
+
+ return rc;
+ }
+ rc = updateSuper(sb, FM_CLEAN);
+ if (rc) {
+ jfs_err("jfs_freeze: updateSuper failed\n");
+ /*
+ * Don't fail here. Everything succeeded except
+ * marking the superblock clean, so there's really
+ * no harm in leaving it frozen for now.
+ */
+ }
}
return 0;
}
@@ -627,13 +648,18 @@ static int jfs_unfreeze(struct super_block *sb)
int rc = 0;
if (!(sb->s_flags & MS_RDONLY)) {
- updateSuper(sb, FM_MOUNT);
- if ((rc = lmLogInit(log)))
- jfs_err("jfs_unlock failed with return code %d", rc);
- else
- txResume(sb);
+ rc = updateSuper(sb, FM_MOUNT);
+ if (rc) {
+ jfs_error(sb, "updateSuper failed\n");
+ goto out;
+ }
+ rc = lmLogInit(log);
+ if (rc)
+ jfs_error(sb, "lmLogInit failed\n");
+out:
+ txResume(sb);
}
- return 0;
+ return rc;
}
static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 42d67f9757bf..d3472f4cd530 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -382,7 +382,7 @@ static int ea_read(struct inode *ip, struct jfs_ea_list *ealist)
nbytes = sizeDXD(&ji->ea);
if (!nbytes) {
- jfs_error(sb, "ea_read: nbytes is 0");
+ jfs_error(sb, "nbytes is 0\n");
return -EIO;
}
@@ -482,7 +482,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
current_blocks = 0;
} else {
if (!(ji->ea.flag & DXD_EXTENT)) {
- jfs_error(sb, "ea_get: invalid ea.flag)");
+ jfs_error(sb, "invalid ea.flag\n");
return -EIO;
}
current_blocks = (ea_size + sb->s_blocksize - 1) >>
@@ -1089,8 +1089,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
}
#ifdef CONFIG_JFS_SECURITY
-int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
+static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
const struct xattr *xattr;
tid_t *tid = fs_info;
diff --git a/fs/libfs.c b/fs/libfs.c
index 916da8c4158b..c3a0837fb861 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -135,60 +135,40 @@ static inline unsigned char dt_type(struct inode *inode)
* both impossible due to the lock on directory.
*/
-int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int dcache_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct dentry *cursor = filp->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+ struct dentry *cursor = file->private_data;
struct list_head *p, *q = &cursor->d_u.d_child;
- ino_t ino;
- int i = filp->f_pos;
- switch (i) {
- case 0:
- ino = dentry->d_inode->i_ino;
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
- break;
- filp->f_pos++;
- i++;
- /* fallthrough */
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
- break;
- filp->f_pos++;
- i++;
- /* fallthrough */
- default:
- spin_lock(&dentry->d_lock);
- if (filp->f_pos == 2)
- list_move(q, &dentry->d_subdirs);
-
- for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
- struct dentry *next;
- next = list_entry(p, struct dentry, d_u.d_child);
- spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
- if (!simple_positive(next)) {
- spin_unlock(&next->d_lock);
- continue;
- }
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+ spin_lock(&dentry->d_lock);
+ if (ctx->pos == 2)
+ list_move(q, &dentry->d_subdirs);
+
+ for (p = q->next; p != &dentry->d_subdirs; p = p->next) {
+ struct dentry *next = list_entry(p, struct dentry, d_u.d_child);
+ spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+ if (!simple_positive(next)) {
+ spin_unlock(&next->d_lock);
+ continue;
+ }
- spin_unlock(&next->d_lock);
- spin_unlock(&dentry->d_lock);
- if (filldir(dirent, next->d_name.name,
- next->d_name.len, filp->f_pos,
- next->d_inode->i_ino,
- dt_type(next->d_inode)) < 0)
- return 0;
- spin_lock(&dentry->d_lock);
- spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
- /* next is still alive */
- list_move(q, p);
- spin_unlock(&next->d_lock);
- p = q;
- filp->f_pos++;
- }
- spin_unlock(&dentry->d_lock);
+ spin_unlock(&next->d_lock);
+ spin_unlock(&dentry->d_lock);
+ if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
+ next->d_inode->i_ino, dt_type(next->d_inode)))
+ return 0;
+ spin_lock(&dentry->d_lock);
+ spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+ /* next is still alive */
+ list_move(q, p);
+ spin_unlock(&next->d_lock);
+ p = q;
+ ctx->pos++;
}
+ spin_unlock(&dentry->d_lock);
return 0;
}
@@ -202,7 +182,7 @@ const struct file_operations simple_dir_operations = {
.release = dcache_dir_close,
.llseek = dcache_dir_lseek,
.read = generic_read_dir,
- .readdir = dcache_readdir,
+ .iterate = dcache_readdir,
.fsync = noop_fsync,
};
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 0796c45d0d4d..01bfe7662751 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -144,6 +144,9 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
timeout);
if (ret < 0)
return -ERESTARTSYS;
+ /* Reset the lock status after a server reboot so we resend */
+ if (block->b_status == nlm_lck_denied_grace_period)
+ block->b_status = nlm_lck_blocked;
req->a_res.status = block->b_status;
return 0;
}
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 7e529c3c45c0..9760ecb9b60f 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -550,9 +550,6 @@ again:
status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT);
if (status < 0)
break;
- /* Resend the blocking lock request after a server reboot */
- if (resp->status == nlm_lck_denied_grace_period)
- continue;
if (resp->status != nlm_lck_blocked)
break;
}
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index a2aa97d45670..10d6c41aecad 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -305,7 +305,7 @@ static int lockd_start_svc(struct svc_serv *serv)
svc_sock_update_bufs(serv);
serv->sv_maxconn = nlm_max_connections;
- nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
+ nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name);
if (IS_ERR(nlmsvc_task)) {
error = PTR_ERR(nlmsvc_task);
printk(KERN_WARNING
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e703318c41df..067778b0ccc9 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -276,7 +276,7 @@ static int nlmsvc_unlink_block(struct nlm_block *block)
dprintk("lockd: unlinking block %p...\n", block);
/* Remove block from list */
- status = posix_unblock_lock(block->b_file->f_file, &block->b_call->a_args.lock.fl);
+ status = posix_unblock_lock(&block->b_call->a_args.lock.fl);
nlmsvc_remove_block(block);
return status;
}
@@ -744,8 +744,20 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
}
+/*
+ * Since NLM uses two "keys" for tracking locks, we need to hash them down
+ * to one for the blocked_hash. Here, we're just xor'ing the host address
+ * with the pid in order to create a key value for picking a hash bucket.
+ */
+static unsigned long
+nlmsvc_owner_key(struct file_lock *fl)
+{
+ return (unsigned long)fl->fl_owner ^ (unsigned long)fl->fl_pid;
+}
+
const struct lock_manager_operations nlmsvc_lock_operations = {
.lm_compare_owner = nlmsvc_same_owner,
+ .lm_owner_key = nlmsvc_owner_key,
.lm_notify = nlmsvc_notify_blocked,
.lm_grant = nlmsvc_grant_deferred,
};
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 97e87415b145..dc5c75930f0f 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -169,7 +169,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
again:
file->f_locks = 0;
- lock_flocks(); /* protects i_flock list */
+ spin_lock(&inode->i_lock);
for (fl = inode->i_flock; fl; fl = fl->fl_next) {
if (fl->fl_lmops != &nlmsvc_lock_operations)
continue;
@@ -181,7 +181,7 @@ again:
if (match(lockhost, host)) {
struct file_lock lock = *fl;
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
lock.fl_type = F_UNLCK;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
@@ -193,7 +193,7 @@ again:
goto again;
}
}
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
return 0;
}
@@ -228,14 +228,14 @@ nlm_file_inuse(struct nlm_file *file)
if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
return 1;
- lock_flocks();
+ spin_lock(&inode->i_lock);
for (fl = inode->i_flock; fl; fl = fl->fl_next) {
if (fl->fl_lmops == &nlmsvc_lock_operations) {
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
return 1;
}
}
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
file->f_locks = 0;
return 0;
}
diff --git a/fs/locks.c b/fs/locks.c
index cb424a4fed71..b27a3005d78d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -126,6 +126,9 @@
#include <linux/time.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
+#include <linux/hashtable.h>
+#include <linux/percpu.h>
+#include <linux/lglock.h>
#include <asm/uaccess.h>
@@ -153,30 +156,53 @@ int lease_break_time = 45;
#define for_each_lock(inode, lockp) \
for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
-static LIST_HEAD(file_lock_list);
-static LIST_HEAD(blocked_list);
-static DEFINE_SPINLOCK(file_lock_lock);
+/*
+ * The global file_lock_list is only used for displaying /proc/locks, so we
+ * keep a list on each CPU, with each list protected by its own spinlock via
+ * the file_lock_lglock. Note that alterations to the list also require that
+ * the relevant i_lock is held.
+ */
+DEFINE_STATIC_LGLOCK(file_lock_lglock);
+static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
/*
- * Protects the two list heads above, plus the inode->i_flock list
+ * The blocked_hash is used to find POSIX lock loops for deadlock detection.
+ * It is protected by blocked_lock_lock.
+ *
+ * We hash locks by lockowner in order to optimize searching for the lock a
+ * particular lockowner is waiting on.
+ *
+ * FIXME: make this value scale via some heuristic? We generally will want more
+ * buckets when we have more lockowners holding locks, but that's a little
+ * difficult to determine without knowing what the workload will look like.
*/
-void lock_flocks(void)
-{
- spin_lock(&file_lock_lock);
-}
-EXPORT_SYMBOL_GPL(lock_flocks);
+#define BLOCKED_HASH_BITS 7
+static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
-void unlock_flocks(void)
-{
- spin_unlock(&file_lock_lock);
-}
-EXPORT_SYMBOL_GPL(unlock_flocks);
+/*
+ * This lock protects the blocked_hash. Generally, if you're accessing it, you
+ * want to be holding this lock.
+ *
+ * In addition, it also protects the fl->fl_block list, and the fl->fl_next
+ * pointer for file_lock structures that are acting as lock requests (in
+ * contrast to those that are acting as records of acquired locks).
+ *
+ * Note that when we acquire this lock in order to change the above fields,
+ * we often hold the i_lock as well. In certain cases, when reading the fields
+ * protected by this lock, we can skip acquiring it iff we already hold the
+ * i_lock.
+ *
+ * In particular, adding an entry to the fl_block list requires that you hold
+ * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting
+ * an entry from the list however only requires the blocked_lock_lock.
+ */
+static DEFINE_SPINLOCK(blocked_lock_lock);
static struct kmem_cache *filelock_cache __read_mostly;
static void locks_init_lock_heads(struct file_lock *fl)
{
- INIT_LIST_HEAD(&fl->fl_link);
+ INIT_HLIST_NODE(&fl->fl_link);
INIT_LIST_HEAD(&fl->fl_block);
init_waitqueue_head(&fl->fl_wait);
}
@@ -210,7 +236,7 @@ void locks_free_lock(struct file_lock *fl)
{
BUG_ON(waitqueue_active(&fl->fl_wait));
BUG_ON(!list_empty(&fl->fl_block));
- BUG_ON(!list_empty(&fl->fl_link));
+ BUG_ON(!hlist_unhashed(&fl->fl_link));
locks_release_private(fl);
kmem_cache_free(filelock_cache, fl);
@@ -484,47 +510,118 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
return fl1->fl_owner == fl2->fl_owner;
}
+/* Must be called with the i_lock held! */
+static inline void
+locks_insert_global_locks(struct file_lock *fl)
+{
+ lg_local_lock(&file_lock_lglock);
+ fl->fl_link_cpu = smp_processor_id();
+ hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
+ lg_local_unlock(&file_lock_lglock);
+}
+
+/* Must be called with the i_lock held! */
+static inline void
+locks_delete_global_locks(struct file_lock *fl)
+{
+ /*
+ * Avoid taking lock if already unhashed. This is safe since this check
+ * is done while holding the i_lock, and new insertions into the list
+ * also require that it be held.
+ */
+ if (hlist_unhashed(&fl->fl_link))
+ return;
+ lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+ hlist_del_init(&fl->fl_link);
+ lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+}
+
+static unsigned long
+posix_owner_key(struct file_lock *fl)
+{
+ if (fl->fl_lmops && fl->fl_lmops->lm_owner_key)
+ return fl->fl_lmops->lm_owner_key(fl);
+ return (unsigned long)fl->fl_owner;
+}
+
+static inline void
+locks_insert_global_blocked(struct file_lock *waiter)
+{
+ hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
+}
+
+static inline void
+locks_delete_global_blocked(struct file_lock *waiter)
+{
+ hash_del(&waiter->fl_link);
+}
+
/* Remove waiter from blocker's block list.
* When blocker ends up pointing to itself then the list is empty.
+ *
+ * Must be called with blocked_lock_lock held.
*/
static void __locks_delete_block(struct file_lock *waiter)
{
+ locks_delete_global_blocked(waiter);
list_del_init(&waiter->fl_block);
- list_del_init(&waiter->fl_link);
waiter->fl_next = NULL;
}
-/*
- */
-void locks_delete_block(struct file_lock *waiter)
+static void locks_delete_block(struct file_lock *waiter)
{
- lock_flocks();
+ spin_lock(&blocked_lock_lock);
__locks_delete_block(waiter);
- unlock_flocks();
+ spin_unlock(&blocked_lock_lock);
}
-EXPORT_SYMBOL(locks_delete_block);
/* Insert waiter into blocker's block list.
* We use a circular list so that processes can be easily woken up in
* the order they blocked. The documentation doesn't require this but
* it seems like the reasonable thing to do.
+ *
+ * Must be called with both the i_lock and blocked_lock_lock held. The fl_block
+ * list itself is protected by the blocked_lock_lock, but by ensuring that the
+ * i_lock is also held on insertions we can avoid taking the blocked_lock_lock
+ * in some cases when we see that the fl_block list is empty.
*/
-static void locks_insert_block(struct file_lock *blocker,
- struct file_lock *waiter)
+static void __locks_insert_block(struct file_lock *blocker,
+ struct file_lock *waiter)
{
BUG_ON(!list_empty(&waiter->fl_block));
- list_add_tail(&waiter->fl_block, &blocker->fl_block);
waiter->fl_next = blocker;
+ list_add_tail(&waiter->fl_block, &blocker->fl_block);
if (IS_POSIX(blocker))
- list_add(&waiter->fl_link, &blocked_list);
+ locks_insert_global_blocked(waiter);
}
-/* Wake up processes blocked waiting for blocker.
- * If told to wait then schedule the processes until the block list
- * is empty, otherwise empty the block list ourselves.
+/* Must be called with i_lock held. */
+static void locks_insert_block(struct file_lock *blocker,
+ struct file_lock *waiter)
+{
+ spin_lock(&blocked_lock_lock);
+ __locks_insert_block(blocker, waiter);
+ spin_unlock(&blocked_lock_lock);
+}
+
+/*
+ * Wake up processes blocked waiting for blocker.
+ *
+ * Must be called with the inode->i_lock held!
*/
static void locks_wake_up_blocks(struct file_lock *blocker)
{
+ /*
+ * Avoid taking global lock if list is empty. This is safe since new
+ * blocked requests are only added to the list under the i_lock, and
+ * the i_lock is always held here. Note that removal from the fl_block
+ * list does not require the i_lock, so we must recheck list_empty()
+ * after acquiring the blocked_lock_lock.
+ */
+ if (list_empty(&blocker->fl_block))
+ return;
+
+ spin_lock(&blocked_lock_lock);
while (!list_empty(&blocker->fl_block)) {
struct file_lock *waiter;
@@ -536,20 +633,23 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
else
wake_up(&waiter->fl_wait);
}
+ spin_unlock(&blocked_lock_lock);
}
/* Insert file lock fl into an inode's lock list at the position indicated
* by pos. At the same time add the lock to the global file lock list.
+ *
+ * Must be called with the i_lock held!
*/
static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
{
- list_add(&fl->fl_link, &file_lock_list);
-
fl->fl_nspid = get_pid(task_tgid(current));
/* insert into file's list */
fl->fl_next = *pos;
*pos = fl;
+
+ locks_insert_global_locks(fl);
}
/*
@@ -557,14 +657,17 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
* Wake up processes that are blocked waiting for this lock,
* notify the FS that the lock has been cleared and
* finally free the lock.
+ *
+ * Must be called with the i_lock held!
*/
static void locks_delete_lock(struct file_lock **thisfl_p)
{
struct file_lock *fl = *thisfl_p;
+ locks_delete_global_locks(fl);
+
*thisfl_p = fl->fl_next;
fl->fl_next = NULL;
- list_del_init(&fl->fl_link);
if (fl->fl_nspid) {
put_pid(fl->fl_nspid);
@@ -625,8 +728,9 @@ void
posix_test_lock(struct file *filp, struct file_lock *fl)
{
struct file_lock *cfl;
+ struct inode *inode = file_inode(filp);
- lock_flocks();
+ spin_lock(&inode->i_lock);
for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) {
if (!IS_POSIX(cfl))
continue;
@@ -639,7 +743,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
fl->fl_pid = pid_vnr(cfl->fl_nspid);
} else
fl->fl_type = F_UNLCK;
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
return;
}
EXPORT_SYMBOL(posix_test_lock);
@@ -676,13 +780,14 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
{
struct file_lock *fl;
- list_for_each_entry(fl, &blocked_list, fl_link) {
+ hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
if (posix_same_owner(fl, block_fl))
return fl->fl_next;
}
return NULL;
}
+/* Must be called with the blocked_lock_lock held! */
static int posix_locks_deadlock(struct file_lock *caller_fl,
struct file_lock *block_fl)
{
@@ -718,7 +823,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
return -ENOMEM;
}
- lock_flocks();
+ spin_lock(&inode->i_lock);
if (request->fl_flags & FL_ACCESS)
goto find_conflict;
@@ -748,9 +853,9 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
* give it the opportunity to lock the file.
*/
if (found) {
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
cond_resched();
- lock_flocks();
+ spin_lock(&inode->i_lock);
}
find_conflict:
@@ -777,7 +882,7 @@ find_conflict:
error = 0;
out:
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
if (new_fl)
locks_free_lock(new_fl);
return error;
@@ -791,7 +896,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
struct file_lock *left = NULL;
struct file_lock *right = NULL;
struct file_lock **before;
- int error, added = 0;
+ int error;
+ bool added = false;
/*
* We may need two file_lock structures for this operation,
@@ -806,7 +912,12 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
new_fl2 = locks_alloc_lock();
}
- lock_flocks();
+ spin_lock(&inode->i_lock);
+ /*
+ * New lock request. Walk all POSIX locks and look for conflicts. If
+ * there are any, either return error or put the request on the
+ * blocker's list of waiters and the global blocked_hash.
+ */
if (request->fl_type != F_UNLCK) {
for_each_lock(inode, before) {
fl = *before;
@@ -819,11 +930,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
error = -EAGAIN;
if (!(request->fl_flags & FL_SLEEP))
goto out;
+ /*
+ * Deadlock detection and insertion into the blocked
+ * locks list must be done while holding the same lock!
+ */
error = -EDEADLK;
- if (posix_locks_deadlock(request, fl))
- goto out;
- error = FILE_LOCK_DEFERRED;
- locks_insert_block(fl, request);
+ spin_lock(&blocked_lock_lock);
+ if (likely(!posix_locks_deadlock(request, fl))) {
+ error = FILE_LOCK_DEFERRED;
+ __locks_insert_block(fl, request);
+ }
+ spin_unlock(&blocked_lock_lock);
goto out;
}
}
@@ -845,7 +962,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
before = &fl->fl_next;
}
- /* Process locks with this owner. */
+ /* Process locks with this owner. */
while ((fl = *before) && posix_same_owner(request, fl)) {
/* Detect adjacent or overlapping regions (if same lock type)
*/
@@ -880,7 +997,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
continue;
}
request = fl;
- added = 1;
+ added = true;
}
else {
/* Processing for different lock types is a bit
@@ -891,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
if (fl->fl_start > request->fl_end)
break;
if (request->fl_type == F_UNLCK)
- added = 1;
+ added = true;
if (fl->fl_start < request->fl_start)
left = fl;
/* If the next lock in the list has a higher end
@@ -921,7 +1038,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
locks_release_private(fl);
locks_copy_private(fl, request);
request = fl;
- added = 1;
+ added = true;
}
}
/* Go on to next lock.
@@ -931,10 +1048,9 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
}
/*
- * The above code only modifies existing locks in case of
- * merging or replacing. If new lock(s) need to be inserted
- * all modifications are done bellow this, so it's safe yet to
- * bail out.
+ * The above code only modifies existing locks in case of merging or
+ * replacing. If new lock(s) need to be inserted all modifications are
+ * done below this, so it's still safe to bail out.
*/
error = -ENOLCK; /* "no luck" */
if (right && left == right && !new_fl2)
@@ -974,7 +1090,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
locks_wake_up_blocks(left);
}
out:
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
/*
* Free any unused locks.
*/
@@ -1049,14 +1165,14 @@ int locks_mandatory_locked(struct inode *inode)
/*
* Search the lock list for this inode for any POSIX locks.
*/
- lock_flocks();
+ spin_lock(&inode->i_lock);
for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
if (!IS_POSIX(fl))
continue;
if (fl->fl_owner != owner)
break;
}
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
return fl ? -EAGAIN : 0;
}
@@ -1199,7 +1315,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
if (IS_ERR(new_fl))
return PTR_ERR(new_fl);
- lock_flocks();
+ spin_lock(&inode->i_lock);
time_out_leases(inode);
@@ -1249,11 +1365,11 @@ restart:
break_time++;
}
locks_insert_block(flock, new_fl);
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
!new_fl->fl_next, break_time);
- lock_flocks();
- __locks_delete_block(new_fl);
+ spin_lock(&inode->i_lock);
+ locks_delete_block(new_fl);
if (error >= 0) {
if (error == 0)
time_out_leases(inode);
@@ -1270,7 +1386,7 @@ restart:
}
out:
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
locks_free_lock(new_fl);
return error;
}
@@ -1323,9 +1439,10 @@ EXPORT_SYMBOL(lease_get_mtime);
int fcntl_getlease(struct file *filp)
{
struct file_lock *fl;
+ struct inode *inode = file_inode(filp);
int type = F_UNLCK;
- lock_flocks();
+ spin_lock(&inode->i_lock);
time_out_leases(file_inode(filp));
for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl);
fl = fl->fl_next) {
@@ -1334,11 +1451,11 @@ int fcntl_getlease(struct file *filp)
break;
}
}
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
return type;
}
-int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
+static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
{
struct file_lock *fl, **before, **my_before = NULL, *lease;
struct dentry *dentry = filp->f_path.dentry;
@@ -1351,7 +1468,7 @@ int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
goto out;
if ((arg == F_WRLCK)
- && ((dentry->d_count > 1)
+ && ((d_count(dentry) > 1)
|| (atomic_read(&inode->i_count) > 1)))
goto out;
@@ -1403,7 +1520,7 @@ out:
return error;
}
-int generic_delete_lease(struct file *filp, struct file_lock **flp)
+static int generic_delete_lease(struct file *filp, struct file_lock **flp)
{
struct file_lock *fl, **before;
struct dentry *dentry = filp->f_path.dentry;
@@ -1428,7 +1545,7 @@ int generic_delete_lease(struct file *filp, struct file_lock **flp)
* The (input) flp->fl_lmops->lm_break function is required
* by break_lease().
*
- * Called with file_lock_lock held.
+ * Called with inode->i_lock held.
*/
int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
{
@@ -1497,11 +1614,12 @@ static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
{
+ struct inode *inode = file_inode(filp);
int error;
- lock_flocks();
+ spin_lock(&inode->i_lock);
error = __vfs_setlease(filp, arg, lease);
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
return error;
}
@@ -1519,6 +1637,7 @@ static int do_fcntl_delete_lease(struct file *filp)
static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
{
struct file_lock *fl, *ret;
+ struct inode *inode = file_inode(filp);
struct fasync_struct *new;
int error;
@@ -1532,10 +1651,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
return -ENOMEM;
}
ret = fl;
- lock_flocks();
+ spin_lock(&inode->i_lock);
error = __vfs_setlease(filp, arg, &ret);
if (error) {
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
locks_free_lock(fl);
goto out_free_fasync;
}
@@ -1552,7 +1671,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
new = NULL;
error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
out_free_fasync:
if (new)
@@ -2076,7 +2195,7 @@ void locks_remove_flock(struct file *filp)
fl.fl_ops->fl_release_private(&fl);
}
- lock_flocks();
+ spin_lock(&inode->i_lock);
before = &inode->i_flock;
while ((fl = *before) != NULL) {
@@ -2094,30 +2213,28 @@ void locks_remove_flock(struct file *filp)
}
before = &fl->fl_next;
}
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
}
/**
* posix_unblock_lock - stop waiting for a file lock
- * @filp: how the file was opened
* @waiter: the lock which was waiting
*
* lockd needs to block waiting for locks.
*/
int
-posix_unblock_lock(struct file *filp, struct file_lock *waiter)
+posix_unblock_lock(struct file_lock *waiter)
{
int status = 0;
- lock_flocks();
+ spin_lock(&blocked_lock_lock);
if (waiter->fl_next)
__locks_delete_block(waiter);
else
status = -ENOENT;
- unlock_flocks();
+ spin_unlock(&blocked_lock_lock);
return status;
}
-
EXPORT_SYMBOL(posix_unblock_lock);
/**
@@ -2140,6 +2257,11 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+struct locks_iterator {
+ int li_cpu;
+ loff_t li_pos;
+};
+
static void lock_get_status(struct seq_file *f, struct file_lock *fl,
loff_t id, char *pfx)
{
@@ -2213,37 +2335,41 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
static int locks_show(struct seq_file *f, void *v)
{
+ struct locks_iterator *iter = f->private;
struct file_lock *fl, *bfl;
- fl = list_entry(v, struct file_lock, fl_link);
+ fl = hlist_entry(v, struct file_lock, fl_link);
- lock_get_status(f, fl, *((loff_t *)f->private), "");
+ lock_get_status(f, fl, iter->li_pos, "");
list_for_each_entry(bfl, &fl->fl_block, fl_block)
- lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
+ lock_get_status(f, bfl, iter->li_pos, " ->");
return 0;
}
static void *locks_start(struct seq_file *f, loff_t *pos)
{
- loff_t *p = f->private;
+ struct locks_iterator *iter = f->private;
- lock_flocks();
- *p = (*pos + 1);
- return seq_list_start(&file_lock_list, *pos);
+ iter->li_pos = *pos + 1;
+ lg_global_lock(&file_lock_lglock);
+ spin_lock(&blocked_lock_lock);
+ return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
}
static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
{
- loff_t *p = f->private;
- ++*p;
- return seq_list_next(v, &file_lock_list, pos);
+ struct locks_iterator *iter = f->private;
+
+ ++iter->li_pos;
+ return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
}
static void locks_stop(struct seq_file *f, void *v)
{
- unlock_flocks();
+ spin_unlock(&blocked_lock_lock);
+ lg_global_unlock(&file_lock_lglock);
}
static const struct seq_operations locks_seq_operations = {
@@ -2255,7 +2381,8 @@ static const struct seq_operations locks_seq_operations = {
static int locks_open(struct inode *inode, struct file *filp)
{
- return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
+ return seq_open_private(filp, &locks_seq_operations,
+ sizeof(struct locks_iterator));
}
static const struct file_operations proc_locks_operations = {
@@ -2290,7 +2417,8 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
{
struct file_lock *fl;
int result = 1;
- lock_flocks();
+
+ spin_lock(&inode->i_lock);
for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
if (IS_POSIX(fl)) {
if (fl->fl_type == F_RDLCK)
@@ -2307,7 +2435,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
result = 0;
break;
}
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
return result;
}
@@ -2330,7 +2458,8 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
{
struct file_lock *fl;
int result = 1;
- lock_flocks();
+
+ spin_lock(&inode->i_lock);
for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
if (IS_POSIX(fl)) {
if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2345,7 +2474,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
result = 0;
break;
}
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
return result;
}
@@ -2353,9 +2482,16 @@ EXPORT_SYMBOL(lock_may_write);
static int __init filelock_init(void)
{
+ int i;
+
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+ lg_lock_init(&file_lock_lglock, "file_lock_lglock");
+
+ for_each_possible_cpu(i)
+ INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
+
return 0;
}
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index e784a217b500..550475ca6a0e 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -32,7 +32,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
bio_vec.bv_len = PAGE_SIZE;
bio_vec.bv_offset = 0;
bio.bi_vcnt = 1;
- bio.bi_idx = 0;
bio.bi_size = PAGE_SIZE;
bio.bi_bdev = bdev;
bio.bi_sector = page->index * (PAGE_SIZE >> 9);
@@ -108,7 +107,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
if (i >= max_pages) {
/* Block layer cannot split bios :( */
bio->bi_vcnt = i;
- bio->bi_idx = 0;
bio->bi_size = i * PAGE_SIZE;
bio->bi_bdev = super->s_bdev;
bio->bi_sector = ofs >> 9;
@@ -136,7 +134,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
unlock_page(page);
}
bio->bi_vcnt = nr_pages;
- bio->bi_idx = 0;
bio->bi_size = nr_pages * PAGE_SIZE;
bio->bi_bdev = super->s_bdev;
bio->bi_sector = ofs >> 9;
@@ -202,7 +199,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
if (i >= max_pages) {
/* Block layer cannot split bios :( */
bio->bi_vcnt = i;
- bio->bi_idx = 0;
bio->bi_size = i * PAGE_SIZE;
bio->bi_bdev = super->s_bdev;
bio->bi_sector = ofs >> 9;
@@ -224,7 +220,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
bio->bi_io_vec[i].bv_offset = 0;
}
bio->bi_vcnt = nr_pages;
- bio->bi_idx = 0;
bio->bi_size = nr_pages * PAGE_SIZE;
bio->bi_bdev = super->s_bdev;
bio->bi_sector = ofs >> 9;
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index b82751082112..6bdc347008f5 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -281,17 +281,23 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
/* FIXME: readdir currently has it's own dir_walk code. I don't see a good
* way to combine the two copies */
-#define IMPLICIT_NODES 2
-static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
+static int logfs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *dir = file_inode(file);
- loff_t pos = file->f_pos - IMPLICIT_NODES;
+ loff_t pos;
struct page *page;
struct logfs_disk_dentry *dd;
- int full;
+ if (ctx->pos < 0)
+ return -EINVAL;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ pos = ctx->pos - 2;
BUG_ON(pos < 0);
- for (;; pos++) {
+ for (;; pos++, ctx->pos++) {
+ bool full;
if (beyond_eof(dir, pos))
break;
if (!logfs_exist_block(dir, pos)) {
@@ -306,42 +312,17 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
dd = kmap(page);
BUG_ON(dd->namelen == 0);
- full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
- pos, be64_to_cpu(dd->ino), dd->type);
+ full = !dir_emit(ctx, (char *)dd->name,
+ be16_to_cpu(dd->namelen),
+ be64_to_cpu(dd->ino), dd->type);
kunmap(page);
page_cache_release(page);
if (full)
break;
}
-
- file->f_pos = pos + IMPLICIT_NODES;
return 0;
}
-static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
-{
- struct inode *inode = file_inode(file);
- ino_t pino = parent_ino(file->f_dentry);
- int err;
-
- if (file->f_pos < 0)
- return -EINVAL;
-
- if (file->f_pos == 0) {
- if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
- return 0;
- file->f_pos++;
- }
- if (file->f_pos == 1) {
- if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
- return 0;
- file->f_pos++;
- }
-
- err = __logfs_readdir(file, buf, filldir);
- return err;
-}
-
static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
{
dd->namelen = cpu_to_be16(name->len);
@@ -814,7 +795,7 @@ const struct inode_operations logfs_dir_iops = {
const struct file_operations logfs_dir_fops = {
.fsync = logfs_fsync,
.unlocked_ioctl = logfs_ioctl,
- .readdir = logfs_readdir,
+ .iterate = logfs_readdir,
.read = generic_read_dir,
.llseek = default_llseek,
};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index c2219a6dd3c8..57914fc32b62 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -159,7 +159,8 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
return __logfs_writepage(page);
}
-static void logfs_invalidatepage(struct page *page, unsigned long offset)
+static void logfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct logfs_block *block = logfs_block(page);
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 038da0991794..d448a777166b 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -884,7 +884,8 @@ static struct logfs_area *alloc_area(struct super_block *sb)
return area;
}
-static void map_invalidatepage(struct page *page, unsigned long l)
+static void map_invalidatepage(struct page *page, unsigned int o,
+ unsigned int l)
{
return;
}
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index a9ed6f36e6ea..dfaf6fa9b7b5 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -16,12 +16,12 @@
typedef struct minix_dir_entry minix_dirent;
typedef struct minix3_dir_entry minix3_dirent;
-static int minix_readdir(struct file *, void *, filldir_t);
+static int minix_readdir(struct file *, struct dir_context *);
const struct file_operations minix_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = minix_readdir,
+ .iterate = minix_readdir,
.fsync = generic_file_fsync,
};
@@ -82,22 +82,23 @@ static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
return (void*)((char*)de + sbi->s_dirsize);
}
-static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int minix_readdir(struct file *file, struct dir_context *ctx)
{
- unsigned long pos = filp->f_pos;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- unsigned offset = pos & ~PAGE_CACHE_MASK;
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
- unsigned long npages = dir_pages(inode);
struct minix_sb_info *sbi = minix_sb(sb);
unsigned chunk_size = sbi->s_dirsize;
- char *name;
- __u32 inumber;
+ unsigned long npages = dir_pages(inode);
+ unsigned long pos = ctx->pos;
+ unsigned offset;
+ unsigned long n;
- pos = (pos + chunk_size-1) & ~(chunk_size-1);
+ ctx->pos = pos = ALIGN(pos, chunk_size);
if (pos >= inode->i_size)
- goto done;
+ return 0;
+
+ offset = pos & ~PAGE_CACHE_MASK;
+ n = pos >> PAGE_CACHE_SHIFT;
for ( ; n < npages; n++, offset = 0) {
char *p, *kaddr, *limit;
@@ -109,6 +110,8 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
p = kaddr+offset;
limit = kaddr + minix_last_byte(inode, n) - chunk_size;
for ( ; p <= limit; p = minix_next_entry(p, sbi)) {
+ const char *name;
+ __u32 inumber;
if (sbi->s_version == MINIX_V3) {
minix3_dirent *de3 = (minix3_dirent *)p;
name = de3->name;
@@ -119,24 +122,17 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
inumber = de->inode;
}
if (inumber) {
- int over;
-
unsigned l = strnlen(name, sbi->s_namelen);
- offset = p - kaddr;
- over = filldir(dirent, name, l,
- (n << PAGE_CACHE_SHIFT) | offset,
- inumber, DT_UNKNOWN);
- if (over) {
+ if (!dir_emit(ctx, name, l,
+ inumber, DT_UNKNOWN)) {
dir_put_page(page);
- goto done;
+ return 0;
}
}
+ ctx->pos += chunk_size;
}
dir_put_page(page);
}
-
-done:
- filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
return 0;
}
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 0db73d9dd668..cd950e2331b6 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -54,6 +54,18 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode,
return error;
}
+static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ int error;
+ struct inode *inode = minix_new_inode(dir, mode, &error);
+ if (inode) {
+ minix_set_inode(inode, 0);
+ mark_inode_dirty(inode);
+ d_tmpfile(dentry, inode);
+ }
+ return error;
+}
+
static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
@@ -254,4 +266,5 @@ const struct inode_operations minix_dir_inode_operations = {
.mknod = minix_mknod,
.rename = minix_rename,
.getattr = minix_getattr,
+ .tmpfile = minix_tmpfile,
};
diff --git a/fs/mount.h b/fs/mount.h
index cd5007980400..64a858143ff9 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -18,6 +18,12 @@ struct mnt_pcp {
int mnt_writers;
};
+struct mountpoint {
+ struct list_head m_hash;
+ struct dentry *m_dentry;
+ int m_count;
+};
+
struct mount {
struct list_head mnt_hash;
struct mount *mnt_parent;
@@ -40,6 +46,7 @@ struct mount {
struct list_head mnt_slave; /* slave list entry */
struct mount *mnt_master; /* slave is on master->mnt_slave_list */
struct mnt_namespace *mnt_ns; /* containing namespace */
+ struct mountpoint *mnt_mp; /* where is it mounted */
#ifdef CONFIG_FSNOTIFY
struct hlist_head mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
diff --git a/fs/namei.c b/fs/namei.c
index 57ae9c8c66bf..b2beee7a733f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1352,7 +1352,7 @@ static int lookup_fast(struct nameidata *nd,
*/
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
- dentry = __d_lookup_rcu(parent, &nd->last, &seq, nd->inode);
+ dentry = __d_lookup_rcu(parent, &nd->last, &seq);
if (!dentry)
goto unlazy;
@@ -1787,8 +1787,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
struct dentry *parent = nd->path.dentry;
nd->flags &= ~LOOKUP_JUMPED;
if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
- err = parent->d_op->d_hash(parent, nd->inode,
- &this);
+ err = parent->d_op->d_hash(parent, &this);
if (err < 0)
break;
}
@@ -1976,7 +1975,7 @@ static int path_lookupat(int dfd, const char *name,
err = complete_walk(nd);
if (!err && nd->flags & LOOKUP_DIRECTORY) {
- if (!nd->inode->i_op->lookup) {
+ if (!can_lookup(nd->inode)) {
path_put(&nd->path);
err = -ENOTDIR;
}
@@ -2121,7 +2120,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
* to use its own hash..
*/
if (base->d_flags & DCACHE_OP_HASH) {
- int err = base->d_op->d_hash(base, base->d_inode, &this);
+ int err = base->d_op->d_hash(base, &this);
if (err < 0)
return ERR_PTR(err);
}
@@ -2690,28 +2689,10 @@ static int do_last(struct nameidata *nd, struct path *path,
nd->flags &= ~LOOKUP_PARENT;
nd->flags |= op->intent;
- switch (nd->last_type) {
- case LAST_DOTDOT:
- case LAST_DOT:
+ if (nd->last_type != LAST_NORM) {
error = handle_dots(nd, nd->last_type);
if (error)
return error;
- /* fallthrough */
- case LAST_ROOT:
- error = complete_walk(nd);
- if (error)
- return error;
- audit_inode(name, nd->path.dentry, 0);
- if (open_flag & O_CREAT) {
- error = -EISDIR;
- goto out;
- }
- goto finish_open;
- case LAST_BIND:
- error = complete_walk(nd);
- if (error)
- return error;
- audit_inode(name, dir, 0);
goto finish_open;
}
@@ -2740,7 +2721,7 @@ static int do_last(struct nameidata *nd, struct path *path,
if (error)
return error;
- audit_inode(name, dir, 0);
+ audit_inode(name, dir, LOOKUP_PARENT);
error = -EISDIR;
/* trailing slashes? */
if (nd->last.name[nd->last.len])
@@ -2841,19 +2822,19 @@ finish_lookup:
}
nd->inode = inode;
/* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
+finish_open:
error = complete_walk(nd);
if (error) {
path_put(&save_parent);
return error;
}
+ audit_inode(name, nd->path.dentry, 0);
error = -EISDIR;
if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
goto out;
error = -ENOTDIR;
- if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
+ if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode))
goto out;
- audit_inode(name, nd->path.dentry, 0);
-finish_open:
if (!S_ISREG(nd->inode->i_mode))
will_truncate = false;
@@ -2920,6 +2901,67 @@ stale_open:
goto retry_lookup;
}
+static int do_tmpfile(int dfd, struct filename *pathname,
+ struct nameidata *nd, int flags,
+ const struct open_flags *op,
+ struct file *file, int *opened)
+{
+ static const struct qstr name = QSTR_INIT("/", 1);
+ struct dentry *dentry, *child;
+ struct inode *dir;
+ int error = path_lookupat(dfd, pathname->name,
+ flags | LOOKUP_DIRECTORY, nd);
+ if (unlikely(error))
+ return error;
+ error = mnt_want_write(nd->path.mnt);
+ if (unlikely(error))
+ goto out;
+ /* we want directory to be writable */
+ error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
+ if (error)
+ goto out2;
+ dentry = nd->path.dentry;
+ dir = dentry->d_inode;
+ if (!dir->i_op->tmpfile) {
+ error = -EOPNOTSUPP;
+ goto out2;
+ }
+ child = d_alloc(dentry, &name);
+ if (unlikely(!child)) {
+ error = -ENOMEM;
+ goto out2;
+ }
+ nd->flags &= ~LOOKUP_DIRECTORY;
+ nd->flags |= op->intent;
+ dput(nd->path.dentry);
+ nd->path.dentry = child;
+ error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode);
+ if (error)
+ goto out2;
+ audit_inode(pathname, nd->path.dentry, 0);
+ error = may_open(&nd->path, op->acc_mode, op->open_flag);
+ if (error)
+ goto out2;
+ file->f_path.mnt = nd->path.mnt;
+ error = finish_open(file, nd->path.dentry, NULL, opened);
+ if (error)
+ goto out2;
+ error = open_check_o_direct(file);
+ if (error) {
+ fput(file);
+ } else if (!(op->open_flag & O_EXCL)) {
+ struct inode *inode = file_inode(file);
+ spin_lock(&inode->i_lock);
+ inode->i_state |= I_LINKABLE;
+ spin_unlock(&inode->i_lock);
+ }
+out2:
+ mnt_drop_write(nd->path.mnt);
+out:
+ path_put(&nd->path);
+ return error;
+}
+
static struct file *path_openat(int dfd, struct filename *pathname,
struct nameidata *nd, const struct open_flags *op, int flags)
{
@@ -2935,6 +2977,11 @@ static struct file *path_openat(int dfd, struct filename *pathname,
file->f_flags = op->open_flag;
+ if (unlikely(file->f_flags & O_TMPFILE)) {
+ error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
+ goto out;
+ }
+
error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
if (unlikely(error))
goto out;
@@ -2987,9 +3034,10 @@ out:
}
struct file *do_filp_open(int dfd, struct filename *pathname,
- const struct open_flags *op, int flags)
+ const struct open_flags *op)
{
struct nameidata nd;
+ int flags = op->lookup_flags;
struct file *filp;
filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
@@ -3001,17 +3049,16 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
}
struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
- const char *name, const struct open_flags *op, int flags)
+ const char *name, const struct open_flags *op)
{
struct nameidata nd;
struct file *file;
struct filename filename = { .name = name };
+ int flags = op->lookup_flags | LOOKUP_ROOT;
nd.root.mnt = mnt;
nd.root.dentry = dentry;
- flags |= LOOKUP_ROOT;
-
if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
return ERR_PTR(-ELOOP);
@@ -3586,12 +3633,18 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
mutex_lock(&inode->i_mutex);
/* Make sure we don't allow creating hardlink to an unlinked file */
- if (inode->i_nlink == 0)
+ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
error = -ENOENT;
else if (max_links && inode->i_nlink >= max_links)
error = -EMLINK;
else
error = dir->i_op->link(old_dentry, dir, new_dentry);
+
+ if (!error && (inode->i_state & I_LINKABLE)) {
+ spin_lock(&inode->i_lock);
+ inode->i_state &= ~I_LINKABLE;
+ spin_unlock(&inode->i_lock);
+ }
mutex_unlock(&inode->i_mutex);
if (!error)
fsnotify_link(dir, inode, new_dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index 341d3f564082..7b1ca9ba0b0a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -21,7 +21,8 @@
#include <linux/fs_struct.h> /* get_fs_root et.al. */
#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
#include <linux/uaccess.h>
-#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
+#include <linux/magic.h>
#include "pnode.h"
#include "internal.h"
@@ -36,6 +37,7 @@ static int mnt_id_start = 0;
static int mnt_group_start = 1;
static struct list_head *mount_hashtable __read_mostly;
+static struct list_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static struct rw_semaphore namespace_sem;
@@ -605,6 +607,51 @@ struct vfsmount *lookup_mnt(struct path *path)
}
}
+static struct mountpoint *new_mountpoint(struct dentry *dentry)
+{
+ struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry);
+ struct mountpoint *mp;
+
+ list_for_each_entry(mp, chain, m_hash) {
+ if (mp->m_dentry == dentry) {
+ /* might be worth a WARN_ON() */
+ if (d_unlinked(dentry))
+ return ERR_PTR(-ENOENT);
+ mp->m_count++;
+ return mp;
+ }
+ }
+
+ mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+ if (!mp)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock(&dentry->d_lock);
+ if (d_unlinked(dentry)) {
+ spin_unlock(&dentry->d_lock);
+ kfree(mp);
+ return ERR_PTR(-ENOENT);
+ }
+ dentry->d_flags |= DCACHE_MOUNTED;
+ spin_unlock(&dentry->d_lock);
+ mp->m_dentry = dentry;
+ mp->m_count = 1;
+ list_add(&mp->m_hash, chain);
+ return mp;
+}
+
+static void put_mountpoint(struct mountpoint *mp)
+{
+ if (!--mp->m_count) {
+ struct dentry *dentry = mp->m_dentry;
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_MOUNTED;
+ spin_unlock(&dentry->d_lock);
+ list_del(&mp->m_hash);
+ kfree(mp);
+ }
+}
+
static inline int check_mnt(struct mount *mnt)
{
return mnt->mnt_ns == current->nsproxy->mnt_ns;
@@ -633,27 +680,6 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
}
/*
- * Clear dentry's mounted state if it has no remaining mounts.
- * vfsmount_lock must be held for write.
- */
-static void dentry_reset_mounted(struct dentry *dentry)
-{
- unsigned u;
-
- for (u = 0; u < HASH_SIZE; u++) {
- struct mount *p;
-
- list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
- if (p->mnt_mountpoint == dentry)
- return;
- }
- }
- spin_lock(&dentry->d_lock);
- dentry->d_flags &= ~DCACHE_MOUNTED;
- spin_unlock(&dentry->d_lock);
-}
-
-/*
* vfsmount lock must be held for write
*/
static void detach_mnt(struct mount *mnt, struct path *old_path)
@@ -664,32 +690,35 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
list_del_init(&mnt->mnt_hash);
- dentry_reset_mounted(old_path->dentry);
+ put_mountpoint(mnt->mnt_mp);
+ mnt->mnt_mp = NULL;
}
/*
* vfsmount lock must be held for write
*/
-void mnt_set_mountpoint(struct mount *mnt, struct dentry *dentry,
+void mnt_set_mountpoint(struct mount *mnt,
+ struct mountpoint *mp,
struct mount *child_mnt)
{
+ mp->m_count++;
mnt_add_count(mnt, 1); /* essentially, that's mntget */
- child_mnt->mnt_mountpoint = dget(dentry);
+ child_mnt->mnt_mountpoint = dget(mp->m_dentry);
child_mnt->mnt_parent = mnt;
- spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_MOUNTED;
- spin_unlock(&dentry->d_lock);
+ child_mnt->mnt_mp = mp;
}
/*
* vfsmount lock must be held for write
*/
-static void attach_mnt(struct mount *mnt, struct path *path)
+static void attach_mnt(struct mount *mnt,
+ struct mount *parent,
+ struct mountpoint *mp)
{
- mnt_set_mountpoint(real_mount(path->mnt), path->dentry, mnt);
+ mnt_set_mountpoint(parent, mp, mnt);
list_add_tail(&mnt->mnt_hash, mount_hashtable +
- hash(path->mnt, path->dentry));
- list_add_tail(&mnt->mnt_child, &real_mount(path->mnt)->mnt_mounts);
+ hash(&parent->mnt, mp->m_dentry));
+ list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}
/*
@@ -1095,11 +1124,23 @@ int may_umount(struct vfsmount *mnt)
EXPORT_SYMBOL(may_umount);
-void release_mounts(struct list_head *head)
+static LIST_HEAD(unmounted); /* protected by namespace_sem */
+
+static void namespace_unlock(void)
{
struct mount *mnt;
- while (!list_empty(head)) {
- mnt = list_first_entry(head, struct mount, mnt_hash);
+ LIST_HEAD(head);
+
+ if (likely(list_empty(&unmounted))) {
+ up_write(&namespace_sem);
+ return;
+ }
+
+ list_splice_init(&unmounted, &head);
+ up_write(&namespace_sem);
+
+ while (!list_empty(&head)) {
+ mnt = list_first_entry(&head, struct mount, mnt_hash);
list_del_init(&mnt->mnt_hash);
if (mnt_has_parent(mnt)) {
struct dentry *dentry;
@@ -1119,11 +1160,16 @@ void release_mounts(struct list_head *head)
}
}
+static inline void namespace_lock(void)
+{
+ down_write(&namespace_sem);
+}
+
/*
* vfsmount lock must be held for write
* namespace_sem must be held for write
*/
-void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
+void umount_tree(struct mount *mnt, int propagate)
{
LIST_HEAD(tmp_list);
struct mount *p;
@@ -1142,20 +1188,20 @@ void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
list_del_init(&p->mnt_child);
if (mnt_has_parent(p)) {
p->mnt_parent->mnt_ghosts++;
- dentry_reset_mounted(p->mnt_mountpoint);
+ put_mountpoint(p->mnt_mp);
+ p->mnt_mp = NULL;
}
change_mnt_propagation(p, MS_PRIVATE);
}
- list_splice(&tmp_list, kill);
+ list_splice(&tmp_list, &unmounted);
}
-static void shrink_submounts(struct mount *mnt, struct list_head *umounts);
+static void shrink_submounts(struct mount *mnt);
static int do_umount(struct mount *mnt, int flags)
{
struct super_block *sb = mnt->mnt.mnt_sb;
int retval;
- LIST_HEAD(umount_list);
retval = security_sb_umount(&mnt->mnt, flags);
if (retval)
@@ -1222,22 +1268,21 @@ static int do_umount(struct mount *mnt, int flags)
return retval;
}
- down_write(&namespace_sem);
+ namespace_lock();
br_write_lock(&vfsmount_lock);
event++;
if (!(flags & MNT_DETACH))
- shrink_submounts(mnt, &umount_list);
+ shrink_submounts(mnt);
retval = -EBUSY;
if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
if (!list_empty(&mnt->mnt_list))
- umount_tree(mnt, 1, &umount_list);
+ umount_tree(mnt, 1);
retval = 0;
}
br_write_unlock(&vfsmount_lock);
- up_write(&namespace_sem);
- release_mounts(&umount_list);
+ namespace_unlock();
return retval;
}
@@ -1310,13 +1355,13 @@ static bool mnt_ns_loop(struct path *path)
* mount namespace loop?
*/
struct inode *inode = path->dentry->d_inode;
- struct proc_inode *ei;
+ struct proc_ns *ei;
struct mnt_namespace *mnt_ns;
if (!proc_ns_inode(inode))
return false;
- ei = PROC_I(inode);
+ ei = get_proc_ns(inode);
if (ei->ns_ops != &mntns_operations)
return false;
@@ -1327,8 +1372,7 @@ static bool mnt_ns_loop(struct path *path)
struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
int flag)
{
- struct mount *res, *p, *q, *r;
- struct path path;
+ struct mount *res, *p, *q, *r, *parent;
if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
return ERR_PTR(-EINVAL);
@@ -1355,25 +1399,22 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
q = q->mnt_parent;
}
p = s;
- path.mnt = &q->mnt;
- path.dentry = p->mnt_mountpoint;
+ parent = q;
q = clone_mnt(p, p->mnt.mnt_root, flag);
if (IS_ERR(q))
goto out;
br_write_lock(&vfsmount_lock);
list_add_tail(&q->mnt_list, &res->mnt_list);
- attach_mnt(q, &path);
+ attach_mnt(q, parent, p->mnt_mp);
br_write_unlock(&vfsmount_lock);
}
}
return res;
out:
if (res) {
- LIST_HEAD(umount_list);
br_write_lock(&vfsmount_lock);
- umount_tree(res, 0, &umount_list);
+ umount_tree(res, 0);
br_write_unlock(&vfsmount_lock);
- release_mounts(&umount_list);
}
return q;
}
@@ -1383,10 +1424,10 @@ out:
struct vfsmount *collect_mounts(struct path *path)
{
struct mount *tree;
- down_write(&namespace_sem);
+ namespace_lock();
tree = copy_tree(real_mount(path->mnt), path->dentry,
CL_COPY_ALL | CL_PRIVATE);
- up_write(&namespace_sem);
+ namespace_unlock();
if (IS_ERR(tree))
return NULL;
return &tree->mnt;
@@ -1394,13 +1435,11 @@ struct vfsmount *collect_mounts(struct path *path)
void drop_collected_mounts(struct vfsmount *mnt)
{
- LIST_HEAD(umount_list);
- down_write(&namespace_sem);
+ namespace_lock();
br_write_lock(&vfsmount_lock);
- umount_tree(real_mount(mnt), 0, &umount_list);
+ umount_tree(real_mount(mnt), 0);
br_write_unlock(&vfsmount_lock);
- up_write(&namespace_sem);
- release_mounts(&umount_list);
+ namespace_unlock();
}
int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
@@ -1509,11 +1548,11 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
* in allocations.
*/
static int attach_recursive_mnt(struct mount *source_mnt,
- struct path *path, struct path *parent_path)
+ struct mount *dest_mnt,
+ struct mountpoint *dest_mp,
+ struct path *parent_path)
{
LIST_HEAD(tree_list);
- struct mount *dest_mnt = real_mount(path->mnt);
- struct dentry *dest_dentry = path->dentry;
struct mount *child, *p;
int err;
@@ -1522,7 +1561,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
if (err)
goto out;
}
- err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
+ err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
if (err)
goto out_cleanup_ids;
@@ -1534,10 +1573,10 @@ static int attach_recursive_mnt(struct mount *source_mnt,
}
if (parent_path) {
detach_mnt(source_mnt, parent_path);
- attach_mnt(source_mnt, path);
+ attach_mnt(source_mnt, dest_mnt, dest_mp);
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
- mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
+ mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
commit_tree(source_mnt);
}
@@ -1556,46 +1595,53 @@ static int attach_recursive_mnt(struct mount *source_mnt,
return err;
}
-static int lock_mount(struct path *path)
+static struct mountpoint *lock_mount(struct path *path)
{
struct vfsmount *mnt;
+ struct dentry *dentry = path->dentry;
retry:
- mutex_lock(&path->dentry->d_inode->i_mutex);
- if (unlikely(cant_mount(path->dentry))) {
- mutex_unlock(&path->dentry->d_inode->i_mutex);
- return -ENOENT;
+ mutex_lock(&dentry->d_inode->i_mutex);
+ if (unlikely(cant_mount(dentry))) {
+ mutex_unlock(&dentry->d_inode->i_mutex);
+ return ERR_PTR(-ENOENT);
}
- down_write(&namespace_sem);
+ namespace_lock();
mnt = lookup_mnt(path);
- if (likely(!mnt))
- return 0;
- up_write(&namespace_sem);
+ if (likely(!mnt)) {
+ struct mountpoint *mp = new_mountpoint(dentry);
+ if (IS_ERR(mp)) {
+ namespace_unlock();
+ mutex_unlock(&dentry->d_inode->i_mutex);
+ return mp;
+ }
+ return mp;
+ }
+ namespace_unlock();
mutex_unlock(&path->dentry->d_inode->i_mutex);
path_put(path);
path->mnt = mnt;
- path->dentry = dget(mnt->mnt_root);
+ dentry = path->dentry = dget(mnt->mnt_root);
goto retry;
}
-static void unlock_mount(struct path *path)
+static void unlock_mount(struct mountpoint *where)
{
- up_write(&namespace_sem);
- mutex_unlock(&path->dentry->d_inode->i_mutex);
+ struct dentry *dentry = where->m_dentry;
+ put_mountpoint(where);
+ namespace_unlock();
+ mutex_unlock(&dentry->d_inode->i_mutex);
}
-static int graft_tree(struct mount *mnt, struct path *path)
+static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
{
if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
return -EINVAL;
- if (S_ISDIR(path->dentry->d_inode->i_mode) !=
+ if (S_ISDIR(mp->m_dentry->d_inode->i_mode) !=
S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
return -ENOTDIR;
- if (d_unlinked(path->dentry))
- return -ENOENT;
-
- return attach_recursive_mnt(mnt, path, NULL);
+ return attach_recursive_mnt(mnt, p, mp, NULL);
}
/*
@@ -1633,7 +1679,7 @@ static int do_change_type(struct path *path, int flag)
if (!type)
return -EINVAL;
- down_write(&namespace_sem);
+ namespace_lock();
if (type == MS_SHARED) {
err = invent_group_ids(mnt, recurse);
if (err)
@@ -1646,7 +1692,7 @@ static int do_change_type(struct path *path, int flag)
br_write_unlock(&vfsmount_lock);
out_unlock:
- up_write(&namespace_sem);
+ namespace_unlock();
return err;
}
@@ -1656,9 +1702,9 @@ static int do_change_type(struct path *path, int flag)
static int do_loopback(struct path *path, const char *old_name,
int recurse)
{
- LIST_HEAD(umount_list);
struct path old_path;
- struct mount *mnt = NULL, *old;
+ struct mount *mnt = NULL, *old, *parent;
+ struct mountpoint *mp;
int err;
if (!old_name || !*old_name)
return -EINVAL;
@@ -1670,17 +1716,19 @@ static int do_loopback(struct path *path, const char *old_name,
if (mnt_ns_loop(&old_path))
goto out;
- err = lock_mount(path);
- if (err)
+ mp = lock_mount(path);
+ err = PTR_ERR(mp);
+ if (IS_ERR(mp))
goto out;
old = real_mount(old_path.mnt);
+ parent = real_mount(path->mnt);
err = -EINVAL;
if (IS_MNT_UNBINDABLE(old))
goto out2;
- if (!check_mnt(real_mount(path->mnt)) || !check_mnt(old))
+ if (!check_mnt(parent) || !check_mnt(old))
goto out2;
if (recurse)
@@ -1693,15 +1741,14 @@ static int do_loopback(struct path *path, const char *old_name,
goto out2;
}
- err = graft_tree(mnt, path);
+ err = graft_tree(mnt, parent, mp);
if (err) {
br_write_lock(&vfsmount_lock);
- umount_tree(mnt, 0, &umount_list);
+ umount_tree(mnt, 0);
br_write_unlock(&vfsmount_lock);
}
out2:
- unlock_mount(path);
- release_mounts(&umount_list);
+ unlock_mount(mp);
out:
path_put(&old_path);
return err;
@@ -1786,6 +1833,7 @@ static int do_move_mount(struct path *path, const char *old_name)
struct path old_path, parent_path;
struct mount *p;
struct mount *old;
+ struct mountpoint *mp;
int err;
if (!old_name || !*old_name)
return -EINVAL;
@@ -1793,8 +1841,9 @@ static int do_move_mount(struct path *path, const char *old_name)
if (err)
return err;
- err = lock_mount(path);
- if (err < 0)
+ mp = lock_mount(path);
+ err = PTR_ERR(mp);
+ if (IS_ERR(mp))
goto out;
old = real_mount(old_path.mnt);
@@ -1804,9 +1853,6 @@ static int do_move_mount(struct path *path, const char *old_name)
if (!check_mnt(p) || !check_mnt(old))
goto out1;
- if (d_unlinked(path->dentry))
- goto out1;
-
err = -EINVAL;
if (old_path.dentry != old_path.mnt->mnt_root)
goto out1;
@@ -1833,7 +1879,7 @@ static int do_move_mount(struct path *path, const char *old_name)
if (p == old)
goto out1;
- err = attach_recursive_mnt(old, path, &parent_path);
+ err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
if (err)
goto out1;
@@ -1841,7 +1887,7 @@ static int do_move_mount(struct path *path, const char *old_name)
* automatically */
list_del_init(&old->mnt_expire);
out1:
- unlock_mount(path);
+ unlock_mount(mp);
out:
if (!err)
path_put(&parent_path);
@@ -1877,21 +1923,24 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
*/
static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
{
+ struct mountpoint *mp;
+ struct mount *parent;
int err;
mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
- err = lock_mount(path);
- if (err)
- return err;
+ mp = lock_mount(path);
+ if (IS_ERR(mp))
+ return PTR_ERR(mp);
+ parent = real_mount(path->mnt);
err = -EINVAL;
- if (unlikely(!check_mnt(real_mount(path->mnt)))) {
+ if (unlikely(!check_mnt(parent))) {
/* that's acceptable only for automounts done in private ns */
if (!(mnt_flags & MNT_SHRINKABLE))
goto unlock;
/* ... and for those we'd better have mountpoint still alive */
- if (!real_mount(path->mnt)->mnt_ns)
+ if (!parent->mnt_ns)
goto unlock;
}
@@ -1906,10 +1955,10 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
goto unlock;
newmnt->mnt.mnt_flags = mnt_flags;
- err = graft_tree(newmnt, path);
+ err = graft_tree(newmnt, parent, mp);
unlock:
- unlock_mount(path);
+ unlock_mount(mp);
return err;
}
@@ -1982,11 +2031,11 @@ int finish_automount(struct vfsmount *m, struct path *path)
fail:
/* remove m from any expiration list it may be on */
if (!list_empty(&mnt->mnt_expire)) {
- down_write(&namespace_sem);
+ namespace_lock();
br_write_lock(&vfsmount_lock);
list_del_init(&mnt->mnt_expire);
br_write_unlock(&vfsmount_lock);
- up_write(&namespace_sem);
+ namespace_unlock();
}
mntput(m);
mntput(m);
@@ -2000,13 +2049,13 @@ fail:
*/
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
- down_write(&namespace_sem);
+ namespace_lock();
br_write_lock(&vfsmount_lock);
list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
br_write_unlock(&vfsmount_lock);
- up_write(&namespace_sem);
+ namespace_unlock();
}
EXPORT_SYMBOL(mnt_set_expiry);
@@ -2019,12 +2068,11 @@ void mark_mounts_for_expiry(struct list_head *mounts)
{
struct mount *mnt, *next;
LIST_HEAD(graveyard);
- LIST_HEAD(umounts);
if (list_empty(mounts))
return;
- down_write(&namespace_sem);
+ namespace_lock();
br_write_lock(&vfsmount_lock);
/* extract from the expiration list every vfsmount that matches the
@@ -2042,12 +2090,10 @@ void mark_mounts_for_expiry(struct list_head *mounts)
while (!list_empty(&graveyard)) {
mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
touch_mnt_namespace(mnt->mnt_ns);
- umount_tree(mnt, 1, &umounts);
+ umount_tree(mnt, 1);
}
br_write_unlock(&vfsmount_lock);
- up_write(&namespace_sem);
-
- release_mounts(&umounts);
+ namespace_unlock();
}
EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
@@ -2104,7 +2150,7 @@ resume:
*
* vfsmount_lock must be held for write
*/
-static void shrink_submounts(struct mount *mnt, struct list_head *umounts)
+static void shrink_submounts(struct mount *mnt)
{
LIST_HEAD(graveyard);
struct mount *m;
@@ -2115,7 +2161,7 @@ static void shrink_submounts(struct mount *mnt, struct list_head *umounts)
m = list_first_entry(&graveyard, struct mount,
mnt_expire);
touch_mnt_namespace(m->mnt_ns);
- umount_tree(m, 1, umounts);
+ umount_tree(m, 1);
}
}
}
@@ -2238,12 +2284,11 @@ long do_mount(const char *dev_name, const char *dir_name,
retval = security_sb_mount(dev_name, &path,
type_page, flags, data_page);
+ if (!retval && !may_mount())
+ retval = -EPERM;
if (retval)
goto dput_out;
- if (!may_mount())
- return -EPERM;
-
/* Default to relatime unless overriden */
if (!(flags & MS_NOATIME))
mnt_flags |= MNT_RELATIME;
@@ -2342,14 +2387,14 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
if (IS_ERR(new_ns))
return new_ns;
- down_write(&namespace_sem);
+ namespace_lock();
/* First pass: copy the tree topology */
copy_flags = CL_COPY_ALL | CL_EXPIRE;
if (user_ns != mnt_ns->user_ns)
copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
- up_write(&namespace_sem);
+ namespace_unlock();
free_mnt_ns(new_ns);
return ERR_CAST(new);
}
@@ -2380,7 +2425,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
p = next_mnt(p, old);
q = next_mnt(q, new);
}
- up_write(&namespace_sem);
+ namespace_unlock();
if (rootmnt)
mntput(rootmnt);
@@ -2418,7 +2463,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
struct mount *mnt = real_mount(m);
mnt->mnt_ns = new_ns;
new_ns->root = mnt;
- list_add(&new_ns->list, &mnt->mnt_list);
+ list_add(&mnt->mnt_list, &new_ns->list);
} else {
mntput(m);
}
@@ -2550,7 +2595,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
const char __user *, put_old)
{
struct path new, old, parent_path, root_parent, root;
- struct mount *new_mnt, *root_mnt;
+ struct mount *new_mnt, *root_mnt, *old_mnt;
+ struct mountpoint *old_mp, *root_mp;
int error;
if (!may_mount())
@@ -2569,14 +2615,16 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
goto out2;
get_fs_root(current->fs, &root);
- error = lock_mount(&old);
- if (error)
+ old_mp = lock_mount(&old);
+ error = PTR_ERR(old_mp);
+ if (IS_ERR(old_mp))
goto out3;
error = -EINVAL;
new_mnt = real_mount(new.mnt);
root_mnt = real_mount(root.mnt);
- if (IS_MNT_SHARED(real_mount(old.mnt)) ||
+ old_mnt = real_mount(old.mnt);
+ if (IS_MNT_SHARED(old_mnt) ||
IS_MNT_SHARED(new_mnt->mnt_parent) ||
IS_MNT_SHARED(root_mnt->mnt_parent))
goto out4;
@@ -2585,37 +2633,37 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
error = -ENOENT;
if (d_unlinked(new.dentry))
goto out4;
- if (d_unlinked(old.dentry))
- goto out4;
error = -EBUSY;
- if (new.mnt == root.mnt ||
- old.mnt == root.mnt)
+ if (new_mnt == root_mnt || old_mnt == root_mnt)
goto out4; /* loop, on the same file system */
error = -EINVAL;
if (root.mnt->mnt_root != root.dentry)
goto out4; /* not a mountpoint */
if (!mnt_has_parent(root_mnt))
goto out4; /* not attached */
+ root_mp = root_mnt->mnt_mp;
if (new.mnt->mnt_root != new.dentry)
goto out4; /* not a mountpoint */
if (!mnt_has_parent(new_mnt))
goto out4; /* not attached */
/* make sure we can reach put_old from new_root */
- if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new))
+ if (!is_path_reachable(old_mnt, old.dentry, &new))
goto out4;
+ root_mp->m_count++; /* pin it so it won't go away */
br_write_lock(&vfsmount_lock);
detach_mnt(new_mnt, &parent_path);
detach_mnt(root_mnt, &root_parent);
/* mount old root on put_old */
- attach_mnt(root_mnt, &old);
+ attach_mnt(root_mnt, old_mnt, old_mp);
/* mount new_root on / */
- attach_mnt(new_mnt, &root_parent);
+ attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
touch_mnt_namespace(current->nsproxy->mnt_ns);
br_write_unlock(&vfsmount_lock);
chroot_fs_refs(&root, &new);
+ put_mountpoint(root_mp);
error = 0;
out4:
- unlock_mount(&old);
+ unlock_mount(old_mp);
if (!error) {
path_put(&root_parent);
path_put(&parent_path);
@@ -2670,14 +2718,17 @@ void __init mnt_init(void)
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
+ mountpoint_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
- if (!mount_hashtable)
+ if (!mount_hashtable || !mountpoint_hashtable)
panic("Failed to allocate mount hash table\n");
printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE);
for (u = 0; u < HASH_SIZE; u++)
INIT_LIST_HEAD(&mount_hashtable[u]);
+ for (u = 0; u < HASH_SIZE; u++)
+ INIT_LIST_HEAD(&mountpoint_hashtable[u]);
br_lock_init(&vfsmount_lock);
@@ -2694,16 +2745,13 @@ void __init mnt_init(void)
void put_mnt_ns(struct mnt_namespace *ns)
{
- LIST_HEAD(umount_list);
-
if (!atomic_dec_and_test(&ns->count))
return;
- down_write(&namespace_sem);
+ namespace_lock();
br_write_lock(&vfsmount_lock);
- umount_tree(ns->root, 0, &umount_list);
+ umount_tree(ns->root, 0);
br_write_unlock(&vfsmount_lock);
- up_write(&namespace_sem);
- release_mounts(&umount_list);
+ namespace_unlock();
free_mnt_ns(ns);
}
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 816326093656..3be047474bfc 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -23,12 +23,12 @@
#include "ncp_fs.h"
-static void ncp_read_volume_list(struct file *, void *, filldir_t,
+static void ncp_read_volume_list(struct file *, struct dir_context *,
struct ncp_cache_control *);
-static void ncp_do_readdir(struct file *, void *, filldir_t,
+static void ncp_do_readdir(struct file *, struct dir_context *,
struct ncp_cache_control *);
-static int ncp_readdir(struct file *, void *, filldir_t);
+static int ncp_readdir(struct file *, struct dir_context *);
static int ncp_create(struct inode *, struct dentry *, umode_t, bool);
static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int);
@@ -49,7 +49,7 @@ const struct file_operations ncp_dir_operations =
{
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = ncp_readdir,
+ .iterate = ncp_readdir,
.unlocked_ioctl = ncp_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ncp_compat_ioctl,
@@ -73,10 +73,8 @@ const struct inode_operations ncp_dir_inode_operations =
* Dentry operations routines
*/
static int ncp_lookup_validate(struct dentry *, unsigned int);
-static int ncp_hash_dentry(const struct dentry *, const struct inode *,
- struct qstr *);
-static int ncp_compare_dentry(const struct dentry *, const struct inode *,
- const struct dentry *, const struct inode *,
+static int ncp_hash_dentry(const struct dentry *, struct qstr *);
+static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
unsigned int, const char *, const struct qstr *);
static int ncp_delete_dentry(const struct dentry *);
@@ -119,11 +117,19 @@ static inline int ncp_case_sensitive(const struct inode *i)
/*
* Note: leave the hash unchanged if the directory
* is case-sensitive.
+ *
+ * Accessing the parent inode can be racy under RCU pathwalking.
+ * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
+ * the callers will handle races.
*/
static int
-ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
- struct qstr *this)
+ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
{
+ struct inode *inode = ACCESS_ONCE(dentry->d_inode);
+
+ if (!inode)
+ return 0;
+
if (!ncp_case_sensitive(inode)) {
struct super_block *sb = dentry->d_sb;
struct nls_table *t;
@@ -140,14 +146,24 @@ ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
return 0;
}
+/*
+ * Accessing the parent inode can be racy under RCU pathwalking.
+ * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
+ * the callers will handle races.
+ */
static int
-ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
+ struct inode *pinode;
+
if (len != name->len)
return 1;
+ pinode = ACCESS_ONCE(parent->d_inode);
+ if (!pinode)
+ return 1;
+
if (ncp_case_sensitive(pinode))
return strncmp(str, name->name, len);
@@ -424,9 +440,9 @@ static time_t ncp_obtain_mtime(struct dentry *dentry)
return ncp_date_dos2unix(i.modifyTime, i.modifyDate);
}
-static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ncp_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct page *page = NULL;
struct ncp_server *server = NCP_SERVER(inode);
@@ -440,7 +456,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- (int) filp->f_pos);
+ (int) ctx->pos);
result = -EIO;
/* Do not generate '.' and '..' when server is dead. */
@@ -448,16 +464,8 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
goto out;
result = 0;
- if (filp->f_pos == 0) {
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
- goto out;
- filp->f_pos = 1;
- }
- if (filp->f_pos == 1) {
- if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR))
- goto out;
- filp->f_pos = 2;
- }
+ if (!dir_emit_dots(file, ctx))
+ goto out;
page = grab_cache_page(&inode->i_data, 0);
if (!page)
@@ -469,7 +477,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (!PageUptodate(page) || !ctl.head.eof)
goto init_cache;
- if (filp->f_pos == 2) {
+ if (ctx->pos == 2) {
if (jiffies - ctl.head.time >= NCP_MAX_AGE(server))
goto init_cache;
@@ -479,10 +487,10 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
goto init_cache;
}
- if (filp->f_pos > ctl.head.end)
+ if (ctx->pos > ctl.head.end)
goto finished;
- ctl.fpos = filp->f_pos + (NCP_DIRCACHE_START - 2);
+ ctl.fpos = ctx->pos + (NCP_DIRCACHE_START - 2);
ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE;
ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE;
@@ -497,21 +505,21 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
while (ctl.idx < NCP_DIRCACHE_SIZE) {
struct dentry *dent;
- int res;
+ bool over;
dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx],
- dentry, filp->f_pos);
+ dentry, ctx->pos);
if (!dent)
goto invalid_cache;
- res = filldir(dirent, dent->d_name.name,
- dent->d_name.len, filp->f_pos,
+ over = !dir_emit(ctx, dent->d_name.name,
+ dent->d_name.len,
dent->d_inode->i_ino, DT_UNKNOWN);
dput(dent);
- if (res)
+ if (over)
goto finished;
- filp->f_pos += 1;
+ ctx->pos += 1;
ctl.idx += 1;
- if (filp->f_pos > ctl.head.end)
+ if (ctx->pos > ctl.head.end)
goto finished;
}
if (ctl.page) {
@@ -548,9 +556,9 @@ init_cache:
ctl.valid = 1;
read_really:
if (ncp_is_server_root(inode)) {
- ncp_read_volume_list(filp, dirent, filldir, &ctl);
+ ncp_read_volume_list(file, ctx, &ctl);
} else {
- ncp_do_readdir(filp, dirent, filldir, &ctl);
+ ncp_do_readdir(file, ctx, &ctl);
}
ctl.head.end = ctl.fpos - 1;
ctl.head.eof = ctl.valid;
@@ -573,11 +581,11 @@ out:
}
static int
-ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+ncp_fill_cache(struct file *file, struct dir_context *ctx,
struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
int inval_childs)
{
- struct dentry *newdent, *dentry = filp->f_path.dentry;
+ struct dentry *newdent, *dentry = file->f_path.dentry;
struct inode *dir = dentry->d_inode;
struct ncp_cache_control ctl = *ctrl;
struct qstr qname;
@@ -666,15 +674,13 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
end_advance:
if (!valid)
ctl.valid = 0;
- if (!ctl.filled && (ctl.fpos == filp->f_pos)) {
- if (!ino)
- ino = find_inode_number(dentry, &qname);
+ if (!ctl.filled && (ctl.fpos == ctx->pos)) {
if (!ino)
ino = iunique(dir->i_sb, 2);
- ctl.filled = filldir(dirent, qname.name, qname.len,
- filp->f_pos, ino, DT_UNKNOWN);
+ ctl.filled = !dir_emit(ctx, qname.name, qname.len,
+ ino, DT_UNKNOWN);
if (!ctl.filled)
- filp->f_pos += 1;
+ ctx->pos += 1;
}
ctl.fpos += 1;
ctl.idx += 1;
@@ -683,10 +689,10 @@ end_advance:
}
static void
-ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
+ncp_read_volume_list(struct file *file, struct dir_context *ctx,
struct ncp_cache_control *ctl)
{
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct ncp_server *server = NCP_SERVER(inode);
struct ncp_volume_info info;
@@ -694,7 +700,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
int i;
DPRINTK("ncp_read_volume_list: pos=%ld\n",
- (unsigned long) filp->f_pos);
+ (unsigned long) ctx->pos);
for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
int inval_dentry;
@@ -715,16 +721,16 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
}
inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
entry.volume = entry.i.volNumber;
- if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry))
+ if (!ncp_fill_cache(file, ctx, ctl, &entry, inval_dentry))
return;
}
}
static void
-ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
+ncp_do_readdir(struct file *file, struct dir_context *ctx,
struct ncp_cache_control *ctl)
{
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file->f_path.dentry;
struct inode *dir = dentry->d_inode;
struct ncp_server *server = NCP_SERVER(dir);
struct nw_search_sequence seq;
@@ -736,7 +742,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- (unsigned long) filp->f_pos);
+ (unsigned long) ctx->pos);
PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n",
dentry->d_name.name, NCP_FINFO(dir)->volNumber,
NCP_FINFO(dir)->dirEntNum);
@@ -778,7 +784,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
rpl += onerpl;
rpls -= onerpl;
entry.volume = entry.i.volNumber;
- if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0))
+ if (!ncp_fill_cache(file, ctx, ctl, &entry, 0))
break;
}
} while (more);
@@ -1029,15 +1035,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
DPRINTK("ncp_rmdir: removing %s/%s\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
- /*
- * fail with EBUSY if there are still references to this
- * directory.
- */
- dentry_unhash(dentry);
- error = -EBUSY;
- if (!d_unhashed(dentry))
- goto out;
-
len = sizeof(__name);
error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
dentry->d_name.len, !ncp_preserve_case(dir));
@@ -1140,17 +1137,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) {
- /*
- * fail with EBUSY if there are still references to this
- * directory.
- */
- dentry_unhash(new_dentry);
- error = -EBUSY;
- if (!d_unhashed(new_dentry))
- goto out;
- }
-
ncp_age_dentry(server, old_dentry);
ncp_age_dentry(server, new_dentry);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 26910c8154da..4659da67e7f6 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -403,18 +403,24 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)
switch (optval) {
case 'u':
data->uid = make_kuid(current_user_ns(), optint);
- if (!uid_valid(data->uid))
+ if (!uid_valid(data->uid)) {
+ ret = -EINVAL;
goto err;
+ }
break;
case 'g':
data->gid = make_kgid(current_user_ns(), optint);
- if (!gid_valid(data->gid))
+ if (!gid_valid(data->gid)) {
+ ret = -EINVAL;
goto err;
+ }
break;
case 'o':
data->mounted_uid = make_kuid(current_user_ns(), optint);
- if (!uid_valid(data->mounted_uid))
+ if (!uid_valid(data->mounted_uid)) {
+ ret = -EINVAL;
goto err;
+ }
break;
case 'm':
data->file_mode = optint;
@@ -891,6 +897,10 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
if (!server) /* How this could happen? */
goto out;
+ result = -EPERM;
+ if (IS_DEADDIR(dentry->d_inode))
+ goto out;
+
/* ageing the dentry to force validation */
ncp_age_dentry(server, dentry);
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index ee24df5af1f9..3c5dd55d284c 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
/* we do not support files bigger than 4GB... We eventually
supports just 4GB... */
- if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff
+ if (vma_pages(vma) + vma->vm_pgoff
> (1U << (32 - PAGE_SHIFT)))
return -EFBIG;
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 13ca196385f5..b5e80b0af315 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -104,6 +104,15 @@ config NFS_V4_1
If unsure, say N.
+config NFS_V4_2
+ bool "NFS client support for NFSv4.2"
+ depends on NFS_V4_1
+ help
+ This option enables support for minor version 2 of the NFSv4 protocol
+ in the kernel's NFS client.
+
+ If unsure, say N.
+
config PNFS_FILE_LAYOUT
tristate
depends on NFS_V4_1
@@ -131,6 +140,11 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
If the NFS client is unchanged from the upstream kernel, this
option should be set to the default "kernel.org".
+config NFS_V4_SECURITY_LABEL
+ bool
+ depends on NFS_V4_2 && SECURITY
+ default y
+
config ROOT_NFS
bool "Root file system on NFS"
depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index cce2c057bd2d..e0bb048e9576 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,8 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
direct.o pagelist.o read.o symlink.o unlink.o \
- write.o namespace.o mount_clnt.o \
- dns_resolve.o cache_lib.o
+ write.o namespace.o mount_clnt.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
@@ -22,7 +21,8 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
obj-$(CONFIG_NFS_V4) += nfsv4.o
nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
- nfs4namespace.o nfs4getroot.o nfs4client.o
+ nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o
+nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 434b93ec0970..e242bbf72972 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1089,9 +1089,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
dev->pgbase = 0;
dev->pglen = PAGE_SIZE * max_pages;
dev->mincount = 0;
+ dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
- rc = nfs4_proc_getdeviceinfo(server, dev);
+ rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
dprintk("%s getdevice info returns %d\n", __func__, rc);
if (rc) {
rv = ERR_PTR(rc);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index f4891bde8851..8485978993e8 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -173,7 +173,7 @@ struct bl_msg_hdr {
/* blocklayoutdev.c */
ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-int nfs4_blkdev_put(struct block_device *bdev);
+void nfs4_blkdev_put(struct block_device *bdev);
struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
struct pnfs_device *dev);
int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index a86c5bdad9e3..04303b5c9361 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -56,11 +56,11 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
/*
* Release the block device
*/
-int nfs4_blkdev_put(struct block_device *bdev)
+void nfs4_blkdev_put(struct block_device *bdev)
{
dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
MINOR(bdev->bd_dev));
- return blkdev_put(bdev, FMODE_READ);
+ blkdev_put(bdev, FMODE_READ);
}
ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
index 6fc7b5cae92b..8999cfddd866 100644
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -88,14 +88,8 @@ out:
*/
static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
{
- int rv;
-
dprintk("%s Releasing\n", __func__);
- rv = nfs4_blkdev_put(bdev->bm_mdev);
- if (rv)
- printk(KERN_ERR "NFS: %s nfs4_blkdev_put returns %d\n",
- __func__, rv);
-
+ nfs4_blkdev_put(bdev->bm_mdev);
dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 5088b57b078a..67cd73213168 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -125,6 +125,9 @@ nfs41_callback_svc(void *vrqstp)
set_freezable();
while (!kthread_should_stop()) {
+ if (try_to_freeze())
+ continue;
+
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
@@ -208,7 +211,6 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
struct svc_rqst *rqstp;
int (*callback_svc)(void *vrqstp);
struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
- char svc_name[12];
int ret;
nfs_callback_bc_serv(minorversion, xprt, serv);
@@ -232,10 +234,10 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
svc_sock_update_bufs(serv);
- sprintf(svc_name, "nfsv4.%u-svc", minorversion);
cb_info->serv = serv;
cb_info->rqst = rqstp;
- cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name);
+ cb_info->task = kthread_run(callback_svc, cb_info->rqst,
+ "nfsv4.%u-svc", minorversion);
if (IS_ERR(cb_info->task)) {
ret = PTR_ERR(cb_info->task);
svc_exit_thread(cb_info->rqst);
@@ -279,6 +281,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
ret = nfs4_callback_up_net(serv, net);
break;
case 1:
+ case 2:
ret = nfs41_callback_up_net(serv, net);
break;
default:
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index efd54f0a4c46..84326e9fb47a 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -32,6 +32,8 @@ enum nfs4_callback_opnum {
OP_CB_WANTS_CANCELLED = 12,
OP_CB_NOTIFY_LOCK = 13,
OP_CB_NOTIFY_DEVICEID = 14,
+/* Callback operations new to NFSv4.2 */
+ OP_CB_OFFLOAD = 15,
OP_CB_ILLEGAL = 10044,
};
@@ -39,6 +41,7 @@ struct cb_process_state {
__be32 drc_status;
struct nfs_client *clp;
u32 slotid;
+ u32 minorversion;
struct net *net;
};
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2960512792c2..e6ebc4c38c81 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -406,7 +406,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
int i;
__be32 status = htonl(NFS4ERR_BADSESSION);
- clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid);
+ clp = nfs4_find_client_sessionid(cps->net, args->csa_addr,
+ &args->csa_sessionid, cps->minorversion);
if (clp == NULL)
goto out;
@@ -414,7 +415,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
spin_lock(&tbl->slot_tbl_lock);
/* state manager is resetting the session */
- if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
+ if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
spin_unlock(&tbl->slot_tbl_lock);
status = htonl(NFS4ERR_DELAY);
/* Return NFS4ERR_BADSESSION if we're draining the session
@@ -500,7 +501,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
&args->craa_type_mask))
pnfs_recall_all_layouts(cps->clp);
if (flags)
- nfs_expire_all_delegation_types(cps->clp, flags);
+ nfs_expire_unused_delegation_types(cps->clp, flags);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 59461c957d9d..f4ccfe6521ec 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -166,9 +166,9 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
hdr->minorversion = ntohl(*p++);
- /* Check minor version is zero or one. */
- if (hdr->minorversion <= 1) {
- hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
+ /* Check for minor version support */
+ if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) {
+ hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */
} else {
pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
"illegal minor version %u!\n",
@@ -763,7 +763,7 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
* A single slot, so highest used slotid is either 0 or -1
*/
tbl->highest_used_slotid = NFS4_NO_SLOT;
- nfs4_session_drain_complete(session, tbl);
+ nfs4_slot_tbl_drain_complete(tbl);
spin_unlock(&tbl->slot_tbl_lock);
}
@@ -786,6 +786,26 @@ static void nfs4_cb_free_slot(struct cb_process_state *cps)
}
#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+ __be32 status = preprocess_nfs41_op(nop, op_nr, op);
+ if (status != htonl(NFS4ERR_OP_ILLEGAL))
+ return status;
+
+ if (op_nr == OP_CB_OFFLOAD)
+ return htonl(NFS4ERR_NOTSUPP);
+ return htonl(NFS4ERR_OP_ILLEGAL);
+}
+#else /* CONFIG_NFS_V4_2 */
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+ return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+}
+#endif /* CONFIG_NFS_V4_2 */
+
static __be32
preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
{
@@ -801,8 +821,7 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
return htonl(NFS_OK);
}
-static __be32 process_op(uint32_t minorversion, int nop,
- struct svc_rqst *rqstp,
+static __be32 process_op(int nop, struct svc_rqst *rqstp,
struct xdr_stream *xdr_in, void *argp,
struct xdr_stream *xdr_out, void *resp,
struct cb_process_state *cps)
@@ -819,10 +838,22 @@ static __be32 process_op(uint32_t minorversion, int nop,
return status;
dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
- __func__, minorversion, nop, op_nr);
+ __func__, cps->minorversion, nop, op_nr);
+
+ switch (cps->minorversion) {
+ case 0:
+ status = preprocess_nfs4_op(op_nr, &op);
+ break;
+ case 1:
+ status = preprocess_nfs41_op(nop, op_nr, &op);
+ break;
+ case 2:
+ status = preprocess_nfs42_op(nop, op_nr, &op);
+ break;
+ default:
+ status = htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+ }
- status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) :
- preprocess_nfs4_op(op_nr, &op);
if (status == htonl(NFS4ERR_OP_ILLEGAL))
op_nr = OP_CB_ILLEGAL;
if (status)
@@ -885,14 +916,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
return rpc_drop_reply;
}
+ cps.minorversion = hdr_arg.minorversion;
hdr_res.taglen = hdr_arg.taglen;
hdr_res.tag = hdr_arg.tag;
if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
return rpc_system_err;
while (status == 0 && nops != hdr_arg.nops) {
- status = process_op(hdr_arg.minorversion, nops, rqstp,
- &xdr_in, argp, &xdr_out, resp, &cps);
+ status = process_op(nops, rqstp, &xdr_in,
+ argp, &xdr_out, resp, &cps);
nops++;
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 84d8eae203a7..340b1eff0267 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -593,6 +593,8 @@ int nfs_create_rpc_client(struct nfs_client *clp,
args.flags |= RPC_CLNT_CREATE_DISCRTRY;
if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))
args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+ if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS;
if (!IS_ERR(clp->cl_rpcclient))
return 0;
@@ -751,8 +753,6 @@ static int nfs_init_server(struct nfs_server *server,
data->timeo, data->retrans);
if (data->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
- if (server->options & NFS_OPTION_MIGRATION)
- set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
@@ -1074,7 +1074,7 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info,
}
if (!(fattr->valid & NFS_ATTR_FATTR)) {
- error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr);
+ error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL);
if (error < 0) {
dprintk("nfs_create_server: getattr error = %d\n", -error);
goto error;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 6390a4b5fee7..7ec4814e298d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -64,31 +64,29 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags)
return ret;
}
-static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state)
+static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
{
struct inode *inode = state->inode;
struct file_lock *fl;
int status = 0;
if (inode->i_flock == NULL)
- return 0;
-
- if (inode->i_flock == NULL)
goto out;
- /* Protect inode->i_flock using the file locks lock */
- lock_flocks();
+
+ /* Protect inode->i_flock using the i_lock */
+ spin_lock(&inode->i_lock);
for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
continue;
if (nfs_file_open_context(fl->fl_file) != ctx)
continue;
- unlock_flocks();
- status = nfs4_lock_delegation_recall(state, fl);
+ spin_unlock(&inode->i_lock);
+ status = nfs4_lock_delegation_recall(fl, state, stateid);
if (status < 0)
goto out;
- lock_flocks();
+ spin_lock(&inode->i_lock);
}
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
out:
return status;
}
@@ -120,7 +118,7 @@ again:
seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
err = nfs4_open_delegation_recall(ctx, state, stateid);
if (!err)
- err = nfs_delegation_claim_locks(ctx, state);
+ err = nfs_delegation_claim_locks(ctx, state, stateid);
if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
err = -EAGAIN;
mutex_unlock(&sp->so_delegreturn_mutex);
@@ -389,6 +387,24 @@ out:
return err;
}
+static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
+{
+ bool ret = false;
+
+ if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
+ ret = true;
+ if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) {
+ struct inode *inode;
+
+ spin_lock(&delegation->lock);
+ inode = delegation->inode;
+ if (inode && list_empty(&NFS_I(inode)->open_files))
+ ret = true;
+ spin_unlock(&delegation->lock);
+ }
+ return ret;
+}
+
/**
* nfs_client_return_marked_delegations - return previously marked delegations
* @clp: nfs_client to process
@@ -411,8 +427,7 @@ restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry_rcu(delegation, &server->delegations,
super_list) {
- if (!test_and_clear_bit(NFS_DELEGATION_RETURN,
- &delegation->flags))
+ if (!nfs_delegation_need_return(delegation))
continue;
inode = nfs_delegation_grab_inode(delegation);
if (inode == NULL)
@@ -471,6 +486,13 @@ int nfs4_inode_return_delegation(struct inode *inode)
return err;
}
+static void nfs_mark_return_if_closed_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation)
+{
+ set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+}
+
static void nfs_mark_return_delegation(struct nfs_server *server,
struct nfs_delegation *delegation)
{
@@ -478,6 +500,45 @@ static void nfs_mark_return_delegation(struct nfs_server *server,
set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
}
+static bool nfs_server_mark_return_all_delegations(struct nfs_server *server)
+{
+ struct nfs_delegation *delegation;
+ bool ret = false;
+
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ nfs_mark_return_delegation(server, delegation);
+ ret = true;
+ }
+ return ret;
+}
+
+static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_server_mark_return_all_delegations(server);
+ rcu_read_unlock();
+}
+
+static void nfs_delegation_run_state_manager(struct nfs_client *clp)
+{
+ if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
+ nfs4_schedule_state_manager(clp);
+}
+
+/**
+ * nfs_expire_all_delegations
+ * @clp: client to process
+ *
+ */
+void nfs_expire_all_delegations(struct nfs_client *clp)
+{
+ nfs_client_mark_return_all_delegations(clp);
+ nfs_delegation_run_state_manager(clp);
+}
+
/**
* nfs_super_return_all_delegations - return delegations for one superblock
* @sb: sb to process
@@ -486,24 +547,22 @@ static void nfs_mark_return_delegation(struct nfs_server *server,
void nfs_server_return_all_delegations(struct nfs_server *server)
{
struct nfs_client *clp = server->nfs_client;
- struct nfs_delegation *delegation;
+ bool need_wait;
if (clp == NULL)
return;
rcu_read_lock();
- list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
- spin_lock(&delegation->lock);
- set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
- spin_unlock(&delegation->lock);
- }
+ need_wait = nfs_server_mark_return_all_delegations(server);
rcu_read_unlock();
- if (nfs_client_return_marked_delegations(clp) != 0)
+ if (need_wait) {
nfs4_schedule_state_manager(clp);
+ nfs4_wait_clnt_recover(clp);
+ }
}
-static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
+static void nfs_mark_return_unused_delegation_types(struct nfs_server *server,
fmode_t flags)
{
struct nfs_delegation *delegation;
@@ -512,27 +571,21 @@ static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
continue;
if (delegation->type & flags)
- nfs_mark_return_delegation(server, delegation);
+ nfs_mark_return_if_closed_delegation(server, delegation);
}
}
-static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
+static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *clp,
fmode_t flags)
{
struct nfs_server *server;
rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
- nfs_mark_return_all_delegation_types(server, flags);
+ nfs_mark_return_unused_delegation_types(server, flags);
rcu_read_unlock();
}
-static void nfs_delegation_run_state_manager(struct nfs_client *clp)
-{
- if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
- nfs4_schedule_state_manager(clp);
-}
-
void nfs_remove_bad_delegation(struct inode *inode)
{
struct nfs_delegation *delegation;
@@ -546,27 +599,17 @@ void nfs_remove_bad_delegation(struct inode *inode)
EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
/**
- * nfs_expire_all_delegation_types
+ * nfs_expire_unused_delegation_types
* @clp: client to process
* @flags: delegation types to expire
*
*/
-void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
+void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags)
{
- nfs_client_mark_return_all_delegation_types(clp, flags);
+ nfs_client_mark_return_unused_delegation_types(clp, flags);
nfs_delegation_run_state_manager(clp);
}
-/**
- * nfs_expire_all_delegations
- * @clp: client to process
- *
- */
-void nfs_expire_all_delegations(struct nfs_client *clp)
-{
- nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
-}
-
static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
{
struct nfs_delegation *delegation;
@@ -574,7 +617,7 @@ static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
continue;
- nfs_mark_return_delegation(server, delegation);
+ nfs_mark_return_if_closed_delegation(server, delegation);
}
}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index d54d4fca6793..9a79c7a99d6d 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -28,6 +28,7 @@ struct nfs_delegation {
enum {
NFS_DELEGATION_NEED_RECLAIM = 0,
NFS_DELEGATION_RETURN,
+ NFS_DELEGATION_RETURN_IF_CLOSED,
NFS_DELEGATION_REFERENCED,
NFS_DELEGATION_RETURNING,
};
@@ -41,7 +42,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode);
struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
void nfs_server_return_all_delegations(struct nfs_server *);
void nfs_expire_all_delegations(struct nfs_client *clp);
-void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
+void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags);
void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
int nfs_client_return_marked_delegations(struct nfs_client *clp);
int nfs_delegations_present(struct nfs_client *clp);
@@ -53,7 +54,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
/* NFSv4 delegation-related procedures */
int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
-int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
+int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f23f455be42b..0fac2cb1ea18 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,6 +33,7 @@
#include <linux/pagevec.h>
#include <linux/namei.h>
#include <linux/mount.h>
+#include <linux/swap.h>
#include <linux/sched.h>
#include <linux/kmemleak.h>
#include <linux/xattr.h>
@@ -46,7 +47,7 @@
static int nfs_opendir(struct inode *, struct file *);
static int nfs_closedir(struct inode *, struct file *);
-static int nfs_readdir(struct file *, void *, filldir_t);
+static int nfs_readdir(struct file *, struct dir_context *);
static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
static void nfs_readdir_clear_array(struct page*);
@@ -54,7 +55,7 @@ static void nfs_readdir_clear_array(struct page*);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
.read = generic_read_dir,
- .readdir = nfs_readdir,
+ .iterate = nfs_readdir,
.open = nfs_opendir,
.release = nfs_closedir,
.fsync = nfs_fsync_dir,
@@ -147,6 +148,7 @@ typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
typedef struct {
struct file *file;
struct page *page;
+ struct dir_context *ctx;
unsigned long page_index;
u64 *dir_cookie;
u64 last_cookie;
@@ -252,7 +254,7 @@ out:
static
int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
{
- loff_t diff = desc->file->f_pos - desc->current_index;
+ loff_t diff = desc->ctx->pos - desc->current_index;
unsigned int index;
if (diff < 0)
@@ -289,7 +291,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
|| (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
ctx->duped = 0;
ctx->attr_gencount = nfsi->attr_gencount;
- } else if (new_pos < desc->file->f_pos) {
+ } else if (new_pos < desc->ctx->pos) {
if (ctx->duped > 0
&& ctx->dup_cookie == *desc->dir_cookie) {
if (printk_ratelimit()) {
@@ -307,7 +309,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
ctx->dup_cookie = *desc->dir_cookie;
ctx->duped = -1;
}
- desc->file->f_pos = new_pos;
+ desc->ctx->pos = new_pos;
desc->cache_entry_index = i;
return 0;
}
@@ -405,13 +407,13 @@ different:
}
static
-bool nfs_use_readdirplus(struct inode *dir, struct file *filp)
+bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
{
if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
return false;
if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
return true;
- if (filp->f_pos == 0)
+ if (ctx->pos == 0)
return true;
return false;
}
@@ -435,6 +437,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
struct dentry *alias;
struct inode *dir = parent->d_inode;
struct inode *inode;
+ int status;
if (filename.name[0] == '.') {
if (filename.len == 1)
@@ -447,7 +450,9 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
dentry = d_lookup(parent, &filename);
if (dentry != NULL) {
if (nfs_same_file(dentry, entry)) {
- nfs_refresh_inode(dentry->d_inode, entry->fattr);
+ status = nfs_refresh_inode(dentry->d_inode, entry->fattr);
+ if (!status)
+ nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);
goto out;
} else {
if (d_invalidate(dentry) != 0)
@@ -460,7 +465,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
if (dentry == NULL)
return;
- inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+ inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
if (IS_ERR(inode))
goto out;
@@ -585,10 +590,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
if (entry.fh == NULL || entry.fattr == NULL)
goto out;
+ entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+ if (IS_ERR(entry.label)) {
+ status = PTR_ERR(entry.label);
+ goto out;
+ }
+
array = nfs_readdir_get_array(page);
if (IS_ERR(array)) {
status = PTR_ERR(array);
- goto out;
+ goto out_label_free;
}
memset(array, 0, sizeof(struct nfs_cache_array));
array->eof_index = -1;
@@ -614,6 +625,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
nfs_readdir_free_large_page(pages_ptr, pages, array_size);
out_release_array:
nfs_readdir_release_array(page);
+out_label_free:
+ nfs4_label_free(entry.label);
out:
nfs_free_fattr(entry.fattr);
nfs_free_fhandle(entry.fh);
@@ -702,8 +715,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
* Once we've found the start of the dirent within a page: fill 'er up...
*/
static
-int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
- filldir_t filldir)
+int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
{
struct file *file = desc->file;
int i = 0;
@@ -721,13 +733,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
struct nfs_cache_array_entry *ent;
ent = &array->array[i];
- if (filldir(dirent, ent->string.name, ent->string.len,
- file->f_pos, nfs_compat_user_ino64(ent->ino),
- ent->d_type) < 0) {
+ if (!dir_emit(desc->ctx, ent->string.name, ent->string.len,
+ nfs_compat_user_ino64(ent->ino), ent->d_type)) {
desc->eof = 1;
break;
}
- file->f_pos++;
+ desc->ctx->pos++;
if (i < (array->size-1))
*desc->dir_cookie = array->array[i+1].cookie;
else
@@ -759,8 +770,7 @@ out:
* directory in the page cache by the time we get here.
*/
static inline
-int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
- filldir_t filldir)
+int uncached_readdir(nfs_readdir_descriptor_t *desc)
{
struct page *page = NULL;
int status;
@@ -785,7 +795,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
if (status < 0)
goto out_release;
- status = nfs_do_filldir(desc, dirent, filldir);
+ status = nfs_do_filldir(desc);
out:
dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
@@ -800,35 +810,36 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
last cookie cache takes care of the common case of reading the
whole directory.
*/
-static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int nfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
nfs_readdir_descriptor_t my_desc,
*desc = &my_desc;
- struct nfs_open_dir_context *dir_ctx = filp->private_data;
+ struct nfs_open_dir_context *dir_ctx = file->private_data;
int res;
dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- (long long)filp->f_pos);
+ (long long)ctx->pos);
nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
/*
- * filp->f_pos points to the dirent entry number.
+ * ctx->pos points to the dirent entry number.
* *desc->dir_cookie has the cookie for the next entry. We have
* to either find the entry with the appropriate number or
* revalidate the cookie.
*/
memset(desc, 0, sizeof(*desc));
- desc->file = filp;
+ desc->file = file;
+ desc->ctx = ctx;
desc->dir_cookie = &dir_ctx->dir_cookie;
desc->decode = NFS_PROTO(inode)->decode_dirent;
- desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0;
+ desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
nfs_block_sillyrename(dentry);
- res = nfs_revalidate_mapping(inode, filp->f_mapping);
+ res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
goto out;
@@ -840,7 +851,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* This means either end of directory */
if (*desc->dir_cookie && desc->eof == 0) {
/* Or that the server has 'lost' a cookie */
- res = uncached_readdir(desc, dirent, filldir);
+ res = uncached_readdir(desc);
if (res == 0)
continue;
}
@@ -857,7 +868,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (res < 0)
break;
- res = nfs_do_filldir(desc, dirent, filldir);
+ res = nfs_do_filldir(desc);
if (res < 0)
break;
} while (!desc->eof);
@@ -1040,6 +1051,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *parent;
struct nfs_fh *fhandle = NULL;
struct nfs_fattr *fattr = NULL;
+ struct nfs4_label *label = NULL;
int error;
if (flags & LOOKUP_RCU)
@@ -1082,7 +1094,11 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (fhandle == NULL || fattr == NULL)
goto out_error;
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+ if (IS_ERR(label))
+ goto out_error;
+
+ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
if (error)
goto out_bad;
if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1090,8 +1106,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if ((error = nfs_refresh_inode(inode, fattr)) != 0)
goto out_bad;
+ nfs_setsecurity(inode, fattr, label);
+
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
+ nfs4_label_free(label);
+
out_set_verifier:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_valid:
@@ -1108,6 +1128,7 @@ out_zap_parent:
out_bad:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
+ nfs4_label_free(label);
nfs_mark_for_revalidate(dir);
if (inode && S_ISDIR(inode->i_mode)) {
/* Purge readdir caches. */
@@ -1128,6 +1149,7 @@ out_zap_parent:
out_error:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
+ nfs4_label_free(label);
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
__func__, dentry->d_parent->d_name.name,
@@ -1256,6 +1278,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
struct inode *inode = NULL;
struct nfs_fh *fhandle = NULL;
struct nfs_fattr *fattr = NULL;
+ struct nfs4_label *label = NULL;
int error;
dfprintk(VFS, "NFS: lookup(%s/%s)\n",
@@ -1282,17 +1305,21 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
if (fhandle == NULL || fattr == NULL)
goto out;
+ label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
+ if (IS_ERR(label))
+ goto out;
+
parent = dentry->d_parent;
/* Protect against concurrent sillydeletes */
nfs_block_sillyrename(parent);
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
if (error == -ENOENT)
goto no_entry;
if (error < 0) {
res = ERR_PTR(error);
goto out_unblock_sillyrename;
}
- inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
res = ERR_CAST(inode);
if (IS_ERR(res))
goto out_unblock_sillyrename;
@@ -1310,6 +1337,7 @@ no_entry:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_unblock_sillyrename:
nfs_unblock_sillyrename(parent);
+ nfs4_label_free(label);
out:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
@@ -1357,18 +1385,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
{
int err;
- if (ctx->dentry != dentry) {
- dput(ctx->dentry);
- ctx->dentry = dget(dentry);
- }
-
- /* If the open_intent is for execute, we have an extra check to make */
- if (ctx->mode & FMODE_EXEC) {
- err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags);
- if (err < 0)
- goto out;
- }
-
err = finish_open(file, dentry, do_open, opened);
if (err)
goto out;
@@ -1427,13 +1443,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
nfs_block_sillyrename(dentry->d_parent);
inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
- d_drop(dentry);
+ nfs_unblock_sillyrename(dentry->d_parent);
if (IS_ERR(inode)) {
- nfs_unblock_sillyrename(dentry->d_parent);
put_nfs_open_context(ctx);
err = PTR_ERR(inode);
switch (err) {
case -ENOENT:
+ d_drop(dentry);
d_add(dentry, NULL);
break;
case -EISDIR:
@@ -1449,16 +1465,8 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
}
goto out;
}
- res = d_add_unique(dentry, inode);
- if (res != NULL)
- dentry = res;
-
- nfs_unblock_sillyrename(dentry->d_parent);
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-
- err = nfs_finish_open(ctx, dentry, file, open_flags, opened);
- dput(res);
+ err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened);
out:
return err;
@@ -1486,6 +1494,8 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
goto no_open;
if (d_mountpoint(dentry))
goto no_open;
+ if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1)
+ goto no_open;
inode = dentry->d_inode;
parent = dget_parent(dentry);
@@ -1526,7 +1536,8 @@ no_open:
* Code common to create, mkdir, and mknod.
*/
int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr,
+ struct nfs4_label *label)
{
struct dentry *parent = dget_parent(dentry);
struct inode *dir = parent->d_inode;
@@ -1539,18 +1550,18 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
if (dentry->d_inode)
goto out;
if (fhandle->size == 0) {
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
if (error)
goto out_error;
}
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
if (!(fattr->valid & NFS_ATTR_FATTR)) {
struct nfs_server *server = NFS_SB(dentry->d_sb);
- error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
+ error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL);
if (error < 0)
goto out_error;
}
- inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
error = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_error;
@@ -1719,7 +1730,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
dir->i_ino, dentry->d_name.name);
spin_lock(&dentry->d_lock);
- if (dentry->d_count > 1) {
+ if (d_count(dentry) > 1) {
spin_unlock(&dentry->d_lock);
/* Start asynchronous writeout of the inode */
write_inode_now(dentry->d_inode, 0);
@@ -1757,7 +1768,6 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
*/
int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
{
- struct pagevec lru_pvec;
struct page *page;
char *kaddr;
struct iattr attr;
@@ -1797,11 +1807,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
* No big deal if we can't add this page to the page cache here.
* READLINK will get the missing page from the server if needed.
*/
- pagevec_init(&lru_pvec, 0);
- if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
+ if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0,
GFP_KERNEL)) {
- pagevec_add(&lru_pvec, page);
- pagevec_lru_add_file(&lru_pvec);
SetPageUptodate(page);
unlock_page(page);
} else
@@ -1868,7 +1875,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
- new_dentry->d_count);
+ d_count(new_dentry));
/*
* For non-directories, check whether the target is busy and if so,
@@ -1886,7 +1893,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
rehash = new_dentry;
}
- if (new_dentry->d_count > 2) {
+ if (d_count(new_dentry) > 2) {
int err;
/* copy the target dentry's name */
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 945527092295..fc0f95ec7358 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -29,7 +29,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
kfree(ip_addr);
return ret;
}
-EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
#else
@@ -351,7 +350,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,
ret = -ESRCH;
return ret;
}
-EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
static struct cache_detail nfs_dns_resolve_template = {
.owner = THIS_MODULE,
@@ -396,6 +394,21 @@ void nfs_dns_resolver_cache_destroy(struct net *net)
cache_destroy_net(nn->nfs_dns_resolve, net);
}
+static int nfs4_dns_net_init(struct net *net)
+{
+ return nfs_dns_resolver_cache_init(net);
+}
+
+static void nfs4_dns_net_exit(struct net *net)
+{
+ nfs_dns_resolver_cache_destroy(net);
+}
+
+static struct pernet_operations nfs4_dns_resolver_ops = {
+ .init = nfs4_dns_net_init,
+ .exit = nfs4_dns_net_exit,
+};
+
static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
void *ptr)
{
@@ -432,11 +445,24 @@ static struct notifier_block nfs_dns_resolver_block = {
int nfs_dns_resolver_init(void)
{
- return rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
+ int err;
+
+ err = register_pernet_subsys(&nfs4_dns_resolver_ops);
+ if (err < 0)
+ goto out;
+ err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
+ if (err < 0)
+ goto out1;
+ return 0;
+out1:
+ unregister_pernet_subsys(&nfs4_dns_resolver_ops);
+out:
+ return err;
}
void nfs_dns_resolver_destroy(void)
{
rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
+ unregister_pernet_subsys(&nfs4_dns_resolver_ops);
}
#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 29f4a48a0ee6..94e94bd11aae 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -451,11 +451,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
* - Called if either PG_private or PG_fscache is set on the page
* - Caller holds page lock
*/
-static void nfs_invalidate_page(struct page *page, unsigned long offset)
+static void nfs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
- dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
+ dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
+ page, offset, length);
- if (offset != 0)
+ if (offset != 0 || length < PAGE_CACHE_SIZE)
return;
/* Cancel any unstarted writes on this page */
nfs_wb_page_cancel(page_file_mapping(page)->host, page);
@@ -493,6 +495,35 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
return nfs_fscache_release_page(page, gfp);
}
+static void nfs_check_dirty_writeback(struct page *page,
+ bool *dirty, bool *writeback)
+{
+ struct nfs_inode *nfsi;
+ struct address_space *mapping = page_file_mapping(page);
+
+ if (!mapping || PageSwapCache(page))
+ return;
+
+ /*
+ * Check if an unstable page is currently being committed and
+ * if so, have the VM treat it as if the page is under writeback
+ * so it will not block due to pages that will shortly be freeable.
+ */
+ nfsi = NFS_I(mapping->host);
+ if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
+ *writeback = true;
+ return;
+ }
+
+ /*
+ * If PagePrivate() is set, then the page is not freeable and as the
+ * inode is not being committed, it's not going to be cleaned in the
+ * near future so treat it as dirty
+ */
+ if (PagePrivate(page))
+ *dirty = true;
+}
+
/*
* Attempt to clear the private state associated with a page when an error
* occurs that requires the cached contents of an inode to be written back or
@@ -540,6 +571,7 @@ const struct address_space_operations nfs_file_aops = {
.direct_IO = nfs_direct_IO,
.migratepage = nfs_migrate_page,
.launder_page = nfs_launder_page,
+ .is_dirty_writeback = nfs_check_dirty_writeback,
.error_remove_page = generic_error_remove_page,
#ifdef CONFIG_NFS_SWAP
.swap_activate = nfs_swap_activate,
@@ -744,6 +776,7 @@ static int
do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
{
struct inode *inode = filp->f_mapping->host;
+ struct nfs_lock_context *l_ctx;
int status;
/*
@@ -752,6 +785,14 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
*/
nfs_sync_mapping(filp->f_mapping);
+ l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
+ if (!IS_ERR(l_ctx)) {
+ status = nfs_iocounter_wait(&l_ctx->io_count);
+ nfs_put_lock_context(l_ctx);
+ if (status < 0)
+ return status;
+ }
+
/* NOTE: special case
* If we're signalled while cleaning up locks on process exit, we
* still need to complete the unlock.
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 44efaa8c5f78..66984a9aafaa 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -95,7 +95,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
goto out;
}
- inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
+ inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL);
if (IS_ERR(inode)) {
dprintk("nfs_get_root: get root inode failed\n");
ret = ERR_CAST(inode);
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index c516da5873fd..c2c4163d5683 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -262,29 +262,42 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
return desclen;
}
-static ssize_t nfs_idmap_request_key(struct key_type *key_type,
- const char *name, size_t namelen,
- const char *type, void *data,
- size_t data_size, struct idmap *idmap)
+static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
+ const char *type, struct idmap *idmap)
{
- const struct cred *saved_cred;
- struct key *rkey;
char *desc;
- struct user_key_payload *payload;
+ struct key *rkey;
ssize_t ret;
ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
if (ret <= 0)
- goto out;
+ return ERR_PTR(ret);
+
+ rkey = request_key(&key_type_id_resolver, desc, "");
+ if (IS_ERR(rkey)) {
+ mutex_lock(&idmap->idmap_mutex);
+ rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
+ desc, "", 0, idmap);
+ mutex_unlock(&idmap->idmap_mutex);
+ }
+
+ kfree(desc);
+ return rkey;
+}
+
+static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
+ const char *type, void *data,
+ size_t data_size, struct idmap *idmap)
+{
+ const struct cred *saved_cred;
+ struct key *rkey;
+ struct user_key_payload *payload;
+ ssize_t ret;
saved_cred = override_creds(id_resolver_cache);
- if (idmap)
- rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap);
- else
- rkey = request_key(&key_type_id_resolver, desc, "");
+ rkey = nfs_idmap_request_key(name, namelen, type, idmap);
revert_creds(saved_cred);
- kfree(desc);
if (IS_ERR(rkey)) {
ret = PTR_ERR(rkey);
goto out;
@@ -316,23 +329,6 @@ out:
return ret;
}
-static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
- const char *type, void *data,
- size_t data_size, struct idmap *idmap)
-{
- ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver,
- name, namelen, type, data,
- data_size, NULL);
- if (ret < 0) {
- mutex_lock(&idmap->idmap_mutex);
- ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
- name, namelen, type, data,
- data_size, idmap);
- mutex_unlock(&idmap->idmap_mutex);
- }
- return ret;
-}
-
/* ID -> Name */
static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
size_t buflen, struct idmap *idmap)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1f941674b089..c93639e6cf68 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,7 +48,6 @@
#include "iostat.h"
#include "internal.h"
#include "fscache.h"
-#include "dns_resolve.h"
#include "pnfs.h"
#include "nfs.h"
#include "netns.h"
@@ -79,7 +78,7 @@ int nfs_wait_bit_killable(void *word)
{
if (fatal_signal_pending(current))
return -ERESTARTSYS;
- freezable_schedule();
+ freezable_schedule_unsafe();
return 0;
}
EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
@@ -162,11 +161,19 @@ static void nfs_zap_caches_locked(struct inode *inode)
memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
nfs_fscache_invalidate(inode);
- } else {
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
- }
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_LABEL
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_PAGECACHE;
+ } else
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_LABEL
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_PAGECACHE;
}
void nfs_zap_caches(struct inode *inode)
@@ -257,12 +264,72 @@ nfs_init_locked(struct inode *inode, void *opaque)
return 0;
}
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
+{
+ int error;
+
+ if (label == NULL)
+ return;
+
+ if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0)
+ return;
+
+ if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2)
+ return;
+
+ if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
+ error = security_inode_notifysecctx(inode, label->label,
+ label->len);
+ if (error)
+ printk(KERN_ERR "%s() %s %d "
+ "security_inode_notifysecctx() %d\n",
+ __func__,
+ (char *)label->label,
+ label->len, error);
+ }
+}
+
+struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
+{
+ struct nfs4_label *label = NULL;
+ int minor_version = server->nfs_client->cl_minorversion;
+
+ if (minor_version < 2)
+ return label;
+
+ if (!(server->caps & NFS_CAP_SECURITY_LABEL))
+ return label;
+
+ label = kzalloc(sizeof(struct nfs4_label), flags);
+ if (label == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ label->label = kzalloc(NFS4_MAXLABELLEN, flags);
+ if (label->label == NULL) {
+ kfree(label);
+ return ERR_PTR(-ENOMEM);
+ }
+ label->len = NFS4_MAXLABELLEN;
+
+ return label;
+}
+EXPORT_SYMBOL_GPL(nfs4_label_alloc);
+#else
+void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
+{
+}
+#endif
+EXPORT_SYMBOL_GPL(nfs_setsecurity);
+
/*
* This is our front-end to iget that looks up inodes by file handle
* instead of inode number.
*/
struct inode *
-nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
+nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs_find_desc desc = {
.fh = fh,
@@ -384,6 +451,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
}
+
+ nfs_setsecurity(inode, fattr, label);
+
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
nfsi->access_cache = RB_ROOT;
@@ -393,6 +463,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
unlock_new_inode(inode);
} else
nfs_refresh_inode(inode, fattr);
+ nfs_setsecurity(inode, fattr, label);
dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",
inode->i_sb->s_id,
(long long)NFS_FILEID(inode),
@@ -449,7 +520,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
NFS_PROTO(inode)->return_delegation(inode);
error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
if (error == 0)
- nfs_refresh_inode(inode, fattr);
+ error = nfs_refresh_inode(inode, fattr);
nfs_free_fattr(fattr);
out:
return error;
@@ -561,20 +632,22 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
l_ctx->lockowner.l_owner = current->files;
l_ctx->lockowner.l_pid = current->tgid;
INIT_LIST_HEAD(&l_ctx->list);
+ nfs_iocounter_init(&l_ctx->io_count);
}
static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
{
- struct nfs_lock_context *pos;
+ struct nfs_lock_context *head = &ctx->lock_context;
+ struct nfs_lock_context *pos = head;
- list_for_each_entry(pos, &ctx->lock_context.list, list) {
+ do {
if (pos->lockowner.l_owner != current->files)
continue;
if (pos->lockowner.l_pid != current->tgid)
continue;
atomic_inc(&pos->count);
return pos;
- }
+ } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head);
return NULL;
}
@@ -711,16 +784,23 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context);
* Ensure that mmap has a recent RPC credential for use when writing out
* shared pages
*/
-void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = ctx->dentry->d_inode;
struct nfs_inode *nfsi = NFS_I(inode);
- filp->private_data = get_nfs_open_context(ctx);
spin_lock(&inode->i_lock);
list_add(&ctx->list, &nfsi->open_files);
spin_unlock(&inode->i_lock);
}
+EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
+
+void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+{
+ filp->private_data = get_nfs_open_context(ctx);
+ if (list_empty(&ctx->list))
+ nfs_inode_attach_open_context(ctx);
+}
EXPORT_SYMBOL_GPL(nfs_file_set_open_context);
/*
@@ -746,10 +826,11 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
static void nfs_file_clear_open_context(struct file *filp)
{
- struct inode *inode = file_inode(filp);
struct nfs_open_context *ctx = nfs_file_open_context(filp);
if (ctx) {
+ struct inode *inode = ctx->dentry->d_inode;
+
filp->private_data = NULL;
spin_lock(&inode->i_lock);
list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -788,6 +869,7 @@ int
__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
{
int status = -ESTALE;
+ struct nfs4_label *label = NULL;
struct nfs_fattr *fattr = NULL;
struct nfs_inode *nfsi = NFS_I(inode);
@@ -805,7 +887,14 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
goto out;
nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
- status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
+
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (IS_ERR(label)) {
+ status = PTR_ERR(label);
+ goto out;
+ }
+
+ status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);
if (status != 0) {
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
inode->i_sb->s_id,
@@ -815,7 +904,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
if (!S_ISDIR(inode->i_mode))
set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
}
- goto out;
+ goto err_out;
}
status = nfs_refresh_inode(inode, fattr);
@@ -823,7 +912,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
inode->i_sb->s_id,
(long long)NFS_FILEID(inode), status);
- goto out;
+ goto err_out;
}
if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
@@ -833,7 +922,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
inode->i_sb->s_id,
(long long)NFS_FILEID(inode));
- out:
+err_out:
+ nfs4_label_free(label);
+out:
nfs_free_fattr(fattr);
return status;
}
@@ -861,7 +952,8 @@ static int nfs_attribute_cache_expired(struct inode *inode)
*/
int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
{
- if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
+ if (!(NFS_I(inode)->cache_validity &
+ (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
&& !nfs_attribute_cache_expired(inode))
return NFS_STALE(inode) ? -ESTALE : 0;
return __nfs_revalidate_inode(server, inode);
@@ -1241,6 +1333,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
spin_lock(&inode->i_lock);
status = nfs_post_op_update_inode_locked(inode, fattr);
spin_unlock(&inode->i_lock);
+
return status;
}
EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
@@ -1481,7 +1574,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_blocks = fattr->du.nfs2.blocks;
/* Update attrtimeo value if we're out of the unstable period */
- if (invalid & NFS_INO_INVALID_ATTR) {
+ if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) {
nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
@@ -1494,6 +1587,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
}
invalid &= ~NFS_INO_INVALID_ATTR;
+ invalid &= ~NFS_INO_INVALID_LABEL;
/* Don't invalidate the data if we were to blame */
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
|| S_ISLNK(inode->i_mode)))
@@ -1636,12 +1730,11 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
static int nfs_net_init(struct net *net)
{
nfs_clients_init(net);
- return nfs_dns_resolver_cache_init(net);
+ return 0;
}
static void nfs_net_exit(struct net *net)
{
- nfs_dns_resolver_cache_destroy(net);
nfs_cleanup_cb_ident_idr(net);
}
@@ -1659,10 +1752,6 @@ static int __init init_nfs_fs(void)
{
int err;
- err = nfs_dns_resolver_init();
- if (err < 0)
- goto out10;;
-
err = register_pernet_subsys(&nfs_net_ops);
if (err < 0)
goto out9;
@@ -1728,8 +1817,6 @@ out7:
out8:
unregister_pernet_subsys(&nfs_net_ops);
out9:
- nfs_dns_resolver_destroy();
-out10:
return err;
}
@@ -1742,7 +1829,6 @@ static void __exit exit_nfs_fs(void)
nfs_destroy_nfspagecache();
nfs_fscache_unregister();
unregister_pernet_subsys(&nfs_net_ops);
- nfs_dns_resolver_destroy();
#ifdef CONFIG_PROC_FS
rpc_proc_unregister(&init_net, "nfs");
#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 541c9ebdbc5a..3c8373f90ab3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -165,7 +165,7 @@ extern void nfs_free_client(struct nfs_client *);
extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
extern struct nfs_client *
nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
- struct nfs4_sessionid *);
+ struct nfs4_sessionid *, u32);
extern struct nfs_server *nfs_create_server(struct nfs_mount_info *,
struct nfs_subversion *);
extern struct nfs_server *nfs4_create_server(
@@ -229,6 +229,13 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr,
void (*release)(struct nfs_pgio_header *hdr));
void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
+int nfs_iocounter_wait(struct nfs_io_counter *c);
+
+static inline void nfs_iocounter_init(struct nfs_io_counter *c)
+{
+ c->flags = 0;
+ atomic_set(&c->io_count, 0);
+}
/* nfs2xdr.c */
extern struct rpc_procinfo nfs_procedures[];
@@ -248,6 +255,7 @@ extern int nfs4_decode_dirent(struct xdr_stream *,
#ifdef CONFIG_NFS_V4_1
extern const u32 nfs41_maxread_overhead;
extern const u32 nfs41_maxwrite_overhead;
+extern const u32 nfs41_maxgetdevinfo_overhead;
#endif
/* nfs4proc.c */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 91a6faf811ac..99a45283b9ee 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -139,7 +139,10 @@ struct mnt_fhstatus {
* nfs_mount - Obtain an NFS file handle for the given host and path
* @info: pointer to mount request arguments
*
- * Uses default timeout parameters specified by underlying transport.
+ * Uses default timeout parameters specified by underlying transport. On
+ * successful return, the auth_flavs list and auth_flav_len will be populated
+ * with the list from the server or a faked-up list if the server didn't
+ * provide one.
*/
int nfs_mount(struct nfs_mount_request *info)
{
@@ -195,6 +198,15 @@ int nfs_mount(struct nfs_mount_request *info)
dprintk("NFS: MNT request succeeded\n");
status = 0;
+ /*
+ * If the server didn't provide a flavor list, allow the
+ * client to try any flavor.
+ */
+ if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) {
+ dprintk("NFS: Faking up auth_flavs list\n");
+ info->auth_flavs[0] = RPC_AUTH_NULL;
+ *info->auth_flav_len = 1;
+ }
out:
return status;
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index fc8dc20fdeb9..348b535cd786 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -280,7 +280,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
struct dentry *parent = dget_parent(dentry);
/* Look it up again to get its attributes */
- err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr);
+ err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL);
dput(parent);
if (err != 0)
return ERR_PTR(err);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 43ea96ced28c..f5c84c3efbca 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -33,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
res = rpc_call_sync(clnt, msg, flags);
if (res != -EJUKEBOX)
break;
- freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
+ freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
res = -ERESTARTSYS;
} while (!fatal_signal_pending(current));
return res;
@@ -98,7 +98,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
*/
static int
nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct rpc_message msg = {
.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR],
@@ -143,7 +143,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
static int
nfs3_proc_lookup(struct inode *dir, struct qstr *name,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
{
struct nfs3_diropargs arg = {
.fh = NFS_FH(dir),
@@ -300,7 +301,7 @@ static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_
status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
nfs_post_op_update_inode(dir, data->res.dir_attr);
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
return status;
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 944c9a5c1039..ee81e354bce7 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -36,6 +36,7 @@ enum nfs4_client_state {
struct nfs4_minor_version_ops {
u32 minor_version;
+ unsigned init_caps;
int (*call_sync)(struct rpc_clnt *clnt,
struct nfs_server *server,
@@ -46,6 +47,8 @@ struct nfs4_minor_version_ops {
const nfs4_stateid *);
int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
struct nfs_fsinfo *);
+ int (*free_lock_state)(struct nfs_server *,
+ struct nfs4_lock_state *);
const struct nfs4_state_recovery_ops *reboot_recovery_ops;
const struct nfs4_state_recovery_ops *nograce_recovery_ops;
const struct nfs4_state_maintenance_ops *state_renewal_ops;
@@ -143,12 +146,14 @@ struct nfs4_lock_state {
enum {
LK_STATE_IN_USE,
NFS_DELEGATED_STATE, /* Current stateid is delegation */
+ NFS_OPEN_STATE, /* OPEN stateid is set */
NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */
NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */
NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */
NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */
NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */
NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */
+ NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */
};
struct nfs4_state {
@@ -189,7 +194,7 @@ struct nfs4_state_recovery_ops {
int (*recover_lock)(struct nfs4_state *, struct file_lock *);
int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
- int (*reclaim_complete)(struct nfs_client *);
+ int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
struct rpc_cred *);
};
@@ -231,8 +236,11 @@ extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struc
extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *,
struct nfs_fh *, struct nfs_fattr *);
extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
-extern int nfs4_release_lockowner(struct nfs4_lock_state *);
extern const struct xattr_handler *nfs4_xattr_handlers[];
+extern int nfs4_set_rw_stateid(nfs4_stateid *stateid,
+ const struct nfs_open_context *ctx,
+ const struct nfs_lock_context *l_ctx,
+ fmode_t fmode);
#if defined(CONFIG_NFS_V4_1)
static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -295,10 +303,10 @@ is_ds_client(struct nfs_client *clp)
extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
extern const u32 nfs4_fattr_bitmap[3];
-extern const u32 nfs4_statfs_bitmap[2];
-extern const u32 nfs4_pathconf_bitmap[2];
+extern const u32 nfs4_statfs_bitmap[3];
+extern const u32 nfs4_pathconf_bitmap[3];
extern const u32 nfs4_fsinfo_bitmap[3];
-extern const u32 nfs4_fs_locations_bitmap[2];
+extern const u32 nfs4_fs_locations_bitmap[3];
void nfs4_free_client(struct nfs_client *);
@@ -347,13 +355,13 @@ extern int nfs4_wait_clnt_recover(struct nfs_client *clp);
extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
extern void nfs4_schedule_state_manager(struct nfs_client *);
extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
-extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
+extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
extern void nfs41_handle_server_scope(struct nfs_client *,
struct nfs41_server_scope **);
extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
-extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
+extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
fmode_t, const struct nfs_lockowner *);
extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
@@ -412,6 +420,11 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei
return memcmp(dst, src, sizeof(*dst)) == 0;
}
+static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
+{
+ return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
+}
+
#else
#define nfs4_close_state(a, b) do { } while (0)
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 66b6664dcd4c..90dce91dd5b5 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -66,6 +66,11 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
if (err)
goto error;
+ if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) {
+ err = -EINVAL;
+ goto error;
+ }
+
spin_lock_init(&clp->cl_lock);
INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -198,8 +203,12 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
/* Check NFS protocol revision and initialize RPC op vector */
clp->rpc_ops = &nfs_v4_clientops;
+ if (clp->cl_minorversion != 0)
+ __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
- error = nfs_create_rpc_client(clp, timeparms, authflavour);
+ error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
+ if (error == -EINVAL)
+ error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
if (error < 0)
goto error;
@@ -558,14 +567,14 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
*/
struct nfs_client *
nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
- struct nfs4_sessionid *sid)
+ struct nfs4_sessionid *sid, u32 minorversion)
{
struct nfs_client *clp;
struct nfs_net *nn = net_generic(net, nfs_net_id);
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
- if (nfs4_cb_match_client(addr, clp, 1) == false)
+ if (nfs4_cb_match_client(addr, clp, minorversion) == false)
continue;
if (!nfs4_has_session(clp))
@@ -588,7 +597,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
struct nfs_client *
nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
- struct nfs4_sessionid *sid)
+ struct nfs4_sessionid *sid, u32 minorversion)
{
return NULL;
}
@@ -622,6 +631,8 @@ static int nfs4_set_client(struct nfs_server *server,
if (server->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+ if (server->options & NFS_OPTION_MIGRATION)
+ set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
@@ -726,10 +737,23 @@ static int nfs4_server_common_setup(struct nfs_server *server,
return -ENOMEM;
/* We must ensure the session is initialised first */
- error = nfs4_init_session(server);
+ error = nfs4_init_session(server->nfs_client);
if (error < 0)
goto out;
+ /* Set the basic capabilities */
+ server->caps |= server->nfs_client->cl_mvops->init_caps;
+ if (server->flags & NFS_MOUNT_NORDIRPLUS)
+ server->caps &= ~NFS_CAP_READDIRPLUS;
+ /*
+ * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
+ * authentication.
+ */
+ if (nfs4_disable_idmapping &&
+ server->client->cl_auth->au_flavor == RPC_AUTH_UNIX)
+ server->caps |= NFS_CAP_UIDGID_NOMAP;
+
+
/* Probe the root fh to retrieve its FSID and filehandle */
error = nfs4_get_rootfh(server, mntfh);
if (error < 0)
@@ -773,9 +797,6 @@ static int nfs4_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = data->flags;
- server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK;
- if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
- server->caps |= NFS_CAP_READDIRPLUS;
server->options = data->options;
/* Get a client record */
@@ -792,13 +813,6 @@ static int nfs4_init_server(struct nfs_server *server,
if (error < 0)
goto error;
- /*
- * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
- * authentication.
- */
- if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX)
- server->caps |= NFS_CAP_UIDGID_NOMAP;
-
if (data->rsize)
server->rsize = nfs_block_size(data->rsize, NULL);
if (data->wsize)
@@ -876,7 +890,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
/* Initialise the client representation from the parent server */
nfs_server_copy_userdata(server, parent_server);
- server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
/* Get a client representation.
* Note: NFSv4 always uses TCP, */
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 13e6bb3e3fe5..e5b804dd944c 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -69,7 +69,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
goto out_drop;
}
}
- iput(inode);
if (inode != dentry->d_inode)
goto out_drop;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 4fb234d3aefb..17ed87ef9de8 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -158,11 +158,14 @@ static int filelayout_async_handle_error(struct rpc_task *task,
case -NFS4ERR_OPENMODE:
if (state == NULL)
break;
- nfs4_schedule_stateid_recovery(mds_server, state);
+ if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
+ goto out_bad_stateid;
goto wait_on_recovery;
case -NFS4ERR_EXPIRED:
- if (state != NULL)
- nfs4_schedule_stateid_recovery(mds_server, state);
+ if (state != NULL) {
+ if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
+ goto out_bad_stateid;
+ }
nfs4_schedule_lease_recovery(mds_client);
goto wait_on_recovery;
/* DS session errors */
@@ -226,6 +229,9 @@ reset:
out:
task->tk_status = 0;
return -EAGAIN;
+out_bad_stateid:
+ task->tk_status = -EIO;
+ return 0;
wait_on_recovery:
rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
@@ -299,6 +305,10 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
{
struct nfs_read_data *rdata = data;
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) {
+ rpc_exit(task, -EIO);
+ return;
+ }
if (filelayout_reset_to_mds(rdata->header->lseg)) {
dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
filelayout_reset_read(rdata);
@@ -307,10 +317,13 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
}
rdata->read_done_cb = filelayout_read_done_cb;
- nfs41_setup_sequence(rdata->ds_clp->cl_session,
+ if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
&rdata->args.seq_args,
&rdata->res.seq_res,
- task);
+ task))
+ return;
+ nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context,
+ rdata->args.lock_context, FMODE_READ);
}
static void filelayout_read_call_done(struct rpc_task *task, void *data)
@@ -401,16 +414,23 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
{
struct nfs_write_data *wdata = data;
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) {
+ rpc_exit(task, -EIO);
+ return;
+ }
if (filelayout_reset_to_mds(wdata->header->lseg)) {
dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
filelayout_reset_write(wdata);
rpc_exit(task, 0);
return;
}
- nfs41_setup_sequence(wdata->ds_clp->cl_session,
+ if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
&wdata->args.seq_args,
&wdata->res.seq_res,
- task);
+ task))
+ return;
+ nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context,
+ wdata->args.lock_context, FMODE_WRITE);
}
static void filelayout_write_call_done(struct rpc_task *task, void *data)
@@ -623,7 +643,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
NFS_SERVER(lo->plh_inode)->nfs_client, id);
if (d == NULL) {
- dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags);
+ dsaddr = filelayout_get_device_info(lo->plh_inode, id,
+ lo->plh_lc_cred, gfp_flags);
if (dsaddr == NULL)
goto out;
} else
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index b8da95548d3d..cebd20e7e923 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -70,6 +70,8 @@ struct nfs4_pnfs_ds {
struct list_head ds_addrs;
struct nfs_client *ds_clp;
atomic_t ds_count;
+ unsigned long ds_state;
+#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
};
struct nfs4_file_layout_dsaddr {
@@ -148,6 +150,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
+filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
+ struct rpc_cred *cred, gfp_t gfp_flags);
#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 1fe284f01f8b..95604f64cab8 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -668,7 +668,10 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
* of available devices, and return it.
*/
struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags)
+filelayout_get_device_info(struct inode *inode,
+ struct nfs4_deviceid *dev_id,
+ struct rpc_cred *cred,
+ gfp_t gfp_flags)
{
struct pnfs_device *pdev = NULL;
u32 max_resp_sz;
@@ -708,8 +711,9 @@ filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gf
pdev->pgbase = 0;
pdev->pglen = max_resp_sz;
pdev->mincount = 0;
+ pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
- rc = nfs4_proc_getdeviceinfo(server, pdev);
+ rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
dprintk("%s getdevice info returns %d\n", __func__, rc);
if (rc)
goto out_free;
@@ -775,6 +779,22 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
return flseg->fh_array[i];
}
+static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
+{
+ might_sleep();
+ wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+}
+
+static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
+{
+ smp_mb__before_clear_bit();
+ clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
+}
+
+
struct nfs4_pnfs_ds *
nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
{
@@ -791,16 +811,22 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
filelayout_mark_devid_invalid(devid);
return NULL;
}
+ if (ds->ds_clp)
+ return ds;
- if (!ds->ds_clp) {
+ if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
int err;
err = nfs4_ds_connect(s, ds);
if (err) {
nfs4_mark_deviceid_unavailable(devid);
- return NULL;
+ ds = NULL;
}
+ nfs4_clear_ds_conn_bit(ds);
+ } else {
+ /* Either ds is connected, or ds is NULL */
+ nfs4_wait_ds_connect(ds);
}
return ds;
}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 0dd766079e1c..cdb0b41a4810 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -134,33 +134,38 @@ static size_t nfs_parse_server_name(char *string, size_t len,
return ret;
}
+/**
+ * nfs_find_best_sec - Find a security mechanism supported locally
+ * @flavors: List of security tuples returned by SECINFO procedure
+ *
+ * Return the pseudoflavor of the first security mechanism in
+ * "flavors" that is locally supported. Return RPC_AUTH_UNIX if
+ * no matching flavor is found in the array. The "flavors" array
+ * is searched in the order returned from the server, per RFC 3530
+ * recommendation.
+ */
rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
{
- struct gss_api_mech *mech;
- struct xdr_netobj oid;
- int i;
- rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
+ rpc_authflavor_t pseudoflavor;
+ struct nfs4_secinfo4 *secinfo;
+ unsigned int i;
for (i = 0; i < flavors->num_flavors; i++) {
- struct nfs4_secinfo_flavor *flavor;
- flavor = &flavors->flavors[i];
-
- if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) {
- pseudoflavor = flavor->flavor;
- break;
- } else if (flavor->flavor == RPC_AUTH_GSS) {
- oid.len = flavor->gss.sec_oid4.len;
- oid.data = flavor->gss.sec_oid4.data;
- mech = gss_mech_get_by_OID(&oid);
- if (!mech)
- continue;
- pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service);
- gss_mech_put(mech);
+ secinfo = &flavors->flavors[i];
+
+ switch (secinfo->flavor) {
+ case RPC_AUTH_NULL:
+ case RPC_AUTH_UNIX:
+ case RPC_AUTH_GSS:
+ pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor,
+ &secinfo->flavor_info);
+ if (pseudoflavor != RPC_AUTH_MAXFLAVOR)
+ return pseudoflavor;
break;
}
}
- return pseudoflavor;
+ return RPC_AUTH_UNIX;
}
static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0ad025eb523b..cf11799297c4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,15 +77,68 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
-static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *);
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
+static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state);
+ struct nfs4_state *state, struct nfs4_label *ilabel,
+ struct nfs4_label *olabel);
#ifdef CONFIG_NFS_V4_1
-static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *);
-static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *);
+static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
+ struct rpc_cred *);
+static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
+ struct rpc_cred *);
#endif
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr, struct nfs4_label *label)
+{
+ int err;
+
+ if (label == NULL)
+ return NULL;
+
+ if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
+ return NULL;
+
+ if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2)
+ return NULL;
+
+ err = security_dentry_init_security(dentry, sattr->ia_mode,
+ &dentry->d_name, (void **)&label->label, &label->len);
+ if (err == 0)
+ return label;
+
+ return NULL;
+}
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{
+ if (label)
+ security_release_secctx(label->label, label->len);
+}
+static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{
+ if (label)
+ return server->attr_bitmask;
+
+ return server->attr_bitmask_nl;
+}
+#else
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr, struct nfs4_label *l)
+{ return NULL; }
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{ return; }
+static inline u32 *
+nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{ return server->attr_bitmask; }
+#endif
+
/* Prevent leaks of NFSv4 errors into userland */
static int nfs4_map_errors(int err)
{
@@ -107,6 +160,8 @@ static int nfs4_map_errors(int err)
return -EPROTONOSUPPORT;
case -NFS4ERR_ACCESS:
return -EACCES;
+ case -NFS4ERR_FILE_OPEN:
+ return -EBUSY;
default:
dprintk("%s could not handle NFSv4 error %d\n",
__func__, -err);
@@ -132,7 +187,10 @@ const u32 nfs4_fattr_bitmap[3] = {
| FATTR4_WORD1_SPACE_USED
| FATTR4_WORD1_TIME_ACCESS
| FATTR4_WORD1_TIME_METADATA
- | FATTR4_WORD1_TIME_MODIFY
+ | FATTR4_WORD1_TIME_MODIFY,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ FATTR4_WORD2_SECURITY_LABEL
+#endif
};
static const u32 nfs4_pnfs_open_bitmap[3] = {
@@ -159,7 +217,7 @@ static const u32 nfs4_open_noattr_bitmap[3] = {
| FATTR4_WORD0_FILEID,
};
-const u32 nfs4_statfs_bitmap[2] = {
+const u32 nfs4_statfs_bitmap[3] = {
FATTR4_WORD0_FILES_AVAIL
| FATTR4_WORD0_FILES_FREE
| FATTR4_WORD0_FILES_TOTAL,
@@ -168,7 +226,7 @@ const u32 nfs4_statfs_bitmap[2] = {
| FATTR4_WORD1_SPACE_TOTAL
};
-const u32 nfs4_pathconf_bitmap[2] = {
+const u32 nfs4_pathconf_bitmap[3] = {
FATTR4_WORD0_MAXLINK
| FATTR4_WORD0_MAXNAME,
0
@@ -183,7 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
FATTR4_WORD2_LAYOUT_BLKSIZE
};
-const u32 nfs4_fs_locations_bitmap[2] = {
+const u32 nfs4_fs_locations_bitmap[3] = {
FATTR4_WORD0_TYPE
| FATTR4_WORD0_CHANGE
| FATTR4_WORD0_SIZE
@@ -199,7 +257,7 @@ const u32 nfs4_fs_locations_bitmap[2] = {
| FATTR4_WORD1_TIME_ACCESS
| FATTR4_WORD1_TIME_METADATA
| FATTR4_WORD1_TIME_MODIFY
- | FATTR4_WORD1_MOUNTED_ON_FILEID
+ | FATTR4_WORD1_MOUNTED_ON_FILEID,
};
static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
@@ -266,7 +324,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
*timeout = NFS4_POLL_RETRY_MIN;
if (*timeout > NFS4_POLL_RETRY_MAX)
*timeout = NFS4_POLL_RETRY_MAX;
- freezable_schedule_timeout_killable(*timeout);
+ freezable_schedule_timeout_killable_unsafe(*timeout);
if (fatal_signal_pending(current))
res = -ERESTARTSYS;
*timeout <<= 1;
@@ -295,19 +353,30 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
}
if (state == NULL)
break;
- nfs4_schedule_stateid_recovery(server, state);
+ ret = nfs4_schedule_stateid_recovery(server, state);
+ if (ret < 0)
+ break;
goto wait_on_recovery;
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
+ if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) {
+ nfs_remove_bad_delegation(inode);
+ exception->retry = 1;
+ break;
+ }
if (state == NULL)
break;
- nfs_remove_bad_delegation(state->inode);
- nfs4_schedule_stateid_recovery(server, state);
+ ret = nfs4_schedule_stateid_recovery(server, state);
+ if (ret < 0)
+ break;
goto wait_on_recovery;
case -NFS4ERR_EXPIRED:
- if (state != NULL)
- nfs4_schedule_stateid_recovery(server, state);
+ if (state != NULL) {
+ ret = nfs4_schedule_stateid_recovery(server, state);
+ if (ret < 0)
+ break;
+ }
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_STALE_CLIENTID:
nfs4_schedule_lease_recovery(clp);
@@ -559,7 +628,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
task->tk_timeout = 0;
spin_lock(&tbl->slot_tbl_lock);
- if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
+ if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) &&
!args->sa_privileged) {
/* The state manager will wait until the slot table is empty */
dprintk("%s session is draining\n", __func__);
@@ -749,6 +818,7 @@ struct nfs4_opendata {
struct nfs4_string owner_name;
struct nfs4_string group_name;
struct nfs_fattr f_attr;
+ struct nfs4_label *f_label;
struct dentry *dir;
struct dentry *dentry;
struct nfs4_state_owner *owner;
@@ -756,14 +826,45 @@ struct nfs4_opendata {
struct iattr attrs;
unsigned long timestamp;
unsigned int rpc_done : 1;
+ unsigned int is_recover : 1;
int rpc_status;
int cancelled;
};
+static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server,
+ int err, struct nfs4_exception *exception)
+{
+ if (err != -EINVAL)
+ return false;
+ if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
+ return false;
+ server->caps &= ~NFS_CAP_ATOMIC_OPEN_V1;
+ exception->retry = 1;
+ return true;
+}
+
+static enum open_claim_type4
+nfs4_map_atomic_open_claim(struct nfs_server *server,
+ enum open_claim_type4 claim)
+{
+ if (server->caps & NFS_CAP_ATOMIC_OPEN_V1)
+ return claim;
+ switch (claim) {
+ default:
+ return claim;
+ case NFS4_OPEN_CLAIM_FH:
+ return NFS4_OPEN_CLAIM_NULL;
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ return NFS4_OPEN_CLAIM_DELEGATE_CUR;
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ return NFS4_OPEN_CLAIM_DELEGATE_PREV;
+ }
+}
static void nfs4_init_opendata_res(struct nfs4_opendata *p)
{
p->o_res.f_attr = &p->f_attr;
+ p->o_res.f_label = p->f_label;
p->o_res.seqid = p->o_arg.seqid;
p->c_res.seqid = p->c_arg.seqid;
p->o_res.server = p->o_arg.server;
@@ -775,6 +876,8 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
struct nfs4_state_owner *sp, fmode_t fmode, int flags,
const struct iattr *attrs,
+ struct nfs4_label *label,
+ enum open_claim_type4 claim,
gfp_t gfp_mask)
{
struct dentry *parent = dget_parent(dentry);
@@ -785,15 +888,19 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
p = kzalloc(sizeof(*p), gfp_mask);
if (p == NULL)
goto err;
+
+ p->f_label = nfs4_label_alloc(server, gfp_mask);
+ if (IS_ERR(p->f_label))
+ goto err_free_p;
+
p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
if (p->o_arg.seqid == NULL)
- goto err_free;
+ goto err_free_label;
nfs_sb_active(dentry->d_sb);
p->dentry = dget(dentry);
p->dir = parent;
p->owner = sp;
atomic_inc(&sp->so_count);
- p->o_arg.fh = NFS_FH(dir);
p->o_arg.open_flags = flags;
p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
/* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
@@ -809,9 +916,22 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
p->o_arg.name = &dentry->d_name;
p->o_arg.server = server;
- p->o_arg.bitmask = server->attr_bitmask;
+ p->o_arg.bitmask = nfs4_bitmask(server, label);
p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
- p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
+ p->o_arg.label = label;
+ p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
+ switch (p->o_arg.claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ p->o_arg.fh = NFS_FH(dir);
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ p->o_arg.fh = NFS_FH(dentry->d_inode);
+ }
if (attrs != NULL && attrs->ia_valid != 0) {
__be32 verf[2];
@@ -829,7 +949,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
nfs4_init_opendata_res(p);
kref_init(&p->kref);
return p;
-err_free:
+
+err_free_label:
+ nfs4_label_free(p->f_label);
+err_free_p:
kfree(p);
err:
dput(parent);
@@ -846,6 +969,9 @@ static void nfs4_opendata_free(struct kref *kref)
if (p->state != NULL)
nfs4_put_open_state(p->state);
nfs4_put_state_owner(p->owner);
+
+ nfs4_label_free(p->f_label);
+
dput(p->dir);
dput(p->dentry);
nfs_sb_deactive(sb);
@@ -924,6 +1050,7 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
nfs4_stateid_copy(&state->stateid, stateid);
nfs4_stateid_copy(&state->open_stateid, stateid);
+ set_bit(NFS_OPEN_STATE, &state->flags);
switch (fmode) {
case FMODE_READ:
set_bit(NFS_O_RDONLY_STATE, &state->flags);
@@ -1022,7 +1149,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
struct nfs4_state *state = opendata->state;
struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs_delegation *delegation;
- int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC);
+ int open_mode = opendata->o_arg.open_flags;
fmode_t fmode = opendata->o_arg.fmode;
nfs4_stateid stateid;
int ret = -EAGAIN;
@@ -1047,9 +1174,11 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
nfs4_stateid_copy(&stateid, &delegation->stateid);
rcu_read_unlock();
nfs_release_seqid(opendata->o_arg.seqid);
- ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
- if (ret != 0)
- goto out;
+ if (!opendata->is_recover) {
+ ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
+ if (ret != 0)
+ goto out;
+ }
ret = -EAGAIN;
/* Try to update the stateid using the delegation */
@@ -1121,6 +1250,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
if (ret)
goto err;
+ nfs_setsecurity(inode, &data->f_attr, data->f_label);
+
if (data->o_res.delegation_type != 0)
nfs4_opendata_check_deleg(data, state);
update_open_stateid(state, &data->o_res.stateid, NULL,
@@ -1147,7 +1278,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
ret = -EAGAIN;
if (!(data->f_attr.valid & NFS_ATTR_FATTR))
goto err;
- inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr);
+ inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);
ret = PTR_ERR(inode);
if (IS_ERR(inode))
goto err;
@@ -1194,11 +1325,13 @@ static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *
return ERR_PTR(-ENOENT);
}
-static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, struct nfs4_state *state)
+static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx,
+ struct nfs4_state *state, enum open_claim_type4 claim)
{
struct nfs4_opendata *opendata;
- opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, NULL, GFP_NOFS);
+ opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
+ NULL, NULL, claim, GFP_NOFS);
if (opendata == NULL)
return ERR_PTR(-ENOMEM);
opendata->state = state;
@@ -1234,6 +1367,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
/* memory barrier prior to reading state->n_* */
clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ clear_bit(NFS_OPEN_STATE, &state->flags);
smp_rmb();
if (state->n_rdwr != 0) {
clear_bit(NFS_O_RDWR_STATE, &state->flags);
@@ -1284,11 +1418,10 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
fmode_t delegation_type = 0;
int status;
- opendata = nfs4_open_recoverdata_alloc(ctx, state);
+ opendata = nfs4_open_recoverdata_alloc(ctx, state,
+ NFS4_OPEN_CLAIM_PREVIOUS);
if (IS_ERR(opendata))
return PTR_ERR(opendata);
- opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS;
- opendata->o_arg.fh = NFS_FH(state->inode);
rcu_read_lock();
delegation = rcu_dereference(NFS_I(state->inode)->delegation);
if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0)
@@ -1307,6 +1440,8 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
int err;
do {
err = _nfs4_do_open_reclaim(ctx, state);
+ if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
+ continue;
if (err != -NFS4ERR_DELAY)
break;
nfs4_handle_exception(server, err, &exception);
@@ -1321,71 +1456,72 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta
ctx = nfs4_state_find_open_context(state);
if (IS_ERR(ctx))
- return PTR_ERR(ctx);
+ return -EAGAIN;
ret = nfs4_do_open_reclaim(ctx, state);
put_nfs_open_context(ctx);
return ret;
}
-static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
+static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err)
{
- struct nfs4_opendata *opendata;
- int ret;
-
- opendata = nfs4_open_recoverdata_alloc(ctx, state);
- if (IS_ERR(opendata))
- return PTR_ERR(opendata);
- opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR;
- nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
- ret = nfs4_open_recover(opendata, state);
- nfs4_opendata_put(opendata);
- return ret;
+ switch (err) {
+ default:
+ printk(KERN_ERR "NFS: %s: unhandled error "
+ "%d.\n", __func__, err);
+ case 0:
+ case -ENOENT:
+ case -ESTALE:
+ break;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_DEADSESSION:
+ set_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
+ return -EAGAIN;
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_STALE_STATEID:
+ set_bit(NFS_DELEGATED_STATE, &state->flags);
+ case -NFS4ERR_EXPIRED:
+ /* Don't recall a delegation if it was lost */
+ nfs4_schedule_lease_recovery(server->nfs_client);
+ return -EAGAIN;
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_OPENMODE:
+ nfs_inode_find_state_and_recover(state->inode,
+ stateid);
+ nfs4_schedule_stateid_recovery(server, state);
+ return 0;
+ case -NFS4ERR_DELAY:
+ case -NFS4ERR_GRACE:
+ set_bit(NFS_DELEGATED_STATE, &state->flags);
+ ssleep(1);
+ return -EAGAIN;
+ case -ENOMEM:
+ case -NFS4ERR_DENIED:
+ /* kill_proc(fl->fl_pid, SIGLOST, 1); */
+ return 0;
+ }
+ return err;
}
int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
{
- struct nfs4_exception exception = { };
struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs4_opendata *opendata;
int err;
- do {
- err = _nfs4_open_delegation_recall(ctx, state, stateid);
- switch (err) {
- case 0:
- case -ENOENT:
- case -ESTALE:
- goto out;
- case -NFS4ERR_BADSESSION:
- case -NFS4ERR_BADSLOT:
- case -NFS4ERR_BAD_HIGH_SLOT:
- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- case -NFS4ERR_DEADSESSION:
- set_bit(NFS_DELEGATED_STATE, &state->flags);
- nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
- err = -EAGAIN;
- goto out;
- case -NFS4ERR_STALE_CLIENTID:
- case -NFS4ERR_STALE_STATEID:
- set_bit(NFS_DELEGATED_STATE, &state->flags);
- case -NFS4ERR_EXPIRED:
- /* Don't recall a delegation if it was lost */
- nfs4_schedule_lease_recovery(server->nfs_client);
- err = -EAGAIN;
- goto out;
- case -NFS4ERR_DELEG_REVOKED:
- case -NFS4ERR_ADMIN_REVOKED:
- case -NFS4ERR_BAD_STATEID:
- nfs_inode_find_state_and_recover(state->inode,
- stateid);
- nfs4_schedule_stateid_recovery(server, state);
- case -ENOMEM:
- err = 0;
- goto out;
- }
- set_bit(NFS_DELEGATED_STATE, &state->flags);
- err = nfs4_handle_exception(server, err, &exception);
- } while (exception.retry);
-out:
- return err;
+
+ opendata = nfs4_open_recoverdata_alloc(ctx, state,
+ NFS4_OPEN_CLAIM_DELEG_CUR_FH);
+ if (IS_ERR(opendata))
+ return PTR_ERR(opendata);
+ nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
+ err = nfs4_open_recover(opendata, state);
+ nfs4_opendata_put(opendata);
+ return nfs4_handle_delegation_recall_error(server, state, stateid, err);
}
static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
@@ -1468,6 +1604,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
{
struct nfs4_opendata *data = calldata;
struct nfs4_state_owner *sp = data->owner;
+ struct nfs_client *clp = sp->so_server->nfs_client;
if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
goto out_wait;
@@ -1483,15 +1620,20 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
rcu_read_lock();
delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR &&
+ data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH &&
can_open_delegated(delegation, data->o_arg.fmode))
goto unlock_no_action;
rcu_read_unlock();
}
/* Update client id. */
- data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
- if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
- task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
+ data->o_arg.clientid = clp->cl_clientid;
+ switch (data->o_arg.claim) {
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0];
+ case NFS4_OPEN_CLAIM_FH:
+ task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
}
data->timestamp = jiffies;
@@ -1500,6 +1642,16 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
&data->o_res.seq_res,
task) != 0)
nfs_release_seqid(data->o_arg.seqid);
+
+ /* Set the create mode (note dependency on the session type) */
+ data->o_arg.createmode = NFS4_CREATE_UNCHECKED;
+ if (data->o_arg.open_flags & O_EXCL) {
+ data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE;
+ if (nfs4_has_persistent_session(clp))
+ data->o_arg.createmode = NFS4_CREATE_GUARDED;
+ else if (clp->cl_mvops->minor_version > 0)
+ data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1;
+ }
return;
unlock_no_action:
rcu_read_unlock();
@@ -1595,8 +1747,11 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
data->rpc_done = 0;
data->rpc_status = 0;
data->cancelled = 0;
- if (isrecover)
+ data->is_recover = 0;
+ if (isrecover) {
nfs4_set_sequence_privileged(&o_arg->seq_args);
+ data->is_recover = 1;
+ }
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -1702,7 +1857,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
return status;
}
if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
- _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr);
+ _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
return 0;
}
@@ -1721,7 +1876,8 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s
struct nfs4_opendata *opendata;
int ret;
- opendata = nfs4_open_recoverdata_alloc(ctx, state);
+ opendata = nfs4_open_recoverdata_alloc(ctx, state,
+ NFS4_OPEN_CLAIM_FH);
if (IS_ERR(opendata))
return PTR_ERR(opendata);
ret = nfs4_open_recover(opendata, state);
@@ -1739,6 +1895,8 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
do {
err = _nfs4_open_expired(ctx, state);
+ if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
+ continue;
switch (err) {
default:
goto out;
@@ -1759,7 +1917,7 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
ctx = nfs4_state_find_open_context(state);
if (IS_ERR(ctx))
- return PTR_ERR(ctx);
+ return -EAGAIN;
ret = nfs4_do_open_expired(ctx, state);
put_nfs_open_context(ctx);
return ret;
@@ -1770,18 +1928,30 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
nfs4_stateid *stateid = &state->stateid;
- int status;
+ struct nfs_delegation *delegation;
+ struct rpc_cred *cred = NULL;
+ int status = -NFS4ERR_BAD_STATEID;
/* If a state reset has been done, test_stateid is unneeded */
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
return;
- status = nfs41_test_stateid(server, stateid);
+ /* Get the delegation credential for use by test/free_stateid */
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+ if (delegation != NULL &&
+ nfs4_stateid_match(&delegation->stateid, stateid)) {
+ cred = get_rpccred(delegation->cred);
+ rcu_read_unlock();
+ status = nfs41_test_stateid(server, stateid, cred);
+ } else
+ rcu_read_unlock();
+
if (status != NFS_OK) {
/* Free the stateid unless the server explicitly
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server, stateid);
+ nfs41_free_stateid(server, stateid, cred);
nfs_remove_bad_delegation(state->inode);
write_seqlock(&state->seqlock);
@@ -1789,6 +1959,9 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
write_sequnlock(&state->seqlock);
clear_bit(NFS_DELEGATED_STATE, &state->flags);
}
+
+ if (cred != NULL)
+ put_rpccred(cred);
}
/**
@@ -1803,6 +1976,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
nfs4_stateid *stateid = &state->open_stateid;
+ struct rpc_cred *cred = state->owner->so_cred;
int status;
/* If a state reset has been done, test_stateid is unneeded */
@@ -1811,16 +1985,17 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
(test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
return -NFS4ERR_BAD_STATEID;
- status = nfs41_test_stateid(server, stateid);
+ status = nfs41_test_stateid(server, stateid, cred);
if (status != NFS_OK) {
/* Free the stateid unless the server explicitly
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server, stateid);
+ nfs41_free_stateid(server, stateid, cred);
clear_bit(NFS_O_RDONLY_STATE, &state->flags);
clear_bit(NFS_O_WRONLY_STATE, &state->flags);
clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ clear_bit(NFS_OPEN_STATE, &state->flags);
}
return status;
}
@@ -1856,10 +2031,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
fmode_t fmode,
int flags,
- struct nfs4_state **res)
+ struct nfs_open_context *ctx)
{
struct nfs4_state_owner *sp = opendata->owner;
struct nfs_server *server = sp->so_server;
+ struct dentry *dentry;
struct nfs4_state *state;
unsigned int seq;
int ret;
@@ -1877,15 +2053,31 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
if (server->caps & NFS_CAP_POSIX_LOCK)
set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+ dentry = opendata->dentry;
+ if (dentry->d_inode == NULL) {
+ /* FIXME: Is this d_drop() ever needed? */
+ d_drop(dentry);
+ dentry = d_add_unique(dentry, igrab(state->inode));
+ if (dentry == NULL) {
+ dentry = opendata->dentry;
+ } else if (dentry != ctx->dentry) {
+ dput(ctx->dentry);
+ ctx->dentry = dget(dentry);
+ }
+ nfs_set_verifier(dentry,
+ nfs_save_change_attribute(opendata->dir->d_inode));
+ }
+
ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags);
if (ret != 0)
goto out;
- if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) {
- nfs4_schedule_stateid_recovery(server, state);
- nfs4_wait_clnt_recover(server->nfs_client);
+ ctx->state = state;
+ if (dentry->d_inode == state->inode) {
+ nfs_inode_attach_open_context(ctx);
+ if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+ nfs4_schedule_stateid_recovery(server, state);
}
- *res = state;
out:
return ret;
}
@@ -1894,18 +2086,21 @@ out:
* Returns a referenced nfs4_state
*/
static int _nfs4_do_open(struct inode *dir,
- struct dentry *dentry,
- fmode_t fmode,
+ struct nfs_open_context *ctx,
int flags,
struct iattr *sattr,
- struct rpc_cred *cred,
- struct nfs4_state **res,
- struct nfs4_threshold **ctx_th)
+ struct nfs4_label *label)
{
struct nfs4_state_owner *sp;
struct nfs4_state *state = NULL;
struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_opendata *opendata;
+ struct dentry *dentry = ctx->dentry;
+ struct rpc_cred *cred = ctx->cred;
+ struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
+ fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
+ enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
+ struct nfs4_label *olabel = NULL;
int status;
/* Protect against reboot recovery conflicts */
@@ -1921,33 +2116,48 @@ static int _nfs4_do_open(struct inode *dir,
if (dentry->d_inode != NULL)
nfs4_return_incompatible_delegation(dentry->d_inode, fmode);
status = -ENOMEM;
- opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, GFP_KERNEL);
+ if (dentry->d_inode)
+ claim = NFS4_OPEN_CLAIM_FH;
+ opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr,
+ label, claim, GFP_KERNEL);
if (opendata == NULL)
goto err_put_state_owner;
+ if (label) {
+ olabel = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(olabel)) {
+ status = PTR_ERR(olabel);
+ goto err_opendata_put;
+ }
+ }
+
if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
if (!opendata->f_attr.mdsthreshold)
- goto err_opendata_put;
+ goto err_free_label;
opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
}
if (dentry->d_inode != NULL)
opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
- status = _nfs4_open_and_get_state(opendata, fmode, flags, &state);
+ status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx);
if (status != 0)
- goto err_opendata_put;
+ goto err_free_label;
+ state = ctx->state;
- if (opendata->o_arg.open_flags & O_EXCL) {
+ if ((opendata->o_arg.open_flags & O_EXCL) &&
+ (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
nfs4_exclusive_attrset(opendata, sattr);
nfs_fattr_init(opendata->o_res.f_attr);
status = nfs4_do_setattr(state->inode, cred,
opendata->o_res.f_attr, sattr,
- state);
- if (status == 0)
+ state, label, olabel);
+ if (status == 0) {
nfs_setattr_update_inode(state->inode, sattr);
- nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+ nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+ nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+ }
}
if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
@@ -1956,37 +2166,37 @@ static int _nfs4_do_open(struct inode *dir,
kfree(opendata->f_attr.mdsthreshold);
opendata->f_attr.mdsthreshold = NULL;
+ nfs4_label_free(olabel);
+
nfs4_opendata_put(opendata);
nfs4_put_state_owner(sp);
- *res = state;
return 0;
+err_free_label:
+ nfs4_label_free(olabel);
err_opendata_put:
kfree(opendata->f_attr.mdsthreshold);
nfs4_opendata_put(opendata);
err_put_state_owner:
nfs4_put_state_owner(sp);
out_err:
- *res = NULL;
return status;
}
static struct nfs4_state *nfs4_do_open(struct inode *dir,
- struct dentry *dentry,
- fmode_t fmode,
+ struct nfs_open_context *ctx,
int flags,
struct iattr *sattr,
- struct rpc_cred *cred,
- struct nfs4_threshold **ctx_th)
+ struct nfs4_label *label)
{
+ struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_exception exception = { };
struct nfs4_state *res;
int status;
- fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC;
do {
- status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred,
- &res, ctx_th);
+ status = _nfs4_do_open(dir, ctx, flags, sattr, label);
+ res = ctx->state;
if (status == 0)
break;
/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -2022,7 +2232,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
exception.retry = 1;
continue;
}
- res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir),
+ if (nfs4_clear_cap_atomic_open_v1(server, status, &exception))
+ continue;
+ res = ERR_PTR(nfs4_handle_exception(server,
status, &exception));
} while (exception.retry);
return res;
@@ -2030,7 +2242,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state)
+ struct nfs4_state *state, struct nfs4_label *ilabel,
+ struct nfs4_label *olabel)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_setattrargs arg = {
@@ -2038,9 +2251,11 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
.iap = sattr,
.server = server,
.bitmask = server->attr_bitmask,
+ .label = ilabel,
};
struct nfs_setattrres res = {
.fattr = fattr,
+ .label = olabel,
.server = server,
};
struct rpc_message msg = {
@@ -2050,20 +2265,29 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
.rpc_cred = cred,
};
unsigned long timestamp = jiffies;
+ fmode_t fmode;
+ bool truncate;
int status;
+ arg.bitmask = nfs4_bitmask(server, ilabel);
+ if (ilabel)
+ arg.bitmask = nfs4_bitmask(server, olabel);
+
nfs_fattr_init(fattr);
- if (state != NULL) {
+ /* Servers should only apply open mode checks for file size changes */
+ truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false;
+ fmode = truncate ? FMODE_WRITE : FMODE_READ;
+
+ if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) {
+ /* Use that stateid */
+ } else if (truncate && state != NULL && nfs4_valid_open_stateid(state)) {
struct nfs_lockowner lockowner = {
.l_owner = current->files,
.l_pid = current->tgid,
};
nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
&lockowner);
- } else if (nfs4_copy_delegation_stateid(&arg.stateid, inode,
- FMODE_WRITE)) {
- /* Use that stateid */
} else
nfs4_stateid_copy(&arg.stateid, &zero_stateid);
@@ -2075,7 +2299,8 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state)
+ struct nfs4_state *state, struct nfs4_label *ilabel,
+ struct nfs4_label *olabel)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_exception exception = {
@@ -2084,9 +2309,16 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
};
int err;
do {
- err = _nfs4_do_setattr(inode, cred, fattr, sattr, state);
+ err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
switch (err) {
case -NFS4ERR_OPENMODE:
+ if (!(sattr->ia_valid & ATTR_SIZE)) {
+ pr_warn_once("NFSv4: server %s is incorrectly "
+ "applying open mode checks to "
+ "a SETATTR that is not "
+ "changing file size.\n",
+ server->nfs_client->cl_hostname);
+ }
if (state && !(state->state & FMODE_WRITE)) {
err = -EBADF;
if (sattr->ia_valid & ATTR_OPEN)
@@ -2130,11 +2362,19 @@ static void nfs4_close_clear_stateid_flags(struct nfs4_state *state,
fmode_t fmode)
{
spin_lock(&state->owner->so_lock);
- if (!(fmode & FMODE_READ))
+ clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ switch (fmode & (FMODE_READ|FMODE_WRITE)) {
+ case FMODE_WRITE:
clear_bit(NFS_O_RDONLY_STATE, &state->flags);
- if (!(fmode & FMODE_WRITE))
+ break;
+ case FMODE_READ:
clear_bit(NFS_O_WRONLY_STATE, &state->flags);
- clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ break;
+ case 0:
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ clear_bit(NFS_OPEN_STATE, &state->flags);
+ }
spin_unlock(&state->owner->so_lock);
}
@@ -2202,6 +2442,8 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
calldata->arg.fmode &= ~FMODE_WRITE;
}
}
+ if (!nfs4_valid_open_stateid(state))
+ call_close = 0;
spin_unlock(&state->owner->so_lock);
if (!call_close) {
@@ -2212,8 +2454,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
if (calldata->arg.fmode == 0) {
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
if (calldata->roc &&
- pnfs_roc_drain(inode, &calldata->roc_barrier, task))
+ pnfs_roc_drain(inode, &calldata->roc_barrier, task)) {
+ nfs_release_seqid(calldata->arg.seqid);
goto out_wait;
+ }
}
nfs_fattr_init(calldata->res.fattr);
@@ -2310,14 +2554,18 @@ static struct inode *
nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
{
struct nfs4_state *state;
+ struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
+
+ label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
/* Protect against concurrent sillydeletes */
- state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr,
- ctx->cred, &ctx->mdsthreshold);
+ state = nfs4_do_open(dir, ctx, open_flags, attr, label);
+
+ nfs4_label_release_security(label);
+
if (IS_ERR(state))
return ERR_CAST(state);
- ctx->state = state;
- return igrab(state->inode);
+ return state->inode;
}
static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2373,7 +2621,17 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->caps |= NFS_CAP_CTIME;
if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
server->caps |= NFS_CAP_MTIME;
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
+ server->caps |= NFS_CAP_SECURITY_LABEL;
+#endif
+ memcpy(server->attr_bitmask_nl, res.attr_bitmask,
+ sizeof(server->attr_bitmask));
+ if (server->caps & NFS_CAP_SECURITY_LABEL) {
+ server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ }
memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
@@ -2399,8 +2657,9 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs_fsinfo *info)
{
+ u32 bitmask[3];
struct nfs4_lookup_root_arg args = {
- .bitmask = nfs4_fattr_bitmap,
+ .bitmask = bitmask,
};
struct nfs4_lookup_res res = {
.server = server,
@@ -2413,6 +2672,13 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_resp = &res,
};
+ bitmask[0] = nfs4_fattr_bitmap[0];
+ bitmask[1] = nfs4_fattr_bitmap[1];
+ /*
+ * Process the label in the upcoming getfattr
+ */
+ bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL;
+
nfs_fattr_init(info->fattr);
return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
}
@@ -2444,7 +2710,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl
auth = rpcauth_create(flavor, server->client);
if (IS_ERR(auth)) {
- ret = -EIO;
+ ret = -EACCES;
goto out;
}
ret = nfs4_lookup_root(server, fhandle, info);
@@ -2452,27 +2718,36 @@ out:
return ret;
}
+/*
+ * Retry pseudoroot lookup with various security flavors. We do this when:
+ *
+ * NFSv4.0: the PUTROOTFH operation returns NFS4ERR_WRONGSEC
+ * NFSv4.1: the server does not support the SECINFO_NO_NAME operation
+ *
+ * Returns zero on success, or a negative NFS4ERR value, or a
+ * negative errno value.
+ */
static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs_fsinfo *info)
{
- int i, len, status = 0;
- rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS];
-
- len = rpcauth_list_flavors(flav_array, ARRAY_SIZE(flav_array));
- if (len < 0)
- return len;
-
- for (i = 0; i < len; i++) {
- /* AUTH_UNIX is the default flavor if none was specified,
- * thus has already been tried. */
- if (flav_array[i] == RPC_AUTH_UNIX)
- continue;
+ /* Per 3530bis 15.33.5 */
+ static const rpc_authflavor_t flav_array[] = {
+ RPC_AUTH_GSS_KRB5P,
+ RPC_AUTH_GSS_KRB5I,
+ RPC_AUTH_GSS_KRB5,
+ RPC_AUTH_UNIX, /* courtesy */
+ RPC_AUTH_NULL,
+ };
+ int status = -EPERM;
+ size_t i;
+ for (i = 0; i < ARRAY_SIZE(flav_array); i++) {
status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]);
if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
continue;
break;
}
+
/*
* -EACCESS could mean that the user doesn't have correct permissions
* to access the mount. It could also mean that we tried to mount
@@ -2485,24 +2760,36 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
return status;
}
-/*
- * get the file handle for the "/" directory on the server
+static int nfs4_do_find_root_sec(struct nfs_server *server,
+ struct nfs_fh *fhandle, struct nfs_fsinfo *info)
+{
+ int mv = server->nfs_client->cl_minorversion;
+ return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info);
+}
+
+/**
+ * nfs4_proc_get_rootfh - get file handle for server's pseudoroot
+ * @server: initialized nfs_server handle
+ * @fhandle: we fill in the pseudo-fs root file handle
+ * @info: we fill in an FSINFO struct
+ *
+ * Returns zero on success, or a negative errno.
*/
int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs_fsinfo *info)
{
- int minor_version = server->nfs_client->cl_minorversion;
- int status = nfs4_lookup_root(server, fhandle, info);
- if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR))
- /*
- * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM
- * by nfs4_map_errors() as this function exits.
- */
- status = nfs_v4_minor_ops[minor_version]->find_root_sec(server, fhandle, info);
+ int status;
+
+ status = nfs4_lookup_root(server, fhandle, info);
+ if ((status == -NFS4ERR_WRONGSEC) &&
+ !(server->flags & NFS_MOUNT_SECFLAVOUR))
+ status = nfs4_do_find_root_sec(server, fhandle, info);
+
if (status == 0)
status = nfs4_server_capabilities(server, fhandle);
if (status == 0)
status = nfs4_do_fsinfo(server, fhandle, info);
+
return nfs4_map_errors(status);
}
@@ -2511,6 +2798,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
{
int error;
struct nfs_fattr *fattr = info->fattr;
+ struct nfs4_label *label = NULL;
error = nfs4_server_capabilities(server, mntfh);
if (error < 0) {
@@ -2518,16 +2806,23 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
return error;
}
- error = nfs4_proc_getattr(server, mntfh, fattr);
+ label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(label))
+ return PTR_ERR(label);
+
+ error = nfs4_proc_getattr(server, mntfh, fattr, label);
if (error < 0) {
dprintk("nfs4_get_root: getattr error = %d\n", -error);
- return error;
+ goto err_free_label;
}
if (fattr->valid & NFS_ATTR_FATTR_FSID &&
!nfs_fsid_equal(&server->fsid, &fattr->fsid))
memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
+err_free_label:
+ nfs4_label_free(label);
+
return error;
}
@@ -2574,7 +2869,8 @@ out:
return status;
}
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs4_getattr_arg args = {
.fh = fhandle,
@@ -2582,6 +2878,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
};
struct nfs4_getattr_res res = {
.fattr = fattr,
+ .label = label,
.server = server,
};
struct rpc_message msg = {
@@ -2589,18 +2886,21 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_argp = &args,
.rpc_resp = &res,
};
-
+
+ args.bitmask = nfs4_bitmask(server, label);
+
nfs_fattr_init(fattr);
return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
}
-static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs4_exception exception = { };
int err;
do {
err = nfs4_handle_exception(server,
- _nfs4_proc_getattr(server, fhandle, fattr),
+ _nfs4_proc_getattr(server, fhandle, fattr, label),
&exception);
} while (exception.retry);
return err;
@@ -2630,6 +2930,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct inode *inode = dentry->d_inode;
struct rpc_cred *cred = NULL;
struct nfs4_state *state = NULL;
+ struct nfs4_label *label = NULL;
int status;
if (pnfs_ld_layoutret_on_setattr(inode))
@@ -2656,15 +2957,22 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
}
}
- status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
- if (status == 0)
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (IS_ERR(label))
+ return PTR_ERR(label);
+
+ status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
+ if (status == 0) {
nfs_setattr_update_inode(inode, sattr);
+ nfs_setsecurity(inode, fattr, label);
+ }
+ nfs4_label_free(label);
return status;
}
static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
const struct qstr *name, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs_server *server = NFS_SERVER(dir);
int status;
@@ -2676,6 +2984,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
struct nfs4_lookup_res res = {
.server = server,
.fattr = fattr,
+ .label = label,
.fh = fhandle,
};
struct rpc_message msg = {
@@ -2684,6 +2993,8 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
.rpc_resp = &res,
};
+ args.bitmask = nfs4_bitmask(server, label);
+
nfs_fattr_init(fattr);
dprintk("NFS call lookup %s\n", name->name);
@@ -2702,13 +3013,13 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
struct qstr *name, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs4_exception exception = { };
struct rpc_clnt *client = *clnt;
int err;
do {
- err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr);
+ err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label);
switch (err) {
case -NFS4ERR_BADNAME:
err = -ENOENT;
@@ -2742,12 +3053,13 @@ out:
}
static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
{
int status;
struct rpc_clnt *client = NFS_CLIENT(dir);
- status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
+ status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label);
if (client != NFS_CLIENT(dir)) {
rpc_shutdown_client(client);
nfs_fixup_secinfo_attributes(fattr);
@@ -2762,7 +3074,7 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name,
int status;
struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir));
- status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
+ status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL);
if (status < 0) {
rpc_shutdown_client(client);
return ERR_PTR(status);
@@ -2787,7 +3099,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
.rpc_cred = entry->cred,
};
int mode = entry->mask;
- int status;
+ int status = 0;
/*
* Determine which access bits we want to ask for...
@@ -2892,6 +3204,7 @@ static int
nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
int flags)
{
+ struct nfs4_label l, *ilabel = NULL;
struct nfs_open_context *ctx;
struct nfs4_state *state;
int status = 0;
@@ -2900,19 +3213,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
if (IS_ERR(ctx))
return PTR_ERR(ctx);
+ ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
+
sattr->ia_mode &= ~current_umask();
- state = nfs4_do_open(dir, dentry, ctx->mode,
- flags, sattr, ctx->cred,
- &ctx->mdsthreshold);
- d_drop(dentry);
+ state = nfs4_do_open(dir, ctx, flags, sattr, ilabel);
if (IS_ERR(state)) {
status = PTR_ERR(state);
goto out;
}
- d_add(dentry, igrab(state->inode));
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- ctx->state = state;
out:
+ nfs4_label_release_security(ilabel);
put_nfs_open_context(ctx);
return status;
}
@@ -2961,6 +3271,8 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
res->server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
+
+ nfs_fattr_init(res->dir_attr);
}
static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
@@ -3036,7 +3348,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
.rpc_resp = &res,
};
int status = -ENOMEM;
-
+
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (!status) {
update_changeattr(old_dir, &res.old_cinfo);
@@ -3070,6 +3382,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
};
struct nfs4_link_res res = {
.server = server,
+ .label = NULL,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
@@ -3082,11 +3395,24 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
if (res.fattr == NULL)
goto out;
+ res.label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(res.label)) {
+ status = PTR_ERR(res.label);
+ goto out;
+ }
+ arg.bitmask = nfs4_bitmask(server, res.label);
+
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (!status) {
update_changeattr(dir, &res.cinfo);
- nfs_post_op_update_inode(inode, res.fattr);
+ status = nfs_post_op_update_inode(inode, res.fattr);
+ if (!status)
+ nfs_setsecurity(inode, res.fattr, res.label);
}
+
+
+ nfs4_label_free(res.label);
+
out:
nfs_free_fattr(res.fattr);
return status;
@@ -3110,6 +3436,7 @@ struct nfs4_createdata {
struct nfs4_create_res res;
struct nfs_fh fh;
struct nfs_fattr fattr;
+ struct nfs4_label *label;
};
static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
@@ -3121,6 +3448,10 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
if (data != NULL) {
struct nfs_server *server = NFS_SERVER(dir);
+ data->label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(data->label))
+ goto out_free;
+
data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
data->msg.rpc_argp = &data->arg;
data->msg.rpc_resp = &data->res;
@@ -3129,13 +3460,17 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
data->arg.name = name;
data->arg.attrs = sattr;
data->arg.ftype = ftype;
- data->arg.bitmask = server->attr_bitmask;
+ data->arg.bitmask = nfs4_bitmask(server, data->label);
data->res.server = server;
data->res.fh = &data->fh;
data->res.fattr = &data->fattr;
+ data->res.label = data->label;
nfs_fattr_init(data->res.fattr);
}
return data;
+out_free:
+ kfree(data);
+ return NULL;
}
static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
@@ -3144,18 +3479,20 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
&data->arg.seq_args, &data->res.seq_res, 1);
if (status == 0) {
update_changeattr(dir, &data->res.dir_cinfo);
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
}
return status;
}
static void nfs4_free_createdata(struct nfs4_createdata *data)
{
+ nfs4_label_free(data->label);
kfree(data);
}
static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
- struct page *page, unsigned int len, struct iattr *sattr)
+ struct page *page, unsigned int len, struct iattr *sattr,
+ struct nfs4_label *label)
{
struct nfs4_createdata *data;
int status = -ENAMETOOLONG;
@@ -3171,6 +3508,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
data->arg.u.symlink.pages = &page;
data->arg.u.symlink.len = len;
+ data->arg.label = label;
status = nfs4_do_create(dir, dentry, data);
@@ -3183,18 +3521,24 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
struct page *page, unsigned int len, struct iattr *sattr)
{
struct nfs4_exception exception = { };
+ struct nfs4_label l, *label = NULL;
int err;
+
+ label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
do {
err = nfs4_handle_exception(NFS_SERVER(dir),
_nfs4_proc_symlink(dir, dentry, page,
- len, sattr),
+ len, sattr, label),
&exception);
} while (exception.retry);
+
+ nfs4_label_release_security(label);
return err;
}
static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
- struct iattr *sattr)
+ struct iattr *sattr, struct nfs4_label *label)
{
struct nfs4_createdata *data;
int status = -ENOMEM;
@@ -3203,6 +3547,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
if (data == NULL)
goto out;
+ data->arg.label = label;
status = nfs4_do_create(dir, dentry, data);
nfs4_free_createdata(data);
@@ -3214,14 +3559,19 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
struct iattr *sattr)
{
struct nfs4_exception exception = { };
+ struct nfs4_label l, *label = NULL;
int err;
+ label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
sattr->ia_mode &= ~current_umask();
do {
err = nfs4_handle_exception(NFS_SERVER(dir),
- _nfs4_proc_mkdir(dir, dentry, sattr),
+ _nfs4_proc_mkdir(dir, dentry, sattr, label),
&exception);
} while (exception.retry);
+ nfs4_label_release_security(label);
+
return err;
}
@@ -3279,7 +3629,7 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
}
static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
- struct iattr *sattr, dev_t rdev)
+ struct iattr *sattr, struct nfs4_label *label, dev_t rdev)
{
struct nfs4_createdata *data;
int mode = sattr->ia_mode;
@@ -3304,7 +3654,8 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
status = -EINVAL;
goto out_free;
}
-
+
+ data->arg.label = label;
status = nfs4_do_create(dir, dentry, data);
out_free:
nfs4_free_createdata(data);
@@ -3316,14 +3667,20 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
struct iattr *sattr, dev_t rdev)
{
struct nfs4_exception exception = { };
+ struct nfs4_label l, *label = NULL;
int err;
+ label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
sattr->ia_mode &= ~current_umask();
do {
err = nfs4_handle_exception(NFS_SERVER(dir),
- _nfs4_proc_mknod(dir, dentry, sattr, rdev),
+ _nfs4_proc_mknod(dir, dentry, sattr, label, rdev),
&exception);
} while (exception.retry);
+
+ nfs4_label_release_security(label);
+
return err;
}
@@ -3381,12 +3738,21 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
{
struct nfs4_exception exception = { };
+ unsigned long now = jiffies;
int err;
do {
- err = nfs4_handle_exception(server,
- _nfs4_do_fsinfo(server, fhandle, fsinfo),
- &exception);
+ err = _nfs4_do_fsinfo(server, fhandle, fsinfo);
+ if (err == 0) {
+ struct nfs_client *clp = server->nfs_client;
+
+ spin_lock(&clp->cl_lock);
+ clp->cl_lease_time = fsinfo->lease_time * HZ;
+ clp->cl_last_renewal = now;
+ spin_unlock(&clp->cl_lock);
+ break;
+ }
+ err = nfs4_handle_exception(server, err, &exception);
} while (exception.retry);
return err;
}
@@ -3446,6 +3812,46 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return err;
}
+int nfs4_set_rw_stateid(nfs4_stateid *stateid,
+ const struct nfs_open_context *ctx,
+ const struct nfs_lock_context *l_ctx,
+ fmode_t fmode)
+{ /* Hands stateid, ctx->state and fmode to nfs4_select_rw_stateid() and returns that call's result. */
+ const struct nfs_lockowner *lockowner = NULL;
+
+ if (l_ctx != NULL) /* the lock context is optional; without one the lock owner passed on stays NULL */
+ lockowner = &l_ctx->lockowner;
+ return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner);
+}
+EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid);
+
+static bool nfs4_stateid_is_current(nfs4_stateid *stateid,
+ const struct nfs_open_context *ctx,
+ const struct nfs_lock_context *l_ctx,
+ fmode_t fmode)
+{ /* Compares *stateid with a stateid freshly obtained from nfs4_set_rw_stateid() for the same contexts and mode. */
+ nfs4_stateid current_stateid;
+
+ if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode)) /* non-zero result: report "not current" without comparing */
+ return false;
+ return nfs4_stateid_match(stateid, &current_stateid);
+}
+
+static bool nfs4_error_stateid_expired(int err)
+{ /* Returns true when err is one of the negative NFS4ERR codes listed in the switch, false for any other value. */
+ switch (err) {
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_OPENMODE:
+ case -NFS4ERR_EXPIRED:
+ return true; /* all seven cases share this single return */
+ }
+ return false;
+}
+
void __nfs4_read_done_cb(struct nfs_read_data *data)
{
nfs_invalidate_atime(data->header->inode);
@@ -3466,6 +3872,20 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
return 0;
}
+static bool nfs4_read_stateid_changed(struct rpc_task *task,
+ struct nfs_readargs *args)
+{ /* Restarts the read RPC and returns true when it failed with a stateid error and args->stateid is no longer current. */
+
+ if (!nfs4_error_stateid_expired(task->tk_status) || /* any other status: leave the task alone */
+ nfs4_stateid_is_current(&args->stateid,
+ args->context,
+ args->lock_context,
+ FMODE_READ)) /* stateid still current: a restart would reuse the same stateid */
+ return false;
+ rpc_restart_call_prepare(task);
+ return true;
+}
+
static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
{
@@ -3473,7 +3893,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
if (!nfs4_sequence_done(task, &data->res.seq_res))
return -EAGAIN;
-
+ if (nfs4_read_stateid_changed(task, &data->args))
+ return -EAGAIN;
return data->read_done_cb ? data->read_done_cb(task, data) :
nfs4_read_done_cb(task, data);
}
@@ -3488,10 +3909,13 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
{
- nfs4_setup_sequence(NFS_SERVER(data->header->inode),
+ if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
&data->args.seq_args,
&data->res.seq_res,
- task);
+ task))
+ return;
+ nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
+ data->args.lock_context, FMODE_READ);
}
static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
@@ -3509,10 +3933,26 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data
return 0;
}
+static bool nfs4_write_stateid_changed(struct rpc_task *task,
+ struct nfs_writeargs *args)
+{ /* Restarts the write RPC and returns true when it failed with a stateid error and args->stateid is no longer current. */
+
+ if (!nfs4_error_stateid_expired(task->tk_status) || /* any other status: leave the task alone */
+ nfs4_stateid_is_current(&args->stateid,
+ args->context,
+ args->lock_context,
+ FMODE_WRITE)) /* stateid still current: a restart would reuse the same stateid */
+ return false;
+ rpc_restart_call_prepare(task);
+ return true;
+}
+
static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
{
if (!nfs4_sequence_done(task, &data->res.seq_res))
return -EAGAIN;
+ if (nfs4_write_stateid_changed(task, &data->args))
+ return -EAGAIN;
return data->write_done_cb ? data->write_done_cb(task, data) :
nfs4_write_done_cb(task, data);
}
@@ -3552,10 +3992,13 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
{
- nfs4_setup_sequence(NFS_SERVER(data->header->inode),
+ if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
&data->args.seq_args,
&data->res.seq_res,
- task);
+ task))
+ return;
+ nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
+ data->args.lock_context, FMODE_WRITE);
}
static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -3657,7 +4100,7 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred,
return -ENOMEM;
data->client = clp;
data->timestamp = jiffies;
- return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
+ return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT,
&nfs4_renew_ops, data);
}
@@ -3671,7 +4114,7 @@ static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
unsigned long now = jiffies;
int status;
- status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
+ status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
if (status < 0)
return status;
do_renew_lease(clp, now);
@@ -3964,6 +4407,155 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
return err;
}
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static int _nfs4_get_security_label(struct inode *inode, void *buf,
+ size_t buflen)
+{ /* One synchronous GETATTR call that requests only the security-label attribute for inode. */
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_fattr fattr;
+ struct nfs4_label label = {0, 0, buflen, buf}; /* positional init; presumably {pi, lfs, len, label} - TODO confirm against the struct declaration */
+
+ u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; /* only the third bitmap word is set */
+ struct nfs4_getattr_arg args = {
+ .fh = NFS_FH(inode),
+ .bitmask = bitmask,
+ };
+ struct nfs4_getattr_res res = {
+ .fattr = &fattr,
+ .label = &label,
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int ret;
+
+ nfs_fattr_init(&fattr);
+
+ ret = rpc_call_sync(server->client, &msg, 0);
+ if (ret) /* RPC status is passed through unchanged */
+ return ret;
+ if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) /* reply did not flag a security label as valid */
+ return -ENOENT;
+ if (buflen < label.len) /* label.len after the call exceeds the caller's buffer size */
+ return -ERANGE;
+ return 0;
+}
+
+static int nfs4_get_security_label(struct inode *inode, void *buf,
+ size_t buflen)
+{ /* Capability check, then _nfs4_get_security_label() repeated for as long as nfs4_handle_exception() sets exception.retry. */
+ struct nfs4_exception exception = { };
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) /* no RPC is issued without the capability */
+ return -EOPNOTSUPP;
+
+ do {
+ err = nfs4_handle_exception(NFS_SERVER(inode),
+ _nfs4_get_security_label(inode, buf, buflen),
+ &exception);
+ } while (exception.retry);
+ return err; /* value produced by the last nfs4_handle_exception() call */
+}
+
+static int _nfs4_do_set_security_label(struct inode *inode,
+ struct nfs4_label *ilabel,
+ struct nfs_fattr *fattr,
+ struct nfs4_label *olabel)
+{ /* One synchronous SETATTR call carrying ilabel in the arguments; fattr and olabel are attached to the result structure. */
+
+ struct iattr sattr = {0}; /* zero-initialized iattr; only the label bitmask word is requested below */
+ struct nfs_server *server = NFS_SERVER(inode);
+ const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+ struct nfs_setattrargs args = {
+ .fh = NFS_FH(inode),
+ .iap = &sattr,
+ .server = server,
+ .bitmask = bitmask,
+ .label = ilabel,
+ };
+ struct nfs_setattrres res = {
+ .fattr = fattr,
+ .label = olabel,
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int status;
+
+ nfs4_stateid_copy(&args.stateid, &zero_stateid); /* the request always uses zero_stateid */
+
+ status = rpc_call_sync(server->client, &msg, 0);
+ if (status) /* failure is only logged here; the status is still returned to the caller */
+ dprintk("%s failed: %d\n", __func__, status);
+
+ return status;
+}
+
+static int nfs4_do_set_security_label(struct inode *inode,
+ struct nfs4_label *ilabel,
+ struct nfs_fattr *fattr,
+ struct nfs4_label *olabel)
+{ /* Runs _nfs4_do_set_security_label() through nfs4_handle_exception(), repeating while exception.retry is set. */
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = nfs4_handle_exception(NFS_SERVER(inode),
+ _nfs4_do_set_security_label(inode, ilabel,
+ fattr, olabel),
+ &exception);
+ } while (exception.retry);
+ return err; /* value produced by the last nfs4_handle_exception() call */
+}
+
+static int
+nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
+{ /* Sends buf/buflen as the security label of dentry's inode; returns the SETATTR path's status or the error of a failed pre-check. */
+ struct nfs4_label ilabel, *olabel = NULL;
+ struct nfs_fattr fattr;
+ struct rpc_cred *cred;
+ struct inode *inode = dentry->d_inode;
+ int status;
+
+ if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) /* nothing has been acquired yet, so a plain return is enough */
+ return -EOPNOTSUPP;
+
+ nfs_fattr_init(&fattr);
+
+ ilabel.pi = 0;
+ ilabel.lfs = 0;
+ ilabel.label = (char *)buf; /* the caller's buffer is referenced, not copied */
+ ilabel.len = buflen;
+
+ cred = rpc_lookup_cred();
+ if (IS_ERR(cred))
+ return PTR_ERR(cred);
+
+ olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (IS_ERR(olabel)) {
+ status = PTR_ERR(olabel); /* PTR_ERR() already yields the negative errno; the former extra negation returned a positive value */
+ goto out;
+ }
+
+ status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel);
+ if (status == 0) /* nfs_setsecurity() is called only after a successful SETATTR */
+ nfs_setsecurity(inode, &fattr, olabel);
+
+ nfs4_label_free(olabel);
+out:
+ put_rpccred(cred); /* reached from both the success and the allocation-failure path */
+ return status;
+}
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+
+
static int
nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
{
@@ -3981,11 +4573,14 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
case -NFS4ERR_OPENMODE:
if (state == NULL)
break;
- nfs4_schedule_stateid_recovery(server, state);
+ if (nfs4_schedule_stateid_recovery(server, state) < 0)
+ goto stateid_invalid;
goto wait_on_recovery;
case -NFS4ERR_EXPIRED:
- if (state != NULL)
- nfs4_schedule_stateid_recovery(server, state);
+ if (state != NULL) {
+ if (nfs4_schedule_stateid_recovery(server, state) < 0)
+ goto stateid_invalid;
+ }
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_STALE_CLIENTID:
nfs4_schedule_lease_recovery(clp);
@@ -4017,6 +4612,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
}
task->tk_status = nfs4_map_errors(task->tk_status);
return 0;
+stateid_invalid:
+ task->tk_status = -EIO;
+ return 0;
wait_on_recovery:
rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
@@ -4116,7 +4714,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
/* cb_client4 */
rcu_read_lock();
setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
- sizeof(setclientid.sc_netid),
+ sizeof(setclientid.sc_netid), "%s",
rpc_peeraddr2str(clp->cl_rpcclient,
RPC_DISPLAY_NETID));
rcu_read_unlock();
@@ -4144,27 +4742,17 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
struct nfs4_setclientid_res *arg,
struct rpc_cred *cred)
{
- struct nfs_fsinfo fsinfo;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
.rpc_argp = arg,
- .rpc_resp = &fsinfo,
.rpc_cred = cred,
};
- unsigned long now;
int status;
dprintk("NFS call setclientid_confirm auth=%s, (client ID %llx)\n",
clp->cl_rpcclient->cl_auth->au_ops->au_name,
clp->cl_clientid);
- now = jiffies;
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
- if (status == 0) {
- spin_lock(&clp->cl_lock);
- clp->cl_lease_time = fsinfo.lease_time * HZ;
- clp->cl_last_renewal = now;
- spin_unlock(&clp->cl_lock);
- }
dprintk("NFS reply setclientid_confirm: %d\n", status);
return status;
}
@@ -4309,7 +4897,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
static unsigned long
nfs4_set_lock_task_retry(unsigned long timeout)
{
- freezable_schedule_timeout_killable(timeout);
+ freezable_schedule_timeout_killable_unsafe(timeout);
timeout <<= 1;
if (timeout > NFS4_LOCK_MAXTIMEOUT)
return NFS4_LOCK_MAXTIMEOUT;
@@ -4547,9 +5135,9 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
if (status != 0)
goto out;
/* Is this a delegated lock? */
- if (test_bit(NFS_DELEGATED_STATE, &state->flags))
- goto out;
lsp = request->fl_u.nfs4_fl.owner;
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
+ goto out;
seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
status = -ENOMEM;
if (seqid == NULL)
@@ -4628,17 +5216,23 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
goto out_release_lock_seqid;
}
- data->arg.open_stateid = &state->stateid;
+ data->arg.open_stateid = &state->open_stateid;
data->arg.new_lock_owner = 1;
data->res.open_seqid = data->arg.open_seqid;
} else
data->arg.new_lock_owner = 0;
+ if (!nfs4_valid_open_stateid(state)) {
+ data->rpc_status = -EBADF;
+ task->tk_action = NULL;
+ goto out_release_open_seqid;
+ }
data->timestamp = jiffies;
if (nfs4_setup_sequence(data->server,
&data->arg.seq_args,
&data->res.seq_res,
task) == 0)
return;
+out_release_open_seqid:
nfs_release_seqid(data->arg.open_seqid);
out_release_lock_seqid:
nfs_release_seqid(data->arg.lock_seqid);
@@ -4831,13 +5425,18 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
list_for_each_entry(lsp, &state->lock_states, ls_locks) {
if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
- status = nfs41_test_stateid(server, &lsp->ls_stateid);
+ struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+ status = nfs41_test_stateid(server,
+ &lsp->ls_stateid,
+ cred);
if (status != NFS_OK) {
/* Free the stateid unless the server
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
nfs41_free_stateid(server,
- &lsp->ls_stateid);
+ &lsp->ls_stateid,
+ cred);
clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
ret = status;
}
@@ -4984,58 +5583,16 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
return status;
}
-int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
+int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid)
{
struct nfs_server *server = NFS_SERVER(state->inode);
- struct nfs4_exception exception = { };
int err;
err = nfs4_set_lock_state(state, fl);
if (err != 0)
- goto out;
- do {
- err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
- switch (err) {
- default:
- printk(KERN_ERR "NFS: %s: unhandled error "
- "%d.\n", __func__, err);
- case 0:
- case -ESTALE:
- goto out;
- case -NFS4ERR_STALE_CLIENTID:
- case -NFS4ERR_STALE_STATEID:
- set_bit(NFS_DELEGATED_STATE, &state->flags);
- case -NFS4ERR_EXPIRED:
- nfs4_schedule_lease_recovery(server->nfs_client);
- err = -EAGAIN;
- goto out;
- case -NFS4ERR_BADSESSION:
- case -NFS4ERR_BADSLOT:
- case -NFS4ERR_BAD_HIGH_SLOT:
- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- case -NFS4ERR_DEADSESSION:
- set_bit(NFS_DELEGATED_STATE, &state->flags);
- nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
- err = -EAGAIN;
- goto out;
- case -NFS4ERR_DELEG_REVOKED:
- case -NFS4ERR_ADMIN_REVOKED:
- case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_OPENMODE:
- nfs4_schedule_stateid_recovery(server, state);
- err = 0;
- goto out;
- case -ENOMEM:
- case -NFS4ERR_DENIED:
- /* kill_proc(fl->fl_pid, SIGLOST, 1); */
- err = 0;
- goto out;
- }
- set_bit(NFS_DELEGATED_STATE, &state->flags);
- err = nfs4_handle_exception(server, err, &exception);
- } while (exception.retry);
-out:
- return err;
+ return err;
+ err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
+ return nfs4_handle_delegation_recall_error(server, state, stateid, err);
}
struct nfs_release_lockowner_data {
@@ -5055,9 +5612,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = {
.rpc_release = nfs4_release_lockowner_release,
};
-int nfs4_release_lockowner(struct nfs4_lock_state *lsp)
+static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
- struct nfs_server *server = lsp->ls_state->owner->so_server;
struct nfs_release_lockowner_data *data;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
@@ -5113,6 +5669,53 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
return len;
}
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline int nfs4_server_supports_labels(struct nfs_server *server)
+{ /* Non-zero when NFS_CAP_SECURITY_LABEL is set in server->caps; the masked value itself is returned, not 0/1. */
+ return server->caps & NFS_CAP_SECURITY_LABEL;
+}
+
+static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
+ const void *buf, size_t buflen,
+ int flags, int type)
+{ /* xattr "set" callback: forwards to nfs4_set_security_label() when security_ismaclabel(key) is true; flags and type are not used. */
+ if (security_ismaclabel(key))
+ return nfs4_set_security_label(dentry, buf, buflen);
+
+ return -EOPNOTSUPP; /* any other key is rejected */
+}
+
+static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
+ void *buf, size_t buflen, int type)
+{ /* xattr "get" callback: forwards to nfs4_get_security_label() when security_ismaclabel(key) is true; type is not used. */
+ if (security_ismaclabel(key))
+ return nfs4_get_security_label(dentry->d_inode, buf, buflen);
+ return -EOPNOTSUPP; /* any other key is rejected */
+}
+
+static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
+ size_t list_len, const char *name,
+ size_t name_len, int type)
+{ /* xattr "list" callback: returns the length reported by security_inode_listsecurity(), or 0 without the label capability. */
+ size_t len = 0;
+
+ if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) {
+ len = security_inode_listsecurity(dentry->d_inode, NULL, 0); /* first call with a NULL buffer only obtains the length */
+ if (list && len <= list_len) /* fill the caller's buffer only when one was given and the result fits */
+ security_inode_listsecurity(dentry->d_inode, list, len);
+ }
+ return len; /* name, name_len and type are not used */
+}
+
+static const struct xattr_handler nfs4_xattr_nfs4_label_handler = { /* binds the three label callbacks to XATTR_SECURITY_PREFIX */
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = nfs4_xattr_list_nfs4_label,
+ .get = nfs4_xattr_get_nfs4_label,
+ .set = nfs4_xattr_set_nfs4_label,
+};
+#endif
+
+
/*
* nfs_fhget will use either the mounted_on_fileid or the fileid
*/
@@ -5136,7 +5739,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
struct page *page)
{
struct nfs_server *server = NFS_SERVER(dir);
- u32 bitmask[2] = {
+ u32 bitmask[3] = {
[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
};
struct nfs4_fs_locations_arg args = {
@@ -5323,7 +5926,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
struct nfs41_exchange_id_args args = {
.verifier = &verifier,
.client = clp,
- .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
+ .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+ EXCHGID4_FLAG_BIND_PRINC_STATEID,
};
struct nfs41_exchange_id_res res = {
0
@@ -5580,17 +6184,14 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
*/
static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
{
- struct nfs4_session *session = args->client->cl_session;
- unsigned int mxrqst_sz = session->fc_target_max_rqst_sz,
- mxresp_sz = session->fc_target_max_resp_sz;
+ unsigned int max_rqst_sz, max_resp_sz;
+
+ max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
+ max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
- if (mxrqst_sz == 0)
- mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
- if (mxresp_sz == 0)
- mxresp_sz = NFS_MAX_FILE_IO_SIZE;
/* Fore channel attributes */
- args->fc_attrs.max_rqst_sz = mxrqst_sz;
- args->fc_attrs.max_resp_sz = mxresp_sz;
+ args->fc_attrs.max_rqst_sz = max_rqst_sz;
+ args->fc_attrs.max_resp_sz = max_resp_sz;
args->fc_attrs.max_ops = NFS4_MAX_OPS;
args->fc_attrs.max_reqs = max_session_slots;
@@ -5849,7 +6450,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
.rpc_client = clp->cl_rpcclient,
.rpc_message = &msg,
.callback_ops = &nfs41_sequence_ops,
- .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
};
if (!atomic_inc_not_zero(&clp->cl_count))
@@ -5977,12 +6578,14 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
/*
* Issue a global reclaim complete.
*/
-static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
+static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
+ struct rpc_cred *cred)
{
struct nfs4_reclaim_complete_data *calldata;
struct rpc_task *task;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
+ .rpc_cred = cred,
};
struct rpc_task_setup task_setup_data = {
.rpc_client = clp->cl_rpcclient,
@@ -6166,6 +6769,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
.rpc_argp = &lgp->args,
.rpc_resp = &lgp->res,
+ .rpc_cred = lgp->cred,
};
struct rpc_task_setup task_setup_data = {
.rpc_client = server->client,
@@ -6269,6 +6873,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
.rpc_argp = &lrp->args,
.rpc_resp = &lrp->res,
+ .rpc_cred = lrp->cred,
};
struct rpc_task_setup task_setup_data = {
.rpc_client = lrp->clp->cl_rpcclient,
@@ -6338,7 +6943,9 @@ int nfs4_proc_getdevicelist(struct nfs_server *server,
EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
static int
-_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+_nfs4_proc_getdeviceinfo(struct nfs_server *server,
+ struct pnfs_device *pdev,
+ struct rpc_cred *cred)
{
struct nfs4_getdeviceinfo_args args = {
.pdev = pdev,
@@ -6350,6 +6957,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
.rpc_argp = &args,
.rpc_resp = &res,
+ .rpc_cred = cred,
};
int status;
@@ -6360,14 +6968,16 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
return status;
}
-int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+ struct pnfs_device *pdev,
+ struct rpc_cred *cred)
{
struct nfs4_exception exception = { };
int err;
do {
err = nfs4_handle_exception(server,
- _nfs4_proc_getdeviceinfo(server, pdev),
+ _nfs4_proc_getdeviceinfo(server, pdev, cred),
&exception);
} while (exception.retry);
return err;
@@ -6551,7 +7161,9 @@ out:
return err;
}
-static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int _nfs41_test_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
{
int status;
struct nfs41_test_stateid_args args = {
@@ -6562,6 +7174,7 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
.rpc_argp = &args,
.rpc_resp = &res,
+ .rpc_cred = cred,
};
dprintk("NFS call test_stateid %p\n", stateid);
@@ -6582,17 +7195,20 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
*
* @server: server / transport on which to perform the operation
* @stateid: state ID to test
+ * @cred: credential
*
* Returns NFS_OK if the server recognizes that "stateid" is valid.
* Otherwise a negative NFS4ERR value is returned if the operation
* failed or the state ID is not currently valid.
*/
-static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int nfs41_test_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
{
struct nfs4_exception exception = { };
int err;
do {
- err = _nfs41_test_stateid(server, stateid);
+ err = _nfs41_test_stateid(server, stateid, cred);
if (err != -NFS4ERR_DELAY)
break;
nfs4_handle_exception(server, err, &exception);
@@ -6600,26 +7216,78 @@ static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
return err;
}
-static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
-{
- struct nfs41_free_stateid_args args = {
- .stateid = stateid,
- };
+struct nfs_free_stateid_data {
+ struct nfs_server *server;
+ struct nfs41_free_stateid_args args;
struct nfs41_free_stateid_res res;
+};
+
+static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata)
+{ /* rpc_call_prepare callback: sets up the sequence for the task on the session returned by nfs4_get_session(data->server). */
+ struct nfs_free_stateid_data *data = calldata;
+ nfs41_setup_sequence(nfs4_get_session(data->server),
+ &data->args.seq_args,
+ &data->res.seq_res,
+ task); /* the return value is not checked here */
+}
+
+static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
+{ /* rpc_call_done callback: completes the sequence, then restarts the call only for a NFS4ERR_DELAY status. */
+ struct nfs_free_stateid_data *data = calldata;
+
+ nfs41_sequence_done(task, &data->res.seq_res);
+
+ switch (task->tk_status) { /* every status other than -NFS4ERR_DELAY falls through with no action */
+ case -NFS4ERR_DELAY:
+ if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN) /* restart only when the handler asks for a retry */
+ rpc_restart_call_prepare(task);
+ }
+}
+
+static void nfs41_free_stateid_release(void *calldata)
+{ /* rpc_release callback: frees the calldata with kfree(). */
+ kfree(calldata);
+}
+
+const struct rpc_call_ops nfs41_free_stateid_ops = { /* callback table wiring the three nfs41_free_stateid_* functions; not declared static */
+ .rpc_call_prepare = nfs41_free_stateid_prepare,
+ .rpc_call_done = nfs41_free_stateid_done,
+ .rpc_release = nfs41_free_stateid_release,
+};
+
+static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred,
+ bool privileged)
+{
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
- .rpc_argp = &args,
- .rpc_resp = &res,
+ .rpc_cred = cred,
};
- int status;
+ struct rpc_task_setup task_setup = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs41_free_stateid_ops,
+ .flags = RPC_TASK_ASYNC,
+ };
+ struct nfs_free_stateid_data *data;
dprintk("NFS call free_stateid %p\n", stateid);
- nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
- nfs4_set_sequence_privileged(&args.seq_args);
- status = nfs4_call_sync_sequence(server->client, server, &msg,
- &args.seq_args, &res.seq_res);
- dprintk("NFS reply free_stateid: %d\n", status);
- return status;
+ data = kmalloc(sizeof(*data), GFP_NOFS);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+ data->server = server;
+ nfs4_stateid_copy(&data->args.stateid, stateid);
+
+ task_setup.callback_data = data;
+
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+ if (privileged)
+ nfs4_set_sequence_privileged(&data->args.seq_args);
+
+ return rpc_run_task(&task_setup);
}
/**
@@ -6627,21 +7295,39 @@ static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
*
* @server: server / transport on which to perform the operation
* @stateid: state ID to release
+ * @cred: credential
*
* Returns NFS_OK if the server freed "stateid". Otherwise a
* negative NFS4ERR value is returned.
*/
-static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int nfs41_free_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
{
- struct nfs4_exception exception = { };
- int err;
- do {
- err = _nfs4_free_stateid(server, stateid);
- if (err != -NFS4ERR_DELAY)
- break;
- nfs4_handle_exception(server, err, &exception);
- } while (exception.retry);
- return err;
+ struct rpc_task *task;
+ int ret;
+
+ task = _nfs41_free_stateid(server, stateid, cred, true);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ ret = rpc_wait_for_completion_task(task);
+ if (!ret)
+ ret = task->tk_status;
+ rpc_put_task(task);
+ return ret;
+}
+
+static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+{
+ struct rpc_task *task;
+ struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+ task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
+ nfs4_free_lock_state(server, lsp);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task);
+ return 0;
}
static bool nfs41_match_stateid(const nfs4_stateid *s1,
@@ -6726,9 +7412,14 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
.minor_version = 0,
+ .init_caps = NFS_CAP_READDIRPLUS
+ | NFS_CAP_ATOMIC_OPEN
+ | NFS_CAP_CHANGE_ATTR
+ | NFS_CAP_POSIX_LOCK,
.call_sync = _nfs4_call_sync,
.match_stateid = nfs4_match_stateid,
.find_root_sec = nfs4_find_root_sec,
+ .free_lock_state = nfs4_release_lockowner,
.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
.state_renewal_ops = &nfs40_state_renewal_ops,
@@ -6737,9 +7428,35 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
#if defined(CONFIG_NFS_V4_1)
static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
.minor_version = 1,
+ .init_caps = NFS_CAP_READDIRPLUS
+ | NFS_CAP_ATOMIC_OPEN
+ | NFS_CAP_CHANGE_ATTR
+ | NFS_CAP_POSIX_LOCK
+ | NFS_CAP_STATEID_NFSV41
+ | NFS_CAP_ATOMIC_OPEN_V1,
+ .call_sync = nfs4_call_sync_sequence,
+ .match_stateid = nfs41_match_stateid,
+ .find_root_sec = nfs41_find_root_sec,
+ .free_lock_state = nfs41_free_lock_state,
+ .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+ .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+ .state_renewal_ops = &nfs41_state_renewal_ops,
+};
+#endif
+
+#if defined(CONFIG_NFS_V4_2)
+static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
+ .minor_version = 2,
+ .init_caps = NFS_CAP_READDIRPLUS
+ | NFS_CAP_ATOMIC_OPEN
+ | NFS_CAP_CHANGE_ATTR
+ | NFS_CAP_POSIX_LOCK
+ | NFS_CAP_STATEID_NFSV41
+ | NFS_CAP_ATOMIC_OPEN_V1,
.call_sync = nfs4_call_sync_sequence,
.match_stateid = nfs41_match_stateid,
.find_root_sec = nfs41_find_root_sec,
+ .free_lock_state = nfs41_free_lock_state,
.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
.state_renewal_ops = &nfs41_state_renewal_ops,
@@ -6751,6 +7468,9 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
#if defined(CONFIG_NFS_V4_1)
[1] = &nfs_v4_1_minor_ops,
#endif
+#if defined(CONFIG_NFS_V4_2)
+ [2] = &nfs_v4_2_minor_ops,
+#endif
};
const struct inode_operations nfs4_dir_inode_operations = {
@@ -6850,6 +7570,9 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
const struct xattr_handler *nfs4_xattr_handlers[] = {
&nfs4_xattr_nfs4_acl_handler,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ &nfs4_xattr_nfs4_label_handler,
+#endif
NULL
};
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index ebda5f4a031b..36e21cb29d65 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -73,7 +73,7 @@ void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
tbl->highest_used_slotid = new_max;
else {
tbl->highest_used_slotid = NFS4_NO_SLOT;
- nfs4_session_drain_complete(tbl->session, tbl);
+ nfs4_slot_tbl_drain_complete(tbl);
}
}
dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
@@ -226,7 +226,7 @@ static bool nfs41_assign_slot(struct rpc_task *task, void *pslot)
struct nfs4_slot *slot = pslot;
struct nfs4_slot_table *tbl = slot->table;
- if (nfs4_session_draining(tbl->session) && !args->sa_privileged)
+ if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
return false;
slot->generation = tbl->generation;
args->sa_slot = slot;
@@ -478,48 +478,12 @@ static int nfs41_check_session_ready(struct nfs_client *clp)
return 0;
}
-int nfs4_init_session(struct nfs_server *server)
+int nfs4_init_session(struct nfs_client *clp)
{
- struct nfs_client *clp = server->nfs_client;
- struct nfs4_session *session;
- unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE;
- unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE;
-
if (!nfs4_has_session(clp))
return 0;
- if (server->rsize != 0)
- target_max_resp_sz = server->rsize;
- target_max_resp_sz += nfs41_maxread_overhead;
-
- if (server->wsize != 0)
- target_max_rqst_sz = server->wsize;
- target_max_rqst_sz += nfs41_maxwrite_overhead;
-
- session = clp->cl_session;
- spin_lock(&clp->cl_lock);
- if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
- /* Initialise targets and channel attributes */
- session->fc_target_max_rqst_sz = target_max_rqst_sz;
- session->fc_attrs.max_rqst_sz = target_max_rqst_sz;
- session->fc_target_max_resp_sz = target_max_resp_sz;
- session->fc_attrs.max_resp_sz = target_max_resp_sz;
- } else {
- /* Just adjust the targets */
- if (target_max_rqst_sz > session->fc_target_max_rqst_sz) {
- session->fc_target_max_rqst_sz = target_max_rqst_sz;
- set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
- }
- if (target_max_resp_sz > session->fc_target_max_resp_sz) {
- session->fc_target_max_resp_sz = target_max_resp_sz;
- set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
- }
- }
- spin_unlock(&clp->cl_lock);
-
- if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
- nfs4_schedule_lease_recovery(clp);
-
+ clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state);
return nfs41_check_session_ready(clp);
}
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 6f3cb39386d4..3a153d82b90c 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -25,6 +25,10 @@ struct nfs4_slot {
};
/* Sessions */
+enum nfs4_slot_tbl_state {
+ NFS4_SLOT_TBL_DRAINING,
+};
+
#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long))
struct nfs4_slot_table {
struct nfs4_session *session; /* Parent session */
@@ -43,6 +47,7 @@ struct nfs4_slot_table {
unsigned long generation; /* Generation counter for
target_highest_slotid */
struct completion complete;
+ unsigned long slot_tbl_state;
};
/*
@@ -61,14 +66,10 @@ struct nfs4_session {
struct nfs4_channel_attrs bc_attrs;
struct nfs4_slot_table bc_slot_table;
struct nfs_client *clp;
- /* Create session arguments */
- unsigned int fc_target_max_rqst_sz;
- unsigned int fc_target_max_resp_sz;
};
enum nfs4_session_state {
NFS4_SESSION_INITING,
- NFS4_SESSION_DRAINING,
};
#if defined(CONFIG_NFS_V4_1)
@@ -85,15 +86,14 @@ extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
extern void nfs4_destroy_session(struct nfs4_session *session);
-extern int nfs4_init_session(struct nfs_server *server);
+extern int nfs4_init_session(struct nfs_client *clp);
extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
-extern void nfs4_session_drain_complete(struct nfs4_session *session,
- struct nfs4_slot_table *tbl);
+extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
-static inline bool nfs4_session_draining(struct nfs4_session *session)
+static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
{
- return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state);
+ return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
}
bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
@@ -119,7 +119,7 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
#else /* defined(CONFIG_NFS_V4_1) */
-static inline int nfs4_init_session(struct nfs_server *server)
+static inline int nfs4_init_session(struct nfs_client *clp)
{
return 0;
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index d41a3518509f..e22862f13564 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -154,18 +154,6 @@ struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
return cred;
}
-static void nfs4_clear_machine_cred(struct nfs_client *clp)
-{
- struct rpc_cred *cred;
-
- spin_lock(&clp->cl_lock);
- cred = clp->cl_machine_cred;
- clp->cl_machine_cred = NULL;
- spin_unlock(&clp->cl_lock);
- if (cred != NULL)
- put_rpccred(cred);
-}
-
static struct rpc_cred *
nfs4_get_renew_cred_server_locked(struct nfs_server *server)
{
@@ -240,38 +228,37 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
return status;
}
-/*
- * Back channel returns NFS4ERR_DELAY for new requests when
- * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
- * is ended.
- */
-static void nfs4_end_drain_session(struct nfs_client *clp)
+static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
{
- struct nfs4_session *ses = clp->cl_session;
- struct nfs4_slot_table *tbl;
-
- if (ses == NULL)
- return;
- tbl = &ses->fc_slot_table;
- if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
+ if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
spin_lock(&tbl->slot_tbl_lock);
nfs41_wake_slot_table(tbl);
spin_unlock(&tbl->slot_tbl_lock);
}
}
+static void nfs4_end_drain_session(struct nfs_client *clp)
+{
+ struct nfs4_session *ses = clp->cl_session;
+
+ if (ses != NULL) {
+ nfs4_end_drain_slot_table(&ses->bc_slot_table);
+ nfs4_end_drain_slot_table(&ses->fc_slot_table);
+ }
+}
+
/*
* Signal state manager thread if session fore channel is drained
*/
-void nfs4_session_drain_complete(struct nfs4_session *session,
- struct nfs4_slot_table *tbl)
+void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
{
- if (nfs4_session_draining(session))
+ if (nfs4_slot_tbl_draining(tbl))
complete(&tbl->complete);
}
-static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
+static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
{
+ set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
spin_lock(&tbl->slot_tbl_lock);
if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
INIT_COMPLETION(tbl->complete);
@@ -287,13 +274,12 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
struct nfs4_session *ses = clp->cl_session;
int ret = 0;
- set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
/* back channel */
- ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table);
+ ret = nfs4_drain_slot_tbl(&ses->bc_slot_table);
if (ret)
return ret;
/* fore channel */
- return nfs4_wait_on_slot_tbl(&ses->fc_slot_table);
+ return nfs4_drain_slot_tbl(&ses->fc_slot_table);
}
static void nfs41_finish_session_reset(struct nfs_client *clp)
@@ -699,6 +685,8 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner)
list_for_each_entry(state, &nfsi->open_states, inode_states) {
if (state->owner != owner)
continue;
+ if (!nfs4_valid_open_stateid(state))
+ continue;
if (atomic_inc_not_zero(&state->count))
return state;
}
@@ -931,6 +919,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
*/
void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
{
+ struct nfs_server *server;
struct nfs4_state *state;
if (lsp == NULL)
@@ -942,11 +931,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
if (list_empty(&state->lock_states))
clear_bit(LK_STATE_IN_USE, &state->flags);
spin_unlock(&state->state_lock);
+ server = state->owner->so_server;
if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
- if (nfs4_release_lockowner(lsp) == 0)
- return;
- }
- nfs4_free_lock_state(lsp->ls_state->owner->so_server, lsp);
+ struct nfs_client *clp = server->nfs_client;
+
+ clp->cl_mvops->free_lock_state(server, lsp);
+ } else
+ nfs4_free_lock_state(server, lsp);
}
static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
@@ -987,13 +978,14 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
return 0;
}
-static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state,
+static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
+ struct nfs4_state *state,
const struct nfs_lockowner *lockowner)
{
struct nfs4_lock_state *lsp;
fl_owner_t fl_owner;
pid_t fl_pid;
- bool ret = false;
+ int ret = -ENOENT;
if (lockowner == NULL)
@@ -1008,7 +1000,10 @@ static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state,
lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
nfs4_stateid_copy(dst, &lsp->ls_stateid);
- ret = true;
+ ret = 0;
+ smp_rmb();
+ if (!list_empty(&lsp->ls_seqid.list))
+ ret = -EWOULDBLOCK;
}
spin_unlock(&state->state_lock);
nfs4_put_lock_state(lsp);
@@ -1016,28 +1011,44 @@ out:
return ret;
}
-static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
+static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
{
+ const nfs4_stateid *src;
+ int ret;
int seq;
do {
+ src = &zero_stateid;
seq = read_seqbegin(&state->seqlock);
- nfs4_stateid_copy(dst, &state->stateid);
+ if (test_bit(NFS_OPEN_STATE, &state->flags))
+ src = &state->open_stateid;
+ nfs4_stateid_copy(dst, src);
+ ret = 0;
+ smp_rmb();
+ if (!list_empty(&state->owner->so_seqid.list))
+ ret = -EWOULDBLOCK;
} while (read_seqretry(&state->seqlock, seq));
+ return ret;
}
/*
* Byte-range lock aware utility to initialize the stateid of read/write
* requests.
*/
-void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
+int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
fmode_t fmode, const struct nfs_lockowner *lockowner)
{
+ int ret = 0;
if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
- return;
- if (nfs4_copy_lock_stateid(dst, state, lockowner))
- return;
- nfs4_copy_open_stateid(dst, state);
+ goto out;
+ ret = nfs4_copy_lock_stateid(dst, state, lockowner);
+ if (ret != -ENOENT)
+ goto out;
+ ret = nfs4_copy_open_stateid(dst, state);
+out:
+ if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41))
+ dst->seqid = 0;
+ return ret;
}
struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
@@ -1182,7 +1193,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
snprintf(buf, sizeof(buf), "%s-manager",
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
rcu_read_unlock();
- task = kthread_run(nfs4_run_state_manager, clp, buf);
+ task = kthread_run(nfs4_run_state_manager, clp, "%s", buf);
if (IS_ERR(task)) {
printk(KERN_ERR "%s: kthread_run: %ld\n",
__func__, PTR_ERR(task));
@@ -1286,14 +1297,17 @@ static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_s
return 1;
}
-void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
+int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
{
struct nfs_client *clp = server->nfs_client;
+ if (!nfs4_valid_open_stateid(state))
+ return -EBADF;
nfs4_state_mark_reclaim_nograce(clp, state);
dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
clp->cl_hostname);
nfs4_schedule_state_manager(clp);
+ return 0;
}
EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
@@ -1323,6 +1337,27 @@ void nfs_inode_find_state_and_recover(struct inode *inode,
nfs4_schedule_state_manager(clp);
}
+static void nfs4_state_mark_open_context_bad(struct nfs4_state *state)
+{
+ struct inode *inode = state->inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_context *ctx;
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry(ctx, &nfsi->open_files, list) {
+ if (ctx->state != state)
+ continue;
+ set_bit(NFS_CONTEXT_BAD, &ctx->flags);
+ }
+ spin_unlock(&inode->i_lock);
+}
+
+static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error)
+{
+ set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags);
+ nfs4_state_mark_open_context_bad(state);
+}
+
static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
{
@@ -1337,13 +1372,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
/* Guard against delegation returns and new lock/unlock calls */
down_write(&nfsi->rwsem);
/* Protect inode->i_flock using the BKL */
- lock_flocks();
+ spin_lock(&inode->i_lock);
for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
continue;
if (nfs_file_open_context(fl->fl_file)->state != state)
continue;
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
status = ops->recover_lock(state, fl);
switch (status) {
case 0:
@@ -1370,9 +1405,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
/* kill_proc(fl->fl_pid, SIGLOST, 1); */
status = 0;
}
- lock_flocks();
+ spin_lock(&inode->i_lock);
}
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
out:
up_write(&nfsi->rwsem);
return status;
@@ -1398,6 +1433,8 @@ restart:
list_for_each_entry(state, &sp->so_states, open_states) {
if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
continue;
+ if (!nfs4_valid_open_stateid(state))
+ continue;
if (state->state == 0)
continue;
atomic_inc(&state->count);
@@ -1430,11 +1467,10 @@ restart:
* Open state on this file cannot be recovered
* All we can do is revert to using the zero stateid.
*/
- memset(&state->stateid, 0,
- sizeof(state->stateid));
- /* Mark the file as being 'closed' */
- state->state = 0;
+ nfs4_state_mark_recovery_failed(state, status);
break;
+ case -EAGAIN:
+ ssleep(1);
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_BAD_STATEID:
@@ -1526,11 +1562,12 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
}
static void nfs4_reclaim_complete(struct nfs_client *clp,
- const struct nfs4_state_recovery_ops *ops)
+ const struct nfs4_state_recovery_ops *ops,
+ struct rpc_cred *cred)
{
/* Notify the server we're done reclaiming our state */
if (ops->reclaim_complete)
- (void)ops->reclaim_complete(clp);
+ (void)ops->reclaim_complete(clp, cred);
}
static void nfs4_clear_reclaim_server(struct nfs_server *server)
@@ -1575,9 +1612,15 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
{
+ const struct nfs4_state_recovery_ops *ops;
+ struct rpc_cred *cred;
+
if (!nfs4_state_clear_reclaim_reboot(clp))
return;
- nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
+ ops = clp->cl_mvops->reboot_recovery_ops;
+ cred = ops->get_clid_cred(clp);
+ nfs4_reclaim_complete(clp, ops, cred);
+ put_rpccred(cred);
}
static void nfs_delegation_clear_all(struct nfs_client *clp)
@@ -1696,6 +1739,10 @@ static int nfs4_check_lease(struct nfs_client *clp)
}
status = ops->renew_lease(clp, cred);
put_rpccred(cred);
+ if (status == -ETIMEDOUT) {
+ set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+ return 0;
+ }
out:
return nfs4_recovery_handle_error(clp, status);
}
@@ -1725,10 +1772,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
return -EPERM;
case -EACCES:
- if (clp->cl_machine_cred == NULL)
- return -EACCES;
- /* Handle case where the user hasn't set up machine creds */
- nfs4_clear_machine_cred(clp);
case -NFS4ERR_DELAY:
case -ETIMEDOUT:
case -EAGAIN:
@@ -1823,31 +1866,18 @@ int nfs4_discover_server_trunking(struct nfs_client *clp,
{
const struct nfs4_state_recovery_ops *ops =
clp->cl_mvops->reboot_recovery_ops;
- rpc_authflavor_t *flavors, flav, save;
struct rpc_clnt *clnt;
struct rpc_cred *cred;
- int i, len, status;
+ int i, status;
dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname);
- len = NFS_MAX_SECFLAVORS;
- flavors = kcalloc(len, sizeof(*flavors), GFP_KERNEL);
- if (flavors == NULL) {
- status = -ENOMEM;
- goto out;
- }
- len = rpcauth_list_flavors(flavors, len);
- if (len < 0) {
- status = len;
- goto out_free;
- }
clnt = clp->cl_rpcclient;
- save = clnt->cl_auth->au_flavor;
i = 0;
mutex_lock(&nfs_clid_init_mutex);
- status = -ENOENT;
again:
+ status = -ENOENT;
cred = ops->get_clid_cred(clp);
if (cred == NULL)
goto out_unlock;
@@ -1857,12 +1887,6 @@ again:
switch (status) {
case 0:
break;
-
- case -EACCES:
- if (clp->cl_machine_cred == NULL)
- break;
- /* Handle case where the user hasn't set up machine creds */
- nfs4_clear_machine_cred(clp);
case -NFS4ERR_DELAY:
case -ETIMEDOUT:
case -EAGAIN:
@@ -1871,17 +1895,12 @@ again:
dprintk("NFS: %s after status %d, retrying\n",
__func__, status);
goto again;
-
+ case -EACCES:
+ if (i++)
+ break;
case -NFS4ERR_CLID_INUSE:
case -NFS4ERR_WRONGSEC:
- status = -EPERM;
- if (i >= len)
- break;
-
- flav = flavors[i++];
- if (flav == save)
- flav = flavors[i++];
- clnt = rpc_clone_client_set_auth(clnt, flav);
+ clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX);
if (IS_ERR(clnt)) {
status = PTR_ERR(clnt);
break;
@@ -1903,13 +1922,15 @@ again:
case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
* in nfs4_exchange_id */
status = -EKEYEXPIRED;
+ break;
+ default:
+ pr_warn("NFS: %s unhandled error %d. Exiting with error EIO\n",
+ __func__, status);
+ status = -EIO;
}
out_unlock:
mutex_unlock(&nfs_clid_init_mutex);
-out_free:
- kfree(flavors);
-out:
dprintk("NFS: %s: status = %d\n", __func__, status);
return status;
}
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 569b166cc050..5dbe2d269210 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -9,6 +9,7 @@
#include "delegation.h"
#include "internal.h"
#include "nfs4_fs.h"
+#include "dns_resolve.h"
#include "pnfs.h"
#include "nfs.h"
@@ -252,6 +253,8 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name,
dfprintk(MOUNT, "--> nfs4_try_mount()\n");
+ if (data->auth_flavors[0] == RPC_AUTH_MAXFLAVOR)
+ data->auth_flavors[0] = RPC_AUTH_UNIX;
export_path = data->nfs_server.export_path;
data->nfs_server.export_path = "/";
root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info,
@@ -329,18 +332,24 @@ static int __init init_nfs_v4(void)
{
int err;
- err = nfs_idmap_init();
+ err = nfs_dns_resolver_init();
if (err)
goto out;
- err = nfs4_register_sysctl();
+ err = nfs_idmap_init();
if (err)
goto out1;
+ err = nfs4_register_sysctl();
+ if (err)
+ goto out2;
+
register_nfs_version(&nfs_v4);
return 0;
-out1:
+out2:
nfs_idmap_quit();
+out1:
+ nfs_dns_resolver_destroy();
out:
return err;
}
@@ -350,6 +359,7 @@ static void __exit exit_nfs_v4(void)
unregister_nfs_version(&nfs_v4);
nfs4_unregister_sysctl();
nfs_idmap_quit();
+ nfs_dns_resolver_destroy();
}
MODULE_LICENSE("GPL");
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e3edda554ac7..0abfb8466e79 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -102,12 +102,23 @@ static int nfs4_stat_to_errno(int);
#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
+#define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
+#define encode_readdir_space 24
+#define encode_readdir_bitmask_sz 3
+#else
+#define nfs4_label_maxsz 0
+#define encode_readdir_space 20
+#define encode_readdir_bitmask_sz 2
+#endif
/* We support only one layout type per file system */
#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
/* This is based on getfattr, which uses the most attributes: */
#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
3 + 3 + 3 + nfs4_owner_maxsz + \
- nfs4_group_maxsz + decode_mdsthreshold_maxsz))
+ nfs4_group_maxsz + nfs4_label_maxsz + \
+ decode_mdsthreshold_maxsz))
#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \
nfs4_fattr_value_maxsz)
#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
@@ -115,6 +126,7 @@ static int nfs4_stat_to_errno(int);
1 + 2 + 1 + \
nfs4_owner_maxsz + \
nfs4_group_maxsz + \
+ nfs4_label_maxsz + \
4 + 4)
#define encode_savefh_maxsz (op_encode_hdr_maxsz)
#define decode_savefh_maxsz (op_decode_hdr_maxsz)
@@ -192,9 +204,11 @@ static int nfs4_stat_to_errno(int);
encode_stateid_maxsz + 3)
#define decode_read_maxsz (op_decode_hdr_maxsz + 2)
#define encode_readdir_maxsz (op_encode_hdr_maxsz + \
- 2 + encode_verifier_maxsz + 5)
+ 2 + encode_verifier_maxsz + 5 + \
+ nfs4_label_maxsz)
#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
- decode_verifier_maxsz)
+ decode_verifier_maxsz + \
+ nfs4_label_maxsz + nfs4_fattr_maxsz)
#define encode_readlink_maxsz (op_encode_hdr_maxsz)
#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
#define encode_write_maxsz (op_encode_hdr_maxsz + \
@@ -530,14 +544,10 @@ static int nfs4_stat_to_errno(int);
decode_setclientid_maxsz)
#define NFS4_enc_setclientid_confirm_sz \
(compound_encode_hdr_maxsz + \
- encode_setclientid_confirm_maxsz + \
- encode_putrootfh_maxsz + \
- encode_fsinfo_maxsz)
+ encode_setclientid_confirm_maxsz)
#define NFS4_dec_setclientid_confirm_sz \
(compound_decode_hdr_maxsz + \
- decode_setclientid_confirm_maxsz + \
- decode_putrootfh_maxsz + \
- decode_fsinfo_maxsz)
+ decode_setclientid_confirm_maxsz)
#define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -857,6 +867,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
decode_sequence_maxsz +
decode_putfh_maxsz) *
XDR_UNIT);
+
+const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz) *
+ XDR_UNIT);
+EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead);
#endif /* CONFIG_NFS_V4_1 */
static const umode_t nfs_type2fmt[] = {
@@ -972,7 +988,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
}
-static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
+static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
+ const struct nfs4_label *label,
+ const struct nfs_server *server)
{
char owner_name[IDMAP_NAMESZ];
char owner_group[IDMAP_NAMESZ];
@@ -983,15 +1001,16 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
int len;
uint32_t bmval0 = 0;
uint32_t bmval1 = 0;
+ uint32_t bmval2 = 0;
/*
* We reserve enough space to write the entire attribute buffer at once.
* In the worst-case, this would be
- * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
- * = 36 bytes, plus any contribution from variable-length fields
+ * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
+ * = 40 bytes, plus any contribution from variable-length fields
* such as owner/group.
*/
- len = 16;
+ len = 20;
/* Sigh */
if (iap->ia_valid & ATTR_SIZE)
@@ -1021,6 +1040,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
}
len += 4 + (XDR_QUADLEN(owner_grouplen) << 2);
}
+ if (label)
+ len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
if (iap->ia_valid & ATTR_ATIME_SET)
len += 16;
else if (iap->ia_valid & ATTR_ATIME)
@@ -1035,9 +1056,9 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
* We write the bitmap length now, but leave the bitmap and the attribute
* buffer length to be backfilled at the end of this routine.
*/
- *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(3);
q = p;
- p += 3;
+ p += 4;
if (iap->ia_valid & ATTR_SIZE) {
bmval0 |= FATTR4_WORD0_SIZE;
@@ -1058,8 +1079,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
if (iap->ia_valid & ATTR_ATIME_SET) {
bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(iap->ia_atime.tv_sec);
+ p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec);
*p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
}
else if (iap->ia_valid & ATTR_ATIME) {
@@ -1069,14 +1089,20 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
if (iap->ia_valid & ATTR_MTIME_SET) {
bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
+ p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec);
*p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
}
else if (iap->ia_valid & ATTR_MTIME) {
bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
+ if (label) {
+ bmval2 |= FATTR4_WORD2_SECURITY_LABEL;
+ *p++ = cpu_to_be32(label->lfs);
+ *p++ = cpu_to_be32(label->pi);
+ *p++ = cpu_to_be32(label->len);
+ p = xdr_encode_opaque_fixed(p, label->label, label->len);
+ }
/*
* Now we backfill the bitmap and the attribute buffer length.
@@ -1086,9 +1112,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
len, ((char *)p - (char *)q) + 4);
BUG();
}
- len = (char *)p - (char *)q - 12;
+ len = (char *)p - (char *)q - 16;
*q++ = htonl(bmval0);
*q++ = htonl(bmval1);
+ *q++ = htonl(bmval2);
*q = htonl(len);
/* out: */
@@ -1142,7 +1169,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
}
encode_string(xdr, create->name->len, create->name->name);
- encode_attrs(xdr, create->attrs, create->server);
+ encode_attrs(xdr, create->attrs, create->label, create->server);
}
static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1194,8 +1221,10 @@ encode_getattr_three(struct xdr_stream *xdr,
static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
- encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
- bitmask[1] & nfs4_fattr_bitmap[1], hdr);
+ encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
+ bitmask[1] & nfs4_fattr_bitmap[1],
+ bitmask[2] & nfs4_fattr_bitmap[2],
+ hdr);
}
static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
@@ -1366,33 +1395,28 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
{
+ struct iattr dummy;
__be32 *p;
- struct nfs_client *clp;
p = reserve_space(xdr, 4);
- switch(arg->open_flags & O_EXCL) {
- case 0:
+ switch(arg->createmode) {
+ case NFS4_CREATE_UNCHECKED:
*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
- encode_attrs(xdr, arg->u.attrs, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
break;
- default:
- clp = arg->server->nfs_client;
- if (clp->cl_mvops->minor_version > 0) {
- if (nfs4_has_persistent_session(clp)) {
- *p = cpu_to_be32(NFS4_CREATE_GUARDED);
- encode_attrs(xdr, arg->u.attrs, arg->server);
- } else {
- struct iattr dummy;
-
- *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
- encode_nfs4_verifier(xdr, &arg->u.verifier);
- dummy.ia_valid = 0;
- encode_attrs(xdr, &dummy, arg->server);
- }
- } else {
- *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
- encode_nfs4_verifier(xdr, &arg->u.verifier);
- }
+ case NFS4_CREATE_GUARDED:
+ *p = cpu_to_be32(NFS4_CREATE_GUARDED);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+ break;
+ case NFS4_CREATE_EXCLUSIVE:
+ *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
+ encode_nfs4_verifier(xdr, &arg->u.verifier);
+ break;
+ case NFS4_CREATE_EXCLUSIVE4_1:
+ *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
+ encode_nfs4_verifier(xdr, &arg->u.verifier);
+ dummy.ia_valid = 0;
+ encode_attrs(xdr, &dummy, arg->label, arg->server);
}
}
@@ -1459,6 +1483,23 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
encode_string(xdr, name->len, name->name);
}
+static inline void encode_claim_fh(struct xdr_stream *xdr)
+{
+ __be32 *p;
+ /*
+  * Encode the claim type NFS4_OPEN_CLAIM_FH as a single big-endian
+  * 32-bit word. No further claim data is written by this helper.
+  */
+ p = reserve_space(xdr, 4); /* room for exactly one 32-bit word */
+ *p = cpu_to_be32(NFS4_OPEN_CLAIM_FH);
+}
+
+static inline void encode_claim_delegate_cur_fh(struct xdr_stream *xdr, const nfs4_stateid *stateid)
+{
+ __be32 *p;
+ /*
+  * Encode the claim type NFS4_OPEN_CLAIM_DELEG_CUR_FH as a big-endian
+  * 32-bit word, then append @stateid via encode_nfs4_stateid().
+  */
+ p = reserve_space(xdr, 4); /* room for the 32-bit claim type only */
+ *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEG_CUR_FH);
+ encode_nfs4_stateid(xdr, stateid);
+}
+
static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)
{
encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr);
@@ -1474,6 +1515,12 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg,
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);
break;
+ case NFS4_OPEN_CLAIM_FH:
+ encode_claim_fh(xdr);
+ break;
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ encode_claim_delegate_cur_fh(xdr, &arg->u.delegation);
+ break;
default:
BUG();
}
@@ -1506,35 +1553,12 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr);
}
-static void encode_open_stateid(struct xdr_stream *xdr,
- const struct nfs_open_context *ctx,
- const struct nfs_lock_context *l_ctx,
- fmode_t fmode,
- int zero_seqid)
-{
- nfs4_stateid stateid;
-
- if (ctx->state != NULL) {
- const struct nfs_lockowner *lockowner = NULL;
-
- if (l_ctx != NULL)
- lockowner = &l_ctx->lockowner;
- nfs4_select_rw_stateid(&stateid, ctx->state,
- fmode, lockowner);
- if (zero_seqid)
- stateid.seqid = 0;
- encode_nfs4_stateid(xdr, &stateid);
- } else
- encode_nfs4_stateid(xdr, &zero_stateid);
-}
-
static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
{
__be32 *p;
encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr);
- encode_open_stateid(xdr, args->context, args->lock_context,
- FMODE_READ, hdr->minorversion);
+ encode_nfs4_stateid(xdr, &args->stateid);
p = reserve_space(xdr, 12);
p = xdr_encode_hyper(p, args->offset);
@@ -1543,7 +1567,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
{
- uint32_t attrs[2] = {
+ uint32_t attrs[3] = {
FATTR4_WORD0_RDATTR_ERROR,
FATTR4_WORD1_MOUNTED_ON_FILEID,
};
@@ -1566,20 +1590,26 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
encode_uint64(xdr, readdir->cookie);
encode_nfs4_verifier(xdr, &readdir->verifier);
- p = reserve_space(xdr, 20);
+ p = reserve_space(xdr, encode_readdir_space);
*p++ = cpu_to_be32(dircount);
*p++ = cpu_to_be32(readdir->count);
- *p++ = cpu_to_be32(2);
-
+ *p++ = cpu_to_be32(encode_readdir_bitmask_sz);
*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
- *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+ *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+ if (encode_readdir_bitmask_sz > 2) {
+ if (hdr->minorversion > 1)
+ attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
+ p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]);
+ }
memcpy(verf, readdir->verifier.data, sizeof(verf));
- dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
+
+ dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
__func__,
(unsigned long long)readdir->cookie,
verf[0], verf[1],
attrs[0] & readdir->bitmask[0],
- attrs[1] & readdir->bitmask[1]);
+ attrs[1] & readdir->bitmask[1],
+ attrs[2] & readdir->bitmask[2]);
}
static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
@@ -1638,7 +1668,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
{
encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
encode_nfs4_stateid(xdr, &arg->stateid);
- encode_attrs(xdr, arg->iap, server);
+ encode_attrs(xdr, arg->iap, arg->label, server);
}
static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -1670,8 +1700,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
__be32 *p;
encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr);
- encode_open_stateid(xdr, args->context, args->lock_context,
- FMODE_WRITE, hdr->minorversion);
+ encode_nfs4_stateid(xdr, &args->stateid);
p = reserve_space(xdr, 16);
p = xdr_encode_hyper(p, args->offset);
@@ -1901,7 +1930,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
NFS4_DEVICEID4_SIZE);
*p++ = cpu_to_be32(args->pdev->layout_type);
- *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */
+ *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
*p++ = cpu_to_be32(0); /* bitmap length 0 */
}
@@ -2015,7 +2044,7 @@ static void encode_free_stateid(struct xdr_stream *xdr,
struct compound_hdr *hdr)
{
encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr);
- encode_nfs4_stateid(xdr, args->stateid);
+ encode_nfs4_stateid(xdr, &args->stateid);
}
#endif /* CONFIG_NFS_V4_1 */
@@ -2609,12 +2638,9 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
struct compound_hdr hdr = {
.nops = 0,
};
- const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
encode_compound_hdr(xdr, req, &hdr);
encode_setclientid_confirm(xdr, arg, &hdr);
- encode_putrootfh(xdr, &hdr);
- encode_fsinfo(xdr, lease_bitmap, &hdr);
encode_nops(&hdr);
}
@@ -3497,8 +3523,11 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
if (n == 0)
goto root_path;
dprintk("pathname4: ");
- path->ncomponents = 0;
- while (path->ncomponents < n) {
+ if (n > NFS4_PATHNAME_MAXCOMPONENTS) {
+ dprintk("cannot parse %d components in path\n", n);
+ goto out_eio;
+ }
+ for (path->ncomponents = 0; path->ncomponents < n; path->ncomponents++) {
struct nfs4_string *component = &path->components[path->ncomponents];
status = decode_opaque_inline(xdr, &component->len, &component->data);
if (unlikely(status != 0))
@@ -3507,12 +3536,6 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
pr_cont("%s%.*s ",
(path->ncomponents != n ? "/ " : ""),
component->len, component->data);
- if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS)
- path->ncomponents++;
- else {
- dprintk("cannot parse %d components in path\n", n);
- goto out_eio;
- }
}
out:
return status;
@@ -3557,27 +3580,23 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
n = be32_to_cpup(p);
if (n <= 0)
goto out_eio;
- res->nlocations = 0;
- while (res->nlocations < n) {
+ for (res->nlocations = 0; res->nlocations < n; res->nlocations++) {
u32 m;
- struct nfs4_fs_location *loc = &res->locations[res->nlocations];
+ struct nfs4_fs_location *loc;
+ if (res->nlocations == NFS4_FS_LOCATIONS_MAXENTRIES)
+ break;
+ loc = &res->locations[res->nlocations];
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out_overflow;
m = be32_to_cpup(p);
- loc->nservers = 0;
dprintk("%s: servers:\n", __func__);
- while (loc->nservers < m) {
- struct nfs4_string *server = &loc->servers[loc->nservers];
- status = decode_opaque_inline(xdr, &server->len, &server->data);
- if (unlikely(status != 0))
- goto out_eio;
- dprintk("%s ", server->data);
- if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS)
- loc->nservers++;
- else {
+ for (loc->nservers = 0; loc->nservers < m; loc->nservers++) {
+ struct nfs4_string *server;
+
+ if (loc->nservers == NFS4_FS_LOCATION_MAXSERVERS) {
unsigned int i;
dprintk("%s: using first %u of %u servers "
"returned for location %u\n",
@@ -3591,13 +3610,17 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
if (unlikely(status != 0))
goto out_eio;
}
+ break;
}
+ server = &loc->servers[loc->nservers];
+ status = decode_opaque_inline(xdr, &server->len, &server->data);
+ if (unlikely(status != 0))
+ goto out_eio;
+ dprintk("%s ", server->data);
}
status = decode_pathname(xdr, &loc->rootpath);
if (unlikely(status != 0))
goto out_eio;
- if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
- res->nlocations++;
}
if (res->nlocations != 0)
status = NFS_ATTR_FATTR_V4_LOCATIONS;
@@ -4056,6 +4079,56 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
return status;
}
+static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs4_label *label)
+{
+ uint32_t pi = 0;
+ uint32_t lfs = 0;
+ __u32 len;
+ __be32 *p;
+ int status = 0;
+ /*
+  * Decode the security label attribute (bit FATTR4_WORD2_SECURITY_LABEL
+  * of bitmap[2]) from @xdr.
+  *
+  * Data consumed when the bit is set: three 32-bit words (lfs, pi, len)
+  * followed by len bytes of label data.
+  *
+  * Returns NFS_ATTR_FATTR_V4_SECURITY_LABEL when the label was copied
+  * into @label, 0 when nothing was stored (bit not set, @label NULL, or
+  * label too long), and -EIO when a lower bit of bitmap[2] is still set
+  * or xdr_inline_decode() fails.
+  */
+ if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U))) /* any bit below SECURITY_LABEL still set */
+ return -EIO;
+ if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ lfs = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ pi = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, len); /* the len bytes of label data */
+ if (unlikely(!p))
+ goto out_overflow;
+ if (len < NFS4_MAXLABELLEN) {
+ if (label) {
+ memcpy(label->label, p, len); /* NOTE(review): no terminator is appended here */
+ label->len = len;
+ label->pi = pi;
+ label->lfs = lfs;
+ status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
+ }
+ bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; /* cleared even when @label is NULL */
+ } else /* NOTE(review): over-long label is only logged; the bit stays set and status stays 0 */
+ printk(KERN_WARNING "%s: label too long (%u)!\n",
+ __func__, len);
+ }
+ if (label && label->label) /* NOTE(review): runs even if nothing was decoded above; %s presumes a NUL-terminated buffer - confirm */
+ dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
+ (char *)label->label, label->len, label->pi, label->lfs);
+ return status;
+
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
{
int status = 0;
@@ -4398,7 +4471,7 @@ out_overflow:
static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
struct nfs_fattr *fattr, struct nfs_fh *fh,
- struct nfs4_fs_locations *fs_loc,
+ struct nfs4_fs_locations *fs_loc, struct nfs4_label *label,
const struct nfs_server *server)
{
int status;
@@ -4506,6 +4579,13 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
if (status < 0)
goto xdr_error;
+ if (label) {
+ status = decode_attr_security_label(xdr, bitmap, label);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+ }
+
xdr_error:
dprintk("%s: xdr returned %d\n", __func__, -status);
return status;
@@ -4513,7 +4593,7 @@ xdr_error:
static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
- const struct nfs_server *server)
+ struct nfs4_label *label, const struct nfs_server *server)
{
unsigned int savep;
uint32_t attrlen,
@@ -4532,7 +4612,8 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
if (status < 0)
goto xdr_error;
- status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);
+ status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc,
+ label, server);
if (status < 0)
goto xdr_error;
@@ -4542,10 +4623,16 @@ xdr_error:
return status;
}
+static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ struct nfs4_label *label, const struct nfs_server *server)
+{ /*
+   * Convenience wrapper around decode_getfattr_generic(): the fh and
+   * fs_loc arguments are passed as NULL and @label is forwarded as the
+   * label argument. Returns whatever decode_getfattr_generic() returns.
+   */
+ return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server);
+}
+
static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
const struct nfs_server *server)
{
- return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);
+ return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);
}
/*
@@ -5209,27 +5296,30 @@ static int decode_delegreturn(struct xdr_stream *xdr)
return decode_op_hdr(xdr, OP_DELEGRETURN);
}
-static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor)
+static int decode_secinfo_gss(struct xdr_stream *xdr,
+ struct nfs4_secinfo4 *flavor)
{
+ u32 oid_len;
__be32 *p;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out_overflow;
- flavor->gss.sec_oid4.len = be32_to_cpup(p);
- if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN)
+ oid_len = be32_to_cpup(p);
+ if (oid_len > GSS_OID_MAX_LEN)
goto out_err;
- p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len);
+ p = xdr_inline_decode(xdr, oid_len);
if (unlikely(!p))
goto out_overflow;
- memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len);
+ memcpy(flavor->flavor_info.oid.data, p, oid_len);
+ flavor->flavor_info.oid.len = oid_len;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
goto out_overflow;
- flavor->gss.qop4 = be32_to_cpup(p++);
- flavor->gss.service = be32_to_cpup(p);
+ flavor->flavor_info.qop = be32_to_cpup(p++);
+ flavor->flavor_info.service = be32_to_cpup(p);
return 0;
@@ -5242,10 +5332,10 @@ out_err:
static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
{
- struct nfs4_secinfo_flavor *sec_flavor;
+ struct nfs4_secinfo4 *sec_flavor;
+ unsigned int i, num_flavors;
int status;
__be32 *p;
- int i, num_flavors;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
@@ -5934,7 +6024,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- status = decode_getfattr(xdr, res->fattr, res->server);
+ status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -5960,7 +6050,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
goto out;
status = decode_getfh(xdr, res->fh);
if (status == 0)
- status = decode_getfattr(xdr, res->fattr, res->server);
+ status = decode_getfattr_label(xdr, res->fattr,
+ res->label, res->server);
out:
return status;
}
@@ -6051,7 +6142,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_restorefh(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server);
+ decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -6080,7 +6171,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server);
+ decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -6112,7 +6203,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_putfh(xdr);
if (status)
goto out;
- status = decode_getfattr(xdr, res->fattr, res->server);
+ status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -6245,7 +6336,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
goto out;
if (res->access_request)
decode_access(xdr, &res->access_supported, &res->access_result);
- decode_getfattr(xdr, res->f_attr, res->server);
+ decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);
out:
return status;
}
@@ -6322,7 +6413,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
status = decode_setattr(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server);
+ decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -6648,8 +6739,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
* Decode SETCLIENTID_CONFIRM response
*/
static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
- struct xdr_stream *xdr,
- struct nfs_fsinfo *fsinfo)
+ struct xdr_stream *xdr)
{
struct compound_hdr hdr;
int status;
@@ -6657,10 +6747,6 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
status = decode_compound_hdr(xdr, &hdr);
if (!status)
status = decode_setclientid_confirm(xdr);
- if (!status)
- status = decode_putrootfh(xdr);
- if (!status)
- status = decode_fsinfo(xdr, fsinfo);
return status;
}
@@ -6716,7 +6802,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
xdr_enter_page(xdr, PAGE_SIZE);
status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
NULL, res->fs_locations,
- res->fs_locations->server);
+ NULL, res->fs_locations->server);
out:
return status;
}
@@ -7129,7 +7215,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
goto out_overflow;
if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
- NULL, entry->server) < 0)
+ NULL, entry->label, entry->server) < 0)
goto out_overflow;
if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 88f9611a945c..5457745dd4f1 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -234,7 +234,7 @@ static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
lseg = kzalloc(lseg_size, gfp_flags);
if (unlikely(!lseg)) {
- dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__,
+ dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__,
numdevs, lseg_size);
return -ENOMEM;
}
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index a9ebd817278b..e4f9cbfec67b 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -613,8 +613,10 @@ int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
pd.pgbase = 0;
pd.pglen = PAGE_SIZE;
pd.mincount = 0;
+ pd.maxcount = PAGE_SIZE;
- err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+ err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
+ pnfslay->plh_lc_cred);
dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
if (err)
goto err_out;
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 880ba086be94..87aa1dec6120 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -114,7 +114,7 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
gfp_t gfp_flags);
extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
-/* objio_free_result will free these @oir structs recieved from
+/* objio_free_result will free these @oir structs received from
* objlayout_{read,write}_done
*/
extern void objio_free_result(struct objlayout_io_res *oir);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e56e846e9d2d..29cfb7ade121 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -84,6 +84,55 @@ nfs_page_free(struct nfs_page *p)
kmem_cache_free(nfs_page_cachep, p);
}
+static void
+nfs_iocounter_inc(struct nfs_io_counter *c)
+{ /* Atomically add one to c->io_count; paired with nfs_iocounter_dec(). */
+ atomic_inc(&c->io_count);
+}
+
+static void
+nfs_iocounter_dec(struct nfs_io_counter *c)
+{ /* Atomically drop c->io_count by one; on reaching zero, clear NFS_IO_INPROGRESS and wake bit waiters. */
+ if (atomic_dec_and_test(&c->io_count)) { /* true only when the count just became 0 */
+ clear_bit(NFS_IO_INPROGRESS, &c->flags);
+ smp_mb__after_clear_bit(); /* barrier between the clear_bit() above and the wakeup below */
+ wake_up_bit(&c->flags, NFS_IO_INPROGRESS); /* same (word, bit) pair that __nfs_iocounter_wait() waits on */
+ }
+}
+
+/*
+ * Slow path of nfs_iocounter_wait(): sleep in TASK_KILLABLE state on the
+ * NFS_IO_INPROGRESS bit of c->flags until c->io_count reads 0.
+ *
+ * Returns 0 once io_count is observed to be 0, or the non-zero result of
+ * nfs_wait_bit_killable() as soon as that call reports an error.
+ */
+static int
+__nfs_iocounter_wait(struct nfs_io_counter *c)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS);
+ DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS);
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq, &q.wait, TASK_KILLABLE);
+ /* nfs_iocounter_dec() clears this bit and wakes the bit waitqueue when io_count drops to 0. */
+ set_bit(NFS_IO_INPROGRESS, &c->flags);
+ if (atomic_read(&c->io_count) == 0)
+ break;
+ ret = nfs_wait_bit_killable(&c->flags);
+ /*
+  * Leave the loop as soon as the wait reports an error. Retrying
+  * while io_count is still non-zero would keep calling the wait
+  * helper without making progress and would delay the error
+  * return that nfs_iocounter_wait() documents for a fatal signal.
+  */
+ } while (atomic_read(&c->io_count) != 0 && !ret);
+ finish_wait(wq, &q.wait);
+ return ret;
+}
+
+/**
+ * nfs_iocounter_wait - wait for i/o to complete
+ * @c: nfs_io_counter to use
+ *
+ * Fast path: when the io_count already reads 0 this returns 0 without
+ * sleeping. Otherwise the wait is delegated to __nfs_iocounter_wait() and
+ * its result is returned unchanged.
+ *
+ * returns -ERESTARTSYS if interrupted by a fatal signal.
+ * Otherwise returns 0 once the io_count hits 0.
+ */
+int
+nfs_iocounter_wait(struct nfs_io_counter *c)
+{
+ if (atomic_read(&c->io_count) == 0) /* nothing outstanding: skip the waitqueue setup */
+ return 0;
+ return __nfs_iocounter_wait(c);
+}
+
/**
* nfs_create_request - Create an NFS read/write request.
* @ctx: open context to use
@@ -104,6 +153,8 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
struct nfs_page *req;
struct nfs_lock_context *l_ctx;
+ if (test_bit(NFS_CONTEXT_BAD, &ctx->flags))
+ return ERR_PTR(-EBADF);
/* try to allocate the request struct */
req = nfs_page_alloc();
if (req == NULL)
@@ -116,6 +167,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
return ERR_CAST(l_ctx);
}
req->wb_lock_context = l_ctx;
+ nfs_iocounter_inc(&l_ctx->io_count);
/* Initialize the request struct. Initially, we assume a
* long write-back delay. This will be adjusted in
@@ -175,6 +227,7 @@ static void nfs_clear_request(struct nfs_page *req)
req->wb_page = NULL;
}
if (l_ctx != NULL) {
+ nfs_iocounter_dec(&l_ctx->io_count);
nfs_put_lock_context(l_ctx);
req->wb_lock_context = NULL;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4bdffe0ba025..3a3a79d6bf15 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -360,7 +360,7 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);
-static inline u64
+static u64
end_offset(u64 start, u64 len)
{
u64 end;
@@ -376,9 +376,9 @@ end_offset(u64 start, u64 len)
* start2 end2
* [----------------)
*/
-static inline int
-lo_seg_contained(struct pnfs_layout_range *l1,
- struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
{
u64 start1 = l1->offset;
u64 end1 = end_offset(start1, l1->length);
@@ -395,9 +395,9 @@ lo_seg_contained(struct pnfs_layout_range *l1,
* start2 end2
* [----------------)
*/
-static inline int
-lo_seg_intersecting(struct pnfs_layout_range *l1,
- struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
{
u64 start1 = l1->offset;
u64 end1 = end_offset(start1, l1->length);
@@ -409,12 +409,12 @@ lo_seg_intersecting(struct pnfs_layout_range *l1,
}
static bool
-should_free_lseg(struct pnfs_layout_range *lseg_range,
- struct pnfs_layout_range *recall_range)
+should_free_lseg(const struct pnfs_layout_range *lseg_range,
+ const struct pnfs_layout_range *recall_range)
{
return (recall_range->iomode == IOMODE_ANY ||
lseg_range->iomode == recall_range->iomode) &&
- lo_seg_intersecting(lseg_range, recall_range);
+ pnfs_lseg_range_intersecting(lseg_range, recall_range);
}
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
@@ -718,6 +718,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
spin_lock(&lo->plh_inode->i_lock);
if (pnfs_layoutgets_blocked(lo, 1)) {
status = -EAGAIN;
+ } else if (!nfs4_valid_open_stateid(open_state)) {
+ status = -EBADF;
} else if (list_empty(&lo->plh_segs)) {
int seq;
@@ -764,6 +766,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
lgp->args.inode = ino;
lgp->args.ctx = get_nfs_open_context(ctx);
lgp->gfp_flags = gfp_flags;
+ lgp->cred = lo->plh_lc_cred;
/* Synchronously retrieve layout information from server and
* store in lseg.
@@ -858,6 +861,7 @@ _pnfs_return_layout(struct inode *ino)
lrp->args.inode = ino;
lrp->args.layout = lo;
lrp->clp = NFS_SERVER(ino)->nfs_client;
+ lrp->cred = lo->plh_lc_cred;
status = nfs4_proc_layoutreturn(lrp);
out:
@@ -982,8 +986,8 @@ out:
* are seen first.
*/
static s64
-cmp_layout(struct pnfs_layout_range *l1,
- struct pnfs_layout_range *l2)
+pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
{
s64 d;
@@ -1010,7 +1014,7 @@ pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
dprintk("%s:Begin\n", __func__);
list_for_each_entry(lp, &lo->plh_segs, pls_list) {
- if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
+ if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
continue;
list_add_tail(&lseg->pls_list, &lp->pls_list);
dprintk("%s: inserted lseg %p "
@@ -1048,7 +1052,7 @@ alloc_init_layout_hdr(struct inode *ino,
INIT_LIST_HEAD(&lo->plh_segs);
INIT_LIST_HEAD(&lo->plh_bulk_destroy);
lo->plh_inode = ino;
- lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
+ lo->plh_lc_cred = get_rpccred(ctx->cred);
return lo;
}
@@ -1089,21 +1093,21 @@ out_existing:
* READ READ true
* READ RW true
*/
-static int
-is_matching_lseg(struct pnfs_layout_range *ls_range,
- struct pnfs_layout_range *range)
+static bool
+pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
+ const struct pnfs_layout_range *range)
{
struct pnfs_layout_range range1;
if ((range->iomode == IOMODE_RW &&
ls_range->iomode != IOMODE_RW) ||
- !lo_seg_intersecting(ls_range, range))
+ !pnfs_lseg_range_intersecting(ls_range, range))
return 0;
/* range1 covers only the first byte in the range */
range1 = *range;
range1.length = 1;
- return lo_seg_contained(ls_range, &range1);
+ return pnfs_lseg_range_contained(ls_range, &range1);
}
/*
@@ -1119,7 +1123,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
- is_matching_lseg(&lseg->pls_range, range)) {
+ pnfs_lseg_range_match(&lseg->pls_range, range)) {
ret = pnfs_get_lseg(lseg);
break;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f5f8a470a647..a4f41810a7f4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -149,9 +149,10 @@ struct pnfs_device {
struct nfs4_deviceid dev_id;
unsigned int layout_type;
unsigned int mincount;
+ unsigned int maxcount; /* gdia_maxcount */
struct page **pages;
unsigned int pgbase;
- unsigned int pglen;
+ unsigned int pglen; /* reply buffer length */
};
#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
@@ -170,7 +171,8 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
const struct nfs_fh *fh,
struct pnfs_devicelist *devlist);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
- struct pnfs_device *dev);
+ struct pnfs_device *dev,
+ struct rpc_cred *cred);
extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index fc8de9016acf..c041c41f7a52 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -98,7 +98,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
*/
static int
nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct rpc_message msg = {
.rpc_proc = &nfs_procedures[NFSPROC_GETATTR],
@@ -146,7 +146,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
static int
nfs_proc_lookup(struct inode *dir, struct qstr *name,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
{
struct nfs_diropargs arg = {
.fh = NFS_FH(dir),
@@ -243,7 +244,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
nfs_free_createdata(data);
out:
dprintk("NFS reply create: %d\n", status);
@@ -290,7 +291,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
}
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
nfs_free_createdata(data);
out:
dprintk("NFS reply mknod: %d\n", status);
@@ -442,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
* should fill in the data with a LOOKUP call on the wire.
*/
if (status == 0)
- status = nfs_instantiate(dentry, fh, fattr);
+ status = nfs_instantiate(dentry, fh, fattr, NULL);
out_free:
nfs_free_fattr(fattr);
@@ -471,7 +472,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
nfs_free_createdata(data);
out:
dprintk("NFS reply mkdir: %d\n", status);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index a5e5d9899d56..70a26c651f09 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -514,6 +514,8 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_read_data *data = calldata;
NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
+ rpc_exit(task, -EIO);
}
static const struct rpc_call_ops nfs_read_common_ops = {
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2f8a29db0f1b..71fdc0dfa0d2 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -269,7 +269,7 @@ static match_table_t nfs_local_lock_tokens = {
enum {
Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0,
- Opt_vers_4_1,
+ Opt_vers_4_1, Opt_vers_4_2,
Opt_vers_err
};
@@ -280,6 +280,7 @@ static match_table_t nfs_vers_tokens = {
{ Opt_vers_4, "4" },
{ Opt_vers_4_0, "4.0" },
{ Opt_vers_4_1, "4.1" },
+ { Opt_vers_4_2, "4.2" },
{ Opt_vers_err, NULL }
};
@@ -832,6 +833,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
seq_printf(m, "\n\tnfsv4:\t");
seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+ seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);
seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
show_sessions(m, nfss);
show_pnfs(m, nfss);
@@ -920,7 +922,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
data->mount_server.port = NFS_UNSPEC_PORT;
data->nfs_server.port = NFS_UNSPEC_PORT;
data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
- data->auth_flavors[0] = RPC_AUTH_UNIX;
+ data->auth_flavors[0] = RPC_AUTH_MAXFLAVOR;
data->auth_flavor_len = 1;
data->minorversion = 0;
data->need_mount = true;
@@ -1097,6 +1099,10 @@ static int nfs_parse_version_string(char *string,
mnt->version = 4;
mnt->minorversion = 1;
break;
+ case Opt_vers_4_2:
+ mnt->version = 4;
+ mnt->minorversion = 2;
+ break;
default:
return 0;
}
@@ -1608,49 +1614,35 @@ out_security_failure:
}
/*
- * Match the requested auth flavors with the list returned by
- * the server. Returns zero and sets the mount's authentication
- * flavor on success; returns -EACCES if server does not support
- * the requested flavor.
+ * Ensure that the specified authtype in args->auth_flavors[0] is supported by
+ * the server. Returns 0 if it's ok, and -EACCES if not.
*/
-static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
- struct nfs_mount_request *request)
+static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
+ rpc_authflavor_t *server_authlist, unsigned int count)
{
- unsigned int i, j, server_authlist_len = *(request->auth_flav_len);
-
- /*
- * Certain releases of Linux's mountd return an empty
- * flavor list. To prevent behavioral regression with
- * these servers (ie. rejecting mounts that used to
- * succeed), revert to pre-2.6.32 behavior (no checking)
- * if the returned flavor list is empty.
- */
- if (server_authlist_len == 0)
- return 0;
+ unsigned int i;
/*
- * We avoid sophisticated negotiating here, as there are
- * plenty of cases where we can get it wrong, providing
- * either too little or too much security.
+ * If the sec= mount option is used, the specified flavor or AUTH_NULL
+ * must be in the list returned by the server.
*
- * RFC 2623, section 2.7 suggests we SHOULD prefer the
- * flavor listed first. However, some servers list
- * AUTH_NULL first. Our caller plants AUTH_SYS, the
- * preferred default, in args->auth_flavors[0] if user
- * didn't specify sec= mount option.
+ * AUTH_NULL has a special meaning when it's in the server list - it
+ * means that the server will ignore the rpc creds, so any flavor
+ * can be used.
*/
- for (i = 0; i < args->auth_flavor_len; i++)
- for (j = 0; j < server_authlist_len; j++)
- if (args->auth_flavors[i] == request->auth_flavs[j]) {
- dfprintk(MOUNT, "NFS: using auth flavor %d\n",
- request->auth_flavs[j]);
- args->auth_flavors[0] = request->auth_flavs[j];
- return 0;
- }
+ for (i = 0; i < count; i++) {
+ if (args->auth_flavors[0] == server_authlist[i] ||
+ server_authlist[i] == RPC_AUTH_NULL)
+ goto out;
+ }
- dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n");
- nfs_umount(request);
+ dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
+ args->auth_flavors[0]);
return -EACCES;
+
+out:
+ dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
+ return 0;
}
/*
@@ -1658,10 +1650,10 @@ static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
* corresponding to the provided path.
*/
static int nfs_request_mount(struct nfs_parsed_mount_data *args,
- struct nfs_fh *root_fh)
+ struct nfs_fh *root_fh,
+ rpc_authflavor_t *server_authlist,
+ unsigned int *server_authlist_len)
{
- rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
- unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
struct nfs_mount_request request = {
.sap = (struct sockaddr *)
&args->mount_server.address,
@@ -1669,7 +1661,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
.protocol = args->mount_server.protocol,
.fh = root_fh,
.noresvport = args->flags & NFS_MOUNT_NORESVPORT,
- .auth_flav_len = &server_authlist_len,
+ .auth_flav_len = server_authlist_len,
.auth_flavs = server_authlist,
.net = args->net,
};
@@ -1713,29 +1705,92 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
return status;
}
+ return 0;
+}
+
+static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info,
+ struct nfs_subversion *nfs_mod)
+{
+ int status;
+ unsigned int i;
+ bool tried_auth_unix = false;
+ bool auth_null_in_list = false;
+ struct nfs_server *server = ERR_PTR(-EACCES);
+ struct nfs_parsed_mount_data *args = mount_info->parsed;
+ rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
+ unsigned int authlist_len = ARRAY_SIZE(authlist);
+
+ status = nfs_request_mount(args, mount_info->mntfh, authlist,
+ &authlist_len);
+ if (status)
+ return ERR_PTR(status);
+
/*
- * MNTv1 (NFSv2) does not support auth flavor negotiation.
+ * Was a sec= authflavor specified in the options? First, verify
+ * whether the server supports it, and then just try to use it if so.
*/
- if (args->mount_server.version != NFS_MNT3_VERSION)
- return 0;
- return nfs_walk_authlist(args, &request);
+ if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
+ status = nfs_verify_authflavor(args, authlist, authlist_len);
+ dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
+ if (status)
+ return ERR_PTR(status);
+ return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+ }
+
+ /*
+ * No sec= option was provided. RFC 2623, section 2.7 suggests we
+ * SHOULD prefer the flavor listed first. However, some servers list
+ * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
+ */
+ for (i = 0; i < authlist_len; ++i) {
+ rpc_authflavor_t flavor;
+ struct rpcsec_gss_info info;
+
+ flavor = authlist[i];
+ switch (flavor) {
+ case RPC_AUTH_UNIX:
+ tried_auth_unix = true;
+ break;
+ case RPC_AUTH_NULL:
+ auth_null_in_list = true;
+ continue;
+ default:
+ if (rpcauth_get_gssinfo(flavor, &info) != 0)
+ continue;
+ /* Fallthrough */
+ }
+ dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
+ args->auth_flavors[0] = flavor;
+ server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+ if (!IS_ERR(server))
+ return server;
+ }
+
+ /*
+ * Nothing we tried so far worked. At this point, give up if we've
+ * already tried AUTH_UNIX or if the server's list doesn't contain
+ * AUTH_NULL
+ */
+ if (tried_auth_unix || !auth_null_in_list)
+ return server;
+
+ /* Last chance! Try AUTH_UNIX */
+ dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
+ args->auth_flavors[0] = RPC_AUTH_UNIX;
+ return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
}
struct dentry *nfs_try_mount(int flags, const char *dev_name,
struct nfs_mount_info *mount_info,
struct nfs_subversion *nfs_mod)
{
- int status;
struct nfs_server *server;
- if (mount_info->parsed->need_mount) {
- status = nfs_request_mount(mount_info->parsed, mount_info->mntfh);
- if (status)
- return ERR_PTR(status);
- }
+ if (mount_info->parsed->need_mount)
+ server = nfs_try_mount_request(mount_info, nfs_mod);
+ else
+ server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
- /* Get a volume representation */
- server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
if (IS_ERR(server))
return ERR_CAST(server);
@@ -1904,6 +1959,7 @@ static int nfs23_validate_mount_data(void *options,
args->namlen = data->namlen;
args->bsize = data->bsize;
+ args->auth_flavors[0] = RPC_AUTH_UNIX;
if (data->flags & NFS_MOUNT_SECFLAVOUR)
args->auth_flavors[0] = data->pseudoflavor;
if (!args->nfs_server.hostname)
@@ -2373,7 +2429,21 @@ static int nfs_bdi_register(struct nfs_server *server)
int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
struct nfs_mount_info *mount_info)
{
- return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts);
+ int error;
+ unsigned long kflags = 0, kflags_out = 0;
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
+ kflags |= SECURITY_LSM_NATIVE_LABELS;
+
+ error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts,
+ kflags, &kflags_out);
+ if (error)
+ goto err;
+
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
+ !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
+ NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
+err:
+ return error;
}
EXPORT_SYMBOL_GPL(nfs_set_sb_security);
@@ -2381,10 +2451,9 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot,
struct nfs_mount_info *mount_info)
{
/* clone any lsm security options from the parent to the new sb */
- security_sb_clone_mnt_opts(mount_info->cloned->sb, s);
if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops)
return -ESTALE;
- return 0;
+ return security_sb_clone_mnt_opts(mount_info->cloned->sb, s);
}
EXPORT_SYMBOL_GPL(nfs_clone_sb_security);
@@ -2600,6 +2669,7 @@ static int nfs4_validate_mount_data(void *options,
goto out_no_address;
args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+ args->auth_flavors[0] = RPC_AUTH_UNIX;
if (data->auth_flavourlen) {
if (data->auth_flavourlen > 1)
goto out_inval_auth;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 1f1f38f0c5d5..60395ad3a2e4 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -479,7 +479,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- dentry->d_count);
+ d_count(dentry));
nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
/*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c483cc50b82e..a2c7c28049d5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1251,6 +1251,8 @@ void nfs_write_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_write_data *data = calldata;
NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
+ rpc_exit(task, -EIO);
}
void nfs_commit_prepare(struct rpc_task *task, void *calldata)
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 430b6872806f..dc8f1ef665ce 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -81,6 +81,22 @@ config NFSD_V4
If unsure, say N.
+config NFSD_V4_SECURITY_LABEL
+ bool "Provide Security Label support for NFSv4 server"
+ depends on NFSD_V4 && SECURITY
+ help
+
+ Say Y here if you want to enable fine-grained security label attribute
+ support for NFS version 4. Security labels allow security modules like
+ SELinux and Smack to label files to facilitate enforcement of their policies.
+ Without this an NFSv4 mount will have the same label on each file.
+
+ If you do not wish to enable fine-grained security labels for SELinux or
+ Smack policies on NFSv4 files, say N.
+
+ WARNING: there is still a chance of backwards-incompatible protocol changes.
+ For now we recommend "Y" only for developers and testers.
+
config NFSD_FAULT_INJECTION
bool "NFS server manual fault injection"
depends on NFSD_V4 && DEBUG_KERNEL
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 87fd1410b737..d5c5b3e00266 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -82,6 +82,7 @@ int nfsd_reply_cache_init(void);
void nfsd_reply_cache_shutdown(void);
int nfsd_cache_lookup(struct svc_rqst *);
void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
+int nfsd_reply_cache_stats_open(struct inode *, struct file *);
#ifdef CONFIG_NFSD_V4
void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp);
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 1051bebff1b0..849a7c3ced22 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -80,6 +80,7 @@ struct nfsd_net {
*/
struct list_head client_lru;
struct list_head close_lru;
+ struct list_head del_recall_lru;
struct delayed_work laundromat_work;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 99bc85ff0217..7f05cd140de3 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -37,6 +37,7 @@
#include "nfsd.h"
#include "state.h"
#include "netns.h"
+#include "xdr4cb.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -53,30 +54,6 @@ enum {
NFSPROC4_CLNT_CB_SEQUENCE,
};
-#define NFS4_MAXTAGLEN 20
-
-#define NFS4_enc_cb_null_sz 0
-#define NFS4_dec_cb_null_sz 0
-#define cb_compound_enc_hdr_sz 4
-#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
-#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2)
-#define cb_sequence_enc_sz (sessionid_sz + 4 + \
- 1 /* no referring calls list yet */)
-#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4)
-
-#define op_enc_sz 1
-#define op_dec_sz 2
-#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2))
-#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2)
-#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \
- cb_sequence_enc_sz + \
- 1 + enc_stateid_sz + \
- enc_nfs4_fh_sz)
-
-#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
- cb_sequence_dec_sz + \
- op_dec_sz)
-
struct nfs4_cb_compound_hdr {
/* args */
u32 ident; /* minorversion 0 only */
@@ -817,8 +794,7 @@ static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
- struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_stid.sc_client;
+ struct nfs4_client *clp = cb->cb_clp;
u32 minorversion = clp->cl_minorversion;
cb->cb_minorversion = minorversion;
@@ -839,8 +815,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
- struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_stid.sc_client;
+ struct nfs4_client *clp = cb->cb_clp;
dprintk("%s: minorversion=%d\n", __func__,
clp->cl_minorversion);
@@ -863,7 +838,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = dp->dl_stid.sc_client;
+ struct nfs4_client *clp = cb->cb_clp;
struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
nfsd4_cb_done(task, calldata);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ae73175e6e68..a7cee864e7b2 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -42,6 +42,36 @@
#include "current_stateid.h"
#include "netns.h"
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#include <linux/security.h>
+
+static inline void
+nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
+{
+ struct inode *inode = resfh->fh_dentry->d_inode;
+ int status;
+
+ mutex_lock(&inode->i_mutex);
+ status = security_inode_setsecctx(resfh->fh_dentry,
+ label->data, label->len);
+ mutex_unlock(&inode->i_mutex);
+
+ if (status)
+ /*
+ * XXX: We should really fail the whole open, but we may
+ * already have created a new file, so it may be too
+ * late. For now this seems the least of evils:
+ */
+ bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+
+ return;
+}
+#else
+static inline void
+nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
+{ }
+#endif
+
#define NFSDDBG_FACILITY NFSDDBG_PROC
static u32 nfsd_attrmask[] = {
@@ -191,9 +221,18 @@ static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
return nfserr_symlink;
}
+static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh *resfh)
+{
+ if (nfsd4_has_session(cstate))
+ return;
+ fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
+ &resfh->fh_handle);
+}
+
static __be32
-do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
+do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
{
+ struct svc_fh *current_fh = &cstate->current_fh;
struct svc_fh *resfh;
int accmode;
__be32 status;
@@ -230,6 +269,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
(u32 *)open->op_verf.data,
&open->op_truncate, &open->op_created);
+ if (!status && open->op_label.len)
+ nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval);
+
/*
* Following rfc 3530 14.2.16, use the returned bitmask
* to indicate which attributes we used to store the
@@ -252,11 +294,10 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
if (is_create_with_attrs(open) && open->op_acl != NULL)
do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval);
- /* set reply cache */
- fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
- &resfh->fh_handle);
+ nfsd4_set_open_owner_reply_cache(cstate, open, resfh);
accmode = NFSD_MAY_NOP;
- if (open->op_created)
+ if (open->op_created ||
+ open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
accmode |= NFSD_MAY_OWNER_OVERRIDE;
status = do_open_permission(rqstp, resfh, open, accmode);
set_change_info(&open->op_cinfo, current_fh);
@@ -268,9 +309,11 @@ out:
}
static __be32
-do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
+do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
{
+ struct svc_fh *current_fh = &cstate->current_fh;
__be32 status;
+ int accmode = 0;
/* We don't know the target directory, and therefore can not
* set the change info
@@ -278,15 +321,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
- /* set replay cache */
- fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
- &current_fh->fh_handle);
+ nfsd4_set_open_owner_reply_cache(cstate, open, current_fh);
open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
(open->op_iattr.ia_size == 0);
+ /*
+ * In the delegation case, the client is telling us about an
+ * open that it *already* performed locally, some time ago. We
+ * should let it succeed now if possible.
+ *
+ * In the case of a CLAIM_FH open, on the other hand, the client
+ * may be counting on us to enforce permissions (the Linux NFSv4.1
+ * client uses this for normal opens, for example).
+ */
+ if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH)
+ accmode = NFSD_MAY_OWNER_OVERRIDE;
- status = do_open_permission(rqstp, current_fh, open,
- NFSD_MAY_OWNER_OVERRIDE);
+ status = do_open_permission(rqstp, current_fh, open, accmode);
return status;
}
@@ -351,6 +402,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
if (status)
goto out;
+ if (open->op_xdr_error) {
+ status = open->op_xdr_error;
+ goto out;
+ }
status = nfsd4_check_open_attributes(rqstp, cstate, open);
if (status)
@@ -368,8 +423,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
case NFS4_OPEN_CLAIM_NULL:
- status = do_open_lookup(rqstp, &cstate->current_fh,
- open);
+ status = do_open_lookup(rqstp, cstate, open);
if (status)
goto out;
break;
@@ -382,8 +436,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
case NFS4_OPEN_CLAIM_FH:
case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
- status = do_open_fhandle(rqstp, &cstate->current_fh,
- open);
+ status = do_open_fhandle(rqstp, cstate, open);
if (status)
goto out;
break;
@@ -409,14 +462,33 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
WARN_ON(status && open->op_created);
out:
nfsd4_cleanup_open_state(open, status);
- if (open->op_openowner)
+ if (open->op_openowner && !nfsd4_has_session(cstate))
cstate->replay_owner = &open->op_openowner->oo_owner;
- else
+ nfsd4_bump_seqid(cstate, status);
+ if (!cstate->replay_owner)
nfs4_unlock_state();
return status;
}
/*
+ * OPEN is the only seqid-mutating operation whose decoding can fail
+ * with a seqid-mutating error (specifically, decoding of user names in
+ * the attributes). Therefore we have to do some processing to look up
+ * the stateowner so that we can bump the seqid.
+ */
+static __be32 nfsd4_open_omfg(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_op *op)
+{
+ struct nfsd4_open *open = (struct nfsd4_open *)&op->u;
+
+ if (!seqid_mutating_err(ntohl(op->status)))
+ return op->status;
+ if (nfsd4_has_session(cstate))
+ return op->status;
+ open->op_xdr_error = op->status;
+ return nfsd4_open(rqstp, cstate, open);
+}
+
+/*
* filehandle-manipulating ops.
*/
static __be32
@@ -599,6 +671,9 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
+ if (create->cr_label.len)
+ nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval);
+
if (create->cr_acl != NULL)
do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
create->cr_bmval);
@@ -786,21 +861,11 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
rename->rn_snamelen, &cstate->current_fh,
rename->rn_tname, rename->rn_tnamelen);
-
- /* the underlying filesystem returns different error's than required
- * by NFSv4. both save_fh and current_fh have been verified.. */
- if (status == nfserr_isdir)
- status = nfserr_exist;
- else if ((status == nfserr_notdir) &&
- (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) &&
- S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode)))
- status = nfserr_exist;
-
- if (!status) {
- set_change_info(&rename->rn_sinfo, &cstate->current_fh);
- set_change_info(&rename->rn_tinfo, &cstate->save_fh);
- }
- return status;
+ if (status)
+ return status;
+ set_change_info(&rename->rn_sinfo, &cstate->current_fh);
+ set_change_info(&rename->rn_tinfo, &cstate->save_fh);
+ return nfs_ok;
}
static __be32
@@ -888,6 +953,11 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
setattr->sa_acl);
if (status)
goto out;
+ if (setattr->sa_label.len)
+ status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh,
+ &setattr->sa_label);
+ if (status)
+ goto out;
status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
0, (time_t)0);
out:
@@ -931,14 +1001,14 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
cstate, stateid, WR_STATE, &filp);
- if (filp)
- get_file(filp);
- nfs4_unlock_state();
-
if (status) {
+ nfs4_unlock_state();
dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
return status;
}
+ if (filp)
+ get_file(filp);
+ nfs4_unlock_state();
cnt = write->wr_buflen;
write->wr_how_written = write->wr_stable_how;
@@ -1244,8 +1314,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
* for example, if there is a miscellaneous XDR error
* it will be set to nfserr_bad_xdr.
*/
- if (op->status)
+ if (op->status) {
+ if (op->opnum == OP_OPEN)
+ op->status = nfsd4_open_omfg(rqstp, cstate, op);
goto encode_op;
+ }
/* We must be able to encode a successful response to
* this operation, with enough room left over to encode a
@@ -1282,12 +1355,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
if (op->status)
goto encode_op;
- if (opdesc->op_func) {
- if (opdesc->op_get_currentstateid)
- opdesc->op_get_currentstateid(cstate, &op->u);
- op->status = opdesc->op_func(rqstp, cstate, &op->u);
- } else
- BUG_ON(op->status == nfs_ok);
+ if (opdesc->op_get_currentstateid)
+ opdesc->op_get_currentstateid(cstate, &op->u);
+ op->status = opdesc->op_func(rqstp, cstate, &op->u);
if (!op->status) {
if (opdesc->op_set_currentstateid)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 899ca26dd194..105a3b080d12 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -146,7 +146,7 @@ out_no_tfm:
* then disable recovery tracking.
*/
static void
-legacy_recdir_name_error(int error)
+legacy_recdir_name_error(struct nfs4_client *clp, int error)
{
printk(KERN_ERR "NFSD: unable to generate recoverydir "
"name (%d).\n", error);
@@ -159,9 +159,7 @@ legacy_recdir_name_error(int error)
if (error == -ENOENT) {
printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
"Reboot recovery will not function correctly!\n");
-
- /* the argument is ignored by the legacy exit function */
- nfsd4_client_tracking_exit(NULL);
+ nfsd4_client_tracking_exit(clp->net);
}
}
@@ -184,7 +182,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
status = nfs4_make_rec_clidname(dname, &clp->cl_name);
if (status)
- return legacy_recdir_name_error(status);
+ return legacy_recdir_name_error(clp, status);
status = nfs4_save_creds(&original_cred);
if (status < 0)
@@ -242,11 +240,16 @@ struct name_list {
struct list_head list;
};
+struct nfs4_dir_ctx {
+ struct dir_context ctx;
+ struct list_head names;
+};
+
static int
nfsd4_build_namelist(void *arg, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
- struct list_head *names = arg;
+ struct nfs4_dir_ctx *ctx = arg;
struct name_list *entry;
if (namlen != HEXDIR_LEN - 1)
@@ -256,7 +259,7 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen,
return -ENOMEM;
memcpy(entry->name, name, HEXDIR_LEN - 1);
entry->name[HEXDIR_LEN - 1] = '\0';
- list_add(&entry->list, names);
+ list_add(&entry->list, &ctx->names);
return 0;
}
@@ -265,7 +268,10 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
{
const struct cred *original_cred;
struct dentry *dir = nn->rec_file->f_path.dentry;
- LIST_HEAD(names);
+ struct nfs4_dir_ctx ctx = {
+ .ctx.actor = nfsd4_build_namelist,
+ .names = LIST_HEAD_INIT(ctx.names)
+ };
int status;
status = nfs4_save_creds(&original_cred);
@@ -278,11 +284,11 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
return status;
}
- status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names);
+ status = iterate_dir(nn->rec_file, &ctx.ctx);
mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
- while (!list_empty(&names)) {
+ while (!list_empty(&ctx.names)) {
struct name_list *entry;
- entry = list_entry(names.next, struct name_list, list);
+ entry = list_entry(ctx.names.next, struct name_list, list);
if (!status) {
struct dentry *dentry;
dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
@@ -341,7 +347,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
status = nfs4_make_rec_clidname(dname, &clp->cl_name);
if (status)
- return legacy_recdir_name_error(status);
+ return legacy_recdir_name_error(clp, status);
status = mnt_want_write_file(nn->rec_file);
if (status)
@@ -601,7 +607,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
status = nfs4_make_rec_clidname(dname, &clp->cl_name);
if (status) {
- legacy_recdir_name_error(status);
+ legacy_recdir_name_error(clp, status);
return status;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2e27430b9070..280acef6f0dc 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -42,6 +42,7 @@
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/addr.h>
#include "xdr4.h"
+#include "xdr4cb.h"
#include "vfs.h"
#include "current_stateid.h"
@@ -94,17 +95,33 @@ nfs4_lock_state(void)
mutex_lock(&client_mutex);
}
-static void free_session(struct kref *);
+static void free_session(struct nfsd4_session *);
-/* Must be called under the client_lock */
-static void nfsd4_put_session_locked(struct nfsd4_session *ses)
+static bool is_session_dead(struct nfsd4_session *ses)
{
- kref_put(&ses->se_ref, free_session);
+ return ses->se_flags & NFS4_SESSION_DEAD;
}
-static void nfsd4_get_session(struct nfsd4_session *ses)
+void nfsd4_put_session(struct nfsd4_session *ses)
+{
+ if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
+ free_session(ses);
+}
+
+static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me)
+{
+ if (atomic_read(&ses->se_ref) > ref_held_by_me)
+ return nfserr_jukebox;
+ ses->se_flags |= NFS4_SESSION_DEAD;
+ return nfs_ok;
+}
+
+static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses)
{
- kref_get(&ses->se_ref);
+ if (is_session_dead(ses))
+ return nfserr_badsession;
+ atomic_inc(&ses->se_ref);
+ return nfs_ok;
}
void
@@ -113,6 +130,90 @@ nfs4_unlock_state(void)
mutex_unlock(&client_mutex);
}
+static bool is_client_expired(struct nfs4_client *clp)
+{
+ return clp->cl_time == 0;
+}
+
+static __be32 mark_client_expired_locked(struct nfs4_client *clp)
+{
+ if (atomic_read(&clp->cl_refcount))
+ return nfserr_jukebox;
+ clp->cl_time = 0;
+ return nfs_ok;
+}
+
+static __be32 mark_client_expired(struct nfs4_client *clp)
+{
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ __be32 ret;
+
+ spin_lock(&nn->client_lock);
+ ret = mark_client_expired_locked(clp);
+ spin_unlock(&nn->client_lock);
+ return ret;
+}
+
+static __be32 get_client_locked(struct nfs4_client *clp)
+{
+ if (is_client_expired(clp))
+ return nfserr_expired;
+ atomic_inc(&clp->cl_refcount);
+ return nfs_ok;
+}
+
+/* must be called under the client_lock */
+static inline void
+renew_client_locked(struct nfs4_client *clp)
+{
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ if (is_client_expired(clp)) {
+ WARN_ON(1);
+ printk("%s: client (clientid %08x/%08x) already expired\n",
+ __func__,
+ clp->cl_clientid.cl_boot,
+ clp->cl_clientid.cl_id);
+ return;
+ }
+
+ dprintk("renewing client (clientid %08x/%08x)\n",
+ clp->cl_clientid.cl_boot,
+ clp->cl_clientid.cl_id);
+ list_move_tail(&clp->cl_lru, &nn->client_lru);
+ clp->cl_time = get_seconds();
+}
+
+static inline void
+renew_client(struct nfs4_client *clp)
+{
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ spin_lock(&nn->client_lock);
+ renew_client_locked(clp);
+ spin_unlock(&nn->client_lock);
+}
+
+static void put_client_renew_locked(struct nfs4_client *clp)
+{
+ if (!atomic_dec_and_test(&clp->cl_refcount))
+ return;
+ if (!is_client_expired(clp))
+ renew_client_locked(clp);
+}
+
+void put_client_renew(struct nfs4_client *clp)
+{
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
+ return;
+ if (!is_client_expired(clp))
+ renew_client_locked(clp);
+ spin_unlock(&nn->client_lock);
+}
+
+
static inline u32
opaque_hashval(const void *ptr, int nbytes)
{
@@ -126,8 +227,6 @@ opaque_hashval(const void *ptr, int nbytes)
return x;
}
-static struct list_head del_recall_lru;
-
static void nfsd4_free_file(struct nfs4_file *f)
{
kmem_cache_free(file_slab, f);
@@ -137,7 +236,7 @@ static inline void
put_nfs4_file(struct nfs4_file *fi)
{
if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
- list_del(&fi->fi_hash);
+ hlist_del(&fi->fi_hash);
spin_unlock(&recall_lock);
iput(fi->fi_inode);
nfsd4_free_file(fi);
@@ -181,7 +280,7 @@ static unsigned int file_hashval(struct inode *ino)
return hash_ptr(ino, FILE_HASH_BITS);
}
-static struct list_head file_hashtbl[FILE_HASH_SIZE];
+static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
{
@@ -210,13 +309,7 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
{
if (atomic_dec_and_test(&fp->fi_access[oflag])) {
nfs4_file_put_fd(fp, oflag);
- /*
- * It's also safe to get rid of the RDWR open *if*
- * we no longer have need of the other kind of access
- * or if we already have the other kind of open:
- */
- if (fp->fi_fds[1-oflag]
- || atomic_read(&fp->fi_access[1 - oflag]) == 0)
+ if (atomic_read(&fp->fi_access[1 - oflag]) == 0)
nfs4_file_put_fd(fp, O_RDWR);
}
}
@@ -234,7 +327,6 @@ static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct
kmem_cache *slab)
{
struct idr *stateids = &cl->cl_stateids;
- static int min_stateid = 0;
struct nfs4_stid *stid;
int new_id;
@@ -242,7 +334,7 @@ kmem_cache *slab)
if (!stid)
return NULL;
- new_id = idr_alloc(stateids, stid, min_stateid, 0, GFP_KERNEL);
+ new_id = idr_alloc_cyclic(stateids, stid, 0, 0, GFP_KERNEL);
if (new_id < 0)
goto out_free;
stid->sc_client = cl;
@@ -261,13 +353,9 @@ kmem_cache *slab)
* amount of time until an id is reused, by ensuring they always
* "increase" (mod INT_MAX):
*/
-
- min_stateid = new_id+1;
- if (min_stateid == INT_MAX)
- min_stateid = 0;
return stid;
out_free:
- kfree(stid);
+ kmem_cache_free(slab, stid);
return NULL;
}
@@ -277,19 +365,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
}
static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type)
+alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
{
struct nfs4_delegation *dp;
struct nfs4_file *fp = stp->st_file;
dprintk("NFSD alloc_init_deleg\n");
- /*
- * Major work on the lease subsystem (for example, to support
- * calbacks on stat) will be required before we can support
- * write delegations properly.
- */
- if (type != NFS4_OPEN_DELEGATE_READ)
- return NULL;
if (fp->fi_had_conflict)
return NULL;
if (num_delegations > max_delegations)
@@ -310,7 +391,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
INIT_LIST_HEAD(&dp->dl_recall_lru);
get_nfs4_file(fp);
dp->dl_file = fp;
- dp->dl_type = type;
+ dp->dl_type = NFS4_OPEN_DELEGATE_READ;
fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
dp->dl_time = 0;
atomic_set(&dp->dl_count, 1);
@@ -318,21 +399,18 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
return dp;
}
-static void free_stid(struct nfs4_stid *s, struct kmem_cache *slab)
+static void remove_stid(struct nfs4_stid *s)
{
struct idr *stateids = &s->sc_client->cl_stateids;
idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
- kmem_cache_free(slab, s);
}
void
nfs4_put_delegation(struct nfs4_delegation *dp)
{
if (atomic_dec_and_test(&dp->dl_count)) {
- dprintk("NFSD: freeing dp %p\n",dp);
- put_nfs4_file(dp->dl_file);
- free_stid(&dp->dl_stid, deleg_slab);
+ kmem_cache_free(deleg_slab, dp);
num_delegations--;
}
}
@@ -356,16 +434,45 @@ static void unhash_stid(struct nfs4_stid *s)
static void
unhash_delegation(struct nfs4_delegation *dp)
{
- unhash_stid(&dp->dl_stid);
list_del_init(&dp->dl_perclnt);
spin_lock(&recall_lock);
list_del_init(&dp->dl_perfile);
list_del_init(&dp->dl_recall_lru);
spin_unlock(&recall_lock);
nfs4_put_deleg_lease(dp->dl_file);
+ put_nfs4_file(dp->dl_file);
+ dp->dl_file = NULL;
+}
+
+
+
+static void destroy_revoked_delegation(struct nfs4_delegation *dp)
+{
+ list_del_init(&dp->dl_recall_lru);
+ remove_stid(&dp->dl_stid);
+ nfs4_put_delegation(dp);
+}
+
+static void destroy_delegation(struct nfs4_delegation *dp)
+{
+ unhash_delegation(dp);
+ remove_stid(&dp->dl_stid);
nfs4_put_delegation(dp);
}
+static void revoke_delegation(struct nfs4_delegation *dp)
+{
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+
+ if (clp->cl_minorversion == 0)
+ destroy_delegation(dp);
+ else {
+ unhash_delegation(dp);
+ dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
+ list_add(&dp->dl_recall_lru, &clp->cl_revoked);
+ }
+}
+
/*
* SETCLIENTID state
*/
@@ -506,7 +613,8 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp)
static void free_generic_stateid(struct nfs4_ol_stateid *stp)
{
- free_stid(&stp->st_stid, stateid_slab);
+ remove_stid(&stp->st_stid);
+ kmem_cache_free(stateid_slab, stp);
}
static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -622,6 +730,28 @@ dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
}
#endif
+/*
+ * Bump the seqid on cstate->replay_owner, and clear replay_owner if it
+ * won't be used for replay.
+ */
+void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr)
+{
+ struct nfs4_stateowner *so = cstate->replay_owner;
+
+ if (nfserr == nfserr_replay_me)
+ return;
+
+ if (!seqid_mutating_err(ntohl(nfserr))) {
+ cstate->replay_owner = NULL;
+ return;
+ }
+ if (!so)
+ return;
+ if (so->so_is_open_owner)
+ release_last_closed_stateid(openowner(so));
+ so->so_seqid++;
+ return;
+}
static void
gen_sessionid(struct nfsd4_session *ses)
@@ -662,17 +792,15 @@ free_session_slots(struct nfsd4_session *ses)
* We don't actually need to cache the rpc and session headers, so we
* can allocate a little less for each slot:
*/
-static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
+static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca)
{
- return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
-}
+ u32 size;
-static int nfsd4_sanitize_slot_size(u32 size)
-{
- size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */
- size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE);
-
- return size;
+ if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ)
+ size = 0;
+ else
+ size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+ return size + sizeof(struct nfsd4_slot);
}
/*
@@ -680,12 +808,12 @@ static int nfsd4_sanitize_slot_size(u32 size)
* re-negotiate active sessions and reduce their slot usage to make
* room for new connections. For now we just fail the create session.
*/
-static int nfsd4_get_drc_mem(int slotsize, u32 num)
+static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
{
+ u32 slotsize = slot_bytes(ca);
+ u32 num = ca->maxreqs;
int avail;
- num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
-
spin_lock(&nfsd_drc_lock);
avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
nfsd_drc_max_mem - nfsd_drc_mem_used);
@@ -696,15 +824,19 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num)
return num;
}
-static void nfsd4_put_drc_mem(int slotsize, int num)
+static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca)
{
+ int slotsize = slot_bytes(ca);
+
spin_lock(&nfsd_drc_lock);
- nfsd_drc_mem_used -= slotsize * num;
+ nfsd_drc_mem_used -= slotsize * ca->maxreqs;
spin_unlock(&nfsd_drc_lock);
}
-static struct nfsd4_session *__alloc_session(int slotsize, int numslots)
+static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs)
{
+ int numslots = attrs->maxreqs;
+ int slotsize = slot_bytes(attrs);
struct nfsd4_session *new;
int mem, i;
@@ -717,8 +849,7 @@ static struct nfsd4_session *__alloc_session(int slotsize, int numslots)
return NULL;
/* allocate each struct nfsd4_slot and data cache in one piece */
for (i = 0; i < numslots; i++) {
- mem = sizeof(struct nfsd4_slot) + slotsize;
- new->se_slots[i] = kzalloc(mem, GFP_KERNEL);
+ new->se_slots[i] = kzalloc(slotsize, GFP_KERNEL);
if (!new->se_slots[i])
goto out_free;
}
@@ -730,21 +861,6 @@ out_free:
return NULL;
}
-static void init_forechannel_attrs(struct nfsd4_channel_attrs *new,
- struct nfsd4_channel_attrs *req,
- int numslots, int slotsize,
- struct nfsd_net *nn)
-{
- u32 maxrpc = nn->nfsd_serv->sv_max_mesg;
-
- new->maxreqs = numslots;
- new->maxresp_cached = min_t(u32, req->maxresp_cached,
- slotsize + NFSD_MIN_HDR_SEQ_SZ);
- new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
- new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
- new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
-}
-
static void free_conn(struct nfsd4_conn *c)
{
svc_xprt_put(c->cn_xprt);
@@ -761,8 +877,8 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
list_del(&c->cn_persession);
free_conn(c);
}
- spin_unlock(&clp->cl_lock);
nfsd4_probe_callback(clp);
+ spin_unlock(&clp->cl_lock);
}
static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
@@ -846,59 +962,20 @@ static void nfsd4_del_conns(struct nfsd4_session *s)
static void __free_session(struct nfsd4_session *ses)
{
- nfsd4_put_drc_mem(slot_bytes(&ses->se_fchannel), ses->se_fchannel.maxreqs);
free_session_slots(ses);
kfree(ses);
}
-static void free_session(struct kref *kref)
+static void free_session(struct nfsd4_session *ses)
{
- struct nfsd4_session *ses;
- struct nfsd_net *nn;
-
- ses = container_of(kref, struct nfsd4_session, se_ref);
- nn = net_generic(ses->se_client->net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
lockdep_assert_held(&nn->client_lock);
nfsd4_del_conns(ses);
+ nfsd4_put_drc_mem(&ses->se_fchannel);
__free_session(ses);
}
-void nfsd4_put_session(struct nfsd4_session *ses)
-{
- struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
-
- spin_lock(&nn->client_lock);
- nfsd4_put_session_locked(ses);
- spin_unlock(&nn->client_lock);
-}
-
-static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan,
- struct nfsd_net *nn)
-{
- struct nfsd4_session *new;
- int numslots, slotsize;
- /*
- * Note decreasing slot size below client's request may
- * make it difficult for client to function correctly, whereas
- * decreasing the number of slots will (just?) affect
- * performance. When short on memory we therefore prefer to
- * decrease number of slots instead of their size.
- */
- slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
- numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
- if (numslots < 1)
- return NULL;
-
- new = __alloc_session(slotsize, numslots);
- if (!new) {
- nfsd4_put_drc_mem(slotsize, numslots);
- return NULL;
- }
- init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn);
- return new;
-}
-
static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
{
int idx;
@@ -913,7 +990,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
new->se_flags = cses->flags;
new->se_cb_prog = cses->callback_prog;
new->se_cb_sec = cses->cb_sec;
- kref_init(&new->se_ref);
+ atomic_set(&new->se_ref, 0);
idx = hash_sessionid(&new->se_sessionid);
spin_lock(&nn->client_lock);
list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
@@ -921,7 +998,8 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
list_add(&new->se_perclnt, &clp->cl_sessions);
spin_unlock(&clp->cl_lock);
spin_unlock(&nn->client_lock);
-
+ memcpy(&new->se_fchannel, &cses->fore_channel,
+ sizeof(struct nfsd4_channel_attrs));
if (cses->flags & SESSION4_BACK_CHAN) {
struct sockaddr *sa = svc_addr(rqstp);
/*
@@ -968,38 +1046,6 @@ unhash_session(struct nfsd4_session *ses)
spin_unlock(&ses->se_client->cl_lock);
}
-/* must be called under the client_lock */
-static inline void
-renew_client_locked(struct nfs4_client *clp)
-{
- struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-
- if (is_client_expired(clp)) {
- WARN_ON(1);
- printk("%s: client (clientid %08x/%08x) already expired\n",
- __func__,
- clp->cl_clientid.cl_boot,
- clp->cl_clientid.cl_id);
- return;
- }
-
- dprintk("renewing client (clientid %08x/%08x)\n",
- clp->cl_clientid.cl_boot,
- clp->cl_clientid.cl_id);
- list_move_tail(&clp->cl_lru, &nn->client_lru);
- clp->cl_time = get_seconds();
-}
-
-static inline void
-renew_client(struct nfs4_client *clp)
-{
- struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-
- spin_lock(&nn->client_lock);
- renew_client_locked(clp);
- spin_unlock(&nn->client_lock);
-}
-
/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
static int
STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
@@ -1043,7 +1089,8 @@ free_client(struct nfs4_client *clp)
ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
se_perclnt);
list_del(&ses->se_perclnt);
- nfsd4_put_session_locked(ses);
+ WARN_ON_ONCE(atomic_read(&ses->se_ref));
+ free_session(ses);
}
free_svc_cred(&clp->cl_cred);
kfree(clp->cl_name.data);
@@ -1051,29 +1098,12 @@ free_client(struct nfs4_client *clp)
kfree(clp);
}
-void
-release_session_client(struct nfsd4_session *session)
-{
- struct nfs4_client *clp = session->se_client;
- struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-
- if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
- return;
- if (is_client_expired(clp)) {
- free_client(clp);
- session->se_client = NULL;
- } else
- renew_client_locked(clp);
- spin_unlock(&nn->client_lock);
-}
-
/* must be called under the client_lock */
static inline void
unhash_client_locked(struct nfs4_client *clp)
{
struct nfsd4_session *ses;
- mark_client_expired(clp);
list_del(&clp->cl_lru);
spin_lock(&clp->cl_lock);
list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
@@ -1099,7 +1129,7 @@ destroy_client(struct nfs4_client *clp)
spin_unlock(&recall_lock);
while (!list_empty(&reaplist)) {
dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
- unhash_delegation(dp);
+ destroy_delegation(dp);
}
while (!list_empty(&clp->cl_openowners)) {
oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
@@ -1115,8 +1145,8 @@ destroy_client(struct nfs4_client *clp)
rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
spin_lock(&nn->client_lock);
unhash_client_locked(clp);
- if (atomic_read(&clp->cl_refcount) == 0)
- free_client(clp);
+ WARN_ON_ONCE(atomic_read(&clp->cl_refcount));
+ free_client(clp);
spin_unlock(&nn->client_lock);
}
@@ -1152,6 +1182,9 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
target->cr_gid = source->cr_gid;
target->cr_group_info = source->cr_group_info;
get_group_info(target->cr_group_info);
+ target->cr_gss_mech = source->cr_gss_mech;
+ if (source->cr_gss_mech)
+ gss_mech_get(source->cr_gss_mech);
return 0;
}
@@ -1226,6 +1259,31 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);
}
+static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp)
+{
+ struct svc_cred *cr = &rqstp->rq_cred;
+ u32 service;
+
+ service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor);
+ return service == RPC_GSS_SVC_INTEGRITY ||
+ service == RPC_GSS_SVC_PRIVACY;
+}
+
+static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
+{
+ struct svc_cred *cr = &rqstp->rq_cred;
+
+ if (!cl->cl_mach_cred)
+ return true;
+ if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech)
+ return false;
+ if (!svc_rqst_integrity_protected(rqstp))
+ return false;
+ if (!cr->cr_principal)
+ return false;
+ return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
+}
+
static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
{
static u32 current_clientid = 1;
@@ -1295,6 +1353,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
INIT_LIST_HEAD(&clp->cl_delegations);
INIT_LIST_HEAD(&clp->cl_lru);
INIT_LIST_HEAD(&clp->cl_callbacks);
+ INIT_LIST_HEAD(&clp->cl_revoked);
spin_lock_init(&clp->cl_lock);
nfsd4_init_callback(&clp->cl_cb_null);
clp->cl_time = get_seconds();
@@ -1376,12 +1435,12 @@ move_to_confirmed(struct nfs4_client *clp)
}
static struct nfs4_client *
-find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
+find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions)
{
struct nfs4_client *clp;
unsigned int idhashval = clientid_hashval(clid->cl_id);
- list_for_each_entry(clp, &nn->conf_id_hashtbl[idhashval], cl_idhash) {
+ list_for_each_entry(clp, &tbl[idhashval], cl_idhash) {
if (same_clid(&clp->cl_clientid, clid)) {
if ((bool)clp->cl_minorversion != sessions)
return NULL;
@@ -1393,19 +1452,19 @@ find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
}
static struct nfs4_client *
+find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
+{
+ struct list_head *tbl = nn->conf_id_hashtbl;
+
+ return find_client_in_id_table(tbl, clid, sessions);
+}
+
+static struct nfs4_client *
find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
{
- struct nfs4_client *clp;
- unsigned int idhashval = clientid_hashval(clid->cl_id);
+ struct list_head *tbl = nn->unconf_id_hashtbl;
- list_for_each_entry(clp, &nn->unconf_id_hashtbl[idhashval], cl_idhash) {
- if (same_clid(&clp->cl_clientid, clid)) {
- if ((bool)clp->cl_minorversion != sessions)
- return NULL;
- return clp;
- }
- }
- return NULL;
+ return find_client_in_id_table(tbl, clid, sessions);
}
static bool clp_used_exchangeid(struct nfs4_client *clp)
@@ -1602,15 +1661,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
return nfserr_inval;
- /* Currently only support SP4_NONE */
switch (exid->spa_how) {
+ case SP4_MACH_CRED:
+ if (!svc_rqst_integrity_protected(rqstp))
+ return nfserr_inval;
case SP4_NONE:
break;
default: /* checked by xdr code */
WARN_ON_ONCE(1);
case SP4_SSV:
- case SP4_MACH_CRED:
- return nfserr_serverfault; /* no excuse :-/ */
+ return nfserr_encr_alg_unsupp;
}
/* Cases below refer to rfc 5661 section 18.35.4: */
@@ -1625,6 +1685,10 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
status = nfserr_inval;
goto out;
}
+ if (!mach_creds_match(conf, rqstp)) {
+ status = nfserr_wrong_cred;
+ goto out;
+ }
if (!creds_match) { /* case 9 */
status = nfserr_perm;
goto out;
@@ -1671,7 +1735,8 @@ out_new:
status = nfserr_jukebox;
goto out;
}
- new->cl_minorversion = 1;
+ new->cl_minorversion = cstate->minorversion;
+ new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
gen_clid(new, nn);
add_to_unconfirmed(new);
@@ -1750,10 +1815,73 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
/* seqid, slotID, slotID, slotID, status */ \
5 ) * sizeof(__be32))
-static bool check_forechannel_attrs(struct nfsd4_channel_attrs fchannel)
+static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn)
{
- return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ
- || fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ;
+ u32 maxrpc = nn->nfsd_serv->sv_max_mesg;
+
+ if (ca->maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ)
+ return nfserr_toosmall;
+ if (ca->maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ)
+ return nfserr_toosmall;
+ ca->headerpadsz = 0;
+ ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc);
+ ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc);
+ ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND);
+ ca->maxresp_cached = min_t(u32, ca->maxresp_cached,
+ NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ);
+ ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION);
+ /*
+ * Note decreasing slot size below client's request may make it
+ * difficult for client to function correctly, whereas
+ * decreasing the number of slots will (just?) affect
+ * performance. When short on memory we therefore prefer to
+ * decrease number of slots instead of their size. Clients that
+ * request larger slots than they need will get poor results:
+ */
+ ca->maxreqs = nfsd4_get_drc_mem(ca);
+ if (!ca->maxreqs)
+ return nfserr_jukebox;
+
+ return nfs_ok;
+}
+
+static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
+{
+ ca->headerpadsz = 0;
+
+ /*
+ * These RPC_MAX_HEADER macros are overkill, especially since we
+ * don't even do gss on the backchannel yet. But this is still
+ * less than 1k. Tighten up this estimate in the unlikely event
+ * it turns out to be a problem for some client:
+ */
+ if (ca->maxreq_sz < NFS4_enc_cb_recall_sz + RPC_MAX_HEADER_WITH_AUTH)
+ return nfserr_toosmall;
+ if (ca->maxresp_sz < NFS4_dec_cb_recall_sz + RPC_MAX_REPHEADER_WITH_AUTH)
+ return nfserr_toosmall;
+ ca->maxresp_cached = 0;
+ if (ca->maxops < 2)
+ return nfserr_toosmall;
+
+ return nfs_ok;
+}
+
+static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs)
+{
+ switch (cbs->flavor) {
+ case RPC_AUTH_NULL:
+ case RPC_AUTH_UNIX:
+ return nfs_ok;
+ default:
+ /*
+ * GSS case: the spec doesn't allow us to return this
+ * error. But it also doesn't allow us not to support
+ * GSS.
+ * I'd rather this fail hard than return some error the
+ * client might think it can already handle:
+ */
+ return nfserr_encr_alg_unsupp;
+ }
}
__be32
@@ -1771,12 +1899,19 @@ nfsd4_create_session(struct svc_rqst *rqstp,
if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
return nfserr_inval;
- if (check_forechannel_attrs(cr_ses->fore_channel))
- return nfserr_toosmall;
- new = alloc_session(&cr_ses->fore_channel, nn);
- if (!new)
- return nfserr_jukebox;
+ status = nfsd4_check_cb_sec(&cr_ses->cb_sec);
+ if (status)
+ return status;
+ status = check_forechannel_attrs(&cr_ses->fore_channel, nn);
+ if (status)
+ return status;
+ status = check_backchannel_attrs(&cr_ses->back_channel);
+ if (status)
+ return status;
status = nfserr_jukebox;
+ new = alloc_session(&cr_ses->fore_channel);
+ if (!new)
+ goto out_release_drc_mem;
conn = alloc_conn_from_crses(rqstp, cr_ses);
if (!conn)
goto out_free_session;
@@ -1784,8 +1919,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
nfs4_lock_state();
unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
conf = find_confirmed_client(&cr_ses->clientid, true, nn);
+ WARN_ON_ONCE(conf && unconf);
if (conf) {
+ status = nfserr_wrong_cred;
+ if (!mach_creds_match(conf, rqstp))
+ goto out_free_conn;
cs_slot = &conf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
if (status == nfserr_replay_cache) {
@@ -1802,6 +1941,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
status = nfserr_clid_inuse;
goto out_free_conn;
}
+ status = nfserr_wrong_cred;
+ if (!mach_creds_match(unconf, rqstp))
+ goto out_free_conn;
cs_slot = &unconf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
if (status) {
@@ -1810,8 +1952,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_conn;
}
old = find_confirmed_client_by_name(&unconf->cl_name, nn);
- if (old)
+ if (old) {
+ status = mark_client_expired(old);
+ if (status)
+ goto out_free_conn;
expire_client(old);
+ }
move_to_confirmed(unconf);
conf = unconf;
} else {
@@ -1830,23 +1976,21 @@ nfsd4_create_session(struct svc_rqst *rqstp,
memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
NFS4_MAX_SESSIONID_LEN);
- memcpy(&cr_ses->fore_channel, &new->se_fchannel,
- sizeof(struct nfsd4_channel_attrs));
cs_slot->sl_seqid++;
cr_ses->seqid = cs_slot->sl_seqid;
/* cache solo and embedded create sessions under the state lock */
nfsd4_cache_create_session(cr_ses, cs_slot, status);
nfs4_unlock_state();
-out:
- dprintk("%s returns %d\n", __func__, ntohl(status));
return status;
out_free_conn:
nfs4_unlock_state();
free_conn(conn);
out_free_session:
__free_session(new);
- goto out;
+out_release_drc_mem:
+ nfsd4_put_drc_mem(&cr_ses->fore_channel);
+ return status;
}
static __be32 nfsd4_map_bcts_dir(u32 *dir)
@@ -1867,7 +2011,11 @@ __be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state
{
struct nfsd4_session *session = cstate->session;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 status;
+ status = nfsd4_check_cb_sec(&bc->bc_cb_sec);
+ if (status)
+ return status;
spin_lock(&nn->client_lock);
session->se_cb_prog = bc->bc_cb_program;
session->se_cb_sec = bc->bc_cb_sec;
@@ -1884,30 +2032,33 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
{
__be32 status;
struct nfsd4_conn *conn;
+ struct nfsd4_session *session;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if (!nfsd4_last_compound_op(rqstp))
return nfserr_not_only_op;
+ nfs4_lock_state();
spin_lock(&nn->client_lock);
- cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
- /* Sorta weird: we only need the refcnt'ing because new_conn acquires
- * client_lock iself: */
- if (cstate->session) {
- nfsd4_get_session(cstate->session);
- atomic_inc(&cstate->session->se_client->cl_refcount);
- }
+ session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
spin_unlock(&nn->client_lock);
- if (!cstate->session)
- return nfserr_badsession;
-
+ status = nfserr_badsession;
+ if (!session)
+ goto out;
+ status = nfserr_wrong_cred;
+ if (!mach_creds_match(session->se_client, rqstp))
+ goto out;
status = nfsd4_map_bcts_dir(&bcts->dir);
if (status)
- return status;
+ goto out;
conn = alloc_conn(rqstp, bcts->dir);
+ status = nfserr_jukebox;
if (!conn)
- return nfserr_jukebox;
- nfsd4_init_conn(rqstp, conn, cstate->session);
- return nfs_ok;
+ goto out;
+ nfsd4_init_conn(rqstp, conn, session);
+ status = nfs_ok;
+out:
+ nfs4_unlock_state();
+ return status;
}
static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
@@ -1923,42 +2074,43 @@ nfsd4_destroy_session(struct svc_rqst *r,
struct nfsd4_destroy_session *sessionid)
{
struct nfsd4_session *ses;
- __be32 status = nfserr_badsession;
+ __be32 status;
+ int ref_held_by_me = 0;
struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
- /* Notes:
- * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
- * - Should we return nfserr_back_chan_busy if waiting for
- * callbacks on to-be-destroyed session?
- * - Do we need to clear any callback info from previous session?
- */
-
+ nfs4_lock_state();
+ status = nfserr_not_only_op;
if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
if (!nfsd4_last_compound_op(r))
- return nfserr_not_only_op;
+ goto out;
+ ref_held_by_me++;
}
dump_sessionid(__func__, &sessionid->sessionid);
spin_lock(&nn->client_lock);
ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r));
- if (!ses) {
- spin_unlock(&nn->client_lock);
- goto out;
- }
-
+ status = nfserr_badsession;
+ if (!ses)
+ goto out_client_lock;
+ status = nfserr_wrong_cred;
+ if (!mach_creds_match(ses->se_client, r))
+ goto out_client_lock;
+ nfsd4_get_session_locked(ses);
+ status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
+ if (status)
+ goto out_put_session;
unhash_session(ses);
spin_unlock(&nn->client_lock);
- nfs4_lock_state();
nfsd4_probe_callback_sync(ses->se_client);
- nfs4_unlock_state();
spin_lock(&nn->client_lock);
- nfsd4_del_conns(ses);
- nfsd4_put_session_locked(ses);
- spin_unlock(&nn->client_lock);
status = nfs_ok;
+out_put_session:
+ nfsd4_put_session(ses);
+out_client_lock:
+ spin_unlock(&nn->client_lock);
out:
- dprintk("%s returns %d\n", __func__, ntohl(status));
+ nfs4_unlock_state();
return status;
}
@@ -1974,26 +2126,31 @@ static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_s
return NULL;
}
-static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
+static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
{
struct nfs4_client *clp = ses->se_client;
struct nfsd4_conn *c;
+ __be32 status = nfs_ok;
int ret;
spin_lock(&clp->cl_lock);
c = __nfsd4_find_conn(new->cn_xprt, ses);
- if (c) {
- spin_unlock(&clp->cl_lock);
- free_conn(new);
- return;
- }
+ if (c)
+ goto out_free;
+ status = nfserr_conn_not_bound_to_session;
+ if (clp->cl_mach_cred)
+ goto out_free;
__nfsd4_hash_conn(new, ses);
spin_unlock(&clp->cl_lock);
ret = nfsd4_register_conn(new);
if (ret)
/* oops; xprt is already down: */
nfsd4_conn_lost(&new->cn_xpt_user);
- return;
+ return nfs_ok;
+out_free:
+ spin_unlock(&clp->cl_lock);
+ free_conn(new);
+ return status;
}
static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
@@ -2018,6 +2175,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
{
struct nfsd4_compoundres *resp = rqstp->rq_resp;
struct nfsd4_session *session;
+ struct nfs4_client *clp;
struct nfsd4_slot *slot;
struct nfsd4_conn *conn;
__be32 status;
@@ -2038,19 +2196,26 @@ nfsd4_sequence(struct svc_rqst *rqstp,
status = nfserr_badsession;
session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp));
if (!session)
- goto out;
+ goto out_no_session;
+ clp = session->se_client;
+ status = get_client_locked(clp);
+ if (status)
+ goto out_no_session;
+ status = nfsd4_get_session_locked(session);
+ if (status)
+ goto out_put_client;
status = nfserr_too_many_ops;
if (nfsd4_session_too_many_ops(rqstp, session))
- goto out;
+ goto out_put_session;
status = nfserr_req_too_big;
if (nfsd4_request_too_big(rqstp, session))
- goto out;
+ goto out_put_session;
status = nfserr_badslot;
if (seq->slotid >= session->se_fchannel.maxreqs)
- goto out;
+ goto out_put_session;
slot = session->se_slots[seq->slotid];
dprintk("%s: slotid %d\n", __func__, seq->slotid);
@@ -2065,7 +2230,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
if (status == nfserr_replay_cache) {
status = nfserr_seq_misordered;
if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED))
- goto out;
+ goto out_put_session;
cstate->slot = slot;
cstate->session = session;
/* Return the cached reply status and set cstate->status
@@ -2075,10 +2240,12 @@ nfsd4_sequence(struct svc_rqst *rqstp,
goto out;
}
if (status)
- goto out;
+ goto out_put_session;
- nfsd4_sequence_check_conn(conn, session);
+ status = nfsd4_sequence_check_conn(conn, session);
conn = NULL;
+ if (status)
+ goto out_put_session;
/* Success! bump slot seqid */
slot->sl_seqid = seq->seqid;
@@ -2092,27 +2259,27 @@ nfsd4_sequence(struct svc_rqst *rqstp,
cstate->session = session;
out:
- /* Hold a session reference until done processing the compound. */
- if (cstate->session) {
- struct nfs4_client *clp = session->se_client;
-
- nfsd4_get_session(cstate->session);
- atomic_inc(&clp->cl_refcount);
- switch (clp->cl_cb_state) {
- case NFSD4_CB_DOWN:
- seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
- break;
- case NFSD4_CB_FAULT:
- seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
- break;
- default:
- seq->status_flags = 0;
- }
+ switch (clp->cl_cb_state) {
+ case NFSD4_CB_DOWN:
+ seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
+ break;
+ case NFSD4_CB_FAULT:
+ seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
+ break;
+ default:
+ seq->status_flags = 0;
}
+ if (!list_empty(&clp->cl_revoked))
+ seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
+out_no_session:
kfree(conn);
spin_unlock(&nn->client_lock);
- dprintk("%s: return %d\n", __func__, ntohl(status));
return status;
+out_put_session:
+ nfsd4_put_session(session);
+out_put_client:
+ put_client_renew_locked(clp);
+ goto out_no_session;
}
__be32
@@ -2125,17 +2292,12 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
nfs4_lock_state();
unconf = find_unconfirmed_client(&dc->clientid, true, nn);
conf = find_confirmed_client(&dc->clientid, true, nn);
+ WARN_ON_ONCE(conf && unconf);
if (conf) {
clp = conf;
- if (!is_client_expired(conf) && client_has_state(conf)) {
- status = nfserr_clientid_busy;
- goto out;
- }
-
- /* rfc5661 18.50.3 */
- if (cstate->session && conf == cstate->session->se_client) {
+ if (client_has_state(conf)) {
status = nfserr_clientid_busy;
goto out;
}
@@ -2145,11 +2307,13 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
status = nfserr_stale_clientid;
goto out;
}
-
+ if (!mach_creds_match(clp, rqstp)) {
+ status = nfserr_wrong_cred;
+ goto out;
+ }
expire_client(clp);
out:
nfs4_unlock_state();
- dprintk("%s return %d\n", __func__, ntohl(status));
return status;
}
@@ -2287,8 +2451,12 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
expire_client(unconf);
} else { /* case 3: normal case; new or rebooted client */
conf = find_confirmed_client_by_name(&unconf->cl_name, nn);
- if (conf)
+ if (conf) {
+ status = mark_client_expired(conf);
+ if (status)
+ goto out;
expire_client(conf);
+ }
move_to_confirmed(unconf);
nfsd4_probe_callback(unconf);
}
@@ -2308,7 +2476,6 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
unsigned int hashval = file_hashval(ino);
atomic_set(&fp->fi_ref, 1);
- INIT_LIST_HEAD(&fp->fi_hash);
INIT_LIST_HEAD(&fp->fi_stateids);
INIT_LIST_HEAD(&fp->fi_delegations);
fp->fi_inode = igrab(ino);
@@ -2317,7 +2484,7 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
memset(fp->fi_access, 0, sizeof(fp->fi_access));
spin_lock(&recall_lock);
- list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+ hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]);
spin_unlock(&recall_lock);
}
@@ -2503,7 +2670,7 @@ find_file(struct inode *ino)
struct nfs4_file *fp;
spin_lock(&recall_lock);
- list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
+ hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
if (fp->fi_inode == ino) {
get_nfs4_file(fp);
spin_unlock(&recall_lock);
@@ -2526,8 +2693,6 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
struct nfs4_ol_stateid *stp;
__be32 ret;
- dprintk("NFSD: nfs4_share_conflict\n");
-
fp = find_file(ino);
if (!fp)
return nfs_ok;
@@ -2546,6 +2711,9 @@ out:
static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
{
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
/* We're assuming the state code never drops its reference
* without first removing the lease. Since we're in this lease
* callback (and since the lease code is serialized by the kernel
@@ -2553,15 +2721,15 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
* it's safe to take a reference: */
atomic_inc(&dp->dl_count);
- list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
+ list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
- /* only place dl_time is set. protected by lock_flocks*/
+ /* Only place dl_time is set; protected by i_lock: */
dp->dl_time = get_seconds();
nfsd4_cb_recall(dp);
}
-/* Called from break_lease() with lock_flocks() held. */
+/* Called from break_lease() with i_lock held. */
static void nfsd_break_deleg_cb(struct file_lock *fl)
{
struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
@@ -2699,7 +2867,7 @@ static bool nfsd4_is_deleg_cur(struct nfsd4_open *open)
}
static __be32
-nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open,
+nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
struct nfs4_delegation **dp)
{
int flags;
@@ -2850,13 +3018,13 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f
return fl;
}
-static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
+static int nfs4_setlease(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_file;
struct file_lock *fl;
int status;
- fl = nfs4_alloc_init_lease(dp, flag);
+ fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
if (!fl)
return -ENOMEM;
fl->fl_file = find_readable_file(fp);
@@ -2874,12 +3042,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
return 0;
}
-static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
+static int nfs4_set_delegation(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_file;
if (!fp->fi_lease)
- return nfs4_setlease(dp, flag);
+ return nfs4_setlease(dp);
spin_lock(&recall_lock);
if (fp->fi_had_conflict) {
spin_unlock(&recall_lock);
@@ -2915,6 +3083,9 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
/*
* Attempt to hand out a delegation.
+ *
+ * Note we don't support write delegations, and won't until the vfs has
+ * proper support for them.
*/
static void
nfs4_open_delegation(struct net *net, struct svc_fh *fh,
@@ -2923,39 +3094,45 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
struct nfs4_delegation *dp;
struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
int cb_up;
- int status = 0, flag = 0;
+ int status = 0;
cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
- flag = NFS4_OPEN_DELEGATE_NONE;
open->op_recall = 0;
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_PREVIOUS:
if (!cb_up)
open->op_recall = 1;
- flag = open->op_delegate_type;
- if (flag == NFS4_OPEN_DELEGATE_NONE)
- goto out;
+ if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ)
+ goto out_no_deleg;
break;
case NFS4_OPEN_CLAIM_NULL:
- /* Let's not give out any delegations till everyone's
- * had the chance to reclaim theirs.... */
+ /*
+ * Let's not give out any delegations till everyone's
+ * had the chance to reclaim theirs....
+ */
if (locks_in_grace(net))
- goto out;
+ goto out_no_deleg;
if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
- goto out;
+ goto out_no_deleg;
+ /*
+ * Also, if the file was opened for write or
+ * create, there's a good chance the client's
+ * about to write to it, resulting in an
+ * immediate recall (since we don't support
+ * write delegations):
+ */
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
- flag = NFS4_OPEN_DELEGATE_WRITE;
- else
- flag = NFS4_OPEN_DELEGATE_READ;
+ goto out_no_deleg;
+ if (open->op_create == NFS4_OPEN_CREATE)
+ goto out_no_deleg;
break;
default:
- goto out;
+ goto out_no_deleg;
}
-
- dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag);
+ dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh);
if (dp == NULL)
goto out_no_deleg;
- status = nfs4_set_delegation(dp, flag);
+ status = nfs4_set_delegation(dp);
if (status)
goto out_free;
@@ -2963,24 +3140,23 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
STATEID_VAL(&dp->dl_stid.sc_stateid));
-out:
- open->op_delegate_type = flag;
- if (flag == NFS4_OPEN_DELEGATE_NONE) {
- if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
- open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
- dprintk("NFSD: WARNING: refusing delegation reclaim\n");
-
- /* 4.1 client asking for a delegation? */
- if (open->op_deleg_want)
- nfsd4_open_deleg_none_ext(open, status);
- }
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
return;
out_free:
unhash_stid(&dp->dl_stid);
nfs4_put_delegation(dp);
out_no_deleg:
- flag = NFS4_OPEN_DELEGATE_NONE;
- goto out;
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
+ if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
+ open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) {
+ dprintk("NFSD: WARNING: refusing delegation reclaim\n");
+ open->op_recall = 1;
+ }
+
+ /* 4.1 client asking for a delegation? */
+ if (open->op_deleg_want)
+ nfsd4_open_deleg_none_ext(open, status);
+ return;
}
static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
@@ -3024,7 +3200,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
if (fp) {
if ((status = nfs4_check_open(fp, open, &stp)))
goto out;
- status = nfs4_check_deleg(cl, fp, open, &dp);
+ status = nfs4_check_deleg(cl, open, &dp);
if (status)
goto out;
} else {
@@ -3202,13 +3378,12 @@ nfs4_laundromat(struct nfsd_net *nn)
clientid_val = t;
break;
}
- if (atomic_read(&clp->cl_refcount)) {
+ if (mark_client_expired_locked(clp)) {
dprintk("NFSD: client in use (clientid %08x)\n",
clp->cl_clientid.cl_id);
continue;
}
- unhash_client_locked(clp);
- list_add(&clp->cl_lru, &reaplist);
+ list_move(&clp->cl_lru, &reaplist);
}
spin_unlock(&nn->client_lock);
list_for_each_safe(pos, next, &reaplist) {
@@ -3218,7 +3393,7 @@ nfs4_laundromat(struct nfsd_net *nn)
expire_client(clp);
}
spin_lock(&recall_lock);
- list_for_each_safe(pos, next, &del_recall_lru) {
+ list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn)
continue;
@@ -3233,7 +3408,7 @@ nfs4_laundromat(struct nfsd_net *nn)
spin_unlock(&recall_lock);
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- unhash_delegation(dp);
+ revoke_delegation(dp);
}
test_val = nn->nfsd4_lease;
list_for_each_safe(pos, next, &nn->close_lru) {
@@ -3276,16 +3451,6 @@ static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *s
return nfs_ok;
}
-static int
-STALE_STATEID(stateid_t *stateid, struct nfsd_net *nn)
-{
- if (stateid->si_opaque.so_clid.cl_boot == nn->boot_time)
- return 0;
- dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
- STATEID_VAL(stateid));
- return 1;
-}
-
static inline int
access_permit_read(struct nfs4_ol_stateid *stp)
{
@@ -3348,7 +3513,7 @@ grace_disallows_io(struct net *net, struct inode *inode)
/* Returns true iff a is later than b: */
static bool stateid_generation_after(stateid_t *a, stateid_t *b)
{
- return (s32)a->si_generation - (s32)b->si_generation > 0;
+ return (s32)(a->si_generation - b->si_generation) > 0;
}
static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
@@ -3402,13 +3567,24 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
status = check_stateid_generation(stateid, &s->sc_stateid, 1);
if (status)
return status;
- if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID)))
+ switch (s->sc_type) {
+ case NFS4_DELEG_STID:
+ return nfs_ok;
+ case NFS4_REVOKED_DELEG_STID:
+ return nfserr_deleg_revoked;
+ case NFS4_OPEN_STID:
+ case NFS4_LOCK_STID:
+ ols = openlockstateid(s);
+ if (ols->st_stateowner->so_is_open_owner
+ && !(openowner(ols->st_stateowner)->oo_flags
+ & NFS4_OO_CONFIRMED))
+ return nfserr_bad_stateid;
return nfs_ok;
- ols = openlockstateid(s);
- if (ols->st_stateowner->so_is_open_owner
- && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+ default:
+ printk("unknown stateid type %x\n", s->sc_type);
+ case NFS4_CLOSED_STID:
return nfserr_bad_stateid;
- return nfs_ok;
+ }
}
static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
@@ -3416,19 +3592,20 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
struct nfsd_net *nn)
{
struct nfs4_client *cl;
+ __be32 status;
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
return nfserr_bad_stateid;
- if (STALE_STATEID(stateid, nn))
+ status = lookup_clientid(&stateid->si_opaque.so_clid, sessions,
+ nn, &cl);
+ if (status == nfserr_stale_clientid)
return nfserr_stale_stateid;
- cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions, nn);
- if (!cl)
- return nfserr_expired;
+ if (status)
+ return status;
*s = find_stateid_by_type(cl, stateid, typemask);
if (!*s)
return nfserr_bad_stateid;
return nfs_ok;
-
}
/*
@@ -3538,6 +3715,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
stateid_t *stateid = &free_stateid->fr_stateid;
struct nfs4_stid *s;
+ struct nfs4_delegation *dp;
struct nfs4_client *cl = cstate->session->se_client;
__be32 ret = nfserr_bad_stateid;
@@ -3559,6 +3737,11 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
else
ret = nfserr_locks_held;
break;
+ case NFS4_REVOKED_DELEG_STID:
+ dp = delegstateid(s);
+ destroy_revoked_delegation(dp);
+ ret = nfs_ok;
+ break;
default:
ret = nfserr_bad_stateid;
}
@@ -3583,10 +3766,12 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
status = nfsd4_check_seqid(cstate, sop, seqid);
if (status)
return status;
- if (stp->st_stid.sc_type == NFS4_CLOSED_STID)
+ if (stp->st_stid.sc_type == NFS4_CLOSED_STID
+ || stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID)
/*
* "Closed" stateid's exist *only* to return
- * nfserr_replay_me from the previous step.
+ * nfserr_replay_me from the previous step, and
+ * revoked delegations are kept only for free_stateid.
*/
return nfserr_bad_stateid;
status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
@@ -3616,7 +3801,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
if (status)
return status;
*stpp = openlockstateid(s);
- cstate->replay_owner = (*stpp)->st_stateowner;
+ if (!nfsd4_has_session(cstate))
+ cstate->replay_owner = (*stpp)->st_stateowner;
return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
}
@@ -3674,6 +3860,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfsd4_client_record_create(oo->oo_owner.so_client);
status = nfs_ok;
out:
+ nfsd4_bump_seqid(cstate, status);
if (!cstate->replay_owner)
nfs4_unlock_state();
return status;
@@ -3757,31 +3944,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
status = nfs_ok;
out:
+ nfsd4_bump_seqid(cstate, status);
if (!cstate->replay_owner)
nfs4_unlock_state();
return status;
}
-void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so)
-{
- struct nfs4_openowner *oo;
- struct nfs4_ol_stateid *s;
-
- if (!so->so_is_open_owner)
- return;
- oo = openowner(so);
- s = oo->oo_last_closed_stid;
- if (!s)
- return;
- if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) {
- /* Release the last_closed_stid on the next seqid bump: */
- oo->oo_flags |= NFS4_OO_PURGE_CLOSE;
- return;
- }
- oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE;
- release_last_closed_stateid(oo);
-}
-
static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
{
unhash_open_stateid(s);
@@ -3810,28 +3978,30 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&close->cl_stateid,
NFS4_OPEN_STID|NFS4_CLOSED_STID,
&stp, nn);
+ nfsd4_bump_seqid(cstate, status);
if (status)
goto out;
oo = openowner(stp->st_stateowner);
- status = nfs_ok;
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
nfsd4_close_open_stateid(stp);
- release_last_closed_stateid(oo);
- oo->oo_last_closed_stid = stp;
+
+ if (cstate->minorversion) {
+ unhash_stid(&stp->st_stid);
+ free_generic_stateid(stp);
+ } else
+ oo->oo_last_closed_stid = stp;
if (list_empty(&oo->oo_owner.so_stateids)) {
- if (cstate->minorversion) {
+ if (cstate->minorversion)
release_openowner(oo);
- cstate->replay_owner = NULL;
- } else {
+ else {
/*
* In the 4.0 case we need to keep the owners around a
* little while to handle CLOSE replay.
*/
- if (list_empty(&oo->oo_owner.so_stateids))
- move_to_close_lru(oo, SVC_NET(rqstp));
+ move_to_close_lru(oo, SVC_NET(rqstp));
}
}
out:
@@ -3863,7 +4033,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
- unhash_delegation(dp);
+ destroy_delegation(dp);
out:
nfs4_unlock_state();
@@ -4241,6 +4411,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
out:
if (status && new_state)
release_lockowner(lock_sop);
+ nfsd4_bump_seqid(cstate, status);
if (!cstate->replay_owner)
nfs4_unlock_state();
if (file_lock)
@@ -4395,21 +4566,16 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
locku->lu_length);
nfs4_transform_lock_offset(file_lock);
- /*
- * Try to unlock the file in the VFS.
- */
err = vfs_lock_file(filp, F_SETLK, file_lock, NULL);
if (err) {
dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
goto out_nfserr;
}
- /*
- * OK, unlock succeeded; the only thing left to do is update the stateid.
- */
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
out:
+ nfsd4_bump_seqid(cstate, status);
if (!cstate->replay_owner)
nfs4_unlock_state();
if (file_lock)
@@ -4433,7 +4599,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
struct inode *inode = filp->fi_inode;
int status = 0;
- lock_flocks();
+ spin_lock(&inode->i_lock);
for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
status = 1;
@@ -4441,7 +4607,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
}
}
out:
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
return status;
}
@@ -4602,6 +4768,8 @@ nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn)
u64 nfsd_forget_client(struct nfs4_client *clp, u64 max)
{
+ if (mark_client_expired(clp))
+ return 0;
expire_client(clp);
return 1;
}
@@ -4708,7 +4876,7 @@ u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max)
spin_unlock(&recall_lock);
list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
- unhash_delegation(dp);
+ revoke_delegation(dp);
return count;
}
@@ -4780,12 +4948,6 @@ struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_
void
nfs4_state_init(void)
{
- int i;
-
- for (i = 0; i < FILE_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&file_hashtbl[i]);
- }
- INIT_LIST_HEAD(&del_recall_lru);
}
/*
@@ -4849,6 +5011,7 @@ static int nfs4_state_create_net(struct net *net)
nn->unconf_name_tree = RB_ROOT;
INIT_LIST_HEAD(&nn->client_lru);
INIT_LIST_HEAD(&nn->close_lru);
+ INIT_LIST_HEAD(&nn->del_recall_lru);
spin_lock_init(&nn->client_lock);
INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
@@ -4961,16 +5124,14 @@ nfs4_state_shutdown_net(struct net *net)
INIT_LIST_HEAD(&reaplist);
spin_lock(&recall_lock);
- list_for_each_safe(pos, next, &del_recall_lru) {
+ list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- if (dp->dl_stid.sc_client->net != net)
- continue;
list_move(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&recall_lock);
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- unhash_delegation(dp);
+ destroy_delegation(dp);
}
nfsd4_client_tracking_exit(net);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index a2720071f282..0c0f3ea90de5 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -55,6 +55,11 @@
#include "cache.h"
#include "netns.h"
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#include <linux/security.h>
+#endif
+
+
#define NFSDDBG_FACILITY NFSDDBG_XDR
/*
@@ -134,6 +139,19 @@ xdr_error: \
} \
} while (0)
+static void next_decode_page(struct nfsd4_compoundargs *argp)
+{
+ argp->pagelist++;
+ argp->p = page_address(argp->pagelist[0]);
+ if (argp->pagelen < PAGE_SIZE) {
+ argp->end = argp->p + (argp->pagelen>>2);
+ argp->pagelen = 0;
+ } else {
+ argp->end = argp->p + (PAGE_SIZE>>2);
+ argp->pagelen -= PAGE_SIZE;
+ }
+}
+
static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
{
/* We want more bytes than seem to be available.
@@ -161,16 +179,7 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
* guarantee p points to at least nbytes bytes.
*/
memcpy(p, argp->p, avail);
- /* step to next page */
- argp->p = page_address(argp->pagelist[0]);
- argp->pagelist++;
- if (argp->pagelen < PAGE_SIZE) {
- argp->end = argp->p + (argp->pagelen>>2);
- argp->pagelen = 0;
- } else {
- argp->end = argp->p + (PAGE_SIZE>>2);
- argp->pagelen -= PAGE_SIZE;
- }
+ next_decode_page(argp);
memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
argp->p += XDR_QUADLEN(nbytes - avail);
return p;
@@ -242,7 +251,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
static __be32
nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
- struct iattr *iattr, struct nfs4_acl **acl)
+ struct iattr *iattr, struct nfs4_acl **acl,
+ struct xdr_netobj *label)
{
int expected_len, len = 0;
u32 dummy32;
@@ -344,10 +354,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
all 32 bits of 'nseconds'. */
READ_BUF(12);
len += 12;
- READ32(dummy32);
- if (dummy32)
- return nfserr_inval;
- READ32(iattr->ia_atime.tv_sec);
+ READ64(iattr->ia_atime.tv_sec);
READ32(iattr->ia_atime.tv_nsec);
if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
return nfserr_inval;
@@ -370,10 +377,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
all 32 bits of 'nseconds'. */
READ_BUF(12);
len += 12;
- READ32(dummy32);
- if (dummy32)
- return nfserr_inval;
- READ32(iattr->ia_mtime.tv_sec);
+ READ64(iattr->ia_mtime.tv_sec);
READ32(iattr->ia_mtime.tv_nsec);
if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
return nfserr_inval;
@@ -386,6 +390,32 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
goto xdr_error;
}
}
+
+ label->len = 0;
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+ READ_BUF(4);
+ len += 4;
+ READ32(dummy32); /* lfs: we don't use it */
+ READ_BUF(4);
+ len += 4;
+ READ32(dummy32); /* pi: we don't use it either */
+ READ_BUF(4);
+ len += 4;
+ READ32(dummy32);
+ READ_BUF(dummy32);
+ if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN)
+ return nfserr_badlabel;
+ len += (XDR_QUADLEN(dummy32) << 2);
+ READMEM(buf, dummy32);
+ label->data = kzalloc(dummy32 + 1, GFP_KERNEL);
+ if (!label->data)
+ return nfserr_jukebox;
+ defer_free(argp, kfree, label->data);
+ memcpy(label->data, buf, dummy32);
+ }
+#endif
+
if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
|| bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
|| bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2)
@@ -434,7 +464,11 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
/* callback_sec_params4 */
READ_BUF(4);
READ32(nr_secflavs);
- cbs->flavor = (u32)(-1);
+ if (nr_secflavs)
+ cbs->flavor = (u32)(-1);
+ else
+ /* Is this legal? Be generous, take it to mean AUTH_NONE: */
+ cbs->flavor = 0;
for (i = 0; i < nr_secflavs; ++i) {
READ_BUF(4);
READ32(dummy);
@@ -582,7 +616,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
return status;
status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
- &create->cr_acl);
+ &create->cr_acl, &create->cr_label);
if (status)
goto out;
@@ -804,6 +838,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
open->op_iattr.ia_valid = 0;
open->op_openowner = NULL;
+ open->op_xdr_error = 0;
/* seqid, share_access, share_deny, clientid, ownerlen */
READ_BUF(4);
READ32(open->op_seqid);
@@ -832,7 +867,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
case NFS4_CREATE_UNCHECKED:
case NFS4_CREATE_GUARDED:
status = nfsd4_decode_fattr(argp, open->op_bmval,
- &open->op_iattr, &open->op_acl);
+ &open->op_iattr, &open->op_acl, &open->op_label);
if (status)
goto out;
break;
@@ -846,7 +881,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
READ_BUF(NFS4_VERIFIER_SIZE);
COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
status = nfsd4_decode_fattr(argp, open->op_bmval,
- &open->op_iattr, &open->op_acl);
+ &open->op_iattr, &open->op_acl, &open->op_label);
if (status)
goto out;
break;
@@ -1068,7 +1103,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
if (status)
return status;
return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
- &setattr->sa_acl);
+ &setattr->sa_acl, &setattr->sa_label);
}
static __be32
@@ -1572,6 +1607,7 @@ struct nfsd4_minorversion_ops {
static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
[0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
[1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
+ [2] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
};
static __be32
@@ -1692,36 +1728,6 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
} while (0)
#define ADJUST_ARGS() resp->p = p
-/*
- * Header routine to setup seqid operation replay cache
- */
-#define ENCODE_SEQID_OP_HEAD \
- __be32 *save; \
- \
- save = resp->p;
-
-/*
- * Routine for encoding the result of a "seqid-mutating" NFSv4 operation. This
- * is where sequence id's are incremented, and the replay cache is filled.
- * Note that we increment sequence id's here, at the last moment, so we're sure
- * we know whether the error to be returned is a sequence id mutating error.
- */
-
-static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr)
-{
- struct nfs4_stateowner *stateowner = resp->cstate.replay_owner;
-
- if (seqid_mutating_err(ntohl(nfserr)) && stateowner) {
- stateowner->so_seqid++;
- stateowner->so_replay.rp_status = nfserr;
- stateowner->so_replay.rp_buflen =
- (char *)resp->p - (char *)save;
- memcpy(stateowner->so_replay.rp_buf, save,
- stateowner->so_replay.rp_buflen);
- nfsd4_purge_closed_stateid(stateowner);
- }
-}
-
/* Encode as an array of strings the string given with components
* separated @sep, escaped with esc_enter and esc_exit.
*/
@@ -1988,6 +1994,36 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
FATTR4_WORD0_RDATTR_ERROR)
#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+static inline __be32
+nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+{
+ __be32 *p = *pp;
+
+ if (*buflen < ((XDR_QUADLEN(len) << 2) + 4 + 4 + 4))
+ return nfserr_resource;
+
+ /*
+ * For now we use a 0 here to indicate the null translation; in
+ * the future we may place a call to translation code here.
+ */
+ if ((*buflen -= 8) < 0)
+ return nfserr_resource;
+
+ WRITE32(0); /* lfs */
+ WRITE32(0); /* pi */
+ p = xdr_encode_opaque(p, context, len);
+ *buflen -= (XDR_QUADLEN(len) << 2) + 4;
+
+ *pp = p;
+ return 0;
+}
+#else
+static inline __be32
+nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+{ return 0; }
+#endif
+
static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
{
/* As per referral draft: */
@@ -2047,6 +2083,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
int err;
int aclsupport = 0;
struct nfs4_acl *acl = NULL;
+ void *context = NULL;
+ int contextlen;
+ bool contextsupport = false;
struct nfsd4_compoundres *resp = rqstp->rq_resp;
u32 minorversion = resp->cstate.minorversion;
struct path path = {
@@ -2100,6 +2139,21 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
}
}
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) ||
+ bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
+ err = security_inode_getsecctx(dentry->d_inode,
+ &context, &contextlen);
+ contextsupport = (err == 0);
+ if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+ if (err == -EOPNOTSUPP)
+ bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+ else if (err)
+ goto out_nfserr;
+ }
+ }
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
+
if (bmval2) {
if ((buflen -= 16) < 0)
goto out_resource;
@@ -2128,6 +2182,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
if (!aclsupport)
word0 &= ~FATTR4_WORD0_ACL;
+ if (!contextsupport)
+ word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
if (!word2) {
if ((buflen -= 12) < 0)
goto out_resource;
@@ -2401,8 +2457,7 @@ out_acl:
if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
if ((buflen -= 12) < 0)
goto out_resource;
- WRITE32(0);
- WRITE32(stat.atime.tv_sec);
+ WRITE64((s64)stat.atime.tv_sec);
WRITE32(stat.atime.tv_nsec);
}
if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
@@ -2415,15 +2470,13 @@ out_acl:
if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
if ((buflen -= 12) < 0)
goto out_resource;
- WRITE32(0);
- WRITE32(stat.ctime.tv_sec);
+ WRITE64((s64)stat.ctime.tv_sec);
WRITE32(stat.ctime.tv_nsec);
}
if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
if ((buflen -= 12) < 0)
goto out_resource;
- WRITE32(0);
- WRITE32(stat.mtime.tv_sec);
+ WRITE64((s64)stat.mtime.tv_sec);
WRITE32(stat.mtime.tv_nsec);
}
if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
@@ -2438,6 +2491,12 @@ out_acl:
get_parent_attributes(exp, &stat);
WRITE64(stat.ino);
}
+ if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+ status = nfsd4_encode_security_label(rqstp, context,
+ contextlen, &p, &buflen);
+ if (status)
+ goto out;
+ }
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
WRITE32(3);
WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
@@ -2450,6 +2509,10 @@ out_acl:
status = nfs_ok;
out:
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ if (context)
+ security_release_secctx(context, contextlen);
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
kfree(acl);
if (fhp == &tempfh)
fh_put(&tempfh);
@@ -2661,12 +2724,9 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
static __be32
nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
{
- ENCODE_SEQID_OP_HEAD;
-
if (!nfserr)
nfsd4_encode_stateid(resp, &close->cl_stateid);
- encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2762,14 +2822,11 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie
static __be32
nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
{
- ENCODE_SEQID_OP_HEAD;
-
if (!nfserr)
nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
else if (nfserr == nfserr_denied)
nfsd4_encode_lock_denied(resp, &lock->lk_denied);
- encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2784,12 +2841,9 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
static __be32
nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
{
- ENCODE_SEQID_OP_HEAD;
-
if (!nfserr)
nfsd4_encode_stateid(resp, &locku->lu_stateid);
- encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -2812,7 +2866,6 @@ static __be32
nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
{
__be32 *p;
- ENCODE_SEQID_OP_HEAD;
if (nfserr)
goto out;
@@ -2884,31 +2937,24 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
}
/* XXX save filehandle here */
out:
- encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
static __be32
nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
{
- ENCODE_SEQID_OP_HEAD;
-
if (!nfserr)
nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
- encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
static __be32
nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
{
- ENCODE_SEQID_OP_HEAD;
-
if (!nfserr)
nfsd4_encode_stateid(resp, &od->od_stateid);
- encode_seqid_op_tail(resp, save, nfserr);
return nfserr;
}
@@ -3138,13 +3184,13 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
static __be32
nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
- __be32 nfserr,struct svc_export *exp)
+ __be32 nfserr, struct svc_export *exp)
{
- int i = 0;
- u32 nflavs;
+ u32 i, nflavs, supported;
struct exp_flavor_info *flavs;
struct exp_flavor_info def_flavs[2];
- __be32 *p;
+ __be32 *p, *flavorsp;
+ static bool report = true;
if (nfserr)
goto out;
@@ -3168,34 +3214,40 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
}
}
+ supported = 0;
RESERVE_SPACE(4);
- WRITE32(nflavs);
+ flavorsp = p++; /* to be backfilled later */
ADJUST_ARGS();
+
for (i = 0; i < nflavs; i++) {
- u32 flav = flavs[i].pseudoflavor;
- struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav);
+ rpc_authflavor_t pf = flavs[i].pseudoflavor;
+ struct rpcsec_gss_info info;
- if (gm) {
- RESERVE_SPACE(4);
+ if (rpcauth_get_gssinfo(pf, &info) == 0) {
+ supported++;
+ RESERVE_SPACE(4 + 4 + info.oid.len + 4 + 4);
WRITE32(RPC_AUTH_GSS);
+ WRITE32(info.oid.len);
+ WRITEMEM(info.oid.data, info.oid.len);
+ WRITE32(info.qop);
+ WRITE32(info.service);
ADJUST_ARGS();
- RESERVE_SPACE(4 + gm->gm_oid.len);
- WRITE32(gm->gm_oid.len);
- WRITEMEM(gm->gm_oid.data, gm->gm_oid.len);
- ADJUST_ARGS();
+ } else if (pf < RPC_AUTH_MAXFLAVOR) {
+ supported++;
RESERVE_SPACE(4);
- WRITE32(0); /* qop */
+ WRITE32(pf);
ADJUST_ARGS();
- RESERVE_SPACE(4);
- WRITE32(gss_pseudoflavor_to_service(gm, flav));
- ADJUST_ARGS();
- gss_mech_put(gm);
} else {
- RESERVE_SPACE(4);
- WRITE32(flav);
- ADJUST_ARGS();
+ if (report)
+ pr_warn("NFS: SECINFO: security flavor %u "
+ "is not supported\n", pf);
}
}
+
+ if (nflavs != supported)
+ report = false;
+ *flavorsp = htonl(supported);
+
out:
if (exp)
exp_put(exp);
@@ -3225,16 +3277,18 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
{
__be32 *p;
- RESERVE_SPACE(12);
+ RESERVE_SPACE(16);
if (nfserr) {
- WRITE32(2);
+ WRITE32(3);
+ WRITE32(0);
WRITE32(0);
WRITE32(0);
}
else {
- WRITE32(2);
+ WRITE32(3);
WRITE32(setattr->sa_bmval[0]);
WRITE32(setattr->sa_bmval[1]);
+ WRITE32(setattr->sa_bmval[2]);
}
ADJUST_ARGS();
return nfserr;
@@ -3275,6 +3329,14 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
return nfserr;
}
+static const u32 nfs4_minimal_spo_must_enforce[2] = {
+ [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+ 1 << (OP_EXCHANGE_ID - 32) |
+ 1 << (OP_CREATE_SESSION - 32) |
+ 1 << (OP_DESTROY_SESSION - 32) |
+ 1 << (OP_DESTROY_CLIENTID - 32)
+};
+
static __be32
nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_exchange_id *exid)
@@ -3313,6 +3375,20 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
/* state_protect4_r. Currently only support SP4_NONE */
BUG_ON(exid->spa_how != SP4_NONE);
WRITE32(exid->spa_how);
+ switch (exid->spa_how) {
+ case SP4_NONE:
+ break;
+ case SP4_MACH_CRED:
+ /* spo_must_enforce bitmap: */
+ WRITE32(2);
+ WRITE32(nfs4_minimal_spo_must_enforce[0]);
+ WRITE32(nfs4_minimal_spo_must_enforce[1]);
+ /* empty spo_must_allow bitmap: */
+ WRITE32(0);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
/* The server_owner struct */
WRITE64(minor_id); /* Minor id */
@@ -3566,6 +3642,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
void
nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
{
+ struct nfs4_stateowner *so = resp->cstate.replay_owner;
__be32 *statp;
__be32 *p;
@@ -3582,6 +3659,11 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
/* nfsd4_check_drc_limit guarantees enough room for error status */
if (!op->status)
op->status = nfsd4_check_resp_size(resp, 0);
+ if (so) {
+ so->so_replay.rp_status = op->status;
+ so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1);
+ memcpy(so->so_replay.rp_buf, statp+1, so->so_replay.rp_buflen);
+ }
status:
/*
* Note: We write the status directly, instead of using WRITE32(),
@@ -3678,13 +3760,17 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
BUG_ON(iov->iov_len > PAGE_SIZE);
if (nfsd4_has_session(cs)) {
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfs4_client *clp = cs->session->se_client;
if (cs->status != nfserr_replay_cache) {
nfsd4_store_cache_entry(resp);
cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
}
/* Renew the clientid on success and on replay */
- release_session_client(cs->session);
+ spin_lock(&nn->client_lock);
nfsd4_put_session(cs->session);
+ spin_unlock(&nn->client_lock);
+ put_client_renew(clp);
}
return 1;
}
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index ca05f6dc3544..e76244edd748 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -11,6 +11,8 @@
#include <linux/slab.h>
#include <linux/sunrpc/addr.h>
#include <linux/highmem.h>
+#include <linux/log2.h>
+#include <linux/hash.h>
#include <net/checksum.h>
#include "nfsd.h"
@@ -18,30 +20,49 @@
#define NFSDDBG_FACILITY NFSDDBG_REPCACHE
-#define HASHSIZE 64
+/*
+ * We use this value to determine the number of hash buckets from the max
+ * cache size, the idea being that when the cache is at its maximum number
+ * of entries, then this should be the average number of entries per bucket.
+ */
+#define TARGET_BUCKET_SIZE 64
static struct hlist_head * cache_hash;
static struct list_head lru_head;
static struct kmem_cache *drc_slab;
-static unsigned int num_drc_entries;
+
+/* max number of entries allowed in the cache */
static unsigned int max_drc_entries;
+/* number of significant bits in the hash value */
+static unsigned int maskbits;
+
/*
- * Calculate the hash index from an XID.
+ * Stats and other tracking of the duplicate reply cache. All of these and
+ * the "rc" fields in nfsdstats are protected by the cache_lock.
*/
-static inline u32 request_hash(u32 xid)
-{
- u32 h = xid;
- h ^= (xid >> 24);
- return h & (HASHSIZE-1);
-}
+
+/* total number of entries */
+static unsigned int num_drc_entries;
+
+/* cache misses due only to checksum comparison failures */
+static unsigned int payload_misses;
+
+/* amount of memory (in bytes) currently consumed by the DRC */
+static unsigned int drc_mem_usage;
+
+/* longest hash chain seen */
+static unsigned int longest_chain;
+
+/* size of cache when we saw the longest hash chain */
+static unsigned int longest_chain_cachesize;
static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
static void cache_cleaner_func(struct work_struct *unused);
static int nfsd_reply_cache_shrink(struct shrinker *shrink,
struct shrink_control *sc);
-struct shrinker nfsd_reply_cache_shrinker = {
+static struct shrinker nfsd_reply_cache_shrinker = {
.shrink = nfsd_reply_cache_shrink,
.seeks = 1,
};
@@ -82,6 +103,16 @@ nfsd_cache_size_limit(void)
return min_t(unsigned int, limit, 256*1024);
}
+/*
+ * Compute the number of hash buckets we need. Divide the max cachesize by
+ * the "target" max bucket size, and round up to next power of two.
+ */
+static unsigned int
+nfsd_hashsize(unsigned int limit)
+{
+ return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
+}
+
static struct svc_cacherep *
nfsd_reply_cache_alloc(void)
{
@@ -100,12 +131,15 @@ nfsd_reply_cache_alloc(void)
static void
nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
{
- if (rp->c_type == RC_REPLBUFF)
+ if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
+ drc_mem_usage -= rp->c_replvec.iov_len;
kfree(rp->c_replvec.iov_base);
+ }
if (!hlist_unhashed(&rp->c_hash))
hlist_del(&rp->c_hash);
list_del(&rp->c_lru);
--num_drc_entries;
+ drc_mem_usage -= sizeof(*rp);
kmem_cache_free(drc_slab, rp);
}
@@ -119,9 +153,13 @@ nfsd_reply_cache_free(struct svc_cacherep *rp)
int nfsd_reply_cache_init(void)
{
+ unsigned int hashsize;
+
INIT_LIST_HEAD(&lru_head);
max_drc_entries = nfsd_cache_size_limit();
num_drc_entries = 0;
+ hashsize = nfsd_hashsize(max_drc_entries);
+ maskbits = ilog2(hashsize);
register_shrinker(&nfsd_reply_cache_shrinker);
drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep),
@@ -129,7 +167,7 @@ int nfsd_reply_cache_init(void)
if (!drc_slab)
goto out_nomem;
- cache_hash = kcalloc(HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
+ cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL);
if (!cache_hash)
goto out_nomem;
@@ -180,7 +218,7 @@ static void
hash_refile(struct svc_cacherep *rp)
{
hlist_del_init(&rp->c_hash);
- hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));
+ hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits));
}
static inline bool
@@ -273,6 +311,26 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
return csum;
}
+static bool
+nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
+{
+ /*
+  * Check RPC header info first: XID, procedure, protocol, version,
+  * argument length, and the client's address and port. A mismatch
+  * on any of these is a plain miss and is not counted in
+  * payload_misses.
+  */
+ if (rqstp->rq_xid != rp->c_xid || rqstp->rq_proc != rp->c_proc ||
+ rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers ||
+ rqstp->rq_arg.len != rp->c_len ||
+ !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
+ rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
+ return false;
+
+ /*
+  * compare checksum of NFS data. Only a failure here bumps
+  * payload_misses, i.e. the header matched but the payload did not.
+  * NOTE(review): payload_misses is updated without taking a lock
+  * here, so callers presumably hold cache_lock -- confirm.
+  */
+ if (csum != rp->c_csum) {
+ ++payload_misses;
+ return false;
+ }
+
+ return true;
+}
+
/*
* Search the request hash for an entry that matches the given rqstp.
* Must be called with cache_lock held. Returns the found entry or
@@ -281,23 +339,30 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
static struct svc_cacherep *
nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
{
- struct svc_cacherep *rp;
+ struct svc_cacherep *rp, *ret = NULL;
struct hlist_head *rh;
- __be32 xid = rqstp->rq_xid;
- u32 proto = rqstp->rq_prot,
- vers = rqstp->rq_vers,
- proc = rqstp->rq_proc;
+ unsigned int entries = 0;
- rh = &cache_hash[request_hash(xid)];
+ rh = &cache_hash[hash_32(rqstp->rq_xid, maskbits)];
hlist_for_each_entry(rp, rh, c_hash) {
- if (xid == rp->c_xid && proc == rp->c_proc &&
- proto == rp->c_prot && vers == rp->c_vers &&
- rqstp->rq_arg.len == rp->c_len && csum == rp->c_csum &&
- rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) &&
- rpc_get_port(svc_addr(rqstp)) == rpc_get_port((struct sockaddr *)&rp->c_addr))
- return rp;
+ ++entries;
+ if (nfsd_cache_match(rqstp, csum, rp)) {
+ ret = rp;
+ break;
+ }
}
- return NULL;
+
+ /* tally hash chain length stats */
+ if (entries > longest_chain) {
+ longest_chain = entries;
+ longest_chain_cachesize = num_drc_entries;
+ } else if (entries == longest_chain) {
+ /* prefer to keep the smallest cachesize possible here */
+ longest_chain_cachesize = min(longest_chain_cachesize,
+ num_drc_entries);
+ }
+
+ return ret;
}
/*
@@ -318,55 +383,55 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
__wsum csum;
unsigned long age;
int type = rqstp->rq_cachetype;
- int rtn;
+ int rtn = RC_DOIT;
rqstp->rq_cacherep = NULL;
if (type == RC_NOCACHE) {
nfsdstats.rcnocache++;
- return RC_DOIT;
+ return rtn;
}
csum = nfsd_cache_csum(rqstp);
+ /*
+ * Since the common case is a cache miss followed by an insert,
+ * preallocate an entry. First, try to reuse the first entry on the LRU
+ * if it works, then go ahead and prune the LRU list.
+ */
spin_lock(&cache_lock);
- rtn = RC_DOIT;
-
- rp = nfsd_cache_search(rqstp, csum);
- if (rp)
- goto found_entry;
-
- /* Try to use the first entry on the LRU */
if (!list_empty(&lru_head)) {
rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru);
if (nfsd_cache_entry_expired(rp) ||
num_drc_entries >= max_drc_entries) {
lru_put_end(rp);
prune_cache_entries();
- goto setup_entry;
+ goto search_cache;
}
}
- /* Drop the lock and allocate a new entry */
+ /* No expired ones available, allocate a new one. */
spin_unlock(&cache_lock);
rp = nfsd_reply_cache_alloc();
- if (!rp) {
- dprintk("nfsd: unable to allocate DRC entry!\n");
- return RC_DOIT;
- }
spin_lock(&cache_lock);
- ++num_drc_entries;
+ if (likely(rp)) {
+ ++num_drc_entries;
+ drc_mem_usage += sizeof(*rp);
+ }
- /*
- * Must search again just in case someone inserted one
- * after we dropped the lock above.
- */
+search_cache:
found = nfsd_cache_search(rqstp, csum);
if (found) {
- nfsd_reply_cache_free_locked(rp);
+ if (likely(rp))
+ nfsd_reply_cache_free_locked(rp);
rp = found;
goto found_entry;
}
+ if (!rp) {
+ dprintk("nfsd: unable to allocate DRC entry!\n");
+ goto out;
+ }
+
/*
* We're keeping the one we just allocated. Are we now over the
* limit? Prune one off the tip of the LRU in trade for the one we
@@ -376,7 +441,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
nfsd_reply_cache_free_locked(list_first_entry(&lru_head,
struct svc_cacherep, c_lru));
-setup_entry:
nfsdstats.rcmisses++;
rqstp->rq_cacherep = rp;
rp->c_state = RC_INPROG;
@@ -394,6 +458,7 @@ setup_entry:
/* release any buffer */
if (rp->c_type == RC_REPLBUFF) {
+ drc_mem_usage -= rp->c_replvec.iov_len;
kfree(rp->c_replvec.iov_base);
rp->c_replvec.iov_base = NULL;
}
@@ -462,6 +527,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
struct svc_cacherep *rp = rqstp->rq_cacherep;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
int len;
+ size_t bufsize = 0;
if (!rp)
return;
@@ -483,19 +549,21 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
break;
case RC_REPLBUFF:
cachv = &rp->c_replvec;
- cachv->iov_base = kmalloc(len << 2, GFP_KERNEL);
+ bufsize = len << 2;
+ cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
if (!cachv->iov_base) {
nfsd_reply_cache_free(rp);
return;
}
- cachv->iov_len = len << 2;
- memcpy(cachv->iov_base, statp, len << 2);
+ cachv->iov_len = bufsize;
+ memcpy(cachv->iov_base, statp, bufsize);
break;
case RC_NOCACHE:
nfsd_reply_cache_free(rp);
return;
}
spin_lock(&cache_lock);
+ drc_mem_usage += bufsize;
lru_put_end(rp);
rp->c_secure = rqstp->rq_secure;
rp->c_type = cachetype;
@@ -523,3 +591,30 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
vec->iov_len += data->iov_len;
return 1;
}
+
+/*
+ * seq_file show routine for the reply cache statistics: prints one
+ * "label: value" line per counter and always returns 0.
+ *
+ * All values are printed under cache_lock so they are read together.
+ * The bucket count is computed as an unsigned shift so that the argument
+ * type matches the %u conversion.
+ *
+ * Note that fields may be added, removed or reordered in the future. Programs
+ * scraping this file for info should test the labels to ensure they're
+ * getting the correct field.
+ */
+static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
+{
+ spin_lock(&cache_lock);
+ seq_printf(m, "max entries: %u\n", max_drc_entries);
+ seq_printf(m, "num entries: %u\n", num_drc_entries);
+ seq_printf(m, "hash buckets: %u\n", 1U << maskbits);
+ seq_printf(m, "mem usage: %u\n", drc_mem_usage);
+ seq_printf(m, "cache hits: %u\n", nfsdstats.rchits);
+ seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses);
+ seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache);
+ seq_printf(m, "payload misses: %u\n", payload_misses);
+ seq_printf(m, "longest chain len: %u\n", longest_chain);
+ seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize);
+ spin_unlock(&cache_lock);
+ return 0;
+}
+
+int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, nfsd_reply_cache_stats_show, NULL);
+}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index f33455b4d957..7f555179bf81 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -35,6 +35,7 @@ enum {
NFSD_Threads,
NFSD_Pool_Threads,
NFSD_Pool_Stats,
+ NFSD_Reply_Cache_Stats,
NFSD_Versions,
NFSD_Ports,
NFSD_MaxBlkSize,
@@ -177,7 +178,7 @@ static int export_features_open(struct inode *inode, struct file *file)
return single_open(file, export_features_show, NULL);
}
-static struct file_operations export_features_operations = {
+static const struct file_operations export_features_operations = {
.open = export_features_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -196,7 +197,7 @@ static int supported_enctypes_open(struct inode *inode, struct file *file)
return single_open(file, supported_enctypes_show, NULL);
}
-static struct file_operations supported_enctypes_ops = {
+static const struct file_operations supported_enctypes_ops = {
.open = supported_enctypes_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -212,6 +213,13 @@ static const struct file_operations pool_stats_operations = {
.owner = THIS_MODULE,
};
+/*
+ * File operations for the "reply_cache_stats" file: a single_open() seq_file
+ * with no write method. The table is never modified, so it is const like the
+ * other seq_file operation tables in this file.
+ */
+static const struct file_operations reply_cache_stats_operations = {
+ .open = nfsd_reply_cache_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
/*----------------------------------------------------------------------------*/
/*
* payload - write methods
@@ -1047,6 +1055,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
+ [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO},
[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
@@ -1102,8 +1111,10 @@ static int create_proc_exports_entry(void)
return -ENOMEM;
entry = proc_create("exports", 0, entry,
&exports_proc_operations);
- if (!entry)
+ if (!entry) {
+ remove_proc_entry("fs/nfs", NULL);
return -ENOMEM;
+ }
return 0;
}
#else /* CONFIG_PROC_FS */
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 07a473fd49bc..2bbd94e51efc 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -24,7 +24,7 @@
/*
* nfsd version
*/
-#define NFSD_SUPPORTED_MINOR_VERSION 1
+#define NFSD_SUPPORTED_MINOR_VERSION 2
/*
* Maximum blocksizes supported by daemon under various circumstances.
*/
@@ -243,6 +243,12 @@ void nfsd_lockd_shutdown(void);
#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG)
#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT)
#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED)
+#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
+#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
+#define nfserr_metadata_notsupp cpu_to_be32(NFS4ERR_METADATA_NOTSUPP)
+#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
+#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS)
+#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL)
/* error codes for internal use */
/* if a request fails due to kmalloc failure, it gets dropped.
@@ -322,6 +328,13 @@ void nfsd_lockd_shutdown(void);
#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+/*
+ * Word 2 of the supported-attribute bitmap for minorversion 2 and up:
+ * everything 4.1 supports, plus the security label attribute when label
+ * support is configured. Without label support the 4.1 set must still be
+ * advertised; defining this as 0 would make nfsd_suppattrs2() report no
+ * word-2 attributes at all (including FATTR4_WORD2_SUPPATTR_EXCLCREAT).
+ */
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
+ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL)
+#else
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 NFSD4_1_SUPPORTED_ATTRS_WORD2
+#endif
+
static inline u32 nfsd_suppattrs0(u32 minorversion)
{
return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
@@ -336,8 +349,11 @@ static inline u32 nfsd_suppattrs1(u32 minorversion)
static inline u32 nfsd_suppattrs2(u32 minorversion)
{
- return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2
- : NFSD4_SUPPORTED_ATTRS_WORD2;
+ switch (minorversion) {
+ default: return NFSD4_2_SUPPORTED_ATTRS_WORD2;
+ case 1: return NFSD4_1_SUPPORTED_ATTRS_WORD2;
+ case 0: return NFSD4_SUPPORTED_ATTRS_WORD2;
+ }
}
/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
@@ -350,7 +366,11 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
#define NFSD_WRITEABLE_ATTRS_WORD1 \
(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
| FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL
+#else
#define NFSD_WRITEABLE_ATTRS_WORD2 0
+#endif
#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
NFSD_WRITEABLE_ATTRS_WORD0
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 262df5ccbf59..6b9f48ca4c25 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -116,7 +116,7 @@ struct svc_program nfsd_program = {
};
-u32 nfsd_supported_minorversion;
+u32 nfsd_supported_minorversion = 1;
int nfsd_vers(int vers, enum vers_op change)
{
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 1a8c7391f7ae..424d8f5f2317 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -79,6 +79,8 @@ struct nfs4_stid {
#define NFS4_DELEG_STID 4
/* For an open stateid kept around *only* to process close replays: */
#define NFS4_CLOSED_STID 8
+/* For a deleg stateid kept around only to process free_stateid's: */
+#define NFS4_REVOKED_DELEG_STID 16
unsigned char sc_type;
stateid_t sc_stateid;
struct nfs4_client *sc_client;
@@ -194,9 +196,11 @@ struct nfsd4_conn {
};
struct nfsd4_session {
- struct kref se_ref;
+ atomic_t se_ref;
struct list_head se_hash; /* hash by sessionid */
struct list_head se_perclnt;
+/* See SESSION4_PERSIST, etc. for standard flags; this is internal-only: */
+#define NFS4_SESSION_DEAD 0x010
u32 se_flags;
struct nfs4_client *se_client;
struct nfs4_sessionid se_sessionid;
@@ -236,11 +240,13 @@ struct nfs4_client {
struct list_head cl_openowners;
struct idr cl_stateids; /* stateid lookup */
struct list_head cl_delegations;
+ struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */
struct list_head cl_lru; /* tail queue */
struct xdr_netobj cl_name; /* id generated by client */
nfs4_verifier cl_verifier; /* generated by client */
time_t cl_time; /* time of last lease renewal */
struct sockaddr_storage cl_addr; /* client ipaddress */
+ bool cl_mach_cred; /* SP4_MACH_CRED in force */
struct svc_cred cl_cred; /* setclientid principal */
clientid_t cl_clientid; /* generated by server */
nfs4_verifier cl_confirm; /* generated by server */
@@ -286,18 +292,6 @@ struct nfs4_client {
struct net *net;
};
-static inline void
-mark_client_expired(struct nfs4_client *clp)
-{
- clp->cl_time = 0;
-}
-
-static inline bool
-is_client_expired(struct nfs4_client *clp)
-{
- return clp->cl_time == 0;
-}
-
/* struct nfs4_client_reset
* one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
* upon lease reset, or from upcall to state_daemon (to read in state
@@ -365,7 +359,6 @@ struct nfs4_openowner {
struct nfs4_ol_stateid *oo_last_closed_stid;
time_t oo_time; /* time of placement on so_close_lru */
#define NFS4_OO_CONFIRMED 1
-#define NFS4_OO_PURGE_CLOSE 2
#define NFS4_OO_NEW 4
unsigned char oo_flags;
};
@@ -373,7 +366,7 @@ struct nfs4_openowner {
struct nfs4_lockowner {
struct nfs4_stateowner lo_owner; /* must be first element */
struct list_head lo_owner_ino_hash; /* hash by owner,file */
- struct list_head lo_perstateid; /* for lockowners only */
+ struct list_head lo_perstateid;
struct list_head lo_list; /* for temporary uses */
};
@@ -390,7 +383,7 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
/* nfs4_file: a file opened by some number of (open) nfs4_stateowners. */
struct nfs4_file {
atomic_t fi_ref;
- struct list_head fi_hash; /* hash by "struct inode *" */
+ struct hlist_node fi_hash; /* hash by "struct inode *" */
struct list_head fi_stateids;
struct list_head fi_delegations;
/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
@@ -486,8 +479,7 @@ extern void nfs4_put_delegation(struct nfs4_delegation *dp);
extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
struct nfsd_net *nn);
extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
-extern void release_session_client(struct nfsd4_session *);
-extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
+extern void put_client_renew(struct nfs4_client *clp);
/* nfs4recover operations */
extern int nfsd4_client_tracking_init(struct net *net);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2b2e2396a869..8ff6a0019b0b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -28,6 +28,7 @@
#include <asm/uaccess.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
+#include <linux/security.h>
#ifdef CONFIG_NFSD_V3
#include "xdr3.h"
@@ -621,6 +622,33 @@ int nfsd4_is_junction(struct dentry *dentry)
return 0;
return 1;
}
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct xdr_netobj *label)
+{
+ __be32 error;
+ int host_error;
+ struct dentry *dentry;
+
+ error = fh_verify(rqstp, fhp, 0 /* NOTE(review): 0 is not S_IFREG; presumably "any file type" - confirm */, NFSD_MAY_SATTR);
+ if (error)
+ return error;
+
+ dentry = fhp->fh_dentry;
+
+ mutex_lock(&dentry->d_inode->i_mutex);
+ host_error = security_inode_setsecctx(dentry, label->data, label->len);
+ mutex_unlock(&dentry->d_inode->i_mutex);
+ return nfserrno(host_error);
+}
+#else
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct xdr_netobj *label)
+{
+ return nfserr_notsupp;
+}
+#endif
+
#endif /* defined(CONFIG_NFSD_V4) */
#ifdef CONFIG_NFSD_V3
@@ -1758,10 +1786,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
tdentry = tfhp->fh_dentry;
tdir = tdentry->d_inode;
- err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
- if (ffhp->fh_export != tfhp->fh_export)
- goto out;
-
err = nfserr_perm;
if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
goto out;
@@ -1802,6 +1826,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
host_err = -EXDEV;
if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
goto out_dput_new;
+ if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
+ goto out_dput_new;
host_err = nfsd_break_lease(odentry->d_inode);
if (host_err)
@@ -1914,6 +1940,7 @@ struct buffered_dirent {
};
struct readdir_data {
+ struct dir_context ctx;
char *dirent;
size_t used;
int full;
@@ -1945,13 +1972,15 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
struct readdir_cd *cdp, loff_t *offsetp)
{
- struct readdir_data buf;
struct buffered_dirent *de;
int host_err;
int size;
loff_t offset;
+ struct readdir_data buf = {
+ .ctx.actor = nfsd_buffered_filldir,
+ .dirent = (void *)__get_free_page(GFP_KERNEL)
+ };
- buf.dirent = (void *)__get_free_page(GFP_KERNEL);
if (!buf.dirent)
return nfserrno(-ENOMEM);
@@ -1965,7 +1994,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
buf.used = 0;
buf.full = 0;
- host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf);
+ host_err = iterate_dir(file, &buf.ctx);
if (buf.full)
host_err = 0;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 5b5894159f22..a4be2e389670 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -39,7 +39,6 @@
typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
/* nfsd/vfs.c */
-int fh_lock_parent(struct svc_fh *, struct dentry *);
int nfsd_racache_init(int);
void nfsd_racache_shutdown(void);
int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
@@ -56,6 +55,8 @@ int nfsd_mountpoint(struct dentry *, struct svc_export *);
__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
struct nfs4_acl *);
int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
+ struct xdr_netobj *);
#endif /* CONFIG_NFSD_V4 */
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
@@ -92,17 +93,13 @@ __be32 nfsd_remove(struct svc_rqst *,
struct svc_fh *, char *, int);
__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
char *name, int len);
-int nfsd_truncate(struct svc_rqst *, struct svc_fh *,
- unsigned long size);
__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
loff_t *, struct readdir_cd *, filldir_t);
__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
struct kstatfs *, int access);
-int nfsd_notify_change(struct inode *, struct iattr *);
__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
struct dentry *, int);
-int nfsd_sync_dir(struct dentry *dp);
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 546f8983ecf1..b3ed6446ed8e 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -40,6 +40,7 @@
#include "state.h"
#include "nfsd.h"
+#define NFSD4_MAX_SEC_LABEL_LEN 2048
#define NFSD4_MAX_TAGLEN 128
#define XDR_LEN(n) (((n) + 3) & ~3)
@@ -118,6 +119,7 @@ struct nfsd4_create {
struct iattr cr_iattr; /* request */
struct nfsd4_change_info cr_cinfo; /* response */
struct nfs4_acl *cr_acl;
+ struct xdr_netobj cr_label;
};
#define cr_linklen u.link.namelen
#define cr_linkname u.link.name
@@ -184,7 +186,6 @@ struct nfsd4_lock {
#define lk_old_lock_stateid v.old.lock_stateid
#define lk_old_lock_seqid v.old.lock_seqid
-#define lk_rflags u.ok.rflags
#define lk_resp_stateid u.ok.stateid
#define lk_denied u.denied
@@ -237,6 +238,7 @@ struct nfsd4_open {
u32 op_share_deny; /* request */
u32 op_deleg_want; /* request */
stateid_t op_stateid; /* response */
+ __be32 op_xdr_error; /* see nfsd4_open_omfg() */
u32 op_recall; /* recall */
struct nfsd4_change_info op_cinfo; /* response */
u32 op_rflags; /* response */
@@ -246,6 +248,7 @@ struct nfsd4_open {
struct nfs4_file *op_file; /* used during processing */
struct nfs4_ol_stateid *op_stp; /* used during processing */
struct nfs4_acl *op_acl;
+ struct xdr_netobj op_label;
};
#define op_iattr iattr
@@ -330,6 +333,7 @@ struct nfsd4_setattr {
u32 sa_bmval[3]; /* request */
struct iattr sa_iattr; /* request */
struct nfs4_acl *sa_acl;
+ struct xdr_netobj sa_label;
};
struct nfsd4_setclientid {
@@ -623,6 +627,7 @@ extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_test_stateid *test_stateid);
extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid);
+extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr);
#endif
/*
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
new file mode 100644
index 000000000000..c5c55dfb91a9
--- /dev/null
+++ b/fs/nfsd/xdr4cb.h
@@ -0,0 +1,23 @@
+#define NFS4_MAXTAGLEN 20
+
+#define NFS4_enc_cb_null_sz 0
+#define NFS4_dec_cb_null_sz 0
+#define cb_compound_enc_hdr_sz 4
+#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
+#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2)
+#define cb_sequence_enc_sz (sessionid_sz + 4 + \
+ 1 /* no referring calls list yet */)
+#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4)
+
+#define op_enc_sz 1
+#define op_dec_sz 2
+#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2))
+#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2)
+#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ 1 + enc_stateid_sz + \
+ enc_nfs4_fh_sz)
+
+#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index eed4d7b26249..741fd02e0444 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -398,6 +398,69 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
}
/**
+ * nilfs_palloc_count_desc_blocks - count the number of descriptor blocks
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: number of descriptor blocks [out]
+ *
+ * The count is the last key of the inode's bmap divided by
+ * mi_blocks_per_desc_block, rounded up.
+ *
+ * Returns the status of nilfs_bmap_last_key(); @desc_blocks is written
+ * only when that status is 0.
+ */
+static int nilfs_palloc_count_desc_blocks(struct inode *inode,
+ unsigned long *desc_blocks)
+{
+ unsigned long blknum;
+ int ret;
+
+ ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum);
+ if (likely(!ret))
+ *desc_blocks = DIV_ROUND_UP(
+ blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block);
+ return ret;
+}
+
+/**
+ * nilfs_palloc_mdt_file_can_grow - check whether the MDT file can still
+ * grow
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: known current descriptor blocks count
+ *
+ * Returns true when the number of groups covered by @desc_blocks descriptor
+ * blocks is smaller than nilfs_palloc_groups_count() for @inode, false
+ * otherwise.
+ */
+static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode,
+ unsigned long desc_blocks)
+{
+ return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) <
+ nilfs_palloc_groups_count(inode);
+}
+
+/**
+ * nilfs_palloc_count_max_entries - count max number of entries that can be
+ * described by descriptor blocks count
+ * @inode: inode of metadata file using this allocator
+ * @nused: current number of used entries
+ * @nmaxp: max number of entries [out]
+ *
+ * The maximum is entries-per-group times groups-per-descriptor-block times
+ * the current descriptor block count. If every such entry is already used
+ * and the file can still grow, one more descriptor block's worth of entries
+ * is added to the maximum.
+ *
+ * Returns 0 on success with @nmaxp set, the error from
+ * nilfs_palloc_count_desc_blocks() if counting fails, or -ERANGE if @nused
+ * exceeds the computed maximum. @nmaxp is not written on failure.
+ */
+int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp)
+{
+ unsigned long desc_blocks = 0;
+ u64 entries_per_desc_block, nmax;
+ int err;
+
+ err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks);
+ if (unlikely(err))
+ return err;
+
+ entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) *
+ nilfs_palloc_groups_per_desc_block(inode);
+ nmax = entries_per_desc_block * desc_blocks;
+
+ if (nused == nmax &&
+ nilfs_palloc_mdt_file_can_grow(inode, desc_blocks))
+ nmax += entries_per_desc_block;
+
+ if (nused > nmax)
+ return -ERANGE;
+
+ *nmaxp = nmax;
+ return 0;
+}
+
+/**
* nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
* @inode: inode of metadata file using this allocator
* @req: nilfs_palloc_req structure exchanged for the allocation
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index fb7238100548..4bd6451b5703 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -48,6 +48,8 @@ int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
const struct buffer_head *, void *);
+int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *);
+
/**
* nilfs_palloc_req - persistent allocator request and reply
* @pr_entry_nr: entry number (vblocknr or inode number)
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index f30b017740a7..197a63e9d102 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -256,22 +256,18 @@ static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
-static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int nilfs_readdir(struct file *file, struct dir_context *ctx)
{
- loff_t pos = filp->f_pos;
- struct inode *inode = file_inode(filp);
+ loff_t pos = ctx->pos;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned long npages = dir_pages(inode);
/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
- unsigned char *types = NULL;
- int ret;
if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
- goto success;
-
- types = nilfs_filetype_table;
+ return 0;
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
@@ -281,9 +277,8 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (IS_ERR(page)) {
nilfs_error(sb, __func__, "bad page in #%lu",
inode->i_ino);
- filp->f_pos += PAGE_CACHE_SIZE - offset;
- ret = -EIO;
- goto done;
+ ctx->pos += PAGE_CACHE_SIZE - offset;
+ return -EIO;
}
kaddr = page_address(page);
de = (struct nilfs_dir_entry *)(kaddr + offset);
@@ -293,35 +288,28 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (de->rec_len == 0) {
nilfs_error(sb, __func__,
"zero-length directory entry");
- ret = -EIO;
nilfs_put_page(page);
- goto done;
+ return -EIO;
}
if (de->inode) {
- int over;
- unsigned char d_type = DT_UNKNOWN;
+ unsigned char t;
- if (types && de->file_type < NILFS_FT_MAX)
- d_type = types[de->file_type];
+ if (de->file_type < NILFS_FT_MAX)
+ t = nilfs_filetype_table[de->file_type];
+ else
+ t = DT_UNKNOWN;
- offset = (char *)de - kaddr;
- over = filldir(dirent, de->name, de->name_len,
- (n<<PAGE_CACHE_SHIFT) | offset,
- le64_to_cpu(de->inode), d_type);
- if (over) {
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le64_to_cpu(de->inode), t)) {
nilfs_put_page(page);
- goto success;
+ return 0;
}
}
- filp->f_pos += nilfs_rec_len_from_disk(de->rec_len);
+ ctx->pos += nilfs_rec_len_from_disk(de->rec_len);
}
nilfs_put_page(page);
}
-
-success:
- ret = 0;
-done:
- return ret;
+ return 0;
}
/*
@@ -678,7 +666,7 @@ not_empty:
const struct file_operations nilfs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = nilfs_readdir,
+ .iterate = nilfs_readdir,
.unlocked_ioctl = nilfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = nilfs_compat_ioctl,
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index d8e65bde083c..6548c7851b48 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -160,6 +160,28 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
}
/**
+ * nilfs_ifile_count_free_inodes - calculate free inodes count
+ * @ifile: ifile inode
+ * @nmaxinodes: current maximum of available inodes count [out]
+ * @nfreeinodes: free inodes count [out]
+ *
+ * Both outputs are zeroed first. The used count is read from the
+ * inodes_count counter of the ifile's root, and the maximum comes from
+ * nilfs_palloc_count_max_entries().
+ *
+ * Returns 0 on success, or the error from
+ * nilfs_palloc_count_max_entries(); @nfreeinodes stays 0 on error.
+ */
+int nilfs_ifile_count_free_inodes(struct inode *ifile,
+ u64 *nmaxinodes, u64 *nfreeinodes)
+{
+ u64 nused;
+ int err;
+
+ *nmaxinodes = 0;
+ *nfreeinodes = 0;
+
+ nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count);
+ err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes);
+ if (likely(!err))
+ *nfreeinodes = *nmaxinodes - nused;
+ return err;
+}
+
+/**
* nilfs_ifile_read - read or get ifile inode
* @sb: super block instance
* @root: root object
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 59b6f2b51df6..679674d13372 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
int nilfs_ifile_delete_inode(struct inode *, ino_t);
int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
+int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *);
+
int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
size_t inode_size, struct nilfs_inode *raw_inode,
struct inode **inodep);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6b49f14eac8c..b1a5277cfd18 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -25,7 +25,7 @@
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
#include "nilfs.h"
#include "btnode.h"
#include "segment.h"
@@ -54,7 +54,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n)
inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
if (root)
- atomic_add(n, &root->blocks_count);
+ atomic64_add(n, &root->blocks_count);
}
void nilfs_inode_sub_blocks(struct inode *inode, int n)
@@ -63,7 +63,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n)
inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
if (root)
- atomic_sub(n, &root->blocks_count);
+ atomic64_sub(n, &root->blocks_count);
}
/**
@@ -175,6 +175,11 @@ static int nilfs_writepages(struct address_space *mapping,
struct inode *inode = mapping->host;
int err = 0;
+ if (inode->i_sb->s_flags & MS_RDONLY) {
+ nilfs_clear_dirty_pages(mapping, false);
+ return -EROFS;
+ }
+
if (wbc->sync_mode == WB_SYNC_ALL)
err = nilfs_construct_dsync_segment(inode->i_sb, inode,
wbc->range_start,
@@ -187,6 +192,18 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
struct inode *inode = page->mapping->host;
int err;
+ if (inode->i_sb->s_flags & MS_RDONLY) {
+ /*
+ * It means that filesystem was remounted in read-only
+ * mode because of error or metadata corruption. But we
+ * have dirty pages that try to be flushed in background.
+ * So, here we simply discard this dirty page.
+ */
+ nilfs_clear_dirty_page(page, false);
+ unlock_page(page);
+ return -EROFS;
+ }
+
redirty_page_for_writepage(wbc, page);
unlock_page(page);
@@ -202,13 +219,32 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
static int nilfs_set_page_dirty(struct page *page)
{
- int ret = __set_page_dirty_buffers(page);
+ int ret = __set_page_dirty_nobuffers(page);
- if (ret) {
+ if (page_has_buffers(page)) {
struct inode *inode = page->mapping->host;
- unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
+ unsigned nr_dirty = 0;
+ struct buffer_head *bh, *head;
- nilfs_set_file_dirty(inode, nr_dirty);
+ /*
+ * This page is locked by callers, and no other thread
+ * concurrently marks its buffers dirty since they are
+ * only dirtied through routines in fs/buffer.c in
+ * which call sites of mark_buffer_dirty are protected
+ * by page lock.
+ */
+ bh = head = page_buffers(page);
+ do {
+ /* Do not mark hole blocks dirty */
+ if (buffer_dirty(bh) || !buffer_mapped(bh))
+ continue;
+
+ set_buffer_dirty(bh);
+ nr_dirty++;
+ } while (bh = bh->b_this_page, bh != head);
+
+ if (nr_dirty)
+ nilfs_set_file_dirty(inode, nr_dirty);
}
return ret;
}
@@ -333,7 +369,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
goto failed_ifile_create_inode;
/* reference count of i_bh inherits from nilfs_mdt_read_block() */
- atomic_inc(&root->inodes_count);
+ atomic64_inc(&root->inodes_count);
inode_init_owner(inode, dir, mode);
inode->i_ino = ino;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -765,7 +801,7 @@ void nilfs_evict_inode(struct inode *inode)
ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
if (!ret)
- atomic_dec(&ii->i_root->inodes_count);
+ atomic64_dec(&ii->i_root->inodes_count);
nilfs_clear_inode(inode);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index f9897d09c693..c4dcd1db57ee 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -375,14 +375,25 @@ int nilfs_mdt_fetch_dirty(struct inode *inode)
static int
nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode;
+ struct inode *inode = page->mapping->host;
struct super_block *sb;
int err = 0;
+ if (inode && (inode->i_sb->s_flags & MS_RDONLY)) {
+ /*
+ * It means that filesystem was remounted in read-only
+ * mode because of error or metadata corruption. But we
+ * have dirty pages that try to be flushed in background.
+ * So, here we simply discard this dirty page.
+ */
+ nilfs_clear_dirty_page(page, false);
+ unlock_page(page);
+ return -EROFS;
+ }
+
redirty_page_for_writepage(wbc, page);
unlock_page(page);
- inode = page->mapping->host;
if (!inode)
return 0;
@@ -561,10 +572,10 @@ void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
if (mi->mi_palloc_cache)
nilfs_palloc_clear_cache(inode);
- nilfs_clear_dirty_pages(inode->i_mapping);
+ nilfs_clear_dirty_pages(inode->i_mapping, true);
nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
- nilfs_clear_dirty_pages(&ii->i_btnode_cache);
+ nilfs_clear_dirty_pages(&ii->i_btnode_cache, true);
nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 07f76db04ec7..0ba679866e50 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -370,7 +370,12 @@ repeat:
goto repeat;
}
-void nilfs_clear_dirty_pages(struct address_space *mapping)
+/**
+ * nilfs_clear_dirty_pages - discard dirty pages in address space
+ * @mapping: address space with dirty pages for discarding
+ * @silent: suppress [true] or print [false] warning messages
+ */
+void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
{
struct pagevec pvec;
unsigned int i;
@@ -382,25 +387,9 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- struct buffer_head *bh, *head;
lock_page(page);
- ClearPageUptodate(page);
- ClearPageMappedToDisk(page);
- bh = head = page_buffers(page);
- do {
- lock_buffer(bh);
- clear_buffer_dirty(bh);
- clear_buffer_nilfs_volatile(bh);
- clear_buffer_nilfs_checked(bh);
- clear_buffer_nilfs_redirected(bh);
- clear_buffer_uptodate(bh);
- clear_buffer_mapped(bh);
- unlock_buffer(bh);
- bh = bh->b_this_page;
- } while (bh != head);
-
- __nilfs_clear_page_dirty(page);
+ nilfs_clear_dirty_page(page, silent);
unlock_page(page);
}
pagevec_release(&pvec);
@@ -408,6 +397,51 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
}
}
+/**
+ * nilfs_clear_dirty_page - discard dirty page
+ * @page: dirty page that will be discarded
+ * @silent: suppress [true] or print [false] warning messages
+ */
+void nilfs_clear_dirty_page(struct page *page, bool silent)
+{
+ struct inode *inode = page->mapping->host;
+ struct super_block *sb = inode->i_sb;
+
+ BUG_ON(!PageLocked(page));
+
+ if (!silent) {
+ nilfs_warning(sb, __func__,
+ "discard page: offset %lld, ino %lu",
+ page_offset(page), inode->i_ino);
+ }
+
+ ClearPageUptodate(page);
+ ClearPageMappedToDisk(page);
+
+ if (page_has_buffers(page)) {
+ struct buffer_head *bh, *head;
+
+ bh = head = page_buffers(page);
+ do {
+ lock_buffer(bh);
+ if (!silent) {
+ nilfs_warning(sb, __func__,
+ "discard block %llu, size %zu",
+ (u64)bh->b_blocknr, bh->b_size);
+ }
+ clear_buffer_dirty(bh);
+ clear_buffer_nilfs_volatile(bh);
+ clear_buffer_nilfs_checked(bh);
+ clear_buffer_nilfs_redirected(bh);
+ clear_buffer_uptodate(bh);
+ clear_buffer_mapped(bh);
+ unlock_buffer(bh);
+ } while (bh = bh->b_this_page, bh != head);
+ }
+
+ __nilfs_clear_page_dirty(page);
+}
+
unsigned nilfs_page_count_clean_buffers(struct page *page,
unsigned from, unsigned to)
{
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index fb7de71605a0..ef30c5c2426f 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -55,7 +55,8 @@ void nilfs_page_bug(struct page *);
int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
void nilfs_copy_back_pages(struct address_space *, struct address_space *);
-void nilfs_clear_dirty_pages(struct address_space *);
+void nilfs_clear_dirty_page(struct page *, bool);
+void nilfs_clear_dirty_pages(struct address_space *, bool);
void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
struct backing_dev_info *bdi);
unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a5752a589932..bd88a7461063 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -835,9 +835,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
raw_cp->cp_snapshot_list.ssl_next = 0;
raw_cp->cp_snapshot_list.ssl_prev = 0;
raw_cp->cp_inodes_count =
- cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
+ cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count));
raw_cp->cp_blocks_count =
- cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
+ cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count));
raw_cp->cp_nblk_inc =
cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c7d1f9f18b09..af3ba0478cdf 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -554,8 +554,10 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
if (err)
goto failed_bh;
- atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
- atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
+ atomic64_set(&root->inodes_count,
+ le64_to_cpu(raw_cp->cp_inodes_count));
+ atomic64_set(&root->blocks_count,
+ le64_to_cpu(raw_cp->cp_blocks_count));
nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
@@ -609,6 +611,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
unsigned long overhead;
unsigned long nrsvblocks;
sector_t nfreeblocks;
+ u64 nmaxinodes, nfreeinodes;
int err;
/*
@@ -633,14 +636,34 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
if (unlikely(err))
return err;
+ err = nilfs_ifile_count_free_inodes(root->ifile,
+ &nmaxinodes, &nfreeinodes);
+ if (unlikely(err)) {
+ printk(KERN_WARNING
+ "NILFS warning: fail to count free inodes: err %d.\n",
+ err);
+ if (err == -ERANGE) {
+ /*
+ * If nilfs_palloc_count_max_entries() returns
+ * -ERANGE error code then we simply treat
+ * current inodes count as maximum possible and
+ * zero as free inodes value.
+ */
+ nmaxinodes = atomic64_read(&root->inodes_count);
+ nfreeinodes = 0;
+ err = 0;
+ } else
+ return err;
+ }
+
buf->f_type = NILFS_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = blocks - overhead;
buf->f_bfree = nfreeblocks;
buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
(buf->f_bfree - nrsvblocks) : 0;
- buf->f_files = atomic_read(&root->inodes_count);
- buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
+ buf->f_files = nmaxinodes;
+ buf->f_ffree = nfreeinodes;
buf->f_namelen = NILFS_NAME_LEN;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
@@ -973,7 +996,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
static int nilfs_tree_was_touched(struct dentry *root_dentry)
{
- return root_dentry->d_count > 1;
+ return d_count(root_dentry) > 1;
}
/**
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 41e6a04a561f..94c451ce6d24 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -764,8 +764,8 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
new->ifile = NULL;
new->nilfs = nilfs;
atomic_set(&new->count, 1);
- atomic_set(&new->inodes_count, 0);
- atomic_set(&new->blocks_count, 0);
+ atomic64_set(&new->inodes_count, 0);
+ atomic64_set(&new->blocks_count, 0);
rb_link_node(&new->rb_node, parent, p);
rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index be1267a34cea..de8cc53b4a5c 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -241,8 +241,8 @@ struct nilfs_root {
struct the_nilfs *nilfs;
struct inode *ifile;
- atomic_t inodes_count;
- atomic_t blocks_count;
+ atomic64_t inodes_count;
+ atomic64_t blocks_count;
};
/* Special checkpoint number */
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 2bfe6dc413a0..1fedd5f7ccc4 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -31,7 +31,6 @@ int dir_notify_enable __read_mostly = 1;
static struct kmem_cache *dnotify_struct_cache __read_mostly;
static struct kmem_cache *dnotify_mark_cache __read_mostly;
static struct fsnotify_group *dnotify_group __read_mostly;
-static DEFINE_MUTEX(dnotify_mark_mutex);
/*
* dnotify will attach one of these to each inode (i_fsnotify_marks) which
@@ -183,7 +182,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
return;
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
- mutex_lock(&dnotify_mark_mutex);
+ mutex_lock(&dnotify_group->mark_mutex);
spin_lock(&fsn_mark->lock);
prev = &dn_mark->dn;
@@ -199,11 +198,12 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
spin_unlock(&fsn_mark->lock);
- /* nothing else could have found us thanks to the dnotify_mark_mutex */
+ /* nothing else could have found us thanks to the dnotify_group's
+ mark_mutex */
if (dn_mark->dn == NULL)
- fsnotify_destroy_mark(fsn_mark, dnotify_group);
+ fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
- mutex_unlock(&dnotify_mark_mutex);
+ mutex_unlock(&dnotify_group->mark_mutex);
fsnotify_put_mark(fsn_mark);
}
@@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
new_dn_mark->dn = NULL;
/* this is needed to prevent the fcntl/close race described below */
- mutex_lock(&dnotify_mark_mutex);
+ mutex_lock(&dnotify_group->mark_mutex);
/* add the new_fsn_mark or find an old one. */
fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
@@ -334,7 +334,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
spin_lock(&fsn_mark->lock);
} else {
- fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
+ fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode,
+ NULL, 0);
spin_lock(&new_fsn_mark->lock);
fsn_mark = new_fsn_mark;
dn_mark = new_dn_mark;
@@ -348,9 +349,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
/* if (f != filp) means that we lost a race and another task/thread
* actually closed the fd we are still playing with before we grabbed
- * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the
- * only time we clean up the marks we need to get our mark off
- * the list. */
+ * the dnotify_group's mark_mutex and fsn_mark->lock. Since closing the
+ * fd is the only time we clean up the marks we need to get our mark
+ * off the list. */
if (f != filp) {
/* if we added ourselves, shoot ourselves, it's possible that
* the flush actually did shoot this fsn_mark. That's fine too
@@ -385,9 +386,9 @@ out:
spin_unlock(&fsn_mark->lock);
if (destroy)
- fsnotify_destroy_mark(fsn_mark, dnotify_group);
+ fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
- mutex_unlock(&dnotify_mark_mutex);
+ mutex_unlock(&dnotify_group->mark_mutex);
fsnotify_put_mark(fsn_mark);
out_err:
if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 5d8444268a16..e44cb6427df3 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
+#include <linux/compat.h>
#include <asm/ioctls.h>
@@ -121,6 +122,7 @@ static int fill_event_metadata(struct fsnotify_group *group,
metadata->event_len = FAN_EVENT_METADATA_LEN;
metadata->metadata_len = FAN_EVENT_METADATA_LEN;
metadata->vers = FANOTIFY_METADATA_VERSION;
+ metadata->reserved = 0;
metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
metadata->pid = pid_vnr(event->tgid);
if (unlikely(event->mask & FAN_Q_OVERFLOW))
@@ -398,9 +400,6 @@ static int fanotify_release(struct inode *ignored, struct file *file)
wake_up(&group->fanotify_data.access_waitq);
#endif
- if (file->f_flags & FASYNC)
- fsnotify_fasync(-1, file, 0);
-
/* matches the fanotify_init->fsnotify_alloc_group */
fsnotify_destroy_group(group);
@@ -525,14 +524,18 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
__u32 removed;
int destroy_mark;
+ mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
- if (!fsn_mark)
+ if (!fsn_mark) {
+ mutex_unlock(&group->mark_mutex);
return -ENOENT;
+ }
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
if (destroy_mark)
- fsnotify_destroy_mark(fsn_mark, group);
+ fsnotify_destroy_mark_locked(fsn_mark, group);
+ mutex_unlock(&group->mark_mutex);
fsnotify_put_mark(fsn_mark);
if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -549,14 +552,19 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
__u32 removed;
int destroy_mark;
+ mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_inode_mark(group, inode);
- if (!fsn_mark)
+ if (!fsn_mark) {
+ mutex_unlock(&group->mark_mutex);
return -ENOENT;
+ }
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
if (destroy_mark)
- fsnotify_destroy_mark(fsn_mark, group);
+ fsnotify_destroy_mark_locked(fsn_mark, group);
+ mutex_unlock(&group->mark_mutex);
+
/* matches the fsnotify_find_inode_mark() */
fsnotify_put_mark(fsn_mark);
if (removed & inode->i_fsnotify_mask)
@@ -592,35 +600,55 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
return mask & ~oldmask;
}
+static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
+ struct inode *inode,
+ struct vfsmount *mnt)
+{
+ struct fsnotify_mark *mark;
+ int ret;
+
+ if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+ return ERR_PTR(-ENOSPC);
+
+ mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+ if (!mark)
+ return ERR_PTR(-ENOMEM);
+
+ fsnotify_init_mark(mark, fanotify_free_mark);
+ ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0);
+ if (ret) {
+ fsnotify_put_mark(mark);
+ return ERR_PTR(ret);
+ }
+
+ return mark;
+}
+
+
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
struct vfsmount *mnt, __u32 mask,
unsigned int flags)
{
struct fsnotify_mark *fsn_mark;
__u32 added;
- int ret = 0;
+ mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
if (!fsn_mark) {
- if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
- return -ENOSPC;
-
- fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
- if (!fsn_mark)
- return -ENOMEM;
-
- fsnotify_init_mark(fsn_mark, fanotify_free_mark);
- ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
- if (ret)
- goto err;
+ fsn_mark = fanotify_add_new_mark(group, NULL, mnt);
+ if (IS_ERR(fsn_mark)) {
+ mutex_unlock(&group->mark_mutex);
+ return PTR_ERR(fsn_mark);
+ }
}
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+ mutex_unlock(&group->mark_mutex);
if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
fsnotify_recalc_vfsmount_mask(mnt);
-err:
+
fsnotify_put_mark(fsn_mark);
- return ret;
+ return 0;
}
static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -629,7 +657,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
{
struct fsnotify_mark *fsn_mark;
__u32 added;
- int ret = 0;
pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
@@ -643,27 +670,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
(atomic_read(&inode->i_writecount) > 0))
return 0;
+ mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_inode_mark(group, inode);
if (!fsn_mark) {
- if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
- return -ENOSPC;
-
- fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
- if (!fsn_mark)
- return -ENOMEM;
-
- fsnotify_init_mark(fsn_mark, fanotify_free_mark);
- ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
- if (ret)
- goto err;
+ fsn_mark = fanotify_add_new_mark(group, inode, NULL);
+ if (IS_ERR(fsn_mark)) {
+ mutex_unlock(&group->mark_mutex);
+ return PTR_ERR(fsn_mark);
+ }
}
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+ mutex_unlock(&group->mark_mutex);
if (added & ~inode->i_fsnotify_mask)
fsnotify_recalc_inode_mask(inode);
-err:
+
fsnotify_put_mark(fsn_mark);
- return ret;
+ return 0;
}
/* fanotify syscalls */
@@ -755,9 +778,9 @@ out_destroy_group:
return fd;
}
-SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
- __u64 mask, int dfd,
- const char __user * pathname)
+SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
+ __u64, mask, int, dfd,
+ const char __user *, pathname)
{
struct inode *inode = NULL;
struct vfsmount *mnt = NULL;
@@ -857,15 +880,20 @@ fput_and_out:
return ret;
}
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask,
- long dfd, long pathname)
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE6(fanotify_mark,
+ int, fanotify_fd, unsigned int, flags,
+ __u32, mask0, __u32, mask1, int, dfd,
+ const char __user *, pathname)
{
- return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags,
- mask, (int) dfd,
- (const char __user *) pathname);
+ return sys_fanotify_mark(fanotify_fd, flags,
+#ifdef __BIG_ENDIAN
+ ((__u64)mask0 << 32) | mask1, /* BE: mask0 carries the high word */
+#else
+ ((__u64)mask1 << 32) | mask0, /* LE: mask0 carries the low word */
+#endif
+ dfd, pathname);
}
-SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
#endif
/*
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e0f7c1241a6a..60f954a891ab 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -287,9 +287,6 @@ static int inotify_release(struct inode *ignored, struct file *file)
pr_debug("%s: group=%p\n", __func__, group);
- if (file->f_flags & FASYNC)
- fsnotify_fasync(-1, file, 0);
-
/* free this group, matching get was inotify_init->fsnotify_obtain_group */
fsnotify_destroy_group(group);
@@ -359,7 +356,6 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
}
static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock,
- int *last_wd,
struct inotify_inode_mark *i_mark)
{
int ret;
@@ -367,11 +363,10 @@ static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock,
idr_preload(GFP_KERNEL);
spin_lock(idr_lock);
- ret = idr_alloc(idr, i_mark, *last_wd + 1, 0, GFP_NOWAIT);
+ ret = idr_alloc_cyclic(idr, i_mark, 1, 0, GFP_NOWAIT);
if (ret >= 0) {
/* we added the mark to the idr, take a reference */
i_mark->wd = ret;
- *last_wd = i_mark->wd;
fsnotify_get_mark(&i_mark->fsn_mark);
}
@@ -572,7 +567,6 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
int add = (arg & IN_MASK_ADD);
int ret;
- /* don't allow invalid bits: we don't want flags set */
mask = inotify_arg_to_mask(arg);
fsn_mark = fsnotify_find_inode_mark(group, inode);
@@ -623,7 +617,6 @@ static int inotify_new_watch(struct fsnotify_group *group,
struct idr *idr = &group->inotify_data.idr;
spinlock_t *idr_lock = &group->inotify_data.idr_lock;
- /* don't allow invalid bits: we don't want flags set */
mask = inotify_arg_to_mask(arg);
tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
@@ -638,13 +631,13 @@ static int inotify_new_watch(struct fsnotify_group *group,
if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
goto out_err;
- ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd,
- tmp_i_mark);
+ ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
if (ret)
goto out_err;
/* we are on the idr, now get on the inode */
- ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
+ ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
+ NULL, 0);
if (ret) {
/* we failed to get on the inode, get off the idr */
inotify_remove_from_idr(group, tmp_i_mark);
@@ -668,19 +661,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
{
int ret = 0;
-retry:
+ mutex_lock(&group->mark_mutex);
/* try to update and existing watch with the new arg */
ret = inotify_update_existing_watch(group, inode, arg);
/* no mark present, try to add a new one */
if (ret == -ENOENT)
ret = inotify_new_watch(group, inode, arg);
- /*
- * inotify_new_watch could race with another thread which did an
- * inotify_new_watch between the update_existing and the add watch
- * here, go back and try to update an existing mark again.
- */
- if (ret == -EEXIST)
- goto retry;
+ mutex_unlock(&group->mark_mutex);
return ret;
}
@@ -697,7 +684,6 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
spin_lock_init(&group->inotify_data.idr_lock);
idr_init(&group->inotify_data.idr);
- group->inotify_data.last_wd = 0;
group->inotify_data.user = get_current_user();
if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
@@ -751,6 +737,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
int ret;
unsigned flags = 0;
+ /* don't allow invalid bits: we don't want flags set */
+ if (unlikely(!(mask & ALL_INOTIFY_BITS)))
+ return -EINVAL;
+
f = fdget(fd);
if (unlikely(!f.file))
return -EBADF;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc6b49bf7360..923fe4a5f503 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -20,28 +20,29 @@
* fsnotify inode mark locking/lifetime/and refcnting
*
* REFCNT:
- * The mark->refcnt tells how many "things" in the kernel currently are
- * referencing this object. The object typically will live inside the kernel
- * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
- * which can find this object holding the appropriete locks, can take a reference
- * and the object itself is guaranteed to survive until the reference is dropped.
+ * The group->refcnt and mark->refcnt tell how many "things" in the kernel
+ * currently are referencing the objects. Both kinds of objects typically will
+ * live inside the kernel with a refcnt of 2, one for its creation and one for
+ * the reference a group and a mark hold to each other.
+ * If you are holding the appropriate locks, you can take a reference and the
+ * object itself is guaranteed to survive until the reference is dropped.
*
* LOCKING:
- * There are 3 spinlocks involved with fsnotify inode marks and they MUST
- * be taken in order as follows:
+ * There are 3 locks involved with fsnotify inode marks and they MUST be taken
+ * in order as follows:
*
+ * group->mark_mutex
* mark->lock
- * group->mark_lock
* inode->i_lock
*
- * mark->lock protects 2 things, mark->group and mark->inode. You must hold
- * that lock to dereference either of these things (they could be NULL even with
- * the lock)
- *
- * group->mark_lock protects the marks_list anchored inside a given group
- * and each mark is hooked via the g_list. It also sorta protects the
- * free_g_list, which when used is anchored by a private list on the stack of the
- * task which held the group->mark_lock.
+ * group->mark_mutex protects the marks_list anchored inside a given group and
+ * each mark is hooked via the g_list. It also protects the group's private
+ * data (i.e. group limits).
+ *
+ * mark->lock protects the mark's attributes like its masks and flags.
+ * Furthermore it protects the access to a reference of the group that the mark
+ * is assigned to as well as the access to a reference of the inode/vfsmount
+ * that is being watched by the mark.
*
* inode->i_lock protects the i_fsnotify_marks list anchored inside a
* given inode and each mark is hooked via the i_list. (and sorta the
@@ -64,18 +65,11 @@
* inode. We take i_lock and walk the i_fsnotify_marks safely. For each
* mark on the list we take a reference (so the mark can't disappear under us).
* We remove that mark form the inode's list of marks and we add this mark to a
- * private list anchored on the stack using i_free_list; At this point we no
- * longer fear anything finding the mark using the inode's list of marks.
- *
- * We can safely and locklessly run the private list on the stack of everything
- * we just unattached from the original inode. For each mark on the private list
- * we grab the mark-> and can thus dereference mark->group and mark->inode. If
- * we see the group and inode are not NULL we take those locks. Now holding all
- * 3 locks we can completely remove the mark from other tasks finding it in the
- * future. Remember, 10 things might already be referencing this mark, but they
- * better be holding a ref. We drop our reference we took before we unhooked it
- * from the inode. When the ref hits 0 we can free the mark.
- *
+ * private list anchored on the stack using i_free_list; we walk i_free_list
+ * and before we destroy the mark we make sure that we don't race with a
+ * concurrent destroy_group by getting a ref to the mark's group and taking the
+ * group's mutex.
+ *
* Very similarly for freeing by group, except we use free_g_list.
*
* This has the very interesting property of being able to run concurrently with
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fa9c05f97af4..d267ea6aa1a0 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1372,7 +1372,7 @@ retry_writepage:
* The page may have dirty, unmapped buffers. Make them
* freeable here, so the page does not leak.
*/
- block_invalidatepage(page, 0);
+ block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
unlock_page(page);
ntfs_debug("Write outside i_size - truncated?");
return 0;
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index aa411c3f20e9..9e38dafa3bc7 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1004,13 +1004,11 @@ dir_err_out:
/**
* ntfs_filldir - ntfs specific filldir method
* @vol: current ntfs volume
- * @fpos: position in the directory
* @ndir: ntfs inode of current directory
* @ia_page: page in which the index allocation buffer @ie is in resides
* @ie: current index entry
* @name: buffer to use for the converted name
- * @dirent: vfs filldir callback context
- * @filldir: vfs filldir callback
+ * @actor: what to feed the entries to
*
* Convert the Unicode @name to the loaded NLS and pass it to the @filldir
* callback.
@@ -1024,12 +1022,12 @@ dir_err_out:
* retake the lock if we are returning a non-zero value as ntfs_readdir()
* would need to drop the lock immediately anyway.
*/
-static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
+static inline int ntfs_filldir(ntfs_volume *vol,
ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
- u8 *name, void *dirent, filldir_t filldir)
+ u8 *name, struct dir_context *actor)
{
unsigned long mref;
- int name_len, rc;
+ int name_len;
unsigned dt_type;
FILE_NAME_TYPE_FLAGS name_type;
@@ -1068,13 +1066,14 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
if (ia_page)
unlock_page(ia_page);
ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
- "0x%lx, DT_%s.", name, name_len, fpos, mref,
+ "0x%lx, DT_%s.", name, name_len, actor->pos, mref,
dt_type == DT_DIR ? "DIR" : "REG");
- rc = filldir(dirent, name, name_len, fpos, mref, dt_type);
+ if (!dir_emit(actor, name, name_len, mref, dt_type))
+ return 1;
/* Relock the page but not if we are aborting ->readdir. */
- if (!rc && ia_page)
+ if (ia_page)
lock_page(ia_page);
- return rc;
+ return 0;
}
/*
@@ -1097,11 +1096,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
* removes them again after the write is complete after which it
* unlocks the page.
*/
-static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ntfs_readdir(struct file *file, struct dir_context *actor)
{
s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
- loff_t fpos, i_size;
- struct inode *bmp_vi, *vdir = file_inode(filp);
+ loff_t i_size;
+ struct inode *bmp_vi, *vdir = file_inode(file);
struct super_block *sb = vdir->i_sb;
ntfs_inode *ndir = NTFS_I(vdir);
ntfs_volume *vol = NTFS_SB(sb);
@@ -1116,33 +1115,16 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
u8 *kaddr, *bmp, *index_end;
ntfs_attr_search_ctx *ctx;
- fpos = filp->f_pos;
ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
- vdir->i_ino, fpos);
+ vdir->i_ino, actor->pos);
rc = err = 0;
/* Are we at end of dir yet? */
i_size = i_size_read(vdir);
- if (fpos >= i_size + vol->mft_record_size)
- goto done;
+ if (actor->pos >= i_size + vol->mft_record_size)
+ return 0;
/* Emulate . and .. for all directories. */
- if (!fpos) {
- ntfs_debug("Calling filldir for . with len 1, fpos 0x0, "
- "inode 0x%lx, DT_DIR.", vdir->i_ino);
- rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR);
- if (rc)
- goto done;
- fpos++;
- }
- if (fpos == 1) {
- ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, "
- "inode 0x%lx, DT_DIR.",
- (unsigned long)parent_ino(filp->f_path.dentry));
- rc = filldir(dirent, "..", 2, fpos,
- parent_ino(filp->f_path.dentry), DT_DIR);
- if (rc)
- goto done;
- fpos++;
- }
+ if (!dir_emit_dots(file, actor))
+ return 0;
m = NULL;
ctx = NULL;
/*
@@ -1155,7 +1137,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
goto err_out;
}
/* Are we jumping straight into the index allocation attribute? */
- if (fpos >= vol->mft_record_size)
+ if (actor->pos >= vol->mft_record_size)
goto skip_index_root;
/* Get hold of the mft record for the directory. */
m = map_mft_record(ndir);
@@ -1170,7 +1152,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
goto err_out;
}
/* Get the offset into the index root attribute. */
- ir_pos = (s64)fpos;
+ ir_pos = (s64)actor->pos;
/* Find the index root attribute in the mft record. */
err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
0, ctx);
@@ -1226,10 +1208,9 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (ir_pos > (u8*)ie - (u8*)ir)
continue;
/* Advance the position even if going to skip the entry. */
- fpos = (u8*)ie - (u8*)ir;
+ actor->pos = (u8*)ie - (u8*)ir;
/* Submit the name to the filldir callback. */
- rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent,
- filldir);
+ rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
if (rc) {
kfree(ir);
goto abort;
@@ -1242,12 +1223,12 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (!NInoIndexAllocPresent(ndir))
goto EOD;
/* Advance fpos to the beginning of the index allocation. */
- fpos = vol->mft_record_size;
+ actor->pos = vol->mft_record_size;
skip_index_root:
kaddr = NULL;
prev_ia_pos = -1LL;
/* Get the offset into the index allocation attribute. */
- ia_pos = (s64)fpos - vol->mft_record_size;
+ ia_pos = (s64)actor->pos - vol->mft_record_size;
ia_mapping = vdir->i_mapping;
ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
@@ -1409,7 +1390,7 @@ find_next_index_buffer:
if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
continue;
/* Advance the position even if going to skip the entry. */
- fpos = (u8*)ie - (u8*)ia +
+ actor->pos = (u8*)ie - (u8*)ia +
(sle64_to_cpu(ia->index_block_vcn) <<
ndir->itype.index.vcn_size_bits) +
vol->mft_record_size;
@@ -1419,8 +1400,7 @@ find_next_index_buffer:
* before returning, unless a non-zero value is returned in
* which case the page is left unlocked.
*/
- rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent,
- filldir);
+ rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
if (rc) {
/* @ia_page is already unlocked in this case. */
ntfs_unmap_page(ia_page);
@@ -1439,18 +1419,9 @@ unm_EOD:
iput(bmp_vi);
EOD:
/* We are finished, set fpos to EOD. */
- fpos = i_size + vol->mft_record_size;
+ actor->pos = i_size + vol->mft_record_size;
abort:
kfree(name);
-done:
-#ifdef DEBUG
- if (!rc)
- ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos);
- else
- ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.",
- rc, fpos);
-#endif
- filp->f_pos = fpos;
return 0;
err_out:
if (bmp_page) {
@@ -1471,7 +1442,6 @@ iput_err_out:
if (!err)
err = -EIO;
ntfs_debug("Failed. Returning error code %i.", -err);
- filp->f_pos = fpos;
return err;
}
@@ -1571,7 +1541,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
const struct file_operations ntfs_dir_ops = {
.llseek = generic_file_llseek, /* Seek inside directory. */
.read = generic_read_dir, /* Return -EISDIR. */
- .readdir = ntfs_readdir, /* Read directory contents. */
+ .iterate = ntfs_readdir, /* Read directory contents. */
#ifdef NTFS_RW
.fsync = ntfs_dir_fsync, /* Sync a directory to disk. */
/*.aio_fsync = ,*/ /* Sync all outstanding async
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 5b2d4f0853ac..c5670b8d198c 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -27,6 +27,7 @@
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/writeback.h>
+#include <linux/aio.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -2129,7 +2130,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
BUG_ON(iocb->ki_pos != pos);
- sb_start_write(inode->i_sb);
mutex_lock(&inode->i_mutex);
ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
@@ -2138,7 +2138,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err < 0)
ret = err;
}
- sb_end_write(inode->i_sb);
return ret;
}
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index d3e118cc6ffa..2778b0255dc6 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -28,6 +28,7 @@
#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/log2.h>
+#include <linux/aio.h>
#include "aops.h"
#include "attrib.h"
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index b8a9d87231b1..17e6bdde96c5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5655,7 +5655,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
&ref_tree, NULL);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto bail;
}
ret = ocfs2_prepare_refcount_change_for_del(inode,
@@ -5666,7 +5666,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
&extra_blocks);
if (ret < 0) {
mlog_errno(ret);
- goto out;
+ goto bail;
}
}
@@ -5674,7 +5674,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
extra_blocks);
if (ret) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
mutex_lock(&tl_inode->i_mutex);
@@ -5734,7 +5734,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out:
mutex_unlock(&tl_inode->i_mutex);
-
+bail:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 20dfec72e903..79736a28d84f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -603,11 +603,12 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
* from ext3. PageChecked() bits have been removed as OCFS2 does not
* do journalled data.
*/
-static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
+static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
- jbd2_journal_invalidatepage(journal, page, offset);
+ jbd2_journal_invalidatepage(journal, page, offset, length);
}
static int ocfs2_releasepage(struct page *page, gfp_t wait)
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index ffb2da370a99..f671e49beb34 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,6 +22,8 @@
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
+#include <linux/aio.h>
+
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
unsigned from,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 42252bf64b51..5c1c864e81cc 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -176,7 +176,7 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
}
}
-static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
+static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
{
int ret = -1;
@@ -500,7 +500,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
}
atomic_inc(&write_wc->wc_num_reqs);
- submit_bio(WRITE, bio);
+ submit_bio(WRITE_SYNC, bio);
status = 0;
bail:
@@ -2271,7 +2271,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
continue;
- ret = o2hb_global_hearbeat_mode_set(i);
+ ret = o2hb_global_heartbeat_mode_set(i);
if (!ret)
printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
o2hb_heartbeat_mode_desc[i]);
@@ -2304,7 +2304,7 @@ static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
NULL,
};
-static struct configfs_item_operations o2hb_hearbeat_group_item_ops = {
+static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
.show_attribute = o2hb_heartbeat_group_show,
.store_attribute = o2hb_heartbeat_group_store,
};
@@ -2316,7 +2316,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
static struct config_item_type o2hb_heartbeat_group_type = {
.ct_group_ops = &o2hb_heartbeat_group_group_ops,
- .ct_item_ops = &o2hb_hearbeat_group_item_ops,
+ .ct_item_ops = &o2hb_heartbeat_group_item_ops,
.ct_attrs = o2hb_heartbeat_group_attrs,
.ct_owner = THIS_MODULE,
};
@@ -2389,6 +2389,9 @@ static int o2hb_region_pin(const char *region_uuid)
assert_spin_locked(&o2hb_live_lock);
list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
+
uuid = config_item_name(&reg->hr_item);
/* local heartbeat */
@@ -2439,6 +2442,9 @@ static void o2hb_region_unpin(const char *region_uuid)
assert_spin_locked(&o2hb_live_lock);
list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
+
uuid = config_item_name(&reg->hr_item);
if (region_uuid) {
if (strcmp(region_uuid, uuid))
@@ -2654,6 +2660,9 @@ int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
p = region_uuids;
list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
+
mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
if (numregs < max_regions) {
memcpy(p, config_item_name(&reg->hr_item),
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index c19897d0fe14..1ec141e758d7 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -264,7 +264,7 @@ void o2quo_hb_still_up(u8 node)
/* This is analogous to hb_up. as a node's connection comes up we delay the
* quorum decision until we see it heartbeating. the hold will be droped in
* hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
- * it's already heartbeating we we might be dropping a hold that conn_up got.
+ * it's already heartbeating we might be dropping a hold that conn_up got.
* */
void o2quo_conn_up(u8 node)
{
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa88bd8bcedc..d644dc611425 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -406,6 +406,9 @@ static void sc_kref_release(struct kref *kref)
sc->sc_node = NULL;
o2net_debug_del_sc(sc);
+
+ if (sc->sc_page)
+ __free_page(sc->sc_page);
kfree(sc);
}
@@ -630,19 +633,19 @@ static void o2net_state_change(struct sock *sk)
state_change = sc->sc_state_change;
switch(sk->sk_state) {
- /* ignore connecting sockets as they make progress */
- case TCP_SYN_SENT:
- case TCP_SYN_RECV:
- break;
- case TCP_ESTABLISHED:
- o2net_sc_queue_work(sc, &sc->sc_connect_work);
- break;
- default:
- printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
- " shutdown, state %d\n",
- SC_NODEF_ARGS(sc), sk->sk_state);
- o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
- break;
+ /* ignore connecting sockets as they make progress */
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ break;
+ case TCP_ESTABLISHED:
+ o2net_sc_queue_work(sc, &sc->sc_connect_work);
+ break;
+ default:
+ printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
+ " shutdown, state %d\n",
+ SC_NODEF_ARGS(sc), sk->sk_state);
+ o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+ break;
}
out:
read_unlock(&sk->sk_callback_lock);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f1e1aed8f638..eb760d8acd50 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1761,11 +1761,10 @@ bail:
static int ocfs2_dir_foreach_blk_id(struct inode *inode,
u64 *f_version,
- loff_t *f_pos, void *priv,
- filldir_t filldir, int *filldir_err)
+ struct dir_context *ctx)
{
- int ret, i, filldir_ret;
- unsigned long offset = *f_pos;
+ int ret, i;
+ unsigned long offset = ctx->pos;
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
@@ -1781,8 +1780,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
di = (struct ocfs2_dinode *)di_bh->b_data;
data = &di->id2.i_data;
- while (*f_pos < i_size_read(inode)) {
-revalidate:
+ while (ctx->pos < i_size_read(inode)) {
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
@@ -1802,50 +1800,31 @@ revalidate:
break;
i += le16_to_cpu(de->rec_len);
}
- *f_pos = offset = i;
+ ctx->pos = offset = i;
*f_version = inode->i_version;
}
- de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos);
- if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) {
+ de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
+ if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) {
/* On error, skip the f_pos to the end. */
- *f_pos = i_size_read(inode);
- goto out;
+ ctx->pos = i_size_read(inode);
+ break;
}
offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- u64 version = *f_version;
unsigned char d_type = DT_UNKNOWN;
if (de->file_type < OCFS2_FT_MAX)
d_type = ocfs2_filetype_table[de->file_type];
- filldir_ret = filldir(priv, de->name,
- de->name_len,
- *f_pos,
- le64_to_cpu(de->inode),
- d_type);
- if (filldir_ret) {
- if (filldir_err)
- *filldir_err = filldir_ret;
- break;
- }
- if (version != *f_version)
- goto revalidate;
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le64_to_cpu(de->inode), d_type))
+ goto out;
}
- *f_pos += le16_to_cpu(de->rec_len);
+ ctx->pos += le16_to_cpu(de->rec_len);
}
-
out:
brelse(di_bh);
-
return 0;
}
@@ -1855,27 +1834,26 @@ out:
*/
static int ocfs2_dir_foreach_blk_el(struct inode *inode,
u64 *f_version,
- loff_t *f_pos, void *priv,
- filldir_t filldir, int *filldir_err)
+ struct dir_context *ctx,
+ bool persist)
{
- int error = 0;
unsigned long offset, blk, last_ra_blk = 0;
- int i, stored;
+ int i;
struct buffer_head * bh, * tmp;
struct ocfs2_dir_entry * de;
struct super_block * sb = inode->i_sb;
unsigned int ra_sectors = 16;
+ int stored = 0;
- stored = 0;
bh = NULL;
- offset = (*f_pos) & (sb->s_blocksize - 1);
+ offset = ctx->pos & (sb->s_blocksize - 1);
- while (!error && !stored && *f_pos < i_size_read(inode)) {
- blk = (*f_pos) >> sb->s_blocksize_bits;
+ while (ctx->pos < i_size_read(inode)) {
+ blk = ctx->pos >> sb->s_blocksize_bits;
if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
/* Skip the corrupt dirblock and keep trying */
- *f_pos += sb->s_blocksize - offset;
+ ctx->pos += sb->s_blocksize - offset;
continue;
}
@@ -1897,7 +1875,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
ra_sectors = 8;
}
-revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
@@ -1917,93 +1894,64 @@ revalidate:
i += le16_to_cpu(de->rec_len);
}
offset = i;
- *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1))
+ ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
*f_version = inode->i_version;
}
- while (!error && *f_pos < i_size_read(inode)
+ while (ctx->pos < i_size_read(inode)
&& offset < sb->s_blocksize) {
de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
/* On error, skip the f_pos to the
next block. */
- *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1;
+ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
brelse(bh);
- goto out;
+ continue;
}
- offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- unsigned long version = *f_version;
unsigned char d_type = DT_UNKNOWN;
if (de->file_type < OCFS2_FT_MAX)
d_type = ocfs2_filetype_table[de->file_type];
- error = filldir(priv, de->name,
+ if (!dir_emit(ctx, de->name,
de->name_len,
- *f_pos,
le64_to_cpu(de->inode),
- d_type);
- if (error) {
- if (filldir_err)
- *filldir_err = error;
- break;
+ d_type)) {
+ brelse(bh);
+ return 0;
}
- if (version != *f_version)
- goto revalidate;
- stored ++;
+ stored++;
}
- *f_pos += le16_to_cpu(de->rec_len);
+ offset += le16_to_cpu(de->rec_len);
+ ctx->pos += le16_to_cpu(de->rec_len);
}
offset = 0;
brelse(bh);
bh = NULL;
+ if (!persist && stored)
+ break;
}
-
- stored = 0;
-out:
- return stored;
+ return 0;
}
static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
- loff_t *f_pos, void *priv, filldir_t filldir,
- int *filldir_err)
+ struct dir_context *ctx,
+ bool persist)
{
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv,
- filldir, filldir_err);
-
- return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir,
- filldir_err);
+ return ocfs2_dir_foreach_blk_id(inode, f_version, ctx);
+ return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist);
}
/*
* This is intended to be called from inside other kernel functions,
* so we fake some arguments.
*/
-int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
- filldir_t filldir)
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
{
- int ret = 0, filldir_err = 0;
u64 version = inode->i_version;
-
- while (*f_pos < i_size_read(inode)) {
- ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
- filldir, &filldir_err);
- if (ret || filldir_err)
- break;
- }
-
- if (ret > 0)
- ret = -EIO;
-
+ ocfs2_dir_foreach_blk(inode, &version, ctx, true);
return 0;
}
@@ -2011,15 +1959,15 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
* ocfs2_readdir()
*
*/
-int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int ocfs2_readdir(struct file *file, struct dir_context *ctx)
{
int error = 0;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
int lock_level = 0;
trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
- error = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
+ error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
if (lock_level && error >= 0) {
/* We release EX lock which used to update atime
* and get PR lock again to reduce contention
@@ -2035,8 +1983,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
goto bail_nolock;
}
- error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
- dirent, filldir, NULL);
+ error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
ocfs2_inode_unlock(inode, lock_level);
if (error)
@@ -2120,6 +2067,7 @@ bail:
}
struct ocfs2_empty_dir_priv {
+ struct dir_context ctx;
unsigned seen_dot;
unsigned seen_dot_dot;
unsigned seen_other;
@@ -2204,8 +2152,9 @@ out:
int ocfs2_empty_dir(struct inode *inode)
{
int ret;
- loff_t start = 0;
- struct ocfs2_empty_dir_priv priv;
+ struct ocfs2_empty_dir_priv priv = {
+ .ctx.actor = ocfs2_empty_dir_filldir
+ };
memset(&priv, 0, sizeof(priv));
@@ -2219,7 +2168,7 @@ int ocfs2_empty_dir(struct inode *inode)
*/
}
- ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
+ ret = ocfs2_dir_foreach(inode, &priv.ctx);
if (ret)
mlog_errno(ret);
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index e683f3deb645..f0344b75b14d 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -92,9 +92,8 @@ int ocfs2_find_files_on_disk(const char *name,
struct ocfs2_dir_lookup_result *res);
int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
int namelen, u64 *blkno);
-int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
-int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
- filldir_t filldir);
+int ocfs2_readdir(struct file *file, struct dir_context *ctx);
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx);
int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
struct inode *dir,
struct buffer_head *parent_fe_bh,
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 975810b98492..47e67c2d228f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -178,6 +178,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
lock->ml.node);
}
} else {
+ status = DLM_NORMAL;
dlm_lock_get(lock);
list_add_tail(&lock->list, &res->blocked);
kick_thread = 1;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index eeac97bb3bfa..773bd32bfd8c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -55,9 +55,6 @@
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_recovery_thread(void *data);
-void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
-int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
-void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
static int dlm_do_recovery(struct dlm_ctxt *dlm);
static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
@@ -789,7 +786,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
u8 dead_node)
{
struct dlm_lock_request lr;
- enum dlm_status ret;
+ int ret;
mlog(0, "\n");
@@ -802,7 +799,6 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
lr.dead_node = dead_node;
// send message
- ret = DLM_NOLOCKMGR;
ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
&lr, sizeof(lr), request_from, NULL);
@@ -1408,6 +1404,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
mres->lockname_len, mres->lockname);
ret = -EFAULT;
spin_unlock(&res->spinlock);
+ dlm_lockres_put(res);
goto leave;
}
res->state |= DLM_LOCK_RES_MIGRATING;
@@ -1498,10 +1495,8 @@ leave:
dlm_put(dlm);
if (ret < 0) {
- if (buf)
- kfree(buf);
- if (item)
- kfree(item);
+ kfree(buf);
+ kfree(item);
mlog_errno(ret);
}
@@ -2697,6 +2692,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
dlm->name, br->node_idx, br->dead_node,
dlm->reco.dead_node, dlm->reco.new_master);
spin_unlock(&dlm->spinlock);
+ dlm_put(dlm);
return -EAGAIN;
}
spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 12ae194ac943..3a44a648dae7 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2322,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
arg_flags, subclass, _RET_IP_);
if (status < 0) {
- if (status != -EAGAIN && status != -EIOCBRETRY)
+ if (status != -EAGAIN)
mlog_errno(status);
goto bail;
}
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 1c39efb71bab..2487116d0d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -790,7 +790,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
&hole_size, &rec, &is_last);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_unlock;
}
if (rec.e_blkno == 0ULL) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6474cb44004d..41000f223ca4 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2248,8 +2248,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
if (iocb->ki_left == 0)
return 0;
- sb_start_write(inode->i_sb);
-
appending = file->f_flags & O_APPEND ? 1 : 0;
direct_io = file->f_flags & O_DIRECT ? 1 : 0;
@@ -2290,7 +2288,7 @@ relock:
ret = ocfs2_inode_lock(inode, NULL, 1);
if (ret < 0) {
mlog_errno(ret);
- goto out_sems;
+ goto out;
}
ocfs2_inode_unlock(inode, 1);
@@ -2423,7 +2421,6 @@ out_sems:
ocfs2_iocb_clear_sem_locked(iocb);
mutex_unlock(&inode->i_mutex);
- sb_end_write(inode->i_sb);
if (written)
ret = written;
@@ -2468,8 +2465,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
out->f_path.dentry->d_name.len,
out->f_path.dentry->d_name.name, len);
- if (pipe->inode)
- mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
+ pipe_lock(pipe);
splice_from_pipe_begin(&sd);
do {
@@ -2489,8 +2485,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
} while (ret > 0);
splice_from_pipe_end(pipe, &sd);
- if (pipe->inode)
- mutex_unlock(&pipe->inode->i_mutex);
+ pipe_unlock(pipe);
if (sd.num_spliced)
ret = sd.num_spliced;
@@ -2651,17 +2646,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
goto out;
}
- if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
- ret = -EINVAL;
- if (!ret && offset > inode->i_sb->s_maxbytes)
- ret = -EINVAL;
- if (ret)
- goto out;
-
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
mutex_unlock(&inode->i_mutex);
@@ -2717,7 +2702,7 @@ const struct file_operations ocfs2_fops = {
const struct file_operations ocfs2_dops = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = ocfs2_readdir,
+ .iterate = ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
@@ -2764,7 +2749,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
const struct file_operations ocfs2_dops_no_plocks = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = ocfs2_readdir,
+ .iterate = ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 88924a3133fa..621fc73bf23d 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -147,8 +147,6 @@ void ocfs2_refresh_inode(struct inode *inode,
int ocfs2_mark_inode_dirty(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
-int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
-int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
struct buffer_head *ocfs2_bread(struct inode *inode,
int block, int *err, int reada);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 752f0b26221d..0c60ef2d8056 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -101,13 +101,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
if (!S_ISDIR(inode->i_mode))
flags &= ~OCFS2_DIRSYNC_FL;
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- mlog_errno(status);
- goto bail_unlock;
- }
-
oldflags = ocfs2_inode->ip_attr;
flags = flags & mask;
flags |= oldflags & ~mask;
@@ -120,7 +113,14 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
(OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
if (!capable(CAP_LINUX_IMMUTABLE))
- goto bail_commit;
+ goto bail_unlock;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto bail_unlock;
}
ocfs2_inode->ip_attr = flags;
@@ -130,8 +130,8 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
if (status < 0)
mlog_errno(status);
-bail_commit:
ocfs2_commit_trans(osb, handle);
+
bail_unlock:
ocfs2_inode_unlock(inode, 1);
bail:
@@ -706,8 +706,10 @@ int ocfs2_info_handle_freefrag(struct inode *inode,
o2info_set_request_filled(&oiff->iff_req);
- if (o2info_to_user(*oiff, req))
+ if (o2info_to_user(*oiff, req)) {
+ status = -EFAULT;
goto bail;
+ }
status = 0;
bail:
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 8eccfabcd12e..242170d83971 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1941,6 +1941,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
}
struct ocfs2_orphan_filldir_priv {
+ struct dir_context ctx;
struct inode *head;
struct ocfs2_super *osb;
};
@@ -1977,11 +1978,11 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
{
int status;
struct inode *orphan_dir_inode = NULL;
- struct ocfs2_orphan_filldir_priv priv;
- loff_t pos = 0;
-
- priv.osb = osb;
- priv.head = *head;
+ struct ocfs2_orphan_filldir_priv priv = {
+ .ctx.actor = ocfs2_orphan_filldir,
+ .osb = osb,
+ .head = *head
+ };
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
ORPHAN_DIR_SYSTEM_INODE,
@@ -1999,8 +2000,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
goto out;
}
- status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv,
- ocfs2_orphan_filldir);
+ status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx);
if (status) {
mlog_errno(status);
goto out_cluster;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index a3385b63ff5e..96f9ac237e86 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -200,7 +200,6 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
{
- atomic_set(&osb->needs_checkpoint, 1);
wake_up(&osb->checkpoint_event);
}
@@ -538,7 +537,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
- ocfs2_quota_trans_credits(sb);
+ ocfs2_quota_trans_credits(sb) + bits_wanted;
}
static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 9f8dcadd9a50..f1fc172175b6 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -471,7 +471,7 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
int ret, goal_bit = 0;
struct buffer_head *gd_bh = NULL;
- struct ocfs2_group_desc *bg = NULL;
+ struct ocfs2_group_desc *bg;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int c_to_b = 1 << (osb->s_clustersize_bits -
inode->i_sb->s_blocksize_bits);
@@ -482,13 +482,6 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
range->me_goal);
/*
- * moving goal is not allowd to start with a group desc blok(#0 blk)
- * let's compromise to the latter cluster.
- */
- if (range->me_goal == le64_to_cpu(bg->bg_blkno))
- range->me_goal += c_to_b;
-
- /*
* validate goal sits within global_bitmap, and return the victim
* group desc
*/
@@ -502,6 +495,13 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
bg = (struct ocfs2_group_desc *)gd_bh->b_data;
/*
+ * moving goal is not allowed to start with a group desc block (#0 blk)
+ * let's compromise to the latter cluster.
+ */
+ if (range->me_goal == le64_to_cpu(bg->bg_blkno))
+ range->me_goal += c_to_b;
+
+ /*
* movement is not gonna cross two groups.
*/
if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
@@ -1057,42 +1057,40 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
struct inode *inode = file_inode(filp);
struct ocfs2_move_extents range;
- struct ocfs2_move_extents_context *context = NULL;
+ struct ocfs2_move_extents_context *context;
+
+ if (!argp)
+ return -EINVAL;
status = mnt_want_write_file(filp);
if (status)
return status;
if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
- goto out;
+ goto out_drop;
if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
status = -EPERM;
- goto out;
+ goto out_drop;
}
context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
if (!context) {
status = -ENOMEM;
mlog_errno(status);
- goto out;
+ goto out_drop;
}
context->inode = inode;
context->file = filp;
- if (argp) {
- if (copy_from_user(&range, argp, sizeof(range))) {
- status = -EFAULT;
- goto out;
- }
- } else {
- status = -EINVAL;
- goto out;
+ if (copy_from_user(&range, argp, sizeof(range))) {
+ status = -EFAULT;
+ goto out_free;
}
if (range.me_start > i_size_read(inode))
- goto out;
+ goto out_free;
if (range.me_start + range.me_len > i_size_read(inode))
range.me_len = i_size_read(inode) - range.me_start;
@@ -1124,25 +1122,24 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
status = ocfs2_validate_and_adjust_move_goal(inode, &range);
if (status)
- goto out;
+ goto out_copy;
}
status = ocfs2_move_extents(context);
if (status)
mlog_errno(status);
-out:
+out_copy:
/*
* movement/defragmentation may end up being partially completed,
* that's the reason why we need to return userspace the finished
* length and new_offset even if failure happens somewhere.
*/
- if (argp) {
- if (copy_to_user(argp, &range, sizeof(range)))
- status = -EFAULT;
- }
+ if (copy_to_user(argp, &range, sizeof(range)))
+ status = -EFAULT;
+out_free:
kfree(context);
-
+out_drop:
mnt_drop_write_file(filp);
return status;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 04ee1b57c243..be3f8676a438 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -522,7 +522,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
fe->i_last_eb_blk = 0;
strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
- le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
+ fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
fe->i_atime = fe->i_ctime = fe->i_mtime =
cpu_to_le64(CURRENT_TIME.tv_sec);
fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
@@ -773,7 +773,7 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry)
return ret;
}
-static inline int inode_is_unlinkable(struct inode *inode)
+static inline int ocfs2_inode_is_unlinkable(struct inode *inode)
{
if (S_ISDIR(inode->i_mode)) {
if (inode->i_nlink == 2)
@@ -791,6 +791,7 @@ static int ocfs2_unlink(struct inode *dir,
{
int status;
int child_locked = 0;
+ bool is_unlinkable = false;
struct inode *inode = dentry->d_inode;
struct inode *orphan_dir = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -865,7 +866,7 @@ static int ocfs2_unlink(struct inode *dir,
goto leave;
}
- if (inode_is_unlinkable(inode)) {
+ if (ocfs2_inode_is_unlinkable(inode)) {
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
OCFS2_I(inode)->ip_blkno,
orphan_name, &orphan_insert);
@@ -873,6 +874,7 @@ static int ocfs2_unlink(struct inode *dir,
mlog_errno(status);
goto leave;
}
+ is_unlinkable = true;
}
handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
@@ -892,15 +894,6 @@ static int ocfs2_unlink(struct inode *dir,
fe = (struct ocfs2_dinode *) fe_bh->b_data;
- if (inode_is_unlinkable(inode)) {
- status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
- &orphan_insert, orphan_dir);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
- }
-
/* delete the name from the parent dir */
status = ocfs2_delete_entry(handle, dir, &lookup);
if (status < 0) {
@@ -923,6 +916,14 @@ static int ocfs2_unlink(struct inode *dir,
mlog_errno(status);
if (S_ISDIR(inode->i_mode))
inc_nlink(dir);
+ goto leave;
+ }
+
+ if (is_unlinkable) {
+ status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
+ orphan_name, &orphan_insert, orphan_dir);
+ if (status < 0)
+ mlog_errno(status);
}
leave:
@@ -947,7 +948,7 @@ leave:
ocfs2_free_dir_lookup_result(&orphan_insert);
ocfs2_free_dir_lookup_result(&lookup);
- if (status)
+ if (status && (status != -ENOTEMPTY))
mlog_errno(status);
return status;
@@ -2012,6 +2013,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
goto leave;
}
+ /*
+ * We're going to journal the change of i_flags and i_orphaned_slot.
+ * It's safe anyway, though some callers may duplicate the journaling.
+ * Journaling within the func just make the logic look more
+ * straightforward.
+ */
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(inode),
+ fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
/* we're a cluster, and nlink can change on disk from
* underneath us... */
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
@@ -2026,25 +2042,10 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
orphan_dir_bh, lookup);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto rollback;
}
- /*
- * We're going to journal the change of i_flags and i_orphaned_slot.
- * It's safe anyway, though some callers may duplicate the journaling.
- * Journaling within the func just make the logic look more
- * straightforward.
- */
- status = ocfs2_journal_access_di(handle,
- INODE_CACHE(inode),
- fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
+ fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
/* Record which orphan dir our inode now resides
@@ -2057,11 +2058,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
osb->slot_num);
+rollback:
+ if (status < 0) {
+ if (S_ISDIR(inode->i_mode))
+ ocfs2_add_links_count(orphan_fe, -1);
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+ }
+
leave:
brelse(orphan_dir_bh);
- if (status)
- mlog_errno(status);
return status;
}
@@ -2216,7 +2222,7 @@ out:
brelse(orphan_dir_bh);
- return 0;
+ return ret;
}
int ocfs2_create_inode_in_orphan(struct inode *dir,
@@ -2434,7 +2440,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
}
di = (struct ocfs2_dinode *)di_bh->b_data;
- le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
+ di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);
di->i_orphaned_slot = 0;
set_nlink(inode, 1);
ocfs2_set_links_count(di, inode->i_nlink);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d355e6e36b36..3a903470c794 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -347,7 +347,6 @@ struct ocfs2_super
struct task_struct *recovery_thread_task;
int disable_recovery;
wait_queue_head_t checkpoint_event;
- atomic_t needs_checkpoint;
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index b7e74b580c0f..5397c07ce608 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1422,7 +1422,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
int status;
/* there is a really tiny chance the journal calls could fail,
* but we wouldn't want inconsistent blocks in *any* case. */
- u64 fe_ptr, bg_ptr, prev_bg_ptr;
+ u64 bg_ptr, prev_bg_ptr;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
@@ -1437,51 +1437,44 @@ static int ocfs2_relink_block_group(handle_t *handle,
(unsigned long long)le64_to_cpu(bg->bg_blkno),
(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
- fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
bg_ptr = le64_to_cpu(bg->bg_next_group);
prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
prev_bg_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ if (status < 0)
+ goto out;
prev_bg->bg_next_group = bg->bg_next_group;
ocfs2_journal_dirty(handle, prev_bg_bh);
status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ if (status < 0)
+ goto out_rollback_prev_bg;
bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
ocfs2_journal_dirty(handle, bg_bh);
status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ if (status < 0)
+ goto out_rollback_bg;
fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
ocfs2_journal_dirty(handle, fe_bh);
-out_rollback:
- if (status < 0) {
- fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
- bg->bg_next_group = cpu_to_le64(bg_ptr);
- prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
- }
-
- if (status)
+out:
+ if (status < 0)
mlog_errno(status);
return status;
+
+out_rollback_bg:
+ bg->bg_next_group = cpu_to_le64(bg_ptr);
+out_rollback_prev_bg:
+ prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+ goto out;
}
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 01b85165552b..854d80955bf8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -286,10 +286,9 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
spin_unlock(&osb->osb_lock);
out += snprintf(buf + out, len - out,
- "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
+ "%10s => Pid: %d Interval: %lu\n", "Commit",
(osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
- osb->osb_commit_interval,
- atomic_read(&osb->needs_checkpoint));
+ osb->osb_commit_interval);
out += snprintf(buf + out, len - out,
"%10s => State: %d TxnId: %lu NumTxns: %d\n",
@@ -2154,7 +2153,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
init_waitqueue_head(&osb->checkpoint_event);
- atomic_set(&osb->needs_checkpoint, 0);
osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2e3ea308c144..317ef0abccbb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2751,7 +2751,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
{
int ret;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
struct ocfs2_xa_loc loc;
if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
@@ -2759,13 +2758,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
down_write(&oi->ip_alloc_sem);
if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
- if (!ocfs2_xattr_has_space_inline(inode, di)) {
- ret = -ENOSPC;
- goto out;
- }
- }
-
- if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
if (ret) {
if (ret != -ENOSPC)
@@ -6499,6 +6491,16 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
}
new_oi = OCFS2_I(args->new_inode);
+ /*
+ * Adjust extent record count to reserve space for extended attribute.
+ * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
+ */
+ if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
+ !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
+ struct ocfs2_extent_list *el = &new_di->id2.i_list;
+ le16_add_cpu(&el->l_count, -(inline_size /
+ sizeof(struct ocfs2_extent_rec)));
+ }
spin_lock(&new_oi->ip_lock);
new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index acbaebcad3a8..1b8e9e8405b2 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -327,26 +327,23 @@ int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
return is_bad;
}
-static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
+static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx,
u64 fsblock, int hindex)
{
- struct inode *dir = file_inode(filp);
- struct buffer_head *bh;
- struct omfs_inode *oi;
- u64 self;
- int res = 0;
- unsigned char d_type;
-
/* follow chain in this bucket */
while (fsblock != ~0) {
- bh = omfs_bread(dir->i_sb, fsblock);
+ struct buffer_head *bh = omfs_bread(dir->i_sb, fsblock);
+ struct omfs_inode *oi;
+ u64 self;
+ unsigned char d_type;
+
if (!bh)
- goto out;
+ return true;
oi = (struct omfs_inode *) bh->b_data;
if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) {
brelse(bh);
- goto out;
+ return true;
}
self = fsblock;
@@ -361,15 +358,16 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG;
- res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
- OMFS_NAMELEN), filp->f_pos, self, d_type);
+ if (!dir_emit(ctx, oi->i_name,
+ strnlen(oi->i_name, OMFS_NAMELEN),
+ self, d_type)) {
+ brelse(bh);
+ return false;
+ }
brelse(bh);
- if (res < 0)
- break;
- filp->f_pos++;
+ ctx->pos++;
}
-out:
- return res;
+ return true;
}
static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -403,60 +401,44 @@ out:
return err;
}
-static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int omfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *dir = file_inode(filp);
+ struct inode *dir = file_inode(file);
struct buffer_head *bh;
- loff_t offset, res;
+ __be64 *p;
unsigned int hchain, hindex;
int nbuckets;
- u64 fsblock;
- int ret = -EINVAL;
-
- if (filp->f_pos >> 32)
- goto success;
-
- switch ((unsigned long) filp->f_pos) {
- case 0:
- if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
- goto success;
- filp->f_pos++;
- /* fall through */
- case 1:
- if (filldir(dirent, "..", 2, 1,
- parent_ino(filp->f_dentry), DT_DIR) < 0)
- goto success;
- filp->f_pos = 1 << 20;
- /* fall through */
+
+ if (ctx->pos >> 32)
+ return -EINVAL;
+
+ if (ctx->pos < 1 << 20) {
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+ ctx->pos = 1 << 20;
}
nbuckets = (dir->i_size - OMFS_DIR_START) / 8;
/* high 12 bits store bucket + 1 and low 20 bits store hash index */
- hchain = (filp->f_pos >> 20) - 1;
- hindex = filp->f_pos & 0xfffff;
+ hchain = (ctx->pos >> 20) - 1;
+ hindex = ctx->pos & 0xfffff;
bh = omfs_bread(dir->i_sb, dir->i_ino);
if (!bh)
- goto out;
+ return -EINVAL;
- offset = OMFS_DIR_START + hchain * 8;
+ p = (__be64 *)(bh->b_data + OMFS_DIR_START) + hchain;
- for (; hchain < nbuckets; hchain++, offset += 8) {
- fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset]));
-
- res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex);
- hindex = 0;
- if (res < 0)
+ for (; hchain < nbuckets; hchain++) {
+ __u64 fsblock = be64_to_cpu(*p++);
+ if (!omfs_fill_chain(dir, ctx, fsblock, hindex))
break;
-
- filp->f_pos = (hchain+2) << 20;
+ hindex = 0;
+ ctx->pos = (hchain+2) << 20;
}
brelse(bh);
-success:
- ret = 0;
-out:
- return ret;
+ return 0;
}
const struct inode_operations omfs_dir_inops = {
@@ -470,6 +452,6 @@ const struct inode_operations omfs_dir_inops = {
const struct file_operations omfs_dir_operations = {
.read = generic_read_dir,
- .readdir = omfs_readdir,
+ .iterate = omfs_readdir,
.llseek = generic_file_llseek,
};
diff --git a/fs/open.c b/fs/open.c
index 68354466879f..fca72c4d3f17 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -197,10 +197,7 @@ out:
SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
{
- long ret = do_sys_ftruncate(fd, length, 1);
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(2, ret, fd, length);
- return ret;
+ return do_sys_ftruncate(fd, length, 1);
}
#ifdef CONFIG_COMPAT
@@ -212,32 +209,15 @@ COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
/* LFS versions of truncate are only needed on 32 bit machines */
#if BITS_PER_LONG == 32
-SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length)
+SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
{
return do_sys_truncate(path, length);
}
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_truncate64(long path, loff_t length)
-{
- return SYSC_truncate64((const char __user *) path, length);
-}
-SYSCALL_ALIAS(sys_truncate64, SyS_truncate64);
-#endif
-SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length)
-{
- long ret = do_sys_ftruncate(fd, length, 0);
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(2, ret, fd, length);
- return ret;
-}
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_ftruncate64(long fd, loff_t length)
+SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
{
- return SYSC_ftruncate64((unsigned int) fd, length);
+ return do_sys_ftruncate(fd, length, 0);
}
-SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64);
-#endif
#endif /* BITS_PER_LONG == 32 */
@@ -299,7 +279,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
return ret;
}
-SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
+SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
{
struct fd f = fdget(fd);
int error = -EBADF;
@@ -311,14 +291,6 @@ SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
return error;
}
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len)
-{
- return SYSC_fallocate((int)fd, (int)mode, offset, len);
-}
-SYSCALL_ALIAS(sys_fallocate, SyS_fallocate);
-#endif
-
/*
* access() needs to use the real uid/gid, not the effective uid/gid.
* We do this by temporarily clearing all FS-related capabilities and
@@ -868,11 +840,15 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
if (flags & __O_SYNC)
flags |= O_DSYNC;
- /*
- * If we have O_PATH in the open flag. Then we
- * cannot have anything other than the below set of flags
- */
- if (flags & O_PATH) {
+ if (flags & O_TMPFILE) {
+ if (!(flags & O_CREAT))
+ return -EINVAL;
+ acc_mode = MAY_OPEN | ACC_MODE(flags);
+ } else if (flags & O_PATH) {
+ /*
+ * If we have O_PATH in the open flag. Then we
+ * cannot have anything other than the below set of flags
+ */
flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
acc_mode = 0;
} else {
@@ -904,7 +880,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
lookup_flags |= LOOKUP_DIRECTORY;
if (!(flags & O_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
- return lookup_flags;
+ op->lookup_flags = lookup_flags;
+ return 0;
}
/**
@@ -921,8 +898,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
struct file *file_open_name(struct filename *name, int flags, umode_t mode)
{
struct open_flags op;
- int lookup = build_open_flags(flags, mode, &op);
- return do_filp_open(AT_FDCWD, name, &op, lookup);
+ int err = build_open_flags(flags, mode, &op);
+ return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op);
}
/**
@@ -947,65 +924,61 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
const char *filename, int flags)
{
struct open_flags op;
- int lookup = build_open_flags(flags, 0, &op);
+ int err = build_open_flags(flags, 0, &op);
+ if (err)
+ return ERR_PTR(err);
if (flags & O_CREAT)
return ERR_PTR(-EINVAL);
if (!filename && (flags & O_DIRECTORY))
if (!dentry->d_inode->i_op->lookup)
return ERR_PTR(-ENOTDIR);
- return do_file_open_root(dentry, mnt, filename, &op, lookup);
+ return do_file_open_root(dentry, mnt, filename, &op);
}
EXPORT_SYMBOL(file_open_root);
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_flags op;
- int lookup = build_open_flags(flags, mode, &op);
- struct filename *tmp = getname(filename);
- int fd = PTR_ERR(tmp);
-
- if (!IS_ERR(tmp)) {
- fd = get_unused_fd_flags(flags);
- if (fd >= 0) {
- struct file *f = do_filp_open(dfd, tmp, &op, lookup);
- if (IS_ERR(f)) {
- put_unused_fd(fd);
- fd = PTR_ERR(f);
- } else {
- fsnotify_open(f);
- fd_install(fd, f);
- }
+ int fd = build_open_flags(flags, mode, &op);
+ struct filename *tmp;
+
+ if (fd)
+ return fd;
+
+ tmp = getname(filename);
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+
+ fd = get_unused_fd_flags(flags);
+ if (fd >= 0) {
+ struct file *f = do_filp_open(dfd, tmp, &op);
+ if (IS_ERR(f)) {
+ put_unused_fd(fd);
+ fd = PTR_ERR(f);
+ } else {
+ fsnotify_open(f);
+ fd_install(fd, f);
}
- putname(tmp);
}
+ putname(tmp);
return fd;
}
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
- long ret;
-
if (force_o_largefile())
flags |= O_LARGEFILE;
- ret = do_sys_open(AT_FDCWD, filename, flags, mode);
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(3, ret, filename, flags, mode);
- return ret;
+ return do_sys_open(AT_FDCWD, filename, flags, mode);
}
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
umode_t, mode)
{
- long ret;
-
if (force_o_largefile())
flags |= O_LARGEFILE;
- ret = do_sys_open(dfd, filename, flags, mode);
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(4, ret, dfd, filename, flags, mode);
- return ret;
+ return do_sys_open(dfd, filename, flags, mode);
}
#ifndef __alpha__
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 75885ffde44e..8c0ceb8dd1f7 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -162,11 +162,11 @@ static const struct file_operations openpromfs_prop_ops = {
.release = seq_release,
};
-static int openpromfs_readdir(struct file *, void *, filldir_t);
+static int openpromfs_readdir(struct file *, struct dir_context *);
static const struct file_operations openprom_operations = {
.read = generic_read_dir,
- .readdir = openpromfs_readdir,
+ .iterate = openpromfs_readdir,
.llseek = generic_file_llseek,
};
@@ -260,71 +260,64 @@ found:
return NULL;
}
-static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int openpromfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct op_inode_info *oi = OP_I(inode);
struct device_node *dp = oi->u.node;
struct device_node *child;
struct property *prop;
- unsigned int ino;
int i;
mutex_lock(&op_mutex);
- ino = inode->i_ino;
- i = filp->f_pos;
- switch (i) {
- case 0:
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+ if (ctx->pos == 0) {
+ if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
goto out;
- i++;
- filp->f_pos++;
- /* fall thru */
- case 1:
- if (filldir(dirent, "..", 2, i,
+ ctx->pos = 1;
+ }
+ if (ctx->pos == 1) {
+ if (!dir_emit(ctx, "..", 2,
(dp->parent == NULL ?
OPENPROM_ROOT_INO :
- dp->parent->unique_id), DT_DIR) < 0)
+ dp->parent->unique_id), DT_DIR))
goto out;
- i++;
- filp->f_pos++;
- /* fall thru */
- default:
- i -= 2;
-
- /* First, the children nodes as directories. */
- child = dp->child;
- while (i && child) {
- child = child->sibling;
- i--;
- }
- while (child) {
- if (filldir(dirent,
- child->path_component_name,
- strlen(child->path_component_name),
- filp->f_pos, child->unique_id, DT_DIR) < 0)
- goto out;
-
- filp->f_pos++;
- child = child->sibling;
- }
+ ctx->pos = 2;
+ }
+ i = ctx->pos - 2;
- /* Next, the properties as files. */
- prop = dp->properties;
- while (i && prop) {
- prop = prop->next;
- i--;
- }
- while (prop) {
- if (filldir(dirent, prop->name, strlen(prop->name),
- filp->f_pos, prop->unique_id, DT_REG) < 0)
- goto out;
+ /* First, the children nodes as directories. */
+ child = dp->child;
+ while (i && child) {
+ child = child->sibling;
+ i--;
+ }
+ while (child) {
+ if (!dir_emit(ctx,
+ child->path_component_name,
+ strlen(child->path_component_name),
+ child->unique_id, DT_DIR))
+ goto out;
- filp->f_pos++;
- prop = prop->next;
- }
+ ctx->pos++;
+ child = child->sibling;
+ }
+
+ /* Next, the properties as files. */
+ prop = dp->properties;
+ while (i && prop) {
+ prop = prop->next;
+ i--;
}
+ while (prop) {
+ if (!dir_emit(ctx, prop->name, strlen(prop->name),
+ prop->unique_id, DT_REG))
+ goto out;
+
+ ctx->pos++;
+ prop = prop->next;
+ }
+
out:
mutex_unlock(&op_mutex);
return 0;
diff --git a/fs/pipe.c b/fs/pipe.c
index 2234f3f61f8d..d2c45e14e6d8 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,10 +21,13 @@
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include "internal.h"
+
/*
* The max size that a non-root user is allowed to grow the pipe. Can
* be set by root in /proc/sys/fs/pipe-max-size
@@ -53,8 +56,8 @@ unsigned int pipe_min_size = PAGE_SIZE;
static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
- if (pipe->inode)
- mutex_lock_nested(&pipe->inode->i_mutex, subclass);
+ if (pipe->files)
+ mutex_lock_nested(&pipe->mutex, subclass);
}
void pipe_lock(struct pipe_inode_info *pipe)
@@ -68,11 +71,21 @@ EXPORT_SYMBOL(pipe_lock);
void pipe_unlock(struct pipe_inode_info *pipe)
{
- if (pipe->inode)
- mutex_unlock(&pipe->inode->i_mutex);
+ if (pipe->files)
+ mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);
+static inline void __pipe_lock(struct pipe_inode_info *pipe)
+{
+ mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
+}
+
+static inline void __pipe_unlock(struct pipe_inode_info *pipe)
+{
+ mutex_unlock(&pipe->mutex);
+}
+
void pipe_double_lock(struct pipe_inode_info *pipe1,
struct pipe_inode_info *pipe2)
{
@@ -361,8 +374,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
unsigned long nr_segs, loff_t pos)
{
struct file *filp = iocb->ki_filp;
- struct inode *inode = file_inode(filp);
- struct pipe_inode_info *pipe;
+ struct pipe_inode_info *pipe = filp->private_data;
int do_wakeup;
ssize_t ret;
struct iovec *iov = (struct iovec *)_iov;
@@ -375,8 +387,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
do_wakeup = 0;
ret = 0;
- mutex_lock(&inode->i_mutex);
- pipe = inode->i_pipe;
+ __pipe_lock(pipe);
for (;;) {
int bufs = pipe->nrbufs;
if (bufs) {
@@ -464,7 +475,7 @@ redo:
}
pipe_wait(pipe);
}
- mutex_unlock(&inode->i_mutex);
+ __pipe_unlock(pipe);
/* Signal writers asynchronously that there is more room. */
if (do_wakeup) {
@@ -486,8 +497,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
unsigned long nr_segs, loff_t ppos)
{
struct file *filp = iocb->ki_filp;
- struct inode *inode = file_inode(filp);
- struct pipe_inode_info *pipe;
+ struct pipe_inode_info *pipe = filp->private_data;
ssize_t ret;
int do_wakeup;
struct iovec *iov = (struct iovec *)_iov;
@@ -501,8 +511,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
do_wakeup = 0;
ret = 0;
- mutex_lock(&inode->i_mutex);
- pipe = inode->i_pipe;
+ __pipe_lock(pipe);
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
@@ -649,7 +658,7 @@ redo2:
pipe->waiting_writers--;
}
out:
- mutex_unlock(&inode->i_mutex);
+ __pipe_unlock(pipe);
if (do_wakeup) {
wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
@@ -662,29 +671,14 @@ out:
return ret;
}
-static ssize_t
-bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
-{
- return -EBADF;
-}
-
-static ssize_t
-bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
- loff_t *ppos)
-{
- return -EBADF;
-}
-
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = file_inode(filp);
- struct pipe_inode_info *pipe;
+ struct pipe_inode_info *pipe = filp->private_data;
int count, buf, nrbufs;
switch (cmd) {
case FIONREAD:
- mutex_lock(&inode->i_mutex);
- pipe = inode->i_pipe;
+ __pipe_lock(pipe);
count = 0;
buf = pipe->curbuf;
nrbufs = pipe->nrbufs;
@@ -692,7 +686,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
count += pipe->bufs[buf].len;
buf = (buf+1) & (pipe->buffers - 1);
}
- mutex_unlock(&inode->i_mutex);
+ __pipe_unlock(pipe);
return put_user(count, (int __user *)arg);
default:
@@ -705,8 +699,7 @@ static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
unsigned int mask;
- struct inode *inode = file_inode(filp);
- struct pipe_inode_info *pipe = inode->i_pipe;
+ struct pipe_inode_info *pipe = filp->private_data;
int nrbufs;
poll_wait(filp, &pipe->wait, wait);
@@ -734,197 +727,56 @@ pipe_poll(struct file *filp, poll_table *wait)
}
static int
-pipe_release(struct inode *inode, int decr, int decw)
+pipe_release(struct inode *inode, struct file *file)
{
- struct pipe_inode_info *pipe;
+ struct pipe_inode_info *pipe = inode->i_pipe;
+ int kill = 0;
- mutex_lock(&inode->i_mutex);
- pipe = inode->i_pipe;
- pipe->readers -= decr;
- pipe->writers -= decw;
+ __pipe_lock(pipe);
+ if (file->f_mode & FMODE_READ)
+ pipe->readers--;
+ if (file->f_mode & FMODE_WRITE)
+ pipe->writers--;
- if (!pipe->readers && !pipe->writers) {
- free_pipe_info(inode);
- } else {
+ if (pipe->readers || pipe->writers) {
wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
- mutex_unlock(&inode->i_mutex);
-
- return 0;
-}
-
-static int
-pipe_read_fasync(int fd, struct file *filp, int on)
-{
- struct inode *inode = file_inode(filp);
- int retval;
-
- mutex_lock(&inode->i_mutex);
- retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
- mutex_unlock(&inode->i_mutex);
-
- return retval;
-}
-
-
-static int
-pipe_write_fasync(int fd, struct file *filp, int on)
-{
- struct inode *inode = file_inode(filp);
- int retval;
+ spin_lock(&inode->i_lock);
+ if (!--pipe->files) {
+ inode->i_pipe = NULL;
+ kill = 1;
+ }
+ spin_unlock(&inode->i_lock);
+ __pipe_unlock(pipe);
- mutex_lock(&inode->i_mutex);
- retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
- mutex_unlock(&inode->i_mutex);
+ if (kill)
+ free_pipe_info(pipe);
- return retval;
+ return 0;
}
-
static int
-pipe_rdwr_fasync(int fd, struct file *filp, int on)
+pipe_fasync(int fd, struct file *filp, int on)
{
- struct inode *inode = file_inode(filp);
- struct pipe_inode_info *pipe = inode->i_pipe;
- int retval;
+ struct pipe_inode_info *pipe = filp->private_data;
+ int retval = 0;
- mutex_lock(&inode->i_mutex);
- retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
- if (retval >= 0) {
+ __pipe_lock(pipe);
+ if (filp->f_mode & FMODE_READ)
+ retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
+ if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
- if (retval < 0) /* this can happen only if on == T */
+ if (retval < 0 && (filp->f_mode & FMODE_READ))
+ /* this can happen only if on == T */
fasync_helper(-1, filp, 0, &pipe->fasync_readers);
}
- mutex_unlock(&inode->i_mutex);
+ __pipe_unlock(pipe);
return retval;
}
-
-static int
-pipe_read_release(struct inode *inode, struct file *filp)
-{
- return pipe_release(inode, 1, 0);
-}
-
-static int
-pipe_write_release(struct inode *inode, struct file *filp)
-{
- return pipe_release(inode, 0, 1);
-}
-
-static int
-pipe_rdwr_release(struct inode *inode, struct file *filp)
-{
- int decr, decw;
-
- decr = (filp->f_mode & FMODE_READ) != 0;
- decw = (filp->f_mode & FMODE_WRITE) != 0;
- return pipe_release(inode, decr, decw);
-}
-
-static int
-pipe_read_open(struct inode *inode, struct file *filp)
-{
- int ret = -ENOENT;
-
- mutex_lock(&inode->i_mutex);
-
- if (inode->i_pipe) {
- ret = 0;
- inode->i_pipe->readers++;
- }
-
- mutex_unlock(&inode->i_mutex);
-
- return ret;
-}
-
-static int
-pipe_write_open(struct inode *inode, struct file *filp)
-{
- int ret = -ENOENT;
-
- mutex_lock(&inode->i_mutex);
-
- if (inode->i_pipe) {
- ret = 0;
- inode->i_pipe->writers++;
- }
-
- mutex_unlock(&inode->i_mutex);
-
- return ret;
-}
-
-static int
-pipe_rdwr_open(struct inode *inode, struct file *filp)
-{
- int ret = -ENOENT;
-
- if (!(filp->f_mode & (FMODE_READ|FMODE_WRITE)))
- return -EINVAL;
-
- mutex_lock(&inode->i_mutex);
-
- if (inode->i_pipe) {
- ret = 0;
- if (filp->f_mode & FMODE_READ)
- inode->i_pipe->readers++;
- if (filp->f_mode & FMODE_WRITE)
- inode->i_pipe->writers++;
- }
-
- mutex_unlock(&inode->i_mutex);
-
- return ret;
-}
-
-/*
- * The file_operations structs are not static because they
- * are also used in linux/fs/fifo.c to do operations on FIFOs.
- *
- * Pipes reuse fifos' file_operations structs.
- */
-const struct file_operations read_pipefifo_fops = {
- .llseek = no_llseek,
- .read = do_sync_read,
- .aio_read = pipe_read,
- .write = bad_pipe_w,
- .poll = pipe_poll,
- .unlocked_ioctl = pipe_ioctl,
- .open = pipe_read_open,
- .release = pipe_read_release,
- .fasync = pipe_read_fasync,
-};
-
-const struct file_operations write_pipefifo_fops = {
- .llseek = no_llseek,
- .read = bad_pipe_r,
- .write = do_sync_write,
- .aio_write = pipe_write,
- .poll = pipe_poll,
- .unlocked_ioctl = pipe_ioctl,
- .open = pipe_write_open,
- .release = pipe_write_release,
- .fasync = pipe_write_fasync,
-};
-
-const struct file_operations rdwr_pipefifo_fops = {
- .llseek = no_llseek,
- .read = do_sync_read,
- .aio_read = pipe_read,
- .write = do_sync_write,
- .aio_write = pipe_write,
- .poll = pipe_poll,
- .unlocked_ioctl = pipe_ioctl,
- .open = pipe_rdwr_open,
- .release = pipe_rdwr_release,
- .fasync = pipe_rdwr_fasync,
-};
-
-struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
+struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
@@ -934,8 +786,8 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
if (pipe->bufs) {
init_waitqueue_head(&pipe->wait);
pipe->r_counter = pipe->w_counter = 1;
- pipe->inode = inode;
pipe->buffers = PIPE_DEF_BUFFERS;
+ mutex_init(&pipe->mutex);
return pipe;
}
kfree(pipe);
@@ -944,7 +796,7 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
return NULL;
}
-void __free_pipe_info(struct pipe_inode_info *pipe)
+void free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
@@ -959,12 +811,6 @@ void __free_pipe_info(struct pipe_inode_info *pipe)
kfree(pipe);
}
-void free_pipe_info(struct inode *inode)
-{
- __free_pipe_info(inode->i_pipe);
- inode->i_pipe = NULL;
-}
-
static struct vfsmount *pipe_mnt __read_mostly;
/*
@@ -990,13 +836,14 @@ static struct inode * get_pipe_inode(void)
inode->i_ino = get_next_ino();
- pipe = alloc_pipe_info(inode);
+ pipe = alloc_pipe_info();
if (!pipe)
goto fail_iput;
- inode->i_pipe = pipe;
+ inode->i_pipe = pipe;
+ pipe->files = 2;
pipe->readers = pipe->writers = 1;
- inode->i_fop = &rdwr_pipefifo_fops;
+ inode->i_fop = &pipefifo_fops;
/*
* Mark the inode dirty from the very beginning,
@@ -1039,17 +886,19 @@ int create_pipe_files(struct file **res, int flags)
d_instantiate(path.dentry, inode);
err = -ENFILE;
- f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
+ f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
if (IS_ERR(f))
goto err_dentry;
f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
+ f->private_data = inode->i_pipe;
- res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops);
+ res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
if (IS_ERR(res[0]))
goto err_file;
path_get(&path);
+ res[0]->private_data = inode->i_pipe;
res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
res[1] = f;
return 0;
@@ -1057,12 +906,12 @@ int create_pipe_files(struct file **res, int flags)
err_file:
put_filp(f);
err_dentry:
- free_pipe_info(inode);
+ free_pipe_info(inode->i_pipe);
path_put(&path);
return err;
err_inode:
- free_pipe_info(inode);
+ free_pipe_info(inode->i_pipe);
iput(inode);
return err;
}
@@ -1144,6 +993,168 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
return sys_pipe2(fildes, 0);
}
+static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
+{
+ int cur = *cnt;
+
+ while (cur == *cnt) {
+ pipe_wait(pipe);
+ if (signal_pending(current))
+ break;
+ }
+ return cur == *cnt ? -ERESTARTSYS : 0;
+}
+
+static void wake_up_partner(struct pipe_inode_info *pipe)
+{
+ wake_up_interruptible(&pipe->wait);
+}
+
+static int fifo_open(struct inode *inode, struct file *filp)
+{
+ struct pipe_inode_info *pipe;
+ bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
+ int kill = 0;
+ int ret;
+
+ filp->f_version = 0;
+
+ spin_lock(&inode->i_lock);
+ if (inode->i_pipe) {
+ pipe = inode->i_pipe;
+ pipe->files++;
+ spin_unlock(&inode->i_lock);
+ } else {
+ spin_unlock(&inode->i_lock);
+ pipe = alloc_pipe_info();
+ if (!pipe)
+ return -ENOMEM;
+ pipe->files = 1;
+ spin_lock(&inode->i_lock);
+ if (unlikely(inode->i_pipe)) {
+ inode->i_pipe->files++;
+ spin_unlock(&inode->i_lock);
+ free_pipe_info(pipe);
+ pipe = inode->i_pipe;
+ } else {
+ inode->i_pipe = pipe;
+ spin_unlock(&inode->i_lock);
+ }
+ }
+ filp->private_data = pipe;
+ /* OK, we have a pipe and it's pinned down */
+
+ __pipe_lock(pipe);
+
+ /* We can only do regular read/write on fifos */
+ filp->f_mode &= (FMODE_READ | FMODE_WRITE);
+
+ switch (filp->f_mode) {
+ case FMODE_READ:
+ /*
+ * O_RDONLY
+ * POSIX.1 says that O_NONBLOCK means return with the FIFO
+ * opened, even when there is no process writing the FIFO.
+ */
+ pipe->r_counter++;
+ if (pipe->readers++ == 0)
+ wake_up_partner(pipe);
+
+ if (!is_pipe && !pipe->writers) {
+ if ((filp->f_flags & O_NONBLOCK)) {
+ /* suppress POLLHUP until we have
+ * seen a writer */
+ filp->f_version = pipe->w_counter;
+ } else {
+ if (wait_for_partner(pipe, &pipe->w_counter))
+ goto err_rd;
+ }
+ }
+ break;
+
+ case FMODE_WRITE:
+ /*
+ * O_WRONLY
+ * POSIX.1 says that O_NONBLOCK means return -1 with
+ * errno=ENXIO when there is no process reading the FIFO.
+ */
+ ret = -ENXIO;
+ if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
+ goto err;
+
+ pipe->w_counter++;
+ if (!pipe->writers++)
+ wake_up_partner(pipe);
+
+ if (!is_pipe && !pipe->readers) {
+ if (wait_for_partner(pipe, &pipe->r_counter))
+ goto err_wr;
+ }
+ break;
+
+ case FMODE_READ | FMODE_WRITE:
+ /*
+ * O_RDWR
+ * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
+ * This implementation will NEVER block on a O_RDWR open, since
+ * the process can at least talk to itself.
+ */
+
+ pipe->readers++;
+ pipe->writers++;
+ pipe->r_counter++;
+ pipe->w_counter++;
+ if (pipe->readers == 1 || pipe->writers == 1)
+ wake_up_partner(pipe);
+ break;
+
+ default:
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /* Ok! */
+ __pipe_unlock(pipe);
+ return 0;
+
+err_rd:
+ if (!--pipe->readers)
+ wake_up_interruptible(&pipe->wait);
+ ret = -ERESTARTSYS;
+ goto err;
+
+err_wr:
+ if (!--pipe->writers)
+ wake_up_interruptible(&pipe->wait);
+ ret = -ERESTARTSYS;
+ goto err;
+
+err:
+ spin_lock(&inode->i_lock);
+ if (!--pipe->files) {
+ inode->i_pipe = NULL;
+ kill = 1;
+ }
+ spin_unlock(&inode->i_lock);
+ __pipe_unlock(pipe);
+ if (kill)
+ free_pipe_info(pipe);
+ return ret;
+}
+
+const struct file_operations pipefifo_fops = {
+ .open = fifo_open,
+ .llseek = no_llseek,
+ .read = do_sync_read,
+ .aio_read = pipe_read,
+ .write = do_sync_write,
+ .aio_write = pipe_write,
+ .poll = pipe_poll,
+ .unlocked_ioctl = pipe_ioctl,
+ .release = pipe_release,
+ .fasync = pipe_fasync,
+};
+
/*
* Allocate a new array of pipe buffers and copy the info over. Returns the
* pipe size if successful, or return -ERROR on error.
@@ -1229,9 +1240,7 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
*/
struct pipe_inode_info *get_pipe_info(struct file *file)
{
- struct inode *i = file_inode(file);
-
- return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
+ return file->f_op == &pipefifo_fops ? file->private_data : NULL;
}
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1243,7 +1252,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
if (!pipe)
return -EBADF;
- mutex_lock(&pipe->inode->i_mutex);
+ __pipe_lock(pipe);
switch (cmd) {
case F_SETPIPE_SZ: {
@@ -1272,7 +1281,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
}
out:
- mutex_unlock(&pipe->inode->i_mutex);
+ __pipe_unlock(pipe);
return ret;
}
diff --git a/fs/pnode.c b/fs/pnode.c
index 8b29d2164da6..9af0df15256e 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -83,7 +83,8 @@ static int do_make_slave(struct mount *mnt)
if (peer_mnt == mnt)
peer_mnt = NULL;
}
- if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share))
+ if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) &&
+ list_empty(&mnt->mnt_share))
mnt_release_group_id(mnt);
list_del_init(&mnt->mnt_share);
@@ -218,7 +219,7 @@ static struct mount *get_source(struct mount *dest,
* @source_mnt: source mount.
* @tree_list : list of heads of trees to be attached.
*/
-int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
+int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
struct mount *source_mnt, struct list_head *tree_list)
{
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
@@ -227,7 +228,6 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
struct mount *prev_dest_mnt = dest_mnt;
struct mount *prev_src_mnt = source_mnt;
LIST_HEAD(tmp_list);
- LIST_HEAD(umount_list);
for (m = propagation_next(dest_mnt, dest_mnt); m;
m = propagation_next(m, dest_mnt)) {
@@ -250,8 +250,8 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
goto out;
}
- if (is_subdir(dest_dentry, m->mnt.mnt_root)) {
- mnt_set_mountpoint(m, dest_dentry, child);
+ if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) {
+ mnt_set_mountpoint(m, dest_mp, child);
list_add_tail(&child->mnt_hash, tree_list);
} else {
/*
@@ -267,10 +267,9 @@ out:
br_write_lock(&vfsmount_lock);
while (!list_empty(&tmp_list)) {
child = list_first_entry(&tmp_list, struct mount, mnt_hash);
- umount_tree(child, 0, &umount_list);
+ umount_tree(child, 0);
}
br_write_unlock(&vfsmount_lock);
- release_mounts(&umount_list);
return ret;
}
diff --git a/fs/pnode.h b/fs/pnode.h
index a0493d5ebfbf..b091445c1c4a 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -32,17 +32,16 @@ static inline void set_mnt_shared(struct mount *mnt)
}
void change_mnt_propagation(struct mount *, int);
-int propagate_mnt(struct mount *, struct dentry *, struct mount *,
+int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
struct list_head *);
int propagate_umount(struct list_head *);
int propagate_mount_busy(struct mount *, int);
void mnt_release_group_id(struct mount *);
int get_dominating_id(struct mount *mnt, const struct path *root);
unsigned int mnt_get_count(struct mount *mnt);
-void mnt_set_mountpoint(struct mount *, struct dentry *,
+void mnt_set_mountpoint(struct mount *, struct mountpoint *,
struct mount *);
-void release_mounts(struct list_head *);
-void umount_tree(struct mount *, int, struct list_head *);
+void umount_tree(struct mount *, int);
struct mount *copy_tree(struct mount *, struct dentry *, int);
bool is_path_reachable(struct mount *, struct dentry *,
const struct path *root);
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 712f24db9600..ab30716584f5 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -5,7 +5,7 @@
obj-y += proc.o
proc-y := nommu.o task_nommu.o
-proc-$(CONFIG_MMU) := mmu.o task_mmu.o
+proc-$(CONFIG_MMU) := task_mmu.o
proc-y += inode.o root.o base.o generic.o array.o \
fd.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 69078c7cef1f..1485e38daaa3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -86,6 +86,7 @@
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/flex_array.h>
+#include <linux/posix-timers.h>
#ifdef CONFIG_HARDWALL
#include <asm/hardwall.h>
#endif
@@ -404,6 +405,37 @@ static const struct file_operations proc_lstats_operations = {
#endif
+#ifdef CONFIG_CGROUPS
+static int cgroup_open(struct inode *inode, struct file *file)
+{
+ struct pid *pid = PROC_I(inode)->pid;
+ return single_open(file, proc_cgroup_show, pid);
+}
+
+static const struct file_operations proc_cgroup_operations = {
+ .open = cgroup_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
+#ifdef CONFIG_PROC_PID_CPUSET
+
+static int cpuset_open(struct inode *inode, struct file *file)
+{
+ struct pid *pid = PROC_I(inode)->pid;
+ return single_open(file, proc_cpuset_show, pid);
+}
+
+static const struct file_operations proc_cpuset_operations = {
+ .open = cpuset_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
static int proc_oom_score(struct task_struct *task, char *buffer)
{
unsigned long totalpages = totalram_pages + total_swap_pages;
@@ -1347,11 +1379,10 @@ static ssize_t comm_write(struct file *file, const char __user *buf,
struct inode *inode = file_inode(file);
struct task_struct *p;
char buffer[TASK_COMM_LEN];
+ const size_t maxlen = sizeof(buffer) - 1;
memset(buffer, 0, sizeof(buffer));
- if (count > sizeof(buffer) - 1)
- count = sizeof(buffer) - 1;
- if (copy_from_user(buffer, buf, count))
+ if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
return -EFAULT;
p = get_proc_task(inode);
@@ -1621,6 +1652,15 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
return 0;
}
+int pid_delete_dentry(const struct dentry *dentry)
+{
+ /* Is the task we represent dead?
+ * If so, then don't put the dentry on the lru list,
+ * kill it immediately.
+ */
+ return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
+}
+
const struct dentry_operations pid_dentry_operations =
{
.d_revalidate = pid_revalidate,
@@ -1641,46 +1681,34 @@ const struct dentry_operations pid_dentry_operations =
* reported by readdir in sync with the inode numbers reported
* by stat.
*/
-int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+bool proc_fill_cache(struct file *file, struct dir_context *ctx,
const char *name, int len,
instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
- struct dentry *child, *dir = filp->f_path.dentry;
+ struct dentry *child, *dir = file->f_path.dentry;
+ struct qstr qname = QSTR_INIT(name, len);
struct inode *inode;
- struct qstr qname;
- ino_t ino = 0;
- unsigned type = DT_UNKNOWN;
-
- qname.name = name;
- qname.len = len;
- qname.hash = full_name_hash(name, len);
+ unsigned type;
+ ino_t ino;
- child = d_lookup(dir, &qname);
+ child = d_hash_and_lookup(dir, &qname);
if (!child) {
- struct dentry *new;
- new = d_alloc(dir, &qname);
- if (new) {
- child = instantiate(dir->d_inode, new, task, ptr);
- if (child)
- dput(new);
- else
- child = new;
+ child = d_alloc(dir, &qname);
+ if (!child)
+ goto end_instantiate;
+ if (instantiate(dir->d_inode, child, task, ptr) < 0) {
+ dput(child);
+ goto end_instantiate;
}
}
- if (!child || IS_ERR(child) || !child->d_inode)
- goto end_instantiate;
inode = child->d_inode;
- if (inode) {
- ino = inode->i_ino;
- type = inode->i_mode >> 12;
- }
+ ino = inode->i_ino;
+ type = inode->i_mode >> 12;
dput(child);
+ return dir_emit(ctx, name, len, ino, type);
+
end_instantiate:
- if (!ino)
- ino = find_inode_number(dir, &qname);
- if (!ino)
- ino = 1;
- return filldir(dirent, name, len, filp->f_pos, ino, type);
+ return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}
#ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1806,7 +1834,7 @@ struct map_files_info {
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};
-static struct dentry *
+static int
proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
struct task_struct *task, const void *ptr)
{
@@ -1816,7 +1844,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
inode = proc_pid_make_inode(dir->i_sb, task);
if (!inode)
- return ERR_PTR(-ENOENT);
+ return -ENOENT;
ei = PROC_I(inode);
ei->op.proc_get_link = proc_map_files_get_link;
@@ -1833,7 +1861,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
d_set_d_op(dentry, &tid_map_files_dentry_operations);
d_add(dentry, inode);
- return NULL;
+ return 0;
}
static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -1842,23 +1870,23 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
unsigned long vm_start, vm_end;
struct vm_area_struct *vma;
struct task_struct *task;
- struct dentry *result;
+ int result;
struct mm_struct *mm;
- result = ERR_PTR(-EPERM);
+ result = -EPERM;
if (!capable(CAP_SYS_ADMIN))
goto out;
- result = ERR_PTR(-ENOENT);
+ result = -ENOENT;
task = get_proc_task(dir);
if (!task)
goto out;
- result = ERR_PTR(-EACCES);
+ result = -EACCES;
if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out_put_task;
- result = ERR_PTR(-ENOENT);
+ result = -ENOENT;
if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
goto out_put_task;
@@ -1881,7 +1909,7 @@ out_no_vma:
out_put_task:
put_task_struct(task);
out:
- return result;
+ return ERR_PTR(result);
}
static const struct inode_operations proc_map_files_inode_operations = {
@@ -1891,14 +1919,15 @@ static const struct inode_operations proc_map_files_inode_operations = {
};
static int
-proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+proc_map_files_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
struct vm_area_struct *vma;
struct task_struct *task;
struct mm_struct *mm;
- ino_t ino;
+ unsigned long nr_files, pos, i;
+ struct flex_array *fa = NULL;
+ struct map_files_info info;
+ struct map_files_info *p;
int ret;
ret = -EPERM;
@@ -1906,7 +1935,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
goto out;
ret = -ENOENT;
- task = get_proc_task(inode);
+ task = get_proc_task(file_inode(file));
if (!task)
goto out;
@@ -1915,91 +1944,73 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
goto out_put_task;
ret = 0;
- switch (filp->f_pos) {
- case 0:
- ino = inode->i_ino;
- if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
- goto out_put_task;
- filp->f_pos++;
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
- goto out_put_task;
- filp->f_pos++;
- default:
- {
- unsigned long nr_files, pos, i;
- struct flex_array *fa = NULL;
- struct map_files_info info;
- struct map_files_info *p;
-
- mm = get_task_mm(task);
- if (!mm)
- goto out_put_task;
- down_read(&mm->mmap_sem);
+ if (!dir_emit_dots(file, ctx))
+ goto out_put_task;
- nr_files = 0;
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_put_task;
+ down_read(&mm->mmap_sem);
- /*
- * We need two passes here:
- *
- * 1) Collect vmas of mapped files with mmap_sem taken
- * 2) Release mmap_sem and instantiate entries
- *
- * otherwise we get lockdep complained, since filldir()
- * routine might require mmap_sem taken in might_fault().
- */
+ nr_files = 0;
- for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
- if (vma->vm_file && ++pos > filp->f_pos)
- nr_files++;
- }
+ /*
+ * We need two passes here:
+ *
+ * 1) Collect vmas of mapped files with mmap_sem taken
+ * 2) Release mmap_sem and instantiate entries
+ *
+ * otherwise we get lockdep complained, since filldir()
+ * routine might require mmap_sem taken in might_fault().
+ */
- if (nr_files) {
- fa = flex_array_alloc(sizeof(info), nr_files,
- GFP_KERNEL);
- if (!fa || flex_array_prealloc(fa, 0, nr_files,
- GFP_KERNEL)) {
- ret = -ENOMEM;
- if (fa)
- flex_array_free(fa);
- up_read(&mm->mmap_sem);
- mmput(mm);
- goto out_put_task;
- }
- for (i = 0, vma = mm->mmap, pos = 2; vma;
- vma = vma->vm_next) {
- if (!vma->vm_file)
- continue;
- if (++pos <= filp->f_pos)
- continue;
-
- info.mode = vma->vm_file->f_mode;
- info.len = snprintf(info.name,
- sizeof(info.name), "%lx-%lx",
- vma->vm_start, vma->vm_end);
- if (flex_array_put(fa, i++, &info, GFP_KERNEL))
- BUG();
- }
+ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+ if (vma->vm_file && ++pos > ctx->pos)
+ nr_files++;
+ }
+
+ if (nr_files) {
+ fa = flex_array_alloc(sizeof(info), nr_files,
+ GFP_KERNEL);
+ if (!fa || flex_array_prealloc(fa, 0, nr_files,
+ GFP_KERNEL)) {
+ ret = -ENOMEM;
+ if (fa)
+ flex_array_free(fa);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ goto out_put_task;
}
- up_read(&mm->mmap_sem);
-
- for (i = 0; i < nr_files; i++) {
- p = flex_array_get(fa, i);
- ret = proc_fill_cache(filp, dirent, filldir,
- p->name, p->len,
- proc_map_files_instantiate,
- task,
- (void *)(unsigned long)p->mode);
- if (ret)
- break;
- filp->f_pos++;
+ for (i = 0, vma = mm->mmap, pos = 2; vma;
+ vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (++pos <= ctx->pos)
+ continue;
+
+ info.mode = vma->vm_file->f_mode;
+ info.len = snprintf(info.name,
+ sizeof(info.name), "%lx-%lx",
+ vma->vm_start, vma->vm_end);
+ if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+ BUG();
}
- if (fa)
- flex_array_free(fa);
- mmput(mm);
}
+ up_read(&mm->mmap_sem);
+
+ for (i = 0; i < nr_files; i++) {
+ p = flex_array_get(fa, i);
+ if (!proc_fill_cache(file, ctx,
+ p->name, p->len,
+ proc_map_files_instantiate,
+ task,
+ (void *)(unsigned long)p->mode))
+ break;
+ ctx->pos++;
}
+ if (fa)
+ flex_array_free(fa);
+ mmput(mm);
out_put_task:
put_task_struct(task);
@@ -2009,19 +2020,115 @@ out:
static const struct file_operations proc_map_files_operations = {
.read = generic_read_dir,
- .readdir = proc_map_files_readdir,
+ .iterate = proc_map_files_readdir,
.llseek = default_llseek,
};
+struct timers_private {
+ struct pid *pid;
+ struct task_struct *task;
+ struct sighand_struct *sighand;
+ struct pid_namespace *ns;
+ unsigned long flags;
+};
+
+static void *timers_start(struct seq_file *m, loff_t *pos)
+{
+ struct timers_private *tp = m->private;
+
+ tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
+ if (!tp->task)
+ return ERR_PTR(-ESRCH);
+
+ tp->sighand = lock_task_sighand(tp->task, &tp->flags);
+ if (!tp->sighand)
+ return ERR_PTR(-ESRCH);
+
+ return seq_list_start(&tp->task->signal->posix_timers, *pos);
+}
+
+static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct timers_private *tp = m->private;
+ return seq_list_next(v, &tp->task->signal->posix_timers, pos);
+}
+
+static void timers_stop(struct seq_file *m, void *v)
+{
+ struct timers_private *tp = m->private;
+
+ if (tp->sighand) {
+ unlock_task_sighand(tp->task, &tp->flags);
+ tp->sighand = NULL;
+ }
+
+ if (tp->task) {
+ put_task_struct(tp->task);
+ tp->task = NULL;
+ }
+}
+
+static int show_timer(struct seq_file *m, void *v)
+{
+ struct k_itimer *timer;
+ struct timers_private *tp = m->private;
+ int notify;
+ static char *nstr[] = {
+ [SIGEV_SIGNAL] = "signal",
+ [SIGEV_NONE] = "none",
+ [SIGEV_THREAD] = "thread",
+ };
+
+ timer = list_entry((struct list_head *)v, struct k_itimer, list);
+ notify = timer->it_sigev_notify;
+
+ seq_printf(m, "ID: %d\n", timer->it_id);
+ seq_printf(m, "signal: %d/%p\n", timer->sigq->info.si_signo,
+ timer->sigq->info.si_value.sival_ptr);
+ seq_printf(m, "notify: %s/%s.%d\n",
+ nstr[notify & ~SIGEV_THREAD_ID],
+ (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
+ pid_nr_ns(timer->it_pid, tp->ns));
+ seq_printf(m, "ClockID: %d\n", timer->it_clock);
+
+ return 0;
+}
+
+static const struct seq_operations proc_timers_seq_ops = {
+ .start = timers_start,
+ .next = timers_next,
+ .stop = timers_stop,
+ .show = show_timer,
+};
+
+static int proc_timers_open(struct inode *inode, struct file *file)
+{
+ struct timers_private *tp;
+
+ tp = __seq_open_private(file, &proc_timers_seq_ops,
+ sizeof(struct timers_private));
+ if (!tp)
+ return -ENOMEM;
+
+ tp->pid = proc_pid(inode);
+ tp->ns = inode->i_sb->s_fs_info;
+ return 0;
+}
+
+static const struct file_operations proc_timers_operations = {
+ .open = proc_timers_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
#endif /* CONFIG_CHECKPOINT_RESTORE */
-static struct dentry *proc_pident_instantiate(struct inode *dir,
+static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
{
const struct pid_entry *p = ptr;
struct inode *inode;
struct proc_inode *ei;
- struct dentry *error = ERR_PTR(-ENOENT);
inode = proc_pid_make_inode(dir->i_sb, task);
if (!inode)
@@ -2040,9 +2147,9 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */
if (pid_revalidate(dentry, 0))
- error = NULL;
+ return 0;
out:
- return error;
+ return -ENOENT;
}
static struct dentry *proc_pident_lookup(struct inode *dir,
@@ -2050,11 +2157,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
const struct pid_entry *ents,
unsigned int nents)
{
- struct dentry *error;
+ int error;
struct task_struct *task = get_proc_task(dir);
const struct pid_entry *p, *last;
- error = ERR_PTR(-ENOENT);
+ error = -ENOENT;
if (!task)
goto out_no_task;
@@ -2077,70 +2184,33 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
out:
put_task_struct(task);
out_no_task:
- return error;
-}
-
-static int proc_pident_fill_cache(struct file *filp, void *dirent,
- filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
-{
- return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
- proc_pident_instantiate, task, p);
+ return ERR_PTR(error);
}
-static int proc_pident_readdir(struct file *filp,
- void *dirent, filldir_t filldir,
+static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
const struct pid_entry *ents, unsigned int nents)
{
- int i;
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct task_struct *task = get_proc_task(inode);
- const struct pid_entry *p, *last;
- ino_t ino;
- int ret;
+ struct task_struct *task = get_proc_task(file_inode(file));
+ const struct pid_entry *p;
- ret = -ENOENT;
if (!task)
- goto out_no_task;
+ return -ENOENT;
- ret = 0;
- i = filp->f_pos;
- switch (i) {
- case 0:
- ino = inode->i_ino;
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- default:
- i -= 2;
- if (i >= nents) {
- ret = 1;
- goto out;
- }
- p = ents + i;
- last = &ents[nents - 1];
- while (p <= last) {
- if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
- goto out;
- filp->f_pos++;
- p++;
- }
- }
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+
+ if (ctx->pos >= nents + 2)
+ goto out;
- ret = 1;
+ for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
+ if (!proc_fill_cache(file, ctx, p->name, p->len,
+ proc_pident_instantiate, task, p))
+ break;
+ ctx->pos++;
+ }
out:
put_task_struct(task);
-out_no_task:
- return ret;
+ return 0;
}
#ifdef CONFIG_SECURITY
@@ -2225,16 +2295,15 @@ static const struct pid_entry attr_dir_stuff[] = {
REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
};
-static int proc_attr_dir_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
{
- return proc_pident_readdir(filp,dirent,filldir,
- attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff));
+ return proc_pident_readdir(file, ctx,
+ attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
}
static const struct file_operations proc_attr_dir_operations = {
.read = generic_read_dir,
- .readdir = proc_attr_dir_readdir,
+ .iterate = proc_attr_dir_readdir,
.llseek = default_llseek,
};
@@ -2583,18 +2652,20 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
#endif
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ REG("timers", S_IRUGO, proc_timers_operations),
+#endif
};
-static int proc_tgid_base_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
{
- return proc_pident_readdir(filp,dirent,filldir,
- tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff));
+ return proc_pident_readdir(file, ctx,
+ tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
}
static const struct file_operations proc_tgid_base_operations = {
.read = generic_read_dir,
- .readdir = proc_tgid_base_readdir,
+ .iterate = proc_tgid_base_readdir,
.llseek = default_llseek,
};
@@ -2696,11 +2767,10 @@ void proc_flush_task(struct task_struct *task)
}
}
-static struct dentry *proc_pid_instantiate(struct inode *dir,
- struct dentry * dentry,
- struct task_struct *task, const void *ptr)
+static int proc_pid_instantiate(struct inode *dir,
+ struct dentry * dentry,
+ struct task_struct *task, const void *ptr)
{
- struct dentry *error = ERR_PTR(-ENOENT);
struct inode *inode;
inode = proc_pid_make_inode(dir->i_sb, task);
@@ -2720,14 +2790,14 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */
if (pid_revalidate(dentry, 0))
- error = NULL;
+ return 0;
out:
- return error;
+ return -ENOENT;
}
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
- struct dentry *result = NULL;
+ int result = 0;
struct task_struct *task;
unsigned tgid;
struct pid_namespace *ns;
@@ -2748,7 +2818,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign
result = proc_pid_instantiate(dir, dentry, task, NULL);
put_task_struct(task);
out:
- return result;
+ return ERR_PTR(result);
}
/*
@@ -2794,52 +2864,44 @@ retry:
return iter;
}
-#define TGID_OFFSET (FIRST_PROCESS_ENTRY)
-
-static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
- struct tgid_iter iter)
-{
- char name[PROC_NUMBUF];
- int len = snprintf(name, sizeof(name), "%d", iter.tgid);
- return proc_fill_cache(filp, dirent, filldir, name, len,
- proc_pid_instantiate, iter.task, NULL);
-}
-
-static int fake_filldir(void *buf, const char *name, int namelen,
- loff_t offset, u64 ino, unsigned d_type)
-{
- return 0;
-}
+#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1)
/* for the /proc/ directory itself, after non-process stuff has been done */
-int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
struct tgid_iter iter;
- struct pid_namespace *ns;
- filldir_t __filldir;
+ struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info;
+ loff_t pos = ctx->pos;
- if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
- goto out;
+ if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
+ return 0;
- ns = filp->f_dentry->d_sb->s_fs_info;
+ if (pos == TGID_OFFSET - 1) {
+ struct inode *inode = ns->proc_self->d_inode;
+ if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
+ return 0;
+ iter.tgid = 0;
+ } else {
+ iter.tgid = pos - TGID_OFFSET;
+ }
iter.task = NULL;
- iter.tgid = filp->f_pos - TGID_OFFSET;
for (iter = next_tgid(ns, iter);
iter.task;
iter.tgid += 1, iter = next_tgid(ns, iter)) {
- if (has_pid_permissions(ns, iter.task, 2))
- __filldir = filldir;
- else
- __filldir = fake_filldir;
+ char name[PROC_NUMBUF];
+ int len;
+ if (!has_pid_permissions(ns, iter.task, 2))
+ continue;
- filp->f_pos = iter.tgid + TGID_OFFSET;
- if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) {
+ len = snprintf(name, sizeof(name), "%d", iter.tgid);
+ ctx->pos = iter.tgid + TGID_OFFSET;
+ if (!proc_fill_cache(file, ctx, name, len,
+ proc_pid_instantiate, iter.task, NULL)) {
put_task_struct(iter.task);
- goto out;
+ return 0;
}
}
- filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
-out:
+ ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
return 0;
}
@@ -2927,11 +2989,10 @@ static const struct pid_entry tid_base_stuff[] = {
#endif
};
-static int proc_tid_base_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
{
- return proc_pident_readdir(filp,dirent,filldir,
- tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
+ return proc_pident_readdir(file, ctx,
+ tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
}
static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -2942,7 +3003,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
static const struct file_operations proc_tid_base_operations = {
.read = generic_read_dir,
- .readdir = proc_tid_base_readdir,
+ .iterate = proc_tid_base_readdir,
.llseek = default_llseek,
};
@@ -2952,10 +3013,9 @@ static const struct inode_operations proc_tid_base_inode_operations = {
.setattr = proc_setattr,
};
-static struct dentry *proc_task_instantiate(struct inode *dir,
+static int proc_task_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
{
- struct dentry *error = ERR_PTR(-ENOENT);
struct inode *inode;
inode = proc_pid_make_inode(dir->i_sb, task);
@@ -2974,14 +3034,14 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */
if (pid_revalidate(dentry, 0))
- error = NULL;
+ return 0;
out:
- return error;
+ return -ENOENT;
}
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
- struct dentry *result = ERR_PTR(-ENOENT);
+ int result = -ENOENT;
struct task_struct *task;
struct task_struct *leader = get_proc_task(dir);
unsigned tid;
@@ -3011,7 +3071,7 @@ out_drop_task:
out:
put_task_struct(leader);
out_no_task:
- return result;
+ return ERR_PTR(result);
}
/*
@@ -3083,30 +3143,16 @@ static struct task_struct *next_tid(struct task_struct *start)
return pos;
}
-static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
- struct task_struct *task, int tid)
-{
- char name[PROC_NUMBUF];
- int len = snprintf(name, sizeof(name), "%d", tid);
- return proc_fill_cache(filp, dirent, filldir, name, len,
- proc_task_instantiate, task, NULL);
-}
-
/* for the /proc/TGID/task/ directories */
-static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int proc_task_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
struct task_struct *leader = NULL;
- struct task_struct *task;
- int retval = -ENOENT;
- ino_t ino;
- int tid;
+ struct task_struct *task = get_proc_task(file_inode(file));
struct pid_namespace *ns;
+ int tid;
- task = get_proc_task(inode);
if (!task)
- goto out_no_task;
+ return -ENOENT;
rcu_read_lock();
if (pid_alive(task)) {
leader = task->group_leader;
@@ -3115,46 +3161,36 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
rcu_read_unlock();
put_task_struct(task);
if (!leader)
- goto out_no_task;
- retval = 0;
+ return -ENOENT;
- switch ((unsigned long)filp->f_pos) {
- case 0:
- ino = inode->i_ino;
- if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- /* fall through */
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- /* fall through */
- }
+ if (!dir_emit_dots(file, ctx))
+ goto out;
/* f_version caches the tgid value that the last readdir call couldn't
* return. lseek aka telldir automagically resets f_version to 0.
*/
- ns = filp->f_dentry->d_sb->s_fs_info;
- tid = (int)filp->f_version;
- filp->f_version = 0;
- for (task = first_tid(leader, tid, filp->f_pos - 2, ns);
+ ns = file->f_dentry->d_sb->s_fs_info;
+ tid = (int)file->f_version;
+ file->f_version = 0;
+ for (task = first_tid(leader, tid, ctx->pos - 2, ns);
task;
- task = next_tid(task), filp->f_pos++) {
+ task = next_tid(task), ctx->pos++) {
+ char name[PROC_NUMBUF];
+ int len;
tid = task_pid_nr_ns(task, ns);
- if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
+ len = snprintf(name, sizeof(name), "%d", tid);
+ if (!proc_fill_cache(file, ctx, name, len,
+ proc_task_instantiate, task, NULL)) {
/* returning this tgid failed, save it as the first
* pid for the next readir call */
- filp->f_version = (u64)tid;
+ file->f_version = (u64)tid;
put_task_struct(task);
break;
}
}
out:
put_task_struct(leader);
-out_no_task:
- return retval;
+ return 0;
}
static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -3180,6 +3216,6 @@ static const struct inode_operations proc_task_inode_operations = {
static const struct file_operations proc_task_operations = {
.read = generic_read_dir,
- .readdir = proc_task_readdir,
+ .iterate = proc_task_readdir,
.llseek = default_llseek,
};
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index d7a4a28ef630..75f2890abbd8 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -167,11 +167,10 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
return ret;
}
-static struct dentry *
+static int
proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
struct task_struct *task, const void *ptr)
{
- struct dentry *error = ERR_PTR(-ENOENT);
unsigned fd = (unsigned long)ptr;
struct proc_inode *ei;
struct inode *inode;
@@ -194,9 +193,9 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
/* Close the race of the process dying before we return the dentry */
if (tid_fd_revalidate(dentry, 0))
- error = NULL;
+ return 0;
out:
- return error;
+ return -ENOENT;
}
static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -204,7 +203,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
instantiate_t instantiate)
{
struct task_struct *task = get_proc_task(dir);
- struct dentry *result = ERR_PTR(-ENOENT);
+ int result = -ENOENT;
unsigned fd = name_to_int(dentry);
if (!task)
@@ -216,77 +215,61 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
out:
put_task_struct(task);
out_no_task:
- return result;
+ return ERR_PTR(result);
}
-static int proc_readfd_common(struct file * filp, void * dirent,
- filldir_t filldir, instantiate_t instantiate)
+static int proc_readfd_common(struct file *file, struct dir_context *ctx,
+ instantiate_t instantiate)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct task_struct *p = get_proc_task(inode);
+ struct task_struct *p = get_proc_task(file_inode(file));
struct files_struct *files;
- unsigned int fd, ino;
- int retval;
+ unsigned int fd;
- retval = -ENOENT;
if (!p)
- goto out_no_task;
- retval = 0;
-
- fd = filp->f_pos;
- switch (fd) {
- case 0:
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- default:
- files = get_files_struct(p);
- if (!files)
- goto out;
- rcu_read_lock();
- for (fd = filp->f_pos - 2;
- fd < files_fdtable(files)->max_fds;
- fd++, filp->f_pos++) {
- char name[PROC_NUMBUF];
- int len;
- int rv;
-
- if (!fcheck_files(files, fd))
- continue;
- rcu_read_unlock();
+ return -ENOENT;
- len = snprintf(name, sizeof(name), "%d", fd);
- rv = proc_fill_cache(filp, dirent, filldir,
- name, len, instantiate, p,
- (void *)(unsigned long)fd);
- if (rv < 0)
- goto out_fd_loop;
- rcu_read_lock();
- }
- rcu_read_unlock();
-out_fd_loop:
- put_files_struct(files);
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+ files = get_files_struct(p);
+ if (!files)
+ goto out;
+
+ rcu_read_lock();
+ for (fd = ctx->pos - 2;
+ fd < files_fdtable(files)->max_fds;
+ fd++, ctx->pos++) {
+ char name[PROC_NUMBUF];
+ int len;
+
+ if (!fcheck_files(files, fd))
+ continue;
+ rcu_read_unlock();
+
+ len = snprintf(name, sizeof(name), "%d", fd);
+ if (!proc_fill_cache(file, ctx,
+ name, len, instantiate, p,
+ (void *)(unsigned long)fd))
+ goto out_fd_loop;
+ rcu_read_lock();
}
+ rcu_read_unlock();
+out_fd_loop:
+ put_files_struct(files);
out:
put_task_struct(p);
-out_no_task:
- return retval;
+ return 0;
}
-static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
+static int proc_readfd(struct file *file, struct dir_context *ctx)
{
- return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
+ return proc_readfd_common(file, ctx, proc_fd_instantiate);
}
const struct file_operations proc_fd_operations = {
.read = generic_read_dir,
- .readdir = proc_readfd,
+ .iterate = proc_readfd,
.llseek = default_llseek,
};
@@ -316,11 +299,10 @@ const struct inode_operations proc_fd_inode_operations = {
.setattr = proc_setattr,
};
-static struct dentry *
+static int
proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
struct task_struct *task, const void *ptr)
{
- struct dentry *error = ERR_PTR(-ENOENT);
unsigned fd = (unsigned long)ptr;
struct proc_inode *ei;
struct inode *inode;
@@ -340,9 +322,9 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
/* Close the race of the process dying before we return the dentry */
if (tid_fd_revalidate(dentry, 0))
- error = NULL;
+ return 0;
out:
- return error;
+ return -ENOENT;
}
static struct dentry *
@@ -351,9 +333,9 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
}
-static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
+static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
{
- return proc_readfd_common(filp, dirent, filldir,
+ return proc_readfd_common(file, ctx,
proc_fdinfo_instantiate);
}
@@ -364,6 +346,6 @@ const struct inode_operations proc_fdinfo_inode_operations = {
const struct file_operations proc_fdinfo_operations = {
.read = generic_read_dir,
- .readdir = proc_readfdinfo,
+ .iterate = proc_readfdinfo,
.llseek = default_llseek,
};
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
index cbb1d47deda8..7c047f256ae2 100644
--- a/fs/proc/fd.h
+++ b/fs/proc/fd.h
@@ -11,4 +11,9 @@ extern const struct inode_operations proc_fdinfo_inode_operations;
extern int proc_fd_permission(struct inode *inode, int mask);
+static inline int proc_fd(struct inode *inode)
+{
+ return PROC_I(inode)->fd;
+}
+
#endif /* __PROCFS_FD_H__ */
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 21e1a8f1659d..94441a407337 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -36,212 +36,6 @@ static int proc_match(unsigned int len, const char *name, struct proc_dir_entry
return !memcmp(name, de->name, len);
}
-/* buffer size is one page but our output routines use some slack for overruns */
-#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024)
-
-static ssize_t
-__proc_file_read(struct file *file, char __user *buf, size_t nbytes,
- loff_t *ppos)
-{
- struct inode * inode = file_inode(file);
- char *page;
- ssize_t retval=0;
- int eof=0;
- ssize_t n, count;
- char *start;
- struct proc_dir_entry * dp;
- unsigned long long pos;
-
- /*
- * Gaah, please just use "seq_file" instead. The legacy /proc
- * interfaces cut loff_t down to off_t for reads, and ignore
- * the offset entirely for writes..
- */
- pos = *ppos;
- if (pos > MAX_NON_LFS)
- return 0;
- if (nbytes > MAX_NON_LFS - pos)
- nbytes = MAX_NON_LFS - pos;
-
- dp = PDE(inode);
- if (!(page = (char*) __get_free_page(GFP_TEMPORARY)))
- return -ENOMEM;
-
- while ((nbytes > 0) && !eof) {
- count = min_t(size_t, PROC_BLOCK_SIZE, nbytes);
-
- start = NULL;
- if (dp->read_proc) {
- /*
- * How to be a proc read function
- * ------------------------------
- * Prototype:
- * int f(char *buffer, char **start, off_t offset,
- * int count, int *peof, void *dat)
- *
- * Assume that the buffer is "count" bytes in size.
- *
- * If you know you have supplied all the data you
- * have, set *peof.
- *
- * You have three ways to return data:
- * 0) Leave *start = NULL. (This is the default.)
- * Put the data of the requested offset at that
- * offset within the buffer. Return the number (n)
- * of bytes there are from the beginning of the
- * buffer up to the last byte of data. If the
- * number of supplied bytes (= n - offset) is
- * greater than zero and you didn't signal eof
- * and the reader is prepared to take more data
- * you will be called again with the requested
- * offset advanced by the number of bytes
- * absorbed. This interface is useful for files
- * no larger than the buffer.
- * 1) Set *start = an unsigned long value less than
- * the buffer address but greater than zero.
- * Put the data of the requested offset at the
- * beginning of the buffer. Return the number of
- * bytes of data placed there. If this number is
- * greater than zero and you didn't signal eof
- * and the reader is prepared to take more data
- * you will be called again with the requested
- * offset advanced by *start. This interface is
- * useful when you have a large file consisting
- * of a series of blocks which you want to count
- * and return as wholes.
- * (Hack by Paul.Russell@rustcorp.com.au)
- * 2) Set *start = an address within the buffer.
- * Put the data of the requested offset at *start.
- * Return the number of bytes of data placed there.
- * If this number is greater than zero and you
- * didn't signal eof and the reader is prepared to
- * take more data you will be called again with the
- * requested offset advanced by the number of bytes
- * absorbed.
- */
- n = dp->read_proc(page, &start, *ppos,
- count, &eof, dp->data);
- } else
- break;
-
- if (n == 0) /* end of file */
- break;
- if (n < 0) { /* error */
- if (retval == 0)
- retval = n;
- break;
- }
-
- if (start == NULL) {
- if (n > PAGE_SIZE) /* Apparent buffer overflow */
- n = PAGE_SIZE;
- n -= *ppos;
- if (n <= 0)
- break;
- if (n > count)
- n = count;
- start = page + *ppos;
- } else if (start < page) {
- if (n > PAGE_SIZE) /* Apparent buffer overflow */
- n = PAGE_SIZE;
- if (n > count) {
- /*
- * Don't reduce n because doing so might
- * cut off part of a data block.
- */
- pr_warn("proc_file_read: count exceeded\n");
- }
- } else /* start >= page */ {
- unsigned long startoff = (unsigned long)(start - page);
- if (n > (PAGE_SIZE - startoff)) /* buffer overflow? */
- n = PAGE_SIZE - startoff;
- if (n > count)
- n = count;
- }
-
- n -= copy_to_user(buf, start < page ? page : start, n);
- if (n == 0) {
- if (retval == 0)
- retval = -EFAULT;
- break;
- }
-
- *ppos += start < page ? (unsigned long)start : n;
- nbytes -= n;
- buf += n;
- retval += n;
- }
- free_page((unsigned long) page);
- return retval;
-}
-
-static ssize_t
-proc_file_read(struct file *file, char __user *buf, size_t nbytes,
- loff_t *ppos)
-{
- struct proc_dir_entry *pde = PDE(file_inode(file));
- ssize_t rv = -EIO;
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
- }
- pde->pde_users++;
- spin_unlock(&pde->pde_unload_lock);
-
- rv = __proc_file_read(file, buf, nbytes, ppos);
-
- pde_users_dec(pde);
- return rv;
-}
-
-static ssize_t
-proc_file_write(struct file *file, const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- struct proc_dir_entry *pde = PDE(file_inode(file));
- ssize_t rv = -EIO;
-
- if (pde->write_proc) {
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
- }
- pde->pde_users++;
- spin_unlock(&pde->pde_unload_lock);
-
- /* FIXME: does this routine need ppos? probably... */
- rv = pde->write_proc(file, buffer, count, pde->data);
- pde_users_dec(pde);
- }
- return rv;
-}
-
-
-static loff_t
-proc_file_lseek(struct file *file, loff_t offset, int orig)
-{
- loff_t retval = -EINVAL;
- switch (orig) {
- case 1:
- offset += file->f_pos;
- /* fallthrough */
- case 0:
- if (offset < 0 || offset > MAX_NON_LFS)
- break;
- file->f_pos = retval = offset;
- }
- return retval;
-}
-
-static const struct file_operations proc_file_operations = {
- .llseek = proc_file_lseek,
- .read = proc_file_read,
- .write = proc_file_write,
-};
-
static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = dentry->d_inode;
@@ -371,7 +165,7 @@ void proc_free_inum(unsigned int inum)
static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
{
- nd_set_link(nd, PDE(dentry->d_inode)->data);
+ nd_set_link(nd, __PDE_DATA(dentry->d_inode));
return NULL;
}
@@ -439,76 +233,52 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
* value of the readdir() call, as long as it's non-negative
* for success..
*/
-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
- filldir_t filldir)
+int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
+ struct dir_context *ctx)
{
- unsigned int ino;
int i;
- struct inode *inode = file_inode(filp);
- int ret = 0;
-
- ino = inode->i_ino;
- i = filp->f_pos;
- switch (i) {
- case 0:
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- case 1:
- if (filldir(dirent, "..", 2, i,
- parent_ino(filp->f_path.dentry),
- DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- default:
- spin_lock(&proc_subdir_lock);
- de = de->subdir;
- i -= 2;
- for (;;) {
- if (!de) {
- ret = 1;
- spin_unlock(&proc_subdir_lock);
- goto out;
- }
- if (!i)
- break;
- de = de->next;
- i--;
- }
- do {
- struct proc_dir_entry *next;
-
- /* filldir passes info to user space */
- pde_get(de);
- spin_unlock(&proc_subdir_lock);
- if (filldir(dirent, de->name, de->namelen, filp->f_pos,
- de->low_ino, de->mode >> 12) < 0) {
- pde_put(de);
- goto out;
- }
- spin_lock(&proc_subdir_lock);
- filp->f_pos++;
- next = de->next;
- pde_put(de);
- de = next;
- } while (de);
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ spin_lock(&proc_subdir_lock);
+ de = de->subdir;
+ i = ctx->pos - 2;
+ for (;;) {
+ if (!de) {
spin_unlock(&proc_subdir_lock);
+ return 0;
+ }
+ if (!i)
+ break;
+ de = de->next;
+ i--;
}
- ret = 1;
-out:
- return ret;
+
+ do {
+ struct proc_dir_entry *next;
+ pde_get(de);
+ spin_unlock(&proc_subdir_lock);
+ if (!dir_emit(ctx, de->name, de->namelen,
+ de->low_ino, de->mode >> 12)) {
+ pde_put(de);
+ return 0;
+ }
+ spin_lock(&proc_subdir_lock);
+ ctx->pos++;
+ next = de->next;
+ pde_put(de);
+ de = next;
+ } while (de);
+ spin_unlock(&proc_subdir_lock);
+ return 0;
}
-int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
+int proc_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
- return proc_readdir_de(PDE(inode), filp, dirent, filldir);
+ return proc_readdir_de(PDE(inode), file, ctx);
}
/*
@@ -519,7 +289,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
static const struct file_operations proc_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = proc_readdir,
+ .iterate = proc_readdir,
};
/*
@@ -541,19 +311,17 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
return ret;
if (S_ISDIR(dp->mode)) {
- if (dp->proc_iops == NULL) {
- dp->proc_fops = &proc_dir_operations;
- dp->proc_iops = &proc_dir_inode_operations;
- }
+ dp->proc_fops = &proc_dir_operations;
+ dp->proc_iops = &proc_dir_inode_operations;
dir->nlink++;
} else if (S_ISLNK(dp->mode)) {
- if (dp->proc_iops == NULL)
- dp->proc_iops = &proc_link_inode_operations;
+ dp->proc_iops = &proc_link_inode_operations;
} else if (S_ISREG(dp->mode)) {
- if (dp->proc_fops == NULL)
- dp->proc_fops = &proc_file_operations;
- if (dp->proc_iops == NULL)
- dp->proc_iops = &proc_file_inode_operations;
+ BUG_ON(dp->proc_fops == NULL);
+ dp->proc_iops = &proc_file_inode_operations;
+ } else {
+ WARN_ON(1);
+ return -EINVAL;
}
spin_lock(&proc_subdir_lock);
@@ -636,13 +404,17 @@ struct proc_dir_entry *proc_symlink(const char *name,
}
EXPORT_SYMBOL(proc_symlink);
-struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
- struct proc_dir_entry *parent)
+struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, void *data)
{
struct proc_dir_entry *ent;
+ if (mode == 0)
+ mode = S_IRUGO | S_IXUGO;
+
ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
if (ent) {
+ ent->data = data;
if (proc_register(parent, ent) < 0) {
kfree(ent);
ent = NULL;
@@ -650,82 +422,39 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
}
return ent;
}
-EXPORT_SYMBOL(proc_mkdir_mode);
+EXPORT_SYMBOL_GPL(proc_mkdir_data);
-struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
- struct proc_dir_entry *parent)
+struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
+ struct proc_dir_entry *parent)
{
- struct proc_dir_entry *ent;
-
- ent = __proc_create(&parent, name, S_IFDIR | S_IRUGO | S_IXUGO, 2);
- if (ent) {
- ent->data = net;
- if (proc_register(parent, ent) < 0) {
- kfree(ent);
- ent = NULL;
- }
- }
- return ent;
+ return proc_mkdir_data(name, mode, parent, NULL);
}
-EXPORT_SYMBOL_GPL(proc_net_mkdir);
+EXPORT_SYMBOL(proc_mkdir_mode);
struct proc_dir_entry *proc_mkdir(const char *name,
struct proc_dir_entry *parent)
{
- return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
+ return proc_mkdir_data(name, 0, parent, NULL);
}
EXPORT_SYMBOL(proc_mkdir);
-struct proc_dir_entry *create_proc_entry(const char *name, umode_t mode,
- struct proc_dir_entry *parent)
-{
- struct proc_dir_entry *ent;
- nlink_t nlink;
-
- if (S_ISDIR(mode)) {
- if ((mode & S_IALLUGO) == 0)
- mode |= S_IRUGO | S_IXUGO;
- nlink = 2;
- } else {
- if ((mode & S_IFMT) == 0)
- mode |= S_IFREG;
- if ((mode & S_IALLUGO) == 0)
- mode |= S_IRUGO;
- nlink = 1;
- }
-
- ent = __proc_create(&parent, name, mode, nlink);
- if (ent) {
- if (proc_register(parent, ent) < 0) {
- kfree(ent);
- ent = NULL;
- }
- }
- return ent;
-}
-EXPORT_SYMBOL(create_proc_entry);
-
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct file_operations *proc_fops,
void *data)
{
struct proc_dir_entry *pde;
- nlink_t nlink;
+ if ((mode & S_IFMT) == 0)
+ mode |= S_IFREG;
- if (S_ISDIR(mode)) {
- if ((mode & S_IALLUGO) == 0)
- mode |= S_IRUGO | S_IXUGO;
- nlink = 2;
- } else {
- if ((mode & S_IFMT) == 0)
- mode |= S_IFREG;
- if ((mode & S_IALLUGO) == 0)
- mode |= S_IRUGO;
- nlink = 1;
+ if (!S_ISREG(mode)) {
+ WARN_ON(1); /* use proc_mkdir() */
+ return NULL;
}
- pde = __proc_create(&parent, name, mode, nlink);
+ if ((mode & S_IALLUGO) == 0)
+ mode |= S_IRUGO;
+ pde = __proc_create(&parent, name, mode, 1);
if (!pde)
goto out;
pde->proc_fops = proc_fops;
@@ -739,6 +468,19 @@ out:
return NULL;
}
EXPORT_SYMBOL(proc_create_data);
+
+void proc_set_size(struct proc_dir_entry *de, loff_t size)
+{
+ de->size = size;
+}
+EXPORT_SYMBOL(proc_set_size);
+
+void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
+{
+ de->uid = uid;
+ de->gid = gid;
+}
+EXPORT_SYMBOL(proc_set_user);
static void free_proc_entry(struct proc_dir_entry *de)
{
@@ -755,41 +497,6 @@ void pde_put(struct proc_dir_entry *pde)
free_proc_entry(pde);
}
-static void entry_rundown(struct proc_dir_entry *de)
-{
- spin_lock(&de->pde_unload_lock);
- /*
- * Stop accepting new callers into module. If you're
- * dynamically allocating ->proc_fops, save a pointer somewhere.
- */
- de->proc_fops = NULL;
- /* Wait until all existing callers into module are done. */
- if (de->pde_users > 0) {
- DECLARE_COMPLETION_ONSTACK(c);
-
- if (!de->pde_unload_completion)
- de->pde_unload_completion = &c;
-
- spin_unlock(&de->pde_unload_lock);
-
- wait_for_completion(de->pde_unload_completion);
-
- spin_lock(&de->pde_unload_lock);
- }
-
- while (!list_empty(&de->pde_openers)) {
- struct pde_opener *pdeo;
-
- pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
- list_del(&pdeo->lh);
- spin_unlock(&de->pde_unload_lock);
- pdeo->release(pdeo->inode, pdeo->file);
- kfree(pdeo);
- spin_lock(&de->pde_unload_lock);
- }
- spin_unlock(&de->pde_unload_lock);
-}
-
/*
* Remove a /proc entry and free it if it's not currently in use.
*/
@@ -821,7 +528,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
return;
}
- entry_rundown(de);
+ proc_entry_rundown(de);
if (S_ISDIR(de->mode))
parent->nlink--;
@@ -870,7 +577,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
}
spin_unlock(&proc_subdir_lock);
- entry_rundown(de);
+ proc_entry_rundown(de);
next = de->parent;
if (S_ISDIR(de->mode))
next->nlink--;
@@ -886,3 +593,23 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
return 0;
}
EXPORT_SYMBOL(remove_proc_subtree);
+
+void *proc_get_parent_data(const struct inode *inode)
+{
+ struct proc_dir_entry *de = PDE(inode);
+ return de->parent->data;
+}
+EXPORT_SYMBOL_GPL(proc_get_parent_data);
+
+void proc_remove(struct proc_dir_entry *de)
+{
+ if (de)
+ remove_proc_subtree(de->name, de->parent);
+}
+EXPORT_SYMBOL(proc_remove);
+
+void *PDE_DATA(const struct inode *inode)
+{
+ return __PDE_DATA(inode);
+}
+EXPORT_SYMBOL(PDE_DATA);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 869116c2afbe..073aea60cf8f 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -22,6 +22,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/mount.h>
+#include <linux/magic.h>
#include <asm/uaccess.h>
@@ -50,8 +51,8 @@ static void proc_evict_inode(struct inode *inode)
sysctl_head_put(head);
}
/* Release any associated namespace */
- ns_ops = PROC_I(inode)->ns_ops;
- ns = PROC_I(inode)->ns;
+ ns_ops = PROC_I(inode)->ns.ns_ops;
+ ns = PROC_I(inode)->ns.ns;
if (ns_ops && ns)
ns_ops->put(ns);
}
@@ -72,8 +73,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->pde = NULL;
ei->sysctl = NULL;
ei->sysctl_entry = NULL;
- ei->ns = NULL;
- ei->ns_ops = NULL;
+ ei->ns.ns = NULL;
+ ei->ns.ns_ops = NULL;
inode = &ei->vfs_inode;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
return inode;
@@ -129,96 +130,100 @@ static const struct super_operations proc_sops = {
.show_options = proc_show_options,
};
-static void __pde_users_dec(struct proc_dir_entry *pde)
+enum {BIAS = -1U<<31};
+
+static inline int use_pde(struct proc_dir_entry *pde)
+{
+ return atomic_inc_unless_negative(&pde->in_use);
+}
+
+static void unuse_pde(struct proc_dir_entry *pde)
{
- pde->pde_users--;
- if (pde->pde_unload_completion && pde->pde_users == 0)
+ if (atomic_dec_return(&pde->in_use) == BIAS)
complete(pde->pde_unload_completion);
}
-void pde_users_dec(struct proc_dir_entry *pde)
+/* pde is locked */
+static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
{
- spin_lock(&pde->pde_unload_lock);
- __pde_users_dec(pde);
- spin_unlock(&pde->pde_unload_lock);
+ if (pdeo->closing) {
+ /* somebody else is doing that, just wait */
+ DECLARE_COMPLETION_ONSTACK(c);
+ pdeo->c = &c;
+ spin_unlock(&pde->pde_unload_lock);
+ wait_for_completion(&c);
+ spin_lock(&pde->pde_unload_lock);
+ } else {
+ struct file *file;
+ pdeo->closing = 1;
+ spin_unlock(&pde->pde_unload_lock);
+ file = pdeo->file;
+ pde->proc_fops->release(file_inode(file), file);
+ spin_lock(&pde->pde_unload_lock);
+ list_del_init(&pdeo->lh);
+ if (pdeo->c)
+ complete(pdeo->c);
+ kfree(pdeo);
+ }
+}
+
+void proc_entry_rundown(struct proc_dir_entry *de)
+{
+ DECLARE_COMPLETION_ONSTACK(c);
+ /* Wait until all existing callers into module are done. */
+ de->pde_unload_completion = &c;
+ if (atomic_add_return(BIAS, &de->in_use) != BIAS)
+ wait_for_completion(&c);
+
+ spin_lock(&de->pde_unload_lock);
+ while (!list_empty(&de->pde_openers)) {
+ struct pde_opener *pdeo;
+ pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
+ close_pdeo(de, pdeo);
+ }
+ spin_unlock(&de->pde_unload_lock);
}
static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
loff_t rv = -EINVAL;
- loff_t (*llseek)(struct file *, loff_t, int);
-
- spin_lock(&pde->pde_unload_lock);
- /*
- * remove_proc_entry() is going to delete PDE (as part of module
- * cleanup sequence). No new callers into module allowed.
- */
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
+ if (use_pde(pde)) {
+ loff_t (*llseek)(struct file *, loff_t, int);
+ llseek = pde->proc_fops->llseek;
+ if (!llseek)
+ llseek = default_llseek;
+ rv = llseek(file, offset, whence);
+ unuse_pde(pde);
}
- /*
- * Bump refcount so that remove_proc_entry will wail for ->llseek to
- * complete.
- */
- pde->pde_users++;
- /*
- * Save function pointer under lock, to protect against ->proc_fops
- * NULL'ifying right after ->pde_unload_lock is dropped.
- */
- llseek = pde->proc_fops->llseek;
- spin_unlock(&pde->pde_unload_lock);
-
- if (!llseek)
- llseek = default_llseek;
- rv = llseek(file, offset, whence);
-
- pde_users_dec(pde);
return rv;
}
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
+ ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
- ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
+ if (use_pde(pde)) {
+ read = pde->proc_fops->read;
+ if (read)
+ rv = read(file, buf, count, ppos);
+ unuse_pde(pde);
}
- pde->pde_users++;
- read = pde->proc_fops->read;
- spin_unlock(&pde->pde_unload_lock);
-
- if (read)
- rv = read(file, buf, count, ppos);
-
- pde_users_dec(pde);
return rv;
}
static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
+ ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
- ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
+ if (use_pde(pde)) {
+ write = pde->proc_fops->write;
+ if (write)
+ rv = write(file, buf, count, ppos);
+ unuse_pde(pde);
}
- pde->pde_users++;
- write = pde->proc_fops->write;
- spin_unlock(&pde->pde_unload_lock);
-
- if (write)
- rv = write(file, buf, count, ppos);
-
- pde_users_dec(pde);
return rv;
}
@@ -227,20 +232,12 @@ static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *p
struct proc_dir_entry *pde = PDE(file_inode(file));
unsigned int rv = DEFAULT_POLLMASK;
unsigned int (*poll)(struct file *, struct poll_table_struct *);
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
+ if (use_pde(pde)) {
+ poll = pde->proc_fops->poll;
+ if (poll)
+ rv = poll(file, pts);
+ unuse_pde(pde);
}
- pde->pde_users++;
- poll = pde->proc_fops->poll;
- spin_unlock(&pde->pde_unload_lock);
-
- if (poll)
- rv = poll(file, pts);
-
- pde_users_dec(pde);
return rv;
}
@@ -249,20 +246,12 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
long (*ioctl)(struct file *, unsigned int, unsigned long);
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
+ if (use_pde(pde)) {
+ ioctl = pde->proc_fops->unlocked_ioctl;
+ if (ioctl)
+ rv = ioctl(file, cmd, arg);
+ unuse_pde(pde);
}
- pde->pde_users++;
- ioctl = pde->proc_fops->unlocked_ioctl;
- spin_unlock(&pde->pde_unload_lock);
-
- if (ioctl)
- rv = ioctl(file, cmd, arg);
-
- pde_users_dec(pde);
return rv;
}
@@ -272,20 +261,12 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
+ if (use_pde(pde)) {
+ compat_ioctl = pde->proc_fops->compat_ioctl;
+ if (compat_ioctl)
+ rv = compat_ioctl(file, cmd, arg);
+ unuse_pde(pde);
}
- pde->pde_users++;
- compat_ioctl = pde->proc_fops->compat_ioctl;
- spin_unlock(&pde->pde_unload_lock);
-
- if (compat_ioctl)
- rv = compat_ioctl(file, cmd, arg);
-
- pde_users_dec(pde);
return rv;
}
#endif
@@ -295,20 +276,12 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
struct proc_dir_entry *pde = PDE(file_inode(file));
int rv = -EIO;
int (*mmap)(struct file *, struct vm_area_struct *);
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
+ if (use_pde(pde)) {
+ mmap = pde->proc_fops->mmap;
+ if (mmap)
+ rv = mmap(file, vma);
+ unuse_pde(pde);
}
- pde->pde_users++;
- mmap = pde->proc_fops->mmap;
- spin_unlock(&pde->pde_unload_lock);
-
- if (mmap)
- rv = mmap(file, vma);
-
- pde_users_dec(pde);
return rv;
}
@@ -330,91 +303,47 @@ static int proc_reg_open(struct inode *inode, struct file *file)
* by hand in remove_proc_entry(). For this, save opener's credentials
* for later.
*/
- pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL);
+ pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL);
if (!pdeo)
return -ENOMEM;
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
+ if (!use_pde(pde)) {
kfree(pdeo);
return -ENOENT;
}
- pde->pde_users++;
open = pde->proc_fops->open;
release = pde->proc_fops->release;
- spin_unlock(&pde->pde_unload_lock);
if (open)
rv = open(inode, file);
- spin_lock(&pde->pde_unload_lock);
if (rv == 0 && release) {
/* To know what to release. */
- pdeo->inode = inode;
pdeo->file = file;
/* Strictly for "too late" ->release in proc_reg_release(). */
- pdeo->release = release;
+ spin_lock(&pde->pde_unload_lock);
list_add(&pdeo->lh, &pde->pde_openers);
+ spin_unlock(&pde->pde_unload_lock);
} else
kfree(pdeo);
- __pde_users_dec(pde);
- spin_unlock(&pde->pde_unload_lock);
- return rv;
-}
-
-static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde,
- struct inode *inode, struct file *file)
-{
- struct pde_opener *pdeo;
- list_for_each_entry(pdeo, &pde->pde_openers, lh) {
- if (pdeo->inode == inode && pdeo->file == file)
- return pdeo;
- }
- return NULL;
+ unuse_pde(pde);
+ return rv;
}
static int proc_reg_release(struct inode *inode, struct file *file)
{
struct proc_dir_entry *pde = PDE(inode);
- int rv = 0;
- int (*release)(struct inode *, struct file *);
struct pde_opener *pdeo;
-
spin_lock(&pde->pde_unload_lock);
- pdeo = find_pde_opener(pde, inode, file);
- if (!pde->proc_fops) {
- /*
- * Can't simply exit, __fput() will think that everything is OK,
- * and move on to freeing struct file. remove_proc_entry() will
- * find slacker in opener's list and will try to do non-trivial
- * things with struct file. Therefore, remove opener from list.
- *
- * But if opener is removed from list, who will ->release it?
- */
- if (pdeo) {
- list_del(&pdeo->lh);
- spin_unlock(&pde->pde_unload_lock);
- rv = pdeo->release(inode, file);
- kfree(pdeo);
- } else
- spin_unlock(&pde->pde_unload_lock);
- return rv;
- }
- pde->pde_users++;
- release = pde->proc_fops->release;
- if (pdeo) {
- list_del(&pdeo->lh);
- kfree(pdeo);
+ list_for_each_entry(pdeo, &pde->pde_openers, lh) {
+ if (pdeo->file == file) {
+ close_pdeo(pde, pdeo);
+ break;
+ }
}
spin_unlock(&pde->pde_unload_lock);
-
- if (release)
- rv = release(inode, file);
-
- pde_users_dec(pde);
- return rv;
+ return 0;
}
static const struct file_operations proc_reg_file_ops = {
@@ -462,8 +391,8 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
inode->i_size = de->size;
if (de->nlink)
set_nlink(inode, de->nlink);
- if (de->proc_iops)
- inode->i_op = de->proc_iops;
+ WARN_ON(!de->proc_iops);
+ inode->i_op = de->proc_iops;
if (de->proc_fops) {
if (S_ISREG(inode->i_mode)) {
#ifdef CONFIG_COMPAT
@@ -506,5 +435,5 @@ int proc_fill_super(struct super_block *s)
return -ENOMEM;
}
- return 0;
+ return proc_setup_self(s);
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 85ff3a4598b3..651d09a11dde 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -1,4 +1,4 @@
-/* internal.h: internal procfs definitions
+/* Internal procfs definitions
*
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -9,80 +9,83 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/sched.h>
#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
#include <linux/binfmts.h>
-struct ctl_table_header;
-struct mempolicy;
-extern struct proc_dir_entry proc_root;
-extern void proc_self_init(void);
-#ifdef CONFIG_PROC_SYSCTL
-extern int proc_sys_init(void);
-extern void sysctl_head_put(struct ctl_table_header *head);
-#else
-static inline void proc_sys_init(void) { }
-static inline void sysctl_head_put(struct ctl_table_header *head) { }
-#endif
-#ifdef CONFIG_NET
-extern int proc_net_init(void);
-#else
-static inline int proc_net_init(void) { return 0; }
-#endif
+struct ctl_table_header;
+struct mempolicy;
-struct vmalloc_info {
- unsigned long used;
- unsigned long largest_chunk;
+/*
+ * This is not completely implemented yet. The idea is to
+ * create an in-memory tree (like the actual /proc filesystem
+ * tree) of these proc_dir_entries, so that we can dynamically
+ * add new files to /proc.
+ *
+ * The "next" pointer creates a linked list of one /proc directory,
+ * while parent/subdir create the directory structure (every
+ * /proc file has a parent, but "subdir" is NULL for all
+ * non-directory entries).
+ */
+struct proc_dir_entry {
+ unsigned int low_ino;
+ umode_t mode;
+ nlink_t nlink;
+ kuid_t uid;
+ kgid_t gid;
+ loff_t size;
+ const struct inode_operations *proc_iops;
+ const struct file_operations *proc_fops;
+ struct proc_dir_entry *next, *parent, *subdir;
+ void *data;
+ atomic_t count; /* use count */
+ atomic_t in_use; /* number of callers into module in progress; */
+ /* negative -> it's going away RSN */
+ struct completion *pde_unload_completion;
+ struct list_head pde_openers; /* who did ->open, but not ->release */
+ spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
+ u8 namelen;
+ char name[];
};
-#ifdef CONFIG_MMU
-#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
-extern void get_vmalloc_info(struct vmalloc_info *vmi);
-#else
-
-#define VMALLOC_TOTAL 0UL
-#define get_vmalloc_info(vmi) \
-do { \
- (vmi)->used = 0; \
- (vmi)->largest_chunk = 0; \
-} while(0)
-#endif
-
-extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *task);
-extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *task);
-extern int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *task);
-extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *task);
-extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
-
-extern const struct file_operations proc_tid_children_operations;
-extern const struct file_operations proc_pid_maps_operations;
-extern const struct file_operations proc_tid_maps_operations;
-extern const struct file_operations proc_pid_numa_maps_operations;
-extern const struct file_operations proc_tid_numa_maps_operations;
-extern const struct file_operations proc_pid_smaps_operations;
-extern const struct file_operations proc_tid_smaps_operations;
-extern const struct file_operations proc_clear_refs_operations;
-extern const struct file_operations proc_pagemap_operations;
-extern const struct file_operations proc_net_operations;
-extern const struct inode_operations proc_net_inode_operations;
-extern const struct inode_operations proc_pid_link_inode_operations;
+union proc_op {
+ int (*proc_get_link)(struct dentry *, struct path *);
+ int (*proc_read)(struct task_struct *task, char *page);
+ int (*proc_show)(struct seq_file *m,
+ struct pid_namespace *ns, struct pid *pid,
+ struct task_struct *task);
+};
-struct proc_maps_private {
+struct proc_inode {
struct pid *pid;
- struct task_struct *task;
-#ifdef CONFIG_MMU
- struct vm_area_struct *tail_vma;
-#endif
-#ifdef CONFIG_NUMA
- struct mempolicy *task_mempolicy;
-#endif
+ int fd;
+ union proc_op op;
+ struct proc_dir_entry *pde;
+ struct ctl_table_header *sysctl;
+ struct ctl_table *sysctl_entry;
+ struct proc_ns ns;
+ struct inode vfs_inode;
};
-void proc_init_inodecache(void);
+/*
+ * General functions
+ */
+static inline struct proc_inode *PROC_I(const struct inode *inode)
+{
+ return container_of(inode, struct proc_inode, vfs_inode);
+}
+
+static inline struct proc_dir_entry *PDE(const struct inode *inode)
+{
+ return PROC_I(inode)->pde;
+}
+
+static inline void *__PDE_DATA(const struct inode *inode)
+{
+ return PDE(inode)->data;
+}
static inline struct pid *proc_pid(struct inode *inode)
{
@@ -94,11 +97,6 @@ static inline struct task_struct *get_proc_task(struct inode *inode)
return get_pid_task(proc_pid(inode), PIDTYPE_PID);
}
-static inline int proc_fd(struct inode *inode)
-{
- return PROC_I(inode)->fd;
-}
-
static inline int task_dumpable(struct task_struct *task)
{
int dumpable = 0;
@@ -114,15 +112,6 @@ static inline int task_dumpable(struct task_struct *task)
return 0;
}
-static inline int pid_delete_dentry(const struct dentry * dentry)
-{
- /* Is the task we represent dead?
- * If so, then don't put the dentry on the lru list,
- * kill it immediately.
- */
- return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
-}
-
static inline unsigned name_to_int(struct dentry *dentry)
{
const char *name = dentry->d_name.name;
@@ -145,63 +134,165 @@ out:
return ~0U;
}
-struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
- struct dentry *dentry);
-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
- filldir_t filldir);
+/*
+ * Offset of the first process in the /proc root directory..
+ */
+#define FIRST_PROCESS_ENTRY 256
-struct pde_opener {
- struct inode *inode;
- struct file *file;
- int (*release)(struct inode *, struct file *);
- struct list_head lh;
-};
-void pde_users_dec(struct proc_dir_entry *pde);
+/* Worst case buffer size needed for holding an integer. */
+#define PROC_NUMBUF 13
+
+/*
+ * array.c
+ */
+extern const struct file_operations proc_tid_children_operations;
+
+extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
+ struct pid *, struct task_struct *);
+extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
+ struct pid *, struct task_struct *);
+extern int proc_pid_status(struct seq_file *, struct pid_namespace *,
+ struct pid *, struct task_struct *);
+extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
+ struct pid *, struct task_struct *);
+
+/*
+ * base.c
+ */
+extern const struct dentry_operations pid_dentry_operations;
+extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int proc_setattr(struct dentry *, struct iattr *);
+extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
+extern int pid_revalidate(struct dentry *, unsigned int);
+extern int pid_delete_dentry(const struct dentry *);
+extern int proc_pid_readdir(struct file *, struct dir_context *);
+extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
+extern loff_t mem_lseek(struct file *, loff_t, int);
+/* Lookups */
+typedef int instantiate_t(struct inode *, struct dentry *,
+ struct task_struct *, const void *);
+extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
+ instantiate_t, struct task_struct *, const void *);
+
+/*
+ * generic.c
+ */
extern spinlock_t proc_subdir_lock;
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int);
-int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
-unsigned long task_vsize(struct mm_struct *);
-unsigned long task_statm(struct mm_struct *,
- unsigned long *, unsigned long *, unsigned long *, unsigned long *);
-void task_mem(struct seq_file *, struct mm_struct *);
+extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
+extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
+ struct dentry *);
+extern int proc_readdir(struct file *, struct dir_context *);
+extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);
static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
{
atomic_inc(&pde->count);
return pde;
}
-void pde_put(struct proc_dir_entry *pde);
-
-int proc_fill_super(struct super_block *);
-struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
-int proc_remount(struct super_block *sb, int *flags, char *data);
+extern void pde_put(struct proc_dir_entry *);
/*
- * These are generic /proc routines that use the internal
- * "struct proc_dir_entry" tree to traverse the filesystem.
- *
- * The /proc root directory has extended versions to take care
- * of the /proc/<pid> subdirectories.
+ * inode.c
*/
-int proc_readdir(struct file *, void *, filldir_t);
-struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
+struct pde_opener {
+ struct file *file;
+ struct list_head lh;
+ int closing;
+ struct completion *c;
+};
+extern const struct inode_operations proc_pid_link_inode_operations;
+extern void proc_init_inodecache(void);
+extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
+extern int proc_fill_super(struct super_block *);
+extern void proc_entry_rundown(struct proc_dir_entry *);
-/* Lookups */
-typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
- struct task_struct *, const void *);
-int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
- const char *name, int len,
- instantiate_t instantiate, struct task_struct *task, const void *ptr);
-int pid_revalidate(struct dentry *dentry, unsigned int flags);
-struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task);
-extern const struct dentry_operations pid_dentry_operations;
-int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
-int proc_setattr(struct dentry *dentry, struct iattr *attr);
+/*
+ * proc_devtree.c
+ */
+#ifdef CONFIG_PROC_DEVICETREE
+extern void proc_device_tree_init(void);
+#endif
+/*
+ * proc_namespaces.c
+ */
extern const struct inode_operations proc_ns_dir_inode_operations;
extern const struct file_operations proc_ns_dir_operations;
+/*
+ * proc_net.c
+ */
+extern const struct file_operations proc_net_operations;
+extern const struct inode_operations proc_net_inode_operations;
+
+#ifdef CONFIG_NET
+extern int proc_net_init(void);
+#else
+static inline int proc_net_init(void) { return 0; }
+#endif
+
+/*
+ * proc_self.c
+ */
+extern int proc_setup_self(struct super_block *);
+
+/*
+ * proc_sysctl.c
+ */
+#ifdef CONFIG_PROC_SYSCTL
+extern int proc_sys_init(void);
+extern void sysctl_head_put(struct ctl_table_header *);
+#else
+static inline void proc_sys_init(void) { }
+static inline void sysctl_head_put(struct ctl_table_header *head) { }
+#endif
+
+/*
+ * proc_tty.c
+ */
+#ifdef CONFIG_TTY
+extern void proc_tty_init(void);
+#else
+static inline void proc_tty_init(void) {}
+#endif
+
+/*
+ * root.c
+ */
+extern struct proc_dir_entry proc_root;
+
+extern void proc_self_init(void);
+extern int proc_remount(struct super_block *, int *, char *);
+
+/*
+ * task_[no]mmu.c
+ */
+struct proc_maps_private {
+ struct pid *pid;
+ struct task_struct *task;
+#ifdef CONFIG_MMU
+ struct vm_area_struct *tail_vma;
+#endif
+#ifdef CONFIG_NUMA
+ struct mempolicy *task_mempolicy;
+#endif
+};
+
+extern const struct file_operations proc_pid_maps_operations;
+extern const struct file_operations proc_tid_maps_operations;
+extern const struct file_operations proc_pid_numa_maps_operations;
+extern const struct file_operations proc_tid_numa_maps_operations;
+extern const struct file_operations proc_pid_smaps_operations;
+extern const struct file_operations proc_tid_smaps_operations;
+extern const struct file_operations proc_clear_refs_operations;
+extern const struct file_operations proc_pagemap_operations;
+
+extern unsigned long task_vsize(struct mm_struct *);
+extern unsigned long task_statm(struct mm_struct *,
+ unsigned long *, unsigned long *,
+ unsigned long *, unsigned long *);
+extern void task_mem(struct seq_file *, struct mm_struct *);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index eda6f017f272..06ea155e1a59 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -11,10 +11,12 @@
#include <linux/mm.h>
#include <linux/proc_fs.h>
+#include <linux/kcore.h>
#include <linux/user.h>
#include <linux/capability.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
+#include <linux/notifier.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/printk.h>
@@ -27,6 +29,7 @@
#include <linux/ioport.h>
#include <linux/memory.h>
#include <asm/sections.h>
+#include "internal.h"
#define CORE_STR "CORE"
@@ -405,7 +408,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
prpsinfo.pr_zomb = 0;
strcpy(prpsinfo.pr_fname, "vmlinux");
- strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ);
+ strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
nhdr->p_filesz += notesize(&notes[1]);
bufp = storenote(&notes[1], bufp);
@@ -564,7 +567,6 @@ static const struct file_operations proc_kcore_operations = {
.llseek = default_llseek,
};
-#ifdef CONFIG_MEMORY_HOTPLUG
/* just remember that we have to update kcore */
static int __meminit kcore_callback(struct notifier_block *self,
unsigned long action, void *arg)
@@ -578,8 +580,11 @@ static int __meminit kcore_callback(struct notifier_block *self,
}
return NOTIFY_OK;
}
-#endif
+static struct notifier_block kcore_callback_nb __meminitdata = {
+ .notifier_call = kcore_callback,
+ .priority = 0,
+};
static struct kcore_list kcore_vmalloc;
@@ -631,7 +636,7 @@ static int __init proc_kcore_init(void)
add_modules_range();
/* Store direct-map area from physical memory map */
kcore_update_ram();
- hotplug_memory_notifier(kcore_callback, 0);
+ register_hotmemory_notifier(&kcore_callback_nb);
return 0;
}
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index bd4b5a740ff1..bdfabdaefdce 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait;
static int kmsg_open(struct inode * inode, struct file * file)
{
- return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
+ return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
}
static int kmsg_release(struct inode * inode, struct file * file)
{
- (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
+ (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC);
return 0;
}
@@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
if ((file->f_flags & O_NONBLOCK) &&
- !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+ !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
return -EAGAIN;
- return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
+ return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC);
}
static unsigned int kmsg_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &log_wait, wait);
- if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+ if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
return POLLIN | POLLRDNORM;
return 0;
}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 1efaaa19c4f3..5aa847a603c0 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -11,6 +11,7 @@
#include <linux/swap.h>
#include <linux/vmstat.h>
#include <linux/atomic.h>
+#include <linux/vmalloc.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include "internal.h"
diff --git a/fs/proc/mmu.c b/fs/proc/mmu.c
deleted file mode 100644
index 8ae221dfd010..000000000000
--- a/fs/proc/mmu.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/* mmu.c: mmu memory info files
- *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/spinlock.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <asm/pgtable.h>
-#include "internal.h"
-
-void get_vmalloc_info(struct vmalloc_info *vmi)
-{
- struct vm_struct *vma;
- unsigned long free_area_size;
- unsigned long prev_end;
-
- vmi->used = 0;
-
- if (!vmlist) {
- vmi->largest_chunk = VMALLOC_TOTAL;
- }
- else {
- vmi->largest_chunk = 0;
-
- prev_end = VMALLOC_START;
-
- read_lock(&vmlist_lock);
-
- for (vma = vmlist; vma; vma = vma->next) {
- unsigned long addr = (unsigned long) vma->addr;
-
- /*
- * Some archs keep another range for modules in vmlist
- */
- if (addr < VMALLOC_START)
- continue;
- if (addr >= VMALLOC_END)
- break;
-
- vmi->used += vma->size;
-
- free_area_size = addr - prev_end;
- if (vmi->largest_chunk < free_area_size)
- vmi->largest_chunk = free_area_size;
-
- prev_end = vma->size + addr;
- }
-
- if (VMALLOC_END - prev_end > vmi->largest_chunk)
- vmi->largest_chunk = VMALLOC_END - prev_end;
-
- read_unlock(&vmlist_lock);
- }
-}
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 66b51c0383da..49a7fff2e83a 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -51,7 +51,7 @@ static int ns_delete_dentry(const struct dentry *dentry)
static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
{
struct inode *inode = dentry->d_inode;
- const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
+ const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops;
return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
ns_ops->name, inode->i_ino);
@@ -95,8 +95,8 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb,
inode->i_op = &ns_inode_operations;
inode->i_mode = S_IFREG | S_IRUGO;
inode->i_fop = &ns_file_operations;
- ei->ns_ops = ns_ops;
- ei->ns = ns;
+ ei->ns.ns_ops = ns_ops;
+ ei->ns.ns = ns;
unlock_new_inode(inode);
} else {
ns_ops->put(ns);
@@ -128,7 +128,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out_put_task;
- ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns_ops);
+ ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops);
if (IS_ERR(ns_path.dentry)) {
error = ERR_CAST(ns_path.dentry);
goto out_put_task;
@@ -148,7 +148,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
{
struct inode *inode = dentry->d_inode;
struct proc_inode *ei = PROC_I(inode);
- const struct proc_ns_operations *ns_ops = ei->ns_ops;
+ const struct proc_ns_operations *ns_ops = ei->ns.ns_ops;
struct task_struct *task;
void *ns;
char name[50];
@@ -187,13 +187,12 @@ static const struct inode_operations proc_ns_link_inode_operations = {
.setattr = proc_setattr,
};
-static struct dentry *proc_ns_instantiate(struct inode *dir,
+static int proc_ns_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
{
const struct proc_ns_operations *ns_ops = ptr;
struct inode *inode;
struct proc_inode *ei;
- struct dentry *error = ERR_PTR(-ENOENT);
inode = proc_pid_make_inode(dir->i_sb, task);
if (!inode)
@@ -202,96 +201,58 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
ei = PROC_I(inode);
inode->i_mode = S_IFLNK|S_IRWXUGO;
inode->i_op = &proc_ns_link_inode_operations;
- ei->ns_ops = ns_ops;
+ ei->ns.ns_ops = ns_ops;
d_set_d_op(dentry, &pid_dentry_operations);
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */
if (pid_revalidate(dentry, 0))
- error = NULL;
+ return 0;
out:
- return error;
-}
-
-static int proc_ns_fill_cache(struct file *filp, void *dirent,
- filldir_t filldir, struct task_struct *task,
- const struct proc_ns_operations *ops)
-{
- return proc_fill_cache(filp, dirent, filldir,
- ops->name, strlen(ops->name),
- proc_ns_instantiate, task, ops);
+ return -ENOENT;
}
-static int proc_ns_dir_readdir(struct file *filp, void *dirent,
- filldir_t filldir)
+static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
{
- int i;
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct task_struct *task = get_proc_task(inode);
+ struct task_struct *task = get_proc_task(file_inode(file));
const struct proc_ns_operations **entry, **last;
- ino_t ino;
- int ret;
- ret = -ENOENT;
if (!task)
- goto out_no_task;
+ return -ENOENT;
- ret = 0;
- i = filp->f_pos;
- switch (i) {
- case 0:
- ino = inode->i_ino;
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- default:
- i -= 2;
- if (i >= ARRAY_SIZE(ns_entries)) {
- ret = 1;
- goto out;
- }
- entry = ns_entries + i;
- last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
- while (entry <= last) {
- if (proc_ns_fill_cache(filp, dirent, filldir,
- task, *entry) < 0)
- goto out;
- filp->f_pos++;
- entry++;
- }
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+ if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
+ goto out;
+ entry = ns_entries + (ctx->pos - 2);
+ last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
+ while (entry <= last) {
+ const struct proc_ns_operations *ops = *entry;
+ if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
+ proc_ns_instantiate, task, ops))
+ break;
+ ctx->pos++;
+ entry++;
}
-
- ret = 1;
out:
put_task_struct(task);
-out_no_task:
- return ret;
+ return 0;
}
const struct file_operations proc_ns_dir_operations = {
.read = generic_read_dir,
- .readdir = proc_ns_dir_readdir,
+ .iterate = proc_ns_dir_readdir,
};
static struct dentry *proc_ns_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
- struct dentry *error;
+ int error;
struct task_struct *task = get_proc_task(dir);
const struct proc_ns_operations **entry, **last;
unsigned int len = dentry->d_name.len;
- error = ERR_PTR(-ENOENT);
+ error = -ENOENT;
if (!task)
goto out_no_task;
@@ -310,7 +271,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
out:
put_task_struct(task);
out_no_task:
- return error;
+ return ERR_PTR(error);
}
const struct inode_operations proc_ns_dir_inode_operations = {
@@ -337,6 +298,11 @@ out_invalid:
return ERR_PTR(-EINVAL);
}
+struct proc_ns *get_proc_ns(struct inode *inode)
+{
+ return &PROC_I(inode)->ns;
+}
+
bool proc_ns_inode(struct inode *inode)
{
return inode->i_fop == &ns_file_operations;
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 30b590f5bd35..106a83570630 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -12,7 +12,7 @@
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/of.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/slab.h>
#include <asm/prom.h>
#include <asm/uaccess.h>
@@ -41,7 +41,7 @@ static int property_proc_show(struct seq_file *m, void *v)
static int property_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, property_proc_show, PDE(inode)->data);
+ return single_open(file, property_proc_show, __PDE_DATA(inode));
}
static const struct file_operations property_proc_fops = {
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index b4ac6572474f..4677bb7dc7c2 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -26,6 +26,10 @@
#include "internal.h"
+static inline struct net *PDE_NET(struct proc_dir_entry *pde)
+{
+ return pde->parent->data;
+}
static struct net *get_proc_net(const struct inode *inode)
{
@@ -156,16 +160,15 @@ const struct inode_operations proc_net_inode_operations = {
.getattr = proc_tgid_net_getattr,
};
-static int proc_tgid_net_readdir(struct file *filp, void *dirent,
- filldir_t filldir)
+static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
{
int ret;
struct net *net;
ret = -EINVAL;
- net = get_proc_task_net(file_inode(filp));
+ net = get_proc_task_net(file_inode(file));
if (net != NULL) {
- ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);
+ ret = proc_readdir_de(net->proc_net, file, ctx);
put_net(net);
}
return ret;
@@ -174,7 +177,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
const struct file_operations proc_net_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = proc_tgid_net_readdir,
+ .iterate = proc_tgid_net_readdir,
};
static __net_init int proc_net_ns_init(struct net *net)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ac05f33a0dde..71290463a1d3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -573,12 +573,12 @@ out:
return ret;
}
-static int proc_sys_fill_cache(struct file *filp, void *dirent,
- filldir_t filldir,
+static bool proc_sys_fill_cache(struct file *file,
+ struct dir_context *ctx,
struct ctl_table_header *head,
struct ctl_table *table)
{
- struct dentry *child, *dir = filp->f_path.dentry;
+ struct dentry *child, *dir = file->f_path.dentry;
struct inode *inode;
struct qstr qname;
ino_t ino = 0;
@@ -595,38 +595,38 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
inode = proc_sys_make_inode(dir->d_sb, head, table);
if (!inode) {
dput(child);
- return -ENOMEM;
+ return false;
} else {
d_set_d_op(child, &proc_sys_dentry_operations);
d_add(child, inode);
}
} else {
- return -ENOMEM;
+ return false;
}
}
inode = child->d_inode;
ino = inode->i_ino;
type = inode->i_mode >> 12;
dput(child);
- return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
+ return dir_emit(ctx, qname.name, qname.len, ino, type);
}
-static int proc_sys_link_fill_cache(struct file *filp, void *dirent,
- filldir_t filldir,
+static bool proc_sys_link_fill_cache(struct file *file,
+ struct dir_context *ctx,
struct ctl_table_header *head,
struct ctl_table *table)
{
- int err, ret = 0;
+ bool ret = true;
head = sysctl_head_grab(head);
if (S_ISLNK(table->mode)) {
/* It is not an error if we can not follow the link ignore it */
- err = sysctl_follow_link(&head, &table, current->nsproxy);
+ int err = sysctl_follow_link(&head, &table, current->nsproxy);
if (err)
goto out;
}
- ret = proc_sys_fill_cache(filp, dirent, filldir, head, table);
+ ret = proc_sys_fill_cache(file, ctx, head, table);
out:
sysctl_head_finish(head);
return ret;
@@ -634,67 +634,50 @@ out:
static int scan(struct ctl_table_header *head, ctl_table *table,
unsigned long *pos, struct file *file,
- void *dirent, filldir_t filldir)
+ struct dir_context *ctx)
{
- int res;
+ bool res;
- if ((*pos)++ < file->f_pos)
- return 0;
+ if ((*pos)++ < ctx->pos)
+ return true;
if (unlikely(S_ISLNK(table->mode)))
- res = proc_sys_link_fill_cache(file, dirent, filldir, head, table);
+ res = proc_sys_link_fill_cache(file, ctx, head, table);
else
- res = proc_sys_fill_cache(file, dirent, filldir, head, table);
+ res = proc_sys_fill_cache(file, ctx, head, table);
- if (res == 0)
- file->f_pos = *pos;
+ if (res)
+ ctx->pos = *pos;
return res;
}
-static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct ctl_table_header *head = grab_header(inode);
+ struct ctl_table_header *head = grab_header(file_inode(file));
struct ctl_table_header *h = NULL;
struct ctl_table *entry;
struct ctl_dir *ctl_dir;
unsigned long pos;
- int ret = -EINVAL;
if (IS_ERR(head))
return PTR_ERR(head);
ctl_dir = container_of(head, struct ctl_dir, header);
- ret = 0;
- /* Avoid a switch here: arm builds fail with missing __cmpdi2 */
- if (filp->f_pos == 0) {
- if (filldir(dirent, ".", 1, filp->f_pos,
- inode->i_ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- }
- if (filp->f_pos == 1) {
- if (filldir(dirent, "..", 2, filp->f_pos,
- parent_ino(dentry), DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- }
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
pos = 2;
for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
- ret = scan(h, entry, &pos, filp, dirent, filldir);
- if (ret) {
+ if (!scan(h, entry, &pos, file, ctx)) {
sysctl_head_finish(h);
break;
}
}
- ret = 1;
-out:
sysctl_head_finish(head);
- return ret;
+ return 0;
}
static int proc_sys_permission(struct inode *inode, int mask)
@@ -769,7 +752,7 @@ static const struct file_operations proc_sys_file_operations = {
static const struct file_operations proc_sys_dir_file_operations = {
.read = generic_read_dir,
- .readdir = proc_sys_readdir,
+ .iterate = proc_sys_readdir,
.llseek = generic_file_llseek,
};
@@ -813,15 +796,16 @@ static int sysctl_is_seen(struct ctl_table_header *p)
return res;
}
-static int proc_sys_compare(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
struct ctl_table_header *head;
+ struct inode *inode;
+
/* Although proc doesn't have negative dentries, rcu-walk means
* that inode here can be NULL */
/* AV: can it, indeed? */
+ inode = ACCESS_ONCE(dentry->d_inode);
if (!inode)
return 1;
if (name->len != len)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9c7fab1d23f0..229e366598da 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -141,6 +141,8 @@ static void proc_kill_sb(struct super_block *sb)
struct pid_namespace *ns;
ns = (struct pid_namespace *)sb->s_fs_info;
+ if (ns->proc_self)
+ dput(ns->proc_self);
kill_anon_super(sb);
put_pid_ns(ns);
}
@@ -200,21 +202,14 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr
return proc_pid_lookup(dir, dentry, flags);
}
-static int proc_root_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int proc_root_readdir(struct file *file, struct dir_context *ctx)
{
- unsigned int nr = filp->f_pos;
- int ret;
-
- if (nr < FIRST_PROCESS_ENTRY) {
- int error = proc_readdir(filp, dirent, filldir);
- if (error <= 0)
- return error;
- filp->f_pos = FIRST_PROCESS_ENTRY;
+ if (ctx->pos < FIRST_PROCESS_ENTRY) {
+ proc_readdir(file, ctx);
+ ctx->pos = FIRST_PROCESS_ENTRY;
}
- ret = proc_pid_readdir(filp, dirent, filldir);
- return ret;
+ return proc_pid_readdir(file, ctx);
}
/*
@@ -224,7 +219,7 @@ static int proc_root_readdir(struct file * filp,
*/
static const struct file_operations proc_root_operations = {
.read = generic_read_dir,
- .readdir = proc_root_readdir,
+ .iterate = proc_root_readdir,
.llseek = default_llseek,
};
diff --git a/fs/proc/self.c b/fs/proc/self.c
index aa5cc3bff140..6b6a993b5c25 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -1,6 +1,8 @@
-#include <linux/proc_fs.h>
#include <linux/sched.h>
#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/pid_namespace.h>
+#include "internal.h"
/*
* /proc/self:
@@ -48,12 +50,43 @@ static const struct inode_operations proc_self_inode_operations = {
.put_link = proc_self_put_link,
};
-void __init proc_self_init(void)
+static unsigned self_inum;
+
+int proc_setup_self(struct super_block *s)
{
- struct proc_dir_entry *proc_self_symlink;
- mode_t mode;
+ struct inode *root_inode = s->s_root->d_inode;
+ struct pid_namespace *ns = s->s_fs_info;
+ struct dentry *self;
+
+ mutex_lock(&root_inode->i_mutex);
+ self = d_alloc_name(s->s_root, "self");
+ if (self) {
+ struct inode *inode = new_inode_pseudo(s);
+ if (inode) {
+ inode->i_ino = self_inum;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mode = S_IFLNK | S_IRWXUGO;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_op = &proc_self_inode_operations;
+ d_add(self, inode);
+ } else {
+ dput(self);
+ self = ERR_PTR(-ENOMEM);
+ }
+ } else {
+ self = ERR_PTR(-ENOMEM);
+ }
+ mutex_unlock(&root_inode->i_mutex);
+ if (IS_ERR(self)) {
+ pr_err("proc_fill_super: can't allocate /proc/self\n");
+ return PTR_ERR(self);
+ }
+ ns->proc_self = self;
+ return 0;
+}
- mode = S_IFLNK | S_IRWXUGO;
- proc_self_symlink = proc_create("self", mode, NULL, NULL );
- proc_self_symlink->proc_iops = &proc_self_inode_operations;
+void __init proc_self_init(void)
+{
+ proc_alloc_inum(&self_inum);
}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index e296572c73ed..1cf86c0e8689 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -184,7 +184,7 @@ static int show_stat(struct seq_file *p, void *v)
static int stat_open(struct inode *inode, struct file *file)
{
- unsigned size = 1024 + 128 * num_possible_cpus();
+ size_t size = 1024 + 128 * num_possible_cpus();
char *buf;
struct seq_file *m;
int res;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d864d56..dbf61f6174f0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -688,10 +689,58 @@ const struct file_operations proc_tid_smaps_operations = {
.release = seq_release_private,
};
+/*
+ * We do not want to have constant page-shift bits sitting in
+ * pagemap entries and are about to reuse them some time soon.
+ *
+ * Here's the "migration strategy":
+ * 1. when the system boots these bits remain what they are,
+ * but a warning about future change is printed in log;
+ * 2. once anyone clears soft-dirty bits via clear_refs file,
+ * this flag is set to denote that the user is aware of the
+ * new API and those page-shift bits change their meaning.
+ * The respective warning is printed in dmesg;
+ * 3. In a couple of releases we will remove all the mentions
+ * of page-shift in pagemap entries.
+ */
+
+static bool soft_dirty_cleared __read_mostly;
+
+enum clear_refs_types {
+ CLEAR_REFS_ALL = 1,
+ CLEAR_REFS_ANON,
+ CLEAR_REFS_MAPPED,
+ CLEAR_REFS_SOFT_DIRTY,
+ CLEAR_REFS_LAST,
+};
+
+struct clear_refs_private {
+ struct vm_area_struct *vma;
+ enum clear_refs_types type;
+};
+
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ /*
+ * The soft-dirty tracker uses #PF-s to catch writes
+ * to pages, so write-protect the pte as well. See the
+ * Documentation/vm/soft-dirty.txt for full description
+ * of how soft-dirty works.
+ */
+ pte_t ptent = *pte;
+ ptent = pte_wrprotect(ptent);
+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
+#endif
+}
+
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->private;
+ struct clear_refs_private *cp = walk->private;
+ struct vm_area_struct *vma = cp->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
@@ -706,6 +755,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte_present(ptent))
continue;
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ clear_soft_dirty(vma, addr, pte);
+ continue;
+ }
+
page = vm_normal_page(vma, addr, ptent);
if (!page)
continue;
@@ -719,10 +773,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-#define CLEAR_REFS_ALL 1
-#define CLEAR_REFS_ANON 2
-#define CLEAR_REFS_MAPPED 3
-
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -730,7 +780,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
char buffer[PROC_NUMBUF];
struct mm_struct *mm;
struct vm_area_struct *vma;
- int type;
+ enum clear_refs_types type;
+ int itype;
int rv;
memset(buffer, 0, sizeof(buffer));
@@ -738,23 +789,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- rv = kstrtoint(strstrip(buffer), 10, &type);
+ rv = kstrtoint(strstrip(buffer), 10, &itype);
if (rv < 0)
return rv;
- if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
+ type = (enum clear_refs_types)itype;
+ if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
return -EINVAL;
+
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
+ soft_dirty_cleared = true;
+ pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
+ "See the linux/Documentation/vm/pagemap.txt for details.\n");
+ }
+
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
+ struct clear_refs_private cp = {
+ .type = type,
+ };
struct mm_walk clear_refs_walk = {
.pmd_entry = clear_refs_pte_range,
.mm = mm,
+ .private = &cp,
};
down_read(&mm->mmap_sem);
+ if (type == CLEAR_REFS_SOFT_DIRTY)
+ mmu_notifier_invalidate_range_start(mm, 0, -1);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- clear_refs_walk.private = vma;
+ cp.vma = vma;
if (is_vm_hugetlb_page(vma))
continue;
/*
@@ -773,6 +838,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
walk_page_range(vma->vm_start, vma->vm_end,
&clear_refs_walk);
}
+ if (type == CLEAR_REFS_SOFT_DIRTY)
+ mmu_notifier_invalidate_range_end(mm, 0, -1);
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
mmput(mm);
@@ -794,6 +861,7 @@ typedef struct {
struct pagemapread {
int pos, len;
pagemap_entry_t *buffer;
+ bool v2;
};
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
@@ -807,14 +875,17 @@ struct pagemapread {
#define PM_PSHIFT_BITS 6
#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
+/* in "new" pagemap pshift bits are occupied with more status bits */
+#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
+#define __PM_SOFT_DIRTY (1LL)
#define PM_PRESENT PM_STATUS(4LL)
#define PM_SWAP PM_STATUS(2LL)
#define PM_FILE PM_STATUS(1LL)
-#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT)
+#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
#define PM_END_OF_BUFFER 1
static inline pagemap_entry_t make_pme(u64 val)
@@ -837,7 +908,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
struct pagemapread *pm = walk->private;
unsigned long addr;
int err = 0;
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
for (addr = start; addr < end; addr += PAGE_SIZE) {
err = add_to_pagemap(addr, &pme, pm);
@@ -847,11 +918,12 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
return err;
}
-static void pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
u64 frame, flags;
struct page *page = NULL;
+ int flags2 = 0;
if (pte_present(pte)) {
frame = pte_pfn(pte);
@@ -866,19 +938,21 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme,
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
} else {
- *pme = make_pme(PM_NOT_PRESENT);
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2));
return;
}
if (page && !PageAnon(page))
flags |= PM_FILE;
+ if (pte_soft_dirty(pte))
+ flags2 |= __PM_SOFT_DIRTY;
- *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags);
+ *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
- pmd_t pmd, int offset)
+static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+ pmd_t pmd, int offset, int pmd_flags2)
{
/*
* Currently pmd for thp is always present because thp can not be
@@ -887,13 +961,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
*/
if (pmd_present(pmd))
*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+ | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
else
- *pme = make_pme(PM_NOT_PRESENT);
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2));
}
#else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
- pmd_t pmd, int offset)
+static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+ pmd_t pmd, int offset, int pmd_flags2)
{
}
#endif
@@ -905,17 +979,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct pagemapread *pm = walk->private;
pte_t *pte;
int err = 0;
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+ int pmd_flags2;
+
+ pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
for (; addr != end; addr += PAGE_SIZE) {
unsigned long offset;
offset = (addr & ~PAGEMAP_WALK_MASK) >>
PAGE_SHIFT;
- thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
+ thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
@@ -932,7 +1009,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
* and need a new, higher one */
if (vma && (addr >= vma->vm_end)) {
vma = find_vma(walk->mm, addr);
- pme = make_pme(PM_NOT_PRESENT);
+ pme = make_pme(PM_NOT_PRESENT(pm->v2));
}
/* check that 'vma' actually covers this address,
@@ -940,7 +1017,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (vma && (vma->vm_start <= addr) &&
!is_vm_hugetlb_page(vma)) {
pte = pte_offset_map(pmd, addr);
- pte_to_pagemap_entry(&pme, vma, addr, *pte);
+ pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
/* unmap before userspace copy */
pte_unmap(pte);
}
@@ -955,14 +1032,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
#ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
pte_t pte, int offset)
{
if (pte_present(pte))
*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+ | PM_STATUS2(pm->v2, 0) | PM_PRESENT);
else
- *pme = make_pme(PM_NOT_PRESENT);
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2));
}
/* This function walks within one hugetlb entry in the single call */
@@ -976,7 +1053,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
for (; addr != end; addr += PAGE_SIZE) {
int offset = (addr & ~hmask) >> PAGE_SHIFT;
- huge_pte_to_pagemap_entry(&pme, *pte, offset);
+ huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
@@ -1038,6 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!count)
goto out_task;
+ pm.v2 = soft_dirty_cleared;
pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
ret = -ENOMEM;
@@ -1110,9 +1188,18 @@ out:
return ret;
}
+static int pagemap_open(struct inode *inode, struct file *file)
+{
+ pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
+ "to stop being page-shift some time soon. See the "
+ "linux/Documentation/vm/pagemap.txt for details.\n");
+ return 0;
+}
+
const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
+ .open = pagemap_open,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 9610ac772d7e..061894625903 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -20,8 +20,7 @@ static int uptime_proc_show(struct seq_file *m, void *v)
for_each_possible_cpu(i)
idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
- do_posix_clock_monotonic_gettime(&uptime);
- monotonic_to_bootbased(&uptime);
+ get_monotonic_boottime(&uptime);
nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
idle.tv_nsec = rem;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index b870f740ab5a..28503172f2e4 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -8,7 +8,7 @@
*/
#include <linux/mm.h>
-#include <linux/proc_fs.h>
+#include <linux/kcore.h>
#include <linux/user.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
@@ -20,8 +20,10 @@
#include <linux/init.h>
#include <linux/crash_dump.h>
#include <linux/list.h>
+#include <linux/vmalloc.h>
#include <asm/uaccess.h>
#include <asm/io.h>
+#include "internal.h"
/* List representing chunks of contiguous memory areas and their offsets in
* vmcore file.
@@ -31,6 +33,10 @@ static LIST_HEAD(vmcore_list);
/* Stores the pointer to the buffer containing kernel elf core headers. */
static char *elfcorebuf;
static size_t elfcorebuf_sz;
+static size_t elfcorebuf_sz_orig;
+
+static char *elfnotes_buf;
+static size_t elfnotes_sz;
/* Total size of vmcore file. */
static u64 vmcore_size;
@@ -117,27 +123,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
return read;
}
-/* Maps vmcore file offset to respective physical address in memroy. */
-static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
- struct vmcore **m_ptr)
-{
- struct vmcore *m;
- u64 paddr;
-
- list_for_each_entry(m, vc_list, list) {
- u64 start, end;
- start = m->offset;
- end = m->offset + m->size - 1;
- if (offset >= start && offset <= end) {
- paddr = m->paddr + offset - start;
- *m_ptr = m;
- return paddr;
- }
- }
- *m_ptr = NULL;
- return 0;
-}
-
/* Read from the ELF header and then the crash dump. On error, negative value is
* returned otherwise number of bytes read are returned.
*/
@@ -146,8 +131,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
{
ssize_t acc = 0, tmp;
size_t tsz;
- u64 start, nr_bytes;
- struct vmcore *curr_m = NULL;
+ u64 start;
+ struct vmcore *m = NULL;
if (buflen == 0 || *fpos >= vmcore_size)
return 0;
@@ -158,9 +143,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
/* Read ELF core header */
if (*fpos < elfcorebuf_sz) {
- tsz = elfcorebuf_sz - *fpos;
- if (buflen < tsz)
- tsz = buflen;
+ tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
return -EFAULT;
buflen -= tsz;
@@ -173,39 +156,161 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
return acc;
}
- start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m);
- if (!curr_m)
- return -EINVAL;
-
- while (buflen) {
- tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK));
+ /* Read Elf note segment */
+ if (*fpos < elfcorebuf_sz + elfnotes_sz) {
+ void *kaddr;
- /* Calculate left bytes in current memory segment. */
- nr_bytes = (curr_m->size - (start - curr_m->paddr));
- if (tsz > nr_bytes)
- tsz = nr_bytes;
-
- tmp = read_from_oldmem(buffer, tsz, &start, 1);
- if (tmp < 0)
- return tmp;
+ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
+ kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
+ if (copy_to_user(buffer, kaddr, tsz))
+ return -EFAULT;
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
acc += tsz;
- if (start >= (curr_m->paddr + curr_m->size)) {
- if (curr_m->list.next == &vmcore_list)
- return acc; /*EOF*/
- curr_m = list_entry(curr_m->list.next,
- struct vmcore, list);
- start = curr_m->paddr;
+
+ /* leave now if filled buffer already */
+ if (buflen == 0)
+ return acc;
+ }
+
+ list_for_each_entry(m, &vmcore_list, list) {
+ if (*fpos < m->offset + m->size) {
+ tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
+ start = m->paddr + *fpos - m->offset;
+ tmp = read_from_oldmem(buffer, tsz, &start, 1);
+ if (tmp < 0)
+ return tmp;
+ buflen -= tsz;
+ *fpos += tsz;
+ buffer += tsz;
+ acc += tsz;
+
+ /* leave now if filled buffer already */
+ if (buflen == 0)
+ return acc;
}
}
+
return acc;
}
+/**
+ * alloc_elfnotes_buf - allocate buffer for ELF note segment in
+ * vmalloc memory
+ *
+ * @notes_sz: size of buffer
+ *
+ * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
+ * the buffer to user-space by means of remap_vmalloc_range().
+ *
+ * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
+ * disabled and there's no need to allow users to mmap the buffer.
+ */
+static inline char *alloc_elfnotes_buf(size_t notes_sz)
+{
+#ifdef CONFIG_MMU
+ return vmalloc_user(notes_sz);
+#else
+ return vzalloc(notes_sz);
+#endif
+}
+
+/*
+ * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
+ * essential for mmap_vmcore() in order to map physically
+ * non-contiguous objects (ELF header, ELF note segment and memory
+ * regions in the 1st kernel pointed to by PT_LOAD entries) into
+ * virtually contiguous user-space in ELF layout.
+ */
+#ifdef CONFIG_MMU
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
+{
+ size_t size = vma->vm_end - vma->vm_start;
+ u64 start, end, len, tsz;
+ struct vmcore *m;
+
+ start = (u64)vma->vm_pgoff << PAGE_SHIFT;
+ end = start + size;
+
+ if (size > vmcore_size || end > vmcore_size)
+ return -EINVAL;
+
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+ return -EPERM;
+
+ vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+ vma->vm_flags |= VM_MIXEDMAP;
+
+ len = 0;
+
+ if (start < elfcorebuf_sz) {
+ u64 pfn;
+
+ tsz = min(elfcorebuf_sz - (size_t)start, size);
+ pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT;
+ if (remap_pfn_range(vma, vma->vm_start, pfn, tsz,
+ vma->vm_page_prot))
+ return -EAGAIN;
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ if (size == 0)
+ return 0;
+ }
+
+ if (start < elfcorebuf_sz + elfnotes_sz) {
+ void *kaddr;
+
+ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
+ kaddr = elfnotes_buf + start - elfcorebuf_sz;
+ if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
+ kaddr, tsz))
+ goto fail;
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ if (size == 0)
+ return 0;
+ }
+
+ list_for_each_entry(m, &vmcore_list, list) {
+ if (start < m->offset + m->size) {
+ u64 paddr = 0;
+
+ tsz = min_t(size_t, m->offset + m->size - start, size);
+ paddr = m->paddr + start - m->offset;
+ if (remap_pfn_range(vma, vma->vm_start + len,
+ paddr >> PAGE_SHIFT, tsz,
+ vma->vm_page_prot))
+ goto fail;
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ if (size == 0)
+ return 0;
+ }
+ }
+
+ return 0;
+fail:
+ do_munmap(vma->vm_mm, vma->vm_start, len);
+ return -EAGAIN;
+}
+#else
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
+{
+ return -ENOSYS;
+}
+#endif
+
static const struct file_operations proc_vmcore_operations = {
.read = read_vmcore,
.llseek = default_llseek,
+ .mmap = mmap_vmcore,
};
static struct vmcore* __init get_new_element(void)
@@ -213,61 +318,40 @@ static struct vmcore* __init get_new_element(void)
return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
}
-static u64 __init get_vmcore_size_elf64(char *elfptr)
+static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
+ struct list_head *vc_list)
{
- int i;
- u64 size;
- Elf64_Ehdr *ehdr_ptr;
- Elf64_Phdr *phdr_ptr;
-
- ehdr_ptr = (Elf64_Ehdr *)elfptr;
- phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
- size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr));
- for (i = 0; i < ehdr_ptr->e_phnum; i++) {
- size += phdr_ptr->p_memsz;
- phdr_ptr++;
- }
- return size;
-}
-
-static u64 __init get_vmcore_size_elf32(char *elfptr)
-{
- int i;
u64 size;
- Elf32_Ehdr *ehdr_ptr;
- Elf32_Phdr *phdr_ptr;
+ struct vmcore *m;
- ehdr_ptr = (Elf32_Ehdr *)elfptr;
- phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
- size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr));
- for (i = 0; i < ehdr_ptr->e_phnum; i++) {
- size += phdr_ptr->p_memsz;
- phdr_ptr++;
+ size = elfsz + elfnotesegsz;
+ list_for_each_entry(m, vc_list, list) {
+ size += m->size;
}
return size;
}
-/* Merges all the PT_NOTE headers into one. */
-static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
- struct list_head *vc_list)
+/**
+ * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
{
- int i, nr_ptnote=0, rc=0;
- char *tmp;
- Elf64_Ehdr *ehdr_ptr;
- Elf64_Phdr phdr, *phdr_ptr;
+ int i, rc=0;
+ Elf64_Phdr *phdr_ptr;
Elf64_Nhdr *nhdr_ptr;
- u64 phdr_sz = 0, note_off;
- ehdr_ptr = (Elf64_Ehdr *)elfptr;
- phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
+ phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
- int j;
void *notes_section;
- struct vmcore *new;
u64 offset, max_sz, sz, real_sz = 0;
if (phdr_ptr->p_type != PT_NOTE)
continue;
- nr_ptnote++;
max_sz = phdr_ptr->p_memsz;
offset = phdr_ptr->p_offset;
notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -279,7 +363,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
return rc;
}
nhdr_ptr = notes_section;
- for (j = 0; j < max_sz; j += sz) {
+ while (real_sz < max_sz) {
if (nhdr_ptr->n_namesz == 0)
break;
sz = sizeof(Elf64_Nhdr) +
@@ -288,26 +372,122 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
real_sz += sz;
nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
}
-
- /* Add this contiguous chunk of notes section to vmcore list.*/
- new = get_new_element();
- if (!new) {
- kfree(notes_section);
- return -ENOMEM;
- }
- new->paddr = phdr_ptr->p_offset;
- new->size = real_sz;
- list_add_tail(&new->list, vc_list);
- phdr_sz += real_sz;
kfree(notes_section);
+ phdr_ptr->p_memsz = real_sz;
+ }
+
+ return 0;
+}
+
+/**
+ * get_note_number_and_size_elf64 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_memsz.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr have already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr,
+ int *nr_ptnote, u64 *sz_ptnote)
+{
+ int i;
+ Elf64_Phdr *phdr_ptr;
+
+ *nr_ptnote = *sz_ptnote = 0;
+
+ phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ *nr_ptnote += 1;
+ *sz_ptnote += phdr_ptr->p_memsz;
+ }
+
+ return 0;
+}
+
+/**
+ * copy_notes_elf64 - copy ELF note segments into a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr have already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
+{
+ int i, rc=0;
+ Elf64_Phdr *phdr_ptr;
+
+ phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1);
+
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 offset;
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ offset = phdr_ptr->p_offset;
+ rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
+ if (rc < 0)
+ return rc;
+ notes_buf += phdr_ptr->p_memsz;
}
+ return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
+ char **notes_buf, size_t *notes_sz)
+{
+ int i, nr_ptnote=0, rc=0;
+ char *tmp;
+ Elf64_Ehdr *ehdr_ptr;
+ Elf64_Phdr phdr;
+ u64 phdr_sz = 0, note_off;
+
+ ehdr_ptr = (Elf64_Ehdr *)elfptr;
+
+ rc = update_note_header_size_elf64(ehdr_ptr);
+ if (rc < 0)
+ return rc;
+
+ rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz);
+ if (rc < 0)
+ return rc;
+
+ *notes_sz = roundup(phdr_sz, PAGE_SIZE);
+ *notes_buf = alloc_elfnotes_buf(*notes_sz);
+ if (!*notes_buf)
+ return -ENOMEM;
+
+ rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
+ if (rc < 0)
+ return rc;
+
/* Prepare merged PT_NOTE program header. */
phdr.p_type = PT_NOTE;
phdr.p_flags = 0;
note_off = sizeof(Elf64_Ehdr) +
(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
- phdr.p_offset = note_off;
+ phdr.p_offset = roundup(note_off, PAGE_SIZE);
phdr.p_vaddr = phdr.p_paddr = 0;
phdr.p_filesz = phdr.p_memsz = phdr_sz;
phdr.p_align = 0;
@@ -321,6 +501,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
*elfsz = *elfsz - i;
memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
+ memset(elfptr + *elfsz, 0, i);
+ *elfsz = roundup(*elfsz, PAGE_SIZE);
/* Modify e_phnum to reflect merged headers. */
ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -328,27 +510,27 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
return 0;
}
-/* Merges all the PT_NOTE headers into one. */
-static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
- struct list_head *vc_list)
+/**
+ * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
{
- int i, nr_ptnote=0, rc=0;
- char *tmp;
- Elf32_Ehdr *ehdr_ptr;
- Elf32_Phdr phdr, *phdr_ptr;
+ int i, rc=0;
+ Elf32_Phdr *phdr_ptr;
Elf32_Nhdr *nhdr_ptr;
- u64 phdr_sz = 0, note_off;
- ehdr_ptr = (Elf32_Ehdr *)elfptr;
- phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
+ phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
- int j;
void *notes_section;
- struct vmcore *new;
u64 offset, max_sz, sz, real_sz = 0;
if (phdr_ptr->p_type != PT_NOTE)
continue;
- nr_ptnote++;
max_sz = phdr_ptr->p_memsz;
offset = phdr_ptr->p_offset;
notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -360,7 +542,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
return rc;
}
nhdr_ptr = notes_section;
- for (j = 0; j < max_sz; j += sz) {
+ while (real_sz < max_sz) {
if (nhdr_ptr->n_namesz == 0)
break;
sz = sizeof(Elf32_Nhdr) +
@@ -369,26 +551,122 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
real_sz += sz;
nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
}
-
- /* Add this contiguous chunk of notes section to vmcore list.*/
- new = get_new_element();
- if (!new) {
- kfree(notes_section);
- return -ENOMEM;
- }
- new->paddr = phdr_ptr->p_offset;
- new->size = real_sz;
- list_add_tail(&new->list, vc_list);
- phdr_sz += real_sz;
kfree(notes_section);
+ phdr_ptr->p_memsz = real_sz;
+ }
+
+ return 0;
+}
+
+/**
+ * get_note_number_and_size_elf32 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_memsz.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr have already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr,
+ int *nr_ptnote, u64 *sz_ptnote)
+{
+ int i;
+ Elf32_Phdr *phdr_ptr;
+
+ *nr_ptnote = *sz_ptnote = 0;
+
+ phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ *nr_ptnote += 1;
+ *sz_ptnote += phdr_ptr->p_memsz;
+ }
+
+ return 0;
+}
+
+/**
+ * copy_notes_elf32 - copy ELF note segments into a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr have already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
+{
+ int i, rc=0;
+ Elf32_Phdr *phdr_ptr;
+
+ phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1);
+
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 offset;
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ offset = phdr_ptr->p_offset;
+ rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
+ if (rc < 0)
+ return rc;
+ notes_buf += phdr_ptr->p_memsz;
}
+ return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
+ char **notes_buf, size_t *notes_sz)
+{
+ int i, nr_ptnote=0, rc=0;
+ char *tmp;
+ Elf32_Ehdr *ehdr_ptr;
+ Elf32_Phdr phdr;
+ u64 phdr_sz = 0, note_off;
+
+ ehdr_ptr = (Elf32_Ehdr *)elfptr;
+
+ rc = update_note_header_size_elf32(ehdr_ptr);
+ if (rc < 0)
+ return rc;
+
+ rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz);
+ if (rc < 0)
+ return rc;
+
+ *notes_sz = roundup(phdr_sz, PAGE_SIZE);
+ *notes_buf = alloc_elfnotes_buf(*notes_sz);
+ if (!*notes_buf)
+ return -ENOMEM;
+
+ rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
+ if (rc < 0)
+ return rc;
+
/* Prepare merged PT_NOTE program header. */
phdr.p_type = PT_NOTE;
phdr.p_flags = 0;
note_off = sizeof(Elf32_Ehdr) +
(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr);
- phdr.p_offset = note_off;
+ phdr.p_offset = roundup(note_off, PAGE_SIZE);
phdr.p_vaddr = phdr.p_paddr = 0;
phdr.p_filesz = phdr.p_memsz = phdr_sz;
phdr.p_align = 0;
@@ -402,6 +680,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
i = (nr_ptnote - 1) * sizeof(Elf32_Phdr);
*elfsz = *elfsz - i;
memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr)));
+ memset(elfptr + *elfsz, 0, i);
+ *elfsz = roundup(*elfsz, PAGE_SIZE);
/* Modify e_phnum to reflect merged headers. */
ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -413,6 +693,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
* the new offset fields of exported program headers. */
static int __init process_ptload_program_headers_elf64(char *elfptr,
size_t elfsz,
+ size_t elfnotes_sz,
struct list_head *vc_list)
{
int i;
@@ -424,32 +705,38 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
ehdr_ptr = (Elf64_Ehdr *)elfptr;
phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
- /* First program header is PT_NOTE header. */
- vmcore_off = sizeof(Elf64_Ehdr) +
- (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) +
- phdr_ptr->p_memsz; /* Note sections */
+ /* Skip Elf header, program headers and Elf note segment. */
+ vmcore_off = elfsz + elfnotes_sz;
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 paddr, start, end, size;
+
if (phdr_ptr->p_type != PT_LOAD)
continue;
+ paddr = phdr_ptr->p_offset;
+ start = rounddown(paddr, PAGE_SIZE);
+ end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+ size = end - start;
+
/* Add this contiguous chunk of memory to vmcore list.*/
new = get_new_element();
if (!new)
return -ENOMEM;
- new->paddr = phdr_ptr->p_offset;
- new->size = phdr_ptr->p_memsz;
+ new->paddr = start;
+ new->size = size;
list_add_tail(&new->list, vc_list);
/* Update the program header offset. */
- phdr_ptr->p_offset = vmcore_off;
- vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+ phdr_ptr->p_offset = vmcore_off + (paddr - start);
+ vmcore_off = vmcore_off + size;
}
return 0;
}
static int __init process_ptload_program_headers_elf32(char *elfptr,
size_t elfsz,
+ size_t elfnotes_sz,
struct list_head *vc_list)
{
int i;
@@ -461,43 +748,44 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
ehdr_ptr = (Elf32_Ehdr *)elfptr;
phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
- /* First program header is PT_NOTE header. */
- vmcore_off = sizeof(Elf32_Ehdr) +
- (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) +
- phdr_ptr->p_memsz; /* Note sections */
+ /* Skip Elf header, program headers and Elf note segment. */
+ vmcore_off = elfsz + elfnotes_sz;
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 paddr, start, end, size;
+
if (phdr_ptr->p_type != PT_LOAD)
continue;
+ paddr = phdr_ptr->p_offset;
+ start = rounddown(paddr, PAGE_SIZE);
+ end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+ size = end - start;
+
/* Add this contiguous chunk of memory to vmcore list.*/
new = get_new_element();
if (!new)
return -ENOMEM;
- new->paddr = phdr_ptr->p_offset;
- new->size = phdr_ptr->p_memsz;
+ new->paddr = start;
+ new->size = size;
list_add_tail(&new->list, vc_list);
/* Update the program header offset */
- phdr_ptr->p_offset = vmcore_off;
- vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+ phdr_ptr->p_offset = vmcore_off + (paddr - start);
+ vmcore_off = vmcore_off + size;
}
return 0;
}
/* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets_elf64(char *elfptr,
- struct list_head *vc_list)
+static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
+ struct list_head *vc_list)
{
loff_t vmcore_off;
- Elf64_Ehdr *ehdr_ptr;
struct vmcore *m;
- ehdr_ptr = (Elf64_Ehdr *)elfptr;
-
- /* Skip Elf header and program headers. */
- vmcore_off = sizeof(Elf64_Ehdr) +
- (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr);
+ /* Skip Elf header, program headers and Elf note segment. */
+ vmcore_off = elfsz + elfnotes_sz;
list_for_each_entry(m, vc_list, list) {
m->offset = vmcore_off;
@@ -505,24 +793,12 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr,
}
}
-/* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets_elf32(char *elfptr,
- struct list_head *vc_list)
+static void free_elfcorebuf(void)
{
- loff_t vmcore_off;
- Elf32_Ehdr *ehdr_ptr;
- struct vmcore *m;
-
- ehdr_ptr = (Elf32_Ehdr *)elfptr;
-
- /* Skip Elf header and program headers. */
- vmcore_off = sizeof(Elf32_Ehdr) +
- (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr);
-
- list_for_each_entry(m, vc_list, list) {
- m->offset = vmcore_off;
- vmcore_off += m->size;
- }
+ free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
+ elfcorebuf = NULL;
+ vfree(elfnotes_buf);
+ elfnotes_buf = NULL;
}
static int __init parse_crash_elf64_headers(void)
@@ -553,31 +829,32 @@ static int __init parse_crash_elf64_headers(void)
}
/* Read in all elf headers. */
- elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr);
- elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL);
+ elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) +
+ ehdr.e_phnum * sizeof(Elf64_Phdr);
+ elfcorebuf_sz = elfcorebuf_sz_orig;
+ elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(elfcorebuf_sz_orig));
if (!elfcorebuf)
return -ENOMEM;
addr = elfcorehdr_addr;
- rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0);
- if (rc < 0) {
- kfree(elfcorebuf);
- return rc;
- }
+ rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
+ if (rc < 0)
+ goto fail;
/* Merge all PT_NOTE headers into one. */
- rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
- if (rc) {
- kfree(elfcorebuf);
- return rc;
- }
+ rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz,
+ &elfnotes_buf, &elfnotes_sz);
+ if (rc)
+ goto fail;
rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
- &vmcore_list);
- if (rc) {
- kfree(elfcorebuf);
- return rc;
- }
- set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list);
+ elfnotes_sz, &vmcore_list);
+ if (rc)
+ goto fail;
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
return 0;
+fail:
+ free_elfcorebuf();
+ return rc;
}
static int __init parse_crash_elf32_headers(void)
@@ -608,31 +885,31 @@ static int __init parse_crash_elf32_headers(void)
}
/* Read in all elf headers. */
- elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
- elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL);
+ elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
+ elfcorebuf_sz = elfcorebuf_sz_orig;
+ elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(elfcorebuf_sz_orig));
if (!elfcorebuf)
return -ENOMEM;
addr = elfcorehdr_addr;
- rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0);
- if (rc < 0) {
- kfree(elfcorebuf);
- return rc;
- }
+ rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
+ if (rc < 0)
+ goto fail;
/* Merge all PT_NOTE headers into one. */
- rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
- if (rc) {
- kfree(elfcorebuf);
- return rc;
- }
+ rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz,
+ &elfnotes_buf, &elfnotes_sz);
+ if (rc)
+ goto fail;
rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz,
- &vmcore_list);
- if (rc) {
- kfree(elfcorebuf);
- return rc;
- }
- set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list);
+ elfnotes_sz, &vmcore_list);
+ if (rc)
+ goto fail;
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
return 0;
+fail:
+ free_elfcorebuf();
+ return rc;
}
static int __init parse_crash_elf_headers(void)
@@ -654,20 +931,19 @@ static int __init parse_crash_elf_headers(void)
rc = parse_crash_elf64_headers();
if (rc)
return rc;
-
- /* Determine vmcore size. */
- vmcore_size = get_vmcore_size_elf64(elfcorebuf);
} else if (e_ident[EI_CLASS] == ELFCLASS32) {
rc = parse_crash_elf32_headers();
if (rc)
return rc;
-
- /* Determine vmcore size. */
- vmcore_size = get_vmcore_size_elf32(elfcorebuf);
} else {
pr_warn("Warning: Core image elf header is not sane\n");
return -EINVAL;
}
+
+ /* Determine vmcore size. */
+ vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+ &vmcore_list);
+
return 0;
}
@@ -698,7 +974,7 @@ void vmcore_cleanup(void)
struct list_head *pos, *next;
if (proc_vmcore) {
- remove_proc_entry(proc_vmcore->name, proc_vmcore->parent);
+ proc_remove(proc_vmcore);
proc_vmcore = NULL;
}
@@ -710,7 +986,6 @@ void vmcore_cleanup(void)
list_del(&m->list);
kfree(m);
}
- kfree(elfcorebuf);
- elfcorebuf = NULL;
+ free_elfcorebuf();
}
EXPORT_SYMBOL_GPL(vmcore_cleanup);
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 43b12807a51d..76a4eeb92982 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -44,7 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
rec.parent_ip = parent_ip;
pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
- sizeof(rec), psinfo);
+ 0, sizeof(rec), psinfo);
local_irq_restore(flags);
}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index e4bcb2cf055a..71bf5f4ae84c 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -178,6 +178,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
if (p->psi->erase)
p->psi->erase(p->type, p->id, p->count,
dentry->d_inode->i_ctime, p->psi);
+ else
+ return -EPERM;
return simple_unlink(dir, dentry);
}
@@ -324,6 +326,15 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
case PSTORE_TYPE_MCE:
sprintf(name, "mce-%s-%lld", psname, id);
break;
+ case PSTORE_TYPE_PPC_RTAS:
+ sprintf(name, "rtas-%s-%lld", psname, id);
+ break;
+ case PSTORE_TYPE_PPC_OF:
+ sprintf(name, "powerpc-ofw-%s-%lld", psname, id);
+ break;
+ case PSTORE_TYPE_PPC_COMMON:
+ sprintf(name, "powerpc-common-%s-%lld", psname, id);
+ break;
case PSTORE_TYPE_UNKNOWN:
sprintf(name, "unknown-%s-%lld", psname, id);
break;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 86d1038b5a12..422962ae9fc2 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -159,7 +159,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
break;
ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
- oopscount, hsize + len, psinfo);
+ oopscount, hsize, hsize + len, psinfo);
if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
pstore_new_entry = 1;
@@ -196,7 +196,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
spin_lock_irqsave(&psinfo->buf_lock, flags);
}
memcpy(psinfo->buf, s, c);
- psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo);
+ psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo);
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
s += c;
c = e - s;
@@ -221,9 +221,11 @@ static void pstore_register_console(void) {}
static int pstore_write_compat(enum pstore_type_id type,
enum kmsg_dump_reason reason,
u64 *id, unsigned int part, int count,
- size_t size, struct pstore_info *psi)
+ size_t hsize, size_t size,
+ struct pstore_info *psi)
{
- return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi);
+ return psi->write_buf(type, reason, id, part, psinfo->buf, hsize,
+ size, psi);
}
/*
@@ -239,17 +241,15 @@ int pstore_register(struct pstore_info *psi)
{
struct module *owner = psi->owner;
+ if (backend && strcmp(backend, psi->name))
+ return -EPERM;
+
spin_lock(&pstore_lock);
if (psinfo) {
spin_unlock(&pstore_lock);
return -EBUSY;
}
- if (backend && strcmp(backend, psi->name)) {
- spin_unlock(&pstore_lock);
- return -EINVAL;
- }
-
if (!psi->write)
psi->write = pstore_write_compat;
psinfo = psi;
@@ -274,6 +274,9 @@ int pstore_register(struct pstore_info *psi)
add_timer(&pstore_timer);
}
+ pr_info("pstore: Registered %s as persistent store backend\n",
+ psi->name);
+
return 0;
}
EXPORT_SYMBOL_GPL(pstore_register);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 288f068740f6..a6119f9469e2 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -83,7 +83,7 @@ struct ramoops_context {
size_t console_size;
size_t ftrace_size;
int dump_oops;
- int ecc_size;
+ struct persistent_ram_ecc_info ecc_info;
unsigned int max_dump_cnt;
unsigned int dump_write_cnt;
unsigned int dump_read_cnt;
@@ -136,6 +136,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
char **buf, struct pstore_info *psi)
{
ssize_t size;
+ ssize_t ecc_notice_size;
struct ramoops_context *cxt = psi->data;
struct persistent_ram_zone *prz;
@@ -156,12 +157,18 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
time->tv_nsec = 0;
size = persistent_ram_old_size(prz);
- *buf = kmalloc(size, GFP_KERNEL);
+
+ /* ECC correction notice */
+ ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
+
+ *buf = kmalloc(size + ecc_notice_size + 1, GFP_KERNEL);
if (*buf == NULL)
return -ENOMEM;
+
memcpy(*buf, persistent_ram_old(prz), size);
+ persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1);
- return size;
+ return size + ecc_notice_size;
}
static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
@@ -188,7 +195,8 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
enum kmsg_dump_reason reason,
u64 *id, unsigned int part,
- const char *buf, size_t size,
+ const char *buf,
+ size_t hsize, size_t size,
struct pstore_info *psi)
{
struct ramoops_context *cxt = psi->data;
@@ -323,7 +331,8 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
for (i = 0; i < cxt->max_dump_cnt; i++) {
size_t sz = cxt->record_size;
- cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, cxt->ecc_size);
+ cxt->przs[i] = persistent_ram_new(*paddr, sz, 0,
+ &cxt->ecc_info);
if (IS_ERR(cxt->przs[i])) {
err = PTR_ERR(cxt->przs[i]);
dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
@@ -353,7 +362,7 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
return -ENOMEM;
}
- *prz = persistent_ram_new(*paddr, sz, sig, cxt->ecc_size);
+ *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info);
if (IS_ERR(*prz)) {
int err = PTR_ERR(*prz);
@@ -391,8 +400,6 @@ static int ramoops_probe(struct platform_device *pdev)
goto fail_out;
}
- if (!is_power_of_2(pdata->mem_size))
- pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
if (!is_power_of_2(pdata->record_size))
pdata->record_size = rounddown_pow_of_two(pdata->record_size);
if (!is_power_of_2(pdata->console_size))
@@ -407,7 +414,7 @@ static int ramoops_probe(struct platform_device *pdev)
cxt->console_size = pdata->console_size;
cxt->ftrace_size = pdata->ftrace_size;
cxt->dump_oops = pdata->dump_oops;
- cxt->ecc_size = pdata->ecc_size;
+ cxt->ecc_info = pdata->ecc_info;
paddr = cxt->phys_addr;
@@ -430,6 +437,7 @@ static int ramoops_probe(struct platform_device *pdev)
pr_err("memory size too small, minimum is %zu\n",
cxt->console_size + cxt->record_size +
cxt->ftrace_size);
+ err = -EINVAL;
goto fail_cnt;
}
@@ -447,6 +455,7 @@ static int ramoops_probe(struct platform_device *pdev)
spin_lock_init(&cxt->pstore.buf_lock);
if (!cxt->pstore.buf) {
pr_err("cannot allocate pstore buffer\n");
+ err = -ENOMEM;
goto fail_clear;
}
@@ -465,9 +474,9 @@ static int ramoops_probe(struct platform_device *pdev)
record_size = pdata->record_size;
dump_oops = pdata->dump_oops;
- pr_info("attached 0x%lx@0x%llx, ecc: %d\n",
+ pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n",
cxt->size, (unsigned long long)cxt->phys_addr,
- cxt->ecc_size);
+ cxt->ecc_info.ecc_size, cxt->ecc_info.block_size);
return 0;
@@ -539,7 +548,7 @@ static void ramoops_register_dummy(void)
* For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
* (using 1 byte for ECC isn't much of use anyway).
*/
- dummy_data->ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;
+ dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;
dummy = platform_device_register_data(NULL, "ramoops", -1,
dummy_data, sizeof(struct ramoops_platform_data));
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 0306303be372..de272d426763 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -46,7 +46,7 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
}
/* increase and wrap the start pointer, returning the old value */
-static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
+static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
{
int old;
int new;
@@ -62,7 +62,7 @@ static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
}
/* increase the size counter until it hits the max size */
-static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
+static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a)
{
size_t old;
size_t new;
@@ -78,16 +78,63 @@ static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
} while (atomic_cmpxchg(&prz->buffer->size, old, new) != old);
}
+static DEFINE_RAW_SPINLOCK(buffer_lock);
+
+/* increase and wrap the start pointer under buffer_lock, returning the old value */
+static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
+{
+ int old;
+ int new;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&buffer_lock, flags); /* serialises the read-modify-write below */
+
+ old = atomic_read(&prz->buffer->start);
+ new = old + a;
+ while (unlikely(new >= prz->buffer_size)) /* >=: buffer_size itself must wrap to 0 */
+ new -= prz->buffer_size;
+ atomic_set(&prz->buffer->start, new);
+
+ raw_spin_unlock_irqrestore(&buffer_lock, flags);
+
+ return old; /* offset in effect before this call */
+}
+
+/* increase the size counter by @a under buffer_lock until it hits the max size */
+static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a)
+{
+ size_t old;
+ size_t new;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&buffer_lock, flags); /* serialises the read-modify-write below */
+
+ old = atomic_read(&prz->buffer->size);
+ if (old == prz->buffer_size) /* already at the max size, nothing to add */
+ goto exit;
+
+ new = old + a;
+ if (new > prz->buffer_size) /* clamp to the max size */
+ new = prz->buffer_size;
+ atomic_set(&prz->buffer->size, new);
+
+exit:
+ raw_spin_unlock_irqrestore(&buffer_lock, flags);
+}
+
+static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic;
+static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic;
+
static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
uint8_t *data, size_t len, uint8_t *ecc)
{
int i;
- uint16_t par[prz->ecc_size];
+ uint16_t par[prz->ecc_info.ecc_size];
/* Initialize the parity buffer */
memset(par, 0, sizeof(par));
encode_rs8(prz->rs_decoder, data, len, par, 0);
- for (i = 0; i < prz->ecc_size; i++)
+ for (i = 0; i < prz->ecc_info.ecc_size; i++)
ecc[i] = par[i];
}
@@ -95,9 +142,9 @@ static int persistent_ram_decode_rs8(struct persistent_ram_zone *prz,
void *data, size_t len, uint8_t *ecc)
{
int i;
- uint16_t par[prz->ecc_size];
+ uint16_t par[prz->ecc_info.ecc_size];
- for (i = 0; i < prz->ecc_size; i++)
+ for (i = 0; i < prz->ecc_info.ecc_size; i++)
par[i] = ecc[i];
return decode_rs8(prz->rs_decoder, data, par, len,
NULL, 0, NULL, 0, NULL);
@@ -110,15 +157,15 @@ static void notrace persistent_ram_update_ecc(struct persistent_ram_zone *prz,
uint8_t *buffer_end = buffer->data + prz->buffer_size;
uint8_t *block;
uint8_t *par;
- int ecc_block_size = prz->ecc_block_size;
- int ecc_size = prz->ecc_size;
- int size = prz->ecc_block_size;
+ int ecc_block_size = prz->ecc_info.block_size;
+ int ecc_size = prz->ecc_info.ecc_size;
+ int size = ecc_block_size;
- if (!prz->ecc_size)
+ if (!ecc_size)
return;
block = buffer->data + (start & ~(ecc_block_size - 1));
- par = prz->par_buffer + (start / ecc_block_size) * prz->ecc_size;
+ par = prz->par_buffer + (start / ecc_block_size) * ecc_size;
do {
if (block + ecc_block_size > buffer_end)
@@ -133,7 +180,7 @@ static void persistent_ram_update_header_ecc(struct persistent_ram_zone *prz)
{
struct persistent_ram_buffer *buffer = prz->buffer;
- if (!prz->ecc_size)
+ if (!prz->ecc_info.ecc_size)
return;
persistent_ram_encode_rs8(prz, (uint8_t *)buffer, sizeof(*buffer),
@@ -146,14 +193,14 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz)
uint8_t *block;
uint8_t *par;
- if (!prz->ecc_size)
+ if (!prz->ecc_info.ecc_size)
return;
block = buffer->data;
par = prz->par_buffer;
while (block < buffer->data + buffer_size(prz)) {
int numerr;
- int size = prz->ecc_block_size;
+ int size = prz->ecc_info.block_size;
if (block + size > buffer->data + prz->buffer_size)
size = buffer->data + prz->buffer_size - block;
numerr = persistent_ram_decode_rs8(prz, block, size, par);
@@ -166,44 +213,49 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz)
block);
prz->bad_blocks++;
}
- block += prz->ecc_block_size;
- par += prz->ecc_size;
+ block += prz->ecc_info.block_size;
+ par += prz->ecc_info.ecc_size;
}
}
static int persistent_ram_init_ecc(struct persistent_ram_zone *prz,
- int ecc_size)
+ struct persistent_ram_ecc_info *ecc_info)
{
int numerr;
struct persistent_ram_buffer *buffer = prz->buffer;
int ecc_blocks;
size_t ecc_total;
- int ecc_symsize = 8;
- int ecc_poly = 0x11d;
- if (!ecc_size)
+ if (!ecc_info || !ecc_info->ecc_size)
return 0;
- prz->ecc_block_size = 128;
- prz->ecc_size = ecc_size;
+ prz->ecc_info.block_size = ecc_info->block_size ?: 128;
+ prz->ecc_info.ecc_size = ecc_info->ecc_size ?: 16;
+ prz->ecc_info.symsize = ecc_info->symsize ?: 8;
+ prz->ecc_info.poly = ecc_info->poly ?: 0x11d;
- ecc_blocks = DIV_ROUND_UP(prz->buffer_size, prz->ecc_block_size);
- ecc_total = (ecc_blocks + 1) * prz->ecc_size;
+ ecc_blocks = DIV_ROUND_UP(prz->buffer_size - prz->ecc_info.ecc_size,
+ prz->ecc_info.block_size +
+ prz->ecc_info.ecc_size);
+ ecc_total = (ecc_blocks + 1) * prz->ecc_info.ecc_size;
if (ecc_total >= prz->buffer_size) {
pr_err("%s: invalid ecc_size %u (total %zu, buffer size %zu)\n",
- __func__, prz->ecc_size, ecc_total, prz->buffer_size);
+ __func__, prz->ecc_info.ecc_size,
+ ecc_total, prz->buffer_size);
return -EINVAL;
}
prz->buffer_size -= ecc_total;
prz->par_buffer = buffer->data + prz->buffer_size;
- prz->par_header = prz->par_buffer + ecc_blocks * prz->ecc_size;
+ prz->par_header = prz->par_buffer +
+ ecc_blocks * prz->ecc_info.ecc_size;
/*
* first consecutive root is 0
* primitive element to generate roots = 1
*/
- prz->rs_decoder = init_rs(ecc_symsize, ecc_poly, 0, 1, prz->ecc_size);
+ prz->rs_decoder = init_rs(prz->ecc_info.symsize, prz->ecc_info.poly,
+ 0, 1, prz->ecc_info.ecc_size);
if (prz->rs_decoder == NULL) {
pr_info("persistent_ram: init_rs failed\n");
return -EINVAL;
@@ -230,6 +282,9 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
{
ssize_t ret;
+ if (!prz->ecc_info.ecc_size)
+ return 0;
+
if (prz->corrected_bytes || prz->bad_blocks)
ret = snprintf(str, len, ""
"\n%d Corrected bytes, %d unrecoverable blocks\n",
@@ -364,6 +419,9 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size)
return NULL;
}
+ buffer_start_add = buffer_start_add_locked;
+ buffer_size_add = buffer_size_add_locked;
+
return ioremap(start, size);
}
@@ -391,11 +449,11 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
}
static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
- int ecc_size)
+ struct persistent_ram_ecc_info *ecc_info)
{
int ret;
- ret = persistent_ram_init_ecc(prz, ecc_size);
+ ret = persistent_ram_init_ecc(prz, ecc_info);
if (ret)
return ret;
@@ -444,7 +502,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
}
struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
- u32 sig, int ecc_size)
+ u32 sig, struct persistent_ram_ecc_info *ecc_info)
{
struct persistent_ram_zone *prz;
int ret = -ENOMEM;
@@ -459,7 +517,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
if (ret)
goto err;
- ret = persistent_ram_post_init(prz, sig, ecc_size);
+ ret = persistent_ram_post_init(prz, sig, ecc_info);
if (ret)
goto err;
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 28ce014b3cef..b218f965817b 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -14,9 +14,9 @@
#include <linux/buffer_head.h>
#include "qnx4.h"
-static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int qnx4_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
unsigned int offset;
struct buffer_head *bh;
struct qnx4_inode_entry *de;
@@ -26,48 +26,44 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
int size;
QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
- QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos));
+ QNX4DEBUG((KERN_INFO "pos = %ld\n", (long) ctx->pos));
- while (filp->f_pos < inode->i_size) {
- blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS );
+ while (ctx->pos < inode->i_size) {
+ blknum = qnx4_block_map(inode, ctx->pos >> QNX4_BLOCK_SIZE_BITS);
bh = sb_bread(inode->i_sb, blknum);
- if(bh==NULL) {
+ if (bh == NULL) {
printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum);
- break;
+ return 0;
}
- ix = (int)(filp->f_pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
- while (ix < QNX4_INODES_PER_BLOCK) {
+ ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
+ for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) {
offset = ix * QNX4_DIR_ENTRY_SIZE;
de = (struct qnx4_inode_entry *) (bh->b_data + offset);
- size = strlen(de->di_fname);
- if (size) {
- if ( !( de->di_status & QNX4_FILE_LINK ) && size > QNX4_SHORT_NAME_MAX )
- size = QNX4_SHORT_NAME_MAX;
- else if ( size > QNX4_NAME_MAX )
- size = QNX4_NAME_MAX;
-
- if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) {
- QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
- if ( ( de->di_status & QNX4_FILE_LINK ) == 0 )
- ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
- else {
- le = (struct qnx4_link_info*)de;
- ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
- QNX4_INODES_PER_BLOCK +
- le->dl_inode_ndx;
- }
- if (filldir(dirent, de->di_fname, size, filp->f_pos, ino, DT_UNKNOWN) < 0) {
- brelse(bh);
- goto out;
- }
- }
+ if (!de->di_fname[0])
+ continue;
+ if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
+ continue;
+ if (!(de->di_status & QNX4_FILE_LINK))
+ size = QNX4_SHORT_NAME_MAX;
+ else
+ size = QNX4_NAME_MAX;
+ size = strnlen(de->di_fname, size);
+ QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
+ if (!(de->di_status & QNX4_FILE_LINK))
+ ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
+ else {
+ le = (struct qnx4_link_info*)de;
+ ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
+ QNX4_INODES_PER_BLOCK +
+ le->dl_inode_ndx;
+ }
+ if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) {
+ brelse(bh);
+ return 0;
}
- ix++;
- filp->f_pos += QNX4_DIR_ENTRY_SIZE;
}
brelse(bh);
}
-out:
return 0;
}
@@ -75,7 +71,7 @@ const struct file_operations qnx4_dir_operations =
{
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = qnx4_readdir,
+ .iterate = qnx4_readdir,
.fsync = generic_file_fsync,
};
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index 8798d065e400..15b7d92ed60d 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -65,8 +65,8 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb,
static int qnx6_dir_longfilename(struct inode *inode,
struct qnx6_long_dir_entry *de,
- void *dirent, loff_t pos,
- unsigned de_inode, filldir_t filldir)
+ struct dir_context *ctx,
+ unsigned de_inode)
{
struct qnx6_long_filename *lf;
struct super_block *s = inode->i_sb;
@@ -104,8 +104,7 @@ static int qnx6_dir_longfilename(struct inode *inode,
QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n",
lf_size, lf->lf_fname, de_inode));
- if (filldir(dirent, lf->lf_fname, lf_size, pos, de_inode,
- DT_UNKNOWN) < 0) {
+ if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) {
qnx6_put_page(page);
return 0;
}
@@ -115,18 +114,19 @@ static int qnx6_dir_longfilename(struct inode *inode,
return 1;
}
-static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int qnx6_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct super_block *s = inode->i_sb;
struct qnx6_sb_info *sbi = QNX6_SB(s);
- loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1);
+ loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
unsigned long npages = dir_pages(inode);
unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE;
bool done = false;
- if (filp->f_pos >= inode->i_size)
+ ctx->pos = pos;
+ if (ctx->pos >= inode->i_size)
return 0;
for ( ; !done && n < npages; n++, start = 0) {
@@ -137,11 +137,11 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (IS_ERR(page)) {
printk(KERN_ERR "qnx6_readdir: read failed\n");
- filp->f_pos = (n + 1) << PAGE_CACHE_SHIFT;
+ ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
return PTR_ERR(page);
}
de = ((struct qnx6_dir_entry *)page_address(page)) + start;
- for (; i < limit; i++, de++, pos += QNX6_DIR_ENTRY_SIZE) {
+ for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) {
int size = de->de_size;
u32 no_inode = fs32_to_cpu(sbi, de->de_inode);
@@ -154,8 +154,7 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
structure / block */
if (!qnx6_dir_longfilename(inode,
(struct qnx6_long_dir_entry *)de,
- dirent, pos, no_inode,
- filldir)) {
+ ctx, no_inode)) {
done = true;
break;
}
@@ -163,9 +162,8 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s"
" inode:%u\n", size, de->de_fname,
no_inode));
- if (filldir(dirent, de->de_fname, size,
- pos, no_inode, DT_UNKNOWN)
- < 0) {
+ if (!dir_emit(ctx, de->de_fname, size,
+ no_inode, DT_UNKNOWN)) {
done = true;
break;
}
@@ -173,7 +171,6 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
qnx6_put_page(page);
}
- filp->f_pos = pos;
return 0;
}
@@ -282,7 +279,7 @@ found:
const struct file_operations qnx6_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = qnx6_readdir,
+ .iterate = qnx6_readdir,
.fsync = generic_file_fsync,
};
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3e64169ef527..fbad622841f9 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2585,7 +2585,7 @@ static int do_proc_dqstats(struct ctl_table *table, int write,
return proc_dointvec(table, write, buffer, lenp, ppos);
}
-static ctl_table fs_dqstats_table[] = {
+static struct ctl_table fs_dqstats_table[] = {
{
.procname = "lookups",
.data = &dqstats.stat[DQST_LOOKUPS],
@@ -2654,7 +2654,7 @@ static ctl_table fs_dqstats_table[] = {
{ },
};
-static ctl_table fs_table[] = {
+static struct ctl_table fs_table[] = {
{
.procname = "quota",
.mode = 0555,
@@ -2663,7 +2663,7 @@ static ctl_table fs_table[] = {
{ },
};
-static ctl_table sys_table[] = {
+static struct ctl_table sys_table[] = {
{
.procname = "fs",
.mode = 0555,
diff --git a/fs/read_write.c b/fs/read_write.c
index e6ddc8dceb96..122a3846d9e1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,6 +9,7 @@
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
+#include <linux/aio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
@@ -16,12 +17,15 @@
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
-#include "read_write.h"
#include "internal.h"
#include <asm/uaccess.h>
#include <asm/unistd.h>
+typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
+typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
+
const struct file_operations generic_ro_fops = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -37,8 +41,19 @@ static inline int unsigned_offsets(struct file *file)
return file->f_mode & FMODE_UNSIGNED_OFFSET;
}
-static loff_t lseek_execute(struct file *file, struct inode *inode,
- loff_t offset, loff_t maxsize)
+/**
+ * vfs_setpos - update the file offset for lseek
+ * @file: file structure in question
+ * @offset: file offset to seek to
+ * @maxsize: maximum file size
+ *
+ * This is a low-level filesystem helper for updating the file offset to
+ * the value specified by @offset if the given offset is valid and it is
+ * not equal to the current file offset.
+ *
+ * Return the specified offset on success and -EINVAL on invalid offset.
+ */
+loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
{
if (offset < 0 && !unsigned_offsets(file))
return -EINVAL;
@@ -51,6 +66,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode,
}
return offset;
}
+EXPORT_SYMBOL(vfs_setpos);
/**
* generic_file_llseek_size - generic llseek implementation for regular files
@@ -72,8 +88,6 @@ loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
loff_t maxsize, loff_t eof)
{
- struct inode *inode = file->f_mapping->host;
-
switch (whence) {
case SEEK_END:
offset += eof;
@@ -93,8 +107,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
* like SEEK_SET.
*/
spin_lock(&file->f_lock);
- offset = lseek_execute(file, inode, file->f_pos + offset,
- maxsize);
+ offset = vfs_setpos(file, file->f_pos + offset, maxsize);
spin_unlock(&file->f_lock);
return offset;
case SEEK_DATA:
@@ -116,7 +129,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
break;
}
- return lseek_execute(file, inode, offset, maxsize);
+ return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);
@@ -128,7 +141,7 @@ EXPORT_SYMBOL(generic_file_llseek_size);
*
* This is a generic implementation of ->llseek usable for all normal local
* filesystems. It just updates the file offset to the value specified by
- * @offset and @whence under i_mutex.
+ * @offset and @whence.
*/
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
@@ -141,6 +154,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
EXPORT_SYMBOL(generic_file_llseek);
/**
+ * fixed_size_llseek - llseek implementation for fixed-sized devices
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @whence: type of seek
+ * @size: size of the file
+ *
+ * Only SEEK_SET, SEEK_CUR and SEEK_END are handled; they are passed on to
+ * generic_file_llseek_size() with @size supplied as both the maximum
+ * offset and the end-of-file position. Any other @whence value is
+ * rejected with -EINVAL.
+ */
+loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
+{
+ switch (whence) {
+ case SEEK_SET: case SEEK_CUR: case SEEK_END:
+ return generic_file_llseek_size(file, offset, whence,
+ size, size);
+ default:
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(fixed_size_llseek);
+
+/**
* noop_llseek - No Operation Performed llseek implementation
* @file: file structure to seek on
* @offset: file offset to seek to
@@ -292,7 +325,7 @@ out_putf:
* them to something that fits in "int" so that others
* won't have to do range checks all the time.
*/
-int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
+int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
struct inode *inode;
loff_t pos;
@@ -326,16 +359,6 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
}
-static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
-{
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!kiocbIsKicked(iocb))
- schedule();
- else
- kiocbClearKicked(iocb);
- __set_current_state(TASK_RUNNING);
-}
-
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = len };
@@ -347,13 +370,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
- for (;;) {
- ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
- if (ret != -EIOCBRETRY)
- break;
- wait_on_retry_sync_kiocb(&kiocb);
- }
-
+ ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
@@ -403,13 +420,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
- for (;;) {
- ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
- if (ret != -EIOCBRETRY)
- break;
- wait_on_retry_sync_kiocb(&kiocb);
- }
-
+ ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
@@ -459,6 +470,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
ret = rw_verify_area(WRITE, file, pos, count);
if (ret >= 0) {
count = ret;
+ file_start_write(file);
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else
@@ -468,6 +480,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
add_wchar(current, ret);
}
inc_syscw(current);
+ file_end_write(file);
}
return ret;
@@ -493,7 +506,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
if (f.file) {
loff_t pos = file_pos_read(f.file);
ret = vfs_read(f.file, buf, count, &pos);
- file_pos_write(f.file, pos);
+ if (ret >= 0)
+ file_pos_write(f.file, pos);
fdput(f);
}
return ret;
@@ -508,15 +522,16 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
if (f.file) {
loff_t pos = file_pos_read(f.file);
ret = vfs_write(f.file, buf, count, &pos);
- file_pos_write(f.file, pos);
+ if (ret >= 0)
+ file_pos_write(f.file, pos);
fdput(f);
}
return ret;
}
-SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
- size_t count, loff_t pos)
+SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
+ size_t, count, loff_t, pos)
{
struct fd f;
ssize_t ret = -EBADF;
@@ -534,17 +549,9 @@ SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
return ret;
}
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
-{
- return SYSC_pread64((unsigned int) fd, (char __user *) buf,
- (size_t) count, pos);
-}
-SYSCALL_ALIAS(sys_pread64, SyS_pread64);
-#endif
-SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
- size_t count, loff_t pos)
+SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
+ size_t, count, loff_t, pos)
{
struct fd f;
ssize_t ret = -EBADF;
@@ -562,14 +569,6 @@ SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
return ret;
}
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
-{
- return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
- (size_t) count, pos);
-}
-SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
-#endif
/*
* Reduce an iovec's length in-place. Return the resulting number of segments
@@ -592,7 +591,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
}
EXPORT_SYMBOL(iov_shorten);
-ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
+static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
{
struct kiocb kiocb;
@@ -603,13 +602,7 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
- for (;;) {
- ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
- if (ret != -EIOCBRETRY)
- break;
- wait_on_retry_sync_kiocb(&kiocb);
- }
-
+ ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
if (ret == -EIOCBQUEUED)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
@@ -617,7 +610,7 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
}
/* Do it by hand, with file-ops */
-ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
+static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
{
struct iovec *vector = iov;
@@ -759,6 +752,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
} else {
fn = (io_fn_t)file->f_op->write;
fnv = file->f_op->aio_write;
+ file_start_write(file);
}
if (fnv)
@@ -767,6 +761,9 @@ static ssize_t do_readv_writev(int type, struct file *file,
else
ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+ if (type != READ)
+ file_end_write(file);
+
out:
if (iov != iovstack)
kfree(iov);
@@ -814,7 +811,8 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
if (f.file) {
loff_t pos = file_pos_read(f.file);
ret = vfs_readv(f.file, vec, vlen, &pos);
- file_pos_write(f.file, pos);
+ if (ret >= 0)
+ file_pos_write(f.file, pos);
fdput(f);
}
@@ -833,7 +831,8 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
if (f.file) {
loff_t pos = file_pos_read(f.file);
ret = vfs_writev(f.file, vec, vlen, &pos);
- file_pos_write(f.file, pos);
+ if (ret >= 0)
+ file_pos_write(f.file, pos);
fdput(f);
}
@@ -897,12 +896,210 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
return ret;
}
-ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
- loff_t max)
+#ifdef CONFIG_COMPAT
+
+/*
+ * Common body of the compat readv/writev family.
+ *
+ * @type selects the direction (READ or anything else for a write).  The
+ * user vector is an array of struct compat_iovec; it is handed to
+ * compat_rw_copy_check_uvector(), whose positive return value is used as
+ * the total byte count of the request.  The result of the I/O method (or
+ * an error code set on the way) is returned.
+ */
+static ssize_t compat_do_readv_writev(int type, struct file *file,
+ const struct compat_iovec __user *uvector,
+ unsigned long nr_segs, loff_t *pos)
+{
+ compat_ssize_t tot_len;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ ssize_t ret;
+ io_fn_t fn;
+ iov_fn_t fnv;
+
+ ret = -EINVAL;
+ if (!file->f_op)
+ goto out;
+
+ ret = -EFAULT;
+ if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
+ goto out;
+
+ ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
+ UIO_FASTIOV, iovstack, &iov);
+ /* negative or zero result: skip the I/O and go straight to cleanup */
+ if (ret <= 0)
+ goto out;
+
+ tot_len = ret;
+ ret = rw_verify_area(type, file, pos, tot_len);
+ if (ret < 0)
+ goto out;
+
+ fnv = NULL;
+ if (type == READ) {
+ fn = file->f_op->read;
+ fnv = file->f_op->aio_read;
+ } else {
+ fn = (io_fn_t)file->f_op->write;
+ fnv = file->f_op->aio_write;
+ /* paired with the file_end_write() after the I/O below */
+ file_start_write(file);
+ }
+
+ /* use the vectored aio method when there is one, else loop over fn */
+ if (fnv)
+ ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
+ pos, fnv);
+ else
+ ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+
+ if (type != READ)
+ file_end_write(file);
+
+out:
+ /* iov was redirected away from the on-stack array: release it */
+ if (iov != iovstack)
+ kfree(iov);
+ /* notify when ret > 0; for a read, ret == 0 also triggers the notify */
+ if ((ret + (type == READ)) > 0) {
+ if (type == READ)
+ fsnotify_access(file);
+ else
+ fsnotify_modify(file);
+ }
+ return ret;
+}
+
+/*
+ * compat_readv - vectored read on behalf of the compat readv/preadv calls
+ * @file: file to read from
+ * @vec: user array of struct compat_iovec
+ * @vlen: number of entries in @vec
+ * @pos: position to read at; passed through to compat_do_readv_writev()
+ *
+ * Returns the result of compat_do_readv_writev(), or -EBADF when @file is
+ * not open for reading, or -EINVAL when it has no f_op or neither an
+ * aio_read nor a read method.  The return type is signed because these
+ * negative error codes travel through it.
+ *
+ * A positive result is added to the task's read-character accounting;
+ * the read-syscall counter is bumped on every call, failed or not.
+ */
+static ssize_t compat_readv(struct file *file,
+ const struct compat_iovec __user *vec,
+ unsigned long vlen, loff_t *pos)
+{
+ ssize_t ret = -EBADF;
+
+ if (!(file->f_mode & FMODE_READ))
+ goto out;
+
+ ret = -EINVAL;
+ if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
+ goto out;
+
+ ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
+
+out:
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+ return ret;
+}
+
+/*
+ * Compat readv: reads at the file's current position.  The position is
+ * copied into a local, and written back to f_pos only when compat_readv()
+ * did not return a negative value.
+ */
+COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
+ const struct compat_iovec __user *,vec,
+ unsigned long, vlen)
+{
+ struct fd f = fdget(fd);
+ ssize_t ret;
+ loff_t pos;
+
+ if (!f.file)
+ return -EBADF;
+ pos = f.file->f_pos;
+ ret = compat_readv(f.file, vec, vlen, &pos);
+ if (ret >= 0)
+ f.file->f_pos = pos;
+ fdput(f);
+ return ret;
+}
+
+/*
+ * Compat preadv taking the offset as a single loff_t.  A negative @pos is
+ * rejected with -EINVAL before the descriptor is looked up; a file without
+ * FMODE_PREAD yields -ESPIPE.  The read works on the local @pos, and f_pos
+ * is not written here.
+ */
+COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
+ const struct compat_iovec __user *,vec,
+ unsigned long, vlen, loff_t, pos)
+{
+ struct fd f;
+ ssize_t ret;
+
+ if (pos < 0)
+ return -EINVAL;
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+ ret = -ESPIPE;
+ if (f.file->f_mode & FMODE_PREAD)
+ ret = compat_readv(f.file, vec, vlen, &pos);
+ fdput(f);
+ return ret;
+}
+
+/*
+ * Compat preadv with the offset split into two 32-bit halves: joins
+ * @pos_high and @pos_low into one loff_t and defers to
+ * compat_sys_preadv64().
+ */
+COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
+ const struct compat_iovec __user *,vec,
+ unsigned long, vlen, u32, pos_low, u32, pos_high)
+{
+ loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+ return compat_sys_preadv64(fd, vec, vlen, pos);
+}
+
+/*
+ * compat_writev - vectored write on behalf of the compat writev/pwritev calls
+ * @file: file to write to
+ * @vec: user array of struct compat_iovec
+ * @vlen: number of entries in @vec
+ * @pos: position to write at; passed through to compat_do_readv_writev()
+ *
+ * Returns the result of compat_do_readv_writev(), or -EBADF when @file is
+ * not open for writing, or -EINVAL when it has no f_op or neither an
+ * aio_write nor a write method.  The return type is signed because these
+ * negative error codes travel through it.
+ *
+ * A positive result is added to the task's write-character accounting;
+ * the write-syscall counter is bumped on every call, failed or not.
+ */
+static ssize_t compat_writev(struct file *file,
+ const struct compat_iovec __user *vec,
+ unsigned long vlen, loff_t *pos)
+{
+ ssize_t ret = -EBADF;
+
+ if (!(file->f_mode & FMODE_WRITE))
+ goto out;
+
+ ret = -EINVAL;
+ if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
+ goto out;
+
+ ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
+
+out:
+ if (ret > 0)
+ add_wchar(current, ret);
+ inc_syscw(current);
+ return ret;
+}
+
+/*
+ * Compat writev: writes at the file's current position.  The position is
+ * copied into a local, and written back to f_pos only when compat_writev()
+ * did not return a negative value.
+ */
+COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
+ const struct compat_iovec __user *, vec,
+ unsigned long, vlen)
+{
+ struct fd f = fdget(fd);
+ ssize_t ret;
+ loff_t pos;
+
+ if (!f.file)
+ return -EBADF;
+ pos = f.file->f_pos;
+ ret = compat_writev(f.file, vec, vlen, &pos);
+ if (ret >= 0)
+ f.file->f_pos = pos;
+ fdput(f);
+ return ret;
+}
+
+/*
+ * Compat pwritev taking the offset as a single loff_t.  A negative @pos is
+ * rejected with -EINVAL before the descriptor is looked up; a file without
+ * FMODE_PWRITE yields -ESPIPE.  The write works on the local @pos, and
+ * f_pos is not written here.
+ */
+COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
+ const struct compat_iovec __user *,vec,
+ unsigned long, vlen, loff_t, pos)
+{
+ struct fd f;
+ ssize_t ret;
+
+ if (pos < 0)
+ return -EINVAL;
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+ ret = -ESPIPE;
+ if (f.file->f_mode & FMODE_PWRITE)
+ ret = compat_writev(f.file, vec, vlen, &pos);
+ fdput(f);
+ return ret;
+}
+
+/*
+ * Compat pwritev with the offset split into two 32-bit halves: joins
+ * @pos_high and @pos_low into one loff_t and defers to
+ * compat_sys_pwritev64().
+ */
+COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
+ const struct compat_iovec __user *,vec,
+ unsigned long, vlen, u32, pos_low, u32, pos_high)
+{
+ loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+ return compat_sys_pwritev64(fd, vec, vlen, pos);
+}
+#endif
+
+static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
+ size_t count, loff_t max)
{
struct fd in, out;
struct inode *in_inode, *out_inode;
loff_t pos;
+ loff_t out_pos;
ssize_t retval;
int fl;
@@ -916,12 +1113,14 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
if (!(in.file->f_mode & FMODE_READ))
goto fput_in;
retval = -ESPIPE;
- if (!ppos)
- ppos = &in.file->f_pos;
- else
+ if (!ppos) {
+ pos = in.file->f_pos;
+ } else {
+ pos = *ppos;
if (!(in.file->f_mode & FMODE_PREAD))
goto fput_in;
- retval = rw_verify_area(READ, in.file, ppos, count);
+ }
+ retval = rw_verify_area(READ, in.file, &pos, count);
if (retval < 0)
goto fput_in;
count = retval;
@@ -938,7 +1137,8 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
retval = -EINVAL;
in_inode = file_inode(in.file);
out_inode = file_inode(out.file);
- retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);
+ out_pos = out.file->f_pos;
+ retval = rw_verify_area(WRITE, out.file, &out_pos, count);
if (retval < 0)
goto fput_out;
count = retval;
@@ -946,7 +1146,6 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
if (!max)
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
- pos = *ppos;
if (unlikely(pos + count > max)) {
retval = -EOVERFLOW;
if (pos >= max)
@@ -965,18 +1164,25 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
if (in.file->f_flags & O_NONBLOCK)
fl = SPLICE_F_NONBLOCK;
#endif
- retval = do_splice_direct(in.file, ppos, out.file, count, fl);
+ file_start_write(out.file);
+ retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
+ file_end_write(out.file);
if (retval > 0) {
add_rchar(current, retval);
add_wchar(current, retval);
fsnotify_access(in.file);
fsnotify_modify(out.file);
+ out.file->f_pos = out_pos;
+ if (ppos)
+ *ppos = pos;
+ else
+ in.file->f_pos = pos;
}
inc_syscr(current);
inc_syscw(current);
- if (*ppos > max)
+ if (pos > max)
retval = -EOVERFLOW;
fput_out:
@@ -1022,3 +1228,43 @@ SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, si
return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat sendfile with a compat_off_t offset.
+ *
+ * With a non-NULL @offset the starting position is fetched from user
+ * space, do_sendfile() is called with MAX_NON_LFS as its limit, and the
+ * resulting position is stored back to @offset whatever do_sendfile()
+ * returned; a failing store turns the result into -EFAULT.
+ *
+ * With a NULL @offset do_sendfile() is called with no position pointer
+ * and a limit of 0.
+ */
+COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
+ compat_off_t __user *, offset, compat_size_t, count)
+{
+ loff_t pos;
+ off_t off;
+ ssize_t ret;
+
+ if (offset) {
+ if (unlikely(get_user(off, offset)))
+ return -EFAULT;
+ pos = off;
+ ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
+ if (unlikely(put_user(pos, offset)))
+ return -EFAULT;
+ return ret;
+ }
+
+ return do_sendfile(out_fd, in_fd, NULL, count, 0);
+}
+
+/*
+ * Compat sendfile64 with a compat_loff_t offset.
+ *
+ * With a non-NULL @offset the starting position is copied in from user
+ * space, do_sendfile() is called with a limit of 0, and the resulting
+ * position is stored back to @offset whatever do_sendfile() returned; a
+ * failing store turns the result into -EFAULT.
+ *
+ * With a NULL @offset do_sendfile() is called with no position pointer
+ * and a limit of 0.
+ */
+COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
+ compat_loff_t __user *, offset, compat_size_t, count)
+{
+ loff_t pos;
+ ssize_t ret;
+
+ if (offset) {
+ if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
+ return -EFAULT;
+ ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
+ if (unlikely(put_user(pos, offset)))
+ return -EFAULT;
+ return ret;
+ }
+
+ return do_sendfile(out_fd, in_fd, NULL, count, 0);
+}
+#endif
diff --git a/fs/read_write.h b/fs/read_write.h
deleted file mode 100644
index d3e00ef67420..000000000000
--- a/fs/read_write.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * This file is only for sharing some helpers from read_write.c with compat.c.
- * Don't use anywhere else.
- */
-
-
-typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
-typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
- unsigned long, loff_t);
-
-ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
- unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn);
-ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos, io_fn_t fn);
-ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
- loff_t max);
diff --git a/fs/readdir.c b/fs/readdir.c
index fee38e04fae4..93d71e574310 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -20,11 +20,11 @@
#include <asm/uaccess.h>
-int vfs_readdir(struct file *file, filldir_t filler, void *buf)
+int iterate_dir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
int res = -ENOTDIR;
- if (!file->f_op || !file->f_op->readdir)
+ if (!file->f_op || !file->f_op->iterate)
goto out;
res = security_file_permission(file, MAY_READ);
@@ -37,15 +37,16 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf)
res = -ENOENT;
if (!IS_DEADDIR(inode)) {
- res = file->f_op->readdir(file, buf, filler);
+ ctx->pos = file->f_pos;
+ res = file->f_op->iterate(file, ctx);
+ file->f_pos = ctx->pos;
file_accessed(file);
}
mutex_unlock(&inode->i_mutex);
out:
return res;
}
-
-EXPORT_SYMBOL(vfs_readdir);
+EXPORT_SYMBOL(iterate_dir);
/*
* Traditional linux readdir() handling..
@@ -66,6 +67,7 @@ struct old_linux_dirent {
};
struct readdir_callback {
+ struct dir_context ctx;
struct old_linux_dirent __user * dirent;
int result;
};
@@ -73,7 +75,7 @@ struct readdir_callback {
static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset,
u64 ino, unsigned int d_type)
{
- struct readdir_callback * buf = (struct readdir_callback *) __buf;
+ struct readdir_callback *buf = (struct readdir_callback *) __buf;
struct old_linux_dirent __user * dirent;
unsigned long d_ino;
@@ -107,15 +109,15 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
{
int error;
struct fd f = fdget(fd);
- struct readdir_callback buf;
+ struct readdir_callback buf = {
+ .ctx.actor = fillonedir,
+ .dirent = dirent
+ };
if (!f.file)
return -EBADF;
- buf.result = 0;
- buf.dirent = dirent;
-
- error = vfs_readdir(f.file, fillonedir, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (buf.result)
error = buf.result;
@@ -137,6 +139,7 @@ struct linux_dirent {
};
struct getdents_callback {
+ struct dir_context ctx;
struct linux_dirent __user * current_dir;
struct linux_dirent __user * previous;
int count;
@@ -191,7 +194,11 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
{
struct fd f;
struct linux_dirent __user * lastdirent;
- struct getdents_callback buf;
+ struct getdents_callback buf = {
+ .ctx.actor = filldir,
+ .count = count,
+ .current_dir = dirent
+ };
int error;
if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -201,17 +208,12 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (!f.file)
return -EBADF;
- buf.current_dir = dirent;
- buf.previous = NULL;
- buf.count = count;
- buf.error = 0;
-
- error = vfs_readdir(f.file, filldir, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
lastdirent = buf.previous;
if (lastdirent) {
- if (put_user(f.file->f_pos, &lastdirent->d_off))
+ if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
error = count - buf.count;
@@ -221,6 +223,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
}
struct getdents_callback64 {
+ struct dir_context ctx;
struct linux_dirent64 __user * current_dir;
struct linux_dirent64 __user * previous;
int count;
@@ -271,7 +274,11 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
{
struct fd f;
struct linux_dirent64 __user * lastdirent;
- struct getdents_callback64 buf;
+ struct getdents_callback64 buf = {
+ .ctx.actor = filldir64,
+ .count = count,
+ .current_dir = dirent
+ };
int error;
if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -281,17 +288,12 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
if (!f.file)
return -EBADF;
- buf.current_dir = dirent;
- buf.previous = NULL;
- buf.count = count;
- buf.error = 0;
-
- error = vfs_readdir(f.file, filldir64, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
lastdirent = buf.previous;
if (lastdirent) {
- typeof(lastdirent->d_off) d_off = f.file->f_pos;
+ typeof(lastdirent->d_off) d_off = buf.ctx.pos;
if (__put_user(d_off, &lastdirent->d_off))
error = -EFAULT;
else
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 66c53b642a88..03e4ca5624d6 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -13,14 +13,14 @@
extern const struct reiserfs_key MIN_KEY;
-static int reiserfs_readdir(struct file *, void *, filldir_t);
+static int reiserfs_readdir(struct file *, struct dir_context *);
static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
int datasync);
const struct file_operations reiserfs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = reiserfs_readdir,
+ .iterate = reiserfs_readdir,
.fsync = reiserfs_dir_fsync,
.unlocked_ioctl = reiserfs_ioctl,
#ifdef CONFIG_COMPAT
@@ -50,18 +50,15 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
#define store_ih(where,what) copy_item_head (where, what)
-static inline bool is_privroot_deh(struct dentry *dir,
- struct reiserfs_de_head *deh)
+static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
{
- struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
- return (dir == dir->d_parent && privroot->d_inode &&
+ struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
+ return (privroot->d_inode &&
deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
}
-int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
- filldir_t filldir, loff_t *pos)
+int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
{
- struct inode *inode = dentry->d_inode;
struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */
INITIALIZE_PATH(path_to_entry);
struct buffer_head *bh;
@@ -81,7 +78,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
/* form key for search the next directory entry using f_pos field of
file structure */
- make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
+ make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
next_pos = cpu_key_k_offset(&pos_key);
path_to_entry.reada = PATH_READA;
@@ -126,7 +123,6 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
entry_num++, deh++) {
int d_reclen;
char *d_name;
- off_t d_off;
ino_t d_ino;
if (!de_visible(deh))
@@ -155,11 +151,10 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
}
/* Ignore the .reiserfs_priv entry */
- if (is_privroot_deh(dentry, deh))
+ if (is_privroot_deh(inode, deh))
continue;
- d_off = deh_offset(deh);
- *pos = d_off;
+ ctx->pos = deh_offset(deh);
d_ino = deh_objectid(deh);
if (d_reclen <= 32) {
local_buf = small_buf;
@@ -187,9 +182,9 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
* the write lock here for other waiters
*/
reiserfs_write_unlock(inode->i_sb);
- if (filldir
- (dirent, local_buf, d_reclen, d_off, d_ino,
- DT_UNKNOWN) < 0) {
+ if (!dir_emit
+ (ctx, local_buf, d_reclen, d_ino,
+ DT_UNKNOWN)) {
reiserfs_write_lock(inode->i_sb);
if (local_buf != small_buf) {
kfree(local_buf);
@@ -204,6 +199,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
next_pos = deh_offset(deh) + 1;
if (item_moved(&tmp_ih, &path_to_entry)) {
+ set_cpu_key_k_offset(&pos_key,
+ next_pos);
goto research;
}
} /* for */
@@ -235,7 +232,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
} /* while */
end:
- *pos = next_pos;
+ ctx->pos = next_pos;
pathrelse(&path_to_entry);
reiserfs_check_path(&path_to_entry);
out:
@@ -243,10 +240,9 @@ out:
return ret;
}
-static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = file->f_path.dentry;
- return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos);
+ return reiserfs_readdir_inode(file_inode(file), ctx);
}
/* compose directory item containing "." and ".." entries (entries are
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6165bd4784f6..dcaafcfc23b0 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -234,68 +234,9 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
return ret;
}
-/* Write @count bytes at position @ppos in a file indicated by @file
- from the buffer @buf.
-
- generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
- something simple that works. It is not for serious use by general purpose filesystems, excepting the one that it was
- written for (ext2/3). This is for several reasons:
-
- * It has no understanding of any filesystem specific optimizations.
-
- * It enters the filesystem repeatedly for each page that is written.
-
- * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key
- * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
- * to reiserfs which allows for fewer tree traversals.
-
- * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.
-
- * Asking the block allocation code for blocks one at a time is slightly less efficient.
-
- All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
- use it, but we were in a hurry to make code freeze, and so it couldn't be revised then. This new code should make
- things right finally.
-
- Future Features: providing search_by_key with hints.
-
-*/
-static ssize_t reiserfs_file_write(struct file *file, /* the file we are going to write into */
- const char __user * buf, /* pointer to user supplied data
- (in userspace) */
- size_t count, /* amount of bytes to write */
- loff_t * ppos /* pointer to position in file that we start writing at. Should be updated to
- * new current position before returning. */
- )
-{
- struct inode *inode = file_inode(file); // Inode of the file that we are writing to.
- /* To simplify coding at this time, we store
- locked pages in array for now */
- struct reiserfs_transaction_handle th;
- th.t_trans_id = 0;
-
- /* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items
- * lying around (most of the disk, in fact). Despite the filesystem
- * now being a v3.6 format, the old items still can't support large
- * file sizes. Catch this case here, as the rest of the VFS layer is
- * oblivious to the different limitations between old and new items.
- * reiserfs_setattr catches this for truncates. This chunk is lifted
- * from generic_write_checks. */
- if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
- *ppos + count > MAX_NON_LFS) {
- if (*ppos >= MAX_NON_LFS) {
- return -EFBIG;
- }
- if (count > MAX_NON_LFS - (unsigned long)*ppos)
- count = MAX_NON_LFS - (unsigned long)*ppos;
- }
-
- return do_sync_write(file, buf, count, ppos);
-}
-
const struct file_operations reiserfs_file_operations = {
.read = do_sync_read,
- .write = reiserfs_file_write,
+ .write = do_sync_write,
.unlocked_ioctl = reiserfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = reiserfs_compat_ioctl,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ea5061fd4f3e..0048cc16a6a8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -18,6 +18,7 @@
#include <linux/writeback.h>
#include <linux/quotaops.h>
#include <linux/swap.h>
+#include <linux/aio.h>
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
@@ -1810,11 +1811,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
- if (insert_inode_locked4(inode, args.objectid,
- reiserfs_find_actor, &args) < 0) {
+
+ reiserfs_write_unlock(inode->i_sb);
+ err = insert_inode_locked4(inode, args.objectid,
+ reiserfs_find_actor, &args);
+ reiserfs_write_lock(inode->i_sb);
+ if (err) {
err = -EINVAL;
goto out_bad_inode;
}
+
if (old_format_only(sb))
/* not a perfect generation count, as object ids can be reused, but
** this is as good as reiserfs can do right now.
@@ -2969,16 +2975,19 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
}
/* clm -- taken from fs/buffer.c:block_invalidate_page */
-static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
+static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
struct inode *inode = page->mapping->host;
unsigned int curr_off = 0;
+ unsigned int stop = offset + length;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int ret = 1;
BUG_ON(!PageLocked(page));
- if (offset == 0)
+ if (!partial_page)
ClearPageChecked(page);
if (!page_has_buffers(page))
@@ -2990,6 +2999,9 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ goto out;
+
/*
* is this block fully invalidated?
*/
@@ -3008,7 +3020,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
* The get_block cached value has been unconditionally invalidated,
* so real IO is not possible anymore.
*/
- if (!offset && ret) {
+ if (!partial_page && ret) {
ret = try_to_release_page(page, 0);
/* maybe should BUG_ON(!ret); - neilb */
}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index afcadcc03e8a..742fdd4c209a 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -97,7 +97,7 @@ static int flush_commit_list(struct super_block *s,
static int can_dirty(struct reiserfs_journal_cnode *cn);
static int journal_join(struct reiserfs_transaction_handle *th,
struct super_block *sb, unsigned long nblocks);
-static int release_journal_dev(struct super_block *super,
+static void release_journal_dev(struct super_block *super,
struct reiserfs_journal *journal);
static int dirty_one_transaction(struct super_block *s,
struct reiserfs_journal_list *jl);
@@ -2532,23 +2532,13 @@ static void journal_list_init(struct super_block *sb)
SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
}
-static int release_journal_dev(struct super_block *super,
+static void release_journal_dev(struct super_block *super,
struct reiserfs_journal *journal)
{
- int result;
-
- result = 0;
-
if (journal->j_dev_bd != NULL) {
- result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
+ blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
journal->j_dev_bd = NULL;
}
-
- if (result != 0) {
- reiserfs_warning(super, "sh-457",
- "Cannot release journal device: %i", result);
- }
- return result;
}
static int journal_init_dev(struct super_block *super,
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 9cc0740adffa..33532f79b4f7 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -394,20 +394,24 @@ static int set_sb(struct super_block *sb, void *data)
return -ENOENT;
}
+struct reiserfs_seq_private {
+ struct super_block *sb;
+ int (*show) (struct seq_file *, struct super_block *);
+};
+
static void *r_start(struct seq_file *m, loff_t * pos)
{
- struct proc_dir_entry *de = m->private;
- struct super_block *s = de->parent->data;
+ struct reiserfs_seq_private *priv = m->private;
loff_t l = *pos;
if (l)
return NULL;
- if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, s)))
+ if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, priv->sb)))
return NULL;
- up_write(&s->s_umount);
- return s;
+ up_write(&priv->sb->s_umount);
+ return priv->sb;
}
static void *r_next(struct seq_file *m, void *v, loff_t * pos)
@@ -426,9 +430,8 @@ static void r_stop(struct seq_file *m, void *v)
static int r_show(struct seq_file *m, void *v)
{
- struct proc_dir_entry *de = m->private;
- int (*show) (struct seq_file *, struct super_block *) = de->data;
- return show(m, v);
+ struct reiserfs_seq_private *priv = m->private;
+ return priv->show(m, v);
}
static const struct seq_operations r_ops = {
@@ -440,11 +443,15 @@ static const struct seq_operations r_ops = {
static int r_open(struct inode *inode, struct file *file)
{
- int ret = seq_open(file, &r_ops);
+ struct reiserfs_seq_private *priv;
+ int ret = seq_open_private(file, &r_ops,
+ sizeof(struct reiserfs_seq_private));
if (!ret) {
struct seq_file *m = file->private_data;
- m->private = PDE(inode);
+ priv = m->private;
+ priv->sb = proc_get_parent_data(inode);
+ priv->show = PDE_DATA(inode);
}
return ret;
}
@@ -453,7 +460,7 @@ static const struct file_operations r_file_operations = {
.open = r_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_private,
.owner = THIS_MODULE,
};
@@ -479,9 +486,8 @@ int reiserfs_proc_info_init(struct super_block *sb)
*s = '!';
spin_lock_init(&__PINFO(sb).lock);
- REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root);
+ REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb);
if (REISERFS_SB(sb)->procdir) {
- REISERFS_SB(sb)->procdir->data = sb;
add_file(sb, "version", show_version);
add_file(sb, "super", show_super);
add_file(sb, "per-level", show_per_level);
@@ -499,29 +505,17 @@ int reiserfs_proc_info_init(struct super_block *sb)
int reiserfs_proc_info_done(struct super_block *sb)
{
struct proc_dir_entry *de = REISERFS_SB(sb)->procdir;
- char b[BDEVNAME_SIZE];
- char *s;
+ if (de) {
+ char b[BDEVNAME_SIZE];
+ char *s;
- /* Some block devices use /'s */
- strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE);
- s = strchr(b, '/');
- if (s)
- *s = '!';
+ /* Some block devices use /'s */
+ strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE);
+ s = strchr(b, '/');
+ if (s)
+ *s = '!';
- if (de) {
- remove_proc_entry("journal", de);
- remove_proc_entry("oidmap", de);
- remove_proc_entry("on-disk-super", de);
- remove_proc_entry("bitmap", de);
- remove_proc_entry("per-level", de);
- remove_proc_entry("super", de);
- remove_proc_entry("version", de);
- }
- spin_lock(&__PINFO(sb).lock);
- __PINFO(sb).exiting = 1;
- spin_unlock(&__PINFO(sb).lock);
- if (proc_info_root) {
- remove_proc_entry(b, proc_info_root);
+ remove_proc_subtree(b, proc_info_root);
REISERFS_SB(sb)->procdir = NULL;
}
return 0;
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 157e474ab303..3df5ce6c724d 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2709,7 +2709,7 @@ extern const struct inode_operations reiserfs_dir_inode_operations;
extern const struct inode_operations reiserfs_symlink_inode_operations;
extern const struct inode_operations reiserfs_special_inode_operations;
extern const struct file_operations reiserfs_dir_operations;
-int reiserfs_readdir_dentry(struct dentry *, void *, filldir_t, loff_t *);
+int reiserfs_readdir_inode(struct inode *, struct dir_context *);
/* tail_conversion.c */
int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 4cce1d9552fb..c69cdd749f09 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -171,6 +171,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
* modifying extended attributes. This includes operations such as permissions
* or ownership changes, object deletions, etc. */
struct reiserfs_dentry_buf {
+ struct dir_context ctx;
struct dentry *xadir;
int count;
struct dentry *dentries[8];
@@ -223,9 +224,8 @@ static int reiserfs_for_each_xattr(struct inode *inode,
{
struct dentry *dir;
int i, err = 0;
- loff_t pos = 0;
struct reiserfs_dentry_buf buf = {
- .count = 0,
+ .ctx.actor = fill_with_dentries,
};
/* Skip out, an xattr has no xattrs associated with it */
@@ -249,29 +249,27 @@ static int reiserfs_for_each_xattr(struct inode *inode,
reiserfs_write_lock(inode->i_sb);
buf.xadir = dir;
- err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos);
- while ((err == 0 || err == -ENOSPC) && buf.count) {
- err = 0;
-
- for (i = 0; i < buf.count && buf.dentries[i]; i++) {
- int lerr = 0;
+ while (1) {
+ err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
+ if (err)
+ break;
+ if (!buf.count)
+ break;
+ for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
struct dentry *dentry = buf.dentries[i];
- if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode))
- lerr = action(dentry, data);
+ if (!S_ISDIR(dentry->d_inode->i_mode))
+ err = action(dentry, data);
dput(dentry);
buf.dentries[i] = NULL;
- err = lerr ?: err;
}
+ if (err)
+ break;
buf.count = 0;
- if (!err)
- err = reiserfs_readdir_dentry(dir, &buf,
- fill_with_dentries, &pos);
}
mutex_unlock(&dir->d_inode->i_mutex);
- /* Clean up after a failed readdir */
cleanup_dentry_buf(&buf);
if (!err) {
@@ -318,7 +316,19 @@ static int delete_one_xattr(struct dentry *dentry, void *data)
static int chown_one_xattr(struct dentry *dentry, void *data)
{
struct iattr *attrs = data;
- return reiserfs_setattr(dentry, attrs);
+ int ia_valid = attrs->ia_valid;
+ int err;
+
+ /*
+ * We only want the ownership bits. Otherwise, we'll do
+ * things like change a directory to a regular file if
+ * ATTR_MODE is set.
+ */
+ attrs->ia_valid &= (ATTR_UID|ATTR_GID);
+ err = reiserfs_setattr(dentry, attrs);
+ attrs->ia_valid = ia_valid;
+
+ return err;
}
/* No i_mutex, but the inode is unconnected. */
@@ -788,6 +798,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name)
}
struct listxattr_buf {
+ struct dir_context ctx;
size_t size;
size_t pos;
char *buf;
@@ -833,8 +844,8 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
{
struct dentry *dir;
int err = 0;
- loff_t pos = 0;
struct listxattr_buf buf = {
+ .ctx.actor = listxattr_filler,
.dentry = dentry,
.buf = buffer,
.size = buffer ? size : 0,
@@ -856,7 +867,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
}
mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
- err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos);
+ err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
mutex_unlock(&dir->d_inode->i_mutex);
if (!err)
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d7c01ef64eda..6c8767fdfc6a 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -443,6 +443,9 @@ int reiserfs_acl_chmod(struct inode *inode)
int depth;
int error;
+ if (IS_PRIVATE(inode))
+ return 0;
+
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index e1a7779dd3cb..f373bde8f545 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -49,8 +49,11 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
return (unsigned long) -EINVAL;
offset += ROMFS_I(inode)->i_dataoffset;
- if (offset > mtd->size - len)
+ if (offset >= mtd->size)
return (unsigned long) -EINVAL;
+ /* the mapping mustn't extend beyond the EOF */
+ if ((offset + len) > mtd->size)
+ len = mtd->size - offset;
ret = mtd_get_unmapped_area(mtd, len, offset, flags);
if (ret == -EOPNOTSUPP)
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 15cbc41ee365..ff1d3d42e72a 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -145,19 +145,18 @@ static const struct address_space_operations romfs_aops = {
/*
* read the entries from a directory
*/
-static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int romfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *i = file_inode(filp);
+ struct inode *i = file_inode(file);
struct romfs_inode ri;
unsigned long offset, maxoff;
int j, ino, nextfh;
- int stored = 0;
char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
int ret;
maxoff = romfs_maxsize(i->i_sb);
- offset = filp->f_pos;
+ offset = ctx->pos;
if (!offset) {
offset = i->i_ino & ROMFH_MASK;
ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -170,10 +169,10 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
for (;;) {
if (!offset || offset >= maxoff) {
offset = maxoff;
- filp->f_pos = offset;
+ ctx->pos = offset;
goto out;
}
- filp->f_pos = offset;
+ ctx->pos = offset;
/* Fetch inode info */
ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -194,16 +193,14 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
nextfh = be32_to_cpu(ri.next);
if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
ino = be32_to_cpu(ri.spec);
- if (filldir(dirent, fsname, j, offset, ino,
- romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
+ if (!dir_emit(ctx, fsname, j, ino,
+ romfs_dtype_table[nextfh & ROMFH_TYPE]))
goto out;
- stored++;
offset = nextfh & ROMFH_MASK;
}
-
out:
- return stored;
+ return 0;
}
/*
@@ -281,7 +278,7 @@ error:
static const struct file_operations romfs_dir_operations = {
.read = generic_read_dir,
- .readdir = romfs_readdir,
+ .iterate = romfs_readdir,
.llseek = default_llseek,
};
diff --git a/fs/select.c b/fs/select.c
index 8c1c96c27062..f9f49c40cfd4 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -27,6 +27,8 @@
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
#include <linux/sched/rt.h>
+#include <linux/freezer.h>
+#include <net/ll_poll.h>
#include <asm/uaccess.h>
@@ -236,7 +238,8 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
set_current_state(state);
if (!pwq->triggered)
- rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+ rc = freezable_schedule_hrtimeout_range(expires, slack,
+ HRTIMER_MODE_ABS);
__set_current_state(TASK_RUNNING);
/*
@@ -384,9 +387,10 @@ get_max:
#define POLLEX_SET (POLLPRI)
static inline void wait_key_set(poll_table *wait, unsigned long in,
- unsigned long out, unsigned long bit)
+ unsigned long out, unsigned long bit,
+ unsigned int ll_flag)
{
- wait->_key = POLLEX_SET;
+ wait->_key = POLLEX_SET | ll_flag;
if (in & bit)
wait->_key |= POLLIN_SET;
if (out & bit)
@@ -400,6 +404,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
poll_table *wait;
int retval, i, timed_out = 0;
unsigned long slack = 0;
+ unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+ unsigned long busy_end = 0;
rcu_read_lock();
retval = max_select_fd(n, fds);
@@ -422,6 +428,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
retval = 0;
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
+ bool can_busy_loop = false;
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -449,7 +456,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
f_op = f.file->f_op;
mask = DEFAULT_POLLMASK;
if (f_op && f_op->poll) {
- wait_key_set(wait, in, out, bit);
+ wait_key_set(wait, in, out,
+ bit, busy_flag);
mask = (*f_op->poll)(f.file, wait);
}
fdput(f);
@@ -468,6 +476,18 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
retval++;
wait->_qproc = NULL;
}
+ /* got something, stop busy polling */
+ if (retval) {
+ can_busy_loop = false;
+ busy_flag = 0;
+
+ /*
+ * only remember a returned
+ * POLL_BUSY_LOOP if we asked for it
+ */
+ } else if (busy_flag & mask)
+ can_busy_loop = true;
+
}
}
if (res_in)
@@ -486,6 +506,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
break;
}
+ /* only if found POLL_BUSY_LOOP sockets && not out of time */
+ if (can_busy_loop && !need_resched()) {
+ if (!busy_end) {
+ busy_end = busy_loop_end_time();
+ continue;
+ }
+ if (!busy_loop_timeout(busy_end))
+ continue;
+ }
+ busy_flag = 0;
+
/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
@@ -717,7 +748,9 @@ struct poll_list {
* pwait poll_table will be used by the fd-provided poll handler for waiting,
* if pwait->_qproc is non-NULL.
*/
-static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
+static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
+ bool *can_busy_poll,
+ unsigned int busy_flag)
{
unsigned int mask;
int fd;
@@ -731,7 +764,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
mask = DEFAULT_POLLMASK;
if (f.file->f_op && f.file->f_op->poll) {
pwait->_key = pollfd->events|POLLERR|POLLHUP;
+ pwait->_key |= busy_flag;
mask = f.file->f_op->poll(f.file, pwait);
+ if (mask & busy_flag)
+ *can_busy_poll = true;
}
/* Mask out unneeded events. */
mask &= pollfd->events | POLLERR | POLLHUP;
@@ -750,6 +786,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
unsigned long slack = 0;
+ unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+ unsigned long busy_end = 0;
/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -762,6 +800,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
for (;;) {
struct poll_list *walk;
+ bool can_busy_loop = false;
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
@@ -776,9 +815,13 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
* this. They'll get immediately deregistered
* when we break out and return.
*/
- if (do_pollfd(pfd, pt)) {
+ if (do_pollfd(pfd, pt, &can_busy_loop,
+ busy_flag)) {
count++;
pt->_qproc = NULL;
+ /* found something, stop busy polling */
+ busy_flag = 0;
+ can_busy_loop = false;
}
}
}
@@ -795,6 +838,17 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
if (count || timed_out)
break;
+ /* only if found POLL_BUSY_LOOP sockets && not out of time */
+ if (can_busy_loop && !need_resched()) {
+ if (!busy_end) {
+ busy_end = busy_loop_end_time();
+ continue;
+ }
+ if (!busy_loop_timeout(busy_end))
+ continue;
+ }
+ busy_flag = 0;
+
/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 38bb59f3f2ad..3135c2525c76 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -599,6 +599,24 @@ int single_open(struct file *file, int (*show)(struct seq_file *, void *),
}
EXPORT_SYMBOL(single_open);
+int single_open_size(struct file *file, int (*show)(struct seq_file *, void *),
+ void *data, size_t size)
+{
+ char *buf = kmalloc(size, GFP_KERNEL);
+ int ret;
+ if (!buf)
+ return -ENOMEM;
+ ret = single_open(file, show, data);
+ if (ret) {
+ kfree(buf);
+ return ret;
+ }
+ ((struct seq_file *)file->private_data)->buf = buf;
+ ((struct seq_file *)file->private_data)->size = size;
+ return 0;
+}
+EXPORT_SYMBOL(single_open_size);
+
int single_release(struct inode *inode, struct file *file)
{
const struct seq_operations *op = ((struct seq_file *)file->private_data)->op;
@@ -903,3 +921,57 @@ struct hlist_node *seq_hlist_next_rcu(void *v,
return rcu_dereference(node->next);
}
EXPORT_SYMBOL(seq_hlist_next_rcu);
+
+/**
+ * seq_hlist_start_percpu - start an iteration of a percpu hlist array
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu: pointer to cpu "cursor"
+ * @pos: start position of sequence
+ *
+ * Called at seq_file->op->start().
+ */
+struct hlist_node *
+seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos)
+{
+ struct hlist_node *node;
+
+ for_each_possible_cpu(*cpu) {
+ hlist_for_each(node, per_cpu_ptr(head, *cpu)) {
+ if (pos-- == 0)
+ return node;
+ }
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start_percpu);
+
+/**
+ * seq_hlist_next_percpu - move to the next position of the percpu hlist array
+ * @v: pointer to current hlist_node
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu: pointer to cpu "cursor"
+ * @pos: pointer to the current sequence position (incremented by this call)
+ *
+ * Called at seq_file->op->next().
+ */
+struct hlist_node *
+seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
+ int *cpu, loff_t *pos)
+{
+ struct hlist_node *node = v;
+
+ ++*pos;
+
+ if (node->next)
+ return node->next;
+
+ for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids;
+ *cpu = cpumask_next(*cpu, cpu_possible_mask)) {
+ struct hlist_head *bucket = per_cpu_ptr(head, *cpu);
+
+ if (!hlist_empty(bucket))
+ return bucket->first;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_next_percpu);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index b53486961735..424b7b65321f 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -30,6 +30,7 @@
#include <linux/signalfd.h>
#include <linux/syscalls.h>
#include <linux/proc_fs.h>
+#include <linux/compat.h>
void signalfd_cleanup(struct sighand_struct *sighand)
{
@@ -311,3 +312,33 @@ SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
{
return sys_signalfd4(ufd, user_mask, sizemask, 0);
}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd,
+ const compat_sigset_t __user *,sigmask,
+ compat_size_t, sigsetsize,
+ int, flags)
+{
+ compat_sigset_t ss32;
+ sigset_t tmp;
+ sigset_t __user *ksigmask;
+
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+ return -EFAULT;
+ sigset_from_compat(&tmp, &ss32);
+ ksigmask = compat_alloc_user_space(sizeof(sigset_t));
+ if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
+ return -EFAULT;
+
+ return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags);
+}
+
+COMPAT_SYSCALL_DEFINE3(signalfd, int, ufd,
+ const compat_sigset_t __user *,sigmask,
+ compat_size_t, sigsetsize)
+{
+ return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0);
+}
+#endif
diff --git a/fs/splice.c b/fs/splice.c
index 29e394e49ddd..3b7ee656f3aa 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -31,6 +31,7 @@
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/socket.h>
+#include <linux/compat.h>
#include "internal.h"
/*
@@ -218,7 +219,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
page_nr++;
ret += buf->len;
- if (pipe->inode)
+ if (pipe->files)
do_wakeup = 1;
if (!--spd->nr_pages)
@@ -828,7 +829,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
ops->release(pipe, buf);
pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
pipe->nrbufs--;
- if (pipe->inode)
+ if (pipe->files)
sd->need_wakeup = true;
}
@@ -1000,8 +1001,6 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
};
ssize_t ret;
- sb_start_write(inode->i_sb);
-
pipe_lock(pipe);
splice_from_pipe_begin(&sd);
@@ -1037,7 +1036,6 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
*ppos += ret;
balance_dirty_pages_ratelimited(mapping);
}
- sb_end_write(inode->i_sb);
return ret;
}
@@ -1100,17 +1098,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
{
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
loff_t *, size_t, unsigned int);
- int ret;
-
- if (unlikely(!(out->f_mode & FMODE_WRITE)))
- return -EBADF;
-
- if (unlikely(out->f_flags & O_APPEND))
- return -EINVAL;
-
- ret = rw_verify_area(WRITE, out, ppos, len);
- if (unlikely(ret < 0))
- return ret;
if (out->f_op && out->f_op->splice_write)
splice_write = out->f_op->splice_write;
@@ -1183,7 +1170,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
*/
pipe = current->splice_pipe;
if (unlikely(!pipe)) {
- pipe = alloc_pipe_info(NULL);
+ pipe = alloc_pipe_info();
if (!pipe)
return -ENOMEM;
@@ -1273,7 +1260,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
{
struct file *file = sd->u.file;
- return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
+ return do_splice_from(pipe, file, sd->opos, sd->total_len,
sd->flags);
}
@@ -1282,6 +1269,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
* @in: file to splice from
* @ppos: input file offset
* @out: file to splice to
+ * @opos: output file offset
* @len: number of bytes to splice
* @flags: splice modifier flags
*
@@ -1293,7 +1281,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
*
*/
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
- size_t len, unsigned int flags)
+ loff_t *opos, size_t len, unsigned int flags)
{
struct splice_desc sd = {
.len = len,
@@ -1301,9 +1289,20 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
.flags = flags,
.pos = *ppos,
.u.file = out,
+ .opos = opos,
};
long ret;
+ if (unlikely(!(out->f_mode & FMODE_WRITE)))
+ return -EBADF;
+
+ if (unlikely(out->f_flags & O_APPEND))
+ return -EINVAL;
+
+ ret = rw_verify_area(WRITE, out, opos, len);
+ if (unlikely(ret < 0))
+ return ret;
+
ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
if (ret > 0)
*ppos = sd.pos;
@@ -1324,7 +1323,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
- loff_t offset, *off;
+ loff_t offset;
long ret;
ipipe = get_pipe_info(in);
@@ -1355,13 +1354,27 @@ static long do_splice(struct file *in, loff_t __user *off_in,
return -EINVAL;
if (copy_from_user(&offset, off_out, sizeof(loff_t)))
return -EFAULT;
- off = &offset;
- } else
- off = &out->f_pos;
+ } else {
+ offset = out->f_pos;
+ }
+
+ if (unlikely(!(out->f_mode & FMODE_WRITE)))
+ return -EBADF;
+
+ if (unlikely(out->f_flags & O_APPEND))
+ return -EINVAL;
- ret = do_splice_from(ipipe, out, off, len, flags);
+ ret = rw_verify_area(WRITE, out, &offset, len);
+ if (unlikely(ret < 0))
+ return ret;
+
+ file_start_write(out);
+ ret = do_splice_from(ipipe, out, &offset, len, flags);
+ file_end_write(out);
- if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
+ if (!off_out)
+ out->f_pos = offset;
+ else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
ret = -EFAULT;
return ret;
@@ -1375,13 +1388,15 @@ static long do_splice(struct file *in, loff_t __user *off_in,
return -EINVAL;
if (copy_from_user(&offset, off_in, sizeof(loff_t)))
return -EFAULT;
- off = &offset;
- } else
- off = &in->f_pos;
+ } else {
+ offset = in->f_pos;
+ }
- ret = do_splice_to(in, off, opipe, len, flags);
+ ret = do_splice_to(in, &offset, opipe, len, flags);
- if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
+ if (!off_in)
+ in->f_pos = offset;
+ else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
ret = -EFAULT;
return ret;
@@ -1690,6 +1705,27 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
return error;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
+ unsigned int, nr_segs, unsigned int, flags)
+{
+ unsigned i;
+ struct iovec __user *iov;
+ if (nr_segs > UIO_MAXIOV)
+ return -EINVAL;
+ iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
+ for (i = 0; i < nr_segs; i++) {
+ struct compat_iovec v;
+ if (get_user(v.iov_base, &iov32[i].iov_base) ||
+ get_user(v.iov_len, &iov32[i].iov_len) ||
+ put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
+ put_user(v.iov_len, &iov[i].iov_len))
+ return -EFAULT;
+ }
+ return sys_vmsplice(fd, iov, nr_segs, flags);
+}
+#endif
+
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
int, fd_out, loff_t __user *, off_out,
size_t, len, unsigned int, flags)
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 57dc70ebbb19..f7f527bf8c10 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -100,7 +100,7 @@ static int get_dir_index_using_offset(struct super_block *sb,
}
-static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int squashfs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
@@ -127,11 +127,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
* It also means that the external f_pos is offset by 3 from the
* on-disk directory f_pos.
*/
- while (file->f_pos < 3) {
+ while (ctx->pos < 3) {
char *name;
int i_ino;
- if (file->f_pos == 0) {
+ if (ctx->pos == 0) {
name = ".";
size = 1;
i_ino = inode->i_ino;
@@ -141,24 +141,18 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
i_ino = squashfs_i(inode)->parent;
}
- TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n",
- dirent, name, size, file->f_pos, i_ino,
- squashfs_filetype_table[1]);
-
- if (filldir(dirent, name, size, file->f_pos, i_ino,
- squashfs_filetype_table[1]) < 0) {
- TRACE("Filldir returned less than 0\n");
+ if (!dir_emit(ctx, name, size, i_ino,
+ squashfs_filetype_table[1]))
goto finish;
- }
- file->f_pos += size;
+ ctx->pos += size;
}
length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
squashfs_i(inode)->dir_idx_start,
squashfs_i(inode)->dir_idx_offset,
squashfs_i(inode)->dir_idx_cnt,
- file->f_pos);
+ ctx->pos);
while (length < i_size_read(inode)) {
/*
@@ -198,7 +192,7 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
length += sizeof(*dire) + size;
- if (file->f_pos >= length)
+ if (ctx->pos >= length)
continue;
dire->name[size] = '\0';
@@ -206,22 +200,12 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
((short) le16_to_cpu(dire->inode_number));
type = le16_to_cpu(dire->type);
- TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)"
- "\n", dirent, dire->name, size,
- file->f_pos,
- le32_to_cpu(dirh.start_block),
- le16_to_cpu(dire->offset),
- inode_number,
- squashfs_filetype_table[type]);
-
- if (filldir(dirent, dire->name, size, file->f_pos,
+ if (!dir_emit(ctx, dire->name, size,
inode_number,
- squashfs_filetype_table[type]) < 0) {
- TRACE("Filldir returned less than 0\n");
+ squashfs_filetype_table[type]))
goto finish;
- }
- file->f_pos = length;
+ ctx->pos = length;
}
}
@@ -238,6 +222,6 @@ failed_read:
const struct file_operations squashfs_dir_ops = {
.read = generic_read_dir,
- .readdir = squashfs_readdir,
+ .iterate = squashfs_readdir,
.llseek = default_llseek,
};
diff --git a/fs/sync.c b/fs/sync.c
index 2c5d6639a66a..905f3f6b3d85 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -283,8 +283,8 @@ EXPORT_SYMBOL(generic_write_sync);
* already-instantiated disk blocks, there are no guarantees here that the data
* will be available after a crash.
*/
-SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
- unsigned int flags)
+SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
+ unsigned int, flags)
{
int ret;
struct fd f;
@@ -365,29 +365,11 @@ out_put:
out:
return ret;
}
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes,
- long flags)
-{
- return SYSC_sync_file_range((int) fd, offset, nbytes,
- (unsigned int) flags);
-}
-SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range);
-#endif
/* It would be nice if people remember that not all the world's an i386
when they introduce new system calls */
-SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags,
- loff_t offset, loff_t nbytes)
+SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
+ loff_t, offset, loff_t, nbytes)
{
return sys_sync_file_range(fd, offset, nbytes, flags);
}
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_sync_file_range2(long fd, long flags,
- loff_t offset, loff_t nbytes)
-{
- return SYSC_sync_file_range2((int) fd, (unsigned int) flags,
- offset, nbytes);
-}
-SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2);
-#endif
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e14512678c9b..e068e744dbdd 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -74,7 +74,7 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left,
}
/**
- * sysfs_link_subling - link sysfs_dirent into sibling rbtree
+ * sysfs_link_sibling - link sysfs_dirent into sibling rbtree
* @sd: sysfs_dirent of interest
*
* Link @sd into its sibling rbtree which starts from
@@ -165,21 +165,8 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
if (unlikely(!sd))
return NULL;
- while (1) {
- int v, t;
-
- v = atomic_read(&sd->s_active);
- if (unlikely(v < 0))
- return NULL;
-
- t = atomic_cmpxchg(&sd->s_active, v, v + 1);
- if (likely(t == v))
- break;
- if (t < 0)
- return NULL;
-
- cpu_relax();
- }
+ if (!atomic_inc_unless_negative(&sd->s_active))
+ return NULL;
if (likely(!ignore_lockdep(sd)))
rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
@@ -281,6 +268,10 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
*/
parent_sd = sd->s_parent;
+ WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED),
+ "sysfs: free using entry: %s/%s\n",
+ parent_sd ? parent_sd->s_name : "", sd->s_name);
+
if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
sysfs_put(sd->s_symlink.target_sd);
if (sysfs_type(sd) & SYSFS_COPY_NAME)
@@ -399,7 +390,7 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
sd->s_name = name;
sd->s_mode = mode;
- sd->s_flags = type;
+ sd->s_flags = type | SYSFS_FLAG_REMOVED;
return sd;
@@ -479,6 +470,9 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
}
+ /* Mark the entry added into directory tree */
+ sd->s_flags &= ~SYSFS_FLAG_REMOVED;
+
return 0;
}
@@ -1004,61 +998,38 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
return pos;
}
-static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int sysfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file->f_path.dentry;
struct sysfs_dirent * parent_sd = dentry->d_fsdata;
- struct sysfs_dirent *pos = filp->private_data;
+ struct sysfs_dirent *pos = file->private_data;
enum kobj_ns_type type;
const void *ns;
- ino_t ino;
type = sysfs_ns_type(parent_sd);
ns = sysfs_info(dentry->d_sb)->ns[type];
- if (filp->f_pos == 0) {
- ino = parent_sd->s_ino;
- if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
- filp->f_pos++;
- else
- return 0;
- }
- if (filp->f_pos == 1) {
- if (parent_sd->s_parent)
- ino = parent_sd->s_parent->s_ino;
- else
- ino = parent_sd->s_ino;
- if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
- filp->f_pos++;
- else
- return 0;
- }
+ if (!dir_emit_dots(file, ctx))
+ return 0;
mutex_lock(&sysfs_mutex);
- for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
+ for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
pos;
- pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
- const char * name;
- unsigned int type;
- int len, ret;
-
- name = pos->s_name;
- len = strlen(name);
- ino = pos->s_ino;
- type = dt_type(pos);
- filp->f_pos = pos->s_hash;
- filp->private_data = sysfs_get(pos);
+ pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
+ const char *name = pos->s_name;
+ unsigned int type = dt_type(pos);
+ int len = strlen(name);
+ ino_t ino = pos->s_ino;
+ ctx->pos = pos->s_hash;
+ file->private_data = sysfs_get(pos);
mutex_unlock(&sysfs_mutex);
- ret = filldir(dirent, name, len, filp->f_pos, ino, type);
+ if (!dir_emit(ctx, name, len, ino, type))
+ return 0;
mutex_lock(&sysfs_mutex);
- if (ret < 0)
- break;
}
mutex_unlock(&sysfs_mutex);
- if ((filp->f_pos > 1) && !pos) { /* EOF */
- filp->f_pos = INT_MAX;
- filp->private_data = NULL;
- }
+ file->private_data = NULL;
+ ctx->pos = INT_MAX;
return 0;
}
@@ -1076,7 +1047,7 @@ static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
const struct file_operations sysfs_dir_operations = {
.read = generic_read_dir,
- .readdir = sysfs_readdir,
+ .iterate = sysfs_readdir,
.release = sysfs_dir_release,
.llseek = sysfs_dir_llseek,
};
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 602f56db0442..d2bb7ed8fa74 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -449,10 +449,12 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
- od = sd->s_attr.open;
- if (od) {
- atomic_inc(&od->event);
- wake_up_interruptible(&od->poll);
+ if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
+ od = sd->s_attr.open;
+ if (od) {
+ atomic_inc(&od->event);
+ wake_up_interruptible(&od->poll);
+ }
}
spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 0ce3ccf7f401..3e2837a633ed 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -24,8 +24,6 @@
#include <linux/security.h>
#include "sysfs.h"
-extern struct super_block * sysfs_sb;
-
static const struct address_space_operations sysfs_aops = {
.readpage = simple_readpage,
.write_begin = simple_write_begin,
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 3799e8dac3eb..d42291d08215 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -18,12 +18,12 @@
#include <linux/swap.h>
#include "sysv.h"
-static int sysv_readdir(struct file *, void *, filldir_t);
+static int sysv_readdir(struct file *, struct dir_context *);
const struct file_operations sysv_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = sysv_readdir,
+ .iterate = sysv_readdir,
.fsync = generic_file_fsync,
};
@@ -65,18 +65,21 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
return page;
}
-static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int sysv_readdir(struct file *file, struct dir_context *ctx)
{
- unsigned long pos = filp->f_pos;
- struct inode *inode = file_inode(filp);
+ unsigned long pos = ctx->pos;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- unsigned offset = pos & ~PAGE_CACHE_MASK;
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned long npages = dir_pages(inode);
+ unsigned offset;
+ unsigned long n;
- pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
+ ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
if (pos >= inode->i_size)
- goto done;
+ return 0;
+
+ offset = pos & ~PAGE_CACHE_MASK;
+ n = pos >> PAGE_CACHE_SHIFT;
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
@@ -88,29 +91,21 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
kaddr = (char *)page_address(page);
de = (struct sysv_dir_entry *)(kaddr+offset);
limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE;
- for ( ;(char*)de <= limit; de++) {
+ for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
char *name = de->name;
- int over;
if (!de->inode)
continue;
- offset = (char *)de - kaddr;
-
- over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN),
- ((loff_t)n<<PAGE_CACHE_SHIFT) | offset,
+ if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
fs16_to_cpu(SYSV_SB(sb), de->inode),
- DT_UNKNOWN);
- if (over) {
+ DT_UNKNOWN)) {
dir_put_page(page);
- goto done;
+ return 0;
}
}
dir_put_page(page);
}
-
-done:
- filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset;
return 0;
}
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 1c0d5f264767..731b2bbcaab3 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,8 +27,7 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
return err;
}
-static int sysv_hash(const struct dentry *dentry, const struct inode *inode,
- struct qstr *qstr)
+static int sysv_hash(const struct dentry *dentry, struct qstr *qstr)
{
/* Truncate the name in place, avoids having to define a compare
function. */
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 32b644f03690..929312180dd0 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -8,6 +8,7 @@
*
*/
+#include <linux/alarmtimer.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
@@ -26,7 +27,10 @@
#include <linux/rcupdate.h>
struct timerfd_ctx {
- struct hrtimer tmr;
+ union {
+ struct hrtimer tmr;
+ struct alarm alarm;
+ } t;
ktime_t tintv;
ktime_t moffs;
wait_queue_head_t wqh;
@@ -41,14 +45,19 @@ struct timerfd_ctx {
static LIST_HEAD(cancel_list);
static DEFINE_SPINLOCK(cancel_lock);
+static inline bool isalarm(struct timerfd_ctx *ctx)
+{
+ return ctx->clockid == CLOCK_REALTIME_ALARM ||
+ ctx->clockid == CLOCK_BOOTTIME_ALARM;
+}
+
/*
* This gets called when the timer event triggers. We set the "expired"
* flag, but we do not re-arm the timer (in case it's necessary,
* tintv.tv64 != 0) until the timer is accessed.
*/
-static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+static void timerfd_triggered(struct timerfd_ctx *ctx)
{
- struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr);
unsigned long flags;
spin_lock_irqsave(&ctx->wqh.lock, flags);
@@ -56,10 +65,25 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
ctx->ticks++;
wake_up_locked(&ctx->wqh);
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+}
+static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+{
+ struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx,
+ t.tmr);
+ timerfd_triggered(ctx);
return HRTIMER_NORESTART;
}
+static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
+ ktime_t now)
+{
+ struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx,
+ t.alarm);
+ timerfd_triggered(ctx);
+ return ALARMTIMER_NORESTART;
+}
+
/*
* Called when the clock was set to cancel the timers in the cancel
* list. This will wake up processes waiting on these timers. The
@@ -107,8 +131,9 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx)
static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
{
- if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) &&
- (flags & TFD_TIMER_CANCEL_ON_SET)) {
+ if ((ctx->clockid == CLOCK_REALTIME ||
+ ctx->clockid == CLOCK_REALTIME_ALARM) &&
+ (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
if (!ctx->might_cancel) {
ctx->might_cancel = true;
spin_lock(&cancel_lock);
@@ -124,7 +149,11 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
{
ktime_t remaining;
- remaining = hrtimer_expires_remaining(&ctx->tmr);
+ if (isalarm(ctx))
+ remaining = alarm_expires_remaining(&ctx->t.alarm);
+ else
+ remaining = hrtimer_expires_remaining(&ctx->t.tmr);
+
return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
}
@@ -142,11 +171,28 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
ctx->expired = 0;
ctx->ticks = 0;
ctx->tintv = timespec_to_ktime(ktmr->it_interval);
- hrtimer_init(&ctx->tmr, clockid, htmode);
- hrtimer_set_expires(&ctx->tmr, texp);
- ctx->tmr.function = timerfd_tmrproc;
+
+ if (isalarm(ctx)) {
+ alarm_init(&ctx->t.alarm,
+ ctx->clockid == CLOCK_REALTIME_ALARM ?
+ ALARM_REALTIME : ALARM_BOOTTIME,
+ timerfd_alarmproc);
+ } else {
+ hrtimer_init(&ctx->t.tmr, clockid, htmode);
+ hrtimer_set_expires(&ctx->t.tmr, texp);
+ ctx->t.tmr.function = timerfd_tmrproc;
+ }
+
if (texp.tv64 != 0) {
- hrtimer_start(&ctx->tmr, texp, htmode);
+ if (isalarm(ctx)) {
+ if (flags & TFD_TIMER_ABSTIME)
+ alarm_start(&ctx->t.alarm, texp);
+ else
+ alarm_start_relative(&ctx->t.alarm, texp);
+ } else {
+ hrtimer_start(&ctx->t.tmr, texp, htmode);
+ }
+
if (timerfd_canceled(ctx))
return -ECANCELED;
}
@@ -158,7 +204,11 @@ static int timerfd_release(struct inode *inode, struct file *file)
struct timerfd_ctx *ctx = file->private_data;
timerfd_remove_cancel(ctx);
- hrtimer_cancel(&ctx->tmr);
+
+ if (isalarm(ctx))
+ alarm_cancel(&ctx->t.alarm);
+ else
+ hrtimer_cancel(&ctx->t.tmr);
kfree_rcu(ctx, rcu);
return 0;
}
@@ -215,9 +265,15 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
* callback to avoid DoS attacks specifying a very
* short timer period.
*/
- ticks += hrtimer_forward_now(&ctx->tmr,
- ctx->tintv) - 1;
- hrtimer_restart(&ctx->tmr);
+ if (isalarm(ctx)) {
+ ticks += alarm_forward_now(
+ &ctx->t.alarm, ctx->tintv) - 1;
+ alarm_restart(&ctx->t.alarm);
+ } else {
+ ticks += hrtimer_forward_now(&ctx->t.tmr,
+ ctx->tintv) - 1;
+ hrtimer_restart(&ctx->t.tmr);
+ }
}
ctx->expired = 0;
ctx->ticks = 0;
@@ -259,7 +315,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
if ((flags & ~TFD_CREATE_FLAGS) ||
(clockid != CLOCK_MONOTONIC &&
- clockid != CLOCK_REALTIME))
+ clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_REALTIME_ALARM &&
+ clockid != CLOCK_BOOTTIME_ALARM))
return -EINVAL;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -268,7 +326,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
init_waitqueue_head(&ctx->wqh);
ctx->clockid = clockid;
- hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
+
+ if (isalarm(ctx))
+ alarm_init(&ctx->t.alarm,
+ ctx->clockid == CLOCK_REALTIME_ALARM ?
+ ALARM_REALTIME : ALARM_BOOTTIME,
+ timerfd_alarmproc);
+ else
+ hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
+
ctx->moffs = ktime_get_monotonic_offset();
ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
@@ -305,8 +371,14 @@ static int do_timerfd_settime(int ufd, int flags,
*/
for (;;) {
spin_lock_irq(&ctx->wqh.lock);
- if (hrtimer_try_to_cancel(&ctx->tmr) >= 0)
- break;
+
+ if (isalarm(ctx)) {
+ if (alarm_try_to_cancel(&ctx->t.alarm) >= 0)
+ break;
+ } else {
+ if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0)
+ break;
+ }
spin_unlock_irq(&ctx->wqh.lock);
cpu_relax();
}
@@ -317,8 +389,12 @@ static int do_timerfd_settime(int ufd, int flags,
* We do not update "ticks" and "expired" since the timer will be
* re-programmed again in the following timerfd_setup() call.
*/
- if (ctx->expired && ctx->tintv.tv64)
- hrtimer_forward_now(&ctx->tmr, ctx->tintv);
+ if (ctx->expired && ctx->tintv.tv64) {
+ if (isalarm(ctx))
+ alarm_forward_now(&ctx->t.alarm, ctx->tintv);
+ else
+ hrtimer_forward_now(&ctx->t.tmr, ctx->tintv);
+ }
old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
old->it_interval = ktime_to_timespec(ctx->tintv);
@@ -345,9 +421,18 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t)
spin_lock_irq(&ctx->wqh.lock);
if (ctx->expired && ctx->tintv.tv64) {
ctx->expired = 0;
- ctx->ticks +=
- hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1;
- hrtimer_restart(&ctx->tmr);
+
+ if (isalarm(ctx)) {
+ ctx->ticks +=
+ alarm_forward_now(
+ &ctx->t.alarm, ctx->tintv) - 1;
+ alarm_restart(&ctx->t.alarm);
+ } else {
+ ctx->ticks +=
+ hrtimer_forward_now(&ctx->t.tmr, ctx->tintv)
+ - 1;
+ hrtimer_restart(&ctx->t.tmr);
+ }
}
t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
t->it_interval = ktime_to_timespec(ctx->tintv);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index de08c92f2e23..6b4947f75af7 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -346,38 +346,46 @@ static unsigned int vfs_dent_type(uint8_t type)
* This means that UBIFS cannot support NFS which requires full
* 'seekdir()'/'telldir()' support.
*/
-static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int ubifs_readdir(struct file *file, struct dir_context *ctx)
{
- int err, over = 0;
+ int err;
struct qstr nm;
union ubifs_key key;
struct ubifs_dent_node *dent;
struct inode *dir = file_inode(file);
struct ubifs_info *c = dir->i_sb->s_fs_info;
- dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
+ dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
- if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
+ if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2)
/*
* The directory was seek'ed to a senseless position or there
* are no more entries.
*/
return 0;
- /* File positions 0 and 1 correspond to "." and ".." */
- if (file->f_pos == 0) {
- ubifs_assert(!file->private_data);
- over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
- if (over)
- return 0;
- file->f_pos = 1;
+ if (file->f_version == 0) {
+ /*
+ * The file was seek'ed, which means that @file->private_data
+ * is now invalid. This may also be just the first
+ * 'ubifs_readdir()' invocation, in which case
+ * @file->private_data is NULL, and the below code is
+ * basically a no-op.
+ */
+ kfree(file->private_data);
+ file->private_data = NULL;
}
- if (file->f_pos == 1) {
+ /*
+ * 'generic_file_llseek()' unconditionally sets @file->f_version to
+ * zero, and we use this for detecting whether the file was seek'ed.
+ */
+ file->f_version = 1;
+
+ /* File positions 0 and 1 correspond to "." and ".." */
+ if (ctx->pos < 2) {
ubifs_assert(!file->private_data);
- over = filldir(dirent, "..", 2, 1,
- parent_ino(file->f_path.dentry), DT_DIR);
- if (over)
+ if (!dir_emit_dots(file, ctx))
return 0;
/* Find the first entry in TNC and save it */
@@ -389,7 +397,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
goto out;
}
- file->f_pos = key_hash_flash(c, &dent->key);
+ ctx->pos = key_hash_flash(c, &dent->key);
file->private_data = dent;
}
@@ -397,17 +405,16 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
if (!dent) {
/*
* The directory was seek'ed to and is now readdir'ed.
- * Find the entry corresponding to @file->f_pos or the
- * closest one.
+ * Find the entry corresponding to @ctx->pos or the closest one.
*/
- dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
+ dent_key_init_hash(c, &key, dir->i_ino, ctx->pos);
nm.name = NULL;
dent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(dent)) {
err = PTR_ERR(dent);
goto out;
}
- file->f_pos = key_hash_flash(c, &dent->key);
+ ctx->pos = key_hash_flash(c, &dent->key);
file->private_data = dent;
}
@@ -419,10 +426,9 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
ubifs_inode(dir)->creat_sqnum);
nm.len = le16_to_cpu(dent->nlen);
- over = filldir(dirent, dent->name, nm.len, file->f_pos,
+ if (!dir_emit(ctx, dent->name, nm.len,
le64_to_cpu(dent->inum),
- vfs_dent_type(dent->type));
- if (over)
+ vfs_dent_type(dent->type)))
return 0;
/* Switch to the next entry */
@@ -435,7 +441,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
}
kfree(file->private_data);
- file->f_pos = key_hash_flash(c, &dent->key);
+ ctx->pos = key_hash_flash(c, &dent->key);
file->private_data = dent;
cond_resched();
}
@@ -448,18 +454,11 @@ out:
kfree(file->private_data);
file->private_data = NULL;
- file->f_pos = 2;
+ /* 2 is a special value indicating that there are no more direntries */
+ ctx->pos = 2;
return 0;
}
-/* If a directory is seeked, we have to free saved readdir() state */
-static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
-{
- kfree(file->private_data);
- file->private_data = NULL;
- return generic_file_llseek(file, offset, whence);
-}
-
/* Free saved readdir() state when the directory is closed */
static int ubifs_dir_release(struct inode *dir, struct file *file)
{
@@ -1177,10 +1176,10 @@ const struct inode_operations ubifs_dir_inode_operations = {
};
const struct file_operations ubifs_dir_operations = {
- .llseek = ubifs_dir_llseek,
+ .llseek = generic_file_llseek,
.release = ubifs_dir_release,
.read = generic_read_dir,
- .readdir = ubifs_readdir,
+ .iterate = ubifs_readdir,
.fsync = ubifs_fsync,
.unlocked_ioctl = ubifs_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index f12189d2db1d..123c79b7261e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -50,6 +50,7 @@
*/
#include "ubifs.h"
+#include <linux/aio.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/slab.h>
@@ -1276,13 +1277,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
return err;
}
-static void ubifs_invalidatepage(struct page *page, unsigned long offset)
+static void ubifs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
ubifs_assert(PagePrivate(page));
- if (offset)
+ if (offset || length < PAGE_CACHE_SIZE)
/* Partial page remains dirty */
return;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f21acf0ef01f..879b9976c12b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1412,7 +1412,7 @@ static int mount_ubifs(struct ubifs_info *c)
ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s",
c->vi.ubi_num, c->vi.vol_id, c->vi.name,
- c->ro_mount ? ", R/O mode" : NULL);
+ c->ro_mount ? ", R/O mode" : "");
x = (long long)c->main_lebs * c->leb_size;
y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index b3e93f5e17c3..a012c51caffd 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -35,14 +35,16 @@
#include "udf_i.h"
#include "udf_sb.h"
-static int do_udf_readdir(struct inode *dir, struct file *filp,
- filldir_t filldir, void *dirent)
+
+static int udf_readdir(struct file *file, struct dir_context *ctx)
{
+ struct inode *dir = file_inode(file);
+ struct udf_inode_info *iinfo = UDF_I(dir);
struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL};
struct fileIdentDesc *fi = NULL;
struct fileIdentDesc cfi;
int block, iblock;
- loff_t nf_pos = (filp->f_pos - 1) << 2;
+ loff_t nf_pos;
int flen;
unsigned char *fname = NULL;
unsigned char *nameptr;
@@ -54,10 +56,14 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
uint32_t elen;
sector_t offset;
int i, num, ret = 0;
- unsigned int dt_type;
struct extent_position epos = { NULL, 0, {0, 0} };
- struct udf_inode_info *iinfo;
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(file, ctx))
+ return 0;
+ ctx->pos = 1;
+ }
+ nf_pos = (ctx->pos - 1) << 2;
if (nf_pos >= size)
goto out;
@@ -71,7 +77,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
nf_pos = udf_ext0_offset(dir);
fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1);
- iinfo = UDF_I(dir);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
&epos, &eloc, &elen, &offset)
@@ -116,7 +121,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
}
while (nf_pos < size) {
- filp->f_pos = (nf_pos >> 2) + 1;
+ struct kernel_lb_addr tloc;
+
+ ctx->pos = (nf_pos >> 2) + 1;
fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
&elen, &offset);
@@ -155,24 +162,22 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
}
if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) {
- iblock = parent_ino(filp->f_path.dentry);
- flen = 2;
- memcpy(fname, "..", flen);
- dt_type = DT_DIR;
- } else {
- struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
-
- iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
- flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
- dt_type = DT_UNKNOWN;
+ if (!dir_emit_dotdot(file, ctx))
+ goto out;
+ continue;
}
- if (flen && filldir(dirent, fname, flen, filp->f_pos,
- iblock, dt_type) < 0)
+ flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
+ if (!flen)
+ continue;
+
+ tloc = lelb_to_cpu(cfi.icb.extLocation);
+ iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
+ if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN))
goto out;
} /* end while */
- filp->f_pos = (nf_pos >> 2) + 1;
+ ctx->pos = (nf_pos >> 2) + 1;
out:
if (fibh.sbh != fibh.ebh)
@@ -184,27 +189,11 @@ out:
return ret;
}
-static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
- struct inode *dir = file_inode(filp);
- int result;
-
- if (filp->f_pos == 0) {
- if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
- return 0;
- }
- filp->f_pos++;
- }
-
- result = do_udf_readdir(dir, filp, filldir, dirent);
- return result;
-}
-
/* readdir and lookup functions */
const struct file_operations udf_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = udf_readdir,
+ .iterate = udf_readdir,
.unlocked_ioctl = udf_ioctl,
.fsync = generic_file_fsync,
};
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 7a12e48ad819..b6d15d349810 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -38,6 +38,7 @@
#include <linux/slab.h>
#include <linux/crc-itu-t.h>
#include <linux/mpage.h>
+#include <linux/aio.h>
#include "udf_i.h"
#include "udf_sb.h"
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 102c072c6bbf..5f6fc17d6bc5 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -594,6 +594,29 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
return 0;
}
+static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+ struct udf_inode_info *iinfo;
+ int err;
+
+ inode = udf_new_inode(dir, mode, &err);
+ if (!inode)
+ return err;
+
+ iinfo = UDF_I(inode);
+ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+ inode->i_data.a_ops = &udf_adinicb_aops;
+ else
+ inode->i_data.a_ops = &udf_aops;
+ inode->i_op = &udf_file_inode_operations;
+ inode->i_fop = &udf_file_operations;
+ mark_inode_dirty(inode);
+
+ d_tmpfile(dentry, inode);
+ return 0;
+}
+
static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
@@ -1311,6 +1334,7 @@ const struct inode_operations udf_dir_inode_operations = {
.rmdir = udf_rmdir,
.mknod = udf_mknod,
.rename = udf_rename,
+ .tmpfile = udf_tmpfile,
};
const struct inode_operations udf_symlink_inode_operations = {
.readlink = generic_readlink,
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 3a75ca09c506..0ecc2cebed8f 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -430,16 +430,16 @@ ufs_validate_entry(struct super_block *sb, char *base,
* This is blatantly stolen from ext2fs
*/
static int
-ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+ufs_readdir(struct file *file, struct dir_context *ctx)
{
- loff_t pos = filp->f_pos;
- struct inode *inode = file_inode(filp);
+ loff_t pos = ctx->pos;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned long npages = ufs_dir_pages(inode);
unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
- int need_revalidate = filp->f_version != inode->i_version;
+ int need_revalidate = file->f_version != inode->i_version;
unsigned flags = UFS_SB(sb)->s_flags;
UFSD("BEGIN\n");
@@ -457,16 +457,16 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
ufs_error(sb, __func__,
"bad page in #%lu",
inode->i_ino);
- filp->f_pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_CACHE_SIZE - offset;
return -EIO;
}
kaddr = page_address(page);
if (unlikely(need_revalidate)) {
if (offset) {
offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
- filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
}
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
need_revalidate = 0;
}
de = (struct ufs_dir_entry *)(kaddr+offset);
@@ -479,11 +479,8 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
return -EIO;
}
if (de->d_ino) {
- int over;
unsigned char d_type = DT_UNKNOWN;
- offset = (char *)de - kaddr;
-
UFSD("filldir(%s,%u)\n", de->d_name,
fs32_to_cpu(sb, de->d_ino));
UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
@@ -491,16 +488,15 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
d_type = de->d_u.d_44.d_type;
- over = filldir(dirent, de->d_name,
+ if (!dir_emit(ctx, de->d_name,
ufs_get_de_namlen(sb, de),
- (n<<PAGE_CACHE_SHIFT) | offset,
- fs32_to_cpu(sb, de->d_ino), d_type);
- if (over) {
+ fs32_to_cpu(sb, de->d_ino),
+ d_type)) {
ufs_put_page(page);
return 0;
}
}
- filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
+ ctx->pos += fs16_to_cpu(sb, de->d_reclen);
}
ufs_put_page(page);
}
@@ -660,7 +656,7 @@ not_empty:
const struct file_operations ufs_dir_operations = {
.read = generic_read_dir,
- .readdir = ufs_readdir,
+ .iterate = ufs_readdir,
.fsync = generic_file_fsync,
.llseek = generic_file_llseek,
};
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 95425b59ce0a..b6c2f94e041e 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -26,8 +26,7 @@ struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
count = size >> uspi->s_fshift;
if (count > UFS_MAXFRAG)
return NULL;
- ubh = (struct ufs_buffer_head *)
- kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS);
+ ubh = kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS);
if (!ubh)
return NULL;
ubh->fragment = fragment;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index cc33aaf219f1..399e8cec6e60 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -69,6 +69,19 @@ config XFS_RT
If unsure, say N.
+config XFS_WARN
+ bool "XFS Verbose Warnings"
+ depends on XFS_FS && !XFS_DEBUG
+ help
+ Say Y here to get an XFS build with many additional warnings.
+ It converts ASSERT checks to WARN, so will log any out-of-bounds
+ conditions that occur that would otherwise be missed. It is much
+ lighter weight than XFS_DEBUG and does not modify algorithms and will
+ not cause the kernel to panic on non-fatal errors.
+
+ However, similar to XFS_DEBUG, it is only advisable to use this if you
+ are debugging a particular problem.
+
config XFS_DEBUG
bool "XFS Debugging support"
depends on XFS_FS
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d02201df855b..4a4508023a3c 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -45,11 +45,11 @@ xfs-y += xfs_aops.o \
xfs_itable.o \
xfs_message.o \
xfs_mru_cache.o \
- xfs_super.o \
- xfs_xattr.o \
xfs_rename.o \
+ xfs_super.o \
xfs_utils.o \
xfs_vnodeops.o \
+ xfs_xattr.o \
kmem.o \
uuid.o
@@ -58,6 +58,7 @@ xfs-y += xfs_alloc.o \
xfs_alloc_btree.o \
xfs_attr.o \
xfs_attr_leaf.o \
+ xfs_attr_remote.o \
xfs_bmap.o \
xfs_bmap_btree.o \
xfs_btree.o \
@@ -70,9 +71,11 @@ xfs-y += xfs_alloc.o \
xfs_dir2_sf.o \
xfs_ialloc.o \
xfs_ialloc_btree.o \
+ xfs_icreate_item.o \
xfs_inode.o \
xfs_log_recover.o \
xfs_mount.o \
+ xfs_symlink.o \
xfs_trans.o
# low-level transaction/log code
diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h
index ff6a19873e5c..e3c92d19e540 100644
--- a/fs/xfs/mrlock.h
+++ b/fs/xfs/mrlock.h
@@ -22,12 +22,12 @@
typedef struct {
struct rw_semaphore mr_lock;
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
int mr_writer;
#endif
} mrlock_t;
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
#define mrinit(mrp, name) \
do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
#else
@@ -46,7 +46,7 @@ static inline void mraccess_nested(mrlock_t *mrp, int subclass)
static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
{
down_write_nested(&mrp->mr_lock, subclass);
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
mrp->mr_writer = 1;
#endif
}
@@ -60,7 +60,7 @@ static inline int mrtryupdate(mrlock_t *mrp)
{
if (!down_write_trylock(&mrp->mr_lock))
return 0;
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
mrp->mr_writer = 1;
#endif
return 1;
@@ -68,7 +68,7 @@ static inline int mrtryupdate(mrlock_t *mrp)
static inline void mrunlock_excl(mrlock_t *mrp)
{
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
mrp->mr_writer = 0;
#endif
up_write(&mrp->mr_lock);
@@ -81,7 +81,7 @@ static inline void mrunlock_shared(mrlock_t *mrp)
static inline void mrdemote(mrlock_t *mrp)
{
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
mrp->mr_writer = 0;
#endif
downgrade_write(&mrp->mr_lock);
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index d8b11b7f94aa..a742c47f7d5a 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -24,6 +24,11 @@
#define XFS_BUF_LOCK_TRACKING 1
#endif
+#ifdef CONFIG_XFS_WARN
+#define XFS_WARN 1
+#endif
+
+
#include "xfs_linux.h"
#endif /* __XFS_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 1d32f1d52763..306d883d89bc 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -21,6 +21,8 @@
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_vnodeops.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
#include "xfs_trace.h"
#include <linux/slab.h>
#include <linux/xattr.h>
@@ -34,7 +36,9 @@
*/
STATIC struct posix_acl *
-xfs_acl_from_disk(struct xfs_acl *aclp)
+xfs_acl_from_disk(
+ struct xfs_acl *aclp,
+ int max_entries)
{
struct posix_acl_entry *acl_e;
struct posix_acl *acl;
@@ -42,7 +46,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
unsigned int count, i;
count = be32_to_cpu(aclp->acl_cnt);
- if (count > XFS_ACL_MAX_ENTRIES)
+ if (count > max_entries)
return ERR_PTR(-EFSCORRUPTED);
acl = posix_acl_alloc(count, GFP_KERNEL);
@@ -108,9 +112,9 @@ xfs_get_acl(struct inode *inode, int type)
struct xfs_inode *ip = XFS_I(inode);
struct posix_acl *acl;
struct xfs_acl *xfs_acl;
- int len = sizeof(struct xfs_acl);
unsigned char *ea_name;
int error;
+ int len;
acl = get_cached_acl(inode, type);
if (acl != ACL_NOT_CACHED)
@@ -133,8 +137,8 @@ xfs_get_acl(struct inode *inode, int type)
* If we have a cached ACLs value just return it, not need to
* go out to the disk.
*/
-
- xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+ len = XFS_ACL_MAX_SIZE(ip->i_mount);
+ xfs_acl = kzalloc(len, GFP_KERNEL);
if (!xfs_acl)
return ERR_PTR(-ENOMEM);
@@ -153,7 +157,7 @@ xfs_get_acl(struct inode *inode, int type)
goto out;
}
- acl = xfs_acl_from_disk(xfs_acl);
+ acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
if (IS_ERR(acl))
goto out;
@@ -189,16 +193,17 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
if (acl) {
struct xfs_acl *xfs_acl;
- int len;
+ int len = XFS_ACL_MAX_SIZE(ip->i_mount);
- xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+ xfs_acl = kzalloc(len, GFP_KERNEL);
if (!xfs_acl)
return -ENOMEM;
xfs_acl_to_disk(xfs_acl, acl);
- len = sizeof(struct xfs_acl) -
- (sizeof(struct xfs_acl_entry) *
- (XFS_ACL_MAX_ENTRIES - acl->a_count));
+
+ /* subtract away the unused acl entries */
+ len -= sizeof(struct xfs_acl_entry) *
+ (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
len, ATTR_ROOT);
@@ -243,7 +248,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)
static int
xfs_acl_exists(struct inode *inode, unsigned char *name)
{
- int len = sizeof(struct xfs_acl);
+ int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
ATTR_ROOT|ATTR_KERNOVAL) == 0);
@@ -379,7 +384,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
goto out_release;
error = -EINVAL;
- if (acl->a_count > XFS_ACL_MAX_ENTRIES)
+ if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
goto out_release;
if (type == ACL_TYPE_ACCESS) {
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 39632d941354..4016a567b83c 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -22,19 +22,36 @@ struct inode;
struct posix_acl;
struct xfs_inode;
-#define XFS_ACL_MAX_ENTRIES 25
#define XFS_ACL_NOT_PRESENT (-1)
/* On-disk XFS access control list structure */
+struct xfs_acl_entry {
+ __be32 ae_tag;
+ __be32 ae_id;
+ __be16 ae_perm;
+ __be16 ae_pad; /* fill the implicit hole in the structure */
+};
+
struct xfs_acl {
- __be32 acl_cnt;
- struct xfs_acl_entry {
- __be32 ae_tag;
- __be32 ae_id;
- __be16 ae_perm;
- } acl_entry[XFS_ACL_MAX_ENTRIES];
+ __be32 acl_cnt;
+ struct xfs_acl_entry acl_entry[0];
};
+/*
+ * The number of ACL entries allowed is defined by the on-disk format.
+ * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
+ * limited only by the maximum size of the xattr that stores the information.
+ */
+#define XFS_ACL_MAX_ENTRIES(mp) \
+ (xfs_sb_version_hascrc(&mp->m_sb) \
+ ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+ sizeof(struct xfs_acl_entry) \
+ : 25)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+ (sizeof(struct xfs_acl) + \
+ sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
+
/* On-disk XFS extended attribute names */
#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f2aeedb6a579..317aa86d96ea 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -30,6 +30,7 @@ struct xfs_trans;
#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
+#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */
#define XFS_AGF_VERSION 1
#define XFS_AGI_VERSION 1
@@ -63,12 +64,29 @@ typedef struct xfs_agf {
__be32 agf_spare0; /* spare field */
__be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
__be32 agf_spare1; /* spare field */
+
__be32 agf_flfirst; /* first freelist block's index */
__be32 agf_fllast; /* last freelist block's index */
__be32 agf_flcount; /* count of blocks in freelist */
__be32 agf_freeblks; /* total free blocks */
+
__be32 agf_longest; /* longest free space */
__be32 agf_btreeblks; /* # of blocks held in AGF btrees */
+ uuid_t agf_uuid; /* uuid of filesystem */
+
+ /*
+ * reserve some contiguous space for future logged fields before we add
+ * the unlogged fields. This makes the range logging via flags and
+ * structure offsets much simpler.
+ */
+ __be64 agf_spare64[16];
+
+ /* unlogged fields, written during buffer writeback. */
+ __be64 agf_lsn; /* last write sequence */
+ __be32 agf_crc; /* crc of agf sector */
+ __be32 agf_spare2;
+
+ /* structure must be padded to 64 bit alignment */
} xfs_agf_t;
#define XFS_AGF_MAGICNUM 0x00000001
@@ -83,7 +101,8 @@ typedef struct xfs_agf {
#define XFS_AGF_FREEBLKS 0x00000200
#define XFS_AGF_LONGEST 0x00000400
#define XFS_AGF_BTREEBLKS 0x00000800
-#define XFS_AGF_NUM_BITS 12
+#define XFS_AGF_UUID 0x00001000
+#define XFS_AGF_NUM_BITS 13
#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
#define XFS_AGF_FLAGS \
@@ -98,7 +117,8 @@ typedef struct xfs_agf {
{ XFS_AGF_FLCOUNT, "FLCOUNT" }, \
{ XFS_AGF_FREEBLKS, "FREEBLKS" }, \
{ XFS_AGF_LONGEST, "LONGEST" }, \
- { XFS_AGF_BTREEBLKS, "BTREEBLKS" }
+ { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
+ { XFS_AGF_UUID, "UUID" }
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
@@ -132,6 +152,7 @@ typedef struct xfs_agi {
__be32 agi_root; /* root of inode btree */
__be32 agi_level; /* levels in inode btree */
__be32 agi_freecount; /* number of free inodes */
+
__be32 agi_newino; /* new inode just allocated */
__be32 agi_dirino; /* last directory inode chunk */
/*
@@ -139,6 +160,13 @@ typedef struct xfs_agi {
* still being referenced.
*/
__be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+
+ uuid_t agi_uuid; /* uuid of filesystem */
+ __be32 agi_crc; /* crc of agi sector */
+ __be32 agi_pad32;
+ __be64 agi_lsn; /* last write sequence */
+
+ /* structure must be padded to 64 bit alignment */
} xfs_agi_t;
#define XFS_AGI_MAGICNUM 0x00000001
@@ -171,11 +199,31 @@ extern const struct xfs_buf_ops xfs_agi_buf_ops;
*/
#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
-#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t))
#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
+/*
+ * Pointer to the first free list slot in an AGFL buffer. On CRC-enabled
+ * filesystems the slots are the agfl_bno[] array inside struct xfs_agfl;
+ * otherwise the start of the buffer is treated as the array of __be32 slots.
+ */
+#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
+ (__be32 *)(bp)->b_addr)
+
+/*
+ * Size of the AGFL. For CRC-enabled filesystems we steal a couple of
+ * slots in the beginning of the block for a proper header with the
+ * location information and CRC.
+ */
+#define XFS_AGFL_SIZE(mp) \
+ (((mp)->m_sb.sb_sectsize - \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ sizeof(struct xfs_agfl) : 0)) / \
+ sizeof(xfs_agblock_t))
+
typedef struct xfs_agfl {
- __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */
+ __be32 agfl_magicnum;
+ __be32 agfl_seqno;
+ uuid_t agfl_uuid;
+ __be64 agfl_lsn;
+ __be32 agfl_crc;
+ __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
} xfs_agfl_t;
/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 0ad23253e8b1..71596e57283a 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -33,7 +33,9 @@
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_error.h"
+#include "xfs_cksum.h"
#include "xfs_trace.h"
+#include "xfs_buf_item.h"
struct workqueue_struct *xfs_alloc_wq;
@@ -173,6 +175,7 @@ xfs_alloc_compute_diff(
xfs_agblock_t wantbno, /* target starting block */
xfs_extlen_t wantlen, /* target length */
xfs_extlen_t alignment, /* target alignment */
+ char userdata, /* are we allocating data? */
xfs_agblock_t freebno, /* freespace's starting block */
xfs_extlen_t freelen, /* freespace's length */
xfs_agblock_t *newbnop) /* result: best start block from free */
@@ -187,7 +190,14 @@ xfs_alloc_compute_diff(
ASSERT(freelen >= wantlen);
freeend = freebno + freelen;
wantend = wantbno + wantlen;
- if (freebno >= wantbno) {
+ /*
+ * We want to allocate from the start of a free extent if it is past
+ * the desired block or if we are allocating user data and the free
+ * extent is before desired block. The second case is there to allow
+ * for contiguous allocation from the remaining free space if the file
+ * grows in the short term.
+ */
+ if (freebno >= wantbno || (userdata && freeend < wantend)) {
if ((newbno1 = roundup(freebno, alignment)) >= freeend)
newbno1 = NULLAGBLOCK;
} else if (freeend >= wantend && alignment > 1) {
@@ -430,53 +440,84 @@ xfs_alloc_fixup_trees(
return 0;
}
-static void
+static bool
xfs_agfl_verify(
struct xfs_buf *bp)
{
-#ifdef WHEN_CRCS_COME_ALONG
- /*
- * we cannot actually do any verification of the AGFL because mkfs does
- * not initialise the AGFL to zero or NULL. Hence the only valid part of
- * the AGFL is what the AGF says is active. We can't get to the AGF, so
- * we can't verify just those entries are valid.
- *
- * This problem goes away when the CRC format change comes along as that
- * requires the AGFL to be initialised by mkfs. At that point, we can
- * verify the blocks in the agfl -active or not- lie within the bounds
- * of the AG. Until then, just leave this check ifdef'd out.
- */
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
- int agfl_ok = 1;
-
int i;
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+ return false;
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+ return false;
+
for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
- if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK ||
+ if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
- agfl_ok = 0;
+ return false;
}
+ return true;
+}
+
+static void
+xfs_agfl_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ int agfl_ok = 1;
+
+ /*
+ * There is no verification of non-crc AGFLs because mkfs does not
+ * initialise the AGFL to zero or NULL. Hence the only valid part of the
+ * AGFL is what the AGF says is active. We can't get to the AGF, so we
+ * can't verify just those entries are valid.
+ */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ agfl_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ offsetof(struct xfs_agfl, agfl_crc));
+
+ agfl_ok = agfl_ok && xfs_agfl_verify(bp);
if (!agfl_ok) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
}
-#endif
}
static void
xfs_agfl_write_verify(
struct xfs_buf *bp)
{
- xfs_agfl_verify(bp);
-}
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
-static void
-xfs_agfl_read_verify(
- struct xfs_buf *bp)
-{
- xfs_agfl_verify(bp);
+ /* no verification of non-crc AGFLs */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (!xfs_agfl_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
+
+ if (bip)
+ XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+ offsetof(struct xfs_agfl, agfl_crc));
}
const struct xfs_buf_ops xfs_agfl_buf_ops = {
@@ -772,7 +813,8 @@ xfs_alloc_find_best_extent(
xfs_alloc_fix_len(args);
sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, *sbnoa,
+ args->alignment,
+ args->userdata, *sbnoa,
*slena, &new);
/*
@@ -842,7 +884,7 @@ xfs_alloc_ag_vextent_near(
*/
int dofirst; /* set to do first algorithm */
- dofirst = random32() & 1;
+ dofirst = prandom_u32() & 1;
#endif
restart:
@@ -943,7 +985,8 @@ restart:
if (args->len < blen)
continue;
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, ltbnoa, ltlena, &ltnew);
+ args->alignment, args->userdata, ltbnoa,
+ ltlena, &ltnew);
if (ltnew != NULLAGBLOCK &&
(args->len > blen || ltdiff < bdiff)) {
bdiff = ltdiff;
@@ -1095,7 +1138,8 @@ restart:
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, ltbnoa, ltlena, &ltnew);
+ args->alignment, args->userdata, ltbnoa,
+ ltlena, &ltnew);
error = xfs_alloc_find_best_extent(args,
&bno_cur_lt, &bno_cur_gt,
@@ -1111,7 +1155,8 @@ restart:
args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
xfs_alloc_fix_len(args);
gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, gtbnoa, gtlena, &gtnew);
+ args->alignment, args->userdata, gtbnoa,
+ gtlena, &gtnew);
error = xfs_alloc_find_best_extent(args,
&bno_cur_gt, &bno_cur_lt,
@@ -1170,7 +1215,7 @@ restart:
}
rlen = args->len;
(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
- ltbnoa, ltlena, &ltnew);
+ args->userdata, ltbnoa, ltlena, &ltnew);
ASSERT(ltnew >= ltbno);
ASSERT(ltnew + rlen <= ltbnoa + ltlena);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
@@ -1982,18 +2027,18 @@ xfs_alloc_get_freelist(
int btreeblk) /* destination is a AGF btree */
{
xfs_agf_t *agf; /* a.g. freespace structure */
- xfs_agfl_t *agfl; /* a.g. freelist structure */
xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
xfs_agblock_t bno; /* block number returned */
+ __be32 *agfl_bno;
int error;
int logflags;
- xfs_mount_t *mp; /* mount structure */
+ xfs_mount_t *mp = tp->t_mountp;
xfs_perag_t *pag; /* per allocation group data */
- agf = XFS_BUF_TO_AGF(agbp);
/*
* Freelist is empty, give up.
*/
+ agf = XFS_BUF_TO_AGF(agbp);
if (!agf->agf_flcount) {
*bnop = NULLAGBLOCK;
return 0;
@@ -2001,15 +2046,17 @@ xfs_alloc_get_freelist(
/*
* Read the array of free blocks.
*/
- mp = tp->t_mountp;
- if ((error = xfs_alloc_read_agfl(mp, tp,
- be32_to_cpu(agf->agf_seqno), &agflbp)))
+ error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
+ &agflbp);
+ if (error)
return error;
- agfl = XFS_BUF_TO_AGFL(agflbp);
+
+
/*
* Get the block number and update the data structures.
*/
- bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
be32_add_cpu(&agf->agf_flfirst, 1);
xfs_trans_brelse(tp, agflbp);
if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
@@ -2058,11 +2105,14 @@ xfs_alloc_log_agf(
offsetof(xfs_agf_t, agf_freeblks),
offsetof(xfs_agf_t, agf_longest),
offsetof(xfs_agf_t, agf_btreeblks),
+ offsetof(xfs_agf_t, agf_uuid),
sizeof(xfs_agf_t)
};
trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
+
xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
}
@@ -2099,12 +2149,13 @@ xfs_alloc_put_freelist(
int btreeblk) /* block came from a AGF btree */
{
xfs_agf_t *agf; /* a.g. freespace structure */
- xfs_agfl_t *agfl; /* a.g. free block array */
__be32 *blockp;/* pointer to array entry */
int error;
int logflags;
xfs_mount_t *mp; /* mount structure */
xfs_perag_t *pag; /* per allocation group data */
+ __be32 *agfl_bno;
+ int startoff;
agf = XFS_BUF_TO_AGF(agbp);
mp = tp->t_mountp;
@@ -2112,7 +2163,6 @@ xfs_alloc_put_freelist(
if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
be32_to_cpu(agf->agf_seqno), &agflbp)))
return error;
- agfl = XFS_BUF_TO_AGFL(agflbp);
be32_add_cpu(&agf->agf_fllast, 1);
if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
agf->agf_fllast = 0;
@@ -2133,32 +2183,38 @@ xfs_alloc_put_freelist(
xfs_alloc_log_agf(tp, agbp, logflags);
ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
- blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
+
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
*blockp = cpu_to_be32(bno);
+ startoff = (char *)blockp - (char *)agflbp->b_addr;
+
xfs_alloc_log_agf(tp, agbp, logflags);
- xfs_trans_log_buf(tp, agflbp,
- (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
- (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
- sizeof(xfs_agblock_t) - 1));
+
+ xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
+ xfs_trans_log_buf(tp, agflbp, startoff,
+ startoff + sizeof(xfs_agblock_t) - 1);
return 0;
}
-static void
+static bool
xfs_agf_verify(
+ struct xfs_mount *mp,
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_agf *agf;
- int agf_ok;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
- agf = XFS_BUF_TO_AGF(bp);
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+ return false;
- agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
- XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
- be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
- be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
+ if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+ XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+ be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
+ return false;
/*
* during growfs operations, the perag is not fully initialised,
@@ -2166,33 +2222,58 @@ xfs_agf_verify(
* use it by using uncached buffers that don't have the perag attached
* so we can detect and avoid this problem.
*/
- if (bp->b_pag)
- agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) ==
- bp->b_pag->pag_agno;
+ if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
+ return false;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
- agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
- be32_to_cpu(agf->agf_length);
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+ be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
+ return false;
+
+ return true;;
- if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
- XFS_RANDOM_ALLOC_READ_AGF))) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
}
static void
xfs_agf_read_verify(
struct xfs_buf *bp)
{
- xfs_agf_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ int agf_ok = 1;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ agf_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ offsetof(struct xfs_agf, agf_crc));
+
+ agf_ok = agf_ok && xfs_agf_verify(mp, bp);
+
+ if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
+ XFS_RANDOM_ALLOC_READ_AGF))) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
}
static void
xfs_agf_write_verify(
struct xfs_buf *bp)
{
- xfs_agf_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!xfs_agf_verify(mp, bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+ offsetof(struct xfs_agf, agf_crc));
}
const struct xfs_buf_ops xfs_agf_buf_ops = {
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b1ddef6b2689..cafc90251d19 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -33,6 +33,7 @@
#include "xfs_extent_busy.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_cksum.h"
STATIC struct xfs_btree_cur *
@@ -272,7 +273,7 @@ xfs_allocbt_key_diff(
return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
-static void
+static bool
xfs_allocbt_verify(
struct xfs_buf *bp)
{
@@ -280,66 +281,103 @@ xfs_allocbt_verify(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_perag *pag = bp->b_pag;
unsigned int level;
- int sblock_ok; /* block passes checks */
/*
* magic number and level verification
*
- * During growfs operations, we can't verify the exact level as the
- * perag is not fully initialised and hence not attached to the buffer.
- * In this case, check against the maximum tree depth.
+ * During growfs operations, we can't verify the exact level or owner as
+ * the perag is not fully initialised and hence not attached to the
+ * buffer. In this case, check against the maximum tree depth.
+ *
+ * Similarly, during log recovery we will have a perag structure
+ * attached, but the agf information will not yet have been initialised
+ * from the on disk AGF. Again, we can only check against maximum limits
+ * in this case.
*/
level = be16_to_cpu(block->bb_level);
switch (block->bb_magic) {
+ case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ return false;
+ if (pag &&
+ be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return false;
+ /* fall through */
case cpu_to_be32(XFS_ABTB_MAGIC):
- if (pag)
- sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
- else
- sblock_ok = level < mp->m_ag_maxlevels;
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
+ return false;
+ } else if (level >= mp->m_ag_maxlevels)
+ return false;
break;
+ case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ return false;
+ if (pag &&
+ be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return false;
+ /* fall through */
case cpu_to_be32(XFS_ABTC_MAGIC):
- if (pag)
- sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
- else
- sblock_ok = level < mp->m_ag_maxlevels;
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
+ return false;
+ } else if (level >= mp->m_ag_maxlevels)
+ return false;
break;
default:
- sblock_ok = 0;
- break;
+ return false;
}
/* numrecs verification */
- sblock_ok = sblock_ok &&
- be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
+ if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
+ return false;
/* sibling pointer verification */
- sblock_ok = sblock_ok &&
- (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
- be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
- block->bb_u.s.bb_leftsib &&
- (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
- be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
- block->bb_u.s.bb_rightsib;
-
- if (!sblock_ok) {
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+ if (!block->bb_u.s.bb_leftsib ||
+ (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+ if (!block->bb_u.s.bb_rightsib ||
+ (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+
+ return true;
}
static void
xfs_allocbt_read_verify(
struct xfs_buf *bp)
{
- xfs_allocbt_verify(bp);
+ if (!(xfs_btree_sblock_verify_crc(bp) &&
+ xfs_allocbt_verify(bp))) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+ bp->b_target->bt_mount, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
}
static void
xfs_allocbt_write_verify(
struct xfs_buf *bp)
{
- xfs_allocbt_verify(bp);
+ /*
+ * Write-side verifier for free space btree blocks. If the block fails
+ * xfs_allocbt_verify(), report the corruption, mark the buffer with
+ * EFSCORRUPTED and stop: do not go on to xfs_btree_sblock_calc_crc()
+ * for a block that failed verification. This matches the early return
+ * in the AGF and AGFL write verifiers.
+ */
+ if (!xfs_allocbt_verify(bp)) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+ bp->b_target->bt_mount, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
}
const struct xfs_buf_ops xfs_allocbt_buf_ops = {
@@ -348,7 +386,7 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = {
};
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
xfs_allocbt_keys_inorder(
struct xfs_btree_cur *cur,
@@ -404,7 +442,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_allocbt_key_diff,
.buf_ops = &xfs_allocbt_buf_ops,
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
.keys_inorder = xfs_allocbt_keys_inorder,
.recs_inorder = xfs_allocbt_recs_inorder,
#endif
@@ -444,6 +482,9 @@ xfs_allocbt_init_cursor(
cur->bc_private.a.agbp = agbp;
cur->bc_private.a.agno = agno;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
return cur;
}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 7e89a2b429dd..e3a3f7424192 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -31,8 +31,10 @@ struct xfs_mount;
* by blockcount and blockno. All blocks look the same to make the code
* simpler; if we have time later, we'll make the optimizations.
*/
-#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
-#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
+#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
+#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */
+#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
+#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */
/*
* Data record/key structure
@@ -59,10 +61,10 @@ typedef __be32 xfs_alloc_ptr_t;
/*
* Btree block header size depends on a superblock flag.
- *
- * (not quite yet, but soon)
*/
-#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN
+#define XFS_ALLOC_BLOCK_LEN(mp) \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
/*
* Record, key, and pointer address macros for btree blocks.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 5f707e537171..596ec71da00e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,6 +31,7 @@
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
+#include <linux/aio.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
@@ -724,6 +725,25 @@ xfs_convert_page(
(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
i_size_read(inode));
+ /*
+ * If the current map does not span the entire page we are about to try
+ * to write, then give up. The only way we can write a page that spans
+ * multiple mappings in a single writeback iteration is via the
+ * xfs_vm_writepage() function. Data integrity writeback requires the
+ * entire page to be written in a single attempt, otherwise the part of
+ * the page we don't write here doesn't get written as part of the data
+ * integrity sync.
+ *
+ * For normal writeback, we also don't attempt to write partial pages
+ * here as it simply means that write_cache_pages() will see it under
+ * writeback and ignore the page until some point in the future, at
+ * which time this will be the only page in the file that needs
+ * writeback. Hence for more optimal IO patterns, we should always
+ * avoid partial page writeback due to multiple mappings on a page here.
+ */
+ if (!xfs_imap_valid(inode, imap, end_offset))
+ goto fail_unlock_page;
+
len = 1 << inode->i_blkbits;
p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
PAGE_CACHE_SIZE);
@@ -823,10 +843,12 @@ xfs_cluster_write(
STATIC void
xfs_vm_invalidatepage(
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
- trace_xfs_invalidatepage(page->mapping->host, page, offset);
- block_invalidatepage(page, offset);
+ trace_xfs_invalidatepage(page->mapping->host, page, offset,
+ length);
+ block_invalidatepage(page, offset, length);
}
/*
@@ -890,7 +912,7 @@ next_buffer:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
- xfs_vm_invalidatepage(page, 0);
+ xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
return;
}
@@ -920,7 +942,7 @@ xfs_vm_writepage(
int count = 0;
int nonblocking = 0;
- trace_xfs_writepage(inode, page, 0);
+ trace_xfs_writepage(inode, page, 0, 0);
ASSERT(page_has_buffers(page));
@@ -953,13 +975,13 @@ xfs_vm_writepage(
unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
/*
- * Just skip the page if it is fully outside i_size, e.g. due
- * to a truncate operation that is in progress.
+ * Skip the page if it is fully outside i_size, e.g. due to a
+ * truncate operation that is in progress. We must redirty the
+ * page so that reclaim stops reclaiming it. Otherwise
+ * xfs_vm_releasepage() is called on it and gets confused.
*/
- if (page->index >= end_index + 1 || offset_into_page == 0) {
- unlock_page(page);
- return 0;
- }
+ if (page->index >= end_index + 1 || offset_into_page == 0)
+ goto redirty;
/*
* The page straddles i_size. It must be zeroed out on each
@@ -1151,7 +1173,7 @@ xfs_vm_releasepage(
{
int delalloc, unwritten;
- trace_xfs_releasepage(page->mapping->host, page, 0);
+ trace_xfs_releasepage(page->mapping->host, page, 0, 0);
xfs_count_page_state(page, &delalloc, &unwritten);
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 888683844d98..20fe3fe9d341 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -15,7 +15,6 @@
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
-
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
@@ -35,6 +34,7 @@
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
@@ -74,13 +74,6 @@ STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context);
STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
-/*
- * Routines to manipulate out-of-line attribute values.
- */
-STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args);
-STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
-
-#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
STATIC int
xfs_attr_name_to_xname(
@@ -820,7 +813,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
error = 0;
goto out;
}
- error = xfs_attr_root_inactive(&trans, dp);
+ error = xfs_attr3_root_inactive(&trans, dp);
if (error)
goto out;
@@ -906,7 +899,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
dp = args->dp;
args->blkno = 0;
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
return error;
@@ -914,14 +907,14 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Look up the given attribute in the leaf block. Figure out if
* the given flags produce an error or call for an atomic rename.
*/
- retval = xfs_attr_leaf_lookup_int(bp, args);
+ retval = xfs_attr3_leaf_lookup_int(bp, args);
if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
xfs_trans_brelse(args->trans, bp);
- return(retval);
+ return retval;
} else if (retval == EEXIST) {
if (args->flags & ATTR_CREATE) { /* pure create op */
xfs_trans_brelse(args->trans, bp);
- return(retval);
+ return retval;
}
trace_xfs_attr_leaf_replace(args);
@@ -937,7 +930,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Add the attribute to the leaf block, transitioning to a Btree
* if required.
*/
- retval = xfs_attr_leaf_add(bp, args);
+ retval = xfs_attr3_leaf_add(bp, args);
if (retval == ENOSPC) {
/*
* Promote the attribute list to the Btree format, then
@@ -945,7 +938,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* can manage its own transactions.
*/
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_node(args);
+ error = xfs_attr3_leaf_to_node(args);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
&committed);
@@ -1010,7 +1003,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* In a separate transaction, set the incomplete flag on the
* "old" attr and clear the incomplete flag on the "new" attr.
*/
- error = xfs_attr_leaf_flipflags(args);
+ error = xfs_attr3_leaf_flipflags(args);
if (error)
return(error);
@@ -1032,19 +1025,19 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Read in the block containing the "old" attr, then
* remove the "old" attr from that block (neat, huh!)
*/
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
-1, &bp);
if (error)
return error;
- xfs_attr_leaf_remove(bp, args);
+ xfs_attr3_leaf_remove(bp, args);
/*
* If the result is small enough, shrink it all into the inode.
*/
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error) {
error = xfs_bmap_finish(&args->trans,
@@ -1076,9 +1069,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
/*
* Added a "remote" value, just clear the incomplete flag.
*/
- error = xfs_attr_leaf_clearflag(args);
+ error = xfs_attr3_leaf_clearflag(args);
}
- return(error);
+ return error;
}
/*
@@ -1101,24 +1094,24 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
*/
dp = args->dp;
args->blkno = 0;
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
return error;
- error = xfs_attr_leaf_lookup_int(bp, args);
+ error = xfs_attr3_leaf_lookup_int(bp, args);
if (error == ENOATTR) {
xfs_trans_brelse(args->trans, bp);
- return(error);
+ return error;
}
- xfs_attr_leaf_remove(bp, args);
+ xfs_attr3_leaf_remove(bp, args);
/*
* If the result is small enough, shrink it all into the inode.
*/
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1128,7 +1121,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
- return(error);
+ return error;
}
/*
@@ -1138,7 +1131,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
if (committed)
xfs_trans_ijoin(args->trans, dp, 0);
}
- return(0);
+ return 0;
}
/*
@@ -1156,21 +1149,21 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
trace_xfs_attr_leaf_get(args);
args->blkno = 0;
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
return error;
- error = xfs_attr_leaf_lookup_int(bp, args);
+ error = xfs_attr3_leaf_lookup_int(bp, args);
if (error != EEXIST) {
xfs_trans_brelse(args->trans, bp);
- return(error);
+ return error;
}
- error = xfs_attr_leaf_getvalue(bp, args);
+ error = xfs_attr3_leaf_getvalue(bp, args);
xfs_trans_brelse(args->trans, bp);
if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
error = xfs_attr_rmtval_get(args);
}
- return(error);
+ return error;
}
/*
@@ -1185,11 +1178,11 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
trace_xfs_attr_leaf_list(context);
context->cursor->blkno = 0;
- error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp);
+ error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
if (error)
return XFS_ERROR(error);
- error = xfs_attr_leaf_list_int(bp, context);
+ error = xfs_attr3_leaf_list_int(bp, context);
xfs_trans_brelse(NULL, bp);
return XFS_ERROR(error);
}
@@ -1236,7 +1229,7 @@ restart:
* Search to see if name already exists, and get back a pointer
* to where it should go.
*/
- error = xfs_da_node_lookup_int(state, &retval);
+ error = xfs_da3_node_lookup_int(state, &retval);
if (error)
goto out;
blk = &state->path.blk[ state->path.active-1 ];
@@ -1258,7 +1251,7 @@ restart:
args->rmtblkcnt = 0;
}
- retval = xfs_attr_leaf_add(blk->bp, state->args);
+ retval = xfs_attr3_leaf_add(blk->bp, state->args);
if (retval == ENOSPC) {
if (state->path.active == 1) {
/*
@@ -1268,7 +1261,7 @@ restart:
*/
xfs_da_state_free(state);
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_node(args);
+ error = xfs_attr3_leaf_to_node(args);
if (!error) {
error = xfs_bmap_finish(&args->trans,
args->flist,
@@ -1307,7 +1300,7 @@ restart:
* in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
*/
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_da_split(state);
+ error = xfs_da3_split(state);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
&committed);
@@ -1329,7 +1322,7 @@ restart:
/*
* Addition succeeded, update Btree hashvals.
*/
- xfs_da_fixhashpath(state, &state->path);
+ xfs_da3_fixhashpath(state, &state->path);
}
/*
@@ -1370,7 +1363,7 @@ restart:
* In a separate transaction, set the incomplete flag on the
* "old" attr and clear the incomplete flag on the "new" attr.
*/
- error = xfs_attr_leaf_flipflags(args);
+ error = xfs_attr3_leaf_flipflags(args);
if (error)
goto out;
@@ -1400,7 +1393,7 @@ restart:
state->blocksize = state->mp->m_sb.sb_blocksize;
state->node_ents = state->mp->m_attr_node_ents;
state->inleaf = 0;
- error = xfs_da_node_lookup_int(state, &retval);
+ error = xfs_da3_node_lookup_int(state, &retval);
if (error)
goto out;
@@ -1409,15 +1402,15 @@ restart:
*/
blk = &state->path.blk[ state->path.active-1 ];
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- error = xfs_attr_leaf_remove(blk->bp, args);
- xfs_da_fixhashpath(state, &state->path);
+ error = xfs_attr3_leaf_remove(blk->bp, args);
+ xfs_da3_fixhashpath(state, &state->path);
/*
* Check to see if the tree needs to be collapsed.
*/
if (retval && (state->path.active > 1)) {
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_da_join(state);
+ error = xfs_da3_join(state);
if (!error) {
error = xfs_bmap_finish(&args->trans,
args->flist,
@@ -1450,7 +1443,7 @@ restart:
/*
* Added a "remote" value, just clear the incomplete flag.
*/
- error = xfs_attr_leaf_clearflag(args);
+ error = xfs_attr3_leaf_clearflag(args);
if (error)
goto out;
}
@@ -1495,7 +1488,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
/*
* Search to see if name exists, and get back a pointer to it.
*/
- error = xfs_da_node_lookup_int(state, &retval);
+ error = xfs_da3_node_lookup_int(state, &retval);
if (error || (retval != EEXIST)) {
if (error == 0)
error = retval;
@@ -1524,7 +1517,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
* Mark the attribute as INCOMPLETE, then bunmapi() the
* remote value.
*/
- error = xfs_attr_leaf_setflag(args);
+ error = xfs_attr3_leaf_setflag(args);
if (error)
goto out;
error = xfs_attr_rmtval_remove(args);
@@ -1545,15 +1538,15 @@ xfs_attr_node_removename(xfs_da_args_t *args)
*/
blk = &state->path.blk[ state->path.active-1 ];
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- retval = xfs_attr_leaf_remove(blk->bp, args);
- xfs_da_fixhashpath(state, &state->path);
+ retval = xfs_attr3_leaf_remove(blk->bp, args);
+ xfs_da3_fixhashpath(state, &state->path);
/*
* Check to see if the tree needs to be collapsed.
*/
if (retval && (state->path.active > 1)) {
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_da_join(state);
+ error = xfs_da3_join(state);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
&committed);
@@ -1591,13 +1584,13 @@ xfs_attr_node_removename(xfs_da_args_t *args)
ASSERT(state->path.blk[0].bp);
state->path.blk[0].bp = NULL;
- error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp);
if (error)
goto out;
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error) {
error = xfs_bmap_finish(&args->trans,
@@ -1699,7 +1692,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da_node_read(state->args->trans,
+ error = xfs_da3_node_read(state->args->trans,
state->args->dp,
blk->blkno, blk->disk_blkno,
&blk->bp, XFS_ATTR_FORK);
@@ -1718,7 +1711,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da_node_read(state->args->trans,
+ error = xfs_da3_node_read(state->args->trans,
state->args->dp,
blk->blkno, blk->disk_blkno,
&blk->bp, XFS_ATTR_FORK);
@@ -1758,7 +1751,7 @@ xfs_attr_node_get(xfs_da_args_t *args)
/*
* Search to see if name exists, and get back a pointer to it.
*/
- error = xfs_da_node_lookup_int(state, &retval);
+ error = xfs_da3_node_lookup_int(state, &retval);
if (error) {
retval = error;
} else if (retval == EEXIST) {
@@ -1769,7 +1762,7 @@ xfs_attr_node_get(xfs_da_args_t *args)
/*
* Get the value, local or "remote"
*/
- retval = xfs_attr_leaf_getvalue(blk->bp, args);
+ retval = xfs_attr3_leaf_getvalue(blk->bp, args);
if (!retval && (args->rmtblkno > 0)
&& !(args->flags & ATTR_KERNOVAL)) {
retval = xfs_attr_rmtval_get(args);
@@ -1794,7 +1787,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
attrlist_cursor_kern_t *cursor;
xfs_attr_leafblock_t *leaf;
xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_node_entry *btree;
int error, i;
struct xfs_buf *bp;
@@ -1810,27 +1805,33 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
*/
bp = NULL;
if (cursor->blkno > 0) {
- error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1,
+ error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1,
&bp, XFS_ATTR_FORK);
if ((error != 0) && (error != EFSCORRUPTED))
return(error);
if (bp) {
+ struct xfs_attr_leaf_entry *entries;
+
node = bp->b_addr;
switch (be16_to_cpu(node->hdr.info.magic)) {
case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
trace_xfs_attr_list_wrong_blk(context);
xfs_trans_brelse(NULL, bp);
bp = NULL;
break;
case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
leaf = bp->b_addr;
- if (cursor->hashval > be32_to_cpu(leaf->entries[
- be16_to_cpu(leaf->hdr.count)-1].hashval)) {
+ xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if (cursor->hashval > be32_to_cpu(
+ entries[leafhdr.count - 1].hashval)) {
trace_xfs_attr_list_wrong_blk(context);
xfs_trans_brelse(NULL, bp);
bp = NULL;
- } else if (cursor->hashval <=
- be32_to_cpu(leaf->entries[0].hashval)) {
+ } else if (cursor->hashval <= be32_to_cpu(
+ entries[0].hashval)) {
trace_xfs_attr_list_wrong_blk(context);
xfs_trans_brelse(NULL, bp);
bp = NULL;
@@ -1852,27 +1853,31 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
if (bp == NULL) {
cursor->blkno = 0;
for (;;) {
- error = xfs_da_node_read(NULL, context->dp,
+ __uint16_t magic;
+
+ error = xfs_da3_node_read(NULL, context->dp,
cursor->blkno, -1, &bp,
XFS_ATTR_FORK);
if (error)
return(error);
node = bp->b_addr;
- if (node->hdr.info.magic ==
- cpu_to_be16(XFS_ATTR_LEAF_MAGIC))
+ magic = be16_to_cpu(node->hdr.info.magic);
+ if (magic == XFS_ATTR_LEAF_MAGIC ||
+ magic == XFS_ATTR3_LEAF_MAGIC)
break;
- if (unlikely(node->hdr.info.magic !=
- cpu_to_be16(XFS_DA_NODE_MAGIC))) {
+ if (magic != XFS_DA_NODE_MAGIC &&
+ magic != XFS_DA3_NODE_MAGIC) {
XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
XFS_ERRLEVEL_LOW,
context->dp->i_mount,
node);
xfs_trans_brelse(NULL, bp);
- return(XFS_ERROR(EFSCORRUPTED));
+ return XFS_ERROR(EFSCORRUPTED);
}
- btree = node->btree;
- for (i = 0; i < be16_to_cpu(node->hdr.count);
- btree++, i++) {
+
+ xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ btree = xfs_da3_node_tree_p(node);
+ for (i = 0; i < nodehdr.count; btree++, i++) {
if (cursor->hashval
<= be32_to_cpu(btree->hashval)) {
cursor->blkno = be32_to_cpu(btree->before);
@@ -1881,9 +1886,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
break;
}
}
- if (i == be16_to_cpu(node->hdr.count)) {
+ if (i == nodehdr.count) {
xfs_trans_brelse(NULL, bp);
- return(0);
+ return 0;
}
xfs_trans_brelse(NULL, bp);
}
@@ -1897,310 +1902,21 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
*/
for (;;) {
leaf = bp->b_addr;
- error = xfs_attr_leaf_list_int(bp, context);
+ error = xfs_attr3_leaf_list_int(bp, context);
if (error) {
xfs_trans_brelse(NULL, bp);
return error;
}
- if (context->seen_enough || leaf->hdr.info.forw == 0)
+ xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+ if (context->seen_enough || leafhdr.forw == 0)
break;
- cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
+ cursor->blkno = leafhdr.forw;
xfs_trans_brelse(NULL, bp);
- error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1,
+ error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1,
&bp);
if (error)
return error;
}
xfs_trans_brelse(NULL, bp);
- return(0);
-}
-
-
-/*========================================================================
- * External routines for manipulating out-of-line attribute values.
- *========================================================================*/
-
-/*
- * Read the value associated with an attribute from the out-of-line buffer
- * that we stored it in.
- */
-int
-xfs_attr_rmtval_get(xfs_da_args_t *args)
-{
- xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
- xfs_mount_t *mp;
- xfs_daddr_t dblkno;
- void *dst;
- xfs_buf_t *bp;
- int nmap, error, tmp, valuelen, blkcnt, i;
- xfs_dablk_t lblkno;
-
- trace_xfs_attr_rmtval_get(args);
-
- ASSERT(!(args->flags & ATTR_KERNOVAL));
-
- mp = args->dp->i_mount;
- dst = args->value;
- valuelen = args->valuelen;
- lblkno = args->rmtblkno;
- while (valuelen > 0) {
- nmap = ATTR_RMTVALUE_MAPSIZE;
- error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt, map, &nmap,
- XFS_BMAPI_ATTRFORK);
- if (error)
- return(error);
- ASSERT(nmap >= 1);
-
- for (i = 0; (i < nmap) && (valuelen > 0); i++) {
- ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
- (map[i].br_startblock != HOLESTARTBLOCK));
- dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
- blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
- dblkno, blkcnt, 0, &bp, NULL);
- if (error)
- return(error);
-
- tmp = min_t(int, valuelen, BBTOB(bp->b_length));
- xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
- xfs_buf_relse(bp);
- dst += tmp;
- valuelen -= tmp;
-
- lblkno += map[i].br_blockcount;
- }
- }
- ASSERT(valuelen == 0);
- return(0);
-}
-
-/*
- * Write the value associated with an attribute into the out-of-line buffer
- * that we have defined for it.
- */
-STATIC int
-xfs_attr_rmtval_set(xfs_da_args_t *args)
-{
- xfs_mount_t *mp;
- xfs_fileoff_t lfileoff;
- xfs_inode_t *dp;
- xfs_bmbt_irec_t map;
- xfs_daddr_t dblkno;
- void *src;
- xfs_buf_t *bp;
- xfs_dablk_t lblkno;
- int blkcnt, valuelen, nmap, error, tmp, committed;
-
- trace_xfs_attr_rmtval_set(args);
-
- dp = args->dp;
- mp = dp->i_mount;
- src = args->value;
-
- /*
- * Find a "hole" in the attribute address space large enough for
- * us to drop the new attribute's value into.
- */
- blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
- lfileoff = 0;
- error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
- XFS_ATTR_FORK);
- if (error) {
- return(error);
- }
- args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
- args->rmtblkcnt = blkcnt;
-
- /*
- * Roll through the "value", allocating blocks on disk as required.
- */
- while (blkcnt > 0) {
- /*
- * Allocate a single extent, up to the size of the value.
- */
- xfs_bmap_init(args->flist, args->firstblock);
- nmap = 1;
- error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
- blkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- args->firstblock, args->total, &map, &nmap,
- args->flist);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
- if (error) {
- ASSERT(committed);
- args->trans = NULL;
- xfs_bmap_cancel(args->flist);
- return(error);
- }
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
- lblkno += map.br_blockcount;
- blkcnt -= map.br_blockcount;
-
- /*
- * Start the next trans in the chain.
- */
- error = xfs_trans_roll(&args->trans, dp);
- if (error)
- return (error);
- }
-
- /*
- * Roll through the "value", copying the attribute value to the
- * already-allocated blocks. Blocks are written synchronously
- * so that we can know they are all on disk before we turn off
- * the INCOMPLETE flag.
- */
- lblkno = args->rmtblkno;
- valuelen = args->valuelen;
- while (valuelen > 0) {
- int buflen;
-
- /*
- * Try to remember where we decided to put the value.
- */
- xfs_bmap_init(args->flist, args->firstblock);
- nmap = 1;
- error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt, &map, &nmap,
- XFS_BMAPI_ATTRFORK);
- if (error)
- return(error);
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
-
- dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
- blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
- bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0);
- if (!bp)
- return ENOMEM;
-
- buflen = BBTOB(bp->b_length);
- tmp = min_t(int, valuelen, buflen);
- xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
- if (tmp < buflen)
- xfs_buf_zero(bp, tmp, buflen - tmp);
-
- error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
- xfs_buf_relse(bp);
- if (error)
- return error;
- src += tmp;
- valuelen -= tmp;
-
- lblkno += map.br_blockcount;
- }
- ASSERT(valuelen == 0);
- return(0);
-}
-
-/*
- * Remove the value associated with an attribute by deleting the
- * out-of-line buffer that it is stored on.
- */
-STATIC int
-xfs_attr_rmtval_remove(xfs_da_args_t *args)
-{
- xfs_mount_t *mp;
- xfs_bmbt_irec_t map;
- xfs_buf_t *bp;
- xfs_daddr_t dblkno;
- xfs_dablk_t lblkno;
- int valuelen, blkcnt, nmap, error, done, committed;
-
- trace_xfs_attr_rmtval_remove(args);
-
- mp = args->dp->i_mount;
-
- /*
- * Roll through the "value", invalidating the attribute value's
- * blocks.
- */
- lblkno = args->rmtblkno;
- valuelen = args->rmtblkcnt;
- while (valuelen > 0) {
- /*
- * Try to remember where we decided to put the value.
- */
- nmap = 1;
- error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt, &map, &nmap,
- XFS_BMAPI_ATTRFORK);
- if (error)
- return(error);
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
-
- dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
- blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
- /*
- * If the "remote" value is in the cache, remove it.
- */
- bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
- if (bp) {
- xfs_buf_stale(bp);
- xfs_buf_relse(bp);
- bp = NULL;
- }
-
- valuelen -= map.br_blockcount;
-
- lblkno += map.br_blockcount;
- }
-
- /*
- * Keep de-allocating extents until the remote-value region is gone.
- */
- lblkno = args->rmtblkno;
- blkcnt = args->rmtblkcnt;
- done = 0;
- while (!done) {
- xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- 1, args->firstblock, args->flist,
- &done);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
- if (error) {
- ASSERT(committed);
- args->trans = NULL;
- xfs_bmap_cancel(args->flist);
- return(error);
- }
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, args->dp, 0);
-
- /*
- * Close out trans and start the next one in the chain.
- */
- error = xfs_trans_roll(&args->trans, args->dp);
- if (error)
- return (error);
- }
- return(0);
+ return 0;
}
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index e920d68ef509..de8dd58da46c 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -140,7 +140,6 @@ typedef struct xfs_attr_list_context {
* Overall external interface routines.
*/
int xfs_attr_inactive(struct xfs_inode *dp);
-int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_list_int(struct xfs_attr_list_context *);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index ee24993c7d12..31d3cd129269 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -31,6 +32,7 @@
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
@@ -39,6 +41,9 @@
#include "xfs_attr_leaf.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
/*
* xfs_attr_leaf.c
@@ -53,85 +58,226 @@
/*
* Routines used for growing the Btree.
*/
-STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
- struct xfs_buf **bpp);
-STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer,
- xfs_da_args_t *args, int freemap_index);
-STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args,
- struct xfs_buf *leaf_buffer);
-STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
+STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
+ xfs_dablk_t which_block, struct xfs_buf **bpp);
+STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_da_args *args, int freemap_index);
+STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
xfs_da_state_blk_t *blk1,
xfs_da_state_blk_t *blk2);
-STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
- xfs_da_state_blk_t *leaf_blk_1,
- xfs_da_state_blk_t *leaf_blk_2,
- int *number_entries_in_blk1,
- int *number_usedbytes_in_blk1);
+STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *leaf_blk_1,
+ struct xfs_attr3_icleaf_hdr *ichdr1,
+ xfs_da_state_blk_t *leaf_blk_2,
+ struct xfs_attr3_icleaf_hdr *ichdr2,
+ int *number_entries_in_blk1,
+ int *number_usedbytes_in_blk1);
/*
* Routines used for shrinking the Btree.
*/
-STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
+STATIC int xfs_attr3_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
struct xfs_buf *bp, int level);
-STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
+STATIC int xfs_attr3_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
struct xfs_buf *bp);
-STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
+STATIC int xfs_attr3_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
xfs_dablk_t blkno, int blkcnt);
/*
* Utility routines.
*/
-STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
- int src_start,
- xfs_attr_leafblock_t *dst_leaf,
- int dst_start, int move_count,
- xfs_mount_t *mp);
+STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf,
+ struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
+ struct xfs_attr_leafblock *dst_leaf,
+ struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
+ int move_count, struct xfs_mount *mp);
STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
-static void
-xfs_attr_leaf_verify(
+void
+xfs_attr3_leaf_hdr_from_disk(
+ struct xfs_attr3_icleaf_hdr *to,
+ struct xfs_attr_leafblock *from)
+{
+ int i;
+
+ ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+ if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+ struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;
+
+ to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+ to->back = be32_to_cpu(hdr3->info.hdr.back);
+ to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+ to->count = be16_to_cpu(hdr3->count);
+ to->usedbytes = be16_to_cpu(hdr3->usedbytes);
+ to->firstused = be16_to_cpu(hdr3->firstused);
+ to->holes = hdr3->holes;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
+ to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
+ }
+ return;
+ }
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.count);
+ to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
+ to->firstused = be16_to_cpu(from->hdr.firstused);
+ to->holes = from->hdr.holes;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
+ to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
+ }
+}
+
+void
+xfs_attr3_leaf_hdr_to_disk(
+ struct xfs_attr_leafblock *to,
+ struct xfs_attr3_icleaf_hdr *from)
+{
+ int i;
+
+ ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
+ from->magic == XFS_ATTR3_LEAF_MAGIC);
+
+ if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+ struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;
+
+ hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+ hdr3->info.hdr.back = cpu_to_be32(from->back);
+ hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+ hdr3->count = cpu_to_be16(from->count);
+ hdr3->usedbytes = cpu_to_be16(from->usedbytes);
+ hdr3->firstused = cpu_to_be16(from->firstused);
+ hdr3->holes = from->holes;
+ hdr3->pad1 = 0;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base);
+ hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size);
+ }
+ return;
+ }
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.count = cpu_to_be16(from->count);
+ to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
+ to->hdr.firstused = cpu_to_be16(from->firstused);
+ to->hdr.holes = from->holes;
+ to->hdr.pad1 = 0;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base);
+ to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size);
+ }
+}
+
+static bool
+xfs_attr3_leaf_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_attr_leaf_hdr *hdr = bp->b_addr;
- int block_ok = 0;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr3_icleaf_hdr ichdr;
- block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
- if (!block_ok) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
+ return false;
+
+ if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
+ return false;
}
+ if (ichdr.count == 0)
+ return false;
+
+ /* XXX: need to range check rest of attr header values */
+ /* XXX: hash order check? */
+
+ return true;
}
static void
-xfs_attr_leaf_read_verify(
+xfs_attr3_leaf_write_verify(
struct xfs_buf *bp)
{
- xfs_attr_leaf_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_attr3_leaf_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_ATTR3_LEAF_CRC_OFF);
}
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
static void
-xfs_attr_leaf_write_verify(
- struct xfs_buf *bp)
+xfs_attr3_leaf_read_verify(
+ struct xfs_buf *bp)
{
- xfs_attr_leaf_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if ((xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ XFS_ATTR3_LEAF_CRC_OFF)) ||
+ !xfs_attr3_leaf_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
}
-const struct xfs_buf_ops xfs_attr_leaf_buf_ops = {
- .verify_read = xfs_attr_leaf_read_verify,
- .verify_write = xfs_attr_leaf_write_verify,
+const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+ .verify_read = xfs_attr3_leaf_read_verify,
+ .verify_write = xfs_attr3_leaf_write_verify,
};
int
-xfs_attr_leaf_read(
+xfs_attr3_leaf_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
- return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
- XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+ XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
+ return err;
}
/*========================================================================
@@ -172,7 +318,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
int dsize;
xfs_mount_t *mp = dp->i_mount;
- offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */
+ /* rounded down */
+ offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
switch (dp->i_d.di_format) {
case XFS_DINODE_FMT_DEV:
@@ -231,7 +378,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
return 0;
return dp->i_d.di_forkoff;
}
- dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
+ dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
break;
}
@@ -243,7 +390,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
minforkoff = roundup(minforkoff, 8) >> 3;
/* attr fork btree root can have at least this many key/ptr pairs */
- maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
+ XFS_BMDR_SPACE_CALC(MINABTPTRS);
maxforkoff = maxforkoff >> 3; /* rounded down */
if (offset >= maxforkoff)
@@ -557,7 +705,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
}
ASSERT(blkno == 0);
- error = xfs_attr_leaf_create(args, blkno, &bp);
+ error = xfs_attr3_leaf_create(args, blkno, &bp);
if (error) {
error = xfs_da_shrink_inode(args, 0, bp);
bp = NULL;
@@ -586,9 +734,9 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
nargs.hashval = xfs_da_hashname(sfe->nameval,
sfe->namelen);
nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
- error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
+ error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
ASSERT(error == ENOATTR);
- error = xfs_attr_leaf_add(bp, &nargs);
+ error = xfs_attr3_leaf_add(bp, &nargs);
ASSERT(error != ENOSPC);
if (error)
goto out;
@@ -783,67 +931,74 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
*/
int
xfs_attr_shortform_allfit(
- struct xfs_buf *bp,
- struct xfs_inode *dp)
+ struct xfs_buf *bp,
+ struct xfs_inode *dp)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
xfs_attr_leaf_name_local_t *name_loc;
- int bytes, i;
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ int bytes;
+ int i;
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
- entry = &leaf->entries[0];
bytes = sizeof(struct xfs_attr_sf_hdr);
- for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+ for (i = 0; i < leafhdr.count; entry++, i++) {
if (entry->flags & XFS_ATTR_INCOMPLETE)
continue; /* don't copy partial entries */
if (!(entry->flags & XFS_ATTR_LOCAL))
return(0);
- name_loc = xfs_attr_leaf_name_local(leaf, i);
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
return(0);
if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
return(0);
- bytes += sizeof(struct xfs_attr_sf_entry)-1
+ bytes += sizeof(struct xfs_attr_sf_entry) - 1
+ name_loc->namelen
+ be16_to_cpu(name_loc->valuelen);
}
if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
(dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
(bytes == sizeof(struct xfs_attr_sf_hdr)))
- return(-1);
- return(xfs_attr_shortform_bytesfit(dp, bytes));
+ return -1;
+ return xfs_attr_shortform_bytesfit(dp, bytes);
}
/*
* Convert a leaf attribute list to shortform attribute list
*/
int
-xfs_attr_leaf_to_shortform(
- struct xfs_buf *bp,
- xfs_da_args_t *args,
- int forkoff)
+xfs_attr3_leaf_to_shortform(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args,
+ int forkoff)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_local_t *name_loc;
- xfs_da_args_t nargs;
- xfs_inode_t *dp;
- char *tmpbuffer;
- int error, i;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_da_args nargs;
+ struct xfs_inode *dp = args->dp;
+ char *tmpbuffer;
+ int error;
+ int i;
trace_xfs_attr_leaf_to_sf(args);
- dp = args->dp;
tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
+ if (!tmpbuffer)
+ return ENOMEM;
- ASSERT(bp != NULL);
memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount));
+
leaf = (xfs_attr_leafblock_t *)tmpbuffer;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
+
+ /* XXX (dgc): buffer is about to be marked stale - why zero it? */
memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount));
/*
@@ -873,14 +1028,14 @@ xfs_attr_leaf_to_shortform(
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
nargs.op_flags = XFS_DA_OP_OKNOENT;
- entry = &leaf->entries[0];
- for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+
+ for (i = 0; i < ichdr.count; entry++, i++) {
if (entry->flags & XFS_ATTR_INCOMPLETE)
continue; /* don't copy partial entries */
if (!entry->nameidx)
continue;
ASSERT(entry->flags & XFS_ATTR_LOCAL);
- name_loc = xfs_attr_leaf_name_local(leaf, i);
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
nargs.name = name_loc->nameval;
nargs.namelen = name_loc->namelen;
nargs.value = &name_loc->nameval[nargs.namelen];
@@ -893,61 +1048,75 @@ xfs_attr_leaf_to_shortform(
out:
kmem_free(tmpbuffer);
- return(error);
+ return error;
}
/*
* Convert from using a single leaf to a root node and a leaf.
*/
int
-xfs_attr_leaf_to_node(xfs_da_args_t *args)
+xfs_attr3_leaf_to_node(
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_da_intnode_t *node;
- xfs_inode_t *dp;
- struct xfs_buf *bp1, *bp2;
- xfs_dablk_t blkno;
- int error;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr icleafhdr;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr icnodehdr;
+ struct xfs_da_intnode *node;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp1 = NULL;
+ struct xfs_buf *bp2 = NULL;
+ xfs_dablk_t blkno;
+ int error;
trace_xfs_attr_leaf_to_node(args);
- dp = args->dp;
- bp1 = bp2 = NULL;
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
- error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1);
+ error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1);
if (error)
goto out;
- bp2 = NULL;
- error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
- XFS_ATTR_FORK);
+ error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK);
if (error)
goto out;
+
+ /* copy leaf to new buffer, update identifiers */
+ xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
bp2->b_ops = bp1->b_ops;
- memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount));
- bp1 = NULL;
- xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
+ memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(mp));
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
+ hdr3->blkno = cpu_to_be64(bp2->b_bn);
+ }
+ xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(mp) - 1);
/*
* Set up the new root node.
*/
- error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+ error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
if (error)
goto out;
node = bp1->b_addr;
+ xfs_da3_node_hdr_from_disk(&icnodehdr, node);
+ btree = xfs_da3_node_tree_p(node);
+
leaf = bp2->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+
/* both on-disk, don't endian-flip twice */
- node->btree[0].hashval =
- leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval;
- node->btree[0].before = cpu_to_be32(blkno);
- node->hdr.count = cpu_to_be16(1);
- xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1);
+ btree[0].hashval = entries[icleafhdr.count - 1].hashval;
+ btree[0].before = cpu_to_be32(blkno);
+ icnodehdr.count = 1;
+ xfs_da3_node_hdr_to_disk(node, &icnodehdr);
+ xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(mp) - 1);
error = 0;
out:
- return(error);
+ return error;
}
@@ -960,52 +1129,63 @@ out:
* or a leaf in a node attribute list.
*/
STATIC int
-xfs_attr_leaf_create(
- xfs_da_args_t *args,
- xfs_dablk_t blkno,
- struct xfs_buf **bpp)
+xfs_attr3_leaf_create(
+ struct xfs_da_args *args,
+ xfs_dablk_t blkno,
+ struct xfs_buf **bpp)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_hdr_t *hdr;
- xfs_inode_t *dp;
- struct xfs_buf *bp;
- int error;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
trace_xfs_attr_leaf_create(args);
- dp = args->dp;
- ASSERT(dp != NULL);
error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
XFS_ATTR_FORK);
if (error)
- return(error);
- bp->b_ops = &xfs_attr_leaf_buf_ops;
+ return error;
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
leaf = bp->b_addr;
- memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
- hdr = &leaf->hdr;
- hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
- hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount));
- if (!hdr->firstused) {
- hdr->firstused = cpu_to_be16(
- XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN);
- }
+ memset(leaf, 0, XFS_LBSIZE(mp));
+
+ memset(&ichdr, 0, sizeof(ichdr));
+ ichdr.firstused = XFS_LBSIZE(mp);
- hdr->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t));
- hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) -
- sizeof(xfs_attr_leaf_hdr_t));
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
- xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
+ ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
+
+ hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+ ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
+ } else {
+ ichdr.magic = XFS_ATTR_LEAF_MAGIC;
+ ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
+ }
+ ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
+
+ xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+ xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(mp) - 1);
*bpp = bp;
- return(0);
+ return 0;
}
/*
* Split the leaf node, rebalance, then add the new entry.
*/
int
-xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
- xfs_da_state_blk_t *newblk)
+xfs_attr3_leaf_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk)
{
xfs_dablk_t blkno;
int error;
@@ -1019,7 +1199,7 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
error = xfs_da_grow_inode(state->args, &blkno);
if (error)
return(error);
- error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp);
+ error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
if (error)
return(error);
newblk->blkno = blkno;
@@ -1029,8 +1209,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* Rebalance the entries across the two leaves.
* NOTE: rebalance() currently depends on the 2nd block being empty.
*/
- xfs_attr_leaf_rebalance(state, oldblk, newblk);
- error = xfs_da_blk_link(state, oldblk, newblk);
+ xfs_attr3_leaf_rebalance(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
if (error)
return(error);
@@ -1043,10 +1223,10 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
*/
if (state->inleaf) {
trace_xfs_attr_leaf_add_old(state->args);
- error = xfs_attr_leaf_add(oldblk->bp, state->args);
+ error = xfs_attr3_leaf_add(oldblk->bp, state->args);
} else {
trace_xfs_attr_leaf_add_new(state->args);
- error = xfs_attr_leaf_add(newblk->bp, state->args);
+ error = xfs_attr3_leaf_add(newblk->bp, state->args);
}
/*
@@ -1061,22 +1241,23 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* Add a name to the leaf attribute list structure.
*/
int
-xfs_attr_leaf_add(
+xfs_attr3_leaf_add(
struct xfs_buf *bp,
struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_hdr_t *hdr;
- xfs_attr_leaf_map_t *map;
- int tablesize, entsize, sum, tmp, i;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ int tablesize;
+ int entsize;
+ int sum;
+ int tmp;
+ int i;
trace_xfs_attr_leaf_add(args);
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT((args->index >= 0)
- && (args->index <= be16_to_cpu(leaf->hdr.count)));
- hdr = &leaf->hdr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ ASSERT(args->index >= 0 && args->index <= ichdr.count);
entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
args->trans->t_mountp->m_sb.sb_blocksize, NULL);
@@ -1084,25 +1265,23 @@ xfs_attr_leaf_add(
* Search through freemap for first-fit on new name length.
* (may need to figure in size of entry struct too)
*/
- tablesize = (be16_to_cpu(hdr->count) + 1)
- * sizeof(xfs_attr_leaf_entry_t)
- + sizeof(xfs_attr_leaf_hdr_t);
- map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1];
- for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
- if (tablesize > be16_to_cpu(hdr->firstused)) {
- sum += be16_to_cpu(map->size);
+ tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+ for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
+ if (tablesize > ichdr.firstused) {
+ sum += ichdr.freemap[i].size;
continue;
}
- if (!map->size)
+ if (!ichdr.freemap[i].size)
continue; /* no space in this map */
tmp = entsize;
- if (be16_to_cpu(map->base) < be16_to_cpu(hdr->firstused))
+ if (ichdr.freemap[i].base < ichdr.firstused)
tmp += sizeof(xfs_attr_leaf_entry_t);
- if (be16_to_cpu(map->size) >= tmp) {
- tmp = xfs_attr_leaf_add_work(bp, args, i);
- return(tmp);
+ if (ichdr.freemap[i].size >= tmp) {
+ tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
+ goto out_log_hdr;
}
- sum += be16_to_cpu(map->size);
+ sum += ichdr.freemap[i].size;
}
/*
@@ -1110,82 +1289,89 @@ xfs_attr_leaf_add(
* and we don't have enough freespace, then compaction will do us
* no good and we should just give up.
*/
- if (!hdr->holes && (sum < entsize))
- return(XFS_ERROR(ENOSPC));
+ if (!ichdr.holes && sum < entsize)
+ return XFS_ERROR(ENOSPC);
/*
* Compact the entries to coalesce free space.
* This may change the hdr->count via dropping INCOMPLETE entries.
*/
- xfs_attr_leaf_compact(args, bp);
+ xfs_attr3_leaf_compact(args, &ichdr, bp);
/*
* After compaction, the block is guaranteed to have only one
* free region, in freemap[0]. If it is not big enough, give up.
*/
- if (be16_to_cpu(hdr->freemap[0].size)
- < (entsize + sizeof(xfs_attr_leaf_entry_t)))
- return(XFS_ERROR(ENOSPC));
+ if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
+ tmp = ENOSPC;
+ goto out_log_hdr;
+ }
+
+ tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
- return(xfs_attr_leaf_add_work(bp, args, 0));
+out_log_hdr:
+ xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+ xfs_attr3_leaf_hdr_size(leaf)));
+ return tmp;
}
/*
* Add a name to a leaf attribute list structure.
*/
STATIC int
-xfs_attr_leaf_add_work(
- struct xfs_buf *bp,
- xfs_da_args_t *args,
- int mapindex)
+xfs_attr3_leaf_add_work(
+ struct xfs_buf *bp,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_da_args *args,
+ int mapindex)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_hdr_t *hdr;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_local_t *name_loc;
- xfs_attr_leaf_name_remote_t *name_rmt;
- xfs_attr_leaf_map_t *map;
- xfs_mount_t *mp;
- int tmp, i;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_mount *mp;
+ int tmp;
+ int i;
trace_xfs_attr_leaf_add_work(args);
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- hdr = &leaf->hdr;
- ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE));
- ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(hdr->count)));
+ ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
+ ASSERT(args->index >= 0 && args->index <= ichdr->count);
/*
* Force open some space in the entry array and fill it in.
*/
- entry = &leaf->entries[args->index];
- if (args->index < be16_to_cpu(hdr->count)) {
- tmp = be16_to_cpu(hdr->count) - args->index;
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+ if (args->index < ichdr->count) {
+ tmp = ichdr->count - args->index;
tmp *= sizeof(xfs_attr_leaf_entry_t);
- memmove((char *)(entry+1), (char *)entry, tmp);
+ memmove(entry + 1, entry, tmp);
xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
}
- be16_add_cpu(&hdr->count, 1);
+ ichdr->count++;
/*
* Allocate space for the new string (at the end of the run).
*/
- map = &hdr->freemap[mapindex];
mp = args->trans->t_mountp;
- ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp));
- ASSERT((be16_to_cpu(map->base) & 0x3) == 0);
- ASSERT(be16_to_cpu(map->size) >=
+ ASSERT(ichdr->freemap[mapindex].base < XFS_LBSIZE(mp));
+ ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
+ ASSERT(ichdr->freemap[mapindex].size >=
xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
mp->m_sb.sb_blocksize, NULL));
- ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
- ASSERT((be16_to_cpu(map->size) & 0x3) == 0);
- be16_add_cpu(&map->size,
- -xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
- mp->m_sb.sb_blocksize, &tmp));
- entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) +
- be16_to_cpu(map->size));
+ ASSERT(ichdr->freemap[mapindex].size < XFS_LBSIZE(mp));
+ ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
+
+ ichdr->freemap[mapindex].size -=
+ xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
+ mp->m_sb.sb_blocksize, &tmp);
+
+ entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
+ ichdr->freemap[mapindex].size);
entry->hashval = cpu_to_be32(args->hashval);
entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
@@ -1200,7 +1386,7 @@ xfs_attr_leaf_add_work(
XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
ASSERT((args->index == 0) ||
(be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
- ASSERT((args->index == be16_to_cpu(hdr->count)-1) ||
+ ASSERT((args->index == ichdr->count - 1) ||
(be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
/*
@@ -1211,14 +1397,14 @@ xfs_attr_leaf_add_work(
* as part of this transaction (a split operation for example).
*/
if (entry->flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf, args->index);
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
name_loc->namelen = args->namelen;
name_loc->valuelen = cpu_to_be16(args->valuelen);
memcpy((char *)name_loc->nameval, args->name, args->namelen);
memcpy((char *)&name_loc->nameval[args->namelen], args->value,
be16_to_cpu(name_loc->valuelen));
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
name_rmt->namelen = args->namelen;
memcpy((char *)name_rmt->name, args->name, args->namelen);
entry->flags |= XFS_ATTR_INCOMPLETE;
@@ -1226,47 +1412,45 @@ xfs_attr_leaf_add_work(
name_rmt->valuelen = 0;
name_rmt->valueblk = 0;
args->rmtblkno = 1;
- args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
}
xfs_trans_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
+ XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
xfs_attr_leaf_entsize(leaf, args->index)));
/*
* Update the control info for this leaf node
*/
- if (be16_to_cpu(entry->nameidx) < be16_to_cpu(hdr->firstused)) {
- /* both on-disk, don't endian-flip twice */
- hdr->firstused = entry->nameidx;
- }
- ASSERT(be16_to_cpu(hdr->firstused) >=
- ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr)));
- tmp = (be16_to_cpu(hdr->count)-1) * sizeof(xfs_attr_leaf_entry_t)
- + sizeof(xfs_attr_leaf_hdr_t);
- map = &hdr->freemap[0];
- for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
- if (be16_to_cpu(map->base) == tmp) {
- be16_add_cpu(&map->base, sizeof(xfs_attr_leaf_entry_t));
- be16_add_cpu(&map->size,
- -((int)sizeof(xfs_attr_leaf_entry_t)));
+ if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
+ ichdr->firstused = be16_to_cpu(entry->nameidx);
+
+ ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf));
+ tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ if (ichdr->freemap[i].base == tmp) {
+ ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
+ ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t);
}
}
- be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index));
- xfs_trans_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
- return(0);
+ ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
+ return 0;
}
/*
* Garbage collect a leaf attribute list block by copying it to a new buffer.
*/
STATIC void
-xfs_attr_leaf_compact(
+xfs_attr3_leaf_compact(
struct xfs_da_args *args,
+ struct xfs_attr3_icleaf_hdr *ichdr_dst,
struct xfs_buf *bp)
{
- xfs_attr_leafblock_t *leaf_s, *leaf_d;
- xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
+ struct xfs_attr_leafblock *leaf_src;
+ struct xfs_attr_leafblock *leaf_dst;
+ struct xfs_attr3_icleaf_hdr ichdr_src;
struct xfs_trans *trans = args->trans;
struct xfs_mount *mp = trans->t_mountp;
char *tmpbuffer;
@@ -1274,43 +1458,87 @@ xfs_attr_leaf_compact(
trace_xfs_attr_leaf_compact(args);
tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
memset(bp->b_addr, 0, XFS_LBSIZE(mp));
+ leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
+ leaf_dst = bp->b_addr;
/*
- * Copy basic information
+ * Copy the on-disk header back into the destination buffer to ensure
+ * all the information in the header that is not part of the incore
+ * header structure is preserved.
*/
- leaf_s = (xfs_attr_leafblock_t *)tmpbuffer;
- leaf_d = bp->b_addr;
- hdr_s = &leaf_s->hdr;
- hdr_d = &leaf_d->hdr;
- hdr_d->info = hdr_s->info; /* struct copy */
- hdr_d->firstused = cpu_to_be16(XFS_LBSIZE(mp));
- /* handle truncation gracefully */
- if (!hdr_d->firstused) {
- hdr_d->firstused = cpu_to_be16(
- XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN);
- }
- hdr_d->usedbytes = 0;
- hdr_d->count = 0;
- hdr_d->holes = 0;
- hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t));
- hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) -
- sizeof(xfs_attr_leaf_hdr_t));
+ memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
+
+ /* Initialise the incore headers */
+ ichdr_src = *ichdr_dst; /* struct copy */
+ ichdr_dst->firstused = XFS_LBSIZE(mp);
+ ichdr_dst->usedbytes = 0;
+ ichdr_dst->count = 0;
+ ichdr_dst->holes = 0;
+ ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
+ ichdr_dst->freemap[0].size = ichdr_dst->firstused -
+ ichdr_dst->freemap[0].base;
+
+
+ /* write the header back to initialise the underlying buffer */
+ xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
/*
* Copy all entries in the same (sorted) order,
* but allocate name/value pairs packed and in sequence.
*/
- xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0,
- be16_to_cpu(hdr_s->count), mp);
+ xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0,
+ ichdr_src.count, mp);
+ /*
+ * this logs the entire buffer, but the caller must write the header
+ * back to the buffer when it is finished modifying it.
+ */
xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
kmem_free(tmpbuffer);
}
/*
+ * Compare the "order" of two leaf blocks.
+ * Return 0 unless leaf2 should go before leaf1.
+ */
+static int
+xfs_attr3_leaf_order(
+ struct xfs_buf *leaf1_bp,
+ struct xfs_attr3_icleaf_hdr *leaf1hdr,
+ struct xfs_buf *leaf2_bp,
+ struct xfs_attr3_icleaf_hdr *leaf2hdr)
+{
+ struct xfs_attr_leaf_entry *entries1;
+ struct xfs_attr_leaf_entry *entries2;
+
+ entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr);
+ entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr);
+ if (leaf1hdr->count > 0 && leaf2hdr->count > 0 &&
+ ((be32_to_cpu(entries2[0].hashval) <
+ be32_to_cpu(entries1[0].hashval)) ||
+ (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) <
+ be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) {
+ return 1;
+ }
+ return 0;
+}
+
+int
+xfs_attr_leaf_order(
+ struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp)
+{
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
+
+ xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr);
+ return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
+}
+
+/*
* Redistribute the attribute list entries between two leaf nodes,
* taking into account the size of the new entry.
*
@@ -1323,14 +1551,23 @@ xfs_attr_leaf_compact(
* the "new" and "old" values can end up in different blocks.
*/
STATIC void
-xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2)
+xfs_attr3_leaf_rebalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
{
- xfs_da_args_t *args;
- xfs_da_state_blk_t *tmp_blk;
- xfs_attr_leafblock_t *leaf1, *leaf2;
- xfs_attr_leaf_hdr_t *hdr1, *hdr2;
- int count, totallen, max, space, swap;
+ struct xfs_da_args *args;
+ struct xfs_attr_leafblock *leaf1;
+ struct xfs_attr_leafblock *leaf2;
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
+ struct xfs_attr_leaf_entry *entries1;
+ struct xfs_attr_leaf_entry *entries2;
+ int count;
+ int totallen;
+ int max;
+ int space;
+ int swap;
/*
* Set up environment.
@@ -1339,9 +1576,9 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
leaf1 = blk1->bp->b_addr;
leaf2 = blk2->bp->b_addr;
- ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT(leaf2->hdr.count == 0);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+ ASSERT(ichdr2.count == 0);
args = state->args;
trace_xfs_attr_leaf_rebalance(args);
@@ -1353,16 +1590,23 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* second block, this code should never set "swap".
*/
swap = 0;
- if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) {
+ if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
+ struct xfs_da_state_blk *tmp_blk;
+ struct xfs_attr3_icleaf_hdr tmp_ichdr;
+
tmp_blk = blk1;
blk1 = blk2;
blk2 = tmp_blk;
+
+ /* struct copies to swap them rather than reconverting */
+ tmp_ichdr = ichdr1;
+ ichdr1 = ichdr2;
+ ichdr2 = tmp_ichdr;
+
leaf1 = blk1->bp->b_addr;
leaf2 = blk2->bp->b_addr;
swap = 1;
}
- hdr1 = &leaf1->hdr;
- hdr2 = &leaf2->hdr;
/*
* Examine entries until we reduce the absolute difference in
@@ -1372,41 +1616,39 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* "inleaf" is true if the new entry should be inserted into blk1.
* If "swap" is also true, then reverse the sense of "inleaf".
*/
- state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2,
- &count, &totallen);
+ state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
+ blk2, &ichdr2,
+ &count, &totallen);
if (swap)
state->inleaf = !state->inleaf;
/*
* Move any entries required from leaf to leaf:
*/
- if (count < be16_to_cpu(hdr1->count)) {
+ if (count < ichdr1.count) {
/*
* Figure the total bytes to be added to the destination leaf.
*/
/* number entries being moved */
- count = be16_to_cpu(hdr1->count) - count;
- space = be16_to_cpu(hdr1->usedbytes) - totallen;
+ count = ichdr1.count - count;
+ space = ichdr1.usedbytes - totallen;
space += count * sizeof(xfs_attr_leaf_entry_t);
/*
* leaf2 is the destination, compact it if it looks tight.
*/
- max = be16_to_cpu(hdr2->firstused)
- - sizeof(xfs_attr_leaf_hdr_t);
- max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t);
+ max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+ max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
if (space > max)
- xfs_attr_leaf_compact(args, blk2->bp);
+ xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);
/*
* Move high entries from leaf1 to low end of leaf2.
*/
- xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count,
- leaf2, 0, count, state->mp);
+ xfs_attr3_leaf_moveents(leaf1, &ichdr1, ichdr1.count - count,
+ leaf2, &ichdr2, 0, count, state->mp);
- xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
- xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
- } else if (count > be16_to_cpu(hdr1->count)) {
+ } else if (count > ichdr1.count) {
/*
* I assert that since all callers pass in an empty
* second buffer, this code should never execute.
@@ -1417,36 +1659,37 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* Figure the total bytes to be added to the destination leaf.
*/
/* number entries being moved */
- count -= be16_to_cpu(hdr1->count);
- space = totallen - be16_to_cpu(hdr1->usedbytes);
+ count -= ichdr1.count;
+ space = totallen - ichdr1.usedbytes;
space += count * sizeof(xfs_attr_leaf_entry_t);
/*
* leaf1 is the destination, compact it if it looks tight.
*/
- max = be16_to_cpu(hdr1->firstused)
- - sizeof(xfs_attr_leaf_hdr_t);
- max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t);
+ max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+ max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
if (space > max)
- xfs_attr_leaf_compact(args, blk1->bp);
+ xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);
/*
* Move low entries from leaf2 to high end of leaf1.
*/
- xfs_attr_leaf_moveents(leaf2, 0, leaf1,
- be16_to_cpu(hdr1->count), count, state->mp);
-
- xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
- xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+ xfs_attr3_leaf_moveents(leaf2, &ichdr2, 0, leaf1, &ichdr1,
+ ichdr1.count, count, state->mp);
}
+ xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
+ xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
+ xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
+ xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+
/*
* Copy out last hashval in each block for B-tree code.
*/
- blk1->hashval = be32_to_cpu(
- leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval);
- blk2->hashval = be32_to_cpu(
- leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval);
+ entries1 = xfs_attr3_leaf_entryp(leaf1);
+ entries2 = xfs_attr3_leaf_entryp(leaf2);
+ blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
+ blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);
/*
* Adjust the expected index for insertion.
@@ -1460,12 +1703,12 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* inserting. The index/blkno fields refer to the "old" entry,
* while the index2/blkno2 fields refer to the "new" entry.
*/
- if (blk1->index > be16_to_cpu(leaf1->hdr.count)) {
+ if (blk1->index > ichdr1.count) {
ASSERT(state->inleaf == 0);
- blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count);
+ blk2->index = blk1->index - ichdr1.count;
args->index = args->index2 = blk2->index;
args->blkno = args->blkno2 = blk2->blkno;
- } else if (blk1->index == be16_to_cpu(leaf1->hdr.count)) {
+ } else if (blk1->index == ichdr1.count) {
if (state->inleaf) {
args->index = blk1->index;
args->blkno = blk1->blkno;
@@ -1477,8 +1720,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* is already stored in blkno2/index2, so don't
* overwrite it, otherwise we corrupt the tree.
*/
- blk2->index = blk1->index
- - be16_to_cpu(leaf1->hdr.count);
+ blk2->index = blk1->index - ichdr1.count;
args->index = blk2->index;
args->blkno = blk2->blkno;
if (!state->extravalid) {
@@ -1506,42 +1748,40 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* GROT: Do a double-split for this case?
*/
STATIC int
-xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
- xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2,
- int *countarg, int *usedbytesarg)
+xfs_attr3_leaf_figure_balance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_attr3_icleaf_hdr *ichdr1,
+ struct xfs_da_state_blk *blk2,
+ struct xfs_attr3_icleaf_hdr *ichdr2,
+ int *countarg,
+ int *usedbytesarg)
{
- xfs_attr_leafblock_t *leaf1, *leaf2;
- xfs_attr_leaf_hdr_t *hdr1, *hdr2;
- xfs_attr_leaf_entry_t *entry;
- int count, max, index, totallen, half;
- int lastdelta, foundit, tmp;
-
- /*
- * Set up environment.
- */
- leaf1 = blk1->bp->b_addr;
- leaf2 = blk2->bp->b_addr;
- hdr1 = &leaf1->hdr;
- hdr2 = &leaf2->hdr;
- foundit = 0;
- totallen = 0;
+ struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr;
+ struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr;
+ struct xfs_attr_leaf_entry *entry;
+ int count;
+ int max;
+ int index;
+ int totallen = 0;
+ int half;
+ int lastdelta;
+ int foundit = 0;
+ int tmp;
/*
* Examine entries until we reduce the absolute difference in
* byte usage between the two blocks to a minimum.
*/
- max = be16_to_cpu(hdr1->count) + be16_to_cpu(hdr2->count);
- half = (max+1) * sizeof(*entry);
- half += be16_to_cpu(hdr1->usedbytes) +
- be16_to_cpu(hdr2->usedbytes) +
- xfs_attr_leaf_newentsize(
- state->args->namelen,
- state->args->valuelen,
- state->blocksize, NULL);
+ max = ichdr1->count + ichdr2->count;
+ half = (max + 1) * sizeof(*entry);
+ half += ichdr1->usedbytes + ichdr2->usedbytes +
+ xfs_attr_leaf_newentsize(state->args->namelen,
+ state->args->valuelen,
+ state->blocksize, NULL);
half /= 2;
lastdelta = state->blocksize;
- entry = &leaf1->entries[0];
+ entry = xfs_attr3_leaf_entryp(leaf1);
for (count = index = 0; count < max; entry++, index++, count++) {
#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A))
@@ -1564,9 +1804,9 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
/*
* Wrap around into the second block if necessary.
*/
- if (count == be16_to_cpu(hdr1->count)) {
+ if (count == ichdr1->count) {
leaf1 = leaf2;
- entry = &leaf1->entries[0];
+ entry = xfs_attr3_leaf_entryp(leaf1);
index = 0;
}
@@ -1597,7 +1837,7 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
*countarg = count;
*usedbytesarg = totallen;
- return(foundit);
+ return foundit;
}
/*========================================================================
@@ -1616,14 +1856,20 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
* GROT: allow for INCOMPLETE entries in calculation.
*/
int
-xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
+xfs_attr3_leaf_toosmall(
+ struct xfs_da_state *state,
+ int *action)
{
- xfs_attr_leafblock_t *leaf;
- xfs_da_state_blk_t *blk;
- xfs_da_blkinfo_t *info;
- int count, bytes, forward, error, retval, i;
- xfs_dablk_t blkno;
- struct xfs_buf *bp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_da_state_blk *blk;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_buf *bp;
+ xfs_dablk_t blkno;
+ int bytes;
+ int forward;
+ int error;
+ int retval;
+ int i;
trace_xfs_attr_leaf_toosmall(state->args);
@@ -1633,13 +1879,11 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
* to coalesce with a sibling.
*/
blk = &state->path.blk[ state->path.active-1 ];
- info = blk->bp->b_addr;
- ASSERT(info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- leaf = (xfs_attr_leafblock_t *)info;
- count = be16_to_cpu(leaf->hdr.count);
- bytes = sizeof(xfs_attr_leaf_hdr_t) +
- count * sizeof(xfs_attr_leaf_entry_t) +
- be16_to_cpu(leaf->hdr.usedbytes);
+ leaf = blk->bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ bytes = xfs_attr3_leaf_hdr_size(leaf) +
+ ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
+ ichdr.usedbytes;
if (bytes > (state->blocksize >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
return(0);
@@ -1651,14 +1895,14 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
* coalesce it with a sibling block. We choose (arbitrarily)
* to merge with the forward block unless it is NULL.
*/
- if (count == 0) {
+ if (ichdr.count == 0) {
/*
* Make altpath point to the block we want to keep and
* path point to the block we want to drop (this one).
*/
- forward = (info->forw != 0);
+ forward = (ichdr.forw != 0);
memcpy(&state->altpath, &state->path, sizeof(state->path));
- error = xfs_da_path_shift(state, &state->altpath, forward,
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
if (error)
return(error);
@@ -1667,7 +1911,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
} else {
*action = 2;
}
- return(0);
+ return 0;
}
/*
@@ -1678,28 +1922,28 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
* to shrink an attribute list over time.
*/
/* start with smaller blk num */
- forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));
+ forward = ichdr.forw < ichdr.back;
for (i = 0; i < 2; forward = !forward, i++) {
+ struct xfs_attr3_icleaf_hdr ichdr2;
if (forward)
- blkno = be32_to_cpu(info->forw);
+ blkno = ichdr.forw;
else
- blkno = be32_to_cpu(info->back);
+ blkno = ichdr.back;
if (blkno == 0)
continue;
- error = xfs_attr_leaf_read(state->args->trans, state->args->dp,
+ error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
blkno, -1, &bp);
if (error)
return(error);
- leaf = (xfs_attr_leafblock_t *)info;
- count = be16_to_cpu(leaf->hdr.count);
- bytes = state->blocksize - (state->blocksize>>2);
- bytes -= be16_to_cpu(leaf->hdr.usedbytes);
- leaf = bp->b_addr;
- count += be16_to_cpu(leaf->hdr.count);
- bytes -= be16_to_cpu(leaf->hdr.usedbytes);
- bytes -= count * sizeof(xfs_attr_leaf_entry_t);
- bytes -= sizeof(xfs_attr_leaf_hdr_t);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
+
+ bytes = state->blocksize - (state->blocksize >> 2) -
+ ichdr.usedbytes - ichdr2.usedbytes -
+ ((ichdr.count + ichdr2.count) *
+ sizeof(xfs_attr_leaf_entry_t)) -
+ xfs_attr3_leaf_hdr_size(leaf);
+
xfs_trans_brelse(state->args->trans, bp);
if (bytes >= 0)
break; /* fits with at least 25% to spare */
@@ -1715,10 +1959,10 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
*/
memcpy(&state->altpath, &state->path, sizeof(state->path));
if (blkno < blk->blkno) {
- error = xfs_da_path_shift(state, &state->altpath, forward,
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
} else {
- error = xfs_da_path_shift(state, &state->path, forward,
+ error = xfs_da3_path_shift(state, &state->path, forward,
0, &retval);
}
if (error)
@@ -1738,32 +1982,35 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
* If two leaves are 37% full, when combined they will leave 25% free.
*/
int
-xfs_attr_leaf_remove(
- struct xfs_buf *bp,
- xfs_da_args_t *args)
+xfs_attr3_leaf_remove(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_hdr_t *hdr;
- xfs_attr_leaf_map_t *map;
- xfs_attr_leaf_entry_t *entry;
- int before, after, smallest, entsize;
- int tablesize, tmp, i;
- xfs_mount_t *mp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_mount *mp = args->trans->t_mountp;
+ int before;
+ int after;
+ int smallest;
+ int entsize;
+ int tablesize;
+ int tmp;
+ int i;
trace_xfs_attr_leaf_remove(args);
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- hdr = &leaf->hdr;
- mp = args->trans->t_mountp;
- ASSERT((be16_to_cpu(hdr->count) > 0)
- && (be16_to_cpu(hdr->count) < (XFS_LBSIZE(mp)/8)));
- ASSERT((args->index >= 0)
- && (args->index < be16_to_cpu(hdr->count)));
- ASSERT(be16_to_cpu(hdr->firstused) >=
- ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr)));
- entry = &leaf->entries[args->index];
- ASSERT(be16_to_cpu(entry->nameidx) >= be16_to_cpu(hdr->firstused));
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+ ASSERT(ichdr.count > 0 && ichdr.count < XFS_LBSIZE(mp) / 8);
+ ASSERT(args->index >= 0 && args->index < ichdr.count);
+ ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
+ xfs_attr3_leaf_hdr_size(leaf));
+
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+ ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
/*
@@ -1772,30 +2019,28 @@ xfs_attr_leaf_remove(
* find smallest free region in case we need to replace it,
* adjust any map that borders the entry table,
*/
- tablesize = be16_to_cpu(hdr->count) * sizeof(xfs_attr_leaf_entry_t)
- + sizeof(xfs_attr_leaf_hdr_t);
- map = &hdr->freemap[0];
- tmp = be16_to_cpu(map->size);
+ tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+ tmp = ichdr.freemap[0].size;
before = after = -1;
smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
entsize = xfs_attr_leaf_entsize(leaf, args->index);
- for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
- ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp));
- ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
- if (be16_to_cpu(map->base) == tablesize) {
- be16_add_cpu(&map->base,
- -((int)sizeof(xfs_attr_leaf_entry_t)));
- be16_add_cpu(&map->size, sizeof(xfs_attr_leaf_entry_t));
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ ASSERT(ichdr.freemap[i].base < XFS_LBSIZE(mp));
+ ASSERT(ichdr.freemap[i].size < XFS_LBSIZE(mp));
+ if (ichdr.freemap[i].base == tablesize) {
+ ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
+ ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
}
- if ((be16_to_cpu(map->base) + be16_to_cpu(map->size))
- == be16_to_cpu(entry->nameidx)) {
+ if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
+ be16_to_cpu(entry->nameidx)) {
before = i;
- } else if (be16_to_cpu(map->base)
- == (be16_to_cpu(entry->nameidx) + entsize)) {
+ } else if (ichdr.freemap[i].base ==
+ (be16_to_cpu(entry->nameidx) + entsize)) {
after = i;
- } else if (be16_to_cpu(map->size) < tmp) {
- tmp = be16_to_cpu(map->size);
+ } else if (ichdr.freemap[i].size < tmp) {
+ tmp = ichdr.freemap[i].size;
smallest = i;
}
}
@@ -1806,36 +2051,30 @@ xfs_attr_leaf_remove(
*/
if ((before >= 0) || (after >= 0)) {
if ((before >= 0) && (after >= 0)) {
- map = &hdr->freemap[before];
- be16_add_cpu(&map->size, entsize);
- be16_add_cpu(&map->size,
- be16_to_cpu(hdr->freemap[after].size));
- hdr->freemap[after].base = 0;
- hdr->freemap[after].size = 0;
+ ichdr.freemap[before].size += entsize;
+ ichdr.freemap[before].size += ichdr.freemap[after].size;
+ ichdr.freemap[after].base = 0;
+ ichdr.freemap[after].size = 0;
} else if (before >= 0) {
- map = &hdr->freemap[before];
- be16_add_cpu(&map->size, entsize);
+ ichdr.freemap[before].size += entsize;
} else {
- map = &hdr->freemap[after];
- /* both on-disk, don't endian flip twice */
- map->base = entry->nameidx;
- be16_add_cpu(&map->size, entsize);
+ ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
+ ichdr.freemap[after].size += entsize;
}
} else {
/*
* Replace smallest region (if it is smaller than free'd entry)
*/
- map = &hdr->freemap[smallest];
- if (be16_to_cpu(map->size) < entsize) {
- map->base = cpu_to_be16(be16_to_cpu(entry->nameidx));
- map->size = cpu_to_be16(entsize);
+ if (ichdr.freemap[smallest].size < entsize) {
+ ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
+ ichdr.freemap[smallest].size = entsize;
}
}
/*
* Did we remove the first entry?
*/
- if (be16_to_cpu(entry->nameidx) == be16_to_cpu(hdr->firstused))
+ if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
smallest = 1;
else
smallest = 0;
@@ -1843,20 +2082,20 @@ xfs_attr_leaf_remove(
/*
* Compress the remaining entries and zero out the removed stuff.
*/
- memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize);
- be16_add_cpu(&hdr->usedbytes, -entsize);
+ memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
+ ichdr.usedbytes -= entsize;
xfs_trans_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
+ XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
entsize));
- tmp = (be16_to_cpu(hdr->count) - args->index)
- * sizeof(xfs_attr_leaf_entry_t);
- memmove((char *)entry, (char *)(entry+1), tmp);
- be16_add_cpu(&hdr->count, -1);
+ tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
+ memmove(entry, entry + 1, tmp);
+ ichdr.count--;
xfs_trans_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
- entry = &leaf->entries[be16_to_cpu(hdr->count)];
- memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t));
+ XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
+
+ entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
+ memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));
/*
* If we removed the first entry, re-find the first used byte
@@ -1866,130 +2105,140 @@ xfs_attr_leaf_remove(
*/
if (smallest) {
tmp = XFS_LBSIZE(mp);
- entry = &leaf->entries[0];
- for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) {
- ASSERT(be16_to_cpu(entry->nameidx) >=
- be16_to_cpu(hdr->firstused));
+ entry = xfs_attr3_leaf_entryp(leaf);
+ for (i = ichdr.count - 1; i >= 0; entry++, i--) {
+ ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
if (be16_to_cpu(entry->nameidx) < tmp)
tmp = be16_to_cpu(entry->nameidx);
}
- hdr->firstused = cpu_to_be16(tmp);
- if (!hdr->firstused) {
- hdr->firstused = cpu_to_be16(
- tmp - XFS_ATTR_LEAF_NAME_ALIGN);
- }
+ ichdr.firstused = tmp;
+ if (!ichdr.firstused)
+ ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
} else {
- hdr->holes = 1; /* mark as needing compaction */
+ ichdr.holes = 1; /* mark as needing compaction */
}
+ xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
xfs_trans_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+ XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+ xfs_attr3_leaf_hdr_size(leaf)));
/*
* Check if leaf is less than 50% full, caller may want to
* "join" the leaf with a sibling if so.
*/
- tmp = sizeof(xfs_attr_leaf_hdr_t);
- tmp += be16_to_cpu(leaf->hdr.count) * sizeof(xfs_attr_leaf_entry_t);
- tmp += be16_to_cpu(leaf->hdr.usedbytes);
- return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */
+ tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
+ ichdr.count * sizeof(xfs_attr_leaf_entry_t);
+
+ return tmp < mp->m_attr_magicpct; /* leaf is < 37% full */
}
/*
* Move all the attribute list entries from drop_leaf into save_leaf.
*/
void
-xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
- xfs_da_state_blk_t *save_blk)
+xfs_attr3_leaf_unbalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
{
- xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
- xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
- xfs_mount_t *mp;
- char *tmpbuffer;
+ struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
+ struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
+ struct xfs_attr3_icleaf_hdr drophdr;
+ struct xfs_attr3_icleaf_hdr savehdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_mount *mp = state->mp;
trace_xfs_attr_leaf_unbalance(state->args);
- /*
- * Set up environment.
- */
- mp = state->mp;
- ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC);
- ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC);
drop_leaf = drop_blk->bp->b_addr;
save_leaf = save_blk->bp->b_addr;
- ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- drop_hdr = &drop_leaf->hdr;
- save_hdr = &save_leaf->hdr;
+ xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf);
+ xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf);
+ entry = xfs_attr3_leaf_entryp(drop_leaf);
/*
* Save last hashval from dying block for later Btree fixup.
*/
- drop_blk->hashval = be32_to_cpu(
- drop_leaf->entries[be16_to_cpu(drop_leaf->hdr.count)-1].hashval);
+ drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
/*
* Check if we need a temp buffer, or can we do it in place.
* Note that we don't check "leaf" for holes because we will
* always be dropping it, toosmall() decided that for us already.
*/
- if (save_hdr->holes == 0) {
+ if (savehdr.holes == 0) {
/*
* dest leaf has no holes, so we add there. May need
* to make some room in the entry array.
*/
- if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
- xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0,
- be16_to_cpu(drop_hdr->count), mp);
+ if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+ drop_blk->bp, &drophdr)) {
+ xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+ save_leaf, &savehdr, 0,
+ drophdr.count, mp);
} else {
- xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf,
- be16_to_cpu(save_hdr->count),
- be16_to_cpu(drop_hdr->count), mp);
+ xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+ save_leaf, &savehdr,
+ savehdr.count, drophdr.count, mp);
}
} else {
/*
* Destination has holes, so we make a temporary copy
* of the leaf and add them both to that.
*/
- tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
- memset(tmpbuffer, 0, state->blocksize);
- tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer;
- tmp_hdr = &tmp_leaf->hdr;
- tmp_hdr->info = save_hdr->info; /* struct copy */
- tmp_hdr->count = 0;
- tmp_hdr->firstused = cpu_to_be16(state->blocksize);
- if (!tmp_hdr->firstused) {
- tmp_hdr->firstused = cpu_to_be16(
- state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN);
- }
- tmp_hdr->usedbytes = 0;
- if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
- xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
- be16_to_cpu(drop_hdr->count), mp);
- xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf,
- be16_to_cpu(tmp_leaf->hdr.count),
- be16_to_cpu(save_hdr->count), mp);
+ struct xfs_attr_leafblock *tmp_leaf;
+ struct xfs_attr3_icleaf_hdr tmphdr;
+
+ tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP);
+
+ /*
+ * Copy the header into the temp leaf so that all the stuff
+ * not in the incore header is present and gets copied back in
+ * once we've moved all the entries.
+ */
+ memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
+
+ memset(&tmphdr, 0, sizeof(tmphdr));
+ tmphdr.magic = savehdr.magic;
+ tmphdr.forw = savehdr.forw;
+ tmphdr.back = savehdr.back;
+ tmphdr.firstused = state->blocksize;
+
+ /* write the header to the temp buffer to initialise it */
+ xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
+
+ if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+ drop_blk->bp, &drophdr)) {
+ xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+ tmp_leaf, &tmphdr, 0,
+ drophdr.count, mp);
+ xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0,
+ tmp_leaf, &tmphdr, tmphdr.count,
+ savehdr.count, mp);
} else {
- xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
- be16_to_cpu(save_hdr->count), mp);
- xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf,
- be16_to_cpu(tmp_leaf->hdr.count),
- be16_to_cpu(drop_hdr->count), mp);
+ xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0,
+ tmp_leaf, &tmphdr, 0,
+ savehdr.count, mp);
+ xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+ tmp_leaf, &tmphdr, tmphdr.count,
+ drophdr.count, mp);
}
- memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize);
- kmem_free(tmpbuffer);
+ memcpy(save_leaf, tmp_leaf, state->blocksize);
+ savehdr = tmphdr; /* struct copy */
+ kmem_free(tmp_leaf);
}
+ xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
state->blocksize - 1);
/*
* Copy out last hashval in each block for B-tree code.
*/
- save_blk->hashval = be32_to_cpu(
- save_leaf->entries[be16_to_cpu(save_leaf->hdr.count)-1].hashval);
+ entry = xfs_attr3_leaf_entryp(save_leaf);
+ save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
}
/*========================================================================
@@ -2010,31 +2259,33 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
* Don't change the args->value unless we find the attribute.
*/
int
-xfs_attr_leaf_lookup_int(
- struct xfs_buf *bp,
- xfs_da_args_t *args)
+xfs_attr3_leaf_lookup_int(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_local_t *name_loc;
- xfs_attr_leaf_name_remote_t *name_rmt;
- int probe, span;
- xfs_dahash_t hashval;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ xfs_dahash_t hashval;
+ int probe;
+ int span;
trace_xfs_attr_leaf_lookup(args);
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT(be16_to_cpu(leaf->hdr.count)
- < (XFS_LBSIZE(args->dp->i_mount)/8));
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+ ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8);
/*
* Binary search. (note: small blocks will skip this loop)
*/
hashval = args->hashval;
- probe = span = be16_to_cpu(leaf->hdr.count) / 2;
- for (entry = &leaf->entries[probe]; span > 4;
- entry = &leaf->entries[probe]) {
+ probe = span = ichdr.count / 2;
+ for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
span /= 2;
if (be32_to_cpu(entry->hashval) < hashval)
probe += span;
@@ -2043,35 +2294,31 @@ xfs_attr_leaf_lookup_int(
else
break;
}
- ASSERT((probe >= 0) &&
- (!leaf->hdr.count
- || (probe < be16_to_cpu(leaf->hdr.count))));
- ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval));
+ ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count));
+ ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval);
/*
* Since we may have duplicate hashval's, find the first matching
* hashval in the leaf.
*/
- while ((probe > 0) && (be32_to_cpu(entry->hashval) >= hashval)) {
+ while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
entry--;
probe--;
}
- while ((probe < be16_to_cpu(leaf->hdr.count)) &&
- (be32_to_cpu(entry->hashval) < hashval)) {
+ while (probe < ichdr.count &&
+ be32_to_cpu(entry->hashval) < hashval) {
entry++;
probe++;
}
- if ((probe == be16_to_cpu(leaf->hdr.count)) ||
- (be32_to_cpu(entry->hashval) != hashval)) {
+ if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
args->index = probe;
- return(XFS_ERROR(ENOATTR));
+ return XFS_ERROR(ENOATTR);
}
/*
* Duplicate keys may be present, so search all of them for a match.
*/
- for ( ; (probe < be16_to_cpu(leaf->hdr.count)) &&
- (be32_to_cpu(entry->hashval) == hashval);
+ for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
entry++, probe++) {
/*
* GROT: Add code to remove incomplete entries.
@@ -2085,33 +2332,36 @@ xfs_attr_leaf_lookup_int(
continue;
}
if (entry->flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf, probe);
+ name_loc = xfs_attr3_leaf_name_local(leaf, probe);
if (name_loc->namelen != args->namelen)
continue;
- if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0)
+ if (memcmp(args->name, name_loc->nameval,
+ args->namelen) != 0)
continue;
if (!xfs_attr_namesp_match(args->flags, entry->flags))
continue;
args->index = probe;
- return(XFS_ERROR(EEXIST));
+ return XFS_ERROR(EEXIST);
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf, probe);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
if (name_rmt->namelen != args->namelen)
continue;
- if (memcmp(args->name, (char *)name_rmt->name,
- args->namelen) != 0)
+ if (memcmp(args->name, name_rmt->name,
+ args->namelen) != 0)
continue;
if (!xfs_attr_namesp_match(args->flags, entry->flags))
continue;
args->index = probe;
+ args->valuelen = be32_to_cpu(name_rmt->valuelen);
args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
- args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount,
- be32_to_cpu(name_rmt->valuelen));
- return(XFS_ERROR(EEXIST));
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(
+ args->dp->i_mount,
+ args->valuelen);
+ return XFS_ERROR(EEXIST);
}
}
args->index = probe;
- return(XFS_ERROR(ENOATTR));
+ return XFS_ERROR(ENOATTR);
}
/*
@@ -2119,56 +2369,57 @@ xfs_attr_leaf_lookup_int(
* list structure.
*/
int
-xfs_attr_leaf_getvalue(
- struct xfs_buf *bp,
- xfs_da_args_t *args)
+xfs_attr3_leaf_getvalue(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
{
- int valuelen;
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_local_t *name_loc;
- xfs_attr_leaf_name_remote_t *name_rmt;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ int valuelen;
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT(be16_to_cpu(leaf->hdr.count)
- < (XFS_LBSIZE(args->dp->i_mount)/8));
- ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8);
+ ASSERT(args->index < ichdr.count);
- entry = &leaf->entries[args->index];
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
if (entry->flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf, args->index);
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
ASSERT(name_loc->namelen == args->namelen);
ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
valuelen = be16_to_cpu(name_loc->valuelen);
if (args->flags & ATTR_KERNOVAL) {
args->valuelen = valuelen;
- return(0);
+ return 0;
}
if (args->valuelen < valuelen) {
args->valuelen = valuelen;
- return(XFS_ERROR(ERANGE));
+ return XFS_ERROR(ERANGE);
}
args->valuelen = valuelen;
memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
ASSERT(name_rmt->namelen == args->namelen);
ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
valuelen = be32_to_cpu(name_rmt->valuelen);
args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
- args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+ valuelen);
if (args->flags & ATTR_KERNOVAL) {
args->valuelen = valuelen;
- return(0);
+ return 0;
}
if (args->valuelen < valuelen) {
args->valuelen = valuelen;
- return(XFS_ERROR(ERANGE));
+ return XFS_ERROR(ERANGE);
}
args->valuelen = valuelen;
}
- return(0);
+ return 0;
}
/*========================================================================
@@ -2181,13 +2432,21 @@ xfs_attr_leaf_getvalue(
*/
/*ARGSUSED*/
STATIC void
-xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
- xfs_attr_leafblock_t *leaf_d, int start_d,
- int count, xfs_mount_t *mp)
+xfs_attr3_leaf_moveents(
+ struct xfs_attr_leafblock *leaf_s,
+ struct xfs_attr3_icleaf_hdr *ichdr_s,
+ int start_s,
+ struct xfs_attr_leafblock *leaf_d,
+ struct xfs_attr3_icleaf_hdr *ichdr_d,
+ int start_d,
+ int count,
+ struct xfs_mount *mp)
{
- xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
- xfs_attr_leaf_entry_t *entry_s, *entry_d;
- int desti, tmp, i;
+ struct xfs_attr_leaf_entry *entry_s;
+ struct xfs_attr_leaf_entry *entry_d;
+ int desti;
+ int tmp;
+ int i;
/*
* Check for nothing to do.
@@ -2198,45 +2457,41 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
/*
* Set up environment.
*/
- ASSERT(leaf_s->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT(leaf_d->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- hdr_s = &leaf_s->hdr;
- hdr_d = &leaf_d->hdr;
- ASSERT((be16_to_cpu(hdr_s->count) > 0) &&
- (be16_to_cpu(hdr_s->count) < (XFS_LBSIZE(mp)/8)));
- ASSERT(be16_to_cpu(hdr_s->firstused) >=
- ((be16_to_cpu(hdr_s->count)
- * sizeof(*entry_s))+sizeof(*hdr_s)));
- ASSERT(be16_to_cpu(hdr_d->count) < (XFS_LBSIZE(mp)/8));
- ASSERT(be16_to_cpu(hdr_d->firstused) >=
- ((be16_to_cpu(hdr_d->count)
- * sizeof(*entry_d))+sizeof(*hdr_d)));
-
- ASSERT(start_s < be16_to_cpu(hdr_s->count));
- ASSERT(start_d <= be16_to_cpu(hdr_d->count));
- ASSERT(count <= be16_to_cpu(hdr_s->count));
+ ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
+ ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
+ ASSERT(ichdr_s->magic == ichdr_d->magic);
+ ASSERT(ichdr_s->count > 0 && ichdr_s->count < XFS_LBSIZE(mp) / 8);
+ ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
+ + xfs_attr3_leaf_hdr_size(leaf_s));
+ ASSERT(ichdr_d->count < XFS_LBSIZE(mp) / 8);
+ ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
+ + xfs_attr3_leaf_hdr_size(leaf_d));
+
+ ASSERT(start_s < ichdr_s->count);
+ ASSERT(start_d <= ichdr_d->count);
+ ASSERT(count <= ichdr_s->count);
+
/*
* Move the entries in the destination leaf up to make a hole?
*/
- if (start_d < be16_to_cpu(hdr_d->count)) {
- tmp = be16_to_cpu(hdr_d->count) - start_d;
+ if (start_d < ichdr_d->count) {
+ tmp = ichdr_d->count - start_d;
tmp *= sizeof(xfs_attr_leaf_entry_t);
- entry_s = &leaf_d->entries[start_d];
- entry_d = &leaf_d->entries[start_d + count];
- memmove((char *)entry_d, (char *)entry_s, tmp);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
+ memmove(entry_d, entry_s, tmp);
}
/*
* Copy all entry's in the same (sorted) order,
* but allocate attribute info packed and in sequence.
*/
- entry_s = &leaf_s->entries[start_s];
- entry_d = &leaf_d->entries[start_d];
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
desti = start_d;
for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
- ASSERT(be16_to_cpu(entry_s->nameidx)
- >= be16_to_cpu(hdr_s->firstused));
+ ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
#ifdef GROT
/*
@@ -2245,36 +2500,34 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
* off for 6.2, should be revisited later.
*/
if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
- memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
- be16_add_cpu(&hdr_s->usedbytes, -tmp);
- be16_add_cpu(&hdr_s->count, -1);
+ memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+ ichdr_s->usedbytes -= tmp;
+ ichdr_s->count -= 1;
entry_d--; /* to compensate for ++ in loop hdr */
desti--;
if ((start_s + i) < offset)
result++; /* insertion index adjustment */
} else {
#endif /* GROT */
- be16_add_cpu(&hdr_d->firstused, -tmp);
+ ichdr_d->firstused -= tmp;
/* both on-disk, don't endian flip twice */
entry_d->hashval = entry_s->hashval;
- /* both on-disk, don't endian flip twice */
- entry_d->nameidx = hdr_d->firstused;
+ entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
entry_d->flags = entry_s->flags;
ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
<= XFS_LBSIZE(mp));
- memmove(xfs_attr_leaf_name(leaf_d, desti),
- xfs_attr_leaf_name(leaf_s, start_s + i), tmp);
+ memmove(xfs_attr3_leaf_name(leaf_d, desti),
+ xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
<= XFS_LBSIZE(mp));
- memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
- be16_add_cpu(&hdr_s->usedbytes, -tmp);
- be16_add_cpu(&hdr_d->usedbytes, tmp);
- be16_add_cpu(&hdr_s->count, -1);
- be16_add_cpu(&hdr_d->count, 1);
- tmp = be16_to_cpu(hdr_d->count)
- * sizeof(xfs_attr_leaf_entry_t)
- + sizeof(xfs_attr_leaf_hdr_t);
- ASSERT(be16_to_cpu(hdr_d->firstused) >= tmp);
+ memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+ ichdr_s->usedbytes -= tmp;
+ ichdr_d->usedbytes += tmp;
+ ichdr_s->count -= 1;
+ ichdr_d->count += 1;
+ tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf_d);
+ ASSERT(ichdr_d->firstused >= tmp);
#ifdef GROT
}
#endif /* GROT */
@@ -2283,71 +2536,40 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
/*
* Zero out the entries we just copied.
*/
- if (start_s == be16_to_cpu(hdr_s->count)) {
+ if (start_s == ichdr_s->count) {
tmp = count * sizeof(xfs_attr_leaf_entry_t);
- entry_s = &leaf_s->entries[start_s];
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
ASSERT(((char *)entry_s + tmp) <=
((char *)leaf_s + XFS_LBSIZE(mp)));
- memset((char *)entry_s, 0, tmp);
+ memset(entry_s, 0, tmp);
} else {
/*
* Move the remaining entries down to fill the hole,
* then zero the entries at the top.
*/
- tmp = be16_to_cpu(hdr_s->count) - count;
- tmp *= sizeof(xfs_attr_leaf_entry_t);
- entry_s = &leaf_s->entries[start_s + count];
- entry_d = &leaf_s->entries[start_s];
- memmove((char *)entry_d, (char *)entry_s, tmp);
+ tmp = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+ memmove(entry_d, entry_s, tmp);
tmp = count * sizeof(xfs_attr_leaf_entry_t);
- entry_s = &leaf_s->entries[be16_to_cpu(hdr_s->count)];
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
ASSERT(((char *)entry_s + tmp) <=
((char *)leaf_s + XFS_LBSIZE(mp)));
- memset((char *)entry_s, 0, tmp);
+ memset(entry_s, 0, tmp);
}
/*
* Fill in the freemap information
*/
- hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t));
- be16_add_cpu(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) *
- sizeof(xfs_attr_leaf_entry_t));
- hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused)
- - be16_to_cpu(hdr_d->freemap[0].base));
- hdr_d->freemap[1].base = 0;
- hdr_d->freemap[2].base = 0;
- hdr_d->freemap[1].size = 0;
- hdr_d->freemap[2].size = 0;
- hdr_s->holes = 1; /* leaf may not be compact */
-}
-
-/*
- * Compare two leaf blocks "order".
- * Return 0 unless leaf2 should go before leaf1.
- */
-int
-xfs_attr_leaf_order(
- struct xfs_buf *leaf1_bp,
- struct xfs_buf *leaf2_bp)
-{
- xfs_attr_leafblock_t *leaf1, *leaf2;
-
- leaf1 = leaf1_bp->b_addr;
- leaf2 = leaf2_bp->b_addr;
- ASSERT((leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) &&
- (leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)));
- if ((be16_to_cpu(leaf1->hdr.count) > 0) &&
- (be16_to_cpu(leaf2->hdr.count) > 0) &&
- ((be32_to_cpu(leaf2->entries[0].hashval) <
- be32_to_cpu(leaf1->entries[0].hashval)) ||
- (be32_to_cpu(leaf2->entries[
- be16_to_cpu(leaf2->hdr.count)-1].hashval) <
- be32_to_cpu(leaf1->entries[
- be16_to_cpu(leaf1->hdr.count)-1].hashval)))) {
- return(1);
- }
- return(0);
+ ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
+ ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
+ ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
+ ichdr_d->freemap[1].base = 0;
+ ichdr_d->freemap[2].base = 0;
+ ichdr_d->freemap[1].size = 0;
+ ichdr_d->freemap[2].size = 0;
+ ichdr_s->holes = 1; /* leaf may not be compact */
}
/*
@@ -2358,15 +2580,16 @@ xfs_attr_leaf_lasthash(
struct xfs_buf *bp,
int *count)
{
- xfs_attr_leafblock_t *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entries;
- leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr);
+ entries = xfs_attr3_leaf_entryp(bp->b_addr);
if (count)
- *count = be16_to_cpu(leaf->hdr.count);
- if (!leaf->hdr.count)
- return(0);
- return be32_to_cpu(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval);
+ *count = ichdr.count;
+ if (!ichdr.count)
+ return 0;
+ return be32_to_cpu(entries[ichdr.count - 1].hashval);
}
/*
@@ -2376,20 +2599,21 @@ xfs_attr_leaf_lasthash(
STATIC int
xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
{
+ struct xfs_attr_leaf_entry *entries;
xfs_attr_leaf_name_local_t *name_loc;
xfs_attr_leaf_name_remote_t *name_rmt;
int size;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf, index);
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if (entries[index].flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf, index);
size = xfs_attr_leaf_entsize_local(name_loc->namelen,
be16_to_cpu(name_loc->valuelen));
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf, index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, index);
size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
}
- return(size);
+ return size;
}
/*
@@ -2414,35 +2638,40 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
*local = 0;
}
}
- return(size);
+ return size;
}
/*
* Copy out attribute list entries for attr_list(), for leaf attribute lists.
*/
int
-xfs_attr_leaf_list_int(
- struct xfs_buf *bp,
- xfs_attr_list_context_t *context)
+xfs_attr3_leaf_list_int(
+ struct xfs_buf *bp,
+ struct xfs_attr_list_context *context)
{
- attrlist_cursor_kern_t *cursor;
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- int retval, i;
+ struct attrlist_cursor_kern *cursor;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_attr_leaf_entry *entry;
+ int retval;
+ int i;
+
+ trace_xfs_attr_list_leaf(context);
- ASSERT(bp != NULL);
leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+
cursor = context->cursor;
cursor->initted = 1;
- trace_xfs_attr_list_leaf(context);
-
/*
* Re-find our place in the leaf block if this is a new syscall.
*/
if (context->resynch) {
- entry = &leaf->entries[0];
- for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+ entry = &entries[0];
+ for (i = 0; i < ichdr.count; entry++, i++) {
if (be32_to_cpu(entry->hashval) == cursor->hashval) {
if (cursor->offset == context->dupcnt) {
context->dupcnt = 0;
@@ -2455,12 +2684,12 @@ xfs_attr_leaf_list_int(
break;
}
}
- if (i == be16_to_cpu(leaf->hdr.count)) {
+ if (i == ichdr.count) {
trace_xfs_attr_list_notfound(context);
- return(0);
+ return 0;
}
} else {
- entry = &leaf->entries[0];
+ entry = &entries[0];
i = 0;
}
context->resynch = 0;
@@ -2469,7 +2698,7 @@ xfs_attr_leaf_list_int(
* We have found our place, start copying out the new attributes.
*/
retval = 0;
- for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) {
+ for (; i < ichdr.count; entry++, i++) {
if (be32_to_cpu(entry->hashval) != cursor->hashval) {
cursor->hashval = be32_to_cpu(entry->hashval);
cursor->offset = 0;
@@ -2480,7 +2709,7 @@ xfs_attr_leaf_list_int(
if (entry->flags & XFS_ATTR_LOCAL) {
xfs_attr_leaf_name_local_t *name_loc =
- xfs_attr_leaf_name_local(leaf, i);
+ xfs_attr3_leaf_name_local(leaf, i);
retval = context->put_listent(context,
entry->flags,
@@ -2492,7 +2721,7 @@ xfs_attr_leaf_list_int(
return retval;
} else {
xfs_attr_leaf_name_remote_t *name_rmt =
- xfs_attr_leaf_name_remote(leaf, i);
+ xfs_attr3_leaf_name_remote(leaf, i);
int valuelen = be32_to_cpu(name_rmt->valuelen);
@@ -2505,7 +2734,8 @@ xfs_attr_leaf_list_int(
args.valuelen = valuelen;
args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
- args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
+ args.rmtblkcnt = xfs_attr3_rmt_blocks(
+ args.dp->i_mount, valuelen);
retval = xfs_attr_rmtval_get(&args);
if (retval)
return retval;
@@ -2532,7 +2762,7 @@ xfs_attr_leaf_list_int(
cursor->offset++;
}
trace_xfs_attr_list_leaf_end(context);
- return(retval);
+ return retval;
}
@@ -2544,14 +2774,16 @@ xfs_attr_leaf_list_int(
* Clear the INCOMPLETE flag on an entry in a leaf block.
*/
int
-xfs_attr_leaf_clearflag(xfs_da_args_t *args)
+xfs_attr3_leaf_clearflag(
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_remote_t *name_rmt;
- struct xfs_buf *bp;
- int error;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp;
+ int error;
#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr;
xfs_attr_leaf_name_local_t *name_loc;
int namelen;
char *name;
@@ -2561,23 +2793,25 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
/*
* Set up the operation.
*/
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
return(error);
leaf = bp->b_addr;
- ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
- ASSERT(args->index >= 0);
- entry = &leaf->entries[ args->index ];
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ ASSERT(args->index < ichdr.count);
+ ASSERT(args->index >= 0);
+
if (entry->flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf, args->index);
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
namelen = name_loc->namelen;
name = (char *)name_loc->nameval;
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
namelen = name_rmt->namelen;
name = (char *)name_rmt->name;
}
@@ -2592,7 +2826,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
if (args->rmtblkno) {
ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
- name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
name_rmt->valuelen = cpu_to_be32(args->valuelen);
xfs_trans_log_buf(args->trans, bp,
@@ -2609,34 +2843,41 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
* Set the INCOMPLETE flag on an entry in a leaf block.
*/
int
-xfs_attr_leaf_setflag(xfs_da_args_t *args)
+xfs_attr3_leaf_setflag(
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_remote_t *name_rmt;
- struct xfs_buf *bp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp;
int error;
+#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr;
+#endif
trace_xfs_attr_leaf_setflag(args);
/*
* Set up the operation.
*/
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
return(error);
leaf = bp->b_addr;
- ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
+#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ ASSERT(args->index < ichdr.count);
ASSERT(args->index >= 0);
- entry = &leaf->entries[ args->index ];
+#endif
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
entry->flags |= XFS_ATTR_INCOMPLETE;
xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
- name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
name_rmt->valueblk = 0;
name_rmt->valuelen = 0;
xfs_trans_log_buf(args->trans, bp,
@@ -2657,14 +2898,20 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
* Note that they could be in different blocks, or in the same block.
*/
int
-xfs_attr_leaf_flipflags(xfs_da_args_t *args)
+xfs_attr3_leaf_flipflags(
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf1, *leaf2;
- xfs_attr_leaf_entry_t *entry1, *entry2;
- xfs_attr_leaf_name_remote_t *name_rmt;
- struct xfs_buf *bp1, *bp2;
+ struct xfs_attr_leafblock *leaf1;
+ struct xfs_attr_leafblock *leaf2;
+ struct xfs_attr_leaf_entry *entry1;
+ struct xfs_attr_leaf_entry *entry2;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp1;
+ struct xfs_buf *bp2;
int error;
#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
xfs_attr_leaf_name_local_t *name_loc;
int namelen1, namelen2;
char *name1, *name2;
@@ -2675,7 +2922,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
/*
* Read the block containing the "old" attr
*/
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
if (error)
return error;
@@ -2683,7 +2930,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
* Read the block containing the "new" attr, if it is different
*/
if (args->blkno2 != args->blkno) {
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2,
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
-1, &bp2);
if (error)
return error;
@@ -2692,31 +2939,35 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
}
leaf1 = bp1->b_addr;
- ASSERT(args->index < be16_to_cpu(leaf1->hdr.count));
- ASSERT(args->index >= 0);
- entry1 = &leaf1->entries[ args->index ];
+ entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
leaf2 = bp2->b_addr;
- ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count));
- ASSERT(args->index2 >= 0);
- entry2 = &leaf2->entries[ args->index2 ];
+ entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+ ASSERT(args->index < ichdr1.count);
+ ASSERT(args->index >= 0);
+
+ xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+ ASSERT(args->index2 < ichdr2.count);
+ ASSERT(args->index2 >= 0);
+
if (entry1->flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf1, args->index);
+ name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
namelen1 = name_loc->namelen;
name1 = (char *)name_loc->nameval;
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
namelen1 = name_rmt->namelen;
name1 = (char *)name_rmt->name;
}
if (entry2->flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf2, args->index2);
+ name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
namelen2 = name_loc->namelen;
name2 = (char *)name_loc->nameval;
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
namelen2 = name_rmt->namelen;
name2 = (char *)name_rmt->name;
}
@@ -2733,7 +2984,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
if (args->rmtblkno) {
ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
- name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
name_rmt->valuelen = cpu_to_be32(args->valuelen);
xfs_trans_log_buf(args->trans, bp1,
@@ -2744,7 +2995,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
xfs_trans_log_buf(args->trans, bp2,
XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
- name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
name_rmt->valueblk = 0;
name_rmt->valuelen = 0;
xfs_trans_log_buf(args->trans, bp2,
@@ -2756,7 +3007,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
*/
error = xfs_trans_roll(&args->trans, args->dp);
- return(error);
+ return error;
}
/*========================================================================
@@ -2768,12 +3019,14 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
* We're doing a depth-first traversal in order to invalidate everything.
*/
int
-xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
+xfs_attr3_root_inactive(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp)
{
- xfs_da_blkinfo_t *info;
- xfs_daddr_t blkno;
- struct xfs_buf *bp;
- int error;
+ struct xfs_da_blkinfo *info;
+ struct xfs_buf *bp;
+ xfs_daddr_t blkno;
+ int error;
/*
* Read block 0 to see what we have to work with.
@@ -2781,40 +3034,46 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
* the extents in reverse order the extent containing
* block 0 must still be there.
*/
- error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+ error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
if (error)
- return(error);
- blkno = XFS_BUF_ADDR(bp);
+ return error;
+ blkno = bp->b_bn;
/*
* Invalidate the tree, even if the "tree" is only a single leaf block.
* This is a depth-first traversal!
*/
info = bp->b_addr;
- if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) {
- error = xfs_attr_node_inactive(trans, dp, bp, 1);
- } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) {
- error = xfs_attr_leaf_inactive(trans, dp, bp);
- } else {
+ switch (info->magic) {
+ case cpu_to_be16(XFS_DA_NODE_MAGIC):
+ case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+ error = xfs_attr3_node_inactive(trans, dp, bp, 1);
+ break;
+ case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+ case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+ error = xfs_attr3_leaf_inactive(trans, dp, bp);
+ break;
+ default:
error = XFS_ERROR(EIO);
xfs_trans_brelse(*trans, bp);
+ break;
}
if (error)
- return(error);
+ return error;
/*
* Invalidate the incore copy of the root block.
*/
error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
if (error)
- return(error);
+ return error;
xfs_trans_binval(*trans, bp); /* remove from cache */
/*
* Commit the invalidate and start the next transaction.
*/
error = xfs_trans_roll(trans, dp);
- return (error);
+ return error;
}
/*
@@ -2822,7 +3081,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
* We're doing a depth-first traversal in order to invalidate everything.
*/
STATIC int
-xfs_attr_node_inactive(
+xfs_attr3_node_inactive(
struct xfs_trans **trans,
struct xfs_inode *dp,
struct xfs_buf *bp,
@@ -2832,26 +3091,28 @@ xfs_attr_node_inactive(
xfs_da_intnode_t *node;
xfs_dablk_t child_fsb;
xfs_daddr_t parent_blkno, child_blkno;
- int error, count, i;
+ int error, i;
struct xfs_buf *child_bp;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr ichdr;
/*
* Since this code is recursive (gasp!) we must protect ourselves.
*/
if (level > XFS_DA_NODE_MAXDEPTH) {
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
- return(XFS_ERROR(EIO));
+ return XFS_ERROR(EIO);
}
node = bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- parent_blkno = XFS_BUF_ADDR(bp); /* save for re-read later */
- count = be16_to_cpu(node->hdr.count);
- if (!count) {
+ xfs_da3_node_hdr_from_disk(&ichdr, node);
+ parent_blkno = bp->b_bn;
+ if (!ichdr.count) {
xfs_trans_brelse(*trans, bp);
- return(0);
+ return 0;
}
- child_fsb = be32_to_cpu(node->btree[0].before);
+ btree = xfs_da3_node_tree_p(node);
+ child_fsb = be32_to_cpu(btree[0].before);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
/*
@@ -2859,14 +3120,14 @@ xfs_attr_node_inactive(
* over the leaves removing all of them. If this is higher up
* in the tree, recurse downward.
*/
- for (i = 0; i < count; i++) {
+ for (i = 0; i < ichdr.count; i++) {
/*
* Read the subsidiary block to see what we have to work with.
* Don't do this in a transaction. This is a depth-first
* traversal of the tree so we may deal with many blocks
* before we come back to this one.
*/
- error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp,
+ error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
XFS_ATTR_FORK);
if (error)
return(error);
@@ -2878,18 +3139,24 @@ xfs_attr_node_inactive(
* Invalidate the subtree, however we have to.
*/
info = child_bp->b_addr;
- if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) {
- error = xfs_attr_node_inactive(trans, dp,
- child_bp, level+1);
- } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) {
- error = xfs_attr_leaf_inactive(trans, dp,
- child_bp);
- } else {
+ switch (info->magic) {
+ case cpu_to_be16(XFS_DA_NODE_MAGIC):
+ case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+ error = xfs_attr3_node_inactive(trans, dp,
+ child_bp, level + 1);
+ break;
+ case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+ case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+ error = xfs_attr3_leaf_inactive(trans, dp,
+ child_bp);
+ break;
+ default:
error = XFS_ERROR(EIO);
xfs_trans_brelse(*trans, child_bp);
+ break;
}
if (error)
- return(error);
+ return error;
/*
* Remove the subsidiary block from the cache
@@ -2898,7 +3165,7 @@ xfs_attr_node_inactive(
error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
&child_bp, XFS_ATTR_FORK);
if (error)
- return(error);
+ return error;
xfs_trans_binval(*trans, child_bp);
}
@@ -2906,12 +3173,12 @@ xfs_attr_node_inactive(
* If we're not done, re-read the parent to get the next
* child block number.
*/
- if ((i+1) < count) {
- error = xfs_da_node_read(*trans, dp, 0, parent_blkno,
+ if (i + 1 < ichdr.count) {
+ error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
&bp, XFS_ATTR_FORK);
if (error)
- return(error);
- child_fsb = be32_to_cpu(node->btree[i+1].before);
+ return error;
+ child_fsb = be32_to_cpu(btree[i + 1].before);
xfs_trans_brelse(*trans, bp);
}
/*
@@ -2919,10 +3186,10 @@ xfs_attr_node_inactive(
*/
error = xfs_trans_roll(trans, dp);
if (error)
- return (error);
+ return error;
}
- return(0);
+ return 0;
}
/*
@@ -2932,29 +3199,35 @@ xfs_attr_node_inactive(
* caught holding something that the logging code wants to flush to disk.
*/
STATIC int
-xfs_attr_leaf_inactive(
- struct xfs_trans **trans,
- struct xfs_inode *dp,
- struct xfs_buf *bp)
+xfs_attr3_leaf_inactive(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_remote_t *name_rmt;
- xfs_attr_inactive_list_t *list, *lp;
- int error, count, size, tmp, i;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_attr_inactive_list *list;
+ struct xfs_attr_inactive_list *lp;
+ int error;
+ int count;
+ int size;
+ int tmp;
+ int i;
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
/*
* Count the number of "remote" value extents.
*/
count = 0;
- entry = &leaf->entries[0];
- for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+ entry = xfs_attr3_leaf_entryp(leaf);
+ for (i = 0; i < ichdr.count; entry++, i++) {
if (be16_to_cpu(entry->nameidx) &&
((entry->flags & XFS_ATTR_LOCAL) == 0)) {
- name_rmt = xfs_attr_leaf_name_remote(leaf, i);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
if (name_rmt->valueblk)
count++;
}
@@ -2965,27 +3238,27 @@ xfs_attr_leaf_inactive(
*/
if (count == 0) {
xfs_trans_brelse(*trans, bp);
- return(0);
+ return 0;
}
/*
* Allocate storage for a list of all the "remote" value extents.
*/
size = count * sizeof(xfs_attr_inactive_list_t);
- list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP);
+ list = kmem_alloc(size, KM_SLEEP);
/*
* Identify each of the "remote" value extents.
*/
lp = list;
- entry = &leaf->entries[0];
- for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+ entry = xfs_attr3_leaf_entryp(leaf);
+ for (i = 0; i < ichdr.count; entry++, i++) {
if (be16_to_cpu(entry->nameidx) &&
((entry->flags & XFS_ATTR_LOCAL) == 0)) {
- name_rmt = xfs_attr_leaf_name_remote(leaf, i);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
if (name_rmt->valueblk) {
lp->valueblk = be32_to_cpu(name_rmt->valueblk);
- lp->valuelen = XFS_B_TO_FSB(dp->i_mount,
+ lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
be32_to_cpu(name_rmt->valuelen));
lp++;
}
@@ -2998,15 +3271,15 @@ xfs_attr_leaf_inactive(
*/
error = 0;
for (lp = list, i = 0; i < count; i++, lp++) {
- tmp = xfs_attr_leaf_freextent(trans, dp,
+ tmp = xfs_attr3_leaf_freextent(trans, dp,
lp->valueblk, lp->valuelen);
if (error == 0)
error = tmp; /* save only the 1st errno */
}
- kmem_free((xfs_caddr_t)list);
- return(error);
+ kmem_free(list);
+ return error;
}
/*
@@ -3014,14 +3287,20 @@ xfs_attr_leaf_inactive(
* invalidate any buffers that are incore/in transactions.
*/
STATIC int
-xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
- xfs_dablk_t blkno, int blkcnt)
+xfs_attr3_leaf_freextent(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ xfs_dablk_t blkno,
+ int blkcnt)
{
- xfs_bmbt_irec_t map;
- xfs_dablk_t tblkno;
- int tblkcnt, dblkcnt, nmap, error;
- xfs_daddr_t dblkno;
- xfs_buf_t *bp;
+ struct xfs_bmbt_irec map;
+ struct xfs_buf *bp;
+ xfs_dablk_t tblkno;
+ xfs_daddr_t dblkno;
+ int tblkcnt;
+ int dblkcnt;
+ int nmap;
+ int error;
/*
* Roll through the "value", invalidating the attribute value's
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 77de139a58f0..444a7704596c 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -89,7 +90,7 @@ typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */
typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
__be32 hashval; /* hash value of name */
- __be16 nameidx; /* index into buffer of name/value */
+ __be16 nameidx; /* index into buffer of name/value */
__u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
__u8 pad2; /* unused pad byte */
} xfs_attr_leaf_entry_t;
@@ -115,6 +116,55 @@ typedef struct xfs_attr_leafblock {
} xfs_attr_leafblock_t;
/*
+ * CRC enabled leaf structures. Called "version 3" structures to match the
+ * version number of the directory and dablk structures for this feature, and
+ * attr2 is already taken by the variable inode attribute fork size feature.
+ */
+struct xfs_attr3_leaf_hdr {
+ struct xfs_da3_blkinfo info;
+ __be16 count;
+ __be16 usedbytes;
+ __be16 firstused;
+ __u8 holes;
+ __u8 pad1;
+ struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
+ __be32 pad2; /* 64 bit alignment */
+};
+
+#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
+
+struct xfs_attr3_leafblock {
+ struct xfs_attr3_leaf_hdr hdr;
+ struct xfs_attr_leaf_entry entries[1];
+
+ /*
+ * The rest of the block contains the following structures after the
+ * leaf entries, growing from the bottom up. The variables are never
+ * referenced; the locations are accessed purely from helper functions.
+ *
+ * struct xfs_attr_leaf_name_local
+ * struct xfs_attr_leaf_name_remote
+ */
+};
+
+/*
+ * incore, neutral version of the attribute leaf header
+ */
+struct xfs_attr3_icleaf_hdr {
+ __uint32_t forw;
+ __uint32_t back;
+ __uint16_t magic;
+ __uint16_t count;
+ __uint16_t usedbytes;
+ __uint16_t firstused;
+ __u8 holes;
+ struct {
+ __uint16_t base;
+ __uint16_t size;
+ } freemap[XFS_ATTR_LEAF_MAPSIZE];
+};
+
+/*
* Flags used in the leaf_entry[i].flags field.
* NOTE: the INCOMPLETE bit must not collide with the flags bits specified
* on the system call, they are "or"ed together for various operations.
@@ -147,26 +197,43 @@ typedef struct xfs_attr_leafblock {
*/
#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
+static inline int
+xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
+{
+ if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return sizeof(struct xfs_attr3_leaf_hdr);
+ return sizeof(struct xfs_attr_leaf_hdr);
+}
+
+static inline struct xfs_attr_leaf_entry *
+xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
+{
+ if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
+ return &leafp->entries[0];
+}
+
/*
* Cast typed pointers for "local" and "remote" name/value structs.
*/
-static inline xfs_attr_leaf_name_remote_t *
-xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+static inline char *
+xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
{
- return (xfs_attr_leaf_name_remote_t *)
- &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
+ struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
+
+ return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
}
-static inline xfs_attr_leaf_name_local_t *
-xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+static inline xfs_attr_leaf_name_remote_t *
+xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
{
- return (xfs_attr_leaf_name_local_t *)
- &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
+ return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
}
-static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+static inline xfs_attr_leaf_name_local_t *
+xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
{
- return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
+ return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
}
/*
@@ -221,37 +288,37 @@ int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
/*
* Internal routines when attribute fork size == XFS_LBSIZE(mp).
*/
-int xfs_attr_leaf_to_node(struct xfs_da_args *args);
-int xfs_attr_leaf_to_shortform(struct xfs_buf *bp,
+int xfs_attr3_leaf_to_node(struct xfs_da_args *args);
+int xfs_attr3_leaf_to_shortform(struct xfs_buf *bp,
struct xfs_da_args *args, int forkoff);
-int xfs_attr_leaf_clearflag(struct xfs_da_args *args);
-int xfs_attr_leaf_setflag(struct xfs_da_args *args);
-int xfs_attr_leaf_flipflags(xfs_da_args_t *args);
+int xfs_attr3_leaf_clearflag(struct xfs_da_args *args);
+int xfs_attr3_leaf_setflag(struct xfs_da_args *args);
+int xfs_attr3_leaf_flipflags(struct xfs_da_args *args);
/*
* Routines used for growing the Btree.
*/
-int xfs_attr_leaf_split(struct xfs_da_state *state,
+int xfs_attr3_leaf_split(struct xfs_da_state *state,
struct xfs_da_state_blk *oldblk,
struct xfs_da_state_blk *newblk);
-int xfs_attr_leaf_lookup_int(struct xfs_buf *leaf,
+int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf,
struct xfs_da_args *args);
-int xfs_attr_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
-int xfs_attr_leaf_add(struct xfs_buf *leaf_buffer,
+int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
+int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
-int xfs_attr_leaf_remove(struct xfs_buf *leaf_buffer,
+int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
-int xfs_attr_leaf_list_int(struct xfs_buf *bp,
+int xfs_attr3_leaf_list_int(struct xfs_buf *bp,
struct xfs_attr_list_context *context);
/*
* Routines used for shrinking the Btree.
*/
-int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval);
-void xfs_attr_leaf_unbalance(struct xfs_da_state *state,
+int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
struct xfs_da_state_blk *drop_blk,
struct xfs_da_state_blk *save_blk);
-int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
+int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
/*
* Utility routines.
@@ -261,10 +328,12 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
struct xfs_buf *leaf2_bp);
int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
int *local);
-int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
struct xfs_buf **bpp);
+void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
+ struct xfs_attr_leafblock *from);
-extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
new file mode 100644
index 000000000000..ef6b0c124528
--- /dev/null
+++ b/fs/xfs/xfs_attr_remote.c
@@ -0,0 +1,631 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+
+#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
+
+/*
+ * Return the number of filesystem blocks needed to store a remote attribute
+ * value of attrlen bytes.  On CRC enabled filesystems the usable space in each
+ * block is reduced by the remote attribute header, so the length is rounded up
+ * against that space; otherwise it is a plain byte to FSB conversion.
+ */
+int
+xfs_attr3_rmt_blocks(
+	struct xfs_mount	*mp,
+	int			attrlen)
+{
+	int			space;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return XFS_B_TO_FSB(mp, attrlen);
+
+	space = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+	return (attrlen + space - 1) / space;
+}
+
+/*
+ * Remote attribute header checking is done in two stages.  The buffer verifier
+ * handles the CRC, location and bounds checks; this unpacking-time check
+ * compares the header against the attribute parameters the caller expects
+ * (disk address, byte offset, byte count and owning inode).
+ *
+ * Returns true only when all four header fields match.
+ */
+static bool
+xfs_attr3_rmt_hdr_ok(
+	struct xfs_mount	*mp,
+	void			*ptr,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	xfs_daddr_t		bno)
+{
+	struct xfs_attr3_rmt_hdr *hdr = ptr;
+
+	return bno == be64_to_cpu(hdr->rm_blkno) &&
+	       offset == be32_to_cpu(hdr->rm_offset) &&
+	       size == be32_to_cpu(hdr->rm_bytes) &&
+	       ino == be64_to_cpu(hdr->rm_owner);
+}
+
+/*
+ * Structural check of one remote attribute block header.
+ *
+ * ptr points at the header of a single filesystem block, fsbsize is the size
+ * of that block in bytes and bno is the disk address the block was read from
+ * or is being written to.  Returns true if the header is self-consistent:
+ * correct magic and filesystem UUID, matching block number, a byte count that
+ * fits in the block after the header, an extent that does not run past the
+ * maximum attribute value size, and a non-zero owner.
+ */
+static bool
+xfs_attr3_rmt_verify(
+	struct xfs_mount	*mp,
+	void			*ptr,
+	int			fsbsize,
+	xfs_daddr_t		bno)
+{
+	struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return false;
+	if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+		return false;
+	if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	if (be64_to_cpu(rmt->rm_blkno) != bno)
+		return false;
+	if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
+		return false;
+	/*
+	 * The last block of a maximally sized value ends exactly at
+	 * XATTR_SIZE_MAX, so only a block that runs past the limit is bad.
+	 */
+	if (be32_to_cpu(rmt->rm_offset) +
+				be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
+		return false;
+	if (rmt->rm_owner == 0)
+		return false;
+
+	return true;
+}
+
+/*
+ * Read verifier: walk the buffer one filesystem block at a time, checking the
+ * checksum and the header of every block.  The first failure marks the whole
+ * buffer EFSCORRUPTED.  Buffers on non-CRC filesystems are not verified.
+ */
+static void
+xfs_attr3_rmt_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	char		*blk = bp->b_addr;
+	xfs_daddr_t	daddr = bp->b_bn;
+	int		remaining;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	remaining = BBTOB(bp->b_length);
+	ASSERT(remaining >= XFS_LBSIZE(mp));
+
+	for (; remaining > 0; remaining -= XFS_LBSIZE(mp)) {
+		if (!xfs_verify_cksum(blk, XFS_LBSIZE(mp),
+				      XFS_ATTR3_RMT_CRC_OFF) ||
+		    !xfs_attr3_rmt_verify(mp, blk, XFS_LBSIZE(mp), daddr)) {
+			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+					     bp->b_addr);
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			return;
+		}
+		blk += XFS_LBSIZE(mp);
+		daddr += mp->m_bsize;
+	}
+
+	/* the buffer must be a whole number of filesystem blocks */
+	ASSERT(remaining == 0);
+}
+
+/*
+ * Write verifier: for every filesystem block in the buffer, check the header,
+ * stamp the LSN of the attached buffer log item (if there is one) and then
+ * recompute the block checksum.  A bad header marks the buffer EFSCORRUPTED
+ * and stops processing.  Buffers on non-CRC filesystems are left untouched.
+ */
+static void
+xfs_attr3_rmt_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	char			*blk = bp->b_addr;
+	xfs_daddr_t		daddr = bp->b_bn;
+	int			remaining;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	remaining = BBTOB(bp->b_length);
+	ASSERT(remaining >= XFS_LBSIZE(mp));
+
+	for (; remaining > 0; remaining -= XFS_LBSIZE(mp)) {
+		struct xfs_attr3_rmt_hdr *hdr;
+
+		if (!xfs_attr3_rmt_verify(mp, blk, XFS_LBSIZE(mp), daddr)) {
+			XFS_CORRUPTION_ERROR(__func__,
+				XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			return;
+		}
+
+		/* the LSN must be in place before the checksum is computed */
+		if (bip) {
+			hdr = (struct xfs_attr3_rmt_hdr *)blk;
+			hdr->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+		}
+		xfs_update_cksum(blk, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF);
+
+		blk += XFS_LBSIZE(mp);
+		daddr += mp->m_bsize;
+	}
+	ASSERT(remaining == 0);
+}
+
+/* Buffer verifier callbacks for remote attribute value buffers. */
+const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+ .verify_read = xfs_attr3_rmt_read_verify,
+ .verify_write = xfs_attr3_rmt_write_verify,
+};
+
+/*
+ * Initialise the remote attribute header at ptr for one filesystem block.
+ *
+ * Returns the number of bytes the header occupies so the caller knows where
+ * the attribute data starts: sizeof(struct xfs_attr3_rmt_hdr) on CRC enabled
+ * filesystems, 0 (and nothing written) otherwise.  rm_crc and rm_lsn are not
+ * set here.
+ */
+STATIC int
+xfs_attr3_rmt_hdr_set(
+	struct xfs_mount	*mp,
+	void			*ptr,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	xfs_daddr_t		bno)
+{
+	struct xfs_attr3_rmt_hdr *hdr = ptr;
+	int			hdr_len = 0;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		/* identity of the block */
+		hdr->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
+		uuid_copy(&hdr->rm_uuid, &mp->m_sb.sb_uuid);
+		hdr->rm_owner = cpu_to_be64(ino);
+		hdr->rm_blkno = cpu_to_be64(bno);
+
+		/* which part of the value this block carries */
+		hdr->rm_offset = cpu_to_be32(offset);
+		hdr->rm_bytes = cpu_to_be32(size);
+
+		hdr_len = sizeof(struct xfs_attr3_rmt_hdr);
+	}
+
+	return hdr_len;
+}
+
+/*
+ * Helper functions to copy attribute data in and out of the on-disk extents.
+ *
+ * xfs_attr_rmtval_copyout() copies attribute data from the buffer bp to *dst,
+ * one filesystem block at a time.  On CRC enabled filesystems each block's
+ * header is checked against the expected owner, offset, length and disk
+ * address first, and the data is taken from just after the header.
+ *
+ * *offset, *valuelen and *dst are advanced by the number of bytes copied so
+ * the caller can continue with the next extent.  Returns 0 on success or
+ * EFSCORRUPTED if a block header does not match.
+ */
+STATIC int
+xfs_attr_rmtval_copyout(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	xfs_ino_t	ino,
+	int		*offset,
+	int		*valuelen,
+	char		**dst)
+{
+	char		*src = bp->b_addr;
+	xfs_daddr_t	bno = bp->b_bn;
+	int		len = BBTOB(bp->b_length);
+
+	ASSERT(len >= XFS_LBSIZE(mp));
+
+	while (len > 0 && *valuelen > 0) {
+		int hdr_size = 0;
+		int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
+
+		/* the last block may be only partially filled */
+		byte_cnt = min_t(int, *valuelen, byte_cnt);
+
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset,
+						  byte_cnt, bno)) {
+				xfs_alert(mp,
+"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/0x%x/0x%llx)",
+					bno, *offset, byte_cnt, ino);
+				return EFSCORRUPTED;
+			}
+			hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
+		}
+
+		memcpy(*dst, src + hdr_size, byte_cnt);
+
+		/* roll buffer forwards */
+		len -= XFS_LBSIZE(mp);
+		src += XFS_LBSIZE(mp);
+		bno += mp->m_bsize;
+
+		/* roll attribute data forwards */
+		*valuelen -= byte_cnt;
+		*dst += byte_cnt;
+		*offset += byte_cnt;
+	}
+	return 0;
+}
+
+/*
+ * Copy attribute data from *src into the buffer bp, one filesystem block at a
+ * time.  Each block gets a header written by xfs_attr3_rmt_hdr_set() (which
+ * writes nothing and returns 0 on non-CRC filesystems) followed by the data;
+ * any unused tail of the block is zeroed.
+ *
+ * *offset, *valuelen and *src are advanced by the number of bytes copied.
+ */
+STATIC void
+xfs_attr_rmtval_copyin(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	xfs_ino_t	ino,
+	int		*offset,
+	int		*valuelen,
+	char		**src)
+{
+	char		*blk = bp->b_addr;
+	xfs_daddr_t	daddr = bp->b_bn;
+	int		remaining = BBTOB(bp->b_length);
+
+	ASSERT(remaining >= XFS_LBSIZE(mp));
+
+	for (; remaining > 0 && *valuelen > 0;
+	     remaining -= XFS_LBSIZE(mp)) {
+		int	payload = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
+		int	hdr_len;
+
+		payload = min(*valuelen, payload);
+		hdr_len = xfs_attr3_rmt_hdr_set(mp, blk, ino, *offset,
+						payload, daddr);
+
+		memcpy(blk + hdr_len, *src, payload);
+
+		/*
+		 * A block that is not completely filled must be the final
+		 * block of both the value and the buffer; clear its tail.
+		 */
+		if (payload + hdr_len < XFS_LBSIZE(mp)) {
+			ASSERT(*valuelen - payload == 0);
+			ASSERT(remaining == XFS_LBSIZE(mp));
+			memset(blk + hdr_len + payload, 0,
+					XFS_LBSIZE(mp) - hdr_len - payload);
+		}
+
+		/* advance within the buffer */
+		blk += XFS_LBSIZE(mp);
+		daddr += mp->m_bsize;
+
+		/* advance within the attribute value */
+		*valuelen -= payload;
+		*src += payload;
+		*offset += payload;
+	}
+}
+
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
+ *
+ * The value is located through args->rmtblkno/args->rmtblkcnt in the attribute
+ * fork and copied into args->value (args->valuelen bytes). Returns 0 on
+ * success, or the error from the extent mapping, the buffer read or the
+ * copy-out header check.
+ */
+int
+xfs_attr_rmtval_get(
+ struct xfs_da_args *args)
+{
+ struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE];
+ struct xfs_mount *mp = args->dp->i_mount;
+ struct xfs_buf *bp;
+ xfs_dablk_t lblkno = args->rmtblkno;
+ char *dst = args->value;
+ int valuelen = args->valuelen;
+ int nmap;
+ int error;
+ int blkcnt = args->rmtblkcnt;
+ int i;
+ int offset = 0;
+
+ trace_xfs_attr_rmtval_get(args);
+
+ ASSERT(!(args->flags & ATTR_KERNOVAL));
+
+ while (valuelen > 0) {
+ /* map the next part of the value, at most MAPSIZE extents */
+ nmap = ATTR_RMTVALUE_MAPSIZE;
+ error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+ blkcnt, map, &nmap,
+ XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ ASSERT(nmap >= 1);
+
+ for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+
+ ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+ (map[i].br_startblock != HOLESTARTBLOCK));
+ dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+ dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+ /* read the whole extent as one buffer, outside any transaction */
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ dblkno, dblkcnt, 0, &bp,
+ &xfs_attr3_rmt_buf_ops);
+ if (error)
+ return error;
+
+ /* copyout advances offset, valuelen and dst for us */
+ error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+ &offset, &valuelen,
+ &dst);
+ /* release the buffer before looking at the copy-out result */
+ xfs_buf_relse(bp);
+ if (error)
+ return error;
+
+ /* roll attribute extent map forwards */
+ lblkno += map[i].br_blockcount;
+ blkcnt -= map[i].br_blockcount;
+ }
+ }
+ ASSERT(valuelen == 0);
+ return 0;
+}
+
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ *
+ * This runs in two passes: first the blocks for the value are allocated in
+ * the attribute fork (rolling the transaction after each extent), then the
+ * value is copied into those blocks and written out synchronously.  On
+ * success args->rmtblkno and args->rmtblkcnt describe where the value lives.
+ *
+ * Returns 0 on success or an error code.  If the allocation or
+ * xfs_bmap_finish() fails, args->trans is set to NULL before returning.
+ */
+int
+xfs_attr_rmtval_set(
+	struct xfs_da_args	*args)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_bmbt_irec	map;
+	xfs_dablk_t		lblkno;
+	xfs_fileoff_t		lfileoff = 0;
+	char			*src = args->value;
+	int			blkcnt;
+	int			valuelen;
+	int			nmap;
+	int			error;
+	int			offset = 0;
+
+	trace_xfs_attr_rmtval_set(args);
+
+	/*
+	 * Find a "hole" in the attribute address space large enough for
+	 * us to drop the new attribute's value into. Because CRC enabled
+	 * attributes have headers, we can't just do a straight byte to FSB
+	 * conversion and have to take the header space into account.
+	 */
+	blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+	error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+						   XFS_ATTR_FORK);
+	if (error)
+		return error;
+
+	args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+	args->rmtblkcnt = blkcnt;
+
+	/*
+	 * Roll through the "value", allocating blocks on disk as required.
+	 */
+	while (blkcnt > 0) {
+		int	committed;
+
+		/*
+		 * Allocate a single extent, up to the size of the value.
+		 */
+		xfs_bmap_init(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
+				  blkcnt,
+				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				  args->firstblock, args->total, &map, &nmap,
+				  args->flist);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			/*
+			 * NOTE(review): if xfs_bmapi_write() failed,
+			 * "committed" has not been written by this function
+			 * when the assert reads it - confirm intent.
+			 */
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return(error);
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp, 0);
+
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+
+		/*
+		 * Start the next trans in the chain.
+		 */
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
+			return (error);
+	}
+
+	/*
+	 * Roll through the "value", copying the attribute value to the
+	 * already-allocated blocks.  Blocks are written synchronously
+	 * so that we can know they are all on disk before we turn off
+	 * the INCOMPLETE flag.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	valuelen = args->valuelen;
+	while (valuelen > 0) {
+		struct xfs_buf	*bp;
+		xfs_daddr_t	dblkno;
+		int		dblkcnt;
+
+		ASSERT(blkcnt > 0);
+
+		xfs_bmap_init(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
+				       blkcnt, &map, &nmap,
+				       XFS_BMAPI_ATTRFORK);
+		if (error)
+			return(error);
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+
+		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+		dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
+		if (!bp)
+			return ENOMEM;
+		bp->b_ops = &xfs_attr3_rmt_buf_ops;
+
+		/* copyin advances offset, valuelen and src for us */
+		xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
+				       &valuelen, &src);
+
+		error = xfs_bwrite(bp);	/* GROT: NOTE: synchronous write */
+		xfs_buf_relse(bp);
+		if (error)
+			return error;
+
+
+		/* roll attribute extent map forwards */
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+	}
+	ASSERT(valuelen == 0);
+	return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ *
+ * First any cached buffers covering the value (args->rmtblkno for
+ * args->rmtblkcnt blocks) are marked stale, then the blocks are unmapped from
+ * the attribute fork, rolling the transaction after each step.
+ *
+ * Returns 0 on success or an error code.  If the unmap or xfs_bmap_finish()
+ * fails, args->trans is set to NULL before returning.
+ */
+int
+xfs_attr_rmtval_remove(
+	struct xfs_da_args	*args)
+{
+	struct xfs_mount	*mp = args->dp->i_mount;
+	xfs_dablk_t		lblkno;
+	int			blkcnt;
+	int			error;
+	int			done;
+
+	trace_xfs_attr_rmtval_remove(args);
+
+	/*
+	 * Roll through the "value", invalidating the attribute value's blocks.
+	 * Note that args->rmtblkcnt is the minimum number of data blocks we'll
+	 * see for a CRC enabled remote attribute. Each extent will have a
+	 * header, and so we may have more blocks than we realise here.  If we
+	 * fail to map the blocks correctly, we'll have problems with the buffer
+	 * lookups.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	while (blkcnt > 0) {
+		struct xfs_bmbt_irec	map;
+		struct xfs_buf		*bp;
+		xfs_daddr_t		dblkno;
+		int			dblkcnt;
+		int			nmap;
+
+		/*
+		 * Try to remember where we decided to put the value.
+		 */
+		nmap = 1;
+		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+				       blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
+		if (error)
+			return(error);
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+
+		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+		dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+		/*
+		 * If the "remote" value is in the cache, remove it.
+		 */
+		bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
+		if (bp) {
+			xfs_buf_stale(bp);
+			xfs_buf_relse(bp);
+			bp = NULL;
+		}
+
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+	}
+
+	/*
+	 * Keep de-allocating extents until the remote-value region is gone.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	done = 0;
+	while (!done) {
+		int committed;
+
+		xfs_bmap_init(args->flist, args->firstblock);
+		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
+				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				    1, args->firstblock, args->flist,
+				    &done);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			/*
+			 * NOTE(review): if xfs_bunmapi() failed, "committed"
+			 * has not been written by this function when the
+			 * assert reads it - confirm intent.
+			 */
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return error;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, args->dp, 0);
+
+		/*
+		 * Close out trans and start the next one in the chain.
+		 */
+		error = xfs_trans_roll(&args->trans, args->dp);
+		if (error)
+			return (error);
+	}
+	return(0);
+}
+
diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h
new file mode 100644
index 000000000000..92a8fd7977cc
--- /dev/null
+++ b/fs/xfs/xfs_attr_remote.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ATTR_REMOTE_H__
+#define __XFS_ATTR_REMOTE_H__
+
+#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */
+
+/*
+ * There is one of these headers per filesystem block in a remote attribute.
+ * This is done to ensure there is a 1:1 mapping between the attribute value
+ * length and the number of blocks needed to store the attribute. This makes the
+ * verification of a buffer a little more complex, but greatly simplifies the
+ * allocation, reading and writing of these attributes as we don't have to guess
+ * the number of blocks needed to store the attribute data.
+ */
+struct xfs_attr3_rmt_hdr {
+ __be32 rm_magic; /* XFS_ATTR3_RMT_MAGIC */
+ __be32 rm_offset; /* byte offset of this block's data in the value */
+ __be32 rm_bytes; /* bytes of attribute data held in this block */
+ __be32 rm_crc; /* block checksum, see XFS_ATTR3_RMT_CRC_OFF */
+ uuid_t rm_uuid; /* filesystem UUID (sb_uuid) */
+ __be64 rm_owner; /* inode number that owns the attribute */
+ __be64 rm_blkno; /* disk address of this block */
+ __be64 rm_lsn; /* LSN stamped by the write verifier */
+};
+
+#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
+
+/* bytes of attribute data that fit in bufsize bytes (header only with CRCs) */
+#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \
+ ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+ sizeof(struct xfs_attr3_rmt_hdr) : 0))
+
+extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
+
+int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
+
+int xfs_attr_rmtval_get(struct xfs_da_args *args);
+int xfs_attr_rmtval_set(struct xfs_da_args *args);
+int xfs_attr_rmtval_remove(struct xfs_da_args *args);
+
+#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index b44af9211bd9..89042848f9ec 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -25,6 +25,7 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
+#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
@@ -47,180 +48,78 @@
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
+#include "xfs_symlink.h"
kmem_zone_t *xfs_bmap_free_item_zone;
/*
- * Prototypes for internal bmap routines.
- */
-
-#ifdef DEBUG
-STATIC void
-xfs_bmap_check_leaf_extents(
- struct xfs_btree_cur *cur,
- struct xfs_inode *ip,
- int whichfork);
-#else
-#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
-#endif
-
-
-/*
- * Called from xfs_bmap_add_attrfork to handle extents format files.
- */
-STATIC int /* error */
-xfs_bmap_add_attrfork_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first block allocated */
- xfs_bmap_free_t *flist, /* blocks to free at commit */
- int *flags); /* inode logging flags */
-
-/*
- * Called from xfs_bmap_add_attrfork to handle local format files.
+ * Miscellaneous helper functions
*/
-STATIC int /* error */
-xfs_bmap_add_attrfork_local(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first block allocated */
- xfs_bmap_free_t *flist, /* blocks to free at commit */
- int *flags); /* inode logging flags */
/*
- * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
- * It figures out where to ask the underlying allocator to put the new extent.
- */
-STATIC int /* error */
-xfs_bmap_alloc(
- xfs_bmalloca_t *ap); /* bmap alloc argument struct */
-
-/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
- */
-STATIC int /* error */
-xfs_bmap_btree_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_btree_cur_t *cur, /* btree cursor */
- int *logflagsp, /* inode logging flags */
- int whichfork); /* data or attr fork */
-
-/*
- * Remove the entry "free" from the free item list. Prev points to the
- * previous entry, unless "free" is the head of the list.
- */
-STATIC void
-xfs_bmap_del_free(
- xfs_bmap_free_t *flist, /* free item list header */
- xfs_bmap_free_item_t *prev, /* previous item on list, if any */
- xfs_bmap_free_item_t *free); /* list item to be freed */
-
-/*
- * Convert an extents-format file into a btree-format file.
- * The new file will have a root block (in the inode) and a single child block.
- */
-STATIC int /* error */
-xfs_bmap_extents_to_btree(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first-block-allocated */
- xfs_bmap_free_t *flist, /* blocks freed in xaction */
- xfs_btree_cur_t **curp, /* cursor returned to caller */
- int wasdel, /* converting a delayed alloc */
- int *logflagsp, /* inode logging flags */
- int whichfork); /* data or attr fork */
-
-/*
- * Convert a local file to an extents file.
- * This code is sort of bogus, since the file data needs to get
- * logged so it won't be lost. The bmap-level manipulations are ok, though.
- */
-STATIC int /* error */
-xfs_bmap_local_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first block allocated in xaction */
- xfs_extlen_t total, /* total blocks needed by transaction */
- int *logflagsp, /* inode logging flags */
- int whichfork, /* data or attr fork */
- void (*init_fn)(struct xfs_buf *bp,
- struct xfs_inode *ip,
- struct xfs_ifork *ifp));
-
-/*
- * Search the extents list for the inode, for the extent containing bno.
- * If bno lies in a hole, point to the next entry. If bno lies past eof,
- * *eofp will be set, and *prevp will contain the last entry (null if none).
- * Else, *lastxp will be set to the index of the found
- * entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
-xfs_bmap_search_extents(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t bno, /* block number searched for */
- int whichfork, /* data or attr fork */
- int *eofp, /* out: end of file found */
- xfs_extnum_t *lastxp, /* out: last extent index */
- xfs_bmbt_irec_t *gotp, /* out: extent entry found */
- xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */
-
-/*
- * Compute the worst-case number of indirect blocks that will be used
- * for ip's delayed extent of length "len".
- */
-STATIC xfs_filblks_t
-xfs_bmap_worst_indlen(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_filblks_t len); /* delayed extent length */
-
-#ifdef DEBUG
-/*
- * Perform various validation checks on the values being returned
- * from xfs_bmapi().
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem. Done once, during mount.
*/
-STATIC void
-xfs_bmap_validate_ret(
- xfs_fileoff_t bno,
- xfs_filblks_t len,
- int flags,
- xfs_bmbt_irec_t *mval,
- int nmap,
- int ret_nmap);
-#else
-#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
-#endif /* DEBUG */
-
-STATIC int
-xfs_bmap_count_tree(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ifork_t *ifp,
- xfs_fsblock_t blockno,
- int levelin,
- int *count);
-
-STATIC void
-xfs_bmap_count_leaves(
- xfs_ifork_t *ifp,
- xfs_extnum_t idx,
- int numrecs,
- int *count);
+void
+xfs_bmap_compute_maxlevels(
+ xfs_mount_t *mp, /* file system mount structure */
+ int whichfork) /* data or attr fork */
+{
+ int level; /* btree level */
+ uint maxblocks; /* max blocks at this level */
+ uint maxleafents; /* max leaf entries possible */
+ int maxrootrecs; /* max records in root block */
+ int minleafrecs; /* min records in leaf block */
+ int minnoderecs; /* min records in node block */
+ int sz; /* root block size */
-STATIC void
-xfs_bmap_disk_count_leaves(
- struct xfs_mount *mp,
- struct xfs_btree_block *block,
- int numrecs,
- int *count);
+ /*
+ * The maximum number of extents in a file, hence the maximum
+ * number of leaf entries, is controlled by the type of di_nextents
+ * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
+ * (a signed 16-bit number, xfs_aextnum_t).
+ *
+ * Note that we can no longer assume that if we are in ATTR1 that
+ * the fork offset of all the inodes will be
+ * (xfs_default_attroffset(ip) >> 3) because we could have mounted
+ * with ATTR2 and then mounted back with ATTR1, keeping the
+ * di_forkoff's fixed but probably at various positions. Therefore,
+ * for both ATTR1 and ATTR2 we have to assume the worst case scenario
+ * of a minimum size available.
+ */
+ if (whichfork == XFS_DATA_FORK) {
+ maxleafents = MAXEXTNUM;
+ sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+ } else {
+ maxleafents = MAXAEXTNUM;
+ sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ }
+ maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
+ minleafrecs = mp->m_bmap_dmnr[0];
+ minnoderecs = mp->m_bmap_dmnr[1];
+ maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; /* leaf blocks if every leaf holds only the minimum */
+ for (level = 1; maxblocks > 1; level++) { /* one more level per pass until a single block remains */
+ if (maxblocks <= maxrootrecs)
+ maxblocks = 1; /* remaining blocks fit under the root block */
+ else
+ maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+ }
+ mp->m_bm_maxlevels[whichfork] = level;
+}
/*
- * Bmap internal routines.
+ * Convert the given file system block to a disk block. We have to treat it
+ * differently based on whether the file is a real time file or not, because the
+ * bmap code does.
*/
+xfs_daddr_t
+xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
+{
+	/* realtime inodes use the plain FSB to basic block conversion */
+	if (XFS_IS_REALTIME_INODE(ip))
+		return (xfs_daddr_t)XFS_FSB_TO_BB(ip->i_mount, fsb);
+
+	/* everything else is converted with XFS_FSB_TO_DADDR() */
+	return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
+}
STATIC int /* error */
xfs_bmbt_lookup_eq(
@@ -290,6 +189,1070 @@ xfs_bmbt_update(
}
/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_filblks_t	len)		/* delayed extent length */
+{
+	xfs_mount_t	*mp = ip->i_mount;
+	int		recs_per_blk = mp->m_bmap_dmxr[0]; /* leaf level first */
+	xfs_filblks_t	total = 0;	/* indirect blocks so far */
+	int		lvl = 0;	/* btree level being sized */
+
+	while (lvl < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) {
+		/* blocks needed at this level, rounding up */
+		len += recs_per_blk - 1;
+		do_div(len, recs_per_blk);
+		total += len;
+
+		/* a single block here means one block at each level above */
+		if (len == 1)
+			return total + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+				lvl - 1;
+
+		/* every level above the leaves uses the node record limit */
+		recs_per_blk = mp->m_bmap_dmxr[1];
+		lvl++;
+	}
+	return total;
+}
+
+/*
+ * Work out the default byte offset of the attribute fork for a newly created
+ * inode.  256 byte inodes leave only enough room for a minimal attribute
+ * btree root; other inode sizes use a fixed multiple of that space instead.
+ */
+uint
+xfs_default_attroffset(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			attr_off;
+
+	if (mp->m_sb.sb_inodesize != 256)
+		attr_off = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+	else
+		attr_off = XFS_LITINO(mp, ip->i_d.di_version) -
+				XFS_BMDR_SPACE_CALC(MINABTPTRS);
+
+	ASSERT(attr_off < XFS_LITINO(mp, ip->i_d.di_version));
+	return attr_off;
+}
+
+/*
+ * Reset the inode's di_forkoff when the attribute fork is converted from
+ * local to extent format.  Where the data fork format allows it, the fork
+ * offset is raised to the default so more space is available for inline
+ * data fork extents.  It is never lowered.
+ */
+STATIC void
+xfs_bmap_forkoff_reset(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	uint		dfl_forkoff;
+
+	if (whichfork != XFS_ATTR_FORK)
+		return;
+
+	/* device, uuid and btree format inodes are left alone */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_DEV ||
+	    ip->i_d.di_format == XFS_DINODE_FMT_UUID ||
+	    ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+		return;
+
+	dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+	if (dfl_forkoff > ip->i_d.di_forkoff)
+		ip->i_d.di_forkoff = dfl_forkoff;
+}
+
+/*
+ * Extent tree block counting routines.
+ */
+
+/*
+ * Add the block counts of numrecs in-core extent records, starting at record
+ * index idx of the fork, to *count.
+ */
+STATIC void
+xfs_bmap_count_leaves(
+	xfs_ifork_t		*ifp,
+	xfs_extnum_t		idx,
+	int			numrecs,
+	int			*count)
+{
+	int			n = 0;
+
+	while (n < numrecs) {
+		xfs_bmbt_rec_host_t *rec = xfs_iext_get_ext(ifp, idx + n);
+
+		*count += xfs_bmbt_get_blockcount(rec);
+		n++;
+	}
+}
+
+/*
+ * Add the block counts of the first numrecs on-disk extent records of a
+ * btree leaf block to *count.  Records in a btree block are numbered from 1.
+ */
+STATIC void
+xfs_bmap_disk_count_leaves(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	int			numrecs,
+	int			*count)
+{
+	int			recno = 1;
+
+	while (recno <= numrecs) {
+		xfs_bmbt_rec_t	*rec = XFS_BMBT_REC_ADDR(mp, block, recno);
+
+		*count += xfs_bmbt_disk_get_blockcount(rec);
+		recno++;
+	}
+}
+
+/*
+ * Recursively walks each level of a btree
+ * to count total fsblocks in use.
+ *
+ * blockno is the leftmost block of the level being visited and levelin its
+ * level.  Every btree block read is added to *count, and at the bottom level
+ * the block counts of all extent records are added as well.
+ *
+ * Returns 0 on success or a positive error code.  Errors from lower levels
+ * are reported and returned as EFSCORRUPTED.
+ */
+STATIC int					/* error */
+xfs_bmap_count_tree(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_fsblock_t	blockno,	/* file system block number */
+	int		levelin,	/* level in btree */
+	int		*count)		/* Count of blocks */
+{
+	int			error;
+	xfs_buf_t		*bp, *nbp;
+	int			level = levelin;
+	__be64			*pp;
+	xfs_fsblock_t		bno = blockno;
+	xfs_fsblock_t		nextbno;
+	struct xfs_btree_block	*block, *nextblock;
+	int			numrecs;
+
+	error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+	if (error)
+		return error;
+	*count += 1;
+	block = XFS_BUF_TO_BLOCK(bp);
+
+	if (--level) {
+		/* Not at node above leaves, count this level of nodes */
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+		while (nextbno != NULLFSBLOCK) {
+			error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error) {
+				/* don't leak the leftmost block's buffer */
+				xfs_trans_brelse(tp, bp);
+				return error;
+			}
+			*count += 1;
+			nextblock = XFS_BUF_TO_BLOCK(nbp);
+			nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
+			xfs_trans_brelse(tp, nbp);
+		}
+
+		/* Dive to the next level */
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		bno = be64_to_cpu(*pp);
+		/* error codes are positive here, so test for non-zero */
+		error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, count);
+		if (unlikely(error)) {
+			xfs_trans_brelse(tp, bp);
+			XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
+					 XFS_ERRLEVEL_LOW, mp);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		xfs_trans_brelse(tp, bp);
+	} else {
+		/* count all level 1 nodes and their leaves */
+		for (;;) {
+			nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+			numrecs = be16_to_cpu(block->bb_numrecs);
+			xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
+			xfs_trans_brelse(tp, bp);
+			if (nextbno == NULLFSBLOCK)
+				break;
+			bno = nextbno;
+			error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				return error;
+			*count += 1;
+			block = XFS_BUF_TO_BLOCK(bp);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Count fsblocks of the given fork.
+ *
+ * For an extents format fork the block counts of the in-core extent records
+ * are summed; otherwise the fork's btree is walked from the root held in the
+ * inode.  The total is added to *count.
+ *
+ * Returns 0 on success or EFSCORRUPTED if the btree walk fails.
+ */
+int						/* error */
+xfs_bmap_count_blocks(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode */
+	int			whichfork,	/* data or attr fork */
+	int			*count)		/* out: count of blocks */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	__be64			*pp;	/* pointer to block address */
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
+		xfs_bmap_count_leaves(ifp, 0,
+			ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
+			count);
+		return 0;
+	}
+
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	block = ifp->if_broot;
+	level = be16_to_cpu(block->bb_level);
+	ASSERT(level > 0);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	bno = be64_to_cpu(*pp);
+	ASSERT(bno != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+	/*
+	 * xfs_bmap_count_tree() returns positive error codes, so any
+	 * non-zero return is a failure.
+	 */
+	if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count))) {
+		XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
+				 mp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	return 0;
+}
+
+/*
+ * Debug/sanity checking code
+ */
+
+/*
+ * Basic sanity check of a bmap btree block held in bp: the magic number must
+ * be one of the two bmap btree magics, the block must be at the expected
+ * level, and the record count must be non-zero and no larger than the
+ * maximum for a leaf (level 0) or node block.
+ *
+ * Returns 1 if the block looks sane, 0 otherwise.
+ */
+STATIC int
+xfs_bmap_sanity_check(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	int			level)
+{
+	struct xfs_btree_block  *blk = XFS_BUF_TO_BLOCK(bp);
+
+	if (blk->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
+	    blk->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
+		return 0;
+
+	if (be16_to_cpu(blk->bb_level) != level)
+		return 0;
+	if (be16_to_cpu(blk->bb_numrecs) == 0)
+		return 0;
+	if (be16_to_cpu(blk->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+		return 0;
+
+	return 1;
+}
+
+#ifdef DEBUG
+/*
+ * Look for an already-held buffer whose address equals bno, first among the
+ * cursor's per-level buffers and then among the buffer log items attached
+ * to the cursor's transaction. Returns the buffer, or NULL if cur is NULL
+ * or nothing matches. The value is compared against XFS_BUF_ADDR(), and
+ * the callers in this file pass XFS_FSB_TO_DADDR(mp, bno).
+ */
+STATIC struct xfs_buf *
+xfs_bmap_get_bp(
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t bno)
+{
+ struct xfs_log_item_desc *desc;
+ struct xfs_buf *held;
+ int lvl;
+
+ if (cur == NULL)
+ return NULL;
+
+ /* The cursor's buffer array is scanned up to its first empty slot. */
+ for (lvl = 0; lvl < XFS_BTREE_MAXLEVELS; lvl++) {
+ held = cur->bc_bufs[lvl];
+ if (held == NULL)
+ break;
+ if (XFS_BUF_ADDR(held) == bno)
+ return held;
+ }
+
+ /* Chase down all the log items to see if the bp is there */
+ list_for_each_entry(desc, &cur->bc_tp->t_items, lid_trans) {
+ struct xfs_buf_log_item *bli;
+
+ bli = (struct xfs_buf_log_item *)desc->lid_item;
+ if (bli->bli_item.li_type != XFS_LI_BUF)
+ continue;
+ if (XFS_BUF_ADDR(bli->bli_buf) == bno)
+ return bli->bli_buf;
+ }
+
+ return NULL;
+}
+
+/*
+ * Debug check of one interior bmap btree block: keys must be strictly
+ * increasing and no two child pointers may be equal. "root" selects the
+ * incore-root pointer layout, in which case "sz" is the root size in
+ * bytes. A duplicate pointer is reported and then panics.
+ */
+STATIC void
+xfs_check_block(
+ struct xfs_btree_block *block,
+ xfs_mount_t *mp,
+ int root,
+ short sz)
+{
+ int rec, other;
+ int dmxr = mp->m_bmap_dmxr[0];
+ __be64 *ptr, *otherptr; /* pointer to block address */
+ xfs_bmbt_key_t *prevkey = NULL;
+ xfs_bmbt_key_t *key;
+
+ ASSERT(be16_to_cpu(block->bb_level) > 0);
+
+ for (rec = 1; rec <= xfs_btree_get_numrecs(block); rec++) {
+ key = XFS_BMBT_KEY_ADDR(mp, block, rec);
+
+ if (prevkey != NULL) {
+ ASSERT(be64_to_cpu(prevkey->br_startoff) <
+ be64_to_cpu(key->br_startoff));
+ }
+ prevkey = key;
+
+ /*
+ * Compare the block numbers to see if there are dups.
+ */
+ ptr = root ? XFS_BMAP_BROOT_PTR_ADDR(mp, block, rec, sz) :
+ XFS_BMBT_PTR_ADDR(mp, block, rec, dmxr);
+
+ other = rec + 1;
+ while (other <= be16_to_cpu(block->bb_numrecs)) {
+ otherptr = root ?
+ XFS_BMAP_BROOT_PTR_ADDR(mp, block, other, sz) :
+ XFS_BMBT_PTR_ADDR(mp, block, other, dmxr);
+ if (*otherptr == *ptr) {
+ xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
+ __func__, other, rec,
+ (unsigned long long)be64_to_cpu(*otherptr));
+ panic("%s: ptrs are equal in node\n",
+ __func__);
+ }
+ other++;
+ }
+ }
+}
+
+/*
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves.
+ *
+ * Does nothing unless the fork is in btree format. Each block is first
+ * looked up among the buffers already held by the cursor / its transaction
+ * (xfs_bmap_get_bp) and is only read, with a NULL transaction, when it is
+ * not found there; bp_release records that this function read the buffer
+ * itself and therefore has to release it. A read error or a failed sanity
+ * check ends in panic().
+ */
+
+STATIC void
+xfs_bmap_check_leaf_extents(
+ xfs_btree_cur_t *cur, /* btree cursor or null */
+ xfs_inode_t *ip, /* incore inode pointer */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ xfs_fsblock_t bno; /* block # of "block" */
+ xfs_buf_t *bp; /* buffer for "block" */
+ int error; /* error return value */
+ xfs_extnum_t i=0, j; /* index into the extents list */
+ xfs_ifork_t *ifp; /* fork structure */
+ int level; /* btree level, for checking */
+ xfs_mount_t *mp; /* file system mount structure */
+ __be64 *pp; /* pointer to block address */
+ xfs_bmbt_rec_t *ep; /* pointer to current extent */
+ xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */
+ xfs_bmbt_rec_t *nextp; /* pointer to next extent */
+ int bp_release = 0; /* non-zero: bp was read here, release it */
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+ return;
+ }
+
+ bno = NULLFSBLOCK;
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ block = ifp->if_broot;
+ /*
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ */
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+
+ ASSERT(bno != NULLDFSBNO);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+ /*
+ * Go down the tree until leaf level is reached, following the first
+ * pointer (leftmost) at each level.
+ */
+ while (level-- > 0) {
+ /* See if buf is in cur first */
+ bp_release = 0;
+ bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+ if (!bp) {
+ bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
+ }
+ block = XFS_BUF_TO_BLOCK(bp);
+ XFS_WANT_CORRUPTED_GOTO(
+ xfs_bmap_sanity_check(mp, bp, level),
+ error0);
+ /* leaf reached: leave the loop still holding bp */
+ if (level == 0)
+ break;
+
+ /*
+ * Check this block for basic sanity (increasing keys and
+ * no duplicate blocks).
+ */
+
+ xfs_check_block(block, mp, 0, 0);
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ bno = be64_to_cpu(*pp);
+ XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ }
+
+ /*
+ * Here with bp and block set to the leftmost leaf node in the tree.
+ */
+ i = 0;
+
+ /*
+ * Loop over all leaf nodes checking that all extents are in the right order.
+ */
+ for (;;) {
+ xfs_fsblock_t nextbno;
+ xfs_extnum_t num_recs;
+
+
+ num_recs = xfs_btree_get_numrecs(block);
+
+ /*
+ * Remember the right sibling of this leaf, if any. (No
+ * read-ahead is issued here.)
+ */
+
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+ /*
+ * Check all the extents to make sure they are OK.
+ * If we had a previous block, the last entry should
+ * conform with the first entry in this one.
+ */
+
+ ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+ if (i) {
+ ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+ xfs_bmbt_disk_get_blockcount(&last) <=
+ xfs_bmbt_disk_get_startoff(ep));
+ }
+ /* each record must end at or before the start of the next */
+ for (j = 1; j < num_recs; j++) {
+ nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+ ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+ xfs_bmbt_disk_get_blockcount(ep) <=
+ xfs_bmbt_disk_get_startoff(nextp));
+ ep = nextp;
+ }
+
+ /* copy the last record before the buffer may be released */
+ last = *ep;
+ i += num_recs;
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ bno = nextbno;
+ /*
+ * If we've reached the end, stop.
+ */
+ if (bno == NULLFSBLOCK)
+ break;
+
+ bp_release = 0;
+ bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+ if (!bp) {
+ bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
+ }
+ block = XFS_BUF_TO_BLOCK(bp);
+ }
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ return;
+
+ /* error0: a buffer may be held; error_norelse: the read itself failed */
+error0:
+ xfs_warn(mp, "%s: at error0", __func__);
+ if (bp_release)
+ xfs_trans_brelse(NULL, bp);
+error_norelse:
+ xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+ __func__, i);
+ panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+ return;
+}
+
+/*
+ * Add bmap trace insert entries for all the contents of the extent records.
+ *
+ * cnt must equal the number of extent records currently held in the fork
+ * (asserted); one trace_xfs_extlist() event is emitted per record index.
+ */
+void
+xfs_bmap_trace_exlist(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t cnt, /* count of entries in the list */
+ int whichfork, /* data or attr fork */
+ unsigned long caller_ip)
+{
+ xfs_extnum_t idx; /* extent record index */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+
+ /*
+ * The tracepoint is handed whichfork directly, so no separate
+ * BMAP_* state word needs to be built here.
+ */
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+ for (idx = 0; idx < cnt; idx++)
+ trace_xfs_extlist(ip, idx, whichfork, caller_ip);
+}
+
+/*
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the caller's original parameters. Specifically check the
+ * ranges of the returned irecs to ensure that they only extend beyond
+ * the given parameters if the XFS_BMAPI_ENTIRE flag was set. Also
+ * checks that the mappings are contiguous in file offset, are real
+ * (neither delayed nor hole), and carry a known extent state.
+ */
+STATIC void
+xfs_bmap_validate_ret(
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ int flags,
+ xfs_bmbt_irec_t *mval,
+ int nmap,
+ int ret_nmap)
+{
+ xfs_bmbt_irec_t *map; /* mapping being checked */
+ int idx; /* index of that mapping */
+
+ ASSERT(ret_nmap <= nmap);
+
+ for (idx = 0, map = mval; idx < ret_nmap; idx++, map++) {
+ ASSERT(map->br_blockcount > 0);
+ if (flags & XFS_BMAPI_ENTIRE) {
+ /* whole extents are returned: they need only overlap */
+ ASSERT(map->br_startoff < bno + len);
+ ASSERT(map->br_startoff + map->br_blockcount >
+ bno);
+ } else {
+ /* trimmed mappings must lie inside [bno, bno + len) */
+ ASSERT(map->br_startoff >= bno);
+ ASSERT(map->br_blockcount <= len);
+ ASSERT(map->br_startoff + map->br_blockcount <=
+ bno + len);
+ }
+ ASSERT(idx == 0 ||
+ map[-1].br_startoff + map[-1].br_blockcount ==
+ map->br_startoff);
+ ASSERT(map->br_startblock != DELAYSTARTBLOCK &&
+ map->br_startblock != HOLESTARTBLOCK);
+ ASSERT(map->br_state == XFS_EXT_NORM ||
+ map->br_state == XFS_EXT_UNWRITTEN);
+ }
+}
+
+#else
+/* Without DEBUG both checkers expand to an empty statement. */
+#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0)
+#endif /* DEBUG */
+
+/*
+ * bmap free list manipulation functions
+ */
+
+/*
+ * Queue the extent [bno, bno + len) on flist so that it is freed when the
+ * transaction finishes. A new list item is allocated for it and linked in
+ * ahead of the first existing item whose start block is not below bno, so
+ * the list stays ordered by start block.
+ */
+void
+xfs_bmap_add_free(
+ xfs_fsblock_t bno, /* fs block number of extent */
+ xfs_filblks_t len, /* length of extent */
+ xfs_bmap_free_t *flist, /* list of extents */
+ xfs_mount_t *mp) /* mount point structure */
+{
+ xfs_bmap_free_item_t *after; /* item that will follow the new one */
+ xfs_bmap_free_item_t *before; /* item that will precede it, if any */
+ xfs_bmap_free_item_t *item; /* the new item */
+#ifdef DEBUG
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+
+ ASSERT(bno != NULLFSBLOCK);
+ ASSERT(len > 0);
+ ASSERT(len <= MAXEXTLEN);
+ ASSERT(!isnullstartblock(bno));
+ agno = XFS_FSB_TO_AGNO(mp, bno);
+ agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(agbno < mp->m_sb.sb_agblocks);
+ ASSERT(len < mp->m_sb.sb_agblocks);
+ ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+ ASSERT(xfs_bmap_free_item_zone != NULL);
+ item = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+ item->xbfi_startblock = bno;
+ item->xbfi_blockcount = (xfs_extlen_t)len;
+
+ /* walk past every item that starts below the new extent */
+ before = NULL;
+ after = flist->xbf_first;
+ while (after != NULL && after->xbfi_startblock < bno) {
+ before = after;
+ after = after->xbfi_next;
+ }
+
+ item->xbfi_next = after;
+ if (before != NULL)
+ before->xbfi_next = item;
+ else
+ flist->xbf_first = item;
+ flist->xbf_count++;
+}
+
+/*
+ * Unlink "free" from the free item list and give it back to its zone.
+ * "prev" is the item in front of it, or NULL when "free" is the list head.
+ */
+STATIC void
+xfs_bmap_del_free(
+ xfs_bmap_free_t *flist, /* free item list header */
+ xfs_bmap_free_item_t *prev, /* previous item on list, if any */
+ xfs_bmap_free_item_t *free) /* list item to be freed */
+{
+ xfs_bmap_free_item_t **linkp; /* link that points at "free" */
+
+ linkp = prev ? &prev->xbfi_next : &flist->xbf_first;
+ *linkp = free->xbfi_next;
+ flist->xbf_count--;
+ kmem_zone_free(xfs_bmap_free_item_zone, free);
+}
+
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller. Frees all the extents that need freeing, which must be done
+ * last due to locking considerations. We never free any extents in
+ * the first transaction.
+ *
+ * If the free list is empty nothing is done and *committed is set to 0.
+ * Otherwise an EFI covering the whole list is logged in *tp, *tp is
+ * committed and replaced by a duplicate of itself, and *committed is set
+ * to 1 (this holds even if an error is returned from that point on). The
+ * extents are then freed, and logged in an EFD, in the new transaction.
+ *
+ * Returns 0 or an error; on error the remaining list items are left for
+ * the caller to clean up.
+ */
+int /* error */
+xfs_bmap_finish(
+ xfs_trans_t **tp, /* transaction pointer addr */
+ xfs_bmap_free_t *flist, /* i/o: list extents to free */
+ int *committed) /* xact committed or not */
+{
+ xfs_efd_log_item_t *efd; /* extent free data */
+ xfs_efi_log_item_t *efi; /* extent free intention */
+ int error; /* error return value */
+ xfs_bmap_free_item_t *free; /* free extent item */
+ unsigned int logres; /* new log reservation */
+ unsigned int logcount; /* new log count */
+ xfs_mount_t *mp; /* filesystem mount structure */
+ xfs_bmap_free_item_t *next; /* next item on free list */
+ xfs_trans_t *ntp; /* new transaction pointer */
+
+ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+ if (flist->xbf_count == 0) {
+ *committed = 0;
+ return 0;
+ }
+ /* log the intent to free every queued extent in the old transaction */
+ ntp = *tp;
+ efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+ for (free = flist->xbf_first; free; free = free->xbfi_next)
+ xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+ free->xbfi_blockcount);
+ /* remember the old reservation so the new transaction can reuse it */
+ logres = ntp->t_log_res;
+ logcount = ntp->t_log_count;
+ ntp = xfs_trans_dup(*tp);
+ error = xfs_trans_commit(*tp, 0);
+ *tp = ntp;
+ *committed = 1;
+ /*
+ * We have a new transaction, so we should return committed=1,
+ * even though we're returning an error.
+ */
+ if (error)
+ return error;
+
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(ntp->t_ticket);
+
+ if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
+ logcount)))
+ return error;
+ efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+ /* "next" is fetched first because each item is freed as it is done */
+ for (free = flist->xbf_first; free != NULL; free = next) {
+ next = free->xbfi_next;
+ if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+ free->xbfi_blockcount))) {
+ /*
+ * The bmap free list will be cleaned up at a
+ * higher level. The EFI will be canceled when
+ * this transaction is aborted.
+ * Need to force shutdown here to make sure it
+ * happens, since this transaction may not be
+ * dirty yet.
+ */
+ mp = ntp->t_mountp;
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xfs_force_shutdown(mp,
+ (error == EFSCORRUPTED) ?
+ SHUTDOWN_CORRUPT_INCORE :
+ SHUTDOWN_META_IO_ERROR);
+ return error;
+ }
+ xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+ free->xbfi_blockcount);
+ /* the item just handled is always the list head, so prev is NULL */
+ xfs_bmap_del_free(flist, NULL, free);
+ }
+ return 0;
+}
+
+/*
+ * Throw away whatever is still queued on the free list without freeing
+ * any extents. A list whose count is already zero is left untouched.
+ */
+void
+xfs_bmap_cancel(
+ xfs_bmap_free_t *flist) /* list of bmap_free_items */
+{
+ xfs_bmap_free_item_t *head; /* current first item on the list */
+
+ if (flist->xbf_count == 0)
+ return;
+ ASSERT(flist->xbf_first != NULL);
+
+ /*
+ * Deleting with a NULL "prev" advances xbf_first to the next item,
+ * so popping the head until the list is empty visits every item.
+ */
+ while ((head = flist->xbf_first) != NULL)
+ xfs_bmap_del_free(flist, NULL, head);
+
+ ASSERT(flist->xbf_count == 0);
+}
+
+/*
+ * Inode fork format manipulation functions
+ */
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the file extents are already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ *
+ * The single child block is queued on the cursor's free list and its
+ * buffer invalidated in tp; the inode block count and the quota block
+ * count are each reduced by one. *logflagsp is 0 on any error return and
+ * XFS_ILOG_CORE plus the fork's extent flag on success.
+ */
+STATIC int /* error */
+xfs_bmap_btree_to_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ /* REFERENCED */
+ struct xfs_btree_block *cblock;/* child btree block */
+ xfs_fsblock_t cbno; /* child block number */
+ xfs_buf_t *cbp; /* child block's buffer */
+ int error; /* error return value */
+ xfs_ifork_t *ifp; /* inode fork data */
+ xfs_mount_t *mp; /* mount point structure */
+ __be64 *pp; /* ptr to block address */
+ struct xfs_btree_block *rblock;/* root btree block */
+
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+ /* the root must be a level-1 node with exactly one child */
+ rblock = ifp->if_broot;
+ ASSERT(be16_to_cpu(rblock->bb_level) == 1);
+ ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
+ ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+ cbno = be64_to_cpu(*pp);
+ *logflagsp = 0;
+#ifdef DEBUG
+ if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
+ return error;
+#endif
+ error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ cblock = XFS_BUF_TO_BLOCK(cbp);
+ if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
+ return error;
+ /* give the child block back and account for the block that went away */
+ xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+ ip->i_d.di_nblocks--;
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+ xfs_trans_binval(tp, cbp);
+ /* do not leave the cursor pointing at the invalidated buffer */
+ if (cur->bc_bufs[0] == cbp)
+ cur->bc_bufs[0] = NULL;
+ /* drop the incore root; the asserts below expect it to be gone */
+ xfs_iroot_realloc(ip, -1, whichfork);
+ ASSERT(ifp->if_broot == NULL);
+ ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ return 0;
+}
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ *
+ * On success *curp receives the new cursor, *firstblock the block that was
+ * allocated for the child, and *logflagsp the inode fields to log. On
+ * failure the incore root is released again, the fork is put back into
+ * extents format, the cursor is destroyed, *logflagsp is 0 and the error
+ * is returned.
+ */
+STATIC int /* error */
+xfs_bmap_extents_to_btree(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fsblock_t *firstblock, /* first-block-allocated */
+ xfs_bmap_free_t *flist, /* blocks freed in xaction */
+ xfs_btree_cur_t **curp, /* cursor returned to caller */
+ int wasdel, /* converting a delayed alloc */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_btree_block *ablock; /* allocated (child) bt block */
+ xfs_buf_t *abp; /* buffer for ablock */
+ xfs_alloc_arg_t args; /* allocation arguments */
+ xfs_bmbt_rec_t *arp; /* child record pointer */
+ struct xfs_btree_block *block; /* btree root block */
+ xfs_btree_cur_t *cur; /* bmap btree cursor */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+ int error; /* error return value */
+ xfs_extnum_t i, cnt; /* extent record index */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_bmbt_key_t *kp; /* root block key pointer */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_extnum_t nextents; /* number of file extents */
+ xfs_bmbt_ptr_t *pp; /* root block address pointer */
+
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+
+ /*
+ * Make space in the inode incore.
+ */
+ xfs_iroot_realloc(ip, 1, whichfork);
+ ifp->if_flags |= XFS_IFBROOT;
+
+ /*
+ * Fill in the root.
+ */
+ block = ifp->if_broot;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+ XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
+ XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+ XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
+
+ /*
+ * Need a cursor. Can't allocate until bb_level is filled in.
+ */
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_private.b.firstblock = *firstblock;
+ cur->bc_private.b.flist = flist;
+ cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+ /*
+ * Convert to a btree with two levels, one record in root.
+ */
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = mp;
+ args.firstblock = *firstblock;
+ if (*firstblock == NULLFSBLOCK) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+ } else if (flist->xbf_low) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = *firstblock;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.fsbno = *firstblock;
+ }
+ args.minlen = args.maxlen = args.prod = 1;
+ args.wasdel = wasdel;
+ *logflagsp = 0;
+ if ((error = xfs_alloc_vextent(&args)))
+ goto out_undo_root;
+
+ /*
+ * The space was reserved, so the allocation is not expected to come
+ * back empty. If it does anyway, back out instead of going on to
+ * use a null block number on kernels where ASSERT is compiled out.
+ */
+ ASSERT(args.fsbno != NULLFSBLOCK);
+ if (args.fsbno == NULLFSBLOCK) {
+ error = XFS_ERROR(ENOSPC);
+ goto out_undo_root;
+ }
+ ASSERT(*firstblock == NULLFSBLOCK ||
+ args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+ (flist->xbf_low &&
+ args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+ *firstblock = cur->bc_private.b.firstblock = args.fsbno;
+ cur->bc_private.b.allocated++;
+ ip->i_d.di_nblocks++;
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+ abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+ /*
+ * Fill in the child block.
+ */
+ abp->b_ops = &xfs_bmbt_buf_ops;
+ ablock = XFS_BUF_TO_BLOCK(abp);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+ XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+ XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
+
+ /*
+ * Copy every in-core extent that has a real start block into the
+ * child, converting it to on-disk byte order on the way.
+ */
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ for (cnt = i = 0; i < nextents; i++) {
+ ep = xfs_iext_get_ext(ifp, i);
+ if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
+ arp->l0 = cpu_to_be64(ep->l0);
+ arp->l1 = cpu_to_be64(ep->l1);
+ arp++; cnt++;
+ }
+ }
+ ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
+ xfs_btree_set_numrecs(ablock, cnt);
+
+ /*
+ * Fill in the root key and pointer.
+ */
+ kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+ kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+ be16_to_cpu(block->bb_level)));
+ *pp = cpu_to_be64(args.fsbno);
+
+ /*
+ * Do all this logging at the end so that
+ * the root is at the right level.
+ */
+ xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+ xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+ ASSERT(*curp == NULL);
+ *curp = cur;
+ *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
+ return 0;
+
+out_undo_root:
+ /*
+ * Nothing has been allocated or logged yet. Release the incore root
+ * and put the fork back into extents format, so that the inode is
+ * not left claiming btree format without a root.
+ */
+ xfs_iroot_realloc(ip, -1, whichfork);
+ ASSERT(ifp->if_broot == NULL);
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files,
+ * since the file data needs to get logged so things will stay consistent.
+ * (The bmap-level manipulations are ok, though).
+ *
+ * If the fork holds any inline bytes, one block is allocated, init_fn is
+ * called to initialise the block's buffer from the inline data, and the
+ * fork is replaced by a single extent record for that block. An empty
+ * fork is simply switched over. *logflagsp receives the inode fields to
+ * log; it is 0 if the allocation fails.
+ */
+STATIC int /* error */
+xfs_bmap_local_to_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fsblock_t *firstblock, /* first block allocated in xaction */
+ xfs_extlen_t total, /* total blocks needed by transaction */
+ int *logflagsp, /* inode logging flags */
+ int whichfork,
+ void (*init_fn)(struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp))
+{
+ int error; /* error return value */
+ int flags; /* logging flags returned */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+
+ /*
+ * We don't want to deal with the case of keeping inode data inline yet.
+ * So sending the data fork of a regular inode is invalid.
+ */
+ ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ flags = 0;
+ error = 0;
+ if (ifp->if_bytes) {
+ xfs_alloc_arg_t args; /* allocation arguments */
+ xfs_buf_t *bp; /* buffer for extent block */
+ xfs_bmbt_rec_host_t *ep;/* extent record pointer */
+
+ ASSERT((ifp->if_flags &
+ (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = ip->i_mount;
+ args.firstblock = *firstblock;
+ /*
+ * Allocate a block. We know we need only one, since the
+ * file currently fits in an inode.
+ */
+ if (*firstblock == NULLFSBLOCK) {
+ args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ } else {
+ args.fsbno = *firstblock;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+ args.total = total;
+ args.minlen = args.maxlen = args.prod = 1;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto done;
+
+ /* Can't fail, the space was reserved. */
+ ASSERT(args.fsbno != NULLFSBLOCK);
+ ASSERT(args.len == 1);
+ *firstblock = args.fsbno;
+ bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+
+ /* initialise the block and copy the data */
+ init_fn(tp, bp, ip, ifp);
+
+ /* account for the change in fork size and log everything */
+ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+ xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
+ /* drop the inline data, then add the one extent record */
+ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+ xfs_iext_add(ifp, 0, 1);
+ ep = xfs_iext_get_ext(ifp, 0);
+ xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+ trace_xfs_bmap_post_update(ip, 0,
+ whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
+ _THIS_IP_);
+ XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+ ip->i_d.di_nblocks = 1;
+ xfs_trans_mod_dquot_byino(tp, ip,
+ XFS_TRANS_DQ_BCOUNT, 1L);
+ flags |= xfs_ilog_fext(whichfork);
+ } else {
+ ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+ xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
+ }
+ /* common tail: the fork is now in extents format */
+ ifp->if_flags &= ~XFS_IFINLINE;
+ ifp->if_flags |= XFS_IFEXTENTS;
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ flags |= XFS_ILOG_CORE;
+done:
+ *logflagsp = flags;
+ return error;
+}
+
+/*
* Called from xfs_bmap_add_attrfork to handle btree format files.
*/
STATIC int /* error */
@@ -360,29 +1323,22 @@ xfs_bmap_add_attrfork_extents(
}
/*
- * Block initialisation functions for local to extent format conversion.
- * As these get more complex, they will be moved to the relevant files,
- * but for now they are too simple to worry about.
+ * Block initialisation function for local to extent format conversion.
+ *
+ * This shouldn't actually be called by anyone, so make sure debug kernels cause
+ * a noticeable failure.
*/
STATIC void
xfs_bmap_local_to_extents_init_fn(
+ struct xfs_trans *tp,
struct xfs_buf *bp,
struct xfs_inode *ip,
struct xfs_ifork *ifp)
{
+ ASSERT(0);
bp->b_ops = &xfs_bmbt_buf_ops;
memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
-}
-
-STATIC void
-xfs_symlink_local_to_remote(
- struct xfs_buf *bp,
- struct xfs_inode *ip,
- struct xfs_ifork *ifp)
-{
- /* remote symlink blocks are not verifiable until CRCs come along */
- bp->b_ops = NULL;
- memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
}
/*
@@ -394,8 +1350,7 @@ xfs_symlink_local_to_remote(
*
* XXX (dgc): investigate whether directory conversion can use the generic
* formatting callout. It should be possible - it's just a very complex
- * formatter. it would also require passing the transaction through to the init
- * function.
+ * formatter.
*/
STATIC int /* error */
xfs_bmap_add_attrfork_local(
@@ -432,6 +1387,640 @@ xfs_bmap_add_attrfork_local(
}
/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int /* error code */
+xfs_bmap_add_attrfork(
+ xfs_inode_t *ip, /* incore inode pointer */
+ int size, /* space new attribute needs */
+ int rsvd) /* xact may use reserved blks */
+{
+ xfs_fsblock_t firstblock; /* 1st block/ag allocated */
+ xfs_bmap_free_t flist; /* freed extent records */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_trans_t *tp; /* transaction pointer */
+ int blks; /* space reservation */
+ int version = 1; /* superblock attr version */
+ int committed; /* xaction was committed */
+ int logflags; /* logging flags */
+ int error; /* error return value */
+
+ ASSERT(XFS_IFORK_Q(ip) == 0);
+
+ /* set up the transaction: log and block reservation, then quota */
+ mp = ip->i_mount;
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+ tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+ blks = XFS_ADDAFORK_SPACE_RES(mp);
+ if (rsvd)
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
+ goto error0;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
+ XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+ XFS_QMOPT_RES_REGBLKS);
+ if (error) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+ return error;
+ }
+ /*
+ * Re-check now that the inode is locked. error is 0 at this point,
+ * so if a fork offset is already set the transaction is cancelled
+ * and 0 is returned.
+ */
+ if (XFS_IFORK_Q(ip))
+ goto error1;
+ if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+ /*
+ * For inodes coming from pre-6.2 filesystems.
+ */
+ ASSERT(ip->i_d.di_aformat == 0);
+ ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+ }
+ ASSERT(ip->i_d.di_anextents == 0);
+
+ /*
+ * NOTE(review): the inode is joined with XFS_ILOCK_EXCL, yet the
+ * error1/error2 paths below also call xfs_iunlock() before the
+ * cancel - confirm the lock cannot be dropped twice.
+ */
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ /* pick the attr fork offset (di_forkoff) from the data fork format */
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_DEV:
+ ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+ break;
+ case XFS_DINODE_FMT_UUID:
+ ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+ if (!ip->i_d.di_forkoff)
+ ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+ else if (mp->m_flags & XFS_MOUNT_ATTR2)
+ version = 2;
+ break;
+ default:
+ ASSERT(0);
+ error = XFS_ERROR(EINVAL);
+ goto error1;
+ }
+
+ /* create the empty incore attr fork */
+ ASSERT(ip->i_afp == NULL);
+ ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+ ip->i_afp->if_flags = XFS_IFEXTENTS;
+ logflags = 0;
+ xfs_bmap_init(&flist, &firstblock);
+ /* let the format-specific helper adjust the data fork */
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+ &logflags);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
+ &flist, &logflags);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+ &logflags);
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ /* log whatever the helper changed, even if it also failed */
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (error)
+ goto error2;
+ /*
+ * Turn on the superblock attr / attr2 feature bits if they are not
+ * set yet. The check is repeated under m_sb_lock, and the lock is
+ * dropped before xfs_mod_sb() is called.
+ */
+ if (!xfs_sb_version_hasattr(&mp->m_sb) ||
+ (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
+ __int64_t sbfields = 0;
+
+ spin_lock(&mp->m_sb_lock);
+ if (!xfs_sb_version_hasattr(&mp->m_sb)) {
+ xfs_sb_version_addattr(&mp->m_sb);
+ sbfields |= XFS_SB_VERSIONNUM;
+ }
+ if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
+ xfs_sb_version_addattr2(&mp->m_sb);
+ sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+ }
+ if (sbfields) {
+ spin_unlock(&mp->m_sb_lock);
+ xfs_mod_sb(tp, sbfields);
+ } else
+ spin_unlock(&mp->m_sb_lock);
+ }
+
+ error = xfs_bmap_finish(&tp, &flist, &committed);
+ if (error)
+ goto error2;
+ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ /* unwind in reverse order: free list, inode lock, transaction */
+error2:
+ xfs_bmap_cancel(&flist);
+error1:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+error0:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ return error;
+}
+
+/*
+ * Internal and external extent tree search functions.
+ */
+
+/*
+ * Read in the extents to if_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in. If the file system cannot contain unwritten
+ * extents, the records are checked for no "state" flags.
+ *
+ * The tree is descended along the leftmost pointer of each level and the
+ * leaves are then followed through their right-sibling links. Returns 0,
+ * the error from a failed block read, or EFSCORRUPTED when a block fails
+ * its sanity check, holds more records than the fork has room for, or
+ * fails the extent-flag check.
+ */
+int /* error */
+xfs_bmap_read_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ xfs_fsblock_t bno; /* block # of "block" */
+ xfs_buf_t *bp; /* buffer for "block" */
+ int error; /* error return value */
+ xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
+ xfs_extnum_t i, j; /* index into the extents list */
+ xfs_ifork_t *ifp; /* fork structure */
+ int level; /* btree level, for checking */
+ xfs_mount_t *mp; /* file system mount structure */
+ __be64 *pp; /* pointer to block address */
+ /* REFERENCED */
+ xfs_extnum_t room; /* number of entries there's room for */
+
+ bno = NULLFSBLOCK;
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ /* anything but the data fork is always checked as "no state" */
+ exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+ XFS_EXTFMT_INODE(ip);
+ block = ifp->if_broot;
+ /*
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ */
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+ ASSERT(bno != NULLDFSBNO);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+ /*
+ * Go down the tree until leaf level is reached, following the first
+ * pointer (leftmost) at each level.
+ */
+ while (level-- > 0) {
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ block = XFS_BUF_TO_BLOCK(bp);
+ XFS_WANT_CORRUPTED_GOTO(
+ xfs_bmap_sanity_check(mp, bp, level),
+ error0);
+ /* leaf reached: leave the loop still holding bp */
+ if (level == 0)
+ break;
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ bno = be64_to_cpu(*pp);
+ XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+ xfs_trans_brelse(tp, bp);
+ }
+ /*
+ * Here with bp and block set to the leftmost leaf node in the tree.
+ */
+ room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ i = 0;
+ /*
+ * Loop over all leaf nodes. Copy information to the extent records.
+ */
+ for (;;) {
+ xfs_bmbt_rec_t *frp;
+ xfs_fsblock_t nextbno;
+ xfs_extnum_t num_recs;
+ xfs_extnum_t start;
+
+ num_recs = xfs_btree_get_numrecs(block);
+ /* never copy more records than the caller made room for */
+ if (unlikely(i + num_recs > room)) {
+ ASSERT(i + num_recs <= room);
+ xfs_warn(ip->i_mount,
+ "corrupt dinode %Lu, (btree extents).",
+ (unsigned long long) ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
+ XFS_ERRLEVEL_LOW, ip->i_mount, block);
+ goto error0;
+ }
+ XFS_WANT_CORRUPTED_GOTO(
+ xfs_bmap_sanity_check(mp, bp, 0),
+ error0);
+ /*
+ * Read-ahead the next leaf block, if any.
+ */
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ if (nextbno != NULLFSBLOCK)
+ xfs_btree_reada_bufl(mp, nextbno, 1,
+ &xfs_bmbt_buf_ops);
+ /*
+ * Copy records into the extent records.
+ */
+ frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+ start = i;
+ /* convert each on-disk record to host byte order */
+ for (j = 0; j < num_recs; j++, i++, frp++) {
+ xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
+ trp->l0 = be64_to_cpu(frp->l0);
+ trp->l1 = be64_to_cpu(frp->l1);
+ }
+ if (exntf == XFS_EXTFMT_NOSTATE) {
+ /*
+ * Check all attribute bmap btree records and
+ * any "older" data bmap btree records for a
+ * set bit in the "extent flag" position.
+ */
+ if (unlikely(xfs_check_nostate_extents(ifp,
+ start, num_recs))) {
+ XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
+ XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ goto error0;
+ }
+ }
+ xfs_trans_brelse(tp, bp);
+ bno = nextbno;
+ /*
+ * If we've reached the end, stop.
+ */
+ if (bno == NULLFSBLOCK)
+ break;
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ block = XFS_BUF_TO_BLOCK(bp);
+ }
+ ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+ ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+ XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
+ return 0;
+ /* every jump to error0 is made while bp is still held */
+error0:
+ xfs_trans_brelse(tp, bp);
+ return XFS_ERROR(EFSCORRUPTED);
+}
+
+
+/*
+ * Search the extent records for the entry containing block bno.
+ * If bno lies in a hole, point to the next entry. If bno lies
+ * past eof, *eofp will be set to 1, NULL is returned, and *prevp will
+ * contain the last entry (prevp->br_startoff stays NULLFILEOFF if there
+ * is none); *gotp is then a copy of *prevp when a last entry exists and
+ * otherwise keeps the poison values written at the top of the function.
+ * Else *eofp is cleared and *gotp will contain the found entry.
+ * In every case *lastxp is set to the index reported by
+ * xfs_iext_bno_to_ext().
+ */
+STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+xfs_bmap_search_multi_extents(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_fileoff_t bno, /* block number searched for */
+ int *eofp, /* out: end of file found */
+ xfs_extnum_t *lastxp, /* out: last extent index */
+ xfs_bmbt_irec_t *gotp, /* out: extent entry found */
+ xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
+{
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+ xfs_extnum_t lastx; /* last extent index */
+
+ /*
+ * Fill the output record with recognizable poison patterns so that
+ * use of a field this search never set (br_startblock in
+ * particular) is easy to catch.
+ */
+ gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
+ gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
+ gotp->br_state = XFS_EXT_INVALID;
+#if XFS_BIG_BLKNOS
+ gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
+#else
+ gotp->br_startblock = 0xffffa5a5;
+#endif
+ prevp->br_startoff = NULLFILEOFF; /* "no previous entry" until proven otherwise */
+
+ ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
+ if (lastx > 0) { /* there is an entry before index lastx */
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
+ }
+ if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { /* index is inside the list */
+ xfs_bmbt_get_all(ep, gotp);
+ *eofp = 0;
+ } else { /* index is past the last record: eof */
+ if (lastx > 0) {
+ *gotp = *prevp;
+ }
+ *eofp = 1;
+ ep = NULL;
+ }
+ *lastxp = lastx;
+ return ep;
+}
+
+/*
+ * Search the extents list for the inode, for the extent containing bno.
+ * If bno lies in a hole, point to the next entry. If bno lies past eof,
+ * *eofp will be set, and *prevp will contain the last entry (null if none).
+ * Else, *lastxp will be set to the index of the found
+ * entry; *gotp will contain the entry.
+ *
+ * The lookup itself is done by xfs_bmap_search_multi_extents(). On top of
+ * that, a result whose br_startblock is zero is rejected unless the inode
+ * is a realtime inode and the data fork is being searched: an alert is
+ * logged, *lastxp is set to NULLEXTNUM, *eofp is set to 1 and NULL is
+ * returned.
+ */
+STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+xfs_bmap_search_extents(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fileoff_t bno, /* block number searched for */
+ int fork, /* data or attr fork */
+ int *eofp, /* out: end of file found */
+ xfs_extnum_t *lastxp, /* out: last extent index */
+ xfs_bmbt_irec_t *gotp, /* out: extent entry found */
+ xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
+{
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+
+ XFS_STATS_INC(xs_look_exlist);
+ ifp = XFS_IFORK_PTR(ip, fork);
+
+ ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
+
+ /* Block-zero guard: see the function header for the exact conditions. */
+ if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
+ !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
+ xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+ "Access to block zero in inode %llu "
+ "start_block: %llx start_off: %llx "
+ "blkcnt: %llx extent-state: %x lastx: %x\n",
+ (unsigned long long)ip->i_ino,
+ (unsigned long long)gotp->br_startblock,
+ (unsigned long long)gotp->br_startoff,
+ (unsigned long long)gotp->br_blockcount,
+ gotp->br_state, *lastxp);
+ *lastxp = NULLEXTNUM; /* report "nothing found" to the caller */
+ *eofp = 1;
+ return NULL;
+ }
+ return ep;
+}
+
+/*
+ * Returns the file-relative block number of the first unused block(s)
+ * in the file with at least "len" logically contiguous blocks free.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ * *first_unused is in/out: on entry it is the lowest offset the caller
+ * will accept, on return it is the start of the hole that was chosen
+ * (never below the value passed in).
+ * If the fork is currently local (in-inode), *first_unused is set to 0.
+ * The return value is 0, or the error from xfs_iread_extents().
+ */
+int /* error */
+xfs_bmap_first_unused(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ xfs_extlen_t len, /* size of hole to find */
+ xfs_fileoff_t *first_unused, /* in: lowest acceptable block, out: unused block */
+ int whichfork) /* data or attr fork */
+{
+ int error; /* error return value */
+ int idx; /* extent record index */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_fileoff_t lastaddr; /* last block number seen */
+ xfs_fileoff_t lowest; /* lowest useful block */
+ xfs_fileoff_t max; /* starting useful block */
+ xfs_fileoff_t off; /* offset for this block */
+ xfs_extnum_t nextents; /* number of extent entries */
+
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ *first_unused = 0;
+ return 0;
+ }
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS) && /* extent list not in core yet */
+ (error = xfs_iread_extents(tp, ip, whichfork)))
+ return error;
+ lowest = *first_unused;
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
+ xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
+ off = xfs_bmbt_get_startoff(ep);
+ /*
+ * See if the hole before this extent will work: the extent
+ * must start at least len blocks above both the caller's
+ * lower bound and the current candidate start (max).
+ */
+ if (off >= lowest + len && off - max >= len) {
+ *first_unused = max;
+ return 0;
+ }
+ lastaddr = off + xfs_bmbt_get_blockcount(ep); /* first block after this extent */
+ max = XFS_FILEOFF_MAX(lastaddr, lowest); /* next candidate, clamped to lowest */
+ }
+ *first_unused = max; /* no hole fit: use the position after the last extent */
+ return 0;
+}
+
+/*
+ * Returns the file-relative block number of the last block + 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ * *last_block is in/out: it is left unchanged when block *last_block - 1
+ * is covered by an extent, otherwise it is replaced by the end of the
+ * preceding extent (0 if there is none).
+ * The return value is 0 on success, EIO if the fork format is not btree,
+ * extents or local, or the error from xfs_iread_extents().
+ * NOTE(review): with *last_block == 0 on entry the search offset below
+ * wraps around if xfs_fileoff_t is unsigned -- confirm callers never
+ * pass 0.
+ */
+int /* error */
+xfs_bmap_last_before(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ xfs_fileoff_t *last_block, /* in: upper limit, out: last block + 1 */
+ int whichfork) /* data or attr fork */
+{
+ xfs_fileoff_t bno; /* input file offset */
+ int eof; /* hit end of file */
+ xfs_bmbt_rec_host_t *ep; /* pointer to last extent */
+ int error; /* error return value */
+ xfs_bmbt_irec_t got; /* current extent value */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_extnum_t lastx; /* last extent used */
+ xfs_bmbt_irec_t prev; /* previous extent value */
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+ return XFS_ERROR(EIO);
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ *last_block = 0;
+ return 0;
+ }
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS) && /* extent list not in core yet */
+ (error = xfs_iread_extents(tp, ip, whichfork)))
+ return error;
+ bno = *last_block - 1; /* look up the block just below the limit */
+ ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+ &prev);
+ /* ep is only dereferenced when eof is clear (short-circuit ||). */
+ if (eof || xfs_bmbt_get_startoff(ep) > bno) {
+ if (prev.br_startoff == NULLFILEOFF)
+ *last_block = 0;
+ else
+ *last_block = prev.br_startoff + prev.br_blockcount;
+ }
+ /*
+ * Otherwise *last_block is already the right answer.
+ */
+ return 0;
+}
+
+STATIC int /* error; on success reports the fork's last extent record */
+xfs_bmap_last_extent(
+ struct xfs_trans *tp, /* transaction pointer, passed to xfs_iread_extents() */
+ struct xfs_inode *ip, /* incore inode */
+ int whichfork, /* data or attr fork */
+ struct xfs_bmbt_irec *rec, /* out: last extent, written only if fork is not empty */
+ int *is_empty) /* out: 1 if the fork has no extent records, else 0 */
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ int error;
+ int nextents;
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* extent list not in core yet */
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+ }
+
+ nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ if (nextents == 0) {
+ *is_empty = 1; /* *rec is left untouched in this case */
+ return 0;
+ }
+
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
+ *is_empty = 0;
+ return 0;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ *
+ * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be
+ * at, or past the EOF.
+ * NOTE(review): the rationale above ("at, or past the EOF") reads as if
+ * aeof should be 1 for an empty fork, yet the code leaves it 0 -- confirm
+ * which of the two is intended.
+ *
+ * The function's return value is 0, or the error from
+ * xfs_bmap_last_extent(); bma->aeof is 0 on error.
+ */
+STATIC int
+xfs_bmap_isaeof(
+ struct xfs_bmalloca *bma, /* allocation args; reads ip and offset, writes aeof */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_bmbt_irec rec; /* last extent of the fork */
+ int is_empty; /* fork has no extents */
+ int error;
+
+ bma->aeof = 0;
+ error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
+ &is_empty);
+ if (error || is_empty)
+ return error;
+
+ /*
+ * Check if we are allocating at or past the last extent, or at least
+ * into the last delayed allocated extent.
+ */
+ bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
+ (bma->offset >= rec.br_startoff &&
+ isnullstartblock(rec.br_startblock));
+ return 0;
+}
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary. All offsets are considered outside
+ * the end of file for an empty fork, so 1 is returned in *eof in that case.
+ * The function's return value is 0, or the error from
+ * xfs_bmap_last_extent().
+ */
+int
+xfs_bmap_eof(
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t endoff, /* file offset to test */
+ int whichfork, /* data or attr fork */
+ int *eof) /* out: 1 if endoff is at or past the last extent's end */
+{
+ struct xfs_bmbt_irec rec; /* last extent of the fork */
+ int error;
+
+ /* *eof doubles as the "fork is empty" flag of the helper. */
+ error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
+ if (error || *eof)
+ return error;
+
+ *eof = endoff >= rec.br_startoff + rec.br_blockcount;
+ return 0;
+}
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file. This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ * The block number is returned in *last_block, which is also 0 for an
+ * empty fork and on every error path. The function's return value is 0
+ * on success, EIO if the fork format is not local, btree or extents, or
+ * the error from xfs_bmap_last_extent().
+ */
+int
+xfs_bmap_last_offset(
+ struct xfs_trans *tp, /* transaction pointer (not used in this function) */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t *last_block, /* out: first block past the last extent */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_bmbt_irec rec; /* last extent of the fork */
+ int is_empty; /* fork has no extents */
+ int error;
+
+ *last_block = 0; /* default for local forks, empty forks and errors */
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
+ return 0;
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ return XFS_ERROR(EIO);
+
+ error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
+ if (error || is_empty)
+ return error;
+
+ *last_block = rec.br_startoff + rec.br_blockcount;
+ return 0;
+}
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not. For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ * In non-DEBUG builds the data fork answer comes from the inode size
+ * alone (size == filesystem block size) and the extent list is not
+ * examined; DEBUG builds examine the extent record and assert that
+ * the size agrees.
+ */
+int /* 1=>1 block, 0=>otherwise */
+xfs_bmap_one_block(
+ xfs_inode_t *ip, /* incore inode */
+ int whichfork) /* data or attr fork */
+{
+ xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ int rval; /* return value */
+ xfs_bmbt_irec_t s; /* internal version of extent */
+
+#ifndef DEBUG
+ if (whichfork == XFS_DATA_FORK)
+ return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
+#endif /* !DEBUG */
+ if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
+ return 0;
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ return 0;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ ep = xfs_iext_get_ext(ifp, 0);
+ xfs_bmbt_get_all(ep, &s);
+ rval = s.br_startoff == 0 && s.br_blockcount == 1; /* single extent covering only block 0 */
+ if (rval && whichfork == XFS_DATA_FORK)
+ ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
+ return rval;
+}
+
+/*
+ * Extent tree manipulation functions used during allocation.
+ */
+
+/*
* Convert a delayed allocation to a real allocation.
*/
STATIC int /* error */
@@ -1894,6 +3483,10 @@ done:
}
/*
+ * Functions used in the extent read, allocate and remove paths
+ */
+
+/*
* Adjust the size of the new extent based on di_extsize and rt extsize.
*/
STATIC int
@@ -2666,1628 +4259,6 @@ xfs_bmap_alloc(
}
/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
- */
-STATIC int /* error */
-xfs_bmap_btree_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_btree_cur_t *cur, /* btree cursor */
- int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
-{
- /* REFERENCED */
- struct xfs_btree_block *cblock;/* child btree block */
- xfs_fsblock_t cbno; /* child block number */
- xfs_buf_t *cbp; /* child block's buffer */
- int error; /* error return value */
- xfs_ifork_t *ifp; /* inode fork data */
- xfs_mount_t *mp; /* mount point structure */
- __be64 *pp; /* ptr to block address */
- struct xfs_btree_block *rblock;/* root btree block */
-
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
- rblock = ifp->if_broot;
- ASSERT(be16_to_cpu(rblock->bb_level) == 1);
- ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
- ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
- cbno = be64_to_cpu(*pp);
- *logflagsp = 0;
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
- return error;
-#endif
- error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- return error;
- cblock = XFS_BUF_TO_BLOCK(cbp);
- if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
- return error;
- xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
- ip->i_d.di_nblocks--;
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
- xfs_trans_binval(tp, cbp);
- if (cur->bc_bufs[0] == cbp)
- cur->bc_bufs[0] = NULL;
- xfs_iroot_realloc(ip, -1, whichfork);
- ASSERT(ifp->if_broot == NULL);
- ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
- return 0;
-}
-
-/*
- * Called by xfs_bmapi to update file extent records and the btree
- * after removing space (or undoing a delayed allocation).
- */
-STATIC int /* error */
-xfs_bmap_del_extent(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_trans_t *tp, /* current transaction pointer */
- xfs_extnum_t *idx, /* extent number to update/delete */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- xfs_btree_cur_t *cur, /* if null, not a btree */
- xfs_bmbt_irec_t *del, /* data to remove from extents */
- int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
-{
- xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
- xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
- xfs_fsblock_t del_endblock=0; /* first block past del */
- xfs_fileoff_t del_endoff; /* first offset past del */
- int delay; /* current block is delayed allocated */
- int do_fx; /* free extent at end of routine */
- xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */
- int error; /* error return value */
- int flags; /* inode logging flags */
- xfs_bmbt_irec_t got; /* current extent entry */
- xfs_fileoff_t got_endoff; /* first offset past got */
- int i; /* temp state */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_mount_t *mp; /* mount structure */
- xfs_filblks_t nblks; /* quota/sb block count */
- xfs_bmbt_irec_t new; /* new record to be inserted */
- /* REFERENCED */
- uint qfield; /* quota field to update */
- xfs_filblks_t temp; /* for indirect length calculations */
- xfs_filblks_t temp2; /* for indirect length calculations */
- int state = 0;
-
- XFS_STATS_INC(xs_del_exlist);
-
- if (whichfork == XFS_ATTR_FORK)
- state |= BMAP_ATTRFORK;
-
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t)));
- ASSERT(del->br_blockcount > 0);
- ep = xfs_iext_get_ext(ifp, *idx);
- xfs_bmbt_get_all(ep, &got);
- ASSERT(got.br_startoff <= del->br_startoff);
- del_endoff = del->br_startoff + del->br_blockcount;
- got_endoff = got.br_startoff + got.br_blockcount;
- ASSERT(got_endoff >= del_endoff);
- delay = isnullstartblock(got.br_startblock);
- ASSERT(isnullstartblock(del->br_startblock) == delay);
- flags = 0;
- qfield = 0;
- error = 0;
- /*
- * If deleting a real allocation, must free up the disk space.
- */
- if (!delay) {
- flags = XFS_ILOG_CORE;
- /*
- * Realtime allocation. Free it and record di_nblocks update.
- */
- if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
- xfs_fsblock_t bno;
- xfs_filblks_t len;
-
- ASSERT(do_mod(del->br_blockcount,
- mp->m_sb.sb_rextsize) == 0);
- ASSERT(do_mod(del->br_startblock,
- mp->m_sb.sb_rextsize) == 0);
- bno = del->br_startblock;
- len = del->br_blockcount;
- do_div(bno, mp->m_sb.sb_rextsize);
- do_div(len, mp->m_sb.sb_rextsize);
- error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
- if (error)
- goto done;
- do_fx = 0;
- nblks = len * mp->m_sb.sb_rextsize;
- qfield = XFS_TRANS_DQ_RTBCOUNT;
- }
- /*
- * Ordinary allocation.
- */
- else {
- do_fx = 1;
- nblks = del->br_blockcount;
- qfield = XFS_TRANS_DQ_BCOUNT;
- }
- /*
- * Set up del_endblock and cur for later.
- */
- del_endblock = del->br_startblock + del->br_blockcount;
- if (cur) {
- if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
- got.br_startblock, got.br_blockcount,
- &i)))
- goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- }
- da_old = da_new = 0;
- } else {
- da_old = startblockval(got.br_startblock);
- da_new = 0;
- nblks = 0;
- do_fx = 0;
- }
- /*
- * Set flag value to use in switch statement.
- * Left-contig is 2, right-contig is 1.
- */
- switch (((got.br_startoff == del->br_startoff) << 1) |
- (got_endoff == del_endoff)) {
- case 3:
- /*
- * Matches the whole extent. Delete the entry.
- */
- xfs_iext_remove(ip, *idx, 1,
- whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
- --*idx;
- if (delay)
- break;
-
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
- flags |= XFS_ILOG_CORE;
- if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
- break;
- }
- if ((error = xfs_btree_delete(cur, &i)))
- goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- break;
-
- case 2:
- /*
- * Deleting the first part of the extent.
- */
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_startoff(ep, del_endoff);
- temp = got.br_blockcount - del->br_blockcount;
- xfs_bmbt_set_blockcount(ep, temp);
- if (delay) {
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- da_old);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- da_new = temp;
- break;
- }
- xfs_bmbt_set_startblock(ep, del_endblock);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
- break;
- }
- if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
- got.br_blockcount - del->br_blockcount,
- got.br_state)))
- goto done;
- break;
-
- case 1:
- /*
- * Deleting the last part of the extent.
- */
- temp = got.br_blockcount - del->br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
- if (delay) {
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- da_old);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- da_new = temp;
- break;
- }
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
- break;
- }
- if ((error = xfs_bmbt_update(cur, got.br_startoff,
- got.br_startblock,
- got.br_blockcount - del->br_blockcount,
- got.br_state)))
- goto done;
- break;
-
- case 0:
- /*
- * Deleting the middle of the extent.
- */
- temp = del->br_startoff - got.br_startoff;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
- new.br_startoff = del_endoff;
- temp2 = got_endoff - del_endoff;
- new.br_blockcount = temp2;
- new.br_state = got.br_state;
- if (!delay) {
- new.br_startblock = del_endblock;
- flags |= XFS_ILOG_CORE;
- if (cur) {
- if ((error = xfs_bmbt_update(cur,
- got.br_startoff,
- got.br_startblock, temp,
- got.br_state)))
- goto done;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto done;
- cur->bc_rec.b = new;
- error = xfs_btree_insert(cur, &i);
- if (error && error != ENOSPC)
- goto done;
- /*
- * If get no-space back from btree insert,
- * it tried a split, and we have a zero
- * block reservation.
- * Fix up our state and return the error.
- */
- if (error == ENOSPC) {
- /*
- * Reset the cursor, don't trust
- * it after any insert operation.
- */
- if ((error = xfs_bmbt_lookup_eq(cur,
- got.br_startoff,
- got.br_startblock,
- temp, &i)))
- goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- /*
- * Update the btree record back
- * to the original value.
- */
- if ((error = xfs_bmbt_update(cur,
- got.br_startoff,
- got.br_startblock,
- got.br_blockcount,
- got.br_state)))
- goto done;
- /*
- * Reset the extent record back
- * to the original value.
- */
- xfs_bmbt_set_blockcount(ep,
- got.br_blockcount);
- flags = 0;
- error = XFS_ERROR(ENOSPC);
- goto done;
- }
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- } else
- flags |= xfs_ilog_fext(whichfork);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
- } else {
- ASSERT(whichfork == XFS_DATA_FORK);
- temp = xfs_bmap_worst_indlen(ip, temp);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- temp2 = xfs_bmap_worst_indlen(ip, temp2);
- new.br_startblock = nullstartblock((int)temp2);
- da_new = temp + temp2;
- while (da_new > da_old) {
- if (temp) {
- temp--;
- da_new--;
- xfs_bmbt_set_startblock(ep,
- nullstartblock((int)temp));
- }
- if (da_new == da_old)
- break;
- if (temp2) {
- temp2--;
- da_new--;
- new.br_startblock =
- nullstartblock((int)temp2);
- }
- }
- }
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_insert(ip, *idx + 1, 1, &new, state);
- ++*idx;
- break;
- }
- /*
- * If we need to, add to list of extents to delete.
- */
- if (do_fx)
- xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
- mp);
- /*
- * Adjust inode # blocks in the file.
- */
- if (nblks)
- ip->i_d.di_nblocks -= nblks;
- /*
- * Adjust quota data.
- */
- if (qfield)
- xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
-
- /*
- * Account for change in delayed indirect blocks.
- * Nothing to do for disk quota accounting here.
- */
- ASSERT(da_old >= da_new);
- if (da_old > da_new) {
- xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
- (int64_t)(da_old - da_new), 0);
- }
-done:
- *logflagsp = flags;
- return error;
-}
-
-/*
- * Remove the entry "free" from the free item list. Prev points to the
- * previous entry, unless "free" is the head of the list.
- */
-STATIC void
-xfs_bmap_del_free(
- xfs_bmap_free_t *flist, /* free item list header */
- xfs_bmap_free_item_t *prev, /* previous item on list, if any */
- xfs_bmap_free_item_t *free) /* list item to be freed */
-{
- if (prev)
- prev->xbfi_next = free->xbfi_next;
- else
- flist->xbf_first = free->xbfi_next;
- flist->xbf_count--;
- kmem_zone_free(xfs_bmap_free_item_zone, free);
-}
-
-/*
- * Convert an extents-format file into a btree-format file.
- * The new file will have a root block (in the inode) and a single child block.
- */
-STATIC int /* error */
-xfs_bmap_extents_to_btree(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first-block-allocated */
- xfs_bmap_free_t *flist, /* blocks freed in xaction */
- xfs_btree_cur_t **curp, /* cursor returned to caller */
- int wasdel, /* converting a delayed alloc */
- int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
-{
- struct xfs_btree_block *ablock; /* allocated (child) bt block */
- xfs_buf_t *abp; /* buffer for ablock */
- xfs_alloc_arg_t args; /* allocation arguments */
- xfs_bmbt_rec_t *arp; /* child record pointer */
- struct xfs_btree_block *block; /* btree root block */
- xfs_btree_cur_t *cur; /* bmap btree cursor */
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- int error; /* error return value */
- xfs_extnum_t i, cnt; /* extent record index */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_bmbt_key_t *kp; /* root block key pointer */
- xfs_mount_t *mp; /* mount structure */
- xfs_extnum_t nextents; /* number of file extents */
- xfs_bmbt_ptr_t *pp; /* root block address pointer */
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
-
- /*
- * Make space in the inode incore.
- */
- xfs_iroot_realloc(ip, 1, whichfork);
- ifp->if_flags |= XFS_IFBROOT;
-
- /*
- * Fill in the root.
- */
- block = ifp->if_broot;
- block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
- block->bb_level = cpu_to_be16(1);
- block->bb_numrecs = cpu_to_be16(1);
- block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
- block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
-
- /*
- * Need a cursor. Can't allocate until bb_level is filled in.
- */
- mp = ip->i_mount;
- cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.firstblock = *firstblock;
- cur->bc_private.b.flist = flist;
- cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
- /*
- * Convert to a btree with two levels, one record in root.
- */
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
- memset(&args, 0, sizeof(args));
- args.tp = tp;
- args.mp = mp;
- args.firstblock = *firstblock;
- if (*firstblock == NULLFSBLOCK) {
- args.type = XFS_ALLOCTYPE_START_BNO;
- args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
- } else if (flist->xbf_low) {
- args.type = XFS_ALLOCTYPE_START_BNO;
- args.fsbno = *firstblock;
- } else {
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.fsbno = *firstblock;
- }
- args.minlen = args.maxlen = args.prod = 1;
- args.wasdel = wasdel;
- *logflagsp = 0;
- if ((error = xfs_alloc_vextent(&args))) {
- xfs_iroot_realloc(ip, -1, whichfork);
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- return error;
- }
- /*
- * Allocation can't fail, the space was reserved.
- */
- ASSERT(args.fsbno != NULLFSBLOCK);
- ASSERT(*firstblock == NULLFSBLOCK ||
- args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
- (flist->xbf_low &&
- args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
- *firstblock = cur->bc_private.b.firstblock = args.fsbno;
- cur->bc_private.b.allocated++;
- ip->i_d.di_nblocks++;
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
- abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
- /*
- * Fill in the child block.
- */
- abp->b_ops = &xfs_bmbt_buf_ops;
- ablock = XFS_BUF_TO_BLOCK(abp);
- ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
- ablock->bb_level = 0;
- ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
- ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
- arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- for (cnt = i = 0; i < nextents; i++) {
- ep = xfs_iext_get_ext(ifp, i);
- if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
- arp->l0 = cpu_to_be64(ep->l0);
- arp->l1 = cpu_to_be64(ep->l1);
- arp++; cnt++;
- }
- }
- ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
- xfs_btree_set_numrecs(ablock, cnt);
-
- /*
- * Fill in the root key and pointer.
- */
- kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
- arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
- kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
- be16_to_cpu(block->bb_level)));
- *pp = cpu_to_be64(args.fsbno);
-
- /*
- * Do all this logging at the end so that
- * the root is at the right level.
- */
- xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
- xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
- ASSERT(*curp == NULL);
- *curp = cur;
- *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
- return 0;
-}
-
-/*
- * Calculate the default attribute fork offset for newly created inodes.
- */
-uint
-xfs_default_attroffset(
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = ip->i_mount;
- uint offset;
-
- if (mp->m_sb.sb_inodesize == 256) {
- offset = XFS_LITINO(mp) -
- XFS_BMDR_SPACE_CALC(MINABTPTRS);
- } else {
- offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
- }
-
- ASSERT(offset < XFS_LITINO(mp));
- return offset;
-}
-
-/*
- * Helper routine to reset inode di_forkoff field when switching
- * attribute fork from local to extent format - we reset it where
- * possible to make space available for inline data fork extents.
- */
-STATIC void
-xfs_bmap_forkoff_reset(
- xfs_mount_t *mp,
- xfs_inode_t *ip,
- int whichfork)
-{
- if (whichfork == XFS_ATTR_FORK &&
- ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
- ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
- ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
- uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
-
- if (dfl_forkoff > ip->i_d.di_forkoff)
- ip->i_d.di_forkoff = dfl_forkoff;
- }
-}
-
-/*
- * Convert a local file to an extents file.
- * This code is out of bounds for data forks of regular files,
- * since the file data needs to get logged so things will stay consistent.
- * (The bmap-level manipulations are ok, though).
- */
-STATIC int /* error */
-xfs_bmap_local_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first block allocated in xaction */
- xfs_extlen_t total, /* total blocks needed by transaction */
- int *logflagsp, /* inode logging flags */
- int whichfork,
- void (*init_fn)(struct xfs_buf *bp,
- struct xfs_inode *ip,
- struct xfs_ifork *ifp))
-{
- int error; /* error return value */
- int flags; /* logging flags returned */
- xfs_ifork_t *ifp; /* inode fork pointer */
-
- /*
- * We don't want to deal with the case of keeping inode data inline yet.
- * So sending the data fork of a regular inode is invalid.
- */
- ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
- flags = 0;
- error = 0;
- if (ifp->if_bytes) {
- xfs_alloc_arg_t args; /* allocation arguments */
- xfs_buf_t *bp; /* buffer for extent block */
- xfs_bmbt_rec_host_t *ep;/* extent record pointer */
-
- ASSERT((ifp->if_flags &
- (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
- memset(&args, 0, sizeof(args));
- args.tp = tp;
- args.mp = ip->i_mount;
- args.firstblock = *firstblock;
- /*
- * Allocate a block. We know we need only one, since the
- * file currently fits in an inode.
- */
- if (*firstblock == NULLFSBLOCK) {
- args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
- args.type = XFS_ALLOCTYPE_START_BNO;
- } else {
- args.fsbno = *firstblock;
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- }
- args.total = total;
- args.minlen = args.maxlen = args.prod = 1;
- error = xfs_alloc_vextent(&args);
- if (error)
- goto done;
-
- /* Can't fail, the space was reserved. */
- ASSERT(args.fsbno != NULLFSBLOCK);
- ASSERT(args.len == 1);
- *firstblock = args.fsbno;
- bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
-
- /* initialise the block and copy the data */
- init_fn(bp, ip, ifp);
-
- /* account for the change in fork size and log everything */
- xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
- xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
- xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
- xfs_iext_add(ifp, 0, 1);
- ep = xfs_iext_get_ext(ifp, 0);
- xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
- trace_xfs_bmap_post_update(ip, 0,
- whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
- _THIS_IP_);
- XFS_IFORK_NEXT_SET(ip, whichfork, 1);
- ip->i_d.di_nblocks = 1;
- xfs_trans_mod_dquot_byino(tp, ip,
- XFS_TRANS_DQ_BCOUNT, 1L);
- flags |= xfs_ilog_fext(whichfork);
- } else {
- ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
- xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
- }
- ifp->if_flags &= ~XFS_IFINLINE;
- ifp->if_flags |= XFS_IFEXTENTS;
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- flags |= XFS_ILOG_CORE;
-done:
- *logflagsp = flags;
- return error;
-}
-
-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry. If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none). Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
-xfs_bmap_search_multi_extents(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_fileoff_t bno, /* block number searched for */
- int *eofp, /* out: end of file found */
- xfs_extnum_t *lastxp, /* out: last extent index */
- xfs_bmbt_irec_t *gotp, /* out: extent entry found */
- xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
-{
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- xfs_extnum_t lastx; /* last extent index */
-
- /*
- * Initialize the extent entry structure to catch access to
- * uninitialized br_startblock field.
- */
- gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
- gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
- gotp->br_state = XFS_EXT_INVALID;
-#if XFS_BIG_BLKNOS
- gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
-#else
- gotp->br_startblock = 0xffffa5a5;
-#endif
- prevp->br_startoff = NULLFILEOFF;
-
- ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
- if (lastx > 0) {
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
- }
- if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
- xfs_bmbt_get_all(ep, gotp);
- *eofp = 0;
- } else {
- if (lastx > 0) {
- *gotp = *prevp;
- }
- *eofp = 1;
- ep = NULL;
- }
- *lastxp = lastx;
- return ep;
-}
-
-/*
- * Search the extents list for the inode, for the extent containing bno.
- * If bno lies in a hole, point to the next entry. If bno lies past eof,
- * *eofp will be set, and *prevp will contain the last entry (null if none).
- * Else, *lastxp will be set to the index of the found
- * entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
-xfs_bmap_search_extents(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t bno, /* block number searched for */
- int fork, /* data or attr fork */
- int *eofp, /* out: end of file found */
- xfs_extnum_t *lastxp, /* out: last extent index */
- xfs_bmbt_irec_t *gotp, /* out: extent entry found */
- xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
-{
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
-
- XFS_STATS_INC(xs_look_exlist);
- ifp = XFS_IFORK_PTR(ip, fork);
-
- ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
-
- if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
- !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
- xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
- "Access to block zero in inode %llu "
- "start_block: %llx start_off: %llx "
- "blkcnt: %llx extent-state: %x lastx: %x\n",
- (unsigned long long)ip->i_ino,
- (unsigned long long)gotp->br_startblock,
- (unsigned long long)gotp->br_startoff,
- (unsigned long long)gotp->br_blockcount,
- gotp->br_state, *lastxp);
- *lastxp = NULLEXTNUM;
- *eofp = 1;
- return NULL;
- }
- return ep;
-}
-
-/*
- * Compute the worst-case number of indirect blocks that will be used
- * for ip's delayed extent of length "len".
- */
-STATIC xfs_filblks_t
-xfs_bmap_worst_indlen(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_filblks_t len) /* delayed extent length */
-{
- int level; /* btree level number */
- int maxrecs; /* maximum record count at this level */
- xfs_mount_t *mp; /* mount structure */
- xfs_filblks_t rval; /* return value */
-
- mp = ip->i_mount;
- maxrecs = mp->m_bmap_dmxr[0];
- for (level = 0, rval = 0;
- level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
- level++) {
- len += maxrecs - 1;
- do_div(len, maxrecs);
- rval += len;
- if (len == 1)
- return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
- level - 1;
- if (level == 0)
- maxrecs = mp->m_bmap_dmxr[1];
- }
- return rval;
-}
-
-/*
- * Convert inode from non-attributed to attributed.
- * Must not be in a transaction, ip must not be locked.
- */
-int /* error code */
-xfs_bmap_add_attrfork(
- xfs_inode_t *ip, /* incore inode pointer */
- int size, /* space new attribute needs */
- int rsvd) /* xact may use reserved blks */
-{
- xfs_fsblock_t firstblock; /* 1st block/ag allocated */
- xfs_bmap_free_t flist; /* freed extent records */
- xfs_mount_t *mp; /* mount structure */
- xfs_trans_t *tp; /* transaction pointer */
- int blks; /* space reservation */
- int version = 1; /* superblock attr version */
- int committed; /* xaction was committed */
- int logflags; /* logging flags */
- int error; /* error return value */
-
- ASSERT(XFS_IFORK_Q(ip) == 0);
-
- mp = ip->i_mount;
- ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
- tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
- blks = XFS_ADDAFORK_SPACE_RES(mp);
- if (rsvd)
- tp->t_flags |= XFS_TRANS_RESERVE;
- if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
- goto error0;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
- XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
- XFS_QMOPT_RES_REGBLKS);
- if (error) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
- return error;
- }
- if (XFS_IFORK_Q(ip))
- goto error1;
- if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
- /*
- * For inodes coming from pre-6.2 filesystems.
- */
- ASSERT(ip->i_d.di_aformat == 0);
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
- }
- ASSERT(ip->i_d.di_anextents == 0);
-
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- switch (ip->i_d.di_format) {
- case XFS_DINODE_FMT_DEV:
- ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
- break;
- case XFS_DINODE_FMT_UUID:
- ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
- break;
- case XFS_DINODE_FMT_LOCAL:
- case XFS_DINODE_FMT_EXTENTS:
- case XFS_DINODE_FMT_BTREE:
- ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
- if (!ip->i_d.di_forkoff)
- ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
- else if (mp->m_flags & XFS_MOUNT_ATTR2)
- version = 2;
- break;
- default:
- ASSERT(0);
- error = XFS_ERROR(EINVAL);
- goto error1;
- }
-
- ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
- ip->i_afp->if_flags = XFS_IFEXTENTS;
- logflags = 0;
- xfs_bmap_init(&flist, &firstblock);
- switch (ip->i_d.di_format) {
- case XFS_DINODE_FMT_LOCAL:
- error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
- &logflags);
- break;
- case XFS_DINODE_FMT_EXTENTS:
- error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
- &flist, &logflags);
- break;
- case XFS_DINODE_FMT_BTREE:
- error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
- &logflags);
- break;
- default:
- error = 0;
- break;
- }
- if (logflags)
- xfs_trans_log_inode(tp, ip, logflags);
- if (error)
- goto error2;
- if (!xfs_sb_version_hasattr(&mp->m_sb) ||
- (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
- __int64_t sbfields = 0;
-
- spin_lock(&mp->m_sb_lock);
- if (!xfs_sb_version_hasattr(&mp->m_sb)) {
- xfs_sb_version_addattr(&mp->m_sb);
- sbfields |= XFS_SB_VERSIONNUM;
- }
- if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
- xfs_sb_version_addattr2(&mp->m_sb);
- sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
- }
- if (sbfields) {
- spin_unlock(&mp->m_sb_lock);
- xfs_mod_sb(tp, sbfields);
- } else
- spin_unlock(&mp->m_sb_lock);
- }
-
- error = xfs_bmap_finish(&tp, &flist, &committed);
- if (error)
- goto error2;
- return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-error2:
- xfs_bmap_cancel(&flist);
-error1:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-error0:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- return error;
-}
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-/* ARGSUSED */
-void
-xfs_bmap_add_free(
- xfs_fsblock_t bno, /* fs block number of extent */
- xfs_filblks_t len, /* length of extent */
- xfs_bmap_free_t *flist, /* list of extents */
- xfs_mount_t *mp) /* mount point structure */
-{
- xfs_bmap_free_item_t *cur; /* current (next) element */
- xfs_bmap_free_item_t *new; /* new element */
- xfs_bmap_free_item_t *prev; /* previous element */
-#ifdef DEBUG
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
-
- ASSERT(bno != NULLFSBLOCK);
- ASSERT(len > 0);
- ASSERT(len <= MAXEXTLEN);
- ASSERT(!isnullstartblock(bno));
- agno = XFS_FSB_TO_AGNO(mp, bno);
- agbno = XFS_FSB_TO_AGBNO(mp, bno);
- ASSERT(agno < mp->m_sb.sb_agcount);
- ASSERT(agbno < mp->m_sb.sb_agblocks);
- ASSERT(len < mp->m_sb.sb_agblocks);
- ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
- ASSERT(xfs_bmap_free_item_zone != NULL);
- new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
- new->xbfi_startblock = bno;
- new->xbfi_blockcount = (xfs_extlen_t)len;
- for (prev = NULL, cur = flist->xbf_first;
- cur != NULL;
- prev = cur, cur = cur->xbfi_next) {
- if (cur->xbfi_startblock >= bno)
- break;
- }
- if (prev)
- prev->xbfi_next = new;
- else
- flist->xbf_first = new;
- new->xbfi_next = cur;
- flist->xbf_count++;
-}
-
-/*
- * Compute and fill in the value of the maximum depth of a bmap btree
- * in this filesystem. Done once, during mount.
- */
-void
-xfs_bmap_compute_maxlevels(
- xfs_mount_t *mp, /* file system mount structure */
- int whichfork) /* data or attr fork */
-{
- int level; /* btree level */
- uint maxblocks; /* max blocks at this level */
- uint maxleafents; /* max leaf entries possible */
- int maxrootrecs; /* max records in root block */
- int minleafrecs; /* min records in leaf block */
- int minnoderecs; /* min records in node block */
- int sz; /* root block size */
-
- /*
- * The maximum number of extents in a file, hence the maximum
- * number of leaf entries, is controlled by the type of di_nextents
- * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
- * (a signed 16-bit number, xfs_aextnum_t).
- *
- * Note that we can no longer assume that if we are in ATTR1 that
- * the fork offset of all the inodes will be
- * (xfs_default_attroffset(ip) >> 3) because we could have mounted
- * with ATTR2 and then mounted back with ATTR1, keeping the
- * di_forkoff's fixed but probably at various positions. Therefore,
- * for both ATTR1 and ATTR2 we have to assume the worst case scenario
- * of a minimum size available.
- */
- if (whichfork == XFS_DATA_FORK) {
- maxleafents = MAXEXTNUM;
- sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
- } else {
- maxleafents = MAXAEXTNUM;
- sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
- }
- maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
- minleafrecs = mp->m_bmap_dmnr[0];
- minnoderecs = mp->m_bmap_dmnr[1];
- maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
- for (level = 1; maxblocks > 1; level++) {
- if (maxblocks <= maxrootrecs)
- maxblocks = 1;
- else
- maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
- }
- mp->m_bm_maxlevels[whichfork] = level;
-}
-
-/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller. Frees all the extents that need freeing, which must be done
- * last due to locking considerations. We never free any extents in
- * the first transaction.
- *
- * Return 1 if the given transaction was committed and a new one
- * started, and 0 otherwise in the committed parameter.
- */
-int /* error */
-xfs_bmap_finish(
- xfs_trans_t **tp, /* transaction pointer addr */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
- int *committed) /* xact committed or not */
-{
- xfs_efd_log_item_t *efd; /* extent free data */
- xfs_efi_log_item_t *efi; /* extent free intention */
- int error; /* error return value */
- xfs_bmap_free_item_t *free; /* free extent item */
- unsigned int logres; /* new log reservation */
- unsigned int logcount; /* new log count */
- xfs_mount_t *mp; /* filesystem mount structure */
- xfs_bmap_free_item_t *next; /* next item on free list */
- xfs_trans_t *ntp; /* new transaction pointer */
-
- ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
- if (flist->xbf_count == 0) {
- *committed = 0;
- return 0;
- }
- ntp = *tp;
- efi = xfs_trans_get_efi(ntp, flist->xbf_count);
- for (free = flist->xbf_first; free; free = free->xbfi_next)
- xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
- free->xbfi_blockcount);
- logres = ntp->t_log_res;
- logcount = ntp->t_log_count;
- ntp = xfs_trans_dup(*tp);
- error = xfs_trans_commit(*tp, 0);
- *tp = ntp;
- *committed = 1;
- /*
- * We have a new transaction, so we should return committed=1,
- * even though we're returning an error.
- */
- if (error)
- return error;
-
- /*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(ntp->t_ticket);
-
- if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
- logcount)))
- return error;
- efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
- for (free = flist->xbf_first; free != NULL; free = next) {
- next = free->xbfi_next;
- if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
- free->xbfi_blockcount))) {
- /*
- * The bmap free list will be cleaned up at a
- * higher level. The EFI will be canceled when
- * this transaction is aborted.
- * Need to force shutdown here to make sure it
- * happens, since this transaction may not be
- * dirty yet.
- */
- mp = ntp->t_mountp;
- if (!XFS_FORCED_SHUTDOWN(mp))
- xfs_force_shutdown(mp,
- (error == EFSCORRUPTED) ?
- SHUTDOWN_CORRUPT_INCORE :
- SHUTDOWN_META_IO_ERROR);
- return error;
- }
- xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
- free->xbfi_blockcount);
- xfs_bmap_del_free(flist, NULL, free);
- }
- return 0;
-}
-
-/*
- * Free up any items left in the list.
- */
-void
-xfs_bmap_cancel(
- xfs_bmap_free_t *flist) /* list of bmap_free_items */
-{
- xfs_bmap_free_item_t *free; /* free list item */
- xfs_bmap_free_item_t *next;
-
- if (flist->xbf_count == 0)
- return;
- ASSERT(flist->xbf_first != NULL);
- for (free = flist->xbf_first; free; free = next) {
- next = free->xbfi_next;
- xfs_bmap_del_free(flist, NULL, free);
- }
- ASSERT(flist->xbf_count == 0);
-}
-
-/*
- * Returns the file-relative block number of the first unused block(s)
- * in the file with at least "len" logically contiguous blocks free.
- * This is the lowest-address hole if the file has holes, else the first block
- * past the end of file.
- * Return 0 if the file is currently local (in-inode).
- */
-int /* error */
-xfs_bmap_first_unused(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_extlen_t len, /* size of hole to find */
- xfs_fileoff_t *first_unused, /* unused block */
- int whichfork) /* data or attr fork */
-{
- int error; /* error return value */
- int idx; /* extent record index */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_fileoff_t lastaddr; /* last block number seen */
- xfs_fileoff_t lowest; /* lowest useful block */
- xfs_fileoff_t max; /* starting useful block */
- xfs_fileoff_t off; /* offset for this block */
- xfs_extnum_t nextents; /* number of extent entries */
-
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- *first_unused = 0;
- return 0;
- }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- return error;
- lowest = *first_unused;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
- off = xfs_bmbt_get_startoff(ep);
- /*
- * See if the hole before this extent will work.
- */
- if (off >= lowest + len && off - max >= len) {
- *first_unused = max;
- return 0;
- }
- lastaddr = off + xfs_bmbt_get_blockcount(ep);
- max = XFS_FILEOFF_MAX(lastaddr, lowest);
- }
- *first_unused = max;
- return 0;
-}
-
-/*
- * Returns the file-relative block number of the last block + 1 before
- * last_block (input value) in the file.
- * This is not based on i_size, it is based on the extent records.
- * Returns 0 for local files, as they do not have extent records.
- */
-int /* error */
-xfs_bmap_last_before(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_fileoff_t *last_block, /* last block */
- int whichfork) /* data or attr fork */
-{
- xfs_fileoff_t bno; /* input file offset */
- int eof; /* hit end of file */
- xfs_bmbt_rec_host_t *ep; /* pointer to last extent */
- int error; /* error return value */
- xfs_bmbt_irec_t got; /* current extent value */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t lastx; /* last extent used */
- xfs_bmbt_irec_t prev; /* previous extent value */
-
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
- return XFS_ERROR(EIO);
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- *last_block = 0;
- return 0;
- }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- return error;
- bno = *last_block - 1;
- ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
- &prev);
- if (eof || xfs_bmbt_get_startoff(ep) > bno) {
- if (prev.br_startoff == NULLFILEOFF)
- *last_block = 0;
- else
- *last_block = prev.br_startoff + prev.br_blockcount;
- }
- /*
- * Otherwise *last_block is already the right answer.
- */
- return 0;
-}
-
-STATIC int
-xfs_bmap_last_extent(
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- int whichfork,
- struct xfs_bmbt_irec *rec,
- int *is_empty)
-{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- int error;
- int nextents;
-
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(tp, ip, whichfork);
- if (error)
- return error;
- }
-
- nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
- if (nextents == 0) {
- *is_empty = 1;
- return 0;
- }
-
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
- *is_empty = 0;
- return 0;
-}
-
-/*
- * Check the last inode extent to determine whether this allocation will result
- * in blocks being allocated at the end of the file. When we allocate new data
- * blocks at the end of the file which do not start at the previous data block,
- * we will try to align the new blocks at stripe unit boundaries.
- *
- * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be
- * at, or past the EOF.
- */
-STATIC int
-xfs_bmap_isaeof(
- struct xfs_bmalloca *bma,
- int whichfork)
-{
- struct xfs_bmbt_irec rec;
- int is_empty;
- int error;
-
- bma->aeof = 0;
- error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
- &is_empty);
- if (error || is_empty)
- return error;
-
- /*
- * Check if we are allocation or past the last extent, or at least into
- * the last delayed allocated extent.
- */
- bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
- (bma->offset >= rec.br_startoff &&
- isnullstartblock(rec.br_startblock));
- return 0;
-}
-
-/*
- * Check if the endoff is outside the last extent. If so the caller will grow
- * the allocation to a stripe unit boundary. All offsets are considered outside
- * the end of file for an empty fork, so 1 is returned in *eof in that case.
- */
-int
-xfs_bmap_eof(
- struct xfs_inode *ip,
- xfs_fileoff_t endoff,
- int whichfork,
- int *eof)
-{
- struct xfs_bmbt_irec rec;
- int error;
-
- error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
- if (error || *eof)
- return error;
-
- *eof = endoff >= rec.br_startoff + rec.br_blockcount;
- return 0;
-}
-
-/*
- * Returns the file-relative block number of the first block past eof in
- * the file. This is not based on i_size, it is based on the extent records.
- * Returns 0 for local files, as they do not have extent records.
- */
-int
-xfs_bmap_last_offset(
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- xfs_fileoff_t *last_block,
- int whichfork)
-{
- struct xfs_bmbt_irec rec;
- int is_empty;
- int error;
-
- *last_block = 0;
-
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
- return 0;
-
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- return XFS_ERROR(EIO);
-
- error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
- if (error || is_empty)
- return error;
-
- *last_block = rec.br_startoff + rec.br_blockcount;
- return 0;
-}
-
-/*
- * Returns whether the selected fork of the inode has exactly one
- * block or not. For the data fork we check this matches di_size,
- * implying the file's range is 0..bsize-1.
- */
-int /* 1=>1 block, 0=>otherwise */
-xfs_bmap_one_block(
- xfs_inode_t *ip, /* incore inode */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */
- xfs_ifork_t *ifp; /* inode fork pointer */
- int rval; /* return value */
- xfs_bmbt_irec_t s; /* internal version of extent */
-
-#ifndef DEBUG
- if (whichfork == XFS_DATA_FORK)
- return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
-#endif /* !DEBUG */
- if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
- return 0;
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- return 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- ep = xfs_iext_get_ext(ifp, 0);
- xfs_bmbt_get_all(ep, &s);
- rval = s.br_startoff == 0 && s.br_blockcount == 1;
- if (rval && whichfork == XFS_DATA_FORK)
- ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
- return rval;
-}
-
-STATIC int
-xfs_bmap_sanity_check(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- int level)
-{
- struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
-
- if (block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) ||
- be16_to_cpu(block->bb_level) != level ||
- be16_to_cpu(block->bb_numrecs) == 0 ||
- be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
- return 0;
- return 1;
-}
-
-/*
- * Read in the extents to if_extents.
- * All inode fields are set up by caller, we just traverse the btree
- * and copy the records in. If the file system cannot contain unwritten
- * extents, the records are checked for no "state" flags.
- */
-int /* error */
-xfs_bmap_read_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- int whichfork) /* data or attr fork */
-{
- struct xfs_btree_block *block; /* current btree block */
- xfs_fsblock_t bno; /* block # of "block" */
- xfs_buf_t *bp; /* buffer for "block" */
- int error; /* error return value */
- xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
- xfs_extnum_t i, j; /* index into the extents list */
- xfs_ifork_t *ifp; /* fork structure */
- int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
- __be64 *pp; /* pointer to block address */
- /* REFERENCED */
- xfs_extnum_t room; /* number of entries there's room for */
-
- bno = NULLFSBLOCK;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
- XFS_EXTFMT_INODE(ip);
- block = ifp->if_broot;
- /*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
- */
- level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
- bno = be64_to_cpu(*pp);
- ASSERT(bno != NULLDFSBNO);
- ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
- /*
- * Go down the tree until leaf level is reached, following the first
- * pointer (leftmost) at each level.
- */
- while (level-- > 0) {
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
- if (error)
- return error;
- block = XFS_BUF_TO_BLOCK(bp);
- XFS_WANT_CORRUPTED_GOTO(
- xfs_bmap_sanity_check(mp, bp, level),
- error0);
- if (level == 0)
- break;
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
- bno = be64_to_cpu(*pp);
- XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
- xfs_trans_brelse(tp, bp);
- }
- /*
- * Here with bp and block set to the leftmost leaf node in the tree.
- */
- room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- i = 0;
- /*
- * Loop over all leaf nodes. Copy information to the extent records.
- */
- for (;;) {
- xfs_bmbt_rec_t *frp;
- xfs_fsblock_t nextbno;
- xfs_extnum_t num_recs;
- xfs_extnum_t start;
-
- num_recs = xfs_btree_get_numrecs(block);
- if (unlikely(i + num_recs > room)) {
- ASSERT(i + num_recs <= room);
- xfs_warn(ip->i_mount,
- "corrupt dinode %Lu, (btree extents).",
- (unsigned long long) ip->i_ino);
- XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
- XFS_ERRLEVEL_LOW, ip->i_mount, block);
- goto error0;
- }
- XFS_WANT_CORRUPTED_GOTO(
- xfs_bmap_sanity_check(mp, bp, 0),
- error0);
- /*
- * Read-ahead the next leaf block, if any.
- */
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
- if (nextbno != NULLFSBLOCK)
- xfs_btree_reada_bufl(mp, nextbno, 1,
- &xfs_bmbt_buf_ops);
- /*
- * Copy records into the extent records.
- */
- frp = XFS_BMBT_REC_ADDR(mp, block, 1);
- start = i;
- for (j = 0; j < num_recs; j++, i++, frp++) {
- xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
- trp->l0 = be64_to_cpu(frp->l0);
- trp->l1 = be64_to_cpu(frp->l1);
- }
- if (exntf == XFS_EXTFMT_NOSTATE) {
- /*
- * Check all attribute bmap btree records and
- * any "older" data bmap btree records for a
- * set bit in the "extent flag" position.
- */
- if (unlikely(xfs_check_nostate_extents(ifp,
- start, num_recs))) {
- XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount);
- goto error0;
- }
- }
- xfs_trans_brelse(tp, bp);
- bno = nextbno;
- /*
- * If we've reached the end, stop.
- */
- if (bno == NULLFSBLOCK)
- break;
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
- if (error)
- return error;
- block = XFS_BUF_TO_BLOCK(bp);
- }
- ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
- ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
- XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
- return 0;
-error0:
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
-}
-
-#ifdef DEBUG
-/*
- * Add bmap trace insert entries for all the contents of the extent records.
- */
-void
-xfs_bmap_trace_exlist(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t cnt, /* count of entries in the list */
- int whichfork, /* data or attr fork */
- unsigned long caller_ip)
-{
- xfs_extnum_t idx; /* extent record index */
- xfs_ifork_t *ifp; /* inode fork pointer */
- int state = 0;
-
- if (whichfork == XFS_ATTR_FORK)
- state |= BMAP_ATTRFORK;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
- for (idx = 0; idx < cnt; idx++)
- trace_xfs_extlist(ip, idx, whichfork, caller_ip);
-}
-
-/*
- * Validate that the bmbt_irecs being returned from bmapi are valid
- * given the callers original parameters. Specifically check the
- * ranges of the returned irecs to ensure that they only extent beyond
- * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
- */
-STATIC void
-xfs_bmap_validate_ret(
- xfs_fileoff_t bno,
- xfs_filblks_t len,
- int flags,
- xfs_bmbt_irec_t *mval,
- int nmap,
- int ret_nmap)
-{
- int i; /* index to map values */
-
- ASSERT(ret_nmap <= nmap);
-
- for (i = 0; i < ret_nmap; i++) {
- ASSERT(mval[i].br_blockcount > 0);
- if (!(flags & XFS_BMAPI_ENTIRE)) {
- ASSERT(mval[i].br_startoff >= bno);
- ASSERT(mval[i].br_blockcount <= len);
- ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
- bno + len);
- } else {
- ASSERT(mval[i].br_startoff < bno + len);
- ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
- bno);
- }
- ASSERT(i == 0 ||
- mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
- mval[i].br_startoff);
- ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
- mval[i].br_startblock != HOLESTARTBLOCK);
- ASSERT(mval[i].br_state == XFS_EXT_NORM ||
- mval[i].br_state == XFS_EXT_UNWRITTEN);
- }
-}
-#endif /* DEBUG */
-
-
-/*
* Trim the returned map to the required bounds
*/
STATIC void
@@ -5151,6 +5122,328 @@ error0:
}
/*
+ * Called by xfs_bmapi to update file extent records and the btree
+ * after removing space (or undoing a delayed allocation).
+ */
+STATIC int /* error */
+xfs_bmap_del_extent(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_trans_t *tp, /* current transaction pointer */
+ xfs_extnum_t *idx, /* extent number to update/delete */
+ xfs_bmap_free_t *flist, /* list of extents to be freed */
+ xfs_btree_cur_t *cur, /* if null, not a btree */
+ xfs_bmbt_irec_t *del, /* data to remove from extents */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
+ xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
+ xfs_fsblock_t del_endblock=0; /* first block past del */
+ xfs_fileoff_t del_endoff; /* first offset past del */
+ int delay; /* current block is delayed allocated */
+ int do_fx; /* free extent at end of routine */
+ xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */
+ int error; /* error return value */
+ int flags; /* inode logging flags */
+ xfs_bmbt_irec_t got; /* current extent entry */
+ xfs_fileoff_t got_endoff; /* first offset past got */
+ int i; /* temp state */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_filblks_t nblks; /* quota/sb block count */
+ xfs_bmbt_irec_t new; /* new record to be inserted */
+ /* REFERENCED */
+ uint qfield; /* quota field to update */
+ xfs_filblks_t temp; /* for indirect length calculations */
+ xfs_filblks_t temp2; /* for indirect length calculations */
+ int state = 0;
+
+ XFS_STATS_INC(xs_del_exlist);
+
+ if (whichfork == XFS_ATTR_FORK)
+ state |= BMAP_ATTRFORK;
+
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
+ (uint)sizeof(xfs_bmbt_rec_t)));
+ ASSERT(del->br_blockcount > 0);
+ ep = xfs_iext_get_ext(ifp, *idx);
+ xfs_bmbt_get_all(ep, &got);
+ ASSERT(got.br_startoff <= del->br_startoff);
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got.br_startoff + got.br_blockcount;
+ ASSERT(got_endoff >= del_endoff);
+ delay = isnullstartblock(got.br_startblock);
+ ASSERT(isnullstartblock(del->br_startblock) == delay);
+ flags = 0;
+ qfield = 0;
+ error = 0;
+ /*
+ * If deleting a real allocation, must free up the disk space.
+ */
+ if (!delay) {
+ flags = XFS_ILOG_CORE;
+ /*
+ * Realtime allocation. Free it and record di_nblocks update.
+ */
+ if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+ xfs_fsblock_t bno;
+ xfs_filblks_t len;
+
+ ASSERT(do_mod(del->br_blockcount,
+ mp->m_sb.sb_rextsize) == 0);
+ ASSERT(do_mod(del->br_startblock,
+ mp->m_sb.sb_rextsize) == 0);
+ bno = del->br_startblock;
+ len = del->br_blockcount;
+ do_div(bno, mp->m_sb.sb_rextsize);
+ do_div(len, mp->m_sb.sb_rextsize);
+ error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ if (error)
+ goto done;
+ do_fx = 0;
+ nblks = len * mp->m_sb.sb_rextsize;
+ qfield = XFS_TRANS_DQ_RTBCOUNT;
+ }
+ /*
+ * Ordinary allocation.
+ */
+ else {
+ do_fx = 1;
+ nblks = del->br_blockcount;
+ qfield = XFS_TRANS_DQ_BCOUNT;
+ }
+ /*
+ * Set up del_endblock and cur for later.
+ */
+ del_endblock = del->br_startblock + del->br_blockcount;
+ if (cur) {
+ if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+ got.br_startblock, got.br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ }
+ da_old = da_new = 0;
+ } else {
+ da_old = startblockval(got.br_startblock);
+ da_new = 0;
+ nblks = 0;
+ do_fx = 0;
+ }
+ /*
+ * Set flag value to use in switch statement.
+ * Left-contig is 2, right-contig is 1.
+ */
+ switch (((got.br_startoff == del->br_startoff) << 1) |
+ (got_endoff == del_endoff)) {
+ case 3:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, *idx, 1,
+ whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
+ --*idx;
+ if (delay)
+ break;
+
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ flags |= XFS_ILOG_CORE;
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ break;
+
+ case 2:
+ /*
+ * Deleting the first part of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_startoff(ep, del_endoff);
+ temp = got.br_blockcount - del->br_blockcount;
+ xfs_bmbt_set_blockcount(ep, temp);
+ if (delay) {
+ temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ da_old);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ da_new = temp;
+ break;
+ }
+ xfs_bmbt_set_startblock(ep, del_endblock);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
+ got.br_blockcount - del->br_blockcount,
+ got.br_state)))
+ goto done;
+ break;
+
+ case 1:
+ /*
+ * Deleting the last part of the extent.
+ */
+ temp = got.br_blockcount - del->br_blockcount;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep, temp);
+ if (delay) {
+ temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ da_old);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ da_new = temp;
+ break;
+ }
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ if ((error = xfs_bmbt_update(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount - del->br_blockcount,
+ got.br_state)))
+ goto done;
+ break;
+
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ */
+ temp = del->br_startoff - got.br_startoff;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep, temp);
+ new.br_startoff = del_endoff;
+ temp2 = got_endoff - del_endoff;
+ new.br_blockcount = temp2;
+ new.br_state = got.br_state;
+ if (!delay) {
+ new.br_startblock = del_endblock;
+ flags |= XFS_ILOG_CORE;
+ if (cur) {
+ if ((error = xfs_bmbt_update(cur,
+ got.br_startoff,
+ got.br_startblock, temp,
+ got.br_state)))
+ goto done;
+ if ((error = xfs_btree_increment(cur, 0, &i)))
+ goto done;
+ cur->bc_rec.b = new;
+ error = xfs_btree_insert(cur, &i);
+ if (error && error != ENOSPC)
+ goto done;
+ /*
+ * If we get no-space back from btree insert,
+ * it tried a split, and we have a zero
+ * block reservation.
+ * Fix up our state and return the error.
+ */
+ if (error == ENOSPC) {
+ /*
+ * Reset the cursor, don't trust
+ * it after any insert operation.
+ */
+ if ((error = xfs_bmbt_lookup_eq(cur,
+ got.br_startoff,
+ got.br_startblock,
+ temp, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ /*
+ * Update the btree record back
+ * to the original value.
+ */
+ if ((error = xfs_bmbt_update(cur,
+ got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ got.br_state)))
+ goto done;
+ /*
+ * Reset the extent record back
+ * to the original value.
+ */
+ xfs_bmbt_set_blockcount(ep,
+ got.br_blockcount);
+ flags = 0;
+ error = XFS_ERROR(ENOSPC);
+ goto done;
+ }
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ } else
+ flags |= xfs_ilog_fext(whichfork);
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ } else {
+ ASSERT(whichfork == XFS_DATA_FORK);
+ temp = xfs_bmap_worst_indlen(ip, temp);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ temp2 = xfs_bmap_worst_indlen(ip, temp2);
+ new.br_startblock = nullstartblock((int)temp2);
+ da_new = temp + temp2;
+ while (da_new > da_old) {
+ if (temp) {
+ temp--;
+ da_new--;
+ xfs_bmbt_set_startblock(ep,
+ nullstartblock((int)temp));
+ }
+ if (da_new == da_old)
+ break;
+ if (temp2) {
+ temp2--;
+ da_new--;
+ new.br_startblock =
+ nullstartblock((int)temp2);
+ }
+ }
+ }
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_insert(ip, *idx + 1, 1, &new, state);
+ ++*idx;
+ break;
+ }
+ /*
+ * If we need to, add to list of extents to delete.
+ */
+ if (do_fx)
+ xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
+ mp);
+ /*
+ * Adjust inode # blocks in the file.
+ */
+ if (nblks)
+ ip->i_d.di_nblocks -= nblks;
+ /*
+ * Adjust quota data.
+ */
+ if (qfield)
+ xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
+
+ /*
+ * Account for change in delayed indirect blocks.
+ * Nothing to do for disk quota accounting here.
+ */
+ ASSERT(da_old >= da_new);
+ if (da_old > da_new) {
+ xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+ (int64_t)(da_old - da_new), 0);
+ }
+done:
+ *logflagsp = flags;
+ return error;
+}
+
+/*
* Unmap (remove) blocks from a file.
* If nexts is nonzero then the number of extents to remove is limited to
* that value. If not all extents in the block range can be removed then
@@ -5811,416 +6104,6 @@ xfs_getbmap(
return error;
}
-#ifdef DEBUG
-STATIC struct xfs_buf *
-xfs_bmap_get_bp(
- struct xfs_btree_cur *cur,
- xfs_fsblock_t bno)
-{
- struct xfs_log_item_desc *lidp;
- int i;
-
- if (!cur)
- return NULL;
-
- for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
- if (!cur->bc_bufs[i])
- break;
- if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
- return cur->bc_bufs[i];
- }
-
- /* Chase down all the log items to see if the bp is there */
- list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
- struct xfs_buf_log_item *bip;
- bip = (struct xfs_buf_log_item *)lidp->lid_item;
- if (bip->bli_item.li_type == XFS_LI_BUF &&
- XFS_BUF_ADDR(bip->bli_buf) == bno)
- return bip->bli_buf;
- }
-
- return NULL;
-}
-
-STATIC void
-xfs_check_block(
- struct xfs_btree_block *block,
- xfs_mount_t *mp,
- int root,
- short sz)
-{
- int i, j, dmxr;
- __be64 *pp, *thispa; /* pointer to block address */
- xfs_bmbt_key_t *prevp, *keyp;
-
- ASSERT(be16_to_cpu(block->bb_level) > 0);
-
- prevp = NULL;
- for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
- dmxr = mp->m_bmap_dmxr[0];
- keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
-
- if (prevp) {
- ASSERT(be64_to_cpu(prevp->br_startoff) <
- be64_to_cpu(keyp->br_startoff));
- }
- prevp = keyp;
-
- /*
- * Compare the block numbers to see if there are dups.
- */
- if (root)
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
- else
- pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
-
- for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
- if (root)
- thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
- else
- thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
- if (*thispa == *pp) {
- xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
- __func__, j, i,
- (unsigned long long)be64_to_cpu(*thispa));
- panic("%s: ptrs are equal in node\n",
- __func__);
- }
- }
- }
-}
-
-/*
- * Check that the extents for the inode ip are in the right order in all
- * btree leaves.
- */
-
-STATIC void
-xfs_bmap_check_leaf_extents(
- xfs_btree_cur_t *cur, /* btree cursor or null */
- xfs_inode_t *ip, /* incore inode pointer */
- int whichfork) /* data or attr fork */
-{
- struct xfs_btree_block *block; /* current btree block */
- xfs_fsblock_t bno; /* block # of "block" */
- xfs_buf_t *bp; /* buffer for "block" */
- int error; /* error return value */
- xfs_extnum_t i=0, j; /* index into the extents list */
- xfs_ifork_t *ifp; /* fork structure */
- int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
- __be64 *pp; /* pointer to block address */
- xfs_bmbt_rec_t *ep; /* pointer to current extent */
- xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */
- xfs_bmbt_rec_t *nextp; /* pointer to next extent */
- int bp_release = 0;
-
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
- return;
- }
-
- bno = NULLFSBLOCK;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- block = ifp->if_broot;
- /*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
- */
- level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
- xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
- bno = be64_to_cpu(*pp);
-
- ASSERT(bno != NULLDFSBNO);
- ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-
- /*
- * Go down the tree until leaf level is reached, following the first
- * pointer (leftmost) at each level.
- */
- while (level-- > 0) {
- /* See if buf is in cur first */
- bp_release = 0;
- bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
- if (!bp) {
- bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- goto error_norelse;
- }
- block = XFS_BUF_TO_BLOCK(bp);
- XFS_WANT_CORRUPTED_GOTO(
- xfs_bmap_sanity_check(mp, bp, level),
- error0);
- if (level == 0)
- break;
-
- /*
- * Check this block for basic sanity (increasing keys and
- * no duplicate blocks).
- */
-
- xfs_check_block(block, mp, 0, 0);
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
- bno = be64_to_cpu(*pp);
- XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
- if (bp_release) {
- bp_release = 0;
- xfs_trans_brelse(NULL, bp);
- }
- }
-
- /*
- * Here with bp and block set to the leftmost leaf node in the tree.
- */
- i = 0;
-
- /*
- * Loop over all leaf nodes checking that all extents are in the right order.
- */
- for (;;) {
- xfs_fsblock_t nextbno;
- xfs_extnum_t num_recs;
-
-
- num_recs = xfs_btree_get_numrecs(block);
-
- /*
- * Read-ahead the next leaf block, if any.
- */
-
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
-
- /*
- * Check all the extents to make sure they are OK.
- * If we had a previous block, the last entry should
- * conform with the first entry in this one.
- */
-
- ep = XFS_BMBT_REC_ADDR(mp, block, 1);
- if (i) {
- ASSERT(xfs_bmbt_disk_get_startoff(&last) +
- xfs_bmbt_disk_get_blockcount(&last) <=
- xfs_bmbt_disk_get_startoff(ep));
- }
- for (j = 1; j < num_recs; j++) {
- nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
- ASSERT(xfs_bmbt_disk_get_startoff(ep) +
- xfs_bmbt_disk_get_blockcount(ep) <=
- xfs_bmbt_disk_get_startoff(nextp));
- ep = nextp;
- }
-
- last = *ep;
- i += num_recs;
- if (bp_release) {
- bp_release = 0;
- xfs_trans_brelse(NULL, bp);
- }
- bno = nextbno;
- /*
- * If we've reached the end, stop.
- */
- if (bno == NULLFSBLOCK)
- break;
-
- bp_release = 0;
- bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
- if (!bp) {
- bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- goto error_norelse;
- }
- block = XFS_BUF_TO_BLOCK(bp);
- }
- if (bp_release) {
- bp_release = 0;
- xfs_trans_brelse(NULL, bp);
- }
- return;
-
-error0:
- xfs_warn(mp, "%s: at error0", __func__);
- if (bp_release)
- xfs_trans_brelse(NULL, bp);
-error_norelse:
- xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
- __func__, i);
- panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
- return;
-}
-#endif
-
-/*
- * Count fsblocks of the given fork.
- */
-int /* error */
-xfs_bmap_count_blocks(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- int whichfork, /* data or attr fork */
- int *count) /* out: count of blocks */
-{
- struct xfs_btree_block *block; /* current btree block */
- xfs_fsblock_t bno; /* block # of "block" */
- xfs_ifork_t *ifp; /* fork structure */
- int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
- __be64 *pp; /* pointer to block address */
-
- bno = NULLFSBLOCK;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
- xfs_bmap_count_leaves(ifp, 0,
- ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
- count);
- return 0;
- }
-
- /*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
- */
- block = ifp->if_broot;
- level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
- bno = be64_to_cpu(*pp);
- ASSERT(bno != NULLDFSBNO);
- ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-
- if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
- XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
- mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- return 0;
-}
-
-/*
- * Recursively walks each level of a btree
- * to count total fsblocks is use.
- */
-STATIC int /* error */
-xfs_bmap_count_tree(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_fsblock_t blockno, /* file system block number */
- int levelin, /* level in btree */
- int *count) /* Count of blocks */
-{
- int error;
- xfs_buf_t *bp, *nbp;
- int level = levelin;
- __be64 *pp;
- xfs_fsblock_t bno = blockno;
- xfs_fsblock_t nextbno;
- struct xfs_btree_block *block, *nextblock;
- int numrecs;
-
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- return error;
- *count += 1;
- block = XFS_BUF_TO_BLOCK(bp);
-
- if (--level) {
- /* Not at node above leaves, count this level of nodes */
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
- while (nextbno != NULLFSBLOCK) {
- error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- return error;
- *count += 1;
- nextblock = XFS_BUF_TO_BLOCK(nbp);
- nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
- xfs_trans_brelse(tp, nbp);
- }
-
- /* Dive to the next level */
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
- bno = be64_to_cpu(*pp);
- if (unlikely((error =
- xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
- xfs_trans_brelse(tp, bp);
- XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
- XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- xfs_trans_brelse(tp, bp);
- } else {
- /* count all level 1 nodes and their leaves */
- for (;;) {
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
- numrecs = be16_to_cpu(block->bb_numrecs);
- xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
- xfs_trans_brelse(tp, bp);
- if (nextbno == NULLFSBLOCK)
- break;
- bno = nextbno;
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- return error;
- *count += 1;
- block = XFS_BUF_TO_BLOCK(bp);
- }
- }
- return 0;
-}
-
-/*
- * Count leaf blocks given a range of extent records.
- */
-STATIC void
-xfs_bmap_count_leaves(
- xfs_ifork_t *ifp,
- xfs_extnum_t idx,
- int numrecs,
- int *count)
-{
- int b;
-
- for (b = 0; b < numrecs; b++) {
- xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
- *count += xfs_bmbt_get_blockcount(frp);
- }
-}
-
-/*
- * Count leaf blocks given a range of extent records originally
- * in btree format.
- */
-STATIC void
-xfs_bmap_disk_count_leaves(
- struct xfs_mount *mp,
- struct xfs_btree_block *block,
- int numrecs,
- int *count)
-{
- int b;
- xfs_bmbt_rec_t *frp;
-
- for (b = 1; b <= numrecs; b++) {
- frp = XFS_BMBT_REC_ADDR(mp, block, b);
- *count += xfs_bmbt_disk_get_blockcount(frp);
- }
-}
-
/*
* dead simple method of punching delalyed allocation blocks from a range in
* the inode. Walks a block at a time so will be slow, but is only executed in
@@ -6295,16 +6178,3 @@ next_block:
return error;
}
-
-/*
- * Convert the given file system block to a disk block. We have to treat it
- * differently based on whether the file is a real time file or not, because the
- * bmap code does.
- */
-xfs_daddr_t
-xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
-{
- return (XFS_IS_REALTIME_INODE(ip) ? \
- (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
- XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
-}
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 061b45cbe614..0c61a22be6fd 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -37,6 +37,7 @@
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
+#include "xfs_cksum.h"
/*
* Determine the extent state.
@@ -59,24 +60,31 @@ xfs_extent_state(
*/
void
xfs_bmdr_to_bmbt(
- struct xfs_mount *mp,
+ struct xfs_inode *ip,
xfs_bmdr_block_t *dblock,
int dblocklen,
struct xfs_btree_block *rblock,
int rblocklen)
{
+ struct xfs_mount *mp = ip->i_mount;
int dmxr;
xfs_bmbt_key_t *fkp;
__be64 *fpp;
xfs_bmbt_key_t *tkp;
__be64 *tpp;
- rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+ XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+ XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
+
rblock->bb_level = dblock->bb_level;
ASSERT(be16_to_cpu(rblock->bb_level) > 0);
rblock->bb_numrecs = dblock->bb_numrecs;
- rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
- rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
@@ -424,7 +432,13 @@ xfs_bmbt_to_bmdr(
xfs_bmbt_key_t *tkp;
__be64 *tpp;
- ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
+ ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
+ ASSERT(rblock->bb_u.l.bb_blkno ==
+ cpu_to_be64(XFS_BUF_DADDR_NULL));
+ } else
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO));
ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO));
ASSERT(rblock->bb_level != 0);
@@ -708,59 +722,89 @@ xfs_bmbt_key_diff(
cur->bc_rec.b.br_startoff;
}
-static void
+static int
xfs_bmbt_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
unsigned int level;
- int lblock_ok; /* block passes checks */
- /* magic number and level verification.
+ switch (block->bb_magic) {
+ case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
+ return false;
+ /*
+ * XXX: need a better way of verifying the owner here. Right now
+ * just make sure there has been one set.
+ */
+ if (be64_to_cpu(block->bb_u.l.bb_owner) == 0)
+ return false;
+ /* fall through */
+ case cpu_to_be32(XFS_BMAP_MAGIC):
+ break;
+ default:
+ return false;
+ }
+
+ /*
+ * numrecs and level verification.
*
- * We don't know waht fork we belong to, so just verify that the level
+ * We don't know what fork we belong to, so just verify that the level
* is less than the maximum of the two. Later checks will be more
* precise.
*/
level = be16_to_cpu(block->bb_level);
- lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) &&
- level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]);
-
- /* numrecs verification */
- lblock_ok = lblock_ok &&
- be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0];
+ if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
+ return false;
+ if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+ return false;
/* sibling pointer verification */
- lblock_ok = lblock_ok &&
- block->bb_u.l.bb_leftsib &&
- (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
- XFS_FSB_SANITY_CHECK(mp,
- be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
- block->bb_u.l.bb_rightsib &&
- (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
- XFS_FSB_SANITY_CHECK(mp,
- be64_to_cpu(block->bb_u.l.bb_rightsib)));
-
- if (!lblock_ok) {
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+ if (!block->bb_u.l.bb_leftsib ||
+ (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) &&
+ !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
+ return false;
+ if (!block->bb_u.l.bb_rightsib ||
+ (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) &&
+ !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
+ return false;
+
+ return true;
+
}
static void
xfs_bmbt_read_verify(
struct xfs_buf *bp)
{
- xfs_bmbt_verify(bp);
+ if (!(xfs_btree_lblock_verify_crc(bp) &&
+ xfs_bmbt_verify(bp))) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+ bp->b_target->bt_mount, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+
}
static void
xfs_bmbt_write_verify(
struct xfs_buf *bp)
{
- xfs_bmbt_verify(bp);
+ if (!xfs_bmbt_verify(bp)) {
+ xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn);
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+ bp->b_target->bt_mount, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
+ xfs_btree_lblock_calc_crc(bp);
}
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
@@ -769,7 +813,7 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = {
};
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
xfs_bmbt_keys_inorder(
struct xfs_btree_cur *cur,
@@ -809,7 +853,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
.key_diff = xfs_bmbt_key_diff,
.buf_ops = &xfs_bmbt_buf_ops,
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
.keys_inorder = xfs_bmbt_keys_inorder,
.recs_inorder = xfs_bmbt_recs_inorder,
#endif
@@ -838,6 +882,8 @@ xfs_bmbt_init_cursor(
cur->bc_ops = &xfs_bmbt_ops;
cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
cur->bc_private.b.ip = ip;
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 88469ca08696..1b726d626941 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -18,7 +18,8 @@
#ifndef __XFS_BMAP_BTREE_H__
#define __XFS_BMAP_BTREE_H__
-#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
+#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
+#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */
struct xfs_btree_cur;
struct xfs_btree_block;
@@ -136,10 +137,10 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
/*
* Btree block header size depends on a superblock flag.
- *
- * (not quite yet, but soon)
*/
-#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN
+#define XFS_BMBT_BLOCK_LEN(mp) \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
#define XFS_BMBT_REC_ADDR(mp, block, index) \
((xfs_bmbt_rec_t *) \
@@ -186,15 +187,17 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
-#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
- (int)(XFS_BTREE_LBLOCK_LEN + \
+#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
+ (int)(XFS_BMBT_BLOCK_LEN(mp) + \
((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
-#define XFS_BMAP_BROOT_SPACE(bb) \
- (XFS_BMAP_BROOT_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
+#define XFS_BMAP_BROOT_SPACE(mp, bb) \
+ (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
#define XFS_BMDR_SPACE_CALC(nrecs) \
(int)(sizeof(xfs_bmdr_block_t) + \
((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+#define XFS_BMAP_BMDR_SPACE(bb) \
+ (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
/*
* Maximum number of bmap btree levels.
@@ -204,7 +207,7 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
/*
* Prototypes for xfs_bmap.c to call.
*/
-extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int,
+extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
struct xfs_btree_block *, int);
extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index db010408d701..0903960410a2 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -30,9 +30,11 @@
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
+#include "xfs_buf_item.h"
#include "xfs_btree.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_cksum.h"
/*
* Cursor allocation zone.
@@ -42,9 +44,13 @@ kmem_zone_t *xfs_btree_cur_zone;
/*
* Btree magic numbers.
*/
-const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
- XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
+static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
+ { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC },
+ { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
+ XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC }
};
+#define xfs_btree_magic(cur) /* row: CRC vs. non-CRC blocks; column: btree type */ \
+ xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][(cur)->bc_btnum]
STATIC int /* error (0 or EFSCORRUPTED) */
@@ -54,30 +60,38 @@ xfs_btree_check_lblock(
int level, /* level of the btree block */
struct xfs_buf *bp) /* buffer for block, if any */
{
- int lblock_ok; /* block passes checks */
+ int lblock_ok = 1; /* block passes checks */
struct xfs_mount *mp; /* file system mount point */
mp = cur->bc_mp;
- lblock_ok =
- be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ lblock_ok = lblock_ok &&
+ uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
+ block->bb_u.l.bb_blkno == cpu_to_be64(
+ bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+ }
+
+ lblock_ok = lblock_ok &&
+ be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
be16_to_cpu(block->bb_level) == level &&
be16_to_cpu(block->bb_numrecs) <=
cur->bc_ops->get_maxrecs(cur, level) &&
block->bb_u.l.bb_leftsib &&
(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
XFS_FSB_SANITY_CHECK(mp,
- be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+ be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
block->bb_u.l.bb_rightsib &&
(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
XFS_FSB_SANITY_CHECK(mp,
- be64_to_cpu(block->bb_u.l.bb_rightsib)));
+ be64_to_cpu(block->bb_u.l.bb_rightsib)));
+
if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
XFS_ERRTAG_BTREE_CHECK_LBLOCK,
XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW,
- mp);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return XFS_ERROR(EFSCORRUPTED);
}
return 0;
@@ -90,16 +104,26 @@ xfs_btree_check_sblock(
int level, /* level of the btree block */
struct xfs_buf *bp) /* buffer containing block */
{
+ struct xfs_mount *mp; /* file system mount point */
struct xfs_buf *agbp; /* buffer for ag. freespace struct */
struct xfs_agf *agf; /* ag. freespace structure */
xfs_agblock_t agflen; /* native ag. freespace length */
- int sblock_ok; /* block passes checks */
+ int sblock_ok = 1; /* block passes checks */
+ mp = cur->bc_mp;
agbp = cur->bc_private.a.agbp;
agf = XFS_BUF_TO_AGF(agbp);
agflen = be32_to_cpu(agf->agf_length);
- sblock_ok =
- be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ sblock_ok = sblock_ok &&
+ uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
+ block->bb_u.s.bb_blkno == cpu_to_be64(
+ bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+ }
+
+ sblock_ok = sblock_ok &&
+ be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
be16_to_cpu(block->bb_level) == level &&
be16_to_cpu(block->bb_numrecs) <=
cur->bc_ops->get_maxrecs(cur, level) &&
@@ -109,13 +133,13 @@ xfs_btree_check_sblock(
(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
block->bb_u.s.bb_rightsib;
- if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
+
+ if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
XFS_ERRTAG_BTREE_CHECK_SBLOCK,
XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
- XFS_ERRLEVEL_LOW, cur->bc_mp, block);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return XFS_ERROR(EFSCORRUPTED);
}
return 0;
@@ -194,6 +218,72 @@ xfs_btree_check_ptr(
#endif
/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * long-form btree header.
+ *
+ * Prior to calculating the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modification was that made
+ * it to disk.
+ */
+void
+xfs_btree_lblock_calc_crc(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ return;
+ if (bip)
+ block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+ XFS_BTREE_LBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_lblock_verify_crc(
+ struct xfs_buf *bp)
+{
+ if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ XFS_BTREE_LBLOCK_CRC_OFF);
+ return true;
+}
+
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * short-form btree header.
+ *
+ * Prior to calculating the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modification was that made
+ * it to disk.
+ */
+void
+xfs_btree_sblock_calc_crc(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ return;
+ if (bip)
+ block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+ XFS_BTREE_SBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_sblock_verify_crc(
+ struct xfs_buf *bp)
+{
+ if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ XFS_BTREE_SBLOCK_CRC_OFF);
+ return true;
+}
+
+/*
* Delete the btree cursor.
*/
void
@@ -277,10 +367,8 @@ xfs_btree_dup_cursor(
*ncur = NULL;
return error;
}
- new->bc_bufs[i] = bp;
- ASSERT(!xfs_buf_geterror(bp));
- } else
- new->bc_bufs[i] = NULL;
+ }
+ new->bc_bufs[i] = bp;
}
*ncur = new;
return 0;
@@ -321,9 +409,14 @@ xfs_btree_dup_cursor(
*/
static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
{
- return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
- XFS_BTREE_LBLOCK_LEN :
- XFS_BTREE_SBLOCK_LEN;
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ return XFS_BTREE_LBLOCK_CRC_LEN;
+ return XFS_BTREE_LBLOCK_LEN;
+ }
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ return XFS_BTREE_SBLOCK_CRC_LEN;
+ return XFS_BTREE_SBLOCK_LEN;
}
/*
@@ -863,43 +956,85 @@ xfs_btree_set_sibling(
}
void
+xfs_btree_init_block_int(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *buf,
+ xfs_daddr_t blkno,
+ __u32 magic,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner,
+ unsigned int flags)
+{
+ buf->bb_magic = cpu_to_be32(magic);
+ buf->bb_level = cpu_to_be16(level);
+ buf->bb_numrecs = cpu_to_be16(numrecs);
+
+ if (flags & XFS_BTREE_LONG_PTRS) {
+ buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+ buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+ if (flags & XFS_BTREE_CRC_BLOCKS) {
+ buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
+ buf->bb_u.l.bb_owner = cpu_to_be64(owner);
+ uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
+ buf->bb_u.l.bb_pad = 0;
+ }
+ } else {
+ /* owner is a 32 bit value on short blocks */
+ __u32 __owner = (__u32)owner;
+
+ buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+ buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+ if (flags & XFS_BTREE_CRC_BLOCKS) {
+ buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
+ buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
+ uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+ }
+ }
+}
+
+void
xfs_btree_init_block(
struct xfs_mount *mp,
struct xfs_buf *bp,
__u32 magic,
__u16 level,
__u16 numrecs,
+ __u64 owner,
unsigned int flags)
{
- struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp);
-
- new->bb_magic = cpu_to_be32(magic);
- new->bb_level = cpu_to_be16(level);
- new->bb_numrecs = cpu_to_be16(numrecs);
-
- if (flags & XFS_BTREE_LONG_PTRS) {
- new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
- new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
- } else {
- new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- }
+ xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+ magic, level, numrecs, owner, flags);
}
STATIC void
xfs_btree_init_block_cur(
struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
int level,
- int numrecs,
- struct xfs_buf *bp)
+ int numrecs)
{
- xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum],
- level, numrecs, cur->bc_flags);
+ __u64 owner;
+
+ /*
+ * we can pull the owner from the cursor right now as the different
+ * owners align directly with the pointer size of the btree. This may
+ * change in future, but is safe for current users of the generic btree
+ * code.
+ */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ owner = cur->bc_private.b.ip->i_ino;
+ else
+ owner = cur->bc_private.a.agno;
+
+ xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+ xfs_btree_magic(cur), level, numrecs,
+ owner, cur->bc_flags);
}
/*
* Return true if ptr is the last record in the btree and
- * we need to track updateѕ to this record. The decision
+ * we need to track updates to this record. The decision
* will be further refined in the update_lastrec method.
*/
STATIC int
@@ -1147,6 +1282,7 @@ xfs_btree_log_keys(
XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
if (bp) {
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp,
xfs_btree_key_offset(cur, first),
xfs_btree_key_offset(cur, last + 1) - 1);
@@ -1171,6 +1307,7 @@ xfs_btree_log_recs(
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp,
xfs_btree_rec_offset(cur, first),
xfs_btree_rec_offset(cur, last + 1) - 1);
@@ -1195,6 +1332,7 @@ xfs_btree_log_ptrs(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
int level = xfs_btree_get_level(block);
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp,
xfs_btree_ptr_offset(cur, first, level),
xfs_btree_ptr_offset(cur, last + 1, level) - 1);
@@ -1223,7 +1361,12 @@ xfs_btree_log_block(
offsetof(struct xfs_btree_block, bb_numrecs),
offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
- XFS_BTREE_SBLOCK_LEN
+ offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
+ XFS_BTREE_SBLOCK_CRC_LEN
};
static const short loffsets[] = { /* table of offsets (long) */
offsetof(struct xfs_btree_block, bb_magic),
@@ -1231,17 +1374,40 @@ xfs_btree_log_block(
offsetof(struct xfs_btree_block, bb_numrecs),
offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
- XFS_BTREE_LBLOCK_LEN
+ offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
+ XFS_BTREE_LBLOCK_CRC_LEN
};
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
if (bp) {
+ int nbits;
+
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ /*
+ * We don't log the CRC when updating a btree
+ * block but instead recreate it during log
+ * recovery. As the log buffers have checksums
+ * of their own this is safe and avoids logging a crc
+ * update in a lot of places.
+ */
+ if (fields == XFS_BB_ALL_BITS)
+ fields = XFS_BB_ALL_BITS_CRC;
+ nbits = XFS_BB_NUM_BITS_CRC;
+ } else {
+ nbits = XFS_BB_NUM_BITS;
+ }
xfs_btree_offsets(fields,
(cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
loffsets : soffsets,
- XFS_BB_NUM_BITS, &first, &last);
+ nbits, &first, &last);
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp, first, last);
} else {
xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
@@ -2204,7 +2370,7 @@ xfs_btree_split(
goto error0;
/* Fill in the btree header for the new right block. */
- xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp);
+ xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
/*
* Split the entries between the old and the new block evenly.
@@ -2378,7 +2544,17 @@ xfs_btree_new_iroot(
if (error)
goto error0;
+ /*
+ * we can't just memcpy() the root in for CRC enabled btree blocks.
+ * In that case we also have to ensure the blkno remains correct.
+ */
memcpy(cblock, block, xfs_btree_block_len(cur));
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
+ else
+ cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
+ }
be16_add_cpu(&block->bb_level, 1);
xfs_btree_set_numrecs(block, 1);
@@ -2513,7 +2689,7 @@ xfs_btree_new_root(
nptr = 2;
}
/* Fill in the new block's btree header and log it. */
- xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp);
+ xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
!xfs_btree_ptr_is_null(cur, &rptr));
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index f932897194eb..55e3c7cc3c3d 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -42,11 +42,15 @@ extern kmem_zone_t *xfs_btree_cur_zone;
* Generic btree header.
*
* This is a combination of the actual format used on disk for short and long
- * format btrees. The first three fields are shared by both format, but
- * the pointers are different and should be used with care.
+ * format btrees. The first three fields are shared by both formats, but the
+ * pointers are different and should be used with care.
*
- * To get the size of the actual short or long form headers please use
- * the size macros below. Never use sizeof(xfs_btree_block).
+ * To get the size of the actual short or long form headers please use the size
+ * macros below. Never use sizeof(xfs_btree_block).
+ *
+ * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
+ * with the crc feature bit, and all accesses to them must be conditional on
+ * that flag.
*/
struct xfs_btree_block {
__be32 bb_magic; /* magic number for block type */
@@ -56,10 +60,23 @@ struct xfs_btree_block {
struct {
__be32 bb_leftsib;
__be32 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be32 bb_owner;
+ __le32 bb_crc;
} s; /* short form pointers */
struct {
__be64 bb_leftsib;
__be64 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be64 bb_owner;
+ __le32 bb_crc;
+ __be32 bb_pad; /* padding for alignment */
} l; /* long form pointers */
} bb_u; /* rest */
};
@@ -67,6 +84,16 @@ struct xfs_btree_block {
#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
+/* sizes of CRC enabled btree blocks */
+#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40)
+#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48)
+
+
+#define XFS_BTREE_SBLOCK_CRC_OFF \
+ offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
+#define XFS_BTREE_LBLOCK_CRC_OFF \
+ offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
+
/*
* Generic key, ptr and record wrapper structures.
@@ -101,13 +128,11 @@ union xfs_btree_rec {
#define XFS_BB_NUMRECS 0x04
#define XFS_BB_LEFTSIB 0x08
#define XFS_BB_RIGHTSIB 0x10
+#define XFS_BB_BLKNO 0x20
#define XFS_BB_NUM_BITS 5
#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
-
-/*
- * Magic numbers for btree blocks.
- */
-extern const __uint32_t xfs_magics[];
+#define XFS_BB_NUM_BITS_CRC 8
+#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
/*
* Generic stats interface
@@ -190,7 +215,7 @@ struct xfs_btree_ops {
const struct xfs_buf_ops *buf_ops;
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
/* check that k1 is lower than k2 */
int (*keys_inorder)(struct xfs_btree_cur *cur,
union xfs_btree_key *k1,
@@ -256,6 +281,7 @@ typedef struct xfs_btree_cur
#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
+#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
#define XFS_BTREE_NOERROR 0
@@ -393,8 +419,20 @@ xfs_btree_init_block(
__u32 magic,
__u16 level,
__u16 numrecs,
+ __u64 owner,
unsigned int flags);
+void
+xfs_btree_init_block_int(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *buf,
+ xfs_daddr_t blkno,
+ __u32 magic,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner,
+ unsigned int flags);
+
/*
* Common btree core entry points.
*/
@@ -408,6 +446,14 @@ int xfs_btree_delete(struct xfs_btree_cur *, int *);
int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
/*
+ * btree block CRC helpers
+ */
+void xfs_btree_lblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
+void xfs_btree_sblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
+
+/*
* Internal btree helpers also used by xfs_bmap.c.
*/
void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8459b5d8cb71..1b2472a46e46 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -513,6 +513,7 @@ _xfs_buf_find(
xfs_alert(btp->bt_mount,
"%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
__func__, blkno, eofs);
+ WARN_ON(1);
return NULL;
}
@@ -1022,7 +1023,9 @@ xfs_buf_iodone_work(
bool read = !!(bp->b_flags & XBF_READ);
bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
- if (read && bp->b_ops)
+
+ /* only validate buffers that were read without errors */
+ if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
bp->b_ops->verify_read(bp);
if (bp->b_iodone)
@@ -1647,7 +1650,7 @@ xfs_alloc_buftarg(
{
xfs_buftarg_t *btp;
- btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
+ btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index cf263476d6b4..bfc4e0c26fd3 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -140,6 +140,16 @@ xfs_buf_item_size(
ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
+ if (bip->bli_flags & XFS_BLI_ORDERED) {
+ /*
+ * The buffer has been logged just to order it.
+ * It is not being included in the transaction
+ * commit, so no vectors are used at all.
+ */
+ trace_xfs_buf_item_size_ordered(bip);
+ return XFS_LOG_VEC_ORDERED;
+ }
+
/*
* the vector count is based on the number of buffer vectors we have
* dirty bits in. This will only be greater than one when we have a
@@ -212,6 +222,7 @@ xfs_buf_item_format_segment(
goto out;
}
+
/*
* Fill in an iovec for each set of contiguous chunks.
*/
@@ -262,12 +273,7 @@ xfs_buf_item_format_segment(
vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
vecp->i_len = nbits * XFS_BLF_CHUNK;
vecp->i_type = XLOG_REG_TYPE_BCHUNK;
-/*
- * You would think we need to bump the nvecs here too, but we do not
- * this number is used by recovery, and it gets confused by the boundary
- * split here
- * nvecs++;
- */
+ nvecs++;
vecp++;
first_bit = next_bit;
last_bit = next_bit;
@@ -304,18 +310,36 @@ xfs_buf_item_format(
/*
* If it is an inode buffer, transfer the in-memory state to the
- * format flags and clear the in-memory state. We do not transfer
+ * format flags and clear the in-memory state.
+ *
+ * For buffer based inode allocation, we do not transfer
* this state if the inode buffer allocation has not yet been committed
* to the log as setting the XFS_BLI_INODE_BUF flag will prevent
* correct replay of the inode allocation.
+ *
+ * For icreate item based inode allocation, the buffers aren't written
+ * to the journal during allocation, and hence we should always tag the
+ * buffer as an inode buffer so that the correct unlinked list replay
+ * occurs during recovery.
*/
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
- if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+ if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
+ !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
xfs_log_item_in_current_chkpt(lip)))
bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
bip->bli_flags &= ~XFS_BLI_INODE_BUF;
}
+ if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
+ XFS_BLI_ORDERED) {
+ /*
+ * The buffer has been logged just to order it. It is not being
+ * included in the transaction commit, so don't format it.
+ */
+ trace_xfs_buf_item_format_ordered(bip);
+ return;
+ }
+
for (i = 0; i < bip->bli_format_count; i++) {
vecp = xfs_buf_item_format_segment(bip, vecp, offset,
&bip->bli_formats[i]);
@@ -345,6 +369,7 @@ xfs_buf_item_pin(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+ (bip->bli_flags & XFS_BLI_ORDERED) ||
(bip->bli_flags & XFS_BLI_STALE));
trace_xfs_buf_item_pin(bip);
@@ -517,8 +542,9 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- int aborted, clean, i;
- uint hold;
+ bool clean;
+ bool aborted;
+ int flags;
/* Clear the buffer's association with this transaction. */
bp->b_transp = NULL;
@@ -529,23 +555,21 @@ xfs_buf_item_unlock(
* (cancelled) buffers at unpin time, but we'll never go through the
* pin/unpin cycle if we abort inside commit.
*/
- aborted = (lip->li_flags & XFS_LI_ABORTED) != 0;
-
+ aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
/*
- * Before possibly freeing the buf item, determine if we should
- * release the buffer at the end of this routine.
+ * Before possibly freeing the buf item, copy the per-transaction state
+ * so we can reference it safely later after clearing it from the
+ * buffer log item.
*/
- hold = bip->bli_flags & XFS_BLI_HOLD;
-
- /* Clear the per transaction state. */
- bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
+ flags = bip->bli_flags;
+ bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
/*
* If the buf item is marked stale, then don't do anything. We'll
* unlock the buffer and free the buf item when the buffer is unpinned
* for the last time.
*/
- if (bip->bli_flags & XFS_BLI_STALE) {
+ if (flags & XFS_BLI_STALE) {
trace_xfs_buf_item_unlock_stale(bip);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
if (!aborted) {
@@ -562,13 +586,19 @@ xfs_buf_item_unlock(
* be the only reference to the buf item, so we free it anyway
* regardless of whether it is dirty or not. A dirty abort implies a
* shutdown, anyway.
+ *
+ * Ordered buffers are dirty but may have no recorded changes, so ensure
+ * we only release clean items here.
*/
- clean = 1;
- for (i = 0; i < bip->bli_format_count; i++) {
- if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
- bip->bli_formats[i].blf_map_size)) {
- clean = 0;
- break;
+ clean = (flags & XFS_BLI_DIRTY) ? false : true;
+ if (clean) {
+ int i;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+ bip->bli_formats[i].blf_map_size)) {
+ clean = false;
+ break;
+ }
}
}
if (clean)
@@ -581,7 +611,7 @@ xfs_buf_item_unlock(
} else
atomic_dec(&bip->bli_refcount);
- if (!hold)
+ if (!(flags & XFS_BLI_HOLD))
xfs_buf_relse(bp);
}
@@ -847,12 +877,6 @@ xfs_buf_item_log(
struct xfs_buf *bp = bip->bli_buf;
/*
- * Mark the item as having some dirty data for
- * quick reference in xfs_buf_item_dirty.
- */
- bip->bli_flags |= XFS_BLI_DIRTY;
-
- /*
* walk each buffer segment and mark them dirty appropriately.
*/
start = 0;
@@ -878,7 +902,7 @@ xfs_buf_item_log(
/*
- * Return 1 if the buffer has some data that has been logged (at any
+ * Return 1 if the buffer has been logged or ordered in a transaction (at any
* point, not just the current transaction) and 0 if not.
*/
uint
@@ -912,11 +936,11 @@ void
xfs_buf_item_relse(
xfs_buf_t *bp)
{
- xfs_buf_log_item_t *bip;
+ xfs_buf_log_item_t *bip = bp->b_fspriv;
trace_xfs_buf_item_relse(bp, _RET_IP_);
+ ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
- bip = bp->b_fspriv;
bp->b_fspriv = bip->bli_item.li_bio_list;
if (bp->b_fspriv == NULL)
bp->b_iodone = NULL;
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index ee36c88ecfde..0f1c247dc680 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -24,19 +24,20 @@ extern kmem_zone_t *xfs_buf_item_zone;
* This flag indicates that the buffer contains on disk inodes
* and requires special recovery handling.
*/
-#define XFS_BLF_INODE_BUF 0x1
+#define XFS_BLF_INODE_BUF (1<<0)
/*
* This flag indicates that the buffer should not be replayed
* during recovery because its blocks are being freed.
*/
-#define XFS_BLF_CANCEL 0x2
+#define XFS_BLF_CANCEL (1<<1)
+
/*
* This flag indicates that the buffer contains on disk
* user or group dquots and may require special recovery handling.
*/
-#define XFS_BLF_UDQUOT_BUF 0x4
-#define XFS_BLF_PDQUOT_BUF 0x8
-#define XFS_BLF_GDQUOT_BUF 0x10
+#define XFS_BLF_UDQUOT_BUF (1<<2)
+#define XFS_BLF_PDQUOT_BUF (1<<3)
+#define XFS_BLF_GDQUOT_BUF (1<<4)
#define XFS_BLF_CHUNK 128
#define XFS_BLF_SHIFT 7
@@ -61,6 +62,55 @@ typedef struct xfs_buf_log_format {
} xfs_buf_log_format_t;
/*
+ * All buffers now need to tell recovery where the magic number
+ * is so that it can verify and calculate the CRCs on the buffer correctly
+ * once the changes have been replayed into the buffer.
+ *
+ * The type value is held in the upper 5 bits of the blf_flags field, which is
+ * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
+ */
+#define XFS_BLFT_BITS 5
+#define XFS_BLFT_SHIFT 11
+#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
+
+enum xfs_blft {
+ XFS_BLFT_UNKNOWN_BUF = 0,
+ XFS_BLFT_UDQUOT_BUF,
+ XFS_BLFT_PDQUOT_BUF,
+ XFS_BLFT_GDQUOT_BUF,
+ XFS_BLFT_BTREE_BUF,
+ XFS_BLFT_AGF_BUF,
+ XFS_BLFT_AGFL_BUF,
+ XFS_BLFT_AGI_BUF,
+ XFS_BLFT_DINO_BUF,
+ XFS_BLFT_SYMLINK_BUF,
+ XFS_BLFT_DIR_BLOCK_BUF,
+ XFS_BLFT_DIR_DATA_BUF,
+ XFS_BLFT_DIR_FREE_BUF,
+ XFS_BLFT_DIR_LEAF1_BUF,
+ XFS_BLFT_DIR_LEAFN_BUF,
+ XFS_BLFT_DA_NODE_BUF,
+ XFS_BLFT_ATTR_LEAF_BUF,
+ XFS_BLFT_ATTR_RMT_BUF,
+ XFS_BLFT_SB_BUF,
+ XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
+};
+
+static inline void
+xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
+{
+ ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
+ blf->blf_flags &= ~XFS_BLFT_MASK;
+ blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
+}
+
+static inline __uint16_t
+xfs_blft_from_flags(struct xfs_buf_log_format *blf)
+{
+ return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
+}
+
+/*
* buf log item flags
*/
#define XFS_BLI_HOLD 0x01
@@ -70,6 +120,7 @@ typedef struct xfs_buf_log_format {
#define XFS_BLI_INODE_ALLOC_BUF 0x10
#define XFS_BLI_STALE_INODE 0x20
#define XFS_BLI_INODE_BUF 0x40
+#define XFS_BLI_ORDERED 0x80
#define XFS_BLI_FLAGS \
{ XFS_BLI_HOLD, "HOLD" }, \
@@ -78,7 +129,8 @@ typedef struct xfs_buf_log_format {
{ XFS_BLI_LOGGED, "LOGGED" }, \
{ XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
{ XFS_BLI_STALE_INODE, "STALE_INODE" }, \
- { XFS_BLI_INODE_BUF, "INODE_BUF" }
+ { XFS_BLI_INODE_BUF, "INODE_BUF" }, \
+ { XFS_BLI_ORDERED, "ORDERED" }
#ifdef __KERNEL__
@@ -113,6 +165,10 @@ void xfs_buf_attach_iodone(struct xfs_buf *,
void xfs_buf_iodone_callbacks(struct xfs_buf *);
void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
+void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
+ enum xfs_blft);
+void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp);
+
#endif /* __KERNEL__ */
#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 4d7696a02418..0b8b2a13cd24 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -38,6 +39,8 @@
#include "xfs_attr_leaf.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
/*
* xfs_da_btree.c
@@ -52,69 +55,195 @@
/*
* Routines used for growing the Btree.
*/
-STATIC int xfs_da_root_split(xfs_da_state_t *state,
+STATIC int xfs_da3_root_split(xfs_da_state_t *state,
xfs_da_state_blk_t *existing_root,
xfs_da_state_blk_t *new_child);
-STATIC int xfs_da_node_split(xfs_da_state_t *state,
+STATIC int xfs_da3_node_split(xfs_da_state_t *state,
xfs_da_state_blk_t *existing_blk,
xfs_da_state_blk_t *split_blk,
xfs_da_state_blk_t *blk_to_add,
int treelevel,
int *result);
-STATIC void xfs_da_node_rebalance(xfs_da_state_t *state,
+STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
xfs_da_state_blk_t *node_blk_1,
xfs_da_state_blk_t *node_blk_2);
-STATIC void xfs_da_node_add(xfs_da_state_t *state,
+STATIC void xfs_da3_node_add(xfs_da_state_t *state,
xfs_da_state_blk_t *old_node_blk,
xfs_da_state_blk_t *new_node_blk);
/*
* Routines used for shrinking the Btree.
*/
-STATIC int xfs_da_root_join(xfs_da_state_t *state,
+STATIC int xfs_da3_root_join(xfs_da_state_t *state,
xfs_da_state_blk_t *root_blk);
-STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval);
-STATIC void xfs_da_node_remove(xfs_da_state_t *state,
+STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
xfs_da_state_blk_t *drop_blk);
-STATIC void xfs_da_node_unbalance(xfs_da_state_t *state,
+STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
xfs_da_state_blk_t *src_node_blk,
xfs_da_state_blk_t *dst_node_blk);
/*
* Utility routines.
*/
-STATIC uint xfs_da_node_lasthash(struct xfs_buf *bp, int *count);
-STATIC int xfs_da_node_order(struct xfs_buf *node1_bp,
- struct xfs_buf *node2_bp);
-STATIC int xfs_da_blk_unlink(xfs_da_state_t *state,
+STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state,
xfs_da_state_blk_t *drop_blk,
xfs_da_state_blk_t *save_blk);
-STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state);
-static void
-xfs_da_node_verify(
+
+kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
+
+/*
+ * Allocate a dir-state structure.
+ * We don't put them on the stack since they're large.
+ */
+xfs_da_state_t *
+xfs_da_state_alloc(void)
+{
+ return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+}
+
+/*
+ * Kill the altpath contents of a da-state structure.
+ */
+STATIC void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+ int i;
+
+ for (i = 0; i < state->altpath.active; i++)
+ state->altpath.blk[i].bp = NULL;
+ state->altpath.active = 0;
+}
+
+/*
+ * Free a da-state structure.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+ xfs_da_state_kill_altpath(state);
+#ifdef DEBUG
+ memset((char *)state, 0, sizeof(*state));
+#endif /* DEBUG */
+ kmem_zone_free(xfs_da_state_zone, state);
+}
+
+void
+xfs_da3_node_hdr_from_disk(
+ struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from)
+{
+ ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+
+ if (from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+ struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
+
+ to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+ to->back = be32_to_cpu(hdr3->info.hdr.back);
+ to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+ to->count = be16_to_cpu(hdr3->__count);
+ to->level = be16_to_cpu(hdr3->__level);
+ return;
+ }
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.__count);
+ to->level = be16_to_cpu(from->hdr.__level);
+}
+
+void
+xfs_da3_node_hdr_to_disk(
+ struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from)
+{
+ ASSERT(from->magic == XFS_DA_NODE_MAGIC ||
+ from->magic == XFS_DA3_NODE_MAGIC);
+
+ if (from->magic == XFS_DA3_NODE_MAGIC) {
+ struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
+
+ hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+ hdr3->info.hdr.back = cpu_to_be32(from->back);
+ hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+ hdr3->__count = cpu_to_be16(from->count);
+ hdr3->__level = cpu_to_be16(from->level);
+ return;
+ }
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.__count = cpu_to_be16(from->count);
+ to->hdr.__level = cpu_to_be16(from->level);
+}
+
+static bool
+xfs_da3_node_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_da_node_hdr *hdr = bp->b_addr;
- int block_ok = 0;
-
- block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC);
- block_ok = block_ok &&
- be16_to_cpu(hdr->level) > 0 &&
- be16_to_cpu(hdr->count) > 0 ;
- if (!block_ok) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ struct xfs_da_intnode *hdr = bp->b_addr;
+ struct xfs_da3_icnode_hdr ichdr;
+
+ xfs_da3_node_hdr_from_disk(&ichdr, hdr);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ if (ichdr.magic != XFS_DA3_NODE_MAGIC)
+ return false;
+
+ if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (ichdr.magic != XFS_DA_NODE_MAGIC)
+ return false;
}
+ if (ichdr.level == 0)
+ return false;
+ if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
+ return false;
+ if (ichdr.count == 0)
+ return false;
+ /*
+ * we don't know if the node is for an attribute or directory tree,
+ * so only fail if the count is outside both bounds
+ */
+ if (ichdr.count > mp->m_dir_node_ents &&
+ ichdr.count > mp->m_attr_node_ents)
+ return false;
+
+ /* XXX: hash order check? */
+
+ return true;
}
static void
-xfs_da_node_write_verify(
+xfs_da3_node_write_verify(
struct xfs_buf *bp)
{
- xfs_da_node_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_da3_node_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DA3_NODE_CRC_OFF);
}
/*
@@ -124,40 +253,48 @@ xfs_da_node_write_verify(
* format of the block being read.
*/
static void
-xfs_da_node_read_verify(
+xfs_da3_node_read_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_da_blkinfo *info = bp->b_addr;
switch (be16_to_cpu(info->magic)) {
+ case XFS_DA3_NODE_MAGIC:
+ if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ XFS_DA3_NODE_CRC_OFF))
+ break;
+ /* fall through */
case XFS_DA_NODE_MAGIC:
- xfs_da_node_verify(bp);
- break;
+ if (!xfs_da3_node_verify(bp))
+ break;
+ return;
case XFS_ATTR_LEAF_MAGIC:
- bp->b_ops = &xfs_attr_leaf_buf_ops;
+ case XFS_ATTR3_LEAF_MAGIC:
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
bp->b_ops->verify_read(bp);
return;
case XFS_DIR2_LEAFN_MAGIC:
- bp->b_ops = &xfs_dir2_leafn_buf_ops;
+ case XFS_DIR3_LEAFN_MAGIC:
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
bp->b_ops->verify_read(bp);
return;
default:
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
- mp, info);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
break;
}
+
+ /* corrupt block */
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
}
-const struct xfs_buf_ops xfs_da_node_buf_ops = {
- .verify_read = xfs_da_node_read_verify,
- .verify_write = xfs_da_node_write_verify,
+const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+ .verify_read = xfs_da3_node_read_verify,
+ .verify_write = xfs_da3_node_write_verify,
};
-
int
-xfs_da_node_read(
+xfs_da3_node_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
@@ -165,8 +302,35 @@ xfs_da_node_read(
struct xfs_buf **bpp,
int which_fork)
{
- return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
- which_fork, &xfs_da_node_buf_ops);
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+ which_fork, &xfs_da3_node_buf_ops);
+ if (!err && tp) {
+ struct xfs_da_blkinfo *info = (*bpp)->b_addr;
+ int type;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ type = XFS_BLFT_DA_NODE_BUF;
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ type = XFS_BLFT_ATTR_LEAF_BUF;
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ type = XFS_BLFT_DIR_LEAFN_BUF;
+ break;
+ default:
+ type = 0;
+ ASSERT(0);
+ break;
+ }
+ xfs_trans_buf_set_type(tp, *bpp, type);
+ }
+ return err;
}
/*========================================================================
@@ -177,33 +341,46 @@ xfs_da_node_read(
* Create the initial contents of an intermediate node.
*/
int
-xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
- struct xfs_buf **bpp, int whichfork)
+xfs_da3_node_create(
+ struct xfs_da_args *args,
+ xfs_dablk_t blkno,
+ int level,
+ struct xfs_buf **bpp,
+ int whichfork)
{
- xfs_da_intnode_t *node;
- struct xfs_buf *bp;
- int error;
- xfs_trans_t *tp;
+ struct xfs_da_intnode *node;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_da3_icnode_hdr ichdr = {0};
+ struct xfs_buf *bp;
+ int error;
trace_xfs_da_node_create(args);
+ ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
- tp = args->trans;
error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
if (error)
return(error);
- ASSERT(bp != NULL);
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
node = bp->b_addr;
- node->hdr.info.forw = 0;
- node->hdr.info.back = 0;
- node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC);
- node->hdr.info.pad = 0;
- node->hdr.count = 0;
- node->hdr.level = cpu_to_be16(level);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ ichdr.magic = XFS_DA3_NODE_MAGIC;
+ hdr3->info.blkno = cpu_to_be64(bp->b_bn);
+ hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
+ uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
+ } else {
+ ichdr.magic = XFS_DA_NODE_MAGIC;
+ }
+ ichdr.level = level;
+
+ xfs_da3_node_hdr_to_disk(node, &ichdr);
xfs_trans_log_buf(tp, bp,
- XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+ XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
- bp->b_ops = &xfs_da_node_buf_ops;
*bpp = bp;
return(0);
}
@@ -213,12 +390,18 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
* intermediate nodes, rebalance, etc.
*/
int /* error */
-xfs_da_split(xfs_da_state_t *state)
+xfs_da3_split(
+ struct xfs_da_state *state)
{
- xfs_da_state_blk_t *oldblk, *newblk, *addblk;
- xfs_da_intnode_t *node;
- struct xfs_buf *bp;
- int max, action, error, i;
+ struct xfs_da_state_blk *oldblk;
+ struct xfs_da_state_blk *newblk;
+ struct xfs_da_state_blk *addblk;
+ struct xfs_da_intnode *node;
+ struct xfs_buf *bp;
+ int max;
+ int action;
+ int error;
+ int i;
trace_xfs_da_split(state->args);
@@ -246,7 +429,7 @@ xfs_da_split(xfs_da_state_t *state)
*/
switch (oldblk->magic) {
case XFS_ATTR_LEAF_MAGIC:
- error = xfs_attr_leaf_split(state, oldblk, newblk);
+ error = xfs_attr3_leaf_split(state, oldblk, newblk);
if ((error != 0) && (error != ENOSPC)) {
return(error); /* GROT: attr is inconsistent */
}
@@ -261,12 +444,12 @@ xfs_da_split(xfs_da_state_t *state)
if (state->inleaf) {
state->extraafter = 0; /* before newblk */
trace_xfs_attr_leaf_split_before(state->args);
- error = xfs_attr_leaf_split(state, oldblk,
+ error = xfs_attr3_leaf_split(state, oldblk,
&state->extrablk);
} else {
state->extraafter = 1; /* after newblk */
trace_xfs_attr_leaf_split_after(state->args);
- error = xfs_attr_leaf_split(state, newblk,
+ error = xfs_attr3_leaf_split(state, newblk,
&state->extrablk);
}
if (error)
@@ -280,7 +463,7 @@ xfs_da_split(xfs_da_state_t *state)
addblk = newblk;
break;
case XFS_DA_NODE_MAGIC:
- error = xfs_da_node_split(state, oldblk, newblk, addblk,
+ error = xfs_da3_node_split(state, oldblk, newblk, addblk,
max - i, &action);
addblk->bp = NULL;
if (error)
@@ -298,7 +481,7 @@ xfs_da_split(xfs_da_state_t *state)
/*
* Update the btree to show the new hashval for this child.
*/
- xfs_da_fixhashpath(state, &state->path);
+ xfs_da3_fixhashpath(state, &state->path);
}
if (!addblk)
return(0);
@@ -308,7 +491,7 @@ xfs_da_split(xfs_da_state_t *state)
*/
ASSERT(state->path.active == 0);
oldblk = &state->path.blk[0];
- error = xfs_da_root_split(state, oldblk, addblk);
+ error = xfs_da3_root_split(state, oldblk, addblk);
if (error) {
addblk->bp = NULL;
return(error); /* GROT: dir is inconsistent */
@@ -319,8 +502,12 @@ xfs_da_split(xfs_da_state_t *state)
* just got bumped because of the addition of a new root node.
* There might be three blocks involved if a double split occurred,
* and the original block 0 could be at any position in the list.
+ *
+ * Note: the magic numbers and sibling pointers are in the same
+ * physical place for both v2 and v3 headers (by design). Hence it
+ * doesn't matter which version of the xfs_da_intnode structure we use
+ * here as the result will be the same using either structure.
*/
-
node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
@@ -359,18 +546,25 @@ xfs_da_split(xfs_da_state_t *state)
* the EOF, extending the inode in process.
*/
STATIC int /* error */
-xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2)
+xfs_da3_root_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
{
- xfs_da_intnode_t *node, *oldroot;
- xfs_da_args_t *args;
- xfs_dablk_t blkno;
- struct xfs_buf *bp;
- int error, size;
- xfs_inode_t *dp;
- xfs_trans_t *tp;
- xfs_mount_t *mp;
- xfs_dir2_leaf_t *leaf;
+ struct xfs_da_intnode *node;
+ struct xfs_da_intnode *oldroot;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ struct xfs_inode *dp;
+ struct xfs_trans *tp;
+ struct xfs_mount *mp;
+ struct xfs_dir2_leaf *leaf;
+ xfs_dablk_t blkno;
+ int level;
+ int error;
+ int size;
trace_xfs_da_root_split(state->args);
@@ -379,29 +573,65 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* to a free space somewhere.
*/
args = state->args;
- ASSERT(args != NULL);
error = xfs_da_grow_inode(args, &blkno);
if (error)
- return(error);
+ return error;
+
dp = args->dp;
tp = args->trans;
mp = state->mp;
error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
if (error)
- return(error);
- ASSERT(bp != NULL);
+ return error;
node = bp->b_addr;
oldroot = blk1->bp->b_addr;
- if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) {
- size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] -
- (char *)oldroot);
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+ struct xfs_da3_icnode_hdr icnodehdr; /* old root's header; distinct name avoids shadowing nodehdr */
+
+ xfs_da3_node_hdr_from_disk(&icnodehdr, oldroot);
+ btree = xfs_da3_node_tree_p(oldroot);
+ size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
+ level = icnodehdr.level;
+
+ /*
+ * we are about to copy oldroot to bp, so set up the type
+ * of bp while we know exactly what it will be.
+ */
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
} else {
- ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+
leaf = (xfs_dir2_leaf_t *)oldroot;
- size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] -
- (char *)leaf);
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = xfs_dir3_leaf_ents_p(leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+ size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
+ level = 0;
+
+ /*
+ * we are about to copy oldroot to bp, so set up the type
+ * of bp while we know exactly what it will be.
+ */
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
}
+
+ /*
+ * we can copy most of the information in the node from one block to
+ * another, but for CRC enabled headers we have to make sure that the
+ * block specific identifiers are kept intact. We update the buffer
+ * directly for this.
+ */
memcpy(node, oldroot, size);
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
+
+ node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
+ }
xfs_trans_log_buf(tp, bp, 0, size - 1);
bp->b_ops = blk1->bp->b_ops;
@@ -411,20 +641,25 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
/*
* Set up the new root node.
*/
- error = xfs_da_node_create(args,
+ error = xfs_da3_node_create(args,
(args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0,
- be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork);
+ level + 1, &bp, args->whichfork);
if (error)
- return(error);
+ return error;
+
node = bp->b_addr;
- node->btree[0].hashval = cpu_to_be32(blk1->hashval);
- node->btree[0].before = cpu_to_be32(blk1->blkno);
- node->btree[1].hashval = cpu_to_be32(blk2->hashval);
- node->btree[1].before = cpu_to_be32(blk2->blkno);
- node->hdr.count = cpu_to_be16(2);
+ xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ btree = xfs_da3_node_tree_p(node);
+ btree[0].hashval = cpu_to_be32(blk1->hashval);
+ btree[0].before = cpu_to_be32(blk1->blkno);
+ btree[1].hashval = cpu_to_be32(blk2->hashval);
+ btree[1].before = cpu_to_be32(blk2->blkno);
+ nodehdr.count = 2;
+ xfs_da3_node_hdr_to_disk(node, &nodehdr);
#ifdef DEBUG
- if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) {
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
ASSERT(blk1->blkno >= mp->m_dirleafblk &&
blk1->blkno < mp->m_dirfreeblk);
ASSERT(blk2->blkno >= mp->m_dirleafblk &&
@@ -434,30 +669,34 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
/* Header is already logged by xfs_da_node_create */
xfs_trans_log_buf(tp, bp,
- XFS_DA_LOGRANGE(node, node->btree,
- sizeof(xfs_da_node_entry_t) * 2));
+ XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
- return(0);
+ return 0;
}
/*
* Split the node, rebalance, then add the new entry.
*/
STATIC int /* error */
-xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
- xfs_da_state_blk_t *newblk,
- xfs_da_state_blk_t *addblk,
- int treelevel, int *result)
+xfs_da3_node_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk,
+ struct xfs_da_state_blk *addblk,
+ int treelevel,
+ int *result)
{
- xfs_da_intnode_t *node;
- xfs_dablk_t blkno;
- int newcount, error;
- int useextra;
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ xfs_dablk_t blkno;
+ int newcount;
+ int error;
+ int useextra;
trace_xfs_da_node_split(state->args);
node = oldblk->bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+ xfs_da3_node_hdr_from_disk(&nodehdr, node);
/*
* With V2 dirs the extra block is data or freespace.
@@ -467,7 +706,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
/*
* Do we have to split the node?
*/
- if ((be16_to_cpu(node->hdr.count) + newcount) > state->node_ents) {
+ if (nodehdr.count + newcount > state->node_ents) {
/*
* Allocate a new node, add to the doubly linked chain of
* nodes, then move some of our excess entries into it.
@@ -476,14 +715,14 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
if (error)
return(error); /* GROT: dir is inconsistent */
- error = xfs_da_node_create(state->args, blkno, treelevel,
+ error = xfs_da3_node_create(state->args, blkno, treelevel,
&newblk->bp, state->args->whichfork);
if (error)
return(error); /* GROT: dir is inconsistent */
newblk->blkno = blkno;
newblk->magic = XFS_DA_NODE_MAGIC;
- xfs_da_node_rebalance(state, oldblk, newblk);
- error = xfs_da_blk_link(state, oldblk, newblk);
+ xfs_da3_node_rebalance(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
if (error)
return(error);
*result = 1;
@@ -495,7 +734,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* Insert the new entry(s) into the correct block
* (updating last hashval in the process).
*
- * xfs_da_node_add() inserts BEFORE the given index,
+ * xfs_da3_node_add() inserts BEFORE the given index,
* and as a result of using node_lookup_int() we always
* point to a valid entry (not after one), but a split
* operation always results in a new block whose hashvals
@@ -504,22 +743,23 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* If we had double-split op below us, then add the extra block too.
*/
node = oldblk->bp->b_addr;
- if (oldblk->index <= be16_to_cpu(node->hdr.count)) {
+ xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ if (oldblk->index <= nodehdr.count) {
oldblk->index++;
- xfs_da_node_add(state, oldblk, addblk);
+ xfs_da3_node_add(state, oldblk, addblk);
if (useextra) {
if (state->extraafter)
oldblk->index++;
- xfs_da_node_add(state, oldblk, &state->extrablk);
+ xfs_da3_node_add(state, oldblk, &state->extrablk);
state->extravalid = 0;
}
} else {
newblk->index++;
- xfs_da_node_add(state, newblk, addblk);
+ xfs_da3_node_add(state, newblk, addblk);
if (useextra) {
if (state->extraafter)
newblk->index++;
- xfs_da_node_add(state, newblk, &state->extrablk);
+ xfs_da3_node_add(state, newblk, &state->extrablk);
state->extravalid = 0;
}
}
@@ -534,33 +774,53 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* NOTE: if blk2 is empty, then it will get the upper half of blk1.
*/
STATIC void
-xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2)
+xfs_da3_node_rebalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
{
- xfs_da_intnode_t *node1, *node2, *tmpnode;
- xfs_da_node_entry_t *btree_s, *btree_d;
- int count, tmp;
- xfs_trans_t *tp;
+ struct xfs_da_intnode *node1;
+ struct xfs_da_intnode *node2;
+ struct xfs_da_intnode *tmpnode;
+ struct xfs_da_node_entry *btree1;
+ struct xfs_da_node_entry *btree2;
+ struct xfs_da_node_entry *btree_s;
+ struct xfs_da_node_entry *btree_d;
+ struct xfs_da3_icnode_hdr nodehdr1;
+ struct xfs_da3_icnode_hdr nodehdr2;
+ struct xfs_trans *tp;
+ int count;
+ int tmp;
+ int swap = 0;
trace_xfs_da_node_rebalance(state->args);
node1 = blk1->bp->b_addr;
node2 = blk2->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
+ btree1 = xfs_da3_node_tree_p(node1);
+ btree2 = xfs_da3_node_tree_p(node2);
+
/*
* Figure out how many entries need to move, and in which direction.
* Swap the nodes around if that makes it simpler.
*/
- if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) &&
- ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) ||
- (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) <
- be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) {
+ if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
+ ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+ (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
+ be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
tmpnode = node1;
node1 = node2;
node2 = tmpnode;
+ xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
+ btree1 = xfs_da3_node_tree_p(node1);
+ btree2 = xfs_da3_node_tree_p(node2);
+ swap = 1;
}
- ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- ASSERT(node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2;
+
+ count = (nodehdr1.count - nodehdr2.count) / 2;
if (count == 0)
return;
tp = state->args->trans;
@@ -571,10 +831,11 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
/*
* Move elements in node2 up to make a hole.
*/
- if ((tmp = be16_to_cpu(node2->hdr.count)) > 0) {
+ tmp = nodehdr2.count;
+ if (tmp > 0) {
tmp *= (uint)sizeof(xfs_da_node_entry_t);
- btree_s = &node2->btree[0];
- btree_d = &node2->btree[count];
+ btree_s = &btree2[0];
+ btree_d = &btree2[count];
memmove(btree_d, btree_s, tmp);
}
@@ -582,12 +843,12 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* Move the req'd B-tree elements from high in node1 to
* low in node2.
*/
- be16_add_cpu(&node2->hdr.count, count);
+ nodehdr2.count += count;
tmp = count * (uint)sizeof(xfs_da_node_entry_t);
- btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count];
- btree_d = &node2->btree[0];
+ btree_s = &btree1[nodehdr1.count - count];
+ btree_d = &btree2[0];
memcpy(btree_d, btree_s, tmp);
- be16_add_cpu(&node1->hdr.count, -count);
+ nodehdr1.count -= count;
} else {
/*
* Move the req'd B-tree elements from low in node2 to
@@ -595,49 +856,60 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
*/
count = -count;
tmp = count * (uint)sizeof(xfs_da_node_entry_t);
- btree_s = &node2->btree[0];
- btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)];
+ btree_s = &btree2[0];
+ btree_d = &btree1[nodehdr1.count];
memcpy(btree_d, btree_s, tmp);
- be16_add_cpu(&node1->hdr.count, count);
+ nodehdr1.count += count;
+
xfs_trans_log_buf(tp, blk1->bp,
XFS_DA_LOGRANGE(node1, btree_d, tmp));
/*
* Move elements in node2 down to fill the hole.
*/
- tmp = be16_to_cpu(node2->hdr.count) - count;
+ tmp = nodehdr2.count - count;
tmp *= (uint)sizeof(xfs_da_node_entry_t);
- btree_s = &node2->btree[count];
- btree_d = &node2->btree[0];
+ btree_s = &btree2[count];
+ btree_d = &btree2[0];
memmove(btree_d, btree_s, tmp);
- be16_add_cpu(&node2->hdr.count, -count);
+ nodehdr2.count -= count;
}
/*
* Log header of node 1 and all current bits of node 2.
*/
+ xfs_da3_node_hdr_to_disk(node1, &nodehdr1);
xfs_trans_log_buf(tp, blk1->bp,
- XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr)));
+ XFS_DA_LOGRANGE(node1, &node1->hdr,
+ xfs_da3_node_hdr_size(node1)));
+
+ xfs_da3_node_hdr_to_disk(node2, &nodehdr2);
xfs_trans_log_buf(tp, blk2->bp,
XFS_DA_LOGRANGE(node2, &node2->hdr,
- sizeof(node2->hdr) +
- sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count)));
+ xfs_da3_node_hdr_size(node2) +
+ (sizeof(btree2[0]) * nodehdr2.count)));
/*
* Record the last hashval from each block for upward propagation.
* (note: don't use the swapped node pointers)
*/
- node1 = blk1->bp->b_addr;
- node2 = blk2->bp->b_addr;
- blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval);
- blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval);
+ if (swap) {
+ node1 = blk1->bp->b_addr;
+ node2 = blk2->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
+ btree1 = xfs_da3_node_tree_p(node1);
+ btree2 = xfs_da3_node_tree_p(node2);
+ }
+ blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
+ blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
/*
* Adjust the expected index for insertion.
*/
- if (blk1->index >= be16_to_cpu(node1->hdr.count)) {
- blk2->index = blk1->index - be16_to_cpu(node1->hdr.count);
- blk1->index = be16_to_cpu(node1->hdr.count) + 1; /* make it invalid */
+ if (blk1->index >= nodehdr1.count) {
+ blk2->index = blk1->index - nodehdr1.count;
+ blk1->index = nodehdr1.count + 1; /* make it invalid */
}
}
@@ -645,18 +917,23 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* Add a new entry to an intermediate node.
*/
STATIC void
-xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
- xfs_da_state_blk_t *newblk)
+xfs_da3_node_add(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk)
{
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- int tmp;
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_node_entry *btree;
+ int tmp;
trace_xfs_da_node_add(state->args);
node = oldblk->bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
+ xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ btree = xfs_da3_node_tree_p(node);
+
+ ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
ASSERT(newblk->blkno != 0);
if (state->args->whichfork == XFS_DATA_FORK)
ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
@@ -666,23 +943,25 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* We may need to make some room before we insert the new node.
*/
tmp = 0;
- btree = &node->btree[ oldblk->index ];
- if (oldblk->index < be16_to_cpu(node->hdr.count)) {
- tmp = (be16_to_cpu(node->hdr.count) - oldblk->index) * (uint)sizeof(*btree);
- memmove(btree + 1, btree, tmp);
+ if (oldblk->index < nodehdr.count) {
+ tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
+ memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
}
- btree->hashval = cpu_to_be32(newblk->hashval);
- btree->before = cpu_to_be32(newblk->blkno);
+ btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
+ btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
xfs_trans_log_buf(state->args->trans, oldblk->bp,
- XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree)));
- be16_add_cpu(&node->hdr.count, 1);
+ XFS_DA_LOGRANGE(node, &btree[oldblk->index],
+ tmp + sizeof(*btree)));
+
+ nodehdr.count += 1;
+ xfs_da3_node_hdr_to_disk(node, &nodehdr);
xfs_trans_log_buf(state->args->trans, oldblk->bp,
- XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+ XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
/*
* Copy the last hash value from the oldblk to propagate upwards.
*/
- oldblk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1 ].hashval);
+ oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
}
/*========================================================================
@@ -694,14 +973,16 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* possibly deallocating that block, etc...
*/
int
-xfs_da_join(xfs_da_state_t *state)
+xfs_da3_join(
+ struct xfs_da_state *state)
{
- xfs_da_state_blk_t *drop_blk, *save_blk;
- int action, error;
+ struct xfs_da_state_blk *drop_blk;
+ struct xfs_da_state_blk *save_blk;
+ int action = 0;
+ int error;
trace_xfs_da_join(state->args);
- action = 0;
drop_blk = &state->path.blk[ state->path.active-1 ];
save_blk = &state->altpath.blk[ state->path.active-1 ];
ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
@@ -722,12 +1003,12 @@ xfs_da_join(xfs_da_state_t *state)
*/
switch (drop_blk->magic) {
case XFS_ATTR_LEAF_MAGIC:
- error = xfs_attr_leaf_toosmall(state, &action);
+ error = xfs_attr3_leaf_toosmall(state, &action);
if (error)
return(error);
if (action == 0)
return(0);
- xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
+ xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
break;
case XFS_DIR2_LEAFN_MAGIC:
error = xfs_dir2_leafn_toosmall(state, &action);
@@ -742,18 +1023,18 @@ xfs_da_join(xfs_da_state_t *state)
* Remove the offending node, fixup hashvals,
* check for a toosmall neighbor.
*/
- xfs_da_node_remove(state, drop_blk);
- xfs_da_fixhashpath(state, &state->path);
- error = xfs_da_node_toosmall(state, &action);
+ xfs_da3_node_remove(state, drop_blk);
+ xfs_da3_fixhashpath(state, &state->path);
+ error = xfs_da3_node_toosmall(state, &action);
if (error)
return(error);
if (action == 0)
return 0;
- xfs_da_node_unbalance(state, drop_blk, save_blk);
+ xfs_da3_node_unbalance(state, drop_blk, save_blk);
break;
}
- xfs_da_fixhashpath(state, &state->altpath);
- error = xfs_da_blk_unlink(state, drop_blk, save_blk);
+ xfs_da3_fixhashpath(state, &state->altpath);
+ error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
xfs_da_state_kill_altpath(state);
if (error)
return(error);
@@ -768,9 +1049,9 @@ xfs_da_join(xfs_da_state_t *state)
* we only have one entry in the root, make the child block
* the new root.
*/
- xfs_da_node_remove(state, drop_blk);
- xfs_da_fixhashpath(state, &state->path);
- error = xfs_da_root_join(state, &state->path.blk[0]);
+ xfs_da3_node_remove(state, drop_blk);
+ xfs_da3_fixhashpath(state, &state->path);
+ error = xfs_da3_root_join(state, &state->path.blk[0]);
return(error);
}
@@ -782,9 +1063,13 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
if (level == 1) {
ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
- magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- } else
- ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+ magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+ magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+ } else {
+ ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+ }
ASSERT(!blkinfo->forw);
ASSERT(!blkinfo->back);
}
@@ -797,52 +1082,61 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
* the old root to block 0 as the new root node.
*/
STATIC int
-xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
+xfs_da3_root_join(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *root_blk)
{
- xfs_da_intnode_t *oldroot;
- xfs_da_args_t *args;
- xfs_dablk_t child;
- struct xfs_buf *bp;
- int error;
+ struct xfs_da_intnode *oldroot;
+ struct xfs_da_args *args;
+ xfs_dablk_t child;
+ struct xfs_buf *bp;
+ struct xfs_da3_icnode_hdr oldroothdr;
+ struct xfs_da_node_entry *btree;
+ int error;
trace_xfs_da_root_join(state->args);
- args = state->args;
- ASSERT(args != NULL);
ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
+
+ args = state->args;
oldroot = root_blk->bp->b_addr;
- ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- ASSERT(!oldroot->hdr.info.forw);
- ASSERT(!oldroot->hdr.info.back);
+ xfs_da3_node_hdr_from_disk(&oldroothdr, oldroot);
+ ASSERT(oldroothdr.forw == 0);
+ ASSERT(oldroothdr.back == 0);
/*
* If the root has more than one child, then don't do anything.
*/
- if (be16_to_cpu(oldroot->hdr.count) > 1)
- return(0);
+ if (oldroothdr.count > 1)
+ return 0;
/*
* Read in the (only) child block, then copy those bytes into
* the root block's buffer and free the original child block.
*/
- child = be32_to_cpu(oldroot->btree[0].before);
+ btree = xfs_da3_node_tree_p(oldroot);
+ child = be32_to_cpu(btree[0].before);
ASSERT(child != 0);
- error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp,
+ error = xfs_da3_node_read(args->trans, args->dp, child, -1, &bp,
args->whichfork);
if (error)
- return(error);
- ASSERT(bp != NULL);
- xfs_da_blkinfo_onlychild_validate(bp->b_addr,
- be16_to_cpu(oldroot->hdr.level));
+ return error;
+ xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
/*
* This could be copying a leaf back into the root block in the case of
* there only being a single leaf block left in the tree. Hence we have
* to update the b_ops pointer as well to match the buffer type change
- * that could occur.
+ * that could occur. For dir3 blocks we also need to update the block
+ * number in the buffer header.
*/
memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
root_blk->bp->b_ops = bp->b_ops;
+ xfs_trans_buf_copy_type(root_blk->bp, bp);
+ if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
+ struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
+ da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
+ }
xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
error = xfs_da_shrink_inode(args, child, bp);
return(error);
@@ -858,14 +1152,21 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
* If nothing can be done, return 0.
*/
STATIC int
-xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
+xfs_da3_node_toosmall(
+ struct xfs_da_state *state,
+ int *action)
{
- xfs_da_intnode_t *node;
- xfs_da_state_blk_t *blk;
- xfs_da_blkinfo_t *info;
- int count, forward, error, retval, i;
- xfs_dablk_t blkno;
- struct xfs_buf *bp;
+ struct xfs_da_intnode *node;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *info;
+ xfs_dablk_t blkno;
+ struct xfs_buf *bp;
+ struct xfs_da3_icnode_hdr nodehdr;
+ int count;
+ int forward;
+ int error;
+ int retval;
+ int i;
trace_xfs_da_node_toosmall(state->args);
@@ -876,10 +1177,9 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
*/
blk = &state->path.blk[ state->path.active-1 ];
info = blk->bp->b_addr;
- ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
node = (xfs_da_intnode_t *)info;
- count = be16_to_cpu(node->hdr.count);
- if (count > (state->node_ents >> 1)) {
+ xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ if (nodehdr.count > (state->node_ents >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
return(0); /* blk over 50%, don't try to join */
}
@@ -890,14 +1190,14 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
* coalesce it with a sibling block. We choose (arbitrarily)
* to merge with the forward block unless it is NULL.
*/
- if (count == 0) {
+ if (nodehdr.count == 0) {
/*
* Make altpath point to the block we want to keep and
* path point to the block we want to drop (this one).
*/
forward = (info->forw != 0);
memcpy(&state->altpath, &state->path, sizeof(state->path));
- error = xfs_da_path_shift(state, &state->altpath, forward,
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
if (error)
return(error);
@@ -916,35 +1216,36 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
* We prefer coalescing with the lower numbered sibling so as
* to shrink a directory over time.
*/
+ count = state->node_ents;
+ count -= state->node_ents >> 2;
+ count -= nodehdr.count;
+
/* start with smaller blk num */
- forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));
+ forward = nodehdr.forw < nodehdr.back;
for (i = 0; i < 2; forward = !forward, i++) {
+ /* sibling's header; nodehdr must keep this block's forw/back for the next pass */
+ struct xfs_da3_icnode_hdr thdr;
if (forward)
- blkno = be32_to_cpu(info->forw);
+ blkno = nodehdr.forw;
else
- blkno = be32_to_cpu(info->back);
+ blkno = nodehdr.back;
if (blkno == 0)
continue;
- error = xfs_da_node_read(state->args->trans, state->args->dp,
+ error = xfs_da3_node_read(state->args->trans, state->args->dp,
blkno, -1, &bp, state->args->whichfork);
if (error)
return(error);
- ASSERT(bp != NULL);
- node = (xfs_da_intnode_t *)info;
- count = state->node_ents;
- count -= state->node_ents >> 2;
- count -= be16_to_cpu(node->hdr.count);
node = bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- count -= be16_to_cpu(node->hdr.count);
+ xfs_da3_node_hdr_from_disk(&thdr, node);
xfs_trans_brelse(state->args->trans, bp);
- if (count >= 0)
+
+ if (count - thdr.count >= 0)
break; /* fits with at least 25% to spare */
}
if (i >= 2) {
*action = 0;
- return(0);
+ return 0;
}
/*
@@ -953,28 +1252,42 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
*/
memcpy(&state->altpath, &state->path, sizeof(state->path));
if (blkno < blk->blkno) {
- error = xfs_da_path_shift(state, &state->altpath, forward,
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
- if (error) {
- return(error);
- }
- if (retval) {
- *action = 0;
- return(0);
- }
} else {
- error = xfs_da_path_shift(state, &state->path, forward,
+ error = xfs_da3_path_shift(state, &state->path, forward,
0, &retval);
- if (error) {
- return(error);
- }
- if (retval) {
- *action = 0;
- return(0);
- }
+ }
+ if (error)
+ return error;
+ if (retval) {
+ *action = 0;
+ return 0;
}
*action = 1;
- return(0);
+ return 0;
+}
+
+/*
+ * Pick up the last hashvalue from an intermediate node.
+ */
+STATIC uint
+xfs_da3_node_lasthash(
+ struct xfs_buf *bp,
+ int *count)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+
+ node = bp->b_addr;
+ xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ if (count)
+ *count = nodehdr.count;
+ if (!nodehdr.count)
+ return 0;
+ btree = xfs_da3_node_tree_p(node);
+ return be32_to_cpu(btree[nodehdr.count - 1].hashval);
}
/*
@@ -982,13 +1295,16 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
* when we stop making changes, return.
*/
void
-xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
+xfs_da3_fixhashpath(
+ struct xfs_da_state *state,
+ struct xfs_da_state_path *path)
{
- xfs_da_state_blk_t *blk;
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- xfs_dahash_t lasthash=0;
- int level, count;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ xfs_dahash_t lasthash=0;
+ int level;
+ int count;
trace_xfs_da_fixhashpath(state->args);
@@ -1006,23 +1322,26 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
return;
break;
case XFS_DA_NODE_MAGIC:
- lasthash = xfs_da_node_lasthash(blk->bp, &count);
+ lasthash = xfs_da3_node_lasthash(blk->bp, &count);
if (count == 0)
return;
break;
}
for (blk--, level--; level >= 0; blk--, level--) {
+ struct xfs_da3_icnode_hdr nodehdr;
+
node = blk->bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- btree = &node->btree[ blk->index ];
+ xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ btree = xfs_da3_node_tree_p(node);
if (be32_to_cpu(btree->hashval) == lasthash)
break;
blk->hashval = lasthash;
- btree->hashval = cpu_to_be32(lasthash);
+ btree[blk->index].hashval = cpu_to_be32(lasthash);
xfs_trans_log_buf(state->args->trans, blk->bp,
- XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
+ XFS_DA_LOGRANGE(node, &btree[blk->index],
+ sizeof(*btree)));
- lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
+ lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
}
}
@@ -1030,104 +1349,120 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
* Remove an entry from an intermediate node.
*/
STATIC void
-xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
+xfs_da3_node_remove(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk)
{
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- int tmp;
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_node_entry *btree;
+ int index;
+ int tmp;
trace_xfs_da_node_remove(state->args);
node = drop_blk->bp->b_addr;
- ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count));
+ xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ ASSERT(drop_blk->index < nodehdr.count);
ASSERT(drop_blk->index >= 0);
/*
* Copy over the offending entry, or just zero it out.
*/
- btree = &node->btree[drop_blk->index];
- if (drop_blk->index < (be16_to_cpu(node->hdr.count)-1)) {
- tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1;
+ index = drop_blk->index;
+ btree = xfs_da3_node_tree_p(node);
+ if (index < nodehdr.count - 1) {
+ tmp = nodehdr.count - index - 1;
tmp *= (uint)sizeof(xfs_da_node_entry_t);
- memmove(btree, btree + 1, tmp);
+ memmove(&btree[index], &btree[index + 1], tmp);
xfs_trans_log_buf(state->args->trans, drop_blk->bp,
- XFS_DA_LOGRANGE(node, btree, tmp));
- btree = &node->btree[be16_to_cpu(node->hdr.count)-1];
+ XFS_DA_LOGRANGE(node, &btree[index], tmp));
+ index = nodehdr.count - 1;
}
- memset((char *)btree, 0, sizeof(xfs_da_node_entry_t));
+ memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
xfs_trans_log_buf(state->args->trans, drop_blk->bp,
- XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
- be16_add_cpu(&node->hdr.count, -1);
+ XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
+ nodehdr.count -= 1;
+ xfs_da3_node_hdr_to_disk(node, &nodehdr);
xfs_trans_log_buf(state->args->trans, drop_blk->bp,
- XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+ XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
/*
* Copy the last hash value from the block to propagate upwards.
*/
- btree--;
- drop_blk->hashval = be32_to_cpu(btree->hashval);
+ drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
}
/*
- * Unbalance the btree elements between two intermediate nodes,
+ * Unbalance the elements between two intermediate nodes,
* move all Btree elements from one node into another.
*/
STATIC void
-xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
- xfs_da_state_blk_t *save_blk)
+xfs_da3_node_unbalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
{
- xfs_da_intnode_t *drop_node, *save_node;
- xfs_da_node_entry_t *btree;
- int tmp;
- xfs_trans_t *tp;
+ struct xfs_da_intnode *drop_node;
+ struct xfs_da_intnode *save_node;
+ struct xfs_da_node_entry *drop_btree;
+ struct xfs_da_node_entry *save_btree;
+ struct xfs_da3_icnode_hdr drop_hdr;
+ struct xfs_da3_icnode_hdr save_hdr;
+ struct xfs_trans *tp;
+ int sindex;
+ int tmp;
trace_xfs_da_node_unbalance(state->args);
drop_node = drop_blk->bp->b_addr;
save_node = save_blk->bp->b_addr;
- ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- ASSERT(save_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+ xfs_da3_node_hdr_from_disk(&drop_hdr, drop_node);
+ xfs_da3_node_hdr_from_disk(&save_hdr, save_node);
+ drop_btree = xfs_da3_node_tree_p(drop_node);
+ save_btree = xfs_da3_node_tree_p(save_node);
tp = state->args->trans;
/*
* If the dying block has lower hashvals, then move all the
* elements in the remaining block up to make a hole.
*/
- if ((be32_to_cpu(drop_node->btree[0].hashval) < be32_to_cpu(save_node->btree[ 0 ].hashval)) ||
- (be32_to_cpu(drop_node->btree[be16_to_cpu(drop_node->hdr.count)-1].hashval) <
- be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval)))
- {
- btree = &save_node->btree[be16_to_cpu(drop_node->hdr.count)];
- tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
- memmove(btree, &save_node->btree[0], tmp);
- btree = &save_node->btree[0];
+ if ((be32_to_cpu(drop_btree[0].hashval) <
+ be32_to_cpu(save_btree[0].hashval)) ||
+ (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
+ be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
+ /* XXX: check this - is memmove dst correct? */
+ tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
+ memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
+
+ sindex = 0;
xfs_trans_log_buf(tp, save_blk->bp,
- XFS_DA_LOGRANGE(save_node, btree,
- (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) *
- sizeof(xfs_da_node_entry_t)));
+ XFS_DA_LOGRANGE(save_node, &save_btree[0],
+ (save_hdr.count + drop_hdr.count) *
+ sizeof(xfs_da_node_entry_t)));
} else {
- btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)];
+ sindex = save_hdr.count;
xfs_trans_log_buf(tp, save_blk->bp,
- XFS_DA_LOGRANGE(save_node, btree,
- be16_to_cpu(drop_node->hdr.count) *
- sizeof(xfs_da_node_entry_t)));
+ XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
+ drop_hdr.count * sizeof(xfs_da_node_entry_t)));
}
/*
* Move all the B-tree elements from drop_blk to save_blk.
*/
- tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
- memcpy(btree, &drop_node->btree[0], tmp);
- be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count));
+ tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
+ memcpy(&save_btree[sindex], &drop_btree[0], tmp);
+ save_hdr.count += drop_hdr.count;
+ xfs_da3_node_hdr_to_disk(save_node, &save_hdr);
xfs_trans_log_buf(tp, save_blk->bp,
XFS_DA_LOGRANGE(save_node, &save_node->hdr,
- sizeof(save_node->hdr)));
+ xfs_da3_node_hdr_size(save_node)));
/*
* Save the last hashval in the remaining block for upward propagation.
*/
- save_blk->hashval = be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval);
+ save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
}
/*========================================================================
@@ -1146,16 +1481,24 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
* pruned depth-first tree search.
*/
int /* error */
-xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
+xfs_da3_node_lookup_int(
+ struct xfs_da_state *state,
+ int *result)
{
- xfs_da_state_blk_t *blk;
- xfs_da_blkinfo_t *curr;
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- xfs_dablk_t blkno;
- int probe, span, max, error, retval;
- xfs_dahash_t hashval, btreehashval;
- xfs_da_args_t *args;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *curr;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_args *args;
+ xfs_dablk_t blkno;
+ xfs_dahash_t hashval;
+ xfs_dahash_t btreehashval;
+ int probe;
+ int span;
+ int max;
+ int error;
+ int retval;
args = state->args;
@@ -1171,7 +1514,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
* Read the next node down in the tree.
*/
blk->blkno = blkno;
- error = xfs_da_node_read(args->trans, args->dp, blkno,
+ error = xfs_da3_node_read(args->trans, args->dp, blkno,
-1, &blk->bp, args->whichfork);
if (error) {
blk->blkno = 0;
@@ -1180,66 +1523,75 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
}
curr = blk->bp->b_addr;
blk->magic = be16_to_cpu(curr->magic);
- ASSERT(blk->magic == XFS_DA_NODE_MAGIC ||
- blk->magic == XFS_DIR2_LEAFN_MAGIC ||
- blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+ if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
+ blk->magic == XFS_ATTR3_LEAF_MAGIC) {
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+ break;
+ }
+
+ if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+ blk->magic == XFS_DIR3_LEAFN_MAGIC) {
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
+ break;
+ }
+
+ blk->magic = XFS_DA_NODE_MAGIC;
+
/*
* Search an intermediate node for a match.
*/
- if (blk->magic == XFS_DA_NODE_MAGIC) {
- node = blk->bp->b_addr;
- max = be16_to_cpu(node->hdr.count);
- blk->hashval = be32_to_cpu(node->btree[max-1].hashval);
+ node = blk->bp->b_addr;
+ xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ btree = xfs_da3_node_tree_p(node);
- /*
- * Binary search. (note: small blocks will skip loop)
- */
- probe = span = max / 2;
- hashval = args->hashval;
- for (btree = &node->btree[probe]; span > 4;
- btree = &node->btree[probe]) {
- span /= 2;
- btreehashval = be32_to_cpu(btree->hashval);
- if (btreehashval < hashval)
- probe += span;
- else if (btreehashval > hashval)
- probe -= span;
- else
- break;
- }
- ASSERT((probe >= 0) && (probe < max));
- ASSERT((span <= 4) || (be32_to_cpu(btree->hashval) == hashval));
+ max = nodehdr.count;
+ blk->hashval = be32_to_cpu(btree[max - 1].hashval);
- /*
- * Since we may have duplicate hashval's, find the first
- * matching hashval in the node.
- */
- while ((probe > 0) && (be32_to_cpu(btree->hashval) >= hashval)) {
- btree--;
- probe--;
- }
- while ((probe < max) && (be32_to_cpu(btree->hashval) < hashval)) {
- btree++;
- probe++;
- }
+ /*
+ * Binary search. (note: small blocks will skip loop)
+ */
+ probe = span = max / 2;
+ hashval = args->hashval;
+ while (span > 4) {
+ span /= 2;
+ btreehashval = be32_to_cpu(btree[probe].hashval);
+ if (btreehashval < hashval)
+ probe += span;
+ else if (btreehashval > hashval)
+ probe -= span;
+ else
+ break;
+ }
+ ASSERT((probe >= 0) && (probe < max));
+ ASSERT((span <= 4) ||
+ (be32_to_cpu(btree[probe].hashval) == hashval));
- /*
- * Pick the right block to descend on.
- */
- if (probe == max) {
- blk->index = max-1;
- blkno = be32_to_cpu(node->btree[max-1].before);
- } else {
- blk->index = probe;
- blkno = be32_to_cpu(btree->before);
- }
- } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
- blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
- break;
- } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
- blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
- break;
+ /*
+ * Since we may have duplicate hashval's, find the first
+ * matching hashval in the node.
+ */
+ while (probe > 0 &&
+ be32_to_cpu(btree[probe].hashval) >= hashval) {
+ probe--;
+ }
+ while (probe < max &&
+ be32_to_cpu(btree[probe].hashval) < hashval) {
+ probe++;
+ }
+
+ /*
+ * Pick the right block to descend on.
+ */
+ if (probe == max) {
+ blk->index = max - 1;
+ blkno = be32_to_cpu(btree[max - 1].before);
+ } else {
+ blk->index = probe;
+ blkno = be32_to_cpu(btree[probe].before);
}
}
@@ -1254,7 +1606,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
&blk->index, state);
} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
- retval = xfs_attr_leaf_lookup_int(blk->bp, args);
+ retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
blk->index = args->index;
args->blkno = blk->blkno;
} else {
@@ -1263,7 +1615,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
}
if (((retval == ENOENT) || (retval == ENOATTR)) &&
(blk->hashval == args->hashval)) {
- error = xfs_da_path_shift(state, &state->path, 1, 1,
+ error = xfs_da3_path_shift(state, &state->path, 1, 1,
&retval);
if (error)
return(error);
@@ -1285,16 +1637,52 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
*========================================================================*/
/*
+ * Compare two intermediate nodes for "order".
+ */
+STATIC int
+xfs_da3_node_order(
+ struct xfs_buf *node1_bp,
+ struct xfs_buf *node2_bp)
+{
+ struct xfs_da_intnode *node1;
+ struct xfs_da_intnode *node2;
+ struct xfs_da_node_entry *btree1;
+ struct xfs_da_node_entry *btree2;
+ struct xfs_da3_icnode_hdr node1hdr;
+ struct xfs_da3_icnode_hdr node2hdr;
+
+ node1 = node1_bp->b_addr;
+ node2 = node2_bp->b_addr;
+ xfs_da3_node_hdr_from_disk(&node1hdr, node1);
+ xfs_da3_node_hdr_from_disk(&node2hdr, node2);
+ btree1 = xfs_da3_node_tree_p(node1);
+ btree2 = xfs_da3_node_tree_p(node2);
+
+ if (node1hdr.count > 0 && node2hdr.count > 0 &&
+ ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+ (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
+ be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
* Link a new block into a doubly linked list of blocks (of whatever type).
*/
int /* error */
-xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
- xfs_da_state_blk_t *new_blk)
+xfs_da3_blk_link(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *old_blk,
+ struct xfs_da_state_blk *new_blk)
{
- xfs_da_blkinfo_t *old_info, *new_info, *tmp_info;
- xfs_da_args_t *args;
- int before=0, error;
- struct xfs_buf *bp;
+ struct xfs_da_blkinfo *old_info;
+ struct xfs_da_blkinfo *new_info;
+ struct xfs_da_blkinfo *tmp_info;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ int before = 0;
+ int error;
/*
* Set up environment.
@@ -1306,9 +1694,6 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
old_blk->magic == XFS_ATTR_LEAF_MAGIC);
- ASSERT(old_blk->magic == be16_to_cpu(old_info->magic));
- ASSERT(new_blk->magic == be16_to_cpu(new_info->magic));
- ASSERT(old_blk->magic == new_blk->magic);
switch (old_blk->magic) {
case XFS_ATTR_LEAF_MAGIC:
@@ -1318,7 +1703,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
break;
case XFS_DA_NODE_MAGIC:
- before = xfs_da_node_order(old_blk->bp, new_blk->bp);
+ before = xfs_da3_node_order(old_blk->bp, new_blk->bp);
break;
}
@@ -1333,14 +1718,14 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
new_info->forw = cpu_to_be32(old_blk->blkno);
new_info->back = old_info->back;
if (old_info->back) {
- error = xfs_da_node_read(args->trans, args->dp,
+ error = xfs_da3_node_read(args->trans, args->dp,
be32_to_cpu(old_info->back),
-1, &bp, args->whichfork);
if (error)
return(error);
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
- ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic));
+ ASSERT(tmp_info->magic == old_info->magic);
ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
tmp_info->forw = cpu_to_be32(new_blk->blkno);
xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
@@ -1354,7 +1739,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
new_info->forw = old_info->forw;
new_info->back = cpu_to_be32(old_blk->blkno);
if (old_info->forw) {
- error = xfs_da_node_read(args->trans, args->dp,
+ error = xfs_da3_node_read(args->trans, args->dp,
be32_to_cpu(old_info->forw),
-1, &bp, args->whichfork);
if (error)
@@ -1375,59 +1760,20 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
}
/*
- * Compare two intermediate nodes for "order".
- */
-STATIC int
-xfs_da_node_order(
- struct xfs_buf *node1_bp,
- struct xfs_buf *node2_bp)
-{
- xfs_da_intnode_t *node1, *node2;
-
- node1 = node1_bp->b_addr;
- node2 = node2_bp->b_addr;
- ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) &&
- node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) &&
- ((be32_to_cpu(node2->btree[0].hashval) <
- be32_to_cpu(node1->btree[0].hashval)) ||
- (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) <
- be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) {
- return(1);
- }
- return(0);
-}
-
-/*
- * Pick up the last hashvalue from an intermediate node.
- */
-STATIC uint
-xfs_da_node_lasthash(
- struct xfs_buf *bp,
- int *count)
-{
- xfs_da_intnode_t *node;
-
- node = bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- if (count)
- *count = be16_to_cpu(node->hdr.count);
- if (!node->hdr.count)
- return(0);
- return be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
-}
-
-/*
* Unlink a block from a doubly linked list of blocks.
*/
STATIC int /* error */
-xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
- xfs_da_state_blk_t *save_blk)
+xfs_da3_blk_unlink(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
{
- xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info;
- xfs_da_args_t *args;
- struct xfs_buf *bp;
- int error;
+ struct xfs_da_blkinfo *drop_info;
+ struct xfs_da_blkinfo *save_info;
+ struct xfs_da_blkinfo *tmp_info;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ int error;
/*
* Set up environment.
@@ -1439,8 +1785,6 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
save_blk->magic == XFS_ATTR_LEAF_MAGIC);
- ASSERT(save_blk->magic == be16_to_cpu(save_info->magic));
- ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic));
ASSERT(save_blk->magic == drop_blk->magic);
ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
(be32_to_cpu(save_info->back) == drop_blk->blkno));
@@ -1454,7 +1798,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
trace_xfs_da_unlink_back(args);
save_info->back = drop_info->back;
if (drop_info->back) {
- error = xfs_da_node_read(args->trans, args->dp,
+ error = xfs_da3_node_read(args->trans, args->dp,
be32_to_cpu(drop_info->back),
-1, &bp, args->whichfork);
if (error)
@@ -1471,7 +1815,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
trace_xfs_da_unlink_forward(args);
save_info->forw = drop_info->forw;
if (drop_info->forw) {
- error = xfs_da_node_read(args->trans, args->dp,
+ error = xfs_da3_node_read(args->trans, args->dp,
be32_to_cpu(drop_info->forw),
-1, &bp, args->whichfork);
if (error)
@@ -1499,15 +1843,22 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
* the new bottom and the root.
*/
int /* error */
-xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
- int forward, int release, int *result)
+xfs_da3_path_shift(
+ struct xfs_da_state *state,
+ struct xfs_da_state_path *path,
+ int forward,
+ int release,
+ int *result)
{
- xfs_da_state_blk_t *blk;
- xfs_da_blkinfo_t *info;
- xfs_da_intnode_t *node;
- xfs_da_args_t *args;
- xfs_dablk_t blkno=0;
- int level, error;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *info;
+ struct xfs_da_intnode *node;
+ struct xfs_da_args *args;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ xfs_dablk_t blkno = 0;
+ int level;
+ int error;
trace_xfs_da_path_shift(state->args);
@@ -1522,16 +1873,17 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
level = (path->active-1) - 1; /* skip bottom layer in path */
for (blk = &path->blk[level]; level >= 0; blk--, level--) {
- ASSERT(blk->bp != NULL);
node = blk->bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) {
+ xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ btree = xfs_da3_node_tree_p(node);
+
+ if (forward && (blk->index < nodehdr.count - 1)) {
blk->index++;
- blkno = be32_to_cpu(node->btree[blk->index].before);
+ blkno = be32_to_cpu(btree[blk->index].before);
break;
} else if (!forward && (blk->index > 0)) {
blk->index--;
- blkno = be32_to_cpu(node->btree[blk->index].before);
+ blkno = be32_to_cpu(btree[blk->index].before);
break;
}
}
@@ -1557,45 +1909,60 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
* Read the next child block.
*/
blk->blkno = blkno;
- error = xfs_da_node_read(args->trans, args->dp, blkno, -1,
+ error = xfs_da3_node_read(args->trans, args->dp, blkno, -1,
&blk->bp, args->whichfork);
if (error)
return(error);
- ASSERT(blk->bp != NULL);
info = blk->bp->b_addr;
ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
- info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- blk->magic = be16_to_cpu(info->magic);
- if (blk->magic == XFS_DA_NODE_MAGIC) {
+ info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+
+ /*
+ * Note: we flatten the magic number to a single type so we
+ * don't have to compare against crc/non-crc types elsewhere.
+ */
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ blk->magic = XFS_DA_NODE_MAGIC;
node = (xfs_da_intnode_t *)info;
- blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
+ xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ btree = xfs_da3_node_tree_p(node);
+ blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
if (forward)
blk->index = 0;
else
- blk->index = be16_to_cpu(node->hdr.count)-1;
- blkno = be32_to_cpu(node->btree[blk->index].before);
- } else {
+ blk->index = nodehdr.count - 1;
+ blkno = be32_to_cpu(btree[blk->index].before);
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
ASSERT(level == path->active-1);
blk->index = 0;
- switch(blk->magic) {
- case XFS_ATTR_LEAF_MAGIC:
- blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
- NULL);
- break;
- case XFS_DIR2_LEAFN_MAGIC:
- blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
- NULL);
- break;
- default:
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
- blk->magic == XFS_DIR2_LEAFN_MAGIC);
- break;
- }
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
+ NULL);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ ASSERT(level == path->active-1);
+ blk->index = 0;
+ blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
+ NULL);
+ break;
+ default:
+ ASSERT(0);
+ break;
}
}
*result = 0;
- return(0);
+ return 0;
}
@@ -1782,22 +2149,36 @@ xfs_da_grow_inode(
* a bmap btree split to do that.
*/
STATIC int
-xfs_da_swap_lastblock(
- xfs_da_args_t *args,
- xfs_dablk_t *dead_blknop,
- struct xfs_buf **dead_bufp)
+xfs_da3_swap_lastblock(
+ struct xfs_da_args *args,
+ xfs_dablk_t *dead_blknop,
+ struct xfs_buf **dead_bufp)
{
- xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno;
- struct xfs_buf *dead_buf, *last_buf, *sib_buf, *par_buf;
- xfs_fileoff_t lastoff;
- xfs_inode_t *ip;
- xfs_trans_t *tp;
- xfs_mount_t *mp;
- int error, w, entno, level, dead_level;
- xfs_da_blkinfo_t *dead_info, *sib_info;
- xfs_da_intnode_t *par_node, *dead_node;
- xfs_dir2_leaf_t *dead_leaf2;
- xfs_dahash_t dead_hash;
+ struct xfs_da_blkinfo *dead_info;
+ struct xfs_da_blkinfo *sib_info;
+ struct xfs_da_intnode *par_node;
+ struct xfs_da_intnode *dead_node;
+ struct xfs_dir2_leaf *dead_leaf2;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr par_hdr;
+ struct xfs_inode *ip;
+ struct xfs_trans *tp;
+ struct xfs_mount *mp;
+ struct xfs_buf *dead_buf;
+ struct xfs_buf *last_buf;
+ struct xfs_buf *sib_buf;
+ struct xfs_buf *par_buf;
+ xfs_dahash_t dead_hash;
+ xfs_fileoff_t lastoff;
+ xfs_dablk_t dead_blkno;
+ xfs_dablk_t last_blkno;
+ xfs_dablk_t sib_blkno;
+ xfs_dablk_t par_blkno;
+ int error;
+ int w;
+ int entno;
+ int level;
+ int dead_level;
trace_xfs_da_swap_lastblock(args);
@@ -1821,7 +2202,7 @@ xfs_da_swap_lastblock(
* Read the last block in the btree space.
*/
last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
- error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w);
+ error = xfs_da3_node_read(tp, ip, last_blkno, -1, &last_buf, w);
if (error)
return error;
/*
@@ -1833,22 +2214,31 @@ xfs_da_swap_lastblock(
/*
* Get values from the moved block.
*/
- if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) {
+ if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+
dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, dead_leaf2);
+ ents = xfs_dir3_leaf_ents_p(dead_leaf2);
dead_level = 0;
- dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval);
+ dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
} else {
- ASSERT(dead_info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+ struct xfs_da3_icnode_hdr deadhdr;
+
dead_node = (xfs_da_intnode_t *)dead_info;
- dead_level = be16_to_cpu(dead_node->hdr.level);
- dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval);
+ xfs_da3_node_hdr_from_disk(&deadhdr, dead_node);
+ btree = xfs_da3_node_tree_p(dead_node);
+ dead_level = deadhdr.level;
+ dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
}
sib_buf = par_buf = NULL;
/*
* If the moved block has a left sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->back))) {
- error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+ error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
if (error)
goto done;
sib_info = sib_buf->b_addr;
@@ -1870,7 +2260,7 @@ xfs_da_swap_lastblock(
* If the moved block has a right sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
- error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+ error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
if (error)
goto done;
sib_info = sib_buf->b_addr;
@@ -1894,31 +2284,31 @@ xfs_da_swap_lastblock(
* Walk down the tree looking for the parent of the moved block.
*/
for (;;) {
- error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+ error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w);
if (error)
goto done;
par_node = par_buf->b_addr;
- if (unlikely(par_node->hdr.info.magic !=
- cpu_to_be16(XFS_DA_NODE_MAGIC) ||
- (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) {
+ xfs_da3_node_hdr_from_disk(&par_hdr, par_node);
+ if (level >= 0 && level != par_hdr.level + 1) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
XFS_ERRLEVEL_LOW, mp);
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- level = be16_to_cpu(par_node->hdr.level);
+ level = par_hdr.level;
+ btree = xfs_da3_node_tree_p(par_node);
for (entno = 0;
- entno < be16_to_cpu(par_node->hdr.count) &&
- be32_to_cpu(par_node->btree[entno].hashval) < dead_hash;
+ entno < par_hdr.count &&
+ be32_to_cpu(btree[entno].hashval) < dead_hash;
entno++)
continue;
- if (unlikely(entno == be16_to_cpu(par_node->hdr.count))) {
+ if (entno == par_hdr.count) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
XFS_ERRLEVEL_LOW, mp);
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- par_blkno = be32_to_cpu(par_node->btree[entno].before);
+ par_blkno = be32_to_cpu(btree[entno].before);
if (level == dead_level + 1)
break;
xfs_trans_brelse(tp, par_buf);
@@ -1930,13 +2320,13 @@ xfs_da_swap_lastblock(
*/
for (;;) {
for (;
- entno < be16_to_cpu(par_node->hdr.count) &&
- be32_to_cpu(par_node->btree[entno].before) != last_blkno;
+ entno < par_hdr.count &&
+ be32_to_cpu(btree[entno].before) != last_blkno;
entno++)
continue;
- if (entno < be16_to_cpu(par_node->hdr.count))
+ if (entno < par_hdr.count)
break;
- par_blkno = be32_to_cpu(par_node->hdr.info.forw);
+ par_blkno = par_hdr.forw;
xfs_trans_brelse(tp, par_buf);
par_buf = NULL;
if (unlikely(par_blkno == 0)) {
@@ -1945,27 +2335,27 @@ xfs_da_swap_lastblock(
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+ error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w);
if (error)
goto done;
par_node = par_buf->b_addr;
- if (unlikely(
- be16_to_cpu(par_node->hdr.level) != level ||
- par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC))) {
+ xfs_da3_node_hdr_from_disk(&par_hdr, par_node);
+ if (par_hdr.level != level) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
XFS_ERRLEVEL_LOW, mp);
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
+ btree = xfs_da3_node_tree_p(par_node);
entno = 0;
}
/*
* Update the parent entry pointing to the moved block.
*/
- par_node->btree[entno].before = cpu_to_be32(dead_blkno);
+ btree[entno].before = cpu_to_be32(dead_blkno);
xfs_trans_log_buf(tp, par_buf,
- XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before,
- sizeof(par_node->btree[entno].before)));
+ XFS_DA_LOGRANGE(par_node, &btree[entno].before,
+ sizeof(btree[entno].before)));
*dead_blknop = last_blkno;
*dead_bufp = last_buf;
return 0;
@@ -2007,14 +2397,15 @@ xfs_da_shrink_inode(
* Remove extents. If we get ENOSPC for a dir we have to move
* the last block to the place we want to kill.
*/
- if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
- xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
- 0, args->firstblock, args->flist,
- &done)) == ENOSPC) {
+ error = xfs_bunmapi(tp, dp, dead_blkno, count,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+ 0, args->firstblock, args->flist, &done);
+ if (error == ENOSPC) {
if (w != XFS_DATA_FORK)
break;
- if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
- &dead_buf)))
+ error = xfs_da3_swap_lastblock(args, &dead_blkno,
+ &dead_buf);
+ if (error)
break;
} else {
break;
@@ -2074,7 +2465,8 @@ xfs_buf_map_from_irec(
ASSERT(nirecs >= 1);
if (nirecs > 1) {
- map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP);
+ map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
+ KM_SLEEP | KM_NOFS);
if (!map)
return ENOMEM;
*mapp = map;
@@ -2130,7 +2522,8 @@ xfs_dabuf_map(
* Optimize the one-block case.
*/
if (nfsb != 1)
- irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP);
+ irecs = kmem_zalloc(sizeof(irec) * nfsb,
+ KM_SLEEP | KM_NOFS);
nirecs = nfsb;
error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
@@ -2279,12 +2672,21 @@ xfs_da_read_buf(
magic1 = be32_to_cpu(hdr->magic);
if (unlikely(
XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
+ (magic != XFS_DA3_NODE_MAGIC) &&
(magic != XFS_ATTR_LEAF_MAGIC) &&
+ (magic != XFS_ATTR3_LEAF_MAGIC) &&
(magic != XFS_DIR2_LEAF1_MAGIC) &&
+ (magic != XFS_DIR3_LEAF1_MAGIC) &&
(magic != XFS_DIR2_LEAFN_MAGIC) &&
+ (magic != XFS_DIR3_LEAFN_MAGIC) &&
(magic1 != XFS_DIR2_BLOCK_MAGIC) &&
+ (magic1 != XFS_DIR3_BLOCK_MAGIC) &&
(magic1 != XFS_DIR2_DATA_MAGIC) &&
- (free->hdr.magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)),
+ (magic1 != XFS_DIR3_DATA_MAGIC) &&
+ (free->hdr.magic !=
+ cpu_to_be32(XFS_DIR2_FREE_MAGIC)) &&
+ (free->hdr.magic !=
+ cpu_to_be32(XFS_DIR3_FREE_MAGIC)),
mp, XFS_ERRTAG_DA_READ_BUF,
XFS_RANDOM_DA_READ_BUF))) {
trace_xfs_da_btree_corrupt(bp, _RET_IP_);
@@ -2342,41 +2744,3 @@ out_free:
return -1;
return mappedbno;
}
-
-kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
-
-/*
- * Allocate a dir-state structure.
- * We don't put them on the stack since they're large.
- */
-xfs_da_state_t *
-xfs_da_state_alloc(void)
-{
- return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
-}
-
-/*
- * Kill the altpath contents of a da-state structure.
- */
-STATIC void
-xfs_da_state_kill_altpath(xfs_da_state_t *state)
-{
- int i;
-
- for (i = 0; i < state->altpath.active; i++)
- state->altpath.blk[i].bp = NULL;
- state->altpath.active = 0;
-}
-
-/*
- * Free a da-state structure.
- */
-void
-xfs_da_state_free(xfs_da_state_t *state)
-{
- xfs_da_state_kill_altpath(state);
-#ifdef DEBUG
- memset((char *)state, 0, sizeof(*state));
-#endif /* DEBUG */
- kmem_zone_free(xfs_da_state_zone, state);
-}
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index ee5170c46ae1..6fb3371c63cf 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -20,7 +21,6 @@
struct xfs_bmap_free;
struct xfs_inode;
-struct xfs_mount;
struct xfs_trans;
struct zone;
@@ -47,6 +47,33 @@ typedef struct xfs_da_blkinfo {
} xfs_da_blkinfo_t;
/*
+ * CRC enabled directory structure types
+ *
+ * The headers change size for the additional verification information, but
+ * otherwise the tree layouts and contents are unchanged. Hence the da btree
+ * code can use the struct xfs_da_blkinfo for manipulating the tree links and
+ * magic numbers without modification for both v2 and v3 nodes.
+ */
+#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */
+#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */
+#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */
+
+struct xfs_da3_blkinfo {
+ /*
+ * the node link manipulation code relies on the fact that the first
+ * element of this structure is the struct xfs_da_blkinfo so it can
+ * ignore the differences in the rest of the structures.
+ */
+ struct xfs_da_blkinfo hdr;
+ __be32 crc; /* CRC of block */
+ __be64 blkno; /* first block of the buffer */
+ __be64 lsn; /* sequence number of last write */
+ uuid_t uuid; /* filesystem we belong to */
+ __be64 owner; /* inode that owns the block */
+};
+
+/*
* This is the structure of the root and intermediate nodes in the Btree.
* The leaf nodes are defined above.
*
@@ -57,19 +84,76 @@ typedef struct xfs_da_blkinfo {
*/
#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
+typedef struct xfs_da_node_hdr {
+ struct xfs_da_blkinfo info; /* block type, links, etc. */
+ __be16 __count; /* count of active entries */
+ __be16 __level; /* level above leaves (leaf == 0) */
+} xfs_da_node_hdr_t;
+
+struct xfs_da3_node_hdr {
+ struct xfs_da3_blkinfo info; /* block type, links, etc. */
+ __be16 __count; /* count of active entries */
+ __be16 __level; /* level above leaves (leaf == 0) */
+ __be32 __pad32;
+};
+
+#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc))
+
+typedef struct xfs_da_node_entry {
+ __be32 hashval; /* hash value for this descendant */
+ __be32 before; /* Btree block before this key */
+} xfs_da_node_entry_t;
+
typedef struct xfs_da_intnode {
- struct xfs_da_node_hdr { /* constant-structure header block */
- xfs_da_blkinfo_t info; /* block type, links, etc. */
- __be16 count; /* count of active entries */
- __be16 level; /* level above leaves (leaf == 0) */
- } hdr;
- struct xfs_da_node_entry {
- __be32 hashval; /* hash value for this descendant */
- __be32 before; /* Btree block before this key */
- } btree[1]; /* variable sized array of keys */
+ struct xfs_da_node_hdr hdr;
+ struct xfs_da_node_entry __btree[];
} xfs_da_intnode_t;
-typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
-typedef struct xfs_da_node_entry xfs_da_node_entry_t;
+
+struct xfs_da3_intnode {
+ struct xfs_da3_node_hdr hdr;
+ struct xfs_da_node_entry __btree[];
+};
+
+/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+ __uint32_t forw;
+ __uint32_t back;
+ __uint16_t magic;
+ __uint16_t count;
+ __uint16_t level;
+};
+
+extern void xfs_da3_node_hdr_from_disk(struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from);
+extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from);
+
+static inline int
+xfs_da3_node_hdr_size(struct xfs_da_intnode *dap)
+{
+ if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC))
+ return sizeof(struct xfs_da3_node_hdr);
+ return sizeof(struct xfs_da_node_hdr);
+}
+
+static inline struct xfs_da_node_entry *
+xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
+{
+ if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+ struct xfs_da3_intnode *dap3 = (struct xfs_da3_intnode *)dap;
+ return dap3->__btree;
+ }
+ return dap->__btree;
+}
+
+extern void xfs_da3_intnode_from_disk(struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from);
+extern void xfs_da3_intnode_to_disk(struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from);
#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize
@@ -191,32 +275,34 @@ struct xfs_nameops {
/*
* Routines used for growing the Btree.
*/
-int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
- struct xfs_buf **bpp, int whichfork);
-int xfs_da_split(xfs_da_state_t *state);
+int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
+ int level, struct xfs_buf **bpp, int whichfork);
+int xfs_da3_split(xfs_da_state_t *state);
/*
* Routines used for shrinking the Btree.
*/
-int xfs_da_join(xfs_da_state_t *state);
-void xfs_da_fixhashpath(xfs_da_state_t *state,
- xfs_da_state_path_t *path_to_to_fix);
+int xfs_da3_join(xfs_da_state_t *state);
+void xfs_da3_fixhashpath(struct xfs_da_state *state,
+ struct xfs_da_state_path *path_to_to_fix);
/*
* Routines used for finding things in the Btree.
*/
-int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result);
-int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
+int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
int forward, int release, int *result);
/*
* Utility routines.
*/
-int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
xfs_da_state_blk_t *new_blk);
-int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
struct xfs_buf **bpp, int which_fork);
+extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
+
/*
* Utility routines.
*/
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index f852b082a084..e36445ceaf80 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,6 +24,9 @@
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
@@ -182,7 +185,7 @@ xfs_swap_extents_check_format(
*/
if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
if (XFS_IFORK_BOFF(ip) &&
- tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+ XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
return EINVAL;
if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
@@ -192,9 +195,8 @@ xfs_swap_extents_check_format(
/* Reciprocal target->temp btree format checks */
if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
if (XFS_IFORK_BOFF(tip) &&
- ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+ XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
return EINVAL;
-
if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return EINVAL;
@@ -219,6 +221,14 @@ xfs_swap_extents(
int taforkblks = 0;
__uint64_t tmp;
+ /*
+ * We have no way of updating owner information in the BMBT blocks for
+ * each inode on CRC enabled filesystems, so to avoid corrupting the
+ * metadata we simply don't allow extent swaps to occur.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return XFS_ERROR(EINVAL);
+
tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
if (!tempifp) {
error = XFS_ERROR(ENOMEM);
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 1d9643b3dce6..f7a0e95d197a 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -19,7 +19,7 @@
#define __XFS_DINODE_H__
#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
-#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2))
+#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3)
typedef struct xfs_timestamp {
__be32 t_sec; /* timestamp seconds */
@@ -70,11 +70,36 @@ typedef struct xfs_dinode {
/* di_next_unlinked is the only non-core field in the old dinode */
__be32 di_next_unlinked;/* agi unlinked list ptr */
-} __attribute__((packed)) xfs_dinode_t;
+
+ /* start of the extended dinode, writable fields */
+ __le32 di_crc; /* CRC of the inode */
+ __be64 di_changecount; /* number of attribute changes */
+ __be64 di_lsn; /* flush sequence */
+ __be64 di_flags2; /* more random flags */
+ __u8 di_pad2[16]; /* more padding for future expansion */
+
+ /* fields only written to during inode creation */
+ xfs_timestamp_t di_crtime; /* time created */
+ __be64 di_ino; /* inode number */
+ uuid_t di_uuid; /* UUID of the filesystem */
+
+ /* structure must be padded to 64 bit alignment */
+} xfs_dinode_t;
#define DI_MAX_FLUSH 0xffff
/*
+ * Size of the core inode on disk. Version 1 and 2 inodes have
+ * the same size, but version 3 has grown a few additional fields.
+ */
+static inline uint xfs_dinode_size(int version)
+{
+ if (version == 3)
+ return sizeof(struct xfs_dinode);
+ return offsetof(struct xfs_dinode, di_crc);
+}
+
+/*
* The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
* Since the pathconf interface is signed, we use 2^31 - 1 instead.
* The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
@@ -104,11 +129,11 @@ typedef enum xfs_dinode_fmt {
/*
* Inode size for given fs.
*/
-#define XFS_LITINO(mp) \
- ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode)))
+#define XFS_LITINO(mp, version) \
+ ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
-#define XFS_BROOT_SIZE_ADJ \
- (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
+#define XFS_BROOT_SIZE_ADJ(ip) \
+ (XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t))
/*
* Inode data & attribute fork sizes, per inode.
@@ -119,10 +144,10 @@ typedef enum xfs_dinode_fmt {
#define XFS_DFORK_DSIZE(dip,mp) \
(XFS_DFORK_Q(dip) ? \
XFS_DFORK_BOFF(dip) : \
- XFS_LITINO(mp))
+ XFS_LITINO(mp, (dip)->di_version))
#define XFS_DFORK_ASIZE(dip,mp) \
(XFS_DFORK_Q(dip) ? \
- XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : \
+ XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
0)
#define XFS_DFORK_SIZE(dip,mp,w) \
((w) == XFS_DATA_FORK ? \
@@ -133,7 +158,7 @@ typedef enum xfs_dinode_fmt {
* Return pointers to the data or attribute forks.
*/
#define XFS_DFORK_DPTR(dip) \
- ((char *)(dip) + sizeof(struct xfs_dinode))
+ ((char *)dip + xfs_dinode_size(dip->di_version))
#define XFS_DFORK_APTR(dip) \
(XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
#define XFS_DFORK_PTR(dip,w) \
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index b26a50f9921d..8f023dee404d 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -368,10 +368,8 @@ xfs_dir_removename(
int
xfs_readdir(
xfs_inode_t *dp,
- void *dirent,
- size_t bufsize,
- xfs_off_t *offset,
- filldir_t filldir)
+ struct dir_context *ctx,
+ size_t bufsize)
{
int rval; /* return value */
int v; /* type-checking value */
@@ -385,14 +383,13 @@ xfs_readdir(
XFS_STATS_INC(xs_dir_getdents);
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir);
+ rval = xfs_dir2_sf_getdents(dp, ctx);
else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
;
else if (v)
- rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir);
+ rval = xfs_dir2_block_getdents(dp, ctx);
else
- rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset,
- filldir);
+ rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
return rval;
}
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 12afe07a91d7..09aea0247d96 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -28,11 +29,13 @@
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
+#include "xfs_buf_item.h"
#include "xfs_dir2.h"
#include "xfs_dir2_format.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_cksum.h"
/*
* Local function prototypes.
@@ -56,52 +59,110 @@ xfs_dir_startup(void)
xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
}
-static void
-xfs_dir2_block_verify(
+static bool
+xfs_dir3_block_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_dir2_data_hdr *hdr = bp->b_addr;
- int block_ok = 0;
-
- block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
- block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
-
- if (!block_ok) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
+ return false;
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+ return false;
}
+ if (__xfs_dir3_data_check(NULL, bp))
+ return false;
+ return true;
}
static void
-xfs_dir2_block_read_verify(
+xfs_dir3_block_read_verify(
struct xfs_buf *bp)
{
- xfs_dir2_block_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if ((xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ XFS_DIR3_DATA_CRC_OFF)) ||
+ !xfs_dir3_block_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
}
static void
-xfs_dir2_block_write_verify(
+xfs_dir3_block_write_verify(
struct xfs_buf *bp)
{
- xfs_dir2_block_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_dir3_block_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF);
}
-const struct xfs_buf_ops xfs_dir2_block_buf_ops = {
- .verify_read = xfs_dir2_block_read_verify,
- .verify_write = xfs_dir2_block_write_verify,
+const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+ .verify_read = xfs_dir3_block_read_verify,
+ .verify_write = xfs_dir3_block_write_verify,
};
static int
-xfs_dir2_block_read(
+xfs_dir3_block_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
struct xfs_buf **bpp)
{
struct xfs_mount *mp = dp->i_mount;
+ int err;
- return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
- XFS_DATA_FORK, &xfs_dir2_block_buf_ops);
+ err = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
+ XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+ return err;
+}
+
+static void
+xfs_dir3_block_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *dp)
+{
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ memset(hdr3, 0, sizeof(*hdr3));
+ hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+ hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+ return;
+
+ }
+ hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
}
static void
@@ -121,7 +182,7 @@ xfs_dir2_block_need_space(
struct xfs_dir2_data_unused *enddup = NULL;
*compact = 0;
- bf = hdr->bestfree;
+ bf = xfs_dir3_data_bestfree_p(hdr);
/*
* If there are stale entries we'll use one for the leaf.
@@ -303,7 +364,7 @@ xfs_dir2_block_addname(
mp = dp->i_mount;
/* Read the (one and only) directory block into bp. */
- error = xfs_dir2_block_read(tp, dp, &bp);
+ error = xfs_dir3_block_read(tp, dp, &bp);
if (error)
return error;
@@ -498,7 +559,7 @@ xfs_dir2_block_addname(
xfs_dir2_data_log_header(tp, bp);
xfs_dir2_block_log_tail(tp, bp);
xfs_dir2_data_log_entry(tp, bp, dep);
- xfs_dir2_data_check(dp, bp);
+ xfs_dir3_data_check(dp, bp);
return 0;
}
@@ -508,9 +569,7 @@ xfs_dir2_block_addname(
int /* error */
xfs_dir2_block_getdents(
xfs_inode_t *dp, /* incore inode */
- void *dirent,
- xfs_off_t *offset,
- filldir_t filldir)
+ struct dir_context *ctx)
{
xfs_dir2_data_hdr_t *hdr; /* block header */
struct xfs_buf *bp; /* buffer for block */
@@ -528,10 +587,10 @@ xfs_dir2_block_getdents(
/*
* If the block number in the offset is out of range, we're done.
*/
- if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
+ if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
return 0;
- error = xfs_dir2_block_read(NULL, dp, &bp);
+ error = xfs_dir3_block_read(NULL, dp, &bp);
if (error)
return error;
@@ -539,14 +598,14 @@ xfs_dir2_block_getdents(
* Extract the byte offset we start at from the seek pointer.
* We'll skip entries before this.
*/
- wantoff = xfs_dir2_dataptr_to_off(mp, *offset);
+ wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
hdr = bp->b_addr;
- xfs_dir2_data_check(dp, bp);
+ xfs_dir3_data_check(dp, bp);
/*
* Set up values for the loop.
*/
btp = xfs_dir2_block_tail_p(mp, hdr);
- ptr = (char *)(hdr + 1);
+ ptr = (char *)xfs_dir3_data_entry_p(hdr);
endptr = (char *)xfs_dir2_block_leaf_p(btp);
/*
@@ -578,13 +637,12 @@ xfs_dir2_block_getdents(
cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
(char *)dep - (char *)hdr);
+ ctx->pos = cook & 0x7fffffff;
/*
* If it didn't fit, set the final offset to here & return.
*/
- if (filldir(dirent, (char *)dep->name, dep->namelen,
- cook & 0x7fffffff, be64_to_cpu(dep->inumber),
- DT_UNKNOWN)) {
- *offset = cook & 0x7fffffff;
+ if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+ be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
xfs_trans_brelse(NULL, bp);
return 0;
}
@@ -594,7 +652,7 @@ xfs_dir2_block_getdents(
* Reached the end of the block.
* Set the offset to a non-existent block 1 and return.
*/
- *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+ ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
0x7fffffff;
xfs_trans_brelse(NULL, bp);
return 0;
@@ -665,7 +723,7 @@ xfs_dir2_block_lookup(
dp = args->dp;
mp = dp->i_mount;
hdr = bp->b_addr;
- xfs_dir2_data_check(dp, bp);
+ xfs_dir3_data_check(dp, bp);
btp = xfs_dir2_block_tail_p(mp, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
@@ -711,12 +769,12 @@ xfs_dir2_block_lookup_int(
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir2_block_read(tp, dp, &bp);
+ error = xfs_dir3_block_read(tp, dp, &bp);
if (error)
return error;
hdr = bp->b_addr;
- xfs_dir2_data_check(dp, bp);
+ xfs_dir3_data_check(dp, bp);
btp = xfs_dir2_block_tail_p(mp, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
@@ -853,7 +911,7 @@ xfs_dir2_block_removename(
xfs_dir2_data_freescan(mp, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(tp, bp);
- xfs_dir2_data_check(dp, bp);
+ xfs_dir3_data_check(dp, bp);
/*
* See if the size as a shortform is good enough.
*/
@@ -910,7 +968,7 @@ xfs_dir2_block_replace(
*/
dep->inumber = cpu_to_be64(args->inumber);
xfs_dir2_data_log_entry(args->trans, bp, dep);
- xfs_dir2_data_check(dp, bp);
+ xfs_dir3_data_check(dp, bp);
return 0;
}
@@ -958,6 +1016,8 @@ xfs_dir2_leaf_to_block(
__be16 *tagp; /* end of entry (tag) */
int to; /* block/leaf to index */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_to_block(args);
@@ -965,8 +1025,12 @@ xfs_dir2_leaf_to_block(
tp = args->trans;
mp = dp->i_mount;
leaf = lbp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = xfs_dir3_leaf_ents_p(leaf);
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
/*
* If there are data blocks other than the first one, take this
* opportunity to remove trailing empty data blocks that may have
@@ -974,9 +1038,12 @@ xfs_dir2_leaf_to_block(
* These will show up in the leaf bests table.
*/
while (dp->i_d.di_size > mp->m_dirblksize) {
+ int hdrsz;
+
+ hdrsz = xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
bestsp = xfs_dir2_leaf_bests_p(ltp);
if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
- mp->m_dirblksize - (uint)sizeof(*hdr)) {
+ mp->m_dirblksize - hdrsz) {
if ((error =
xfs_dir2_leaf_trim_data(args, lbp,
(xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
@@ -988,17 +1055,19 @@ xfs_dir2_leaf_to_block(
* Read the data block if we don't already have it, give up if it fails.
*/
if (!dbp) {
- error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
if (error)
return error;
}
hdr = dbp->b_addr;
- ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+
/*
* Size of the "leaf" area in the block.
*/
size = (uint)sizeof(xfs_dir2_block_tail_t) +
- (uint)sizeof(*lep) * (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale));
+ (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
/*
* Look at the last data entry.
*/
@@ -1014,8 +1083,8 @@ xfs_dir2_leaf_to_block(
/*
* Start converting it to block form.
*/
- dbp->b_ops = &xfs_dir2_block_buf_ops;
- hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+ xfs_dir3_block_init(mp, tp, dbp, dp);
+
needlog = 1;
needscan = 0;
/*
@@ -1027,18 +1096,17 @@ xfs_dir2_leaf_to_block(
* Initialize the block tail.
*/
btp = xfs_dir2_block_tail_p(mp, hdr);
- btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale));
+ btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
btp->stale = 0;
xfs_dir2_block_log_tail(tp, dbp);
/*
* Initialize the block leaf area. We compact out stale entries.
*/
lep = xfs_dir2_block_leaf_p(btp);
- for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) {
- if (leaf->ents[from].address ==
- cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ for (from = to = 0; from < leafhdr.count; from++) {
+ if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
continue;
- lep[to++] = leaf->ents[from];
+ lep[to++] = ents[from];
}
ASSERT(to == be32_to_cpu(btp->count));
xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
@@ -1137,16 +1205,16 @@ xfs_dir2_sf_to_block(
return error;
}
/*
- * Initialize the data block.
+ * Initialize the data block, then convert it to block format.
*/
- error = xfs_dir2_data_init(args, blkno, &bp);
+ error = xfs_dir3_data_init(args, blkno, &bp);
if (error) {
kmem_free(sfp);
return error;
}
- bp->b_ops = &xfs_dir2_block_buf_ops;
+ xfs_dir3_block_init(mp, tp, bp, dp);
hdr = bp->b_addr;
- hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+
/*
* Compute size of block "tail" area.
*/
@@ -1156,7 +1224,7 @@ xfs_dir2_sf_to_block(
* The whole thing is initialized to free by the init routine.
* Say we're using the leaf and tail area.
*/
- dup = (xfs_dir2_data_unused_t *)(hdr + 1);
+ dup = xfs_dir3_data_unused_p(hdr);
needlog = needscan = 0;
xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog,
&needscan);
@@ -1178,8 +1246,7 @@ xfs_dir2_sf_to_block(
/*
* Create entry for .
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + XFS_DIR2_DATA_DOT_OFFSET);
+ dep = xfs_dir3_data_dot_entry_p(hdr);
dep->inumber = cpu_to_be64(dp->i_ino);
dep->namelen = 1;
dep->name[0] = '.';
@@ -1192,8 +1259,7 @@ xfs_dir2_sf_to_block(
/*
* Create entry for ..
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + XFS_DIR2_DATA_DOTDOT_OFFSET);
+ dep = xfs_dir3_data_dotdot_entry_p(hdr);
dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
dep->namelen = 2;
dep->name[0] = dep->name[1] = '.';
@@ -1203,7 +1269,7 @@ xfs_dir2_sf_to_block(
blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
(char *)dep - (char *)hdr));
- offset = XFS_DIR2_DATA_FIRST_OFFSET;
+ offset = xfs_dir3_data_first_offset(hdr);
/*
* Loop over existing entries, stuff them in.
*/
@@ -1273,6 +1339,6 @@ xfs_dir2_sf_to_block(
ASSERT(needscan == 0);
xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
xfs_dir2_block_log_tail(tp, bp);
- xfs_dir2_data_check(dp, bp);
+ xfs_dir3_data_check(dp, bp);
return 0;
}
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index ffcf1774152e..c2930238005c 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -30,6 +31,8 @@
#include "xfs_dir2_format.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
STATIC xfs_dir2_data_free_t *
xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
@@ -40,7 +43,7 @@ xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
* Return 0 if the buffer is good, otherwise an error.
*/
int
-__xfs_dir2_data_check(
+__xfs_dir3_data_check(
struct xfs_inode *dp, /* incore inode pointer */
struct xfs_buf *bp) /* data block's buffer */
{
@@ -65,15 +68,17 @@ __xfs_dir2_data_check(
mp = bp->b_target->bt_mount;
hdr = bp->b_addr;
- bf = hdr->bestfree;
- p = (char *)(hdr + 1);
+ bf = xfs_dir3_data_bestfree_p(hdr);
+ p = (char *)xfs_dir3_data_entry_p(hdr);
switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
btp = xfs_dir2_block_tail_p(mp, hdr);
lep = xfs_dir2_block_leaf_p(btp);
endp = (char *)lep;
break;
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
endp = (char *)hdr + mp->m_dirblksize;
break;
@@ -148,7 +153,8 @@ __xfs_dir2_data_check(
(char *)dep - (char *)hdr);
count++;
lastfree = 0;
- if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
(xfs_dir2_data_aoff_t)
((char *)dep - (char *)hdr));
@@ -168,7 +174,8 @@ __xfs_dir2_data_check(
* Need to have seen all the entries and all the bestfree slots.
*/
XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
- if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
if (lep[i].address ==
cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
@@ -185,21 +192,27 @@ __xfs_dir2_data_check(
return 0;
}
-static void
-xfs_dir2_data_verify(
+static bool
+xfs_dir3_data_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_dir2_data_hdr *hdr = bp->b_addr;
- int block_ok = 0;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
- block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC);
- block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
-
- if (!block_ok) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+ return false;
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
+ return false;
}
+ if (__xfs_dir3_data_check(NULL, bp))
+ return false;
+ return true;
}
/*
@@ -208,7 +221,7 @@ xfs_dir2_data_verify(
* format buffer or a data format buffer on readahead.
*/
static void
-xfs_dir2_data_reada_verify(
+xfs_dir3_data_reada_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
@@ -216,11 +229,13 @@ xfs_dir2_data_reada_verify(
switch (hdr->magic) {
case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
- bp->b_ops = &xfs_dir2_block_buf_ops;
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ bp->b_ops = &xfs_dir3_block_buf_ops;
bp->b_ops->verify_read(bp);
return;
case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
- xfs_dir2_data_verify(bp);
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ xfs_dir3_data_verify(bp);
return;
default:
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
@@ -230,51 +245,80 @@ xfs_dir2_data_reada_verify(
}
static void
-xfs_dir2_data_read_verify(
+xfs_dir3_data_read_verify(
struct xfs_buf *bp)
{
- xfs_dir2_data_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if ((xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ XFS_DIR3_DATA_CRC_OFF)) ||
+ !xfs_dir3_data_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
}
static void
-xfs_dir2_data_write_verify(
+xfs_dir3_data_write_verify(
struct xfs_buf *bp)
{
- xfs_dir2_data_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_dir3_data_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF);
}
-const struct xfs_buf_ops xfs_dir2_data_buf_ops = {
- .verify_read = xfs_dir2_data_read_verify,
- .verify_write = xfs_dir2_data_write_verify,
+const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+ .verify_read = xfs_dir3_data_read_verify,
+ .verify_write = xfs_dir3_data_write_verify,
};
-static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = {
- .verify_read = xfs_dir2_data_reada_verify,
- .verify_write = xfs_dir2_data_write_verify,
+static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+ .verify_read = xfs_dir3_data_reada_verify,
+ .verify_write = xfs_dir3_data_write_verify,
};
int
-xfs_dir2_data_read(
+xfs_dir3_data_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mapped_bno,
struct xfs_buf **bpp)
{
- return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
- XFS_DATA_FORK, &xfs_dir2_data_buf_ops);
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+ return err;
}
int
-xfs_dir2_data_readahead(
+xfs_dir3_data_readahead(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mapped_bno)
{
return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
- XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops);
+ XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
}
/*
@@ -288,12 +332,15 @@ xfs_dir2_data_freefind(
{
xfs_dir2_data_free_t *dfp; /* bestfree entry */
xfs_dir2_data_aoff_t off; /* offset value needed */
+ struct xfs_dir2_data_free *bf;
#if defined(DEBUG) && defined(__KERNEL__)
int matched; /* matched the value */
int seenzero; /* saw a 0 bestfree entry */
#endif
off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+ bf = xfs_dir3_data_bestfree_p(hdr);
+
#if defined(DEBUG) && defined(__KERNEL__)
/*
* Validate some consistency in the bestfree table.
@@ -301,9 +348,11 @@ xfs_dir2_data_freefind(
* one we're looking for it has to be exact.
*/
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
- for (dfp = &hdr->bestfree[0], seenzero = matched = 0;
- dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT];
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+ for (dfp = &bf[0], seenzero = matched = 0;
+ dfp < &bf[XFS_DIR2_DATA_FD_COUNT];
dfp++) {
if (!dfp->offset) {
ASSERT(!dfp->length);
@@ -319,7 +368,7 @@ xfs_dir2_data_freefind(
else
ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off);
ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length));
- if (dfp > &hdr->bestfree[0])
+ if (dfp > &bf[0])
ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length));
}
#endif
@@ -328,14 +377,12 @@ xfs_dir2_data_freefind(
* it can't be there since they're sorted.
*/
if (be16_to_cpu(dup->length) <
- be16_to_cpu(hdr->bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length))
+ be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
return NULL;
/*
* Look at the three bestfree entries for our guy.
*/
- for (dfp = &hdr->bestfree[0];
- dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT];
- dfp++) {
+ for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
if (!dfp->offset)
return NULL;
if (be16_to_cpu(dfp->offset) == off)
@@ -359,11 +406,12 @@ xfs_dir2_data_freeinsert(
xfs_dir2_data_free_t *dfp; /* bestfree table pointer */
xfs_dir2_data_free_t new; /* new bestfree entry */
-#ifdef __KERNEL__
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
-#endif
- dfp = hdr->bestfree;
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+ dfp = xfs_dir3_data_bestfree_p(hdr);
new.length = dup->length;
new.offset = cpu_to_be16((char *)dup - (char *)hdr);
@@ -400,32 +448,36 @@ xfs_dir2_data_freeremove(
xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */
int *loghead) /* out: log data header */
{
-#ifdef __KERNEL__
+ struct xfs_dir2_data_free *bf;
+
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
-#endif
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
/*
* It's the first entry, slide the next 2 up.
*/
- if (dfp == &hdr->bestfree[0]) {
- hdr->bestfree[0] = hdr->bestfree[1];
- hdr->bestfree[1] = hdr->bestfree[2];
+ bf = xfs_dir3_data_bestfree_p(hdr);
+ if (dfp == &bf[0]) {
+ bf[0] = bf[1];
+ bf[1] = bf[2];
}
/*
* It's the second entry, slide the 3rd entry up.
*/
- else if (dfp == &hdr->bestfree[1])
- hdr->bestfree[1] = hdr->bestfree[2];
+ else if (dfp == &bf[1])
+ bf[1] = bf[2];
/*
* Must be the last entry.
*/
else
- ASSERT(dfp == &hdr->bestfree[2]);
+ ASSERT(dfp == &bf[2]);
/*
* Clear the 3rd entry, must be zero now.
*/
- hdr->bestfree[2].length = 0;
- hdr->bestfree[2].offset = 0;
+ bf[2].length = 0;
+ bf[2].offset = 0;
*loghead = 1;
}
@@ -441,23 +493,27 @@ xfs_dir2_data_freescan(
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* active data entry */
xfs_dir2_data_unused_t *dup; /* unused data entry */
+ struct xfs_dir2_data_free *bf;
char *endp; /* end of block's data */
char *p; /* current entry pointer */
-#ifdef __KERNEL__
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
-#endif
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
/*
* Start by clearing the table.
*/
- memset(hdr->bestfree, 0, sizeof(hdr->bestfree));
+ bf = xfs_dir3_data_bestfree_p(hdr);
+ memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
*loghead = 1;
/*
* Set up pointers.
*/
- p = (char *)(hdr + 1);
- if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
+ p = (char *)xfs_dir3_data_entry_p(hdr);
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
btp = xfs_dir2_block_tail_p(mp, hdr);
endp = (char *)xfs_dir2_block_leaf_p(btp);
} else
@@ -493,7 +549,7 @@ xfs_dir2_data_freescan(
* Give back the buffer for the created block.
*/
int /* error */
-xfs_dir2_data_init(
+xfs_dir3_data_init(
xfs_da_args_t *args, /* directory operation args */
xfs_dir2_db_t blkno, /* logical dir block number */
struct xfs_buf **bpp) /* output block buffer */
@@ -502,6 +558,7 @@ xfs_dir2_data_init(
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_inode_t *dp; /* incore directory inode */
xfs_dir2_data_unused_t *dup; /* unused entry pointer */
+ struct xfs_dir2_data_free *bf;
int error; /* error return value */
int i; /* bestfree index */
xfs_mount_t *mp; /* filesystem mount point */
@@ -518,27 +575,40 @@ xfs_dir2_data_init(
XFS_DATA_FORK);
if (error)
return error;
- bp->b_ops = &xfs_dir2_data_buf_ops;
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
/*
* Initialize the header.
*/
hdr = bp->b_addr;
- hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
- hdr->bestfree[0].offset = cpu_to_be16(sizeof(*hdr));
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ memset(hdr3, 0, sizeof(*hdr3));
+ hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+ hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+ } else
+ hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+
+ bf = xfs_dir3_data_bestfree_p(hdr);
+ bf[0].offset = cpu_to_be16(xfs_dir3_data_entry_offset(hdr));
for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
- hdr->bestfree[i].length = 0;
- hdr->bestfree[i].offset = 0;
+ bf[i].length = 0;
+ bf[i].offset = 0;
}
/*
* Set up an unused entry for the block's body.
*/
- dup = (xfs_dir2_data_unused_t *)(hdr + 1);
+ dup = xfs_dir3_data_unused_p(hdr);
dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
- t = mp->m_dirblksize - (uint)sizeof(*hdr);
- hdr->bestfree[0].length = cpu_to_be16(t);
+ t = mp->m_dirblksize - (uint)xfs_dir3_data_entry_offset(hdr);
+ bf[0].length = cpu_to_be16(t);
dup->length = cpu_to_be16(t);
*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
/*
@@ -562,7 +632,9 @@ xfs_dir2_data_log_entry(
xfs_dir2_data_hdr_t *hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr),
(uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) -
@@ -580,9 +652,11 @@ xfs_dir2_data_log_header(
xfs_dir2_data_hdr_t *hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
- xfs_trans_log_buf(tp, bp, 0, sizeof(*hdr) - 1);
+ xfs_trans_log_buf(tp, bp, 0, xfs_dir3_data_entry_offset(hdr) - 1);
}
/*
@@ -597,7 +671,9 @@ xfs_dir2_data_log_unused(
xfs_dir2_data_hdr_t *hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
/*
* Log the first part of the unused entry.
@@ -635,6 +711,7 @@ xfs_dir2_data_make_free(
xfs_dir2_data_unused_t *newdup; /* new unused entry */
xfs_dir2_data_unused_t *postdup; /* unused entry after us */
xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
+ struct xfs_dir2_data_free *bf;
mp = tp->t_mountp;
hdr = bp->b_addr;
@@ -642,12 +719,14 @@ xfs_dir2_data_make_free(
/*
* Figure out where the end of the data area is.
*/
- if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC))
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
endptr = (char *)hdr + mp->m_dirblksize;
else {
xfs_dir2_block_tail_t *btp; /* block tail */
- ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
btp = xfs_dir2_block_tail_p(mp, hdr);
endptr = (char *)xfs_dir2_block_leaf_p(btp);
}
@@ -655,7 +734,7 @@ xfs_dir2_data_make_free(
* If this isn't the start of the block, then back up to
* the previous entry and see if it's free.
*/
- if (offset > sizeof(*hdr)) {
+ if (offset > xfs_dir3_data_entry_offset(hdr)) {
__be16 *tagp; /* tag just before us */
tagp = (__be16 *)((char *)hdr + offset) - 1;
@@ -681,6 +760,7 @@ xfs_dir2_data_make_free(
* Previous and following entries are both free,
* merge everything into a single free entry.
*/
+ bf = xfs_dir3_data_bestfree_p(hdr);
if (prevdup && postdup) {
xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */
@@ -695,7 +775,7 @@ xfs_dir2_data_make_free(
* since the third bestfree is there, there might be more
* entries.
*/
- needscan = (hdr->bestfree[2].length != 0);
+ needscan = (bf[2].length != 0);
/*
* Fix up the new big freespace.
*/
@@ -711,10 +791,10 @@ xfs_dir2_data_make_free(
* Remove entry 1 first then entry 0.
*/
ASSERT(dfp && dfp2);
- if (dfp == &hdr->bestfree[1]) {
- dfp = &hdr->bestfree[0];
+ if (dfp == &bf[1]) {
+ dfp = &bf[0];
ASSERT(dfp2 == dfp);
- dfp2 = &hdr->bestfree[1];
+ dfp2 = &bf[1];
}
xfs_dir2_data_freeremove(hdr, dfp2, needlogp);
xfs_dir2_data_freeremove(hdr, dfp, needlogp);
@@ -722,7 +802,7 @@ xfs_dir2_data_make_free(
* Now insert the new entry.
*/
dfp = xfs_dir2_data_freeinsert(hdr, prevdup, needlogp);
- ASSERT(dfp == &hdr->bestfree[0]);
+ ASSERT(dfp == &bf[0]);
ASSERT(dfp->length == prevdup->length);
ASSERT(!dfp[1].length);
ASSERT(!dfp[2].length);
@@ -751,7 +831,7 @@ xfs_dir2_data_make_free(
*/
else {
needscan = be16_to_cpu(prevdup->length) >
- be16_to_cpu(hdr->bestfree[2].length);
+ be16_to_cpu(bf[2].length);
}
}
/*
@@ -779,7 +859,7 @@ xfs_dir2_data_make_free(
*/
else {
needscan = be16_to_cpu(newdup->length) >
- be16_to_cpu(hdr->bestfree[2].length);
+ be16_to_cpu(bf[2].length);
}
}
/*
@@ -818,10 +898,13 @@ xfs_dir2_data_use_free(
xfs_dir2_data_unused_t *newdup; /* new unused entry */
xfs_dir2_data_unused_t *newdup2; /* another new unused entry */
int oldlen; /* old unused entry's length */
+ struct xfs_dir2_data_free *bf;
hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
ASSERT(offset >= (char *)dup - (char *)hdr);
ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr);
@@ -831,7 +914,8 @@ xfs_dir2_data_use_free(
*/
dfp = xfs_dir2_data_freefind(hdr, dup);
oldlen = be16_to_cpu(dup->length);
- ASSERT(dfp || oldlen <= be16_to_cpu(hdr->bestfree[2].length));
+ bf = xfs_dir3_data_bestfree_p(hdr);
+ ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
/*
* Check for alignment with front and back of the entry.
*/
@@ -845,7 +929,7 @@ xfs_dir2_data_use_free(
*/
if (matchfront && matchback) {
if (dfp) {
- needscan = (hdr->bestfree[2].offset != 0);
+ needscan = (bf[2].offset != 0);
if (!needscan)
xfs_dir2_data_freeremove(hdr, dfp, needlogp);
}
@@ -875,7 +959,7 @@ xfs_dir2_data_use_free(
* that means we don't know if there was a better
* choice for the last slot, or not. Rescan.
*/
- needscan = dfp == &hdr->bestfree[2];
+ needscan = dfp == &bf[2];
}
}
/*
@@ -902,7 +986,7 @@ xfs_dir2_data_use_free(
* that means we don't know if there was a better
* choice for the last slot, or not. Rescan.
*/
- needscan = dfp == &hdr->bestfree[2];
+ needscan = dfp == &bf[2];
}
}
/*
@@ -930,7 +1014,7 @@ xfs_dir2_data_use_free(
* the 2 new will work.
*/
if (dfp) {
- needscan = (hdr->bestfree[2].length != 0);
+ needscan = (bf[2].length != 0);
if (!needscan) {
xfs_dir2_data_freeremove(hdr, dfp, needlogp);
xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h
index 07270981f48f..7826782b8d78 100644
--- a/fs/xfs/xfs_dir2_format.h
+++ b/fs/xfs/xfs_dir2_format.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -36,6 +37,38 @@
#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */
/*
+ * Directory Version 3 With CRCs.
+ *
+ * The tree formats are the same as for version 2 directories. The difference
+ * is in the block header and dirent formats. In many cases the v3 structures
+ * use v2 definitions as they are no different and this makes code sharing much
+ * easier.
+ *
+ * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the
+ * format is v2 they switch to the existing v2 code, or if the format is v3
+ * they implement the v3 functionality. This means the existing dir2 is a mix of
+ * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called
+ * where there is a difference in the formats, otherwise the code is unchanged.
+ *
+ * Where it is possible, the code decides what to do based on the magic numbers
+ * in the blocks rather than feature bits in the superblock. This means the code
+ * is as independent of the external XFS code as possible as it doesn't require
+ * passing struct xfs_mount pointers into places where it isn't really
+ * necessary.
+ *
+ * Version 3 includes:
+ *
+ * - a larger block header for CRC and identification purposes and so the
+ * offsets of all the structures inside the blocks are different.
+ *
+ * - new magic numbers to be able to detect the v2/v3 types on the fly.
+ */
+
+#define XFS_DIR3_BLOCK_MAGIC 0x58444233 /* XDB3: single block dirs */
+#define XFS_DIR3_DATA_MAGIC 0x58444433 /* XDD3: multiblock dirs */
+#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */
+
+/*
* Byte offset in data block and shortform entry.
*/
typedef __uint16_t xfs_dir2_data_off_t;
@@ -195,16 +228,6 @@ xfs_dir2_sf_nextentry(struct xfs_dir2_sf_hdr *hdr,
xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET)
/*
- * Offsets of . and .. in data space (always block 0)
- */
-#define XFS_DIR2_DATA_DOT_OFFSET \
- ((xfs_dir2_data_aoff_t)sizeof(struct xfs_dir2_data_hdr))
-#define XFS_DIR2_DATA_DOTDOT_OFFSET \
- (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1))
-#define XFS_DIR2_DATA_FIRST_OFFSET \
- (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2))
-
-/*
* Describe a free area in the data block.
*
* The freespace will be formatted as a xfs_dir2_data_unused_t.
@@ -226,6 +249,40 @@ typedef struct xfs_dir2_data_hdr {
} xfs_dir2_data_hdr_t;
/*
+ * Define a structure for all the verification fields we are adding to the
+ * directory block structures. This will be used in several structures.
+ * The magic number must be the first entry to align with all the dir2
+ * structures so we can determine how to decode them just by the magic number.
+ */
+struct xfs_dir3_blk_hdr {
+ __be32 magic; /* magic number */
+ __be32 crc; /* CRC of block */
+ __be64 blkno; /* first block of the buffer */
+ __be64 lsn; /* sequence number of last write */
+ uuid_t uuid; /* filesystem we belong to */
+ __be64 owner; /* inode that owns the block */
+};
+
+struct xfs_dir3_data_hdr {
+ struct xfs_dir3_blk_hdr hdr;
+ xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT];
+ __be32 pad; /* 64 bit alignment */
+};
+
+#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc)
+
+static inline struct xfs_dir2_data_free *
+xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+ if (hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+ struct xfs_dir3_data_hdr *hdr3 = (struct xfs_dir3_data_hdr *)hdr;
+ return hdr3->best_free;
+ }
+ return hdr->bestfree;
+}
+
+/*
* Active entry in a data block.
*
* Aligned to 8 bytes. After the variable length name field there is a
@@ -280,6 +337,94 @@ xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
be16_to_cpu(dup->length) - sizeof(__be16));
}
+static inline size_t
+xfs_dir3_data_hdr_size(bool dir3)
+{
+ if (dir3)
+ return sizeof(struct xfs_dir3_data_hdr);
+ return sizeof(struct xfs_dir2_data_hdr);
+}
+
+static inline size_t
+xfs_dir3_data_entry_offset(struct xfs_dir2_data_hdr *hdr)
+{
+ bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+ return xfs_dir3_data_hdr_size(dir3);
+}
+
+static inline struct xfs_dir2_data_entry *
+xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + xfs_dir3_data_entry_offset(hdr));
+}
+
+static inline struct xfs_dir2_data_unused *
+xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_unused *)
+ ((char *)hdr + xfs_dir3_data_entry_offset(hdr));
+}
+
+/*
+ * Offsets of . and .. in data space (always block 0)
+ *
+ * The macros are used for shortform directories as they have no headers to read
+ * the magic number out of. Shortform directories need to know the size of the
+ * data block header because the sfe embeds the block offset of the entry into
+ * it so that it doesn't change when format conversion occurs. Bad Things Happen
+ * if we don't follow this rule.
+ */
+#define XFS_DIR3_DATA_DOT_OFFSET(mp) \
+ xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb))
+#define XFS_DIR3_DATA_DOTDOT_OFFSET(mp) \
+ (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir2_data_entsize(1))
+#define XFS_DIR3_DATA_FIRST_OFFSET(mp) \
+ (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir2_data_entsize(2))
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr)
+{
+ return xfs_dir3_data_entry_offset(hdr);
+}
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr)
+{
+ return xfs_dir3_data_dot_offset(hdr) + xfs_dir2_data_entsize(1);
+}
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr)
+{
+ return xfs_dir3_data_dotdot_offset(hdr) + xfs_dir2_data_entsize(2);
+}
+
+/*
+ * Location of ., .. and the first entry after them in data space (block 0)
+ */
+static inline struct xfs_dir2_data_entry *
+xfs_dir3_data_dot_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + xfs_dir3_data_dot_offset(hdr));
+}
+
+static inline struct xfs_dir2_data_entry *
+xfs_dir3_data_dotdot_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + xfs_dir3_data_dotdot_offset(hdr));
+}
+
+static inline struct xfs_dir2_data_entry *
+xfs_dir3_data_first_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + xfs_dir3_data_first_offset(hdr));
+}
+
/*
* Leaf block structures.
*
@@ -329,6 +474,21 @@ typedef struct xfs_dir2_leaf_hdr {
__be16 stale; /* count of stale entries */
} xfs_dir2_leaf_hdr_t;
+struct xfs_dir3_leaf_hdr {
+ struct xfs_da3_blkinfo info; /* header for da routines */
+ __be16 count; /* count of entries */
+ __be16 stale; /* count of stale entries */
+ __be32 pad; /* 64 bit alignment */
+};
+
+struct xfs_dir3_icleaf_hdr {
+ __uint32_t forw;
+ __uint32_t back;
+ __uint16_t magic;
+ __uint16_t count;
+ __uint16_t stale;
+};
+
/*
* Leaf block entry.
*/
@@ -348,23 +508,50 @@ typedef struct xfs_dir2_leaf_tail {
* Leaf block.
*/
typedef struct xfs_dir2_leaf {
- xfs_dir2_leaf_hdr_t hdr; /* leaf header */
- xfs_dir2_leaf_entry_t ents[]; /* entries */
+ xfs_dir2_leaf_hdr_t hdr; /* leaf header */
+ xfs_dir2_leaf_entry_t __ents[]; /* entries */
} xfs_dir2_leaf_t;
-/*
- * DB blocks here are logical directory block numbers, not filesystem blocks.
- */
+struct xfs_dir3_leaf {
+ struct xfs_dir3_leaf_hdr hdr; /* leaf header */
+ struct xfs_dir2_leaf_entry __ents[]; /* entries */
+};
-static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp)
+#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc)
+
+static inline int
+xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp)
{
- return (mp->m_dirblksize - (uint)sizeof(struct xfs_dir2_leaf_hdr)) /
+ if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC))
+ return sizeof(struct xfs_dir3_leaf_hdr);
+ return sizeof(struct xfs_dir2_leaf_hdr);
+}
+
+static inline int
+xfs_dir3_max_leaf_ents(struct xfs_mount *mp, struct xfs_dir2_leaf *lp)
+{
+ return (mp->m_dirblksize - xfs_dir3_leaf_hdr_size(lp)) /
(uint)sizeof(struct xfs_dir2_leaf_entry);
}
/*
* Get address of the bestcount field in the single-leaf block.
*/
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+ if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ struct xfs_dir3_leaf *lp3 = (struct xfs_dir3_leaf *)lp;
+ return lp3->__ents;
+ }
+ return lp->__ents;
+}
+
+/*
+ * Get address of the bestcount field in the single-leaf block.
+ */
static inline struct xfs_dir2_leaf_tail *
xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp)
{
@@ -383,6 +570,10 @@ xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
}
/*
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+
+/*
* Convert dataptr to byte in file space
*/
static inline xfs_dir2_off_t
@@ -520,19 +711,66 @@ typedef struct xfs_dir2_free {
/* unused entries are -1 */
} xfs_dir2_free_t;
-static inline int xfs_dir2_free_max_bests(struct xfs_mount *mp)
+struct xfs_dir3_free_hdr {
+ struct xfs_dir3_blk_hdr hdr;
+ __be32 firstdb; /* db of first entry */
+ __be32 nvalid; /* count of valid entries */
+ __be32 nused; /* count of used entries */
+ __be32 pad; /* 64 bit alignment */
+};
+
+struct xfs_dir3_free {
+ struct xfs_dir3_free_hdr hdr;
+ __be16 bests[]; /* best free counts */
+ /* unused entries are -1 */
+};
+
+#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc)
+
+/*
+ * In core version of the free block header, abstracted away from on-disk format
+ * differences. Use this in the code, and convert to/from the disk version using
+ * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
+ */
+struct xfs_dir3_icfree_hdr {
+ __uint32_t magic;
+ __uint32_t firstdb;
+ __uint32_t nvalid;
+ __uint32_t nused;
+
+};
+
+void xfs_dir3_free_hdr_from_disk(struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from);
+
+static inline int
+xfs_dir3_free_hdr_size(struct xfs_mount *mp)
{
- return (mp->m_dirblksize - sizeof(struct xfs_dir2_free_hdr)) /
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return sizeof(struct xfs_dir3_free_hdr);
+ return sizeof(struct xfs_dir2_free_hdr);
+}
+
+static inline int
+xfs_dir3_free_max_bests(struct xfs_mount *mp)
+{
+ return (mp->m_dirblksize - xfs_dir3_free_hdr_size(mp)) /
sizeof(xfs_dir2_data_off_t);
}
+static inline __be16 *
+xfs_dir3_free_bests_p(struct xfs_mount *mp, struct xfs_dir2_free *free)
+{
+ return (__be16 *)((char *)free + xfs_dir3_free_hdr_size(mp));
+}
+
/*
* Convert data space db to the corresponding free db.
*/
static inline xfs_dir2_db_t
xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
{
- return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir2_free_max_bests(mp);
+ return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir3_free_max_bests(mp);
}
/*
@@ -541,7 +779,7 @@ xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
static inline int
xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
{
- return db % xfs_dir2_free_max_bests(mp);
+ return db % xfs_dir3_free_max_bests(mp);
}
/*
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 60cd2fa4e047..2aed25cae04d 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -33,97 +34,371 @@
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
/*
* Local function declarations.
*/
-#ifdef DEBUG
-static void xfs_dir2_leaf_check(struct xfs_inode *dp, struct xfs_buf *bp);
-#else
-#define xfs_dir2_leaf_check(dp, bp)
-#endif
static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
int *indexp, struct xfs_buf **dbpp);
-static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
+static void xfs_dir3_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
int first, int last);
-static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
+static void xfs_dir3_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
-static void
-xfs_dir2_leaf_verify(
+/*
+ * Check the internal consistency of a leaf1 block.
+ * Pop an assert if something is wrong.
+ */
+#ifdef DEBUG
+#define xfs_dir3_leaf_check(mp, bp) \
+do { \
+ if (!xfs_dir3_leaf1_check((mp), (bp))) \
+ ASSERT(0); \
+} while (0) /* no trailing ';': the caller's own ';' completes the statement */
+
+STATIC bool
+xfs_dir3_leaf1_check(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+
+ if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+ if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+ return false;
+ } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
+ return false;
+
+ return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
+}
+#else
+#define xfs_dir3_leaf_check(mp, bp)
+#endif
+
+void
+xfs_dir3_leaf_hdr_from_disk(
+ struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from)
+{
+ if (from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) {
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.count);
+ to->stale = be16_to_cpu(from->hdr.stale);
+ } else {
+ struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
+
+ to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+ to->back = be32_to_cpu(hdr3->info.hdr.back);
+ to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+ to->count = be16_to_cpu(hdr3->count);
+ to->stale = be16_to_cpu(hdr3->stale);
+ }
+
+ ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+ to->magic == XFS_DIR3_LEAF1_MAGIC ||
+ to->magic == XFS_DIR2_LEAFN_MAGIC ||
+ to->magic == XFS_DIR3_LEAFN_MAGIC);
+}
+
+void
+xfs_dir3_leaf_hdr_to_disk(
+ struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from)
+{
+ ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+ from->magic == XFS_DIR3_LEAF1_MAGIC ||
+ from->magic == XFS_DIR2_LEAFN_MAGIC ||
+ from->magic == XFS_DIR3_LEAFN_MAGIC);
+
+ if (from->magic == XFS_DIR2_LEAF1_MAGIC ||
+ from->magic == XFS_DIR2_LEAFN_MAGIC) {
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.count = cpu_to_be16(from->count);
+ to->hdr.stale = cpu_to_be16(from->stale);
+ } else {
+ struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
+
+ hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+ hdr3->info.hdr.back = cpu_to_be32(from->back);
+ hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+ hdr3->count = cpu_to_be16(from->count);
+ hdr3->stale = cpu_to_be16(from->stale);
+ }
+}
+
+bool
+xfs_dir3_leaf_check_int(
+ struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *hdr,
+ struct xfs_dir2_leaf *leaf)
+{
+ struct xfs_dir2_leaf_entry *ents;
+ xfs_dir2_leaf_tail_t *ltp;
+ int stale;
+ int i;
+
+ ents = xfs_dir3_leaf_ents_p(leaf);
+ ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+
+ /*
+ * XXX (dgc): This value is not restrictive enough.
+ * Should factor in the size of the bests table as well.
+ * We can deduce a value for that from di_size.
+ */
+ if (hdr->count > xfs_dir3_max_leaf_ents(mp, leaf))
+ return false;
+
+ /* Leaves and bests don't overlap in leaf format. */
+ if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+ hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
+ (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
+ return false;
+
+ /* Check hash value order, count stale entries. */
+ for (i = stale = 0; i < hdr->count; i++) {
+ if (i + 1 < hdr->count) {
+ if (be32_to_cpu(ents[i].hashval) >
+ be32_to_cpu(ents[i + 1].hashval))
+ return false;
+ }
+ if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ stale++;
+ }
+ if (hdr->stale != stale)
+ return false;
+ return true;
+}
+
+static bool
+xfs_dir3_leaf_verify(
struct xfs_buf *bp,
- __be16 magic)
+ __uint16_t magic)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_dir2_leaf_hdr *hdr = bp->b_addr;
- int block_ok = 0;
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+
+ if ((magic == XFS_DIR2_LEAF1_MAGIC &&
+ leafhdr.magic != XFS_DIR3_LEAF1_MAGIC) ||
+ (magic == XFS_DIR2_LEAFN_MAGIC &&
+ leafhdr.magic != XFS_DIR3_LEAFN_MAGIC))
+ return false;
- block_ok = hdr->info.magic == magic;
- if (!block_ok) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+ if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (leafhdr.magic != magic)
+ return false;
+ }
+ return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
+}
+
+static void
+__read_verify(
+ struct xfs_buf *bp,
+ __uint16_t magic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if ((xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ XFS_DIR3_LEAF_CRC_OFF)) ||
+ !xfs_dir3_leaf_verify(bp, magic)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+__write_verify(
+ struct xfs_buf *bp,
+ __uint16_t magic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_dir3_leaf_verify(bp, magic)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
}
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_LEAF_CRC_OFF);
}
static void
-xfs_dir2_leaf1_read_verify(
+xfs_dir3_leaf1_read_verify(
struct xfs_buf *bp)
{
- xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+ __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
}
static void
-xfs_dir2_leaf1_write_verify(
+xfs_dir3_leaf1_write_verify(
struct xfs_buf *bp)
{
- xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+ __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
}
-void
-xfs_dir2_leafn_read_verify(
+static void
+xfs_dir3_leafn_read_verify(
struct xfs_buf *bp)
{
- xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+ __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
}
-void
-xfs_dir2_leafn_write_verify(
+static void
+xfs_dir3_leafn_write_verify(
struct xfs_buf *bp)
{
- xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+ __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
}
-static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = {
- .verify_read = xfs_dir2_leaf1_read_verify,
- .verify_write = xfs_dir2_leaf1_write_verify,
+const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+ .verify_read = xfs_dir3_leaf1_read_verify,
+ .verify_write = xfs_dir3_leaf1_write_verify,
};
-const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = {
- .verify_read = xfs_dir2_leafn_read_verify,
- .verify_write = xfs_dir2_leafn_write_verify,
+const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+ .verify_read = xfs_dir3_leafn_read_verify,
+ .verify_write = xfs_dir3_leafn_write_verify,
};
static int
-xfs_dir2_leaf_read(
+xfs_dir3_leaf_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t fbno,
xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
- return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
- XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops);
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
+ return err;
}
int
-xfs_dir2_leafn_read(
+xfs_dir3_leafn_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t fbno,
xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
- return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
- XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops);
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
+ return err;
+}
+
+/*
+ * Initialize a new leaf block, leaf1 or leafn magic accepted.
+ */
+static void
+xfs_dir3_leaf_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ xfs_ino_t owner,
+ __uint16_t type)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+
+ ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+
+ memset(leaf3, 0, sizeof(*leaf3));
+
+ leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
+ ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
+ : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+ leaf3->info.blkno = cpu_to_be64(bp->b_bn);
+ leaf3->info.owner = cpu_to_be64(owner);
+ uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
+ } else {
+ memset(leaf, 0, sizeof(*leaf));
+ leaf->hdr.info.magic = cpu_to_be16(type);
+ }
+
+ /*
+ * If it's a leaf-format directory initialize the tail.
+ * Caller is responsible for initialising the bests table.
+ */
+ if (type == XFS_DIR2_LEAF1_MAGIC) {
+ struct xfs_dir2_leaf_tail *ltp;
+
+ ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp->bestcount = 0;
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
+ } else {
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+ }
+}
+
+int
+xfs_dir3_leaf_get_buf(
+ xfs_da_args_t *args,
+ xfs_dir2_db_t bno,
+ struct xfs_buf **bpp,
+ __uint16_t magic)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) &&
+ bno < XFS_DIR2_FREE_FIRSTDB(mp));
+
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
+ XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
+ xfs_dir3_leaf_log_header(tp, bp);
+ if (magic == XFS_DIR2_LEAF1_MAGIC)
+ xfs_dir3_leaf_log_tail(tp, bp);
+ *bpp = bp;
+ return 0;
}
/*
@@ -149,6 +424,9 @@ xfs_dir2_block_to_leaf(
int needlog; /* need to log block header */
int needscan; /* need to rescan bestfree */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_data_free *bf;
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_block_to_leaf(args);
@@ -168,26 +446,33 @@ xfs_dir2_block_to_leaf(
/*
* Initialize the leaf block, get a buffer for it.
*/
- if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) {
+ error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
+ if (error)
return error;
- }
- ASSERT(lbp != NULL);
+
leaf = lbp->b_addr;
hdr = dbp->b_addr;
- xfs_dir2_data_check(dp, dbp);
+ xfs_dir3_data_check(dp, dbp);
btp = xfs_dir2_block_tail_p(mp, hdr);
blp = xfs_dir2_block_leaf_p(btp);
+ bf = xfs_dir3_data_bestfree_p(hdr);
+ ents = xfs_dir3_leaf_ents_p(leaf);
+
/*
* Set the counts in the leaf header.
*/
- leaf->hdr.count = cpu_to_be16(be32_to_cpu(btp->count));
- leaf->hdr.stale = cpu_to_be16(be32_to_cpu(btp->stale));
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ leafhdr.count = be32_to_cpu(btp->count);
+ leafhdr.stale = be32_to_cpu(btp->stale);
+ xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(tp, lbp);
+
/*
* Could compact these but I think we always do the conversion
* after squeezing out stale entries.
*/
- memcpy(leaf->ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir2_leaf_log_ents(tp, lbp, 0, be16_to_cpu(leaf->hdr.count) - 1);
+ memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir3_leaf_log_ents(tp, lbp, 0, leafhdr.count - 1);
needscan = 0;
needlog = 1;
/*
@@ -202,8 +487,13 @@ xfs_dir2_block_to_leaf(
/*
* Fix up the block header, make it a data block.
*/
- dbp->b_ops = &xfs_dir2_data_buf_ops;
- hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+ dbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+ hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+ else
+ hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+
if (needscan)
xfs_dir2_data_freescan(mp, hdr, &needlog);
/*
@@ -212,21 +502,22 @@ xfs_dir2_block_to_leaf(
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
ltp->bestcount = cpu_to_be32(1);
bestsp = xfs_dir2_leaf_bests_p(ltp);
- bestsp[0] = hdr->bestfree[0].length;
+ bestsp[0] = bf[0].length;
/*
* Log the data header and leaf bests table.
*/
if (needlog)
xfs_dir2_data_log_header(tp, dbp);
- xfs_dir2_leaf_check(dp, lbp);
- xfs_dir2_data_check(dp, dbp);
- xfs_dir2_leaf_log_bests(tp, lbp, 0, 0);
+ xfs_dir3_leaf_check(mp, lbp);
+ xfs_dir3_data_check(dp, dbp);
+ xfs_dir3_leaf_log_bests(tp, lbp, 0, 0);
return 0;
}
STATIC void
-xfs_dir2_leaf_find_stale(
- struct xfs_dir2_leaf *leaf,
+xfs_dir3_leaf_find_stale(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
int index,
int *lowstale,
int *highstale)
@@ -235,7 +526,7 @@ xfs_dir2_leaf_find_stale(
* Find the first stale entry before our index, if any.
*/
for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) {
- if (leaf->ents[*lowstale].address ==
+ if (ents[*lowstale].address ==
cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
break;
}
@@ -245,10 +536,8 @@ xfs_dir2_leaf_find_stale(
* Stop if the result would require moving more entries than using
* lowstale.
*/
- for (*highstale = index;
- *highstale < be16_to_cpu(leaf->hdr.count);
- ++*highstale) {
- if (leaf->ents[*highstale].address ==
+ for (*highstale = index; *highstale < leafhdr->count; ++*highstale) {
+ if (ents[*highstale].address ==
cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
break;
if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
@@ -257,8 +546,9 @@ xfs_dir2_leaf_find_stale(
}
struct xfs_dir2_leaf_entry *
-xfs_dir2_leaf_find_entry(
- xfs_dir2_leaf_t *leaf, /* leaf structure */
+xfs_dir3_leaf_find_entry(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
int index, /* leaf table position */
int compact, /* need to compact leaves */
int lowstale, /* index of prev stale leaf */
@@ -266,7 +556,7 @@ xfs_dir2_leaf_find_entry(
int *lfloglow, /* low leaf logging index */
int *lfloghigh) /* high leaf logging index */
{
- if (!leaf->hdr.stale) {
+ if (!leafhdr->stale) {
xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
/*
@@ -274,18 +564,16 @@ xfs_dir2_leaf_find_entry(
*
* If there are no stale entries, just insert a hole at index.
*/
- lep = &leaf->ents[index];
- if (index < be16_to_cpu(leaf->hdr.count))
+ lep = &ents[index];
+ if (index < leafhdr->count)
memmove(lep + 1, lep,
- (be16_to_cpu(leaf->hdr.count) - index) *
- sizeof(*lep));
+ (leafhdr->count - index) * sizeof(*lep));
/*
* Record low and high logging indices for the leaf.
*/
*lfloglow = index;
- *lfloghigh = be16_to_cpu(leaf->hdr.count);
- be16_add_cpu(&leaf->hdr.count, 1);
+ *lfloghigh = leafhdr->count++;
return lep;
}
@@ -299,16 +587,17 @@ xfs_dir2_leaf_find_entry(
* entries before and after our insertion point.
*/
if (compact == 0)
- xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale);
+ xfs_dir3_leaf_find_stale(leafhdr, ents, index,
+ &lowstale, &highstale);
/*
* If the low one is better, use it.
*/
if (lowstale >= 0 &&
- (highstale == be16_to_cpu(leaf->hdr.count) ||
+ (highstale == leafhdr->count ||
index - lowstale - 1 < highstale - index)) {
ASSERT(index - lowstale - 1 >= 0);
- ASSERT(leaf->ents[lowstale].address ==
+ ASSERT(ents[lowstale].address ==
cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
/*
@@ -316,37 +605,34 @@ xfs_dir2_leaf_find_entry(
* for the new entry.
*/
if (index - lowstale - 1 > 0) {
- memmove(&leaf->ents[lowstale],
- &leaf->ents[lowstale + 1],
+ memmove(&ents[lowstale], &ents[lowstale + 1],
(index - lowstale - 1) *
- sizeof(xfs_dir2_leaf_entry_t));
+ sizeof(xfs_dir2_leaf_entry_t));
}
*lfloglow = MIN(lowstale, *lfloglow);
*lfloghigh = MAX(index - 1, *lfloghigh);
- be16_add_cpu(&leaf->hdr.stale, -1);
- return &leaf->ents[index - 1];
+ leafhdr->stale--;
+ return &ents[index - 1];
}
/*
* The high one is better, so use that one.
*/
ASSERT(highstale - index >= 0);
- ASSERT(leaf->ents[highstale].address ==
- cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+ ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
/*
* Copy entries down to cover the stale entry and make room for the
* new entry.
*/
if (highstale - index > 0) {
- memmove(&leaf->ents[index + 1],
- &leaf->ents[index],
+ memmove(&ents[index + 1], &ents[index],
(highstale - index) * sizeof(xfs_dir2_leaf_entry_t));
}
*lfloglow = MIN(index, *lfloglow);
*lfloghigh = MAX(highstale, *lfloghigh);
- be16_add_cpu(&leaf->hdr.stale, -1);
- return &leaf->ents[index];
+ leafhdr->stale--;
+ return &ents[index];
}
/*
@@ -383,6 +669,9 @@ xfs_dir2_leaf_addname(
__be16 *tagp; /* end of data entry */
xfs_trans_t *tp; /* transaction pointer */
xfs_dir2_db_t use_block; /* data block number */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_addname(args);
@@ -390,7 +679,7 @@ xfs_dir2_leaf_addname(
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
if (error)
return error;
@@ -403,16 +692,19 @@ xfs_dir2_leaf_addname(
index = xfs_dir2_leaf_search_hash(args, lbp);
leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ents = xfs_dir3_leaf_ents_p(leaf);
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
length = xfs_dir2_data_entsize(args->namelen);
+
/*
* See if there are any entries with the same hash value
* and space in their block for the new entry.
* This is good because it puts multiple same-hash value entries
* in a data block, improving the lookup of those entries.
*/
- for (use_block = -1, lep = &leaf->ents[index];
- index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval;
+ for (use_block = -1, lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
index++, lep++) {
if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
continue;
@@ -445,7 +737,7 @@ xfs_dir2_leaf_addname(
* How many bytes do we need in the leaf block?
*/
needbytes = 0;
- if (!leaf->hdr.stale)
+ if (!leafhdr.stale)
needbytes += sizeof(xfs_dir2_leaf_entry_t);
if (use_block == -1)
needbytes += sizeof(xfs_dir2_data_off_t);
@@ -460,16 +752,15 @@ xfs_dir2_leaf_addname(
* If we don't have enough free bytes but we can make enough
* by compacting out stale entries, we'll do that.
*/
- if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <
- needbytes && be16_to_cpu(leaf->hdr.stale) > 1) {
+ if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes &&
+ leafhdr.stale > 1)
compact = 1;
- }
+
/*
* Otherwise if we don't have enough free bytes we need to
* convert to node form.
*/
- else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(
- leaf->hdr.count)] < needbytes) {
+ else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) {
/*
* Just checking or no space reservation, give up.
*/
@@ -517,15 +808,15 @@ xfs_dir2_leaf_addname(
* point later.
*/
if (compact) {
- xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale,
- &lfloglow, &lfloghigh);
+ xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+ &highstale, &lfloglow, &lfloghigh);
}
/*
* There are stale entries, so we'll need log-low and log-high
* impossibly bad values later.
*/
- else if (be16_to_cpu(leaf->hdr.stale)) {
- lfloglow = be16_to_cpu(leaf->hdr.count);
+ else if (leafhdr.stale) {
+ lfloglow = leafhdr.count;
lfloghigh = -1;
}
/*
@@ -544,7 +835,7 @@ xfs_dir2_leaf_addname(
/*
* Initialize the block.
*/
- if ((error = xfs_dir2_data_init(args, use_block, &dbp))) {
+ if ((error = xfs_dir3_data_init(args, use_block, &dbp))) {
xfs_trans_brelse(tp, lbp);
return error;
}
@@ -557,23 +848,24 @@ xfs_dir2_leaf_addname(
memmove(&bestsp[0], &bestsp[1],
be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
be32_add_cpu(&ltp->bestcount, 1);
- xfs_dir2_leaf_log_tail(tp, lbp);
- xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(tp, lbp);
+ xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
}
/*
* If we're filling in a previously empty block just log it.
*/
else
- xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+ xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block);
hdr = dbp->b_addr;
- bestsp[use_block] = hdr->bestfree[0].length;
+ bf = xfs_dir3_data_bestfree_p(hdr);
+ bestsp[use_block] = bf[0].length;
grown = 1;
} else {
/*
* Already had space in some data block.
* Just read that one in.
*/
- error = xfs_dir2_data_read(tp, dp,
+ error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(mp, use_block),
-1, &dbp);
if (error) {
@@ -581,13 +873,14 @@ xfs_dir2_leaf_addname(
return error;
}
hdr = dbp->b_addr;
+ bf = xfs_dir3_data_bestfree_p(hdr);
grown = 0;
}
/*
* Point to the biggest freespace in our data block.
*/
dup = (xfs_dir2_data_unused_t *)
- ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset));
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
ASSERT(be16_to_cpu(dup->length) >= length);
needscan = needlog = 0;
/*
@@ -620,13 +913,13 @@ xfs_dir2_leaf_addname(
* If the bests table needs to be changed, do it.
* Log the change unless we've already done that.
*/
- if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(hdr->bestfree[0].length)) {
- bestsp[use_block] = hdr->bestfree[0].length;
+ if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
+ bestsp[use_block] = bf[0].length;
if (!grown)
- xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+ xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block);
}
- lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale,
+ lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
highstale, &lfloglow, &lfloghigh);
/*
@@ -638,82 +931,40 @@ xfs_dir2_leaf_addname(
/*
* Log the leaf fields and give up the buffers.
*/
- xfs_dir2_leaf_log_header(tp, lbp);
- xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
- xfs_dir2_leaf_check(dp, lbp);
- xfs_dir2_data_check(dp, dbp);
+ xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(tp, lbp);
+ xfs_dir3_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_check(mp, lbp);
+ xfs_dir3_data_check(dp, dbp);
return 0;
}
-#ifdef DEBUG
-/*
- * Check the internal consistency of a leaf1 block.
- * Pop an assert if something is wrong.
- */
-STATIC void
-xfs_dir2_leaf_check(
- struct xfs_inode *dp, /* incore directory inode */
- struct xfs_buf *bp) /* leaf's buffer */
-{
- int i; /* leaf index */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
- xfs_mount_t *mp; /* filesystem mount point */
- int stale; /* count of stale leaves */
-
- leaf = bp->b_addr;
- mp = dp->i_mount;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
- /*
- * This value is not restrictive enough.
- * Should factor in the size of the bests table as well.
- * We can deduce a value for that from di_size.
- */
- ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp));
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
- /*
- * Leaves and bests don't overlap.
- */
- ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <=
- (char *)xfs_dir2_leaf_bests_p(ltp));
- /*
- * Check hash value order, count stale entries.
- */
- for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) {
- if (i + 1 < be16_to_cpu(leaf->hdr.count))
- ASSERT(be32_to_cpu(leaf->ents[i].hashval) <=
- be32_to_cpu(leaf->ents[i + 1].hashval));
- if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
- stale++;
- }
- ASSERT(be16_to_cpu(leaf->hdr.stale) == stale);
-}
-#endif /* DEBUG */
-
/*
* Compact out any stale entries in the leaf.
* Log the header and changed leaf entries, if any.
*/
void
-xfs_dir2_leaf_compact(
+xfs_dir3_leaf_compact(
xfs_da_args_t *args, /* operation arguments */
+ struct xfs_dir3_icleaf_hdr *leafhdr,
struct xfs_buf *bp) /* leaf buffer */
{
int from; /* source leaf index */
xfs_dir2_leaf_t *leaf; /* leaf structure */
int loglow; /* first leaf entry to log */
int to; /* target leaf index */
+ struct xfs_dir2_leaf_entry *ents;
leaf = bp->b_addr;
- if (!leaf->hdr.stale) {
+ if (!leafhdr->stale)
return;
- }
+
/*
* Compress out the stale entries in place.
*/
- for (from = to = 0, loglow = -1; from < be16_to_cpu(leaf->hdr.count); from++) {
- if (leaf->ents[from].address ==
- cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ ents = xfs_dir3_leaf_ents_p(leaf);
+ for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
+ if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
continue;
/*
* Only actually copy the entries that are different.
@@ -721,19 +972,21 @@ xfs_dir2_leaf_compact(
if (from > to) {
if (loglow == -1)
loglow = to;
- leaf->ents[to] = leaf->ents[from];
+ ents[to] = ents[from];
}
to++;
}
/*
* Update and log the header, log the leaf entries.
*/
- ASSERT(be16_to_cpu(leaf->hdr.stale) == from - to);
- be16_add_cpu(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale)));
- leaf->hdr.stale = 0;
- xfs_dir2_leaf_log_header(args->trans, bp);
+ ASSERT(leafhdr->stale == from - to);
+ leafhdr->count -= leafhdr->stale;
+ leafhdr->stale = 0;
+
+ xfs_dir3_leaf_hdr_to_disk(leaf, leafhdr);
+ xfs_dir3_leaf_log_header(args->trans, bp);
if (loglow != -1)
- xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1);
+ xfs_dir3_leaf_log_ents(args->trans, bp, loglow, to - 1);
}
/*
@@ -745,8 +998,9 @@ xfs_dir2_leaf_compact(
* and leaf logging indices.
*/
void
-xfs_dir2_leaf_compact_x1(
- struct xfs_buf *bp, /* leaf buffer */
+xfs_dir3_leaf_compact_x1(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
int *indexp, /* insertion index */
int *lowstalep, /* out: stale entry before us */
int *highstalep, /* out: stale entry after us */
@@ -757,22 +1011,20 @@ xfs_dir2_leaf_compact_x1(
int highstale; /* stale entry at/after index */
int index; /* insertion index */
int keepstale; /* source index of kept stale */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
int lowstale; /* stale entry before index */
int newindex=0; /* new insertion index */
int to; /* destination copy index */
- leaf = bp->b_addr;
- ASSERT(be16_to_cpu(leaf->hdr.stale) > 1);
+ ASSERT(leafhdr->stale > 1);
index = *indexp;
- xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale);
+ xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale);
/*
* Pick the better of lowstale and highstale.
*/
if (lowstale >= 0 &&
- (highstale == be16_to_cpu(leaf->hdr.count) ||
+ (highstale == leafhdr->count ||
index - lowstale <= highstale - index))
keepstale = lowstale;
else
@@ -781,15 +1033,14 @@ xfs_dir2_leaf_compact_x1(
* Copy the entries in place, removing all the stale entries
* except keepstale.
*/
- for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) {
+ for (from = to = 0; from < leafhdr->count; from++) {
/*
* Notice the new value of index.
*/
if (index == from)
newindex = to;
if (from != keepstale &&
- leaf->ents[from].address ==
- cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+ ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
if (from == to)
*lowlogp = to;
continue;
@@ -803,7 +1054,7 @@ xfs_dir2_leaf_compact_x1(
* Copy only the entries that have moved.
*/
if (from > to)
- leaf->ents[to] = leaf->ents[from];
+ ents[to] = ents[from];
to++;
}
ASSERT(from > to);
@@ -817,8 +1068,8 @@ xfs_dir2_leaf_compact_x1(
/*
* Adjust the leaf header values.
*/
- be16_add_cpu(&leaf->hdr.count, -(from - to));
- leaf->hdr.stale = cpu_to_be16(1);
+ leafhdr->count -= from - to;
+ leafhdr->stale = 1;
/*
* Remember the low/high stale value only in the "right"
* direction.
@@ -826,8 +1077,8 @@ xfs_dir2_leaf_compact_x1(
if (lowstale >= newindex)
lowstale = -1;
else
- highstale = be16_to_cpu(leaf->hdr.count);
- *highlogp = be16_to_cpu(leaf->hdr.count) - 1;
+ highstale = leafhdr->count;
+ *highlogp = leafhdr->count - 1;
*lowstalep = lowstale;
*highstalep = highstale;
}
@@ -857,6 +1108,7 @@ xfs_dir2_leaf_readbuf(
struct xfs_mount *mp = dp->i_mount;
struct xfs_buf *bp = *bpp;
struct xfs_bmbt_irec *map = mip->map;
+ struct blk_plug plug;
int error = 0;
int length;
int i;
@@ -965,7 +1217,7 @@ xfs_dir2_leaf_readbuf(
* Read the directory block starting at the first mapping.
*/
mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
- error = xfs_dir2_data_read(NULL, dp, map->br_startoff,
+ error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
map->br_blockcount >= mp->m_dirblkfsbs ?
XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
@@ -985,6 +1237,7 @@ xfs_dir2_leaf_readbuf(
/*
* Do we need more readahead?
*/
+ blk_start_plug(&plug);
for (mip->ra_index = mip->ra_offset = i = 0;
mip->ra_want > mip->ra_current && i < mip->map_blocks;
i += mp->m_dirblkfsbs) {
@@ -994,7 +1247,7 @@ xfs_dir2_leaf_readbuf(
*/
if (i > mip->ra_current &&
map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
- xfs_dir2_data_readahead(NULL, dp,
+ xfs_dir3_data_readahead(NULL, dp,
map[mip->ra_index].br_startoff + mip->ra_offset,
XFS_FSB_TO_DADDR(mp,
map[mip->ra_index].br_startblock +
@@ -1007,7 +1260,7 @@ xfs_dir2_leaf_readbuf(
* use our mapping, but this is a very rare case.
*/
else if (i > mip->ra_current) {
- xfs_dir2_data_readahead(NULL, dp,
+ xfs_dir3_data_readahead(NULL, dp,
map[mip->ra_index].br_startoff +
mip->ra_offset, -1);
mip->ra_current = i;
@@ -1036,6 +1289,7 @@ xfs_dir2_leaf_readbuf(
}
}
}
+ blk_finish_plug(&plug);
out:
*bpp = bp;
@@ -1049,10 +1303,8 @@ out:
int /* error */
xfs_dir2_leaf_getdents(
xfs_inode_t *dp, /* incore directory inode */
- void *dirent,
- size_t bufsize,
- xfs_off_t *offset,
- filldir_t filldir)
+ struct dir_context *ctx,
+ size_t bufsize)
{
struct xfs_buf *bp = NULL; /* data block buffer */
xfs_dir2_data_hdr_t *hdr; /* data block header */
@@ -1071,7 +1323,7 @@ xfs_dir2_leaf_getdents(
* If the offset is at or past the largest allowed value,
* give up right away.
*/
- if (*offset >= XFS_DIR2_MAX_DATAPTR)
+ if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
return 0;
mp = dp->i_mount;
@@ -1085,14 +1337,14 @@ xfs_dir2_leaf_getdents(
mp->m_sb.sb_blocksize);
map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
(length * sizeof(struct xfs_bmbt_irec)),
- KM_SLEEP);
+ KM_SLEEP | KM_NOFS);
map_info->map_size = length;
/*
* Inside the loop we keep the main offset value as a byte offset
* in the directory file.
*/
- curoff = xfs_dir2_dataptr_to_byte(mp, *offset);
+ curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
/*
* Force this conversion through db so we truncate the offset
@@ -1133,17 +1385,17 @@ xfs_dir2_leaf_getdents(
ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
map_info->curdb);
hdr = bp->b_addr;
- xfs_dir2_data_check(dp, bp);
+ xfs_dir3_data_check(dp, bp);
/*
* Find our position in the block.
*/
- ptr = (char *)(hdr + 1);
+ ptr = (char *)xfs_dir3_data_entry_p(hdr);
byteoff = xfs_dir2_byte_to_off(mp, curoff);
/*
* Skip past the header.
*/
if (byteoff == 0)
- curoff += (uint)sizeof(*hdr);
+ curoff += xfs_dir3_data_entry_offset(hdr);
/*
* Skip past entries until we reach our offset.
*/
@@ -1193,8 +1445,8 @@ xfs_dir2_leaf_getdents(
dep = (xfs_dir2_data_entry_t *)ptr;
length = xfs_dir2_data_entsize(dep->namelen);
- if (filldir(dirent, (char *)dep->name, dep->namelen,
- xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
+ ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+ if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
be64_to_cpu(dep->inumber), DT_UNKNOWN))
break;
@@ -1211,78 +1463,21 @@ xfs_dir2_leaf_getdents(
* All done. Set output offset value to current offset.
*/
if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
- *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
+ ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
else
- *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+ ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
kmem_free(map_info);
if (bp)
xfs_trans_brelse(NULL, bp);
return error;
}
-/*
- * Initialize a new leaf block, leaf1 or leafn magic accepted.
- */
-int
-xfs_dir2_leaf_init(
- xfs_da_args_t *args, /* operation arguments */
- xfs_dir2_db_t bno, /* directory block number */
- struct xfs_buf **bpp, /* out: leaf buffer */
- int magic) /* magic number for block */
-{
- struct xfs_buf *bp; /* leaf buffer */
- xfs_inode_t *dp; /* incore directory inode */
- int error; /* error return code */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- xfs_mount_t *mp; /* filesystem mount point */
- xfs_trans_t *tp; /* transaction pointer */
-
- dp = args->dp;
- ASSERT(dp != NULL);
- tp = args->trans;
- mp = dp->i_mount;
- ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) &&
- bno < XFS_DIR2_FREE_FIRSTDB(mp));
- /*
- * Get the buffer for the block.
- */
- error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
- XFS_DATA_FORK);
- if (error)
- return error;
-
- /*
- * Initialize the header.
- */
- leaf = bp->b_addr;
- leaf->hdr.info.magic = cpu_to_be16(magic);
- leaf->hdr.info.forw = 0;
- leaf->hdr.info.back = 0;
- leaf->hdr.count = 0;
- leaf->hdr.stale = 0;
- xfs_dir2_leaf_log_header(tp, bp);
- /*
- * If it's a leaf-format directory initialize the tail.
- * In this case our caller has the real bests table to copy into
- * the block.
- */
- if (magic == XFS_DIR2_LEAF1_MAGIC) {
- bp->b_ops = &xfs_dir2_leaf1_buf_ops;
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
- ltp->bestcount = 0;
- xfs_dir2_leaf_log_tail(tp, bp);
- } else
- bp->b_ops = &xfs_dir2_leafn_buf_ops;
- *bpp = bp;
- return 0;
-}
/*
* Log the bests entries indicated from a leaf1 block.
*/
static void
-xfs_dir2_leaf_log_bests(
+xfs_dir3_leaf_log_bests(
xfs_trans_t *tp, /* transaction pointer */
struct xfs_buf *bp, /* leaf buffer */
int first, /* first entry to log */
@@ -1290,11 +1485,12 @@ xfs_dir2_leaf_log_bests(
{
__be16 *firstb; /* pointer to first entry */
__be16 *lastb; /* pointer to last entry */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
+
ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf);
firstb = xfs_dir2_leaf_bests_p(ltp) + first;
lastb = xfs_dir2_leaf_bests_p(ltp) + last;
@@ -1306,7 +1502,7 @@ xfs_dir2_leaf_log_bests(
* Log the leaf entries indicated from a leaf1 or leafn block.
*/
void
-xfs_dir2_leaf_log_ents(
+xfs_dir3_leaf_log_ents(
xfs_trans_t *tp, /* transaction pointer */
struct xfs_buf *bp, /* leaf buffer */
int first, /* first entry to log */
@@ -1314,13 +1510,17 @@ xfs_dir2_leaf_log_ents(
{
xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */
xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir2_leaf_entry *ents;
- leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
- leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- firstlep = &leaf->ents[first];
- lastlep = &leaf->ents[last];
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+ ents = xfs_dir3_leaf_ents_p(leaf);
+ firstlep = &ents[first];
+ lastlep = &ents[last];
xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
}
@@ -1329,34 +1529,38 @@ xfs_dir2_leaf_log_ents(
* Log the header of the leaf1 or leafn block.
*/
void
-xfs_dir2_leaf_log_header(
+xfs_dir3_leaf_log_header(
struct xfs_trans *tp,
struct xfs_buf *bp)
{
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
- leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
- leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
- (uint)(sizeof(leaf->hdr) - 1));
+ xfs_dir3_leaf_hdr_size(leaf) - 1);
}
/*
* Log the tail of the leaf1 block.
*/
STATIC void
-xfs_dir2_leaf_log_tail(
+xfs_dir3_leaf_log_tail(
struct xfs_trans *tp,
struct xfs_buf *bp)
{
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- xfs_mount_t *mp; /* filesystem mount point */
+ struct xfs_mount *mp = tp->t_mountp;
+
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
- mp = tp->t_mountp;
- leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
(uint)(mp->m_dirblksize - 1));
@@ -1380,6 +1584,7 @@ xfs_dir2_leaf_lookup(
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_leaf_entry *ents;
trace_xfs_dir2_leaf_lookup(args);
@@ -1391,12 +1596,14 @@ xfs_dir2_leaf_lookup(
}
tp = args->trans;
dp = args->dp;
- xfs_dir2_leaf_check(dp, lbp);
+ xfs_dir3_leaf_check(dp->i_mount, lbp);
leaf = lbp->b_addr;
+ ents = xfs_dir3_leaf_ents_p(leaf);
/*
* Get to the leaf entry and contained data entry address.
*/
- lep = &leaf->ents[index];
+ lep = &ents[index];
+
/*
* Point to the data entry.
*/
@@ -1440,18 +1647,23 @@ xfs_dir2_leaf_lookup_int(
xfs_trans_t *tp; /* transaction pointer */
xfs_dir2_db_t cidb = -1; /* case match data block no. */
enum xfs_dacmp cmp; /* name compare result */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
if (error)
return error;
*lbpp = lbp;
leaf = lbp->b_addr;
- xfs_dir2_leaf_check(dp, lbp);
+ xfs_dir3_leaf_check(mp, lbp);
+ ents = xfs_dir3_leaf_ents_p(leaf);
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+
/*
* Look for the first leaf entry with our hash value.
*/
@@ -1460,9 +1672,9 @@ xfs_dir2_leaf_lookup_int(
* Loop over all the entries with the right hash value
* looking to match the name.
*/
- for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
- be32_to_cpu(lep->hashval) == args->hashval;
- lep++, index++) {
+ for (lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
/*
* Skip over stale leaf entries.
*/
@@ -1479,7 +1691,7 @@ xfs_dir2_leaf_lookup_int(
if (newdb != curdb) {
if (dbp)
xfs_trans_brelse(tp, dbp);
- error = xfs_dir2_data_read(tp, dp,
+ error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(mp, newdb),
-1, &dbp);
if (error) {
@@ -1520,7 +1732,7 @@ xfs_dir2_leaf_lookup_int(
ASSERT(cidb != -1);
if (cidb != curdb) {
xfs_trans_brelse(tp, dbp);
- error = xfs_dir2_data_read(tp, dp,
+ error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(mp, cidb),
-1, &dbp);
if (error) {
@@ -1566,6 +1778,9 @@ xfs_dir2_leaf_removename(
int needscan; /* need to rescan data frees */
xfs_dir2_data_off_t oldbest; /* old value of best free */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_removename(args);
@@ -1580,16 +1795,19 @@ xfs_dir2_leaf_removename(
mp = dp->i_mount;
leaf = lbp->b_addr;
hdr = dbp->b_addr;
- xfs_dir2_data_check(dp, dbp);
+ xfs_dir3_data_check(dp, dbp);
+ bf = xfs_dir3_data_bestfree_p(hdr);
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = xfs_dir3_leaf_ents_p(leaf);
/*
* Point to the leaf entry, use that to point to the data entry.
*/
- lep = &leaf->ents[index];
+ lep = &ents[index];
db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
dep = (xfs_dir2_data_entry_t *)
((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
needscan = needlog = 0;
- oldbest = be16_to_cpu(hdr->bestfree[0].length);
+ oldbest = be16_to_cpu(bf[0].length);
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
@@ -1602,10 +1820,13 @@ xfs_dir2_leaf_removename(
/*
* We just mark the leaf entry stale by putting a null in it.
*/
- be16_add_cpu(&leaf->hdr.stale, 1);
- xfs_dir2_leaf_log_header(tp, lbp);
+ leafhdr.stale++;
+ xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(tp, lbp);
+
lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
- xfs_dir2_leaf_log_ents(tp, lbp, index, index);
+ xfs_dir3_leaf_log_ents(tp, lbp, index, index);
+
/*
* Scan the freespace in the data block again if necessary,
* log the data block header if necessary.
@@ -1618,16 +1839,16 @@ xfs_dir2_leaf_removename(
* If the longest freespace in the data block has changed,
* put the new value in the bests table and log that.
*/
- if (be16_to_cpu(hdr->bestfree[0].length) != oldbest) {
- bestsp[db] = hdr->bestfree[0].length;
- xfs_dir2_leaf_log_bests(tp, lbp, db, db);
+ if (be16_to_cpu(bf[0].length) != oldbest) {
+ bestsp[db] = bf[0].length;
+ xfs_dir3_leaf_log_bests(tp, lbp, db, db);
}
- xfs_dir2_data_check(dp, dbp);
+ xfs_dir3_data_check(dp, dbp);
/*
* If the data block is now empty then get rid of the data block.
*/
- if (be16_to_cpu(hdr->bestfree[0].length) ==
- mp->m_dirblksize - (uint)sizeof(*hdr)) {
+ if (be16_to_cpu(bf[0].length) ==
+ mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr)) {
ASSERT(db != mp->m_dirdatablk);
if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
/*
@@ -1638,7 +1859,7 @@ xfs_dir2_leaf_removename(
*/
if (error == ENOSPC && args->total == 0)
error = 0;
- xfs_dir2_leaf_check(dp, lbp);
+ xfs_dir3_leaf_check(mp, lbp);
return error;
}
dbp = NULL;
@@ -1661,8 +1882,8 @@ xfs_dir2_leaf_removename(
memmove(&bestsp[db - i], bestsp,
(be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
be32_add_cpu(&ltp->bestcount, -(db - i));
- xfs_dir2_leaf_log_tail(tp, lbp);
- xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(tp, lbp);
+ xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
} else
bestsp[db] = cpu_to_be16(NULLDATAOFF);
}
@@ -1672,7 +1893,7 @@ xfs_dir2_leaf_removename(
else if (db != mp->m_dirdatablk)
dbp = NULL;
- xfs_dir2_leaf_check(dp, lbp);
+ xfs_dir3_leaf_check(mp, lbp);
/*
* See if we can convert to block form.
*/
@@ -1695,6 +1916,7 @@ xfs_dir2_leaf_replace(
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_leaf_entry *ents;
trace_xfs_dir2_leaf_replace(args);
@@ -1706,10 +1928,11 @@ xfs_dir2_leaf_replace(
}
dp = args->dp;
leaf = lbp->b_addr;
+ ents = xfs_dir3_leaf_ents_p(leaf);
/*
* Point to the leaf entry, get data address from it.
*/
- lep = &leaf->ents[index];
+ lep = &ents[index];
/*
* Point to the data entry.
*/
@@ -1723,7 +1946,7 @@ xfs_dir2_leaf_replace(
dep->inumber = cpu_to_be64(args->inumber);
tp = args->trans;
xfs_dir2_data_log_entry(tp, dbp, dep);
- xfs_dir2_leaf_check(dp, lbp);
+ xfs_dir3_leaf_check(dp->i_mount, lbp);
xfs_trans_brelse(tp, lbp);
return 0;
}
@@ -1745,17 +1968,22 @@ xfs_dir2_leaf_search_hash(
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
int mid=0; /* current leaf index */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
leaf = lbp->b_addr;
+ ents = xfs_dir3_leaf_ents_p(leaf);
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+
#ifndef __KERNEL__
- if (!leaf->hdr.count)
+ if (!leafhdr.count)
return 0;
#endif
/*
* Note, the table cannot be empty, so we have to go through the loop.
* Binary search the leaf entries looking for our hash value.
*/
- for (lep = leaf->ents, low = 0, high = be16_to_cpu(leaf->hdr.count) - 1,
+ for (lep = ents, low = 0, high = leafhdr.count - 1,
hashwant = args->hashval;
low <= high; ) {
mid = (low + high) >> 1;
@@ -1807,7 +2035,7 @@ xfs_dir2_leaf_trim_data(
/*
* Read the offending data block. We need its buffer.
*/
- error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
if (error)
return error;
@@ -1817,10 +2045,12 @@ xfs_dir2_leaf_trim_data(
#ifdef DEBUG
{
struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
+ struct xfs_dir2_data_free *bf = xfs_dir3_data_bestfree_p(hdr);
- ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
- ASSERT(be16_to_cpu(hdr->bestfree[0].length) ==
- mp->m_dirblksize - (uint)sizeof(*hdr));
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+ ASSERT(be16_to_cpu(bf[0].length) ==
+ mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr));
ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
}
#endif
@@ -1839,23 +2069,29 @@ xfs_dir2_leaf_trim_data(
bestsp = xfs_dir2_leaf_bests_p(ltp);
be32_add_cpu(&ltp->bestcount, -1);
memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
- xfs_dir2_leaf_log_tail(tp, lbp);
- xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(tp, lbp);
+ xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
return 0;
}
static inline size_t
-xfs_dir2_leaf_size(
- struct xfs_dir2_leaf_hdr *hdr,
+xfs_dir3_leaf_size(
+ struct xfs_dir3_icleaf_hdr *hdr,
int counts)
{
- int entries;
+ int entries;
+ int hdrsize;
+
+ entries = hdr->count - hdr->stale;
+ if (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+ hdr->magic == XFS_DIR2_LEAFN_MAGIC)
+ hdrsize = sizeof(struct xfs_dir2_leaf_hdr);
+ else
+ hdrsize = sizeof(struct xfs_dir3_leaf_hdr);
- entries = be16_to_cpu(hdr->count) - be16_to_cpu(hdr->stale);
- return sizeof(xfs_dir2_leaf_hdr_t) +
- entries * sizeof(xfs_dir2_leaf_entry_t) +
- counts * sizeof(xfs_dir2_data_off_t) +
- sizeof(xfs_dir2_leaf_tail_t);
+ return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t)
+ + counts * sizeof(xfs_dir2_data_off_t)
+ + sizeof(xfs_dir2_leaf_tail_t);
}
/*
@@ -1879,6 +2115,8 @@ xfs_dir2_node_to_leaf(
xfs_mount_t *mp; /* filesystem mount point */
int rval; /* successful free trim? */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir3_icfree_hdr freehdr;
/*
* There's more than a leaf level in the btree, so there must
@@ -1928,7 +2166,11 @@ xfs_dir2_node_to_leaf(
return 0;
lbp = state->path.blk[0].bp;
leaf = lbp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+
/*
* Read the freespace block.
*/
@@ -1936,44 +2178,49 @@ xfs_dir2_node_to_leaf(
if (error)
return error;
free = fbp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
- ASSERT(!free->hdr.firstdb);
+ xfs_dir3_free_hdr_from_disk(&freehdr, free);
+
+ ASSERT(!freehdr.firstdb);
/*
* Now see if the leafn and free data will fit in a leaf1.
* If not, release the buffer and give up.
*/
- if (xfs_dir2_leaf_size(&leaf->hdr, be32_to_cpu(free->hdr.nvalid)) >
- mp->m_dirblksize) {
+ if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > mp->m_dirblksize) {
xfs_trans_brelse(tp, fbp);
return 0;
}
/*
* If the leaf has any stale entries in it, compress them out.
- * The compact routine will log the header.
*/
- if (be16_to_cpu(leaf->hdr.stale))
- xfs_dir2_leaf_compact(args, lbp);
- else
- xfs_dir2_leaf_log_header(tp, lbp);
+ if (leafhdr.stale)
+ xfs_dir3_leaf_compact(args, &leafhdr, lbp);
- lbp->b_ops = &xfs_dir2_leaf1_buf_ops;
- leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
+ lbp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF);
+ leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC)
+ ? XFS_DIR2_LEAF1_MAGIC
+ : XFS_DIR3_LEAF1_MAGIC;
/*
* Set up the leaf tail from the freespace block.
*/
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
- ltp->bestcount = free->hdr.nvalid;
+ ltp->bestcount = cpu_to_be32(freehdr.nvalid);
+
/*
* Set up the leaf bests table.
*/
- memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests,
- be32_to_cpu(ltp->bestcount) * sizeof(xfs_dir2_data_off_t));
- xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
- xfs_dir2_leaf_log_tail(tp, lbp);
- xfs_dir2_leaf_check(dp, lbp);
+ memcpy(xfs_dir2_leaf_bests_p(ltp), xfs_dir3_free_bests_p(mp, free),
+ freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
+
+ xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(tp, lbp);
+ xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(tp, lbp);
+ xfs_dir3_leaf_check(mp, lbp);
+
/*
* Get rid of the freespace block.
*/
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 5980f9b7fa9b..2226a00acd15 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -32,20 +33,14 @@
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
/*
* Function declarations.
*/
static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
int index);
-#ifdef DEBUG
-static void xfs_dir2_leafn_check(struct xfs_inode *dp, struct xfs_buf *bp);
-#else
-#define xfs_dir2_leafn_check(dp, bp)
-#endif
-static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, struct xfs_buf *bp_s,
- int start_s, struct xfs_buf *bp_d,
- int start_d, int count);
static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
xfs_da_state_blk_t *blk1,
xfs_da_state_blk_t *blk2);
@@ -55,52 +50,126 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
xfs_da_state_blk_t *fblk);
-static void
-xfs_dir2_free_verify(
+/*
+ * Check internal consistency of a leafn block.
+ */
+#ifdef DEBUG
+#define xfs_dir3_leaf_check(mp, bp) \
+do { \
+ if (!xfs_dir3_leafn_check((mp), (bp))) \
+ ASSERT(0); \
+} while (0);
+
+static bool
+xfs_dir3_leafn_check(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+
+ if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+ if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+ return false;
+ } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
+ return false;
+
+ return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
+}
+#else
+#define xfs_dir3_leaf_check(mp, bp)
+#endif
+
+static bool
+xfs_dir3_free_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_free_hdr *hdr = bp->b_addr;
- int block_ok = 0;
- block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC);
- if (!block_ok) {
- XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic",
- XFS_ERRLEVEL_LOW, mp, hdr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
+ return false;
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
+ return false;
}
+
+ /* XXX: should bounds check the xfs_dir3_icfree_hdr here */
+
+ return true;
}
static void
-xfs_dir2_free_read_verify(
+xfs_dir3_free_read_verify(
struct xfs_buf *bp)
{
- xfs_dir2_free_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if ((xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ XFS_DIR3_FREE_CRC_OFF)) ||
+ !xfs_dir3_free_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
}
static void
-xfs_dir2_free_write_verify(
+xfs_dir3_free_write_verify(
struct xfs_buf *bp)
{
- xfs_dir2_free_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_dir3_free_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_FREE_CRC_OFF);
}
-static const struct xfs_buf_ops xfs_dir2_free_buf_ops = {
- .verify_read = xfs_dir2_free_read_verify,
- .verify_write = xfs_dir2_free_write_verify,
+const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+ .verify_read = xfs_dir3_free_read_verify,
+ .verify_write = xfs_dir3_free_write_verify,
};
static int
-__xfs_dir2_free_read(
+__xfs_dir3_free_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t fbno,
xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
- return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
- XFS_DATA_FORK, &xfs_dir2_free_buf_ops);
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
+
+ /* try read returns without an error or *bpp if it lands in a hole */
+ if (!err && tp && *bpp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
+ return err;
}
int
@@ -110,7 +179,7 @@ xfs_dir2_free_read(
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
- return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp);
+ return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp);
}
static int
@@ -120,7 +189,96 @@ xfs_dir2_free_try_read(
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
- return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp);
+ return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
+}
+
+
+void
+xfs_dir3_free_hdr_from_disk(
+ struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from)
+{
+ if (from->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)) {
+ to->magic = be32_to_cpu(from->hdr.magic);
+ to->firstdb = be32_to_cpu(from->hdr.firstdb);
+ to->nvalid = be32_to_cpu(from->hdr.nvalid);
+ to->nused = be32_to_cpu(from->hdr.nused);
+ } else {
+ struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
+
+ to->magic = be32_to_cpu(hdr3->hdr.magic);
+ to->firstdb = be32_to_cpu(hdr3->firstdb);
+ to->nvalid = be32_to_cpu(hdr3->nvalid);
+ to->nused = be32_to_cpu(hdr3->nused);
+ }
+
+ ASSERT(to->magic == XFS_DIR2_FREE_MAGIC ||
+ to->magic == XFS_DIR3_FREE_MAGIC);
+}
+
+static void
+xfs_dir3_free_hdr_to_disk(
+ struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from)
+{
+ ASSERT(from->magic == XFS_DIR2_FREE_MAGIC ||
+ from->magic == XFS_DIR3_FREE_MAGIC);
+
+ if (from->magic == XFS_DIR2_FREE_MAGIC) {
+ to->hdr.magic = cpu_to_be32(from->magic);
+ to->hdr.firstdb = cpu_to_be32(from->firstdb);
+ to->hdr.nvalid = cpu_to_be32(from->nvalid);
+ to->hdr.nused = cpu_to_be32(from->nused);
+ } else {
+ struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
+
+ hdr3->hdr.magic = cpu_to_be32(from->magic);
+ hdr3->firstdb = cpu_to_be32(from->firstdb);
+ hdr3->nvalid = cpu_to_be32(from->nvalid);
+ hdr3->nused = cpu_to_be32(from->nused);
+ }
+}
+
+static int
+xfs_dir3_free_get_buf(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dir2_db_t fbno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
+ struct xfs_dir3_icfree_hdr hdr;
+
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno),
+ -1, &bp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF);
+ bp->b_ops = &xfs_dir3_free_buf_ops;
+
+ /*
+ * Initialize the new block to be empty, and remember
+ * its first slot as our empty slot.
+ */
+ memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
+ memset(&hdr, 0, sizeof(hdr));
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+ hdr.magic = XFS_DIR3_FREE_MAGIC;
+
+ hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
+ hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
+ } else
+ hdr.magic = XFS_DIR2_FREE_MAGIC;
+ xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr);
+ *bpp = bp;
+ return 0;
}
/*
@@ -134,13 +292,16 @@ xfs_dir2_free_log_bests(
int last) /* last entry to log */
{
xfs_dir2_free_t *free; /* freespace structure */
+ __be16 *bests;
free = bp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
+ bests = xfs_dir3_free_bests_p(tp->t_mountp, free);
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
xfs_trans_log_buf(tp, bp,
- (uint)((char *)&free->bests[first] - (char *)free),
- (uint)((char *)&free->bests[last] - (char *)free +
- sizeof(free->bests[0]) - 1));
+ (uint)((char *)&bests[first] - (char *)free),
+ (uint)((char *)&bests[last] - (char *)free +
+ sizeof(bests[0]) - 1));
}
/*
@@ -154,9 +315,9 @@ xfs_dir2_free_log_header(
xfs_dir2_free_t *free; /* freespace structure */
free = bp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
- xfs_trans_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free),
- (uint)(sizeof(xfs_dir2_free_hdr_t) - 1));
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+ xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1);
}
/*
@@ -183,6 +344,7 @@ xfs_dir2_leaf_to_node(
xfs_dir2_data_off_t off; /* freespace entry value */
__be16 *to; /* pointer to freespace entry */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icfree_hdr freehdr;
trace_xfs_dir2_leaf_to_node(args);
@@ -199,44 +361,53 @@ xfs_dir2_leaf_to_node(
/*
* Get the buffer for the new freespace block.
*/
- error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
- XFS_DATA_FORK);
+ error = xfs_dir3_free_get_buf(tp, dp, fdb, &fbp);
if (error)
return error;
- fbp->b_ops = &xfs_dir2_free_buf_ops;
free = fbp->b_addr;
+ xfs_dir3_free_hdr_from_disk(&freehdr, free);
leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
- /*
- * Initialize the freespace block header.
- */
- free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC);
- free->hdr.firstdb = 0;
- ASSERT(be32_to_cpu(ltp->bestcount) <= (uint)dp->i_d.di_size / mp->m_dirblksize);
- free->hdr.nvalid = ltp->bestcount;
+ ASSERT(be32_to_cpu(ltp->bestcount) <=
+ (uint)dp->i_d.di_size / mp->m_dirblksize);
+
/*
* Copy freespace entries from the leaf block to the new block.
* Count active entries.
*/
- for (i = n = 0, from = xfs_dir2_leaf_bests_p(ltp), to = free->bests;
- i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
+ from = xfs_dir2_leaf_bests_p(ltp);
+ to = xfs_dir3_free_bests_p(mp, free);
+ for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
n++;
*to = cpu_to_be16(off);
}
- free->hdr.nused = cpu_to_be32(n);
-
- lbp->b_ops = &xfs_dir2_leafn_buf_ops;
- leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
/*
- * Log everything.
+ * Now initialize the freespace block header.
*/
- xfs_dir2_leaf_log_header(tp, lbp);
+ freehdr.nused = n;
+ freehdr.nvalid = be32_to_cpu(ltp->bestcount);
+
+ xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr);
+ xfs_dir2_free_log_bests(tp, fbp, 0, freehdr.nvalid - 1);
xfs_dir2_free_log_header(tp, fbp);
- xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1);
- xfs_dir2_leafn_check(dp, lbp);
+
+ /*
+ * Converting the leaf to a leafnode is just a matter of changing the
+ * magic number and the ops. Do the change directly to the buffer as
+ * it's less work (and less code) than decoding the header to host
+ * format and back again.
+ */
+ if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC))
+ leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+ else
+ leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+ lbp->b_ops = &xfs_dir3_leafn_buf_ops;
+ xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
+ xfs_dir3_leaf_log_header(tp, lbp);
+ xfs_dir3_leaf_check(mp, lbp);
return 0;
}
@@ -260,6 +431,8 @@ xfs_dir2_leafn_add(
int lowstale; /* previous stale entry */
xfs_mount_t *mp; /* filesystem mount point */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
trace_xfs_dir2_leafn_add(args, index);
@@ -267,6 +440,8 @@ xfs_dir2_leafn_add(
mp = dp->i_mount;
tp = args->trans;
leaf = bp->b_addr;
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = xfs_dir3_leaf_ents_p(leaf);
/*
* Quick check just to make sure we are not going to index
@@ -282,15 +457,15 @@ xfs_dir2_leafn_add(
* a compact.
*/
- if (be16_to_cpu(leaf->hdr.count) == xfs_dir2_max_leaf_ents(mp)) {
- if (!leaf->hdr.stale)
+ if (leafhdr.count == xfs_dir3_max_leaf_ents(mp, leaf)) {
+ if (!leafhdr.stale)
return XFS_ERROR(ENOSPC);
- compact = be16_to_cpu(leaf->hdr.stale) > 1;
+ compact = leafhdr.stale > 1;
} else
compact = 0;
- ASSERT(index == 0 || be32_to_cpu(leaf->ents[index - 1].hashval) <= args->hashval);
- ASSERT(index == be16_to_cpu(leaf->hdr.count) ||
- be32_to_cpu(leaf->ents[index].hashval) >= args->hashval);
+ ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval);
+ ASSERT(index == leafhdr.count ||
+ be32_to_cpu(ents[index].hashval) >= args->hashval);
if (args->op_flags & XFS_DA_OP_JUSTCHECK)
return 0;
@@ -299,61 +474,51 @@ xfs_dir2_leafn_add(
* Compact out all but one stale leaf entry. Leaves behind
* the entry closest to index.
*/
- if (compact) {
- xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale,
- &lfloglow, &lfloghigh);
- }
- /*
- * Set impossible logging indices for this case.
- */
- else if (leaf->hdr.stale) {
- lfloglow = be16_to_cpu(leaf->hdr.count);
+ if (compact)
+ xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+ &highstale, &lfloglow, &lfloghigh);
+ else if (leafhdr.stale) {
+ /*
+ * Set impossible logging indices for this case.
+ */
+ lfloglow = leafhdr.count;
lfloghigh = -1;
}
/*
* Insert the new entry, log everything.
*/
- lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale,
+ lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
highstale, &lfloglow, &lfloghigh);
lep->hashval = cpu_to_be32(args->hashval);
lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp,
args->blkno, args->index));
- xfs_dir2_leaf_log_header(tp, bp);
- xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
- xfs_dir2_leafn_check(dp, bp);
+
+ xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(tp, bp);
+ xfs_dir3_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_check(mp, bp);
return 0;
}
#ifdef DEBUG
-/*
- * Check internal consistency of a leafn block.
- */
-void
-xfs_dir2_leafn_check(
- struct xfs_inode *dp,
- struct xfs_buf *bp)
+static void
+xfs_dir2_free_hdr_check(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_dir2_db_t db)
{
- int i; /* leaf index */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_mount_t *mp; /* filesystem mount point */
- int stale; /* count of stale leaves */
+ struct xfs_dir3_icfree_hdr hdr;
- leaf = bp->b_addr;
- mp = dp->i_mount;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp));
- for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) {
- if (i + 1 < be16_to_cpu(leaf->hdr.count)) {
- ASSERT(be32_to_cpu(leaf->ents[i].hashval) <=
- be32_to_cpu(leaf->ents[i + 1].hashval));
- }
- if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
- stale++;
- }
- ASSERT(be16_to_cpu(leaf->hdr.stale) == stale);
+ xfs_dir3_free_hdr_from_disk(&hdr, bp->b_addr);
+
+ ASSERT((hdr.firstdb % xfs_dir3_free_max_bests(mp)) == 0);
+ ASSERT(hdr.firstdb <= db);
+ ASSERT(db < hdr.firstdb + hdr.nvalid);
}
+#else
+#define xfs_dir2_free_hdr_check(mp, dp, db)
#endif /* DEBUG */
/*
@@ -365,15 +530,22 @@ xfs_dir2_leafn_lasthash(
struct xfs_buf *bp, /* leaf buffer */
int *count) /* count of entries in leaf */
{
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
- leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
if (count)
- *count = be16_to_cpu(leaf->hdr.count);
- if (!leaf->hdr.count)
+ *count = leafhdr.count;
+ if (!leafhdr.count)
return 0;
- return be32_to_cpu(leaf->ents[be16_to_cpu(leaf->hdr.count) - 1].hashval);
+
+ ents = xfs_dir3_leaf_ents_p(leaf);
+ return be32_to_cpu(ents[leafhdr.count - 1].hashval);
}
/*
@@ -402,16 +574,19 @@ xfs_dir2_leafn_lookup_for_addname(
xfs_dir2_db_t newdb; /* new data block number */
xfs_dir2_db_t newfdb; /* new free block number */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-#ifdef __KERNEL__
- ASSERT(be16_to_cpu(leaf->hdr.count) > 0);
-#endif
- xfs_dir2_leafn_check(dp, bp);
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = xfs_dir3_leaf_ents_p(leaf);
+
+ xfs_dir3_leaf_check(mp, bp);
+ ASSERT(leafhdr.count > 0);
+
/*
* Look up the hash value in the leaf entries.
*/
@@ -424,15 +599,16 @@ xfs_dir2_leafn_lookup_for_addname(
curbp = state->extrablk.bp;
curfdb = state->extrablk.blkno;
free = curbp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
}
length = xfs_dir2_data_entsize(args->namelen);
/*
* Loop over leaf entries with the right hash value.
*/
- for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
- be32_to_cpu(lep->hashval) == args->hashval;
- lep++, index++) {
+ for (lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
/*
* Skip stale leaf entries.
*/
@@ -451,6 +627,8 @@ xfs_dir2_leafn_lookup_for_addname(
* in hand, take a look at it.
*/
if (newdb != curdb) {
+ __be16 *bests;
+
curdb = newdb;
/*
* Convert the data block to the free block
@@ -473,13 +651,8 @@ xfs_dir2_leafn_lookup_for_addname(
if (error)
return error;
free = curbp->b_addr;
- ASSERT(be32_to_cpu(free->hdr.magic) ==
- XFS_DIR2_FREE_MAGIC);
- ASSERT((be32_to_cpu(free->hdr.firstdb) %
- xfs_dir2_free_max_bests(mp)) == 0);
- ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb);
- ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) +
- be32_to_cpu(free->hdr.nvalid));
+
+ xfs_dir2_free_hdr_check(mp, curbp, curdb);
}
/*
* Get the index for our entry.
@@ -488,8 +661,8 @@ xfs_dir2_leafn_lookup_for_addname(
/*
* If it has room, return it.
*/
- if (unlikely(free->bests[fi] ==
- cpu_to_be16(NULLDATAOFF))) {
+ bests = xfs_dir3_free_bests_p(mp, free);
+ if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
XFS_ERRLEVEL_LOW, mp);
if (curfdb != newfdb)
@@ -497,7 +670,7 @@ xfs_dir2_leafn_lookup_for_addname(
return XFS_ERROR(EFSCORRUPTED);
}
curfdb = newfdb;
- if (be16_to_cpu(free->bests[fi]) >= length)
+ if (be16_to_cpu(bests[fi]) >= length)
goto out;
}
}
@@ -511,6 +684,12 @@ out:
state->extrablk.bp = curbp;
state->extrablk.index = fi;
state->extrablk.blkno = curfdb;
+
+ /*
+ * Important: this magic number is not in the buffer - it's for
+ * buffer type information and therefore only the free/data type
+ * matters here, not whether CRCs are enabled or not.
+ */
state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
} else {
state->extravalid = 0;
@@ -545,16 +724,19 @@ xfs_dir2_leafn_lookup_for_entry(
xfs_dir2_db_t newdb; /* new data block number */
xfs_trans_t *tp; /* transaction pointer */
enum xfs_dacmp cmp; /* comparison result */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-#ifdef __KERNEL__
- ASSERT(be16_to_cpu(leaf->hdr.count) > 0);
-#endif
- xfs_dir2_leafn_check(dp, bp);
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = xfs_dir3_leaf_ents_p(leaf);
+
+ xfs_dir3_leaf_check(mp, bp);
+ ASSERT(leafhdr.count > 0);
+
/*
* Look up the hash value in the leaf entries.
*/
@@ -569,9 +751,9 @@ xfs_dir2_leafn_lookup_for_entry(
/*
* Loop over leaf entries with the right hash value.
*/
- for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
- be32_to_cpu(lep->hashval) == args->hashval;
- lep++, index++) {
+ for (lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
/*
* Skip stale leaf entries.
*/
@@ -604,13 +786,13 @@ xfs_dir2_leafn_lookup_for_entry(
ASSERT(state->extravalid);
curbp = state->extrablk.bp;
} else {
- error = xfs_dir2_data_read(tp, dp,
+ error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(mp, newdb),
-1, &curbp);
if (error)
return error;
}
- xfs_dir2_data_check(dp, curbp);
+ xfs_dir3_data_check(dp, curbp);
curdb = newdb;
}
/*
@@ -638,13 +820,13 @@ xfs_dir2_leafn_lookup_for_entry(
state->extrablk.index = (int)((char *)dep -
(char *)curbp->b_addr);
state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
- curbp->b_ops = &xfs_dir2_data_buf_ops;
+ curbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
if (cmp == XFS_CMP_EXACT)
return XFS_ERROR(EEXIST);
}
}
- ASSERT(index == be16_to_cpu(leaf->hdr.count) ||
- (args->op_flags & XFS_DA_OP_OKNOENT));
+ ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
if (curbp) {
if (args->cmpresult == XFS_CMP_DIFFERENT) {
/* Giving back last used data block. */
@@ -653,7 +835,8 @@ xfs_dir2_leafn_lookup_for_entry(
state->extrablk.index = -1;
state->extrablk.blkno = curdb;
state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
- curbp->b_ops = &xfs_dir2_data_buf_ops;
+ curbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
} else {
/* If the curbp is not the CI match block, drop it */
if (state->extrablk.bp != curbp)
@@ -689,52 +872,50 @@ xfs_dir2_leafn_lookup_int(
* Log entries and headers. Stale entries are preserved.
*/
static void
-xfs_dir2_leafn_moveents(
- xfs_da_args_t *args, /* operation arguments */
- struct xfs_buf *bp_s, /* source leaf buffer */
- int start_s, /* source leaf index */
- struct xfs_buf *bp_d, /* destination leaf buffer */
- int start_d, /* destination leaf index */
- int count) /* count of leaves to copy */
+xfs_dir3_leafn_moveents(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *bp_s, /* source */
+ struct xfs_dir3_icleaf_hdr *shdr,
+ struct xfs_dir2_leaf_entry *sents,
+ int start_s,/* source leaf index */
+ struct xfs_buf *bp_d, /* destination */
+ struct xfs_dir3_icleaf_hdr *dhdr,
+ struct xfs_dir2_leaf_entry *dents,
+ int start_d,/* destination leaf index */
+ int count) /* count of leaves to copy */
{
- xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */
- xfs_dir2_leaf_t *leaf_s; /* source leaf structure */
- int stale; /* count stale leaves copied */
- xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_trans *tp = args->trans;
+ int stale; /* count stale leaves copied */
trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
/*
* Silently return if nothing to do.
*/
- if (count == 0) {
+ if (count == 0)
return;
- }
- tp = args->trans;
- leaf_s = bp_s->b_addr;
- leaf_d = bp_d->b_addr;
+
/*
* If the destination index is not the end of the current
* destination leaf entries, open up a hole in the destination
* to hold the new entries.
*/
- if (start_d < be16_to_cpu(leaf_d->hdr.count)) {
- memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d],
- (be16_to_cpu(leaf_d->hdr.count) - start_d) *
- sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count,
- count + be16_to_cpu(leaf_d->hdr.count) - 1);
+ if (start_d < dhdr->count) {
+ memmove(&dents[start_d + count], &dents[start_d],
+ (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir3_leaf_log_ents(tp, bp_d, start_d + count,
+ count + dhdr->count - 1);
}
/*
* If the source has stale leaves, count the ones in the copy range
* so we can update the header correctly.
*/
- if (leaf_s->hdr.stale) {
+ if (shdr->stale) {
int i; /* temp leaf index */
for (i = start_s, stale = 0; i < start_s + count; i++) {
- if (leaf_s->ents[i].address ==
- cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ if (sents[i].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
}
} else
@@ -742,29 +923,27 @@ xfs_dir2_leafn_moveents(
/*
* Copy the leaf entries from source to destination.
*/
- memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s],
+ memcpy(&dents[start_d], &sents[start_s],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1);
+ xfs_dir3_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1);
+
/*
* If there are source entries after the ones we copied,
* delete the ones we copied by sliding the next ones down.
*/
- if (start_s + count < be16_to_cpu(leaf_s->hdr.count)) {
- memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count],
+ if (start_s + count < shdr->count) {
+ memmove(&sents[start_s], &sents[start_s + count],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1);
+ xfs_dir3_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1);
}
+
/*
* Update the headers and log them.
*/
- be16_add_cpu(&leaf_s->hdr.count, -(count));
- be16_add_cpu(&leaf_s->hdr.stale, -(stale));
- be16_add_cpu(&leaf_d->hdr.count, count);
- be16_add_cpu(&leaf_d->hdr.stale, stale);
- xfs_dir2_leaf_log_header(tp, bp_s);
- xfs_dir2_leaf_log_header(tp, bp_d);
- xfs_dir2_leafn_check(args->dp, bp_s);
- xfs_dir2_leafn_check(args->dp, bp_d);
+ shdr->count -= count;
+ shdr->stale -= stale;
+ dhdr->count += count;
+ dhdr->stale += stale;
}
/*
@@ -773,21 +952,25 @@ xfs_dir2_leafn_moveents(
*/
int /* sort order */
xfs_dir2_leafn_order(
- struct xfs_buf *leaf1_bp, /* leaf1 buffer */
- struct xfs_buf *leaf2_bp) /* leaf2 buffer */
+ struct xfs_buf *leaf1_bp, /* leaf1 buffer */
+ struct xfs_buf *leaf2_bp) /* leaf2 buffer */
{
- xfs_dir2_leaf_t *leaf1; /* leaf1 structure */
- xfs_dir2_leaf_t *leaf2; /* leaf2 structure */
-
- leaf1 = leaf1_bp->b_addr;
- leaf2 = leaf2_bp->b_addr;
- ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- if (be16_to_cpu(leaf1->hdr.count) > 0 &&
- be16_to_cpu(leaf2->hdr.count) > 0 &&
- (be32_to_cpu(leaf2->ents[0].hashval) < be32_to_cpu(leaf1->ents[0].hashval) ||
- be32_to_cpu(leaf2->ents[be16_to_cpu(leaf2->hdr.count) - 1].hashval) <
- be32_to_cpu(leaf1->ents[be16_to_cpu(leaf1->hdr.count) - 1].hashval)))
+ struct xfs_dir2_leaf *leaf1 = leaf1_bp->b_addr;
+ struct xfs_dir2_leaf *leaf2 = leaf2_bp->b_addr;
+ struct xfs_dir2_leaf_entry *ents1;
+ struct xfs_dir2_leaf_entry *ents2;
+ struct xfs_dir3_icleaf_hdr hdr1;
+ struct xfs_dir3_icleaf_hdr hdr2;
+
+ xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1);
+ xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2);
+ ents1 = xfs_dir3_leaf_ents_p(leaf1);
+ ents2 = xfs_dir3_leaf_ents_p(leaf2);
+
+ if (hdr1.count > 0 && hdr2.count > 0 &&
+ (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
+ be32_to_cpu(ents2[hdr2.count - 1].hashval) <
+ be32_to_cpu(ents1[hdr1.count - 1].hashval)))
return 1;
return 0;
}
@@ -811,11 +994,15 @@ xfs_dir2_leafn_rebalance(
xfs_dir2_leaf_t *leaf1; /* first leaf structure */
xfs_dir2_leaf_t *leaf2; /* second leaf structure */
int mid; /* midpoint leaf index */
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
int oldstale; /* old count of stale leaves */
#endif
int oldsum; /* old total leaf count */
int swap; /* swapped leaf blocks */
+ struct xfs_dir2_leaf_entry *ents1;
+ struct xfs_dir2_leaf_entry *ents2;
+ struct xfs_dir3_icleaf_hdr hdr1;
+ struct xfs_dir3_icleaf_hdr hdr2;
args = state->args;
/*
@@ -830,11 +1017,17 @@ xfs_dir2_leafn_rebalance(
}
leaf1 = blk1->bp->b_addr;
leaf2 = blk2->bp->b_addr;
- oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count);
-#ifdef DEBUG
- oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale);
+ xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1);
+ xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2);
+ ents1 = xfs_dir3_leaf_ents_p(leaf1);
+ ents2 = xfs_dir3_leaf_ents_p(leaf2);
+
+ oldsum = hdr1.count + hdr2.count;
+#if defined(DEBUG) || defined(XFS_WARN)
+ oldstale = hdr1.stale + hdr2.stale;
#endif
mid = oldsum >> 1;
+
/*
* If the old leaf count was odd then the new one will be even,
* so we need to divide the new count evenly.
@@ -842,10 +1035,10 @@ xfs_dir2_leafn_rebalance(
if (oldsum & 1) {
xfs_dahash_t midhash; /* middle entry hash value */
- if (mid >= be16_to_cpu(leaf1->hdr.count))
- midhash = be32_to_cpu(leaf2->ents[mid - be16_to_cpu(leaf1->hdr.count)].hashval);
+ if (mid >= hdr1.count)
+ midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval);
else
- midhash = be32_to_cpu(leaf1->ents[mid].hashval);
+ midhash = be32_to_cpu(ents1[mid].hashval);
isleft = args->hashval <= midhash;
}
/*
@@ -859,30 +1052,42 @@ xfs_dir2_leafn_rebalance(
* Calculate moved entry count. Positive means left-to-right,
* negative means right-to-left. Then move the entries.
*/
- count = be16_to_cpu(leaf1->hdr.count) - mid + (isleft == 0);
+ count = hdr1.count - mid + (isleft == 0);
if (count > 0)
- xfs_dir2_leafn_moveents(args, blk1->bp,
- be16_to_cpu(leaf1->hdr.count) - count, blk2->bp, 0, count);
+ xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1,
+ hdr1.count - count, blk2->bp,
+ &hdr2, ents2, 0, count);
else if (count < 0)
- xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp,
- be16_to_cpu(leaf1->hdr.count), count);
- ASSERT(be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count) == oldsum);
- ASSERT(be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale) == oldstale);
+ xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0,
+ blk1->bp, &hdr1, ents1,
+ hdr1.count, count);
+
+ ASSERT(hdr1.count + hdr2.count == oldsum);
+ ASSERT(hdr1.stale + hdr2.stale == oldstale);
+
+ /* log the changes made when moving the entries */
+ xfs_dir3_leaf_hdr_to_disk(leaf1, &hdr1);
+ xfs_dir3_leaf_hdr_to_disk(leaf2, &hdr2);
+ xfs_dir3_leaf_log_header(args->trans, blk1->bp);
+ xfs_dir3_leaf_log_header(args->trans, blk2->bp);
+
+ xfs_dir3_leaf_check(args->dp->i_mount, blk1->bp);
+ xfs_dir3_leaf_check(args->dp->i_mount, blk2->bp);
+
/*
* Mark whether we're inserting into the old or new leaf.
*/
- if (be16_to_cpu(leaf1->hdr.count) < be16_to_cpu(leaf2->hdr.count))
+ if (hdr1.count < hdr2.count)
state->inleaf = swap;
- else if (be16_to_cpu(leaf1->hdr.count) > be16_to_cpu(leaf2->hdr.count))
+ else if (hdr1.count > hdr2.count)
state->inleaf = !swap;
else
- state->inleaf =
- swap ^ (blk1->index <= be16_to_cpu(leaf1->hdr.count));
+ state->inleaf = swap ^ (blk1->index <= hdr1.count);
/*
* Adjust the expected index for insertion.
*/
if (!state->inleaf)
- blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count);
+ blk2->index = blk1->index - hdr1.count;
/*
* Finally sanity check just to make sure we are not returning a
@@ -898,7 +1103,7 @@ xfs_dir2_leafn_rebalance(
}
static int
-xfs_dir2_data_block_free(
+xfs_dir3_data_block_free(
xfs_da_args_t *args,
struct xfs_dir2_data_hdr *hdr,
struct xfs_dir2_free *free,
@@ -909,57 +1114,66 @@ xfs_dir2_data_block_free(
{
struct xfs_trans *tp = args->trans;
int logfree = 0;
+ __be16 *bests;
+ struct xfs_dir3_icfree_hdr freehdr;
- if (!hdr) {
- /* One less used entry in the free table. */
- be32_add_cpu(&free->hdr.nused, -1);
- xfs_dir2_free_log_header(tp, fbp);
+ xfs_dir3_free_hdr_from_disk(&freehdr, free);
+ bests = xfs_dir3_free_bests_p(tp->t_mountp, free);
+ if (hdr) {
/*
- * If this was the last entry in the table, we can trim the
- * table size back. There might be other entries at the end
- * referring to non-existent data blocks, get those too.
+ * Data block is not empty, just set the free entry to the new
+ * value.
*/
- if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
- int i; /* free entry index */
+ bests[findex] = cpu_to_be16(longest);
+ xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+ return 0;
+ }
- for (i = findex - 1; i >= 0; i--) {
- if (free->bests[i] != cpu_to_be16(NULLDATAOFF))
- break;
- }
- free->hdr.nvalid = cpu_to_be32(i + 1);
- logfree = 0;
- } else {
- /* Not the last entry, just punch it out. */
- free->bests[findex] = cpu_to_be16(NULLDATAOFF);
- logfree = 1;
- }
- /*
- * If there are no useful entries left in the block,
- * get rid of the block if we can.
- */
- if (!free->hdr.nused) {
- int error;
+ /* One less used entry in the free table. */
+ freehdr.nused--;
- error = xfs_dir2_shrink_inode(args, fdb, fbp);
- if (error == 0) {
- fbp = NULL;
- logfree = 0;
- } else if (error != ENOSPC || args->total != 0)
- return error;
- /*
- * It's possible to get ENOSPC if there is no
- * space reservation. In this case some one
- * else will eventually get rid of this block.
- */
+ /*
+ * If this was the last entry in the table, we can trim the table size
+ * back. There might be other entries at the end referring to
+ * non-existent data blocks, get those too.
+ */
+ if (findex == freehdr.nvalid - 1) {
+ int i; /* free entry index */
+
+ for (i = findex - 1; i >= 0; i--) {
+ if (bests[i] != cpu_to_be16(NULLDATAOFF))
+ break;
}
+ freehdr.nvalid = i + 1;
+ logfree = 0;
} else {
+ /* Not the last entry, just punch it out. */
+ bests[findex] = cpu_to_be16(NULLDATAOFF);
+ logfree = 1;
+ }
+
+ xfs_dir3_free_hdr_to_disk(free, &freehdr);
+ xfs_dir2_free_log_header(tp, fbp);
+
+ /*
+ * If there are no useful entries left in the block, get rid of the
+ * block if we can.
+ */
+ if (!freehdr.nused) {
+ int error;
+
+ error = xfs_dir2_shrink_inode(args, fdb, fbp);
+ if (error == 0) {
+ fbp = NULL;
+ logfree = 0;
+ } else if (error != ENOSPC || args->total != 0)
+ return error;
/*
- * Data block is not empty, just set the free entry to the new
- * value.
+ * It's possible to get ENOSPC if there is no
+ * space reservation. In this case someone
+ * else will eventually get rid of this block.
*/
- free->bests[findex] = cpu_to_be16(longest);
- logfree = 1;
}
/* Log the free entry that changed, unless we got rid of it. */
@@ -994,6 +1208,9 @@ xfs_dir2_leafn_remove(
int needlog; /* need to log data header */
int needscan; /* need to rescan data frees */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
trace_xfs_dir2_leafn_remove(args, index);
@@ -1001,11 +1218,14 @@ xfs_dir2_leafn_remove(
tp = args->trans;
mp = dp->i_mount;
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = xfs_dir3_leaf_ents_p(leaf);
+
/*
* Point to the entry we're removing.
*/
- lep = &leaf->ents[index];
+ lep = &ents[index];
+
/*
* Extract the data block and offset from the entry.
*/
@@ -1013,14 +1233,18 @@ xfs_dir2_leafn_remove(
ASSERT(dblk->blkno == db);
off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address));
ASSERT(dblk->index == off);
+
/*
* Kill the leaf entry by marking it stale.
* Log the leaf block changes.
*/
- be16_add_cpu(&leaf->hdr.stale, 1);
- xfs_dir2_leaf_log_header(tp, bp);
+ leafhdr.stale++;
+ xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(tp, bp);
+
lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
- xfs_dir2_leaf_log_ents(tp, bp, index, index);
+ xfs_dir3_leaf_log_ents(tp, bp, index, index);
+
/*
* Make the data entry free. Keep track of the longest freespace
* in the data block in case it changes.
@@ -1028,7 +1252,8 @@ xfs_dir2_leafn_remove(
dbp = dblk->bp;
hdr = dbp->b_addr;
dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
- longest = be16_to_cpu(hdr->bestfree[0].length);
+ bf = xfs_dir3_data_bestfree_p(hdr);
+ longest = be16_to_cpu(bf[0].length);
needlog = needscan = 0;
xfs_dir2_data_make_free(tp, dbp, off,
xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
@@ -1040,12 +1265,12 @@ xfs_dir2_leafn_remove(
xfs_dir2_data_freescan(mp, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(tp, dbp);
- xfs_dir2_data_check(dp, dbp);
+ xfs_dir3_data_check(dp, dbp);
/*
* If the longest data block freespace changes, need to update
* the corresponding freeblock entry.
*/
- if (longest < be16_to_cpu(hdr->bestfree[0].length)) {
+ if (longest < be16_to_cpu(bf[0].length)) {
int error; /* error return value */
struct xfs_buf *fbp; /* freeblock buffer */
xfs_dir2_db_t fdb; /* freeblock block number */
@@ -1062,20 +1287,25 @@ xfs_dir2_leafn_remove(
if (error)
return error;
free = fbp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
- ASSERT(be32_to_cpu(free->hdr.firstdb) ==
- xfs_dir2_free_max_bests(mp) *
- (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
+#ifdef DEBUG
+ {
+ struct xfs_dir3_icfree_hdr freehdr;
+ xfs_dir3_free_hdr_from_disk(&freehdr, free);
+ ASSERT(freehdr.firstdb == xfs_dir3_free_max_bests(mp) *
+ (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
+ }
+#endif
/*
* Calculate which entry we need to fix.
*/
findex = xfs_dir2_db_to_fdindex(mp, db);
- longest = be16_to_cpu(hdr->bestfree[0].length);
+ longest = be16_to_cpu(bf[0].length);
/*
* If the data block is now empty we can get rid of it
* (usually).
*/
- if (longest == mp->m_dirblksize - (uint)sizeof(*hdr)) {
+ if (longest == mp->m_dirblksize -
+ xfs_dir3_data_entry_offset(hdr)) {
/*
* Try to punch out the data block.
*/
@@ -1096,21 +1326,19 @@ xfs_dir2_leafn_remove(
* If we got rid of the data block, we can eliminate that entry
* in the free block.
*/
- error = xfs_dir2_data_block_free(args, hdr, free,
+ error = xfs_dir3_data_block_free(args, hdr, free,
fdb, findex, fbp, longest);
if (error)
return error;
}
- xfs_dir2_leafn_check(dp, bp);
+ xfs_dir3_leaf_check(mp, bp);
/*
* Return indication of whether this leaf block is empty enough
* to justify trying to join it with a neighbor.
*/
- *rval =
- ((uint)sizeof(leaf->hdr) +
- (uint)sizeof(leaf->ents[0]) *
- (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale))) <
+ *rval = (xfs_dir3_leaf_hdr_size(leaf) +
+ (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
mp->m_dir_magicpct;
return 0;
}
@@ -1143,11 +1371,11 @@ xfs_dir2_leafn_split(
/*
* Initialize the new leaf block.
*/
- error = xfs_dir2_leaf_init(args, xfs_dir2_da_to_db(mp, blkno),
- &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
- if (error) {
+ error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(mp, blkno),
+ &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
+ if (error)
return error;
- }
+
newblk->blkno = blkno;
newblk->magic = XFS_DIR2_LEAFN_MAGIC;
/*
@@ -1155,7 +1383,7 @@ xfs_dir2_leafn_split(
* block into the leaves.
*/
xfs_dir2_leafn_rebalance(state, oldblk, newblk);
- error = xfs_da_blk_link(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
if (error) {
return error;
}
@@ -1171,8 +1399,8 @@ xfs_dir2_leafn_split(
*/
oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL);
newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL);
- xfs_dir2_leafn_check(args->dp, oldblk->bp);
- xfs_dir2_leafn_check(args->dp, newblk->bp);
+ xfs_dir3_leaf_check(mp, oldblk->bp);
+ xfs_dir3_leaf_check(mp, newblk->bp);
return error;
}
@@ -1198,9 +1426,10 @@ xfs_dir2_leafn_toosmall(
int error; /* error return value */
int forward; /* sibling block direction */
int i; /* sibling counter */
- xfs_da_blkinfo_t *info; /* leaf block header */
xfs_dir2_leaf_t *leaf; /* leaf structure */
int rval; /* result from path_shift */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
/*
* Check for the degenerate case of the block being over 50% full.
@@ -1208,11 +1437,13 @@ xfs_dir2_leafn_toosmall(
* to coalesce with a sibling.
*/
blk = &state->path.blk[state->path.active - 1];
- info = blk->bp->b_addr;
- ASSERT(info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- leaf = (xfs_dir2_leaf_t *)info;
- count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
- bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]);
+ leaf = blk->bp->b_addr;
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = xfs_dir3_leaf_ents_p(leaf);
+ xfs_dir3_leaf_check(state->args->dp->i_mount, blk->bp);
+
+ count = leafhdr.count - leafhdr.stale;
+ bytes = xfs_dir3_leaf_hdr_size(leaf) + count * sizeof(ents[0]);
if (bytes > (state->blocksize >> 1)) {
/*
* Blk over 50%, don't try to join.
@@ -1231,9 +1462,9 @@ xfs_dir2_leafn_toosmall(
* Make altpath point to the block we want to keep and
* path point to the block we want to drop (this one).
*/
- forward = (info->forw != 0);
+ forward = (leafhdr.forw != 0);
memcpy(&state->altpath, &state->path, sizeof(state->path));
- error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+ error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
&rval);
if (error)
return error;
@@ -1247,15 +1478,17 @@ xfs_dir2_leafn_toosmall(
* We prefer coalescing with the lower numbered sibling so as
* to shrink a directory over time.
*/
- forward = be32_to_cpu(info->forw) < be32_to_cpu(info->back);
+ forward = leafhdr.forw < leafhdr.back;
for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
- blkno = forward ? be32_to_cpu(info->forw) : be32_to_cpu(info->back);
+ struct xfs_dir3_icleaf_hdr hdr2;
+
+ blkno = forward ? leafhdr.forw : leafhdr.back;
if (blkno == 0)
continue;
/*
* Read the sibling leaf block.
*/
- error = xfs_dir2_leafn_read(state->args->trans, state->args->dp,
+ error = xfs_dir3_leafn_read(state->args->trans, state->args->dp,
blkno, -1, &bp);
if (error)
return error;
@@ -1263,13 +1496,15 @@ xfs_dir2_leafn_toosmall(
/*
* Count bytes in the two blocks combined.
*/
- leaf = (xfs_dir2_leaf_t *)info;
- count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
+ count = leafhdr.count - leafhdr.stale;
bytes = state->blocksize - (state->blocksize >> 2);
+
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
- bytes -= count * (uint)sizeof(leaf->ents[0]);
+ xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf);
+ ents = xfs_dir3_leaf_ents_p(leaf);
+ count += hdr2.count - hdr2.stale;
+ bytes -= count * sizeof(ents[0]);
+
/*
* Fits with at least 25% to spare.
*/
@@ -1291,10 +1526,10 @@ xfs_dir2_leafn_toosmall(
*/
memcpy(&state->altpath, &state->path, sizeof(state->path));
if (blkno < blk->blkno)
- error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+ error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
&rval);
else
- error = xfs_da_path_shift(state, &state->path, forward, 0,
+ error = xfs_da3_path_shift(state, &state->path, forward, 0,
&rval);
if (error) {
return error;
@@ -1316,34 +1551,53 @@ xfs_dir2_leafn_unbalance(
xfs_da_args_t *args; /* operation arguments */
xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */
xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */
+ struct xfs_dir3_icleaf_hdr savehdr;
+ struct xfs_dir3_icleaf_hdr drophdr;
+ struct xfs_dir2_leaf_entry *sents;
+ struct xfs_dir2_leaf_entry *dents;
args = state->args;
ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
drop_leaf = drop_blk->bp->b_addr;
save_leaf = save_blk->bp->b_addr;
- ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+
+ xfs_dir3_leaf_hdr_from_disk(&savehdr, save_leaf);
+ xfs_dir3_leaf_hdr_from_disk(&drophdr, drop_leaf);
+ sents = xfs_dir3_leaf_ents_p(save_leaf);
+ dents = xfs_dir3_leaf_ents_p(drop_leaf);
+
/*
* If there are any stale leaf entries, take this opportunity
* to purge them.
*/
- if (drop_leaf->hdr.stale)
- xfs_dir2_leaf_compact(args, drop_blk->bp);
- if (save_leaf->hdr.stale)
- xfs_dir2_leaf_compact(args, save_blk->bp);
+ if (drophdr.stale)
+ xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp);
+ if (savehdr.stale)
+ xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp);
+
/*
* Move the entries from drop to the appropriate end of save.
*/
- drop_blk->hashval = be32_to_cpu(drop_leaf->ents[be16_to_cpu(drop_leaf->hdr.count) - 1].hashval);
+ drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp))
- xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0,
- be16_to_cpu(drop_leaf->hdr.count));
+ xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+ save_blk->bp, &savehdr, sents, 0,
+ drophdr.count);
else
- xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp,
- be16_to_cpu(save_leaf->hdr.count), be16_to_cpu(drop_leaf->hdr.count));
- save_blk->hashval = be32_to_cpu(save_leaf->ents[be16_to_cpu(save_leaf->hdr.count) - 1].hashval);
- xfs_dir2_leafn_check(args->dp, save_blk->bp);
+ xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+ save_blk->bp, &savehdr, sents,
+ savehdr.count, drophdr.count);
+ save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
+
+ /* log the changes made when moving the entries */
+ xfs_dir3_leaf_hdr_to_disk(save_leaf, &savehdr);
+ xfs_dir3_leaf_hdr_to_disk(drop_leaf, &drophdr);
+ xfs_dir3_leaf_log_header(args->trans, save_blk->bp);
+ xfs_dir3_leaf_log_header(args->trans, drop_blk->bp);
+
+ xfs_dir3_leaf_check(args->dp->i_mount, save_blk->bp);
+ xfs_dir3_leaf_check(args->dp->i_mount, drop_blk->bp);
}
/*
@@ -1372,7 +1626,7 @@ xfs_dir2_node_addname(
* Look up the name. We're not supposed to find it, but
* this gives us the insertion point.
*/
- error = xfs_da_node_lookup_int(state, &rval);
+ error = xfs_da3_node_lookup_int(state, &rval);
if (error)
rval = error;
if (rval != ENOENT) {
@@ -1398,7 +1652,7 @@ xfs_dir2_node_addname(
* It worked, fix the hash values up the btree.
*/
if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
- xfs_da_fixhashpath(state, &state->path);
+ xfs_da3_fixhashpath(state, &state->path);
} else {
/*
* It didn't work, we need to split the leaf block.
@@ -1410,7 +1664,7 @@ xfs_dir2_node_addname(
/*
* Split the leaf block and insert the new entry.
*/
- rval = xfs_da_split(state);
+ rval = xfs_da3_split(state);
}
done:
xfs_da_state_free(state);
@@ -1447,6 +1701,9 @@ xfs_dir2_node_addname_int(
int needscan; /* need to rescan data frees */
__be16 *tagp; /* data entry tag pointer */
xfs_trans_t *tp; /* transaction pointer */
+ __be16 *bests;
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_dir2_data_free *bf;
dp = args->dp;
mp = dp->i_mount;
@@ -1464,36 +1721,37 @@ xfs_dir2_node_addname_int(
*/
ifbno = fblk->blkno;
free = fbp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
findex = fblk->index;
+ bests = xfs_dir3_free_bests_p(mp, free);
+ xfs_dir3_free_hdr_from_disk(&freehdr, free);
+
/*
* This means the free entry showed that the data block had
* space for our entry, so we remembered it.
* Use that data block.
*/
if (findex >= 0) {
- ASSERT(findex < be32_to_cpu(free->hdr.nvalid));
- ASSERT(be16_to_cpu(free->bests[findex]) != NULLDATAOFF);
- ASSERT(be16_to_cpu(free->bests[findex]) >= length);
- dbno = be32_to_cpu(free->hdr.firstdb) + findex;
- }
- /*
- * The data block looked at didn't have enough room.
- * We'll start at the beginning of the freespace entries.
- */
- else {
+ ASSERT(findex < freehdr.nvalid);
+ ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
+ ASSERT(be16_to_cpu(bests[findex]) >= length);
+ dbno = freehdr.firstdb + findex;
+ } else {
+ /*
+ * The data block looked at didn't have enough room.
+ * We'll start at the beginning of the freespace entries.
+ */
dbno = -1;
findex = 0;
}
- }
- /*
- * Didn't come in with a freespace block, so don't have a data block.
- */
- else {
+ } else {
+ /*
+ * Didn't come in with a freespace block, so no data block.
+ */
ifbno = dbno = -1;
fbp = NULL;
findex = 0;
}
+
/*
* If we don't have a data block yet, we're going to scan the
* freespace blocks looking for one. Figure out what the
@@ -1547,20 +1805,26 @@ xfs_dir2_node_addname_int(
if (!fbp)
continue;
free = fbp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
findex = 0;
}
/*
* Look at the current free entry. Is it good enough?
+ *
+ * The bests initialisation should be where the bufer is read in
+ * the above branch. But gcc is too stupid to realise that bests
+ * and the freehdr are actually initialised if they are placed
+ * there, so we have to do it here to avoid warnings. Blech.
*/
- if (be16_to_cpu(free->bests[findex]) != NULLDATAOFF &&
- be16_to_cpu(free->bests[findex]) >= length)
- dbno = be32_to_cpu(free->hdr.firstdb) + findex;
+ bests = xfs_dir3_free_bests_p(mp, free);
+ xfs_dir3_free_hdr_from_disk(&freehdr, free);
+ if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
+ be16_to_cpu(bests[findex]) >= length)
+ dbno = freehdr.firstdb + findex;
else {
/*
* Are we done with the freeblock?
*/
- if (++findex == be32_to_cpu(free->hdr.nvalid)) {
+ if (++findex == freehdr.nvalid) {
/*
* Drop the block.
*/
@@ -1588,7 +1852,7 @@ xfs_dir2_node_addname_int(
if (unlikely((error = xfs_dir2_grow_inode(args,
XFS_DIR2_DATA_SPACE,
&dbno)) ||
- (error = xfs_dir2_data_init(args, dbno, &dbp))))
+ (error = xfs_dir3_data_init(args, dbno, &dbp))))
return error;
/*
@@ -1614,11 +1878,11 @@ xfs_dir2_node_addname_int(
* If there wasn't a freespace block, the read will
* return a NULL fbp. Allocate and initialize a new one.
*/
- if( fbp == NULL ) {
- if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
- &fbno))) {
+ if (!fbp) {
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
+ &fbno);
+ if (error)
return error;
- }
if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
xfs_alert(mp,
@@ -1646,27 +1910,22 @@ xfs_dir2_node_addname_int(
/*
* Get a buffer for the new block.
*/
- error = xfs_da_get_buf(tp, dp,
- xfs_dir2_db_to_da(mp, fbno),
- -1, &fbp, XFS_DATA_FORK);
+ error = xfs_dir3_free_get_buf(tp, dp, fbno, &fbp);
if (error)
return error;
- fbp->b_ops = &xfs_dir2_free_buf_ops;
+ free = fbp->b_addr;
+ bests = xfs_dir3_free_bests_p(mp, free);
+ xfs_dir3_free_hdr_from_disk(&freehdr, free);
/*
- * Initialize the new block to be empty, and remember
- * its first slot as our empty slot.
+ * Remember the first slot as our empty slot.
*/
- free = fbp->b_addr;
- free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC);
- free->hdr.firstdb = cpu_to_be32(
- (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
- xfs_dir2_free_max_bests(mp));
- free->hdr.nvalid = 0;
- free->hdr.nused = 0;
+ freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
+ xfs_dir3_free_max_bests(mp);
} else {
free = fbp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
+ bests = xfs_dir3_free_bests_p(mp, free);
+ xfs_dir3_free_hdr_from_disk(&freehdr, free);
}
/*
@@ -1677,20 +1936,21 @@ xfs_dir2_node_addname_int(
* If it's after the end of the current entries in the
* freespace block, extend that table.
*/
- if (findex >= be32_to_cpu(free->hdr.nvalid)) {
- ASSERT(findex < xfs_dir2_free_max_bests(mp));
- free->hdr.nvalid = cpu_to_be32(findex + 1);
+ if (findex >= freehdr.nvalid) {
+ ASSERT(findex < xfs_dir3_free_max_bests(mp));
+ freehdr.nvalid = findex + 1;
/*
* Tag new entry so nused will go up.
*/
- free->bests[findex] = cpu_to_be16(NULLDATAOFF);
+ bests[findex] = cpu_to_be16(NULLDATAOFF);
}
/*
* If this entry was for an empty data block
* (this should always be true) then update the header.
*/
- if (free->bests[findex] == cpu_to_be16(NULLDATAOFF)) {
- be32_add_cpu(&free->hdr.nused, 1);
+ if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
+ freehdr.nused++;
+ xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr);
xfs_dir2_free_log_header(tp, fbp);
}
/*
@@ -1699,7 +1959,8 @@ xfs_dir2_node_addname_int(
* change again.
*/
hdr = dbp->b_addr;
- free->bests[findex] = hdr->bestfree[0].length;
+ bf = xfs_dir3_data_bestfree_p(hdr);
+ bests[findex] = bf[0].length;
logfree = 1;
}
/*
@@ -1715,19 +1976,20 @@ xfs_dir2_node_addname_int(
/*
* Read the data block in.
*/
- error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
+ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
-1, &dbp);
if (error)
return error;
hdr = dbp->b_addr;
+ bf = xfs_dir3_data_bestfree_p(hdr);
logfree = 0;
}
- ASSERT(be16_to_cpu(hdr->bestfree[0].length) >= length);
+ ASSERT(be16_to_cpu(bf[0].length) >= length);
/*
* Point to the existing unused space.
*/
dup = (xfs_dir2_data_unused_t *)
- ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset));
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
needscan = needlog = 0;
/*
* Mark the first part of the unused space, inuse for us.
@@ -1758,8 +2020,9 @@ xfs_dir2_node_addname_int(
/*
* If the freespace entry is now wrong, update it.
*/
- if (be16_to_cpu(free->bests[findex]) != be16_to_cpu(hdr->bestfree[0].length)) {
- free->bests[findex] = hdr->bestfree[0].length;
+ bests = xfs_dir3_free_bests_p(mp, free); /* gcc is so stupid */
+ if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
+ bests[findex] = bf[0].length;
logfree = 1;
}
/*
@@ -1777,7 +2040,7 @@ xfs_dir2_node_addname_int(
/*
* Lookup an entry in a node-format directory.
- * All the real work happens in xfs_da_node_lookup_int.
+ * All the real work happens in xfs_da3_node_lookup_int.
* The only real output is the inode number of the entry.
*/
int /* error */
@@ -1802,7 +2065,7 @@ xfs_dir2_node_lookup(
/*
* Fill in the path to the entry in the cursor.
*/
- error = xfs_da_node_lookup_int(state, &rval);
+ error = xfs_da3_node_lookup_int(state, &rval);
if (error)
rval = error;
else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) {
@@ -1857,7 +2120,7 @@ xfs_dir2_node_removename(
/*
* Look up the entry we're deleting, set up the cursor.
*/
- error = xfs_da_node_lookup_int(state, &rval);
+ error = xfs_da3_node_lookup_int(state, &rval);
if (error)
rval = error;
/*
@@ -1881,12 +2144,12 @@ xfs_dir2_node_removename(
/*
* Fix the hash values up the btree.
*/
- xfs_da_fixhashpath(state, &state->path);
+ xfs_da3_fixhashpath(state, &state->path);
/*
* If we need to join leaf blocks, do it.
*/
if (rval && state->path.active > 1)
- error = xfs_da_join(state);
+ error = xfs_da3_join(state);
/*
* If no errors so far, try conversion to leaf format.
*/
@@ -1928,7 +2191,7 @@ xfs_dir2_node_replace(
/*
* Lookup the entry to change in the btree.
*/
- error = xfs_da_node_lookup_int(state, &rval);
+ error = xfs_da3_node_lookup_int(state, &rval);
if (error) {
rval = error;
}
@@ -1937,19 +2200,22 @@ xfs_dir2_node_replace(
* and locked it. But paranoia is good.
*/
if (rval == EEXIST) {
+ struct xfs_dir2_leaf_entry *ents;
/*
* Find the leaf entry.
*/
blk = &state->path.blk[state->path.active - 1];
ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
leaf = blk->bp->b_addr;
- lep = &leaf->ents[blk->index];
+ ents = xfs_dir3_leaf_ents_p(leaf);
+ lep = &ents[blk->index];
ASSERT(state->extravalid);
/*
* Point to the data entry.
*/
hdr = state->extrablk.bp->b_addr;
- ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
dep = (xfs_dir2_data_entry_t *)
((char *)hdr +
xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address)));
@@ -1995,6 +2261,7 @@ xfs_dir2_node_trim_free(
xfs_dir2_free_t *free; /* freespace structure */
xfs_mount_t *mp; /* filesystem mount point */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icfree_hdr freehdr;
dp = args->dp;
mp = dp->i_mount;
@@ -2012,11 +2279,12 @@ xfs_dir2_node_trim_free(
if (!bp)
return 0;
free = bp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
+ xfs_dir3_free_hdr_from_disk(&freehdr, free);
+
/*
* If there are used entries, there's nothing to do.
*/
- if (be32_to_cpu(free->hdr.nused) > 0) {
+ if (freehdr.nused > 0) {
xfs_trans_brelse(tp, bp);
*rvalp = 0;
return 0;
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 7da79f6515fd..0511cda4a712 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -30,11 +30,11 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
const unsigned char *name, int len);
/* xfs_dir2_block.c */
-extern const struct xfs_buf_ops xfs_dir2_block_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
extern int xfs_dir2_block_addname(struct xfs_da_args *args);
-extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
- xfs_off_t *offset, filldir_t filldir);
+extern int xfs_dir2_block_getdents(struct xfs_inode *dp,
+ struct dir_context *ctx);
extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
extern int xfs_dir2_block_removename(struct xfs_da_args *args);
extern int xfs_dir2_block_replace(struct xfs_da_args *args);
@@ -43,17 +43,18 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
/* xfs_dir2_data.c */
#ifdef DEBUG
-#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp);
+#define xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp);
#else
-#define xfs_dir2_data_check(dp,bp)
+#define xfs_dir3_data_check(dp,bp)
#endif
-extern const struct xfs_buf_ops xfs_dir2_data_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
-extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
-extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
-extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
+extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mapped_bno);
extern struct xfs_dir2_data_free *
@@ -61,7 +62,7 @@ xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
struct xfs_dir2_data_unused *dup, int *loghead);
extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
struct xfs_dir2_data_hdr *hdr, int *loghead);
-extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
+extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
struct xfs_buf **bpp);
extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
struct xfs_dir2_data_entry *dep);
@@ -77,24 +78,26 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
/* xfs_dir2_leaf.c */
-extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
-extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
struct xfs_buf *dbp);
extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
-extern void xfs_dir2_leaf_compact(struct xfs_da_args *args,
- struct xfs_buf *bp);
-extern void xfs_dir2_leaf_compact_x1(struct xfs_buf *bp, int *indexp,
+extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
+ struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp);
+extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents, int *indexp,
int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
-extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent,
- size_t bufsize, xfs_off_t *offset, filldir_t filldir);
-extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno,
- struct xfs_buf **bpp, int magic);
-extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
+extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx,
+ size_t bufsize);
+extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
+ struct xfs_buf **bpp, __uint16_t magic);
+extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
int first, int last);
-extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp,
+extern void xfs_dir3_leaf_log_header(struct xfs_trans *tp,
struct xfs_buf *bp);
extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
@@ -104,11 +107,18 @@ extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
struct xfs_buf *lbp, xfs_dir2_db_t db);
extern struct xfs_dir2_leaf_entry *
-xfs_dir2_leaf_find_entry(struct xfs_dir2_leaf *leaf, int index, int compact,
- int lowstale, int highstale,
- int *lfloglow, int *lfloghigh);
+xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents, int index, int compact,
+ int lowstale, int highstale, int *lfloglow, int *lfloghigh);
extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
+extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from);
+extern void xfs_dir3_leaf_hdr_to_disk(struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from);
+extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
+
/* xfs_dir2_node.c */
extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
struct xfs_buf *lbp);
@@ -143,8 +153,7 @@ extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
int size, xfs_dir2_sf_hdr_t *sfhp);
extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
-extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent,
- xfs_off_t *offset, filldir_t filldir);
+extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx);
extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 1b9fc3ec7e4b..97676a347da1 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -278,7 +278,7 @@ xfs_dir2_block_to_sf(
* Set up to loop over the block's entries.
*/
btp = xfs_dir2_block_tail_p(mp, hdr);
- ptr = (char *)(hdr + 1);
+ ptr = (char *)xfs_dir3_data_entry_p(hdr);
endptr = (char *)xfs_dir2_block_leaf_p(btp);
sfep = xfs_dir2_sf_firstentry(sfp);
/*
@@ -535,7 +535,7 @@ xfs_dir2_sf_addname_hard(
* to insert the new entry.
* If it's going to end up at the end then oldsfep will point there.
*/
- for (offset = XFS_DIR2_DATA_FIRST_OFFSET,
+ for (offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount),
oldsfep = xfs_dir2_sf_firstentry(oldsfp),
add_datasize = xfs_dir2_data_entsize(args->namelen),
eof = (char *)oldsfep == &buf[old_isize];
@@ -617,7 +617,7 @@ xfs_dir2_sf_addname_pick(
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
size = xfs_dir2_data_entsize(args->namelen);
- offset = XFS_DIR2_DATA_FIRST_OFFSET;
+ offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
sfep = xfs_dir2_sf_firstentry(sfp);
holefit = 0;
/*
@@ -688,7 +688,7 @@ xfs_dir2_sf_check(
dp = args->dp;
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- offset = XFS_DIR2_DATA_FIRST_OFFSET;
+ offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount);
ino = xfs_dir2_sf_get_parent_ino(sfp);
i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
@@ -768,9 +768,7 @@ xfs_dir2_sf_create(
int /* error */
xfs_dir2_sf_getdents(
xfs_inode_t *dp, /* incore directory inode */
- void *dirent,
- xfs_off_t *offset,
- filldir_t filldir)
+ struct dir_context *ctx)
{
int i; /* shortform entry number */
xfs_mount_t *mp; /* filesystem mount point */
@@ -802,7 +800,7 @@ xfs_dir2_sf_getdents(
/*
* If the block number in the offset is out of range, we're done.
*/
- if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
+ if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
return 0;
/*
@@ -812,29 +810,27 @@ xfs_dir2_sf_getdents(
* mp->m_dirdatablk.
*/
dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
- XFS_DIR2_DATA_DOT_OFFSET);
+ XFS_DIR3_DATA_DOT_OFFSET(mp));
dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
- XFS_DIR2_DATA_DOTDOT_OFFSET);
+ XFS_DIR3_DATA_DOTDOT_OFFSET(mp));
/*
* Put . entry unless we're starting past it.
*/
- if (*offset <= dot_offset) {
- if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) {
- *offset = dot_offset & 0x7fffffff;
+ if (ctx->pos <= dot_offset) {
+ ctx->pos = dot_offset & 0x7fffffff;
+ if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
return 0;
- }
}
/*
* Put .. entry unless we're starting past it.
*/
- if (*offset <= dotdot_offset) {
+ if (ctx->pos <= dotdot_offset) {
ino = xfs_dir2_sf_get_parent_ino(sfp);
- if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
- *offset = dotdot_offset & 0x7fffffff;
+ ctx->pos = dotdot_offset & 0x7fffffff;
+ if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
return 0;
- }
}
/*
@@ -845,21 +841,20 @@ xfs_dir2_sf_getdents(
off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
xfs_dir2_sf_get_offset(sfep));
- if (*offset > off) {
+ if (ctx->pos > off) {
sfep = xfs_dir2_sf_nextentry(sfp, sfep);
continue;
}
ino = xfs_dir2_sfe_get_ino(sfp, sfep);
- if (filldir(dirent, (char *)sfep->name, sfep->namelen,
- off & 0x7fffffff, ino, DT_UNKNOWN)) {
- *offset = off & 0x7fffffff;
+ ctx->pos = off & 0x7fffffff;
+ if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen,
+ ino, DT_UNKNOWN))
return 0;
- }
sfep = xfs_dir2_sf_nextentry(sfp, sfep);
}
- *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+ ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
0x7fffffff;
return 0;
}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 8025eb23ad72..f01012de06d0 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -36,6 +36,7 @@
#include "xfs_trans_space.h"
#include "xfs_trans_priv.h"
#include "xfs_qm.h"
+#include "xfs_cksum.h"
#include "xfs_trace.h"
/*
@@ -85,17 +86,23 @@ xfs_qm_dqdestroy(
*/
void
xfs_qm_adjust_dqlimits(
- xfs_mount_t *mp,
- xfs_disk_dquot_t *d)
+ struct xfs_mount *mp,
+ struct xfs_dquot *dq)
{
- xfs_quotainfo_t *q = mp->m_quotainfo;
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_disk_dquot *d = &dq->q_core;
+ int prealloc = 0;
ASSERT(d->d_id);
- if (q->qi_bsoftlimit && !d->d_blk_softlimit)
+ if (q->qi_bsoftlimit && !d->d_blk_softlimit) {
d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit);
- if (q->qi_bhardlimit && !d->d_blk_hardlimit)
+ prealloc = 1;
+ }
+ if (q->qi_bhardlimit && !d->d_blk_hardlimit) {
d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit);
+ prealloc = 1;
+ }
if (q->qi_isoftlimit && !d->d_ino_softlimit)
d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit);
if (q->qi_ihardlimit && !d->d_ino_hardlimit)
@@ -104,6 +111,9 @@ xfs_qm_adjust_dqlimits(
d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit);
if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit);
+
+ if (prealloc)
+ xfs_dquot_set_prealloc_limits(dq);
}
/*
@@ -239,6 +249,11 @@ xfs_qm_init_dquot_blk(
d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
d->dd_diskdq.d_id = cpu_to_be32(curid);
d->dd_diskdq.d_flags = type;
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
}
xfs_trans_dquot_buf(tp, bp,
@@ -248,25 +263,95 @@ xfs_qm_init_dquot_blk(
xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
}
-static void
+/*
+ * Initialize the dynamic speculative preallocation thresholds. The lo/hi
+ * watermarks correspond to the soft and hard limits by default. If a soft limit
+ * is not specified, we use 95% of the hard limit.
+ */
+void
+xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
+{
+ __uint64_t space;
+
+ dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+ dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit);
+ if (!dqp->q_prealloc_lo_wmark) {
+ dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark;
+ do_div(dqp->q_prealloc_lo_wmark, 100);
+ dqp->q_prealloc_lo_wmark *= 95;
+ }
+
+ space = dqp->q_prealloc_hi_wmark;
+
+ do_div(space, 100);
+ dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space;
+ dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3;
+ dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
+}
+
+STATIC bool
+xfs_dquot_buf_verify_crc(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
+ int ndquots;
+ int i;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return true;
+
+ /*
+ * if we are in log recovery, the quota subsystem has not been
+ * initialised so we have no quotainfo structure. In that case, we need
+ * to manually calculate the number of dquots in the buffer.
+ */
+ if (mp->m_quotainfo)
+ ndquots = mp->m_quotainfo->qi_dqperchunk;
+ else
+ ndquots = xfs_qm_calc_dquots_per_chunk(mp,
+ XFS_BB_TO_FSB(mp, bp->b_length));
+
+ for (i = 0; i < ndquots; i++, d++) {
+ if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF))
+ return false;
+ if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ }
+ return true;
+}
+
+STATIC bool
xfs_dquot_buf_verify(
+ struct xfs_mount *mp,
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
- struct xfs_disk_dquot *ddq;
xfs_dqid_t id = 0;
+ int ndquots;
int i;
/*
+ * if we are in log recovery, the quota subsystem has not been
+ * initialised so we have no quotainfo structure. In that case, we need
+ * to manually calculate the number of dquots in the buffer.
+ */
+ if (mp->m_quotainfo)
+ ndquots = mp->m_quotainfo->qi_dqperchunk;
+ else
+ ndquots = xfs_qm_calc_dquots_per_chunk(mp, bp->b_length);
+
+ /*
* On the first read of the buffer, verify that each dquot is valid.
* We don't know what the id of the dquot is supposed to be, just that
* they should be increasing monotonically within the buffer. If the
* first id is corrupt, then it will fail on the second dquot in the
* buffer so corruptions could point to the wrong dquot in this case.
*/
- for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
- int error;
+ for (i = 0; i < ndquots; i++) {
+ struct xfs_disk_dquot *ddq;
+ int error;
ddq = &d[i].dd_diskdq;
@@ -274,27 +359,41 @@ xfs_dquot_buf_verify(
id = be32_to_cpu(ddq->d_id);
error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
- "xfs_dquot_read_verify");
- if (error) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- break;
- }
+ "xfs_dquot_buf_verify");
+ if (error)
+ return false;
}
+ return true;
}
static void
xfs_dquot_buf_read_verify(
struct xfs_buf *bp)
{
- xfs_dquot_buf_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
}
+/*
+ * we don't calculate the CRC here as that is done when the dquot is flushed to
+ * the buffer after the update is done. This ensures that the dquot in the
+ * buffer always has an up-to-date CRC value.
+ */
void
xfs_dquot_buf_write_verify(
struct xfs_buf *bp)
{
- xfs_dquot_buf_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (!xfs_dquot_buf_verify(mp, bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
}
const struct xfs_buf_ops xfs_dquot_buf_ops = {
@@ -471,13 +570,13 @@ xfs_qm_dqtobp(
xfs_buf_t **O_bpp,
uint flags)
{
- xfs_bmbt_irec_t map;
- int nmaps = 1, error;
- xfs_buf_t *bp;
- xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
- xfs_mount_t *mp = dqp->q_mount;
- xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
- xfs_trans_t *tp = (tpp ? *tpp : NULL);
+ struct xfs_bmbt_irec map;
+ int nmaps = 1, error;
+ struct xfs_buf *bp;
+ struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp);
+ struct xfs_mount *mp = dqp->q_mount;
+ xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
+ struct xfs_trans *tp = (tpp ? *tpp : NULL);
dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
@@ -648,6 +747,9 @@ xfs_qm_dqread(
dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
+ /* initialize the dquot speculative prealloc thresholds */
+ xfs_dquot_set_prealloc_limits(dqp);
+
/* Mark the buf so that this will stay incore a little longer */
xfs_buf_set_ref(bp, XFS_DQUOT_REF);
@@ -702,7 +804,7 @@ xfs_qm_dqget(
xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
{
struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
struct xfs_dquot *dqp;
int error;
@@ -1035,6 +1137,23 @@ xfs_qm_dqflush(
&dqp->q_logitem.qli_item.li_lsn);
/*
+ * copy the lsn into the on-disk dquot now while we have the in memory
+ * dquot here. This can't be done later in the write verifier as we
+ * can't get access to the log item at that point in time.
+ *
+ * We also calculate the CRC here so that the on-disk dquot in the
+ * buffer always has a valid CRC. This ensures there is no possibility
+ * of a dquot without an up-to-date CRC getting to disk.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp;
+
+ dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
+ xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+
+ /*
* Attach an iodone routine so that we can remove this dquot from the
* AIL and release the flush lock once the dquot is synced to disk.
*/
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index c694a8469c4a..b596626249b8 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -32,6 +32,13 @@
struct xfs_mount;
struct xfs_trans;
+enum {
+ XFS_QLOWSP_1_PCNT = 0,
+ XFS_QLOWSP_3_PCNT,
+ XFS_QLOWSP_5_PCNT,
+ XFS_QLOWSP_MAX
+};
+
/*
* The incore dquot structure
*/
@@ -51,6 +58,9 @@ typedef struct xfs_dquot {
xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
+ xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */
+ xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */
+ int64_t q_low_space[XFS_QLOWSP_MAX];
struct mutex q_qlock; /* quota lock */
struct completion q_flush; /* flush completion queue */
atomic_t q_pincount; /* dquot pin count */
@@ -133,10 +143,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
-#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo)
-#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \
- XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
- XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
uint, struct xfs_dquot **);
@@ -145,14 +151,16 @@ extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
xfs_disk_dquot_t *);
-extern void xfs_qm_adjust_dqlimits(xfs_mount_t *,
- xfs_disk_dquot_t *);
+extern void xfs_qm_adjust_dqlimits(struct xfs_mount *,
+ struct xfs_dquot *);
extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
xfs_dqid_t, uint, uint, xfs_dquot_t **);
extern void xfs_qm_dqput(xfs_dquot_t *);
extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
+
static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
{
xfs_dqlock(dqp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 610456054dc2..35d3f5b041dd 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -66,7 +66,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
int i;
int64_t fsid;
- if (random32() % randfactor)
+ if (prandom_u32() % randfactor)
return 0;
memcpy(&fsid, fsidp, sizeof(xfs_fsid_t));
@@ -178,7 +178,7 @@ xfs_corruption_error(
inst_t *ra)
{
if (level <= xfs_error_level)
- xfs_hex_dump(p, 16);
+ xfs_hex_dump(p, 64);
xfs_error_report(tag, level, mp, filename, linenum, ra);
xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
}
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index feb36d7551ae..452920a3f03f 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -50,9 +50,8 @@ xfs_efi_item_free(
* Freeing the efi requires that we remove it from the AIL if it has already
* been placed there. However, the EFI may not yet have been placed in the AIL
* when called by xfs_efi_release() from EFD processing due to the ordering of
- * committed vs unpin operations in bulk insert operations. Hence the
- * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
- * the EFI.
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the EFI.
*/
STATIC void
__xfs_efi_release(
@@ -60,7 +59,7 @@ __xfs_efi_release(
{
struct xfs_ail *ailp = efip->efi_item.li_ailp;
- if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
+ if (atomic_dec_and_test(&efip->efi_refcount)) {
spin_lock(&ailp->xa_lock);
/* xfs_trans_ail_delete() drops the AIL lock. */
xfs_trans_ail_delete(ailp, &efip->efi_item,
@@ -126,8 +125,8 @@ xfs_efi_item_pin(
* which the EFI is manipulated during a transaction. If we are being asked to
* remove the EFI it's because the transaction has been cancelled and by
* definition that means the EFI cannot be in the AIL so remove it from the
- * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
- * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
+ * transaction and free it. Otherwise coordinate with xfs_efi_release()
+ * to determine who gets to free the EFI.
*/
STATIC void
xfs_efi_item_unpin(
@@ -171,19 +170,13 @@ xfs_efi_item_unlock(
/*
* The EFI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged. For bulk transaction committed
- * processing, the EFI may be processed but not yet unpinned prior to the EFD
- * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
- * when processing the EFD.
+ * the lsn at which it's been logged.
*/
STATIC xfs_lsn_t
xfs_efi_item_committed(
struct xfs_log_item *lip,
xfs_lsn_t lsn)
{
- struct xfs_efi_log_item *efip = EFI_ITEM(lip);
-
- set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
return lsn;
}
@@ -241,6 +234,7 @@ xfs_efi_init(
efip->efi_format.efi_nextents = nextents;
efip->efi_format.efi_id = (__psint_t)(void*)efip;
atomic_set(&efip->efi_next_extent, 0);
+ atomic_set(&efip->efi_refcount, 2);
return efip;
}
@@ -310,8 +304,14 @@ xfs_efi_release(xfs_efi_log_item_t *efip,
uint nextents)
{
ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
- if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
+ if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) {
+ /* recovery needs us to drop the EFI reference, too */
+ if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
+ __xfs_efi_release(efip);
+
__xfs_efi_release(efip);
+ /* efip may now have been freed, do not reference it again. */
+ }
}
static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 375f68e42531..432222418c56 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -114,16 +114,20 @@ typedef struct xfs_efd_log_format_64 {
* Define EFI flag bits. Manipulated by set/clear/test_bit operators.
*/
#define XFS_EFI_RECOVERED 1
-#define XFS_EFI_COMMITTED 2
/*
- * This is the "extent free intention" log item. It is used
- * to log the fact that some extents need to be free. It is
- * used in conjunction with the "extent free done" log item
- * described below.
+ * This is the "extent free intention" log item. It is used to log the fact
+ * that some extents need to be free. It is used in conjunction with the
+ * "extent free done" log item described below.
+ *
+ * The EFI is reference counted so that it is not freed prior to both the EFI
+ * and EFD being committed and unpinned. This ensures that when the last
+ * reference goes away the EFI will always be in the AIL as it has been
+ * unpinned, regardless of whether the EFD is processed before or after the EFI.
*/
typedef struct xfs_efi_log_item {
xfs_log_item_t efi_item;
+ atomic_t efi_refcount;
atomic_t efi_next_extent;
unsigned long efi_flags; /* misc flags */
xfs_efi_log_format_t efi_format;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f03bf1a456fb..de3dc98f4e8f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -36,6 +36,7 @@
#include "xfs_ioctl.h"
#include "xfs_trace.h"
+#include <linux/aio.h>
#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
@@ -775,8 +776,6 @@ xfs_file_aio_write(
if (ocount == 0)
return 0;
- sb_start_write(inode->i_sb);
-
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
ret = -EIO;
goto out;
@@ -800,7 +799,6 @@ xfs_file_aio_write(
}
out:
- sb_end_write(inode->i_sb);
return ret;
}
@@ -893,7 +891,7 @@ xfs_dir_open(
*/
mode = xfs_ilock_map_shared(ip);
if (ip->i_d.di_nextents > 0)
- xfs_dir2_data_readahead(NULL, ip, 0, -1);
+ xfs_dir3_data_readahead(NULL, ip, 0, -1);
xfs_iunlock(ip, mode);
return 0;
}
@@ -908,11 +906,10 @@ xfs_file_release(
STATIC int
xfs_file_readdir(
- struct file *filp,
- void *dirent,
- filldir_t filldir)
+ struct file *file,
+ struct dir_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
xfs_inode_t *ip = XFS_I(inode);
int error;
size_t bufsize;
@@ -931,8 +928,7 @@ xfs_file_readdir(
*/
bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
- error = xfs_readdir(ip, dirent, bufsize,
- (xfs_off_t *)&filp->f_pos, filldir);
+ error = xfs_readdir(ip, ctx, bufsize);
if (error)
return -error;
return 0;
@@ -1272,8 +1268,7 @@ xfs_seek_data(
}
out:
- if (offset != file->f_pos)
- file->f_pos = offset;
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out_unlock:
xfs_iunlock_map_shared(ip, lock);
@@ -1381,8 +1376,7 @@ out:
* situation in particular.
*/
offset = min_t(loff_t, offset, isize);
- if (offset != file->f_pos)
- file->f_pos = offset;
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out_unlock:
xfs_iunlock_map_shared(ip, lock);
@@ -1434,7 +1428,7 @@ const struct file_operations xfs_file_operations = {
const struct file_operations xfs_dir_file_operations = {
.open = xfs_dir_open,
.read = generic_read_dir,
- .readdir = xfs_file_readdir,
+ .iterate = xfs_file_readdir,
.llseek = generic_file_llseek,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 6dda3f949b04..d04695545397 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
+#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
/*
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 2866b8c78b7a..614eb0cc3608 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -99,7 +99,9 @@ xfs_fs_geometry(
(xfs_sb_version_hasattr2(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
(xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
+ XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) |
+ (xfs_sb_version_hascrc(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_V5SB : 0);
geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -174,7 +176,7 @@ xfs_growfs_data_private(
if (!bp)
return EIO;
if (bp->b_error) {
- int error = bp->b_error;
+ error = bp->b_error;
xfs_buf_relse(bp);
return error;
}
@@ -247,6 +249,9 @@ xfs_growfs_data_private(
tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
agf->agf_freeblks = cpu_to_be32(tmpsize);
agf->agf_longest = cpu_to_be32(tmpsize);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid);
+
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
@@ -265,6 +270,11 @@ xfs_growfs_data_private(
}
agfl = XFS_BUF_TO_AGFL(bp);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+ agfl->agfl_seqno = cpu_to_be32(agno);
+ uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
+ }
for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
@@ -296,8 +306,11 @@ xfs_growfs_data_private(
agi->agi_freecount = 0;
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid);
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
@@ -316,7 +329,13 @@ xfs_growfs_data_private(
goto error0;
}
- xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1,
+ agno, XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1,
+ agno, 0);
+
arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
@@ -339,7 +358,13 @@ xfs_growfs_data_private(
goto error0;
}
- xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1,
+ agno, XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1,
+ agno, 0);
+
arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
@@ -363,7 +388,12 @@ xfs_growfs_data_private(
goto error0;
}
- xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0,
+ agno, XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0,
+ agno, 0);
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 515bf71ce01c..7a0c17d7ec09 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -36,6 +36,9 @@
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
/*
@@ -148,12 +151,16 @@ xfs_check_agi_freecount(
#endif
/*
- * Initialise a new set of inodes.
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
*/
-STATIC int
+int
xfs_ialloc_inode_init(
struct xfs_mount *mp,
struct xfs_trans *tp,
+ struct list_head *buffer_list,
xfs_agnumber_t agno,
xfs_agblock_t agbno,
xfs_agblock_t length,
@@ -165,6 +172,7 @@ xfs_ialloc_inode_init(
int version;
int i, j;
xfs_daddr_t d;
+ xfs_ino_t ino = 0;
/*
* Loop over the new block(s), filling in the inodes.
@@ -183,13 +191,41 @@ xfs_ialloc_inode_init(
}
/*
- * Figure out what version number to use in the inodes we create.
- * If the superblock version has caught up to the one that supports
- * the new inode format, then use the new inode version. Otherwise
- * use the old version so that old kernels will continue to be
- * able to use the file system.
+ * Figure out what version number to use in the inodes we create. If
+ * the superblock version has caught up to the one that supports the new
+ * inode format, then use the new inode version. Otherwise use the old
+ * version so that old kernels will continue to be able to use the file
+ * system.
+ *
+ * For v3 inodes, we also need to write the inode number into the inode,
+ * so calculate the first inode number of the chunk here as
+ * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
+ * across multiple filesystem blocks (such as a cluster) and so cannot
+ * be used in the cluster buffer loop below.
+ *
+ * Further, because we are writing the inode directly into the buffer
+ * and calculating a CRC on the entire inode, we have to log the entire
+ * inode so that the entire range the CRC covers is present in the log.
+ * That means for v3 inodes we log the entire buffer rather than just the
+ * inode cores.
*/
- if (xfs_sb_version_hasnlink(&mp->m_sb))
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ version = 3;
+ ino = XFS_AGINO_TO_INO(mp, agno,
+ XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
+
+ /*
+ * log the initialisation that is about to take place as a
+ * logical operation. This means the transaction does not
+ * need to log the physical changes to the inode buffers as log
+ * recovery will know what initialisation is actually needed.
+ * Hence we only need to log the buffers as "ordered" buffers so
+ * they track in the AIL as if they were physically logged.
+ */
+ if (tp)
+ xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp),
+ mp->m_sb.sb_inodesize, length, gen);
+ } else if (xfs_sb_version_hasnlink(&mp->m_sb))
version = 2;
else
version = 1;
@@ -204,27 +240,58 @@ xfs_ialloc_inode_init(
XBF_UNMAPPED);
if (!fbuf)
return ENOMEM;
- /*
- * Initialize all inodes in this buffer and then log them.
- *
- * XXX: It would be much better if we had just one transaction
- * to log a whole cluster of inodes instead of all the
- * individual transactions causing a lot of log traffic.
- */
+
+ /* Initialize the inode buffers and log them appropriately. */
fbuf->b_ops = &xfs_inode_buf_ops;
- xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
+ xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
for (i = 0; i < ninodes; i++) {
int ioffset = i << mp->m_sb.sb_inodelog;
- uint isize = sizeof(struct xfs_dinode);
+ uint isize = xfs_dinode_size(version);
free = xfs_make_iptr(mp, fbuf, i);
free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
free->di_version = version;
free->di_gen = cpu_to_be32(gen);
free->di_next_unlinked = cpu_to_be32(NULLAGINO);
- xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
+
+ if (version == 3) {
+ free->di_ino = cpu_to_be64(ino);
+ ino++;
+ uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
+ xfs_dinode_calc_crc(mp, free);
+ } else if (tp) {
+ /* just log the inode core */
+ xfs_trans_log_buf(tp, fbuf, ioffset,
+ ioffset + isize - 1);
+ }
+ }
+
+ if (tp) {
+ /*
+ * Mark the buffer as an inode allocation buffer so it
+ * sticks in AIL at the point of this allocation
+ * transaction. This ensures that they are on disk before
+ * the tail of the log can be moved past this
+ * transaction (i.e. by preventing relogging from moving
+ * it forward in the log).
+ */
+ xfs_trans_inode_alloc_buf(tp, fbuf);
+ if (version == 3) {
+ /*
+ * Mark the buffer as ordered so that it is
+ * not physically logged in the transaction but
+ * is still tracked in the AIL as part of the
+ * transaction and pins the log appropriately.
+ */
+ xfs_trans_ordered_buf(tp, fbuf);
+ xfs_trans_log_buf(tp, fbuf, 0,
+ BBTOB(fbuf->b_length) - 1);
+ }
+ } else {
+ fbuf->b_flags |= XBF_DONE;
+ xfs_buf_delwri_queue(fbuf, buffer_list);
+ xfs_buf_relse(fbuf);
}
- xfs_trans_inode_alloc_buf(tp, fbuf);
}
return 0;
}
@@ -269,7 +336,7 @@ xfs_ialloc_ag_alloc(
* First try to allocate inodes contiguous with the last-allocated
* chunk of inodes. If the filesystem is striped, this will fill
* an entire stripe unit with inodes.
- */
+ */
agi = XFS_BUF_TO_AGI(agbp);
newino = be32_to_cpu(agi->agi_newino);
agno = be32_to_cpu(agi->agi_seqno);
@@ -368,8 +435,8 @@ xfs_ialloc_ag_alloc(
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno,
- args.len, random32());
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
+ args.len, prandom_u32());
if (error)
return error;
@@ -581,8 +648,7 @@ xfs_ialloc_get_rec(
struct xfs_btree_cur *cur,
xfs_agino_t agino,
xfs_inobt_rec_incore_t *rec,
- int *done,
- int left)
+ int *done)
{
int error;
int i;
@@ -690,12 +756,12 @@ xfs_dialloc_ag(
pag->pagl_leftrec != NULLAGINO &&
pag->pagl_rightrec != NULLAGINO) {
error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
- &trec, &doneleft, 1);
+ &trec, &doneleft);
if (error)
goto error1;
error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
- &rec, &doneright, 0);
+ &rec, &doneright);
if (error)
goto error1;
} else {
@@ -1453,6 +1519,7 @@ xfs_ialloc_log_agi(
/*
* Log the allocation group inode header buffer.
*/
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
xfs_trans_log_buf(tp, bp, first, last);
}
@@ -1470,19 +1537,23 @@ xfs_check_agi_unlinked(
#define xfs_check_agi_unlinked(agi)
#endif
-static void
+static bool
xfs_agi_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
- int agi_ok;
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
+ return false;
/*
* Validate the magic number of the agi block.
*/
- agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
- XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
+ if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+ return false;
+ if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
+ return false;
/*
* during growfs operations, the perag is not fully initialised,
@@ -1490,30 +1561,52 @@ xfs_agi_verify(
* use it by using uncached buffers that don't have the perag attached
* so we can detect and avoid this problem.
*/
- if (bp->b_pag)
- agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) ==
- bp->b_pag->pag_agno;
+ if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
+ return false;
- if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
- XFS_RANDOM_IALLOC_READ_AGI))) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
xfs_check_agi_unlinked(agi);
+ return true;
}
static void
xfs_agi_read_verify(
struct xfs_buf *bp)
{
- xfs_agi_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ int agi_ok = 1;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ agi_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ offsetof(struct xfs_agi, agi_crc));
+ agi_ok = agi_ok && xfs_agi_verify(bp);
+
+ if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
+ XFS_RANDOM_IALLOC_READ_AGI))) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
}
static void
xfs_agi_write_verify(
struct xfs_buf *bp)
{
- xfs_agi_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!xfs_agi_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+ offsetof(struct xfs_agi, agi_crc));
}
const struct xfs_buf_ops xfs_agi_buf_ops = {
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index c8da3df271e6..68c07320f096 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,6 +150,14 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
xfs_inobt_rec_incore_t *rec, int *stat);
+/*
+ * Inode chunk initialisation routine
+ */
+int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct list_head *buffer_list,
+ xfs_agnumber_t agno, xfs_agblock_t agbno,
+ xfs_agblock_t length, unsigned int gen);
+
extern const struct xfs_buf_ops xfs_agi_buf_ops;
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index bec344b36507..5448eb6b8c12 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -34,6 +34,7 @@
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_cksum.h"
STATIC int
@@ -182,52 +183,88 @@ xfs_inobt_key_diff(
cur->bc_rec.i.ir_startino;
}
-void
+static int
xfs_inobt_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
unsigned int level;
- int sblock_ok; /* block passes checks */
- /* magic number and level verification */
- level = be16_to_cpu(block->bb_level);
- sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) &&
- level < mp->m_in_maxlevels;
+ /*
+ * During growfs operations, we can't verify the exact owner as the
+ * perag is not fully initialised and hence not attached to the buffer.
+ *
+ * Similarly, during log recovery we will have a perag structure
+ * attached, but the agi information will not yet have been initialised
+ * from the on disk AGI. We don't currently use any of this information,
+ * but beware of the landmine (i.e. need to check pag->pagi_init) if we
+ * ever do.
+ */
+ switch (block->bb_magic) {
+ case cpu_to_be32(XFS_IBT_CRC_MAGIC):
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ return false;
+ if (pag &&
+ be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return false;
+ /* fall through */
+ case cpu_to_be32(XFS_IBT_MAGIC):
+ break;
+ default:
+ return 0;
+ }
- /* numrecs verification */
- sblock_ok = sblock_ok &&
- be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0];
+ /* numrecs and level verification */
+ level = be16_to_cpu(block->bb_level);
+ if (level >= mp->m_in_maxlevels)
+ return false;
+ if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
+ return false;
/* sibling pointer verification */
- sblock_ok = sblock_ok &&
- (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
- be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
- block->bb_u.s.bb_leftsib &&
- (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
- be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
- block->bb_u.s.bb_rightsib;
-
- if (!sblock_ok) {
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+ if (!block->bb_u.s.bb_leftsib ||
+ (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+ if (!block->bb_u.s.bb_rightsib ||
+ (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+
+ return true;
}
static void
xfs_inobt_read_verify(
struct xfs_buf *bp)
{
- xfs_inobt_verify(bp);
+ if (!(xfs_btree_sblock_verify_crc(bp) &&
+ xfs_inobt_verify(bp))) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+ bp->b_target->bt_mount, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
}
static void
xfs_inobt_write_verify(
struct xfs_buf *bp)
{
- xfs_inobt_verify(bp);
+ if (!xfs_inobt_verify(bp)) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+ bp->b_target->bt_mount, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+ xfs_btree_sblock_calc_crc(bp);
+
}
const struct xfs_buf_ops xfs_inobt_buf_ops = {
@@ -235,7 +272,7 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = {
.verify_write = xfs_inobt_write_verify,
};
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
xfs_inobt_keys_inorder(
struct xfs_btree_cur *cur,
@@ -273,7 +310,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
.init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
.buf_ops = &xfs_inobt_buf_ops,
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
#endif
@@ -301,6 +338,8 @@ xfs_inobt_init_cursor(
cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_ops = &xfs_inobt_ops;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
cur->bc_private.a.agbp = agbp;
cur->bc_private.a.agno = agno;
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 25c0239a8eab..3ac36b7642e9 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -29,7 +29,8 @@ struct xfs_mount;
/*
* There is a btree for the inode map per allocation group.
*/
-#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
+#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
+#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */
typedef __uint64_t xfs_inofree_t;
#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
@@ -76,10 +77,10 @@ typedef __be32 xfs_inobt_ptr_t;
/*
* Btree block header size depends on a superblock flag.
- *
- * (not quite yet, but soon)
*/
-#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN
+#define XFS_INOBT_BLOCK_LEN(mp) \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
/*
* Record, key, and pointer address macros for btree blocks.
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 96e344e3e927..9560dc1f15a9 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -335,7 +335,8 @@ xfs_iget_cache_miss(
iflags = XFS_INEW;
if (flags & XFS_IGET_DONTCACHE)
iflags |= XFS_IDONTCACHE;
- ip->i_udquot = ip->i_gdquot = NULL;
+ ip->i_udquot = NULL;
+ ip->i_gdquot = NULL;
xfs_iflags_set(ip, iflags);
/* insert the new inode */
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index e0f138c70a2f..a01afbb3909a 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -40,7 +40,6 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
void xfs_eofblocks_worker(struct work_struct *);
-int xfs_sync_inode_grab(struct xfs_inode *ip);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
int flags, void *args),
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
new file mode 100644
index 000000000000..7716a4e7375e
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2008-2010, 2013 Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_trans_priv.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_error.h"
+#include "xfs_icreate_item.h"
+
+kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+
+static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_icreate_item, ic_item);
+}
+
+/*
+ * This returns the number of iovecs needed to log the given icreate item.
+ *
+ * We only need one iovec for the icreate log structure.
+ */
+STATIC uint
+xfs_icreate_item_size(
+ struct xfs_log_item *lip)
+{
+ return 1;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given inode create log item.
+ */
+STATIC void
+xfs_icreate_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *log_vector)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+ log_vector->i_addr = (xfs_caddr_t)&icp->ic_format;
+ log_vector->i_len = sizeof(struct xfs_icreate_log);
+ log_vector->i_type = XLOG_REG_TYPE_ICREATE;
+}
+
+
+/* Pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+
+/* pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+}
+
+STATIC void
+xfs_icreate_item_unlock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+ if (icp->ic_item.li_flags & XFS_LI_ABORTED)
+ kmem_zone_free(xfs_icreate_zone, icp);
+ return;
+}
+
+/*
+ * Because we have ordered buffers being tracked in the AIL for the inode
+ * creation, we don't need the create item after this. Hence we can free
+ * the log item and return -1 to tell the caller we're done with the item.
+ */
+STATIC xfs_lsn_t
+xfs_icreate_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+ kmem_zone_free(xfs_icreate_zone, icp);
+ return (xfs_lsn_t)-1;
+}
+
+/* item can never get into the AIL */
+STATIC uint
+xfs_icreate_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ ASSERT(0);
+ return XFS_ITEM_SUCCESS;
+}
+
+/* Ordered buffers do the dependency tracking here, so this does nothing. */
+STATIC void
+xfs_icreate_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all icreate log items.
+ */
+static struct xfs_item_ops xfs_icreate_item_ops = {
+ .iop_size = xfs_icreate_item_size,
+ .iop_format = xfs_icreate_item_format,
+ .iop_pin = xfs_icreate_item_pin,
+ .iop_unpin = xfs_icreate_item_unpin,
+ .iop_push = xfs_icreate_item_push,
+ .iop_unlock = xfs_icreate_item_unlock,
+ .iop_committed = xfs_icreate_item_committed,
+ .iop_committing = xfs_icreate_item_committing,
+};
+
+
+/*
+ * Initialize the inode create log item for a newly allocated inode chunk.
+ *
+ * Inode extents can only reside within an AG. Hence specify the starting
+ * block for the inode chunk by offset within an AG as well as the
+ * length of the allocated extent.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the icreate item.
+ */
+void
+xfs_icreate_log(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ unsigned int count,
+ unsigned int inode_size,
+ xfs_agblock_t length,
+ unsigned int generation)
+{
+ struct xfs_icreate_item *icp;
+
+ icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
+
+ xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
+ &xfs_icreate_item_ops);
+
+ icp->ic_format.icl_type = XFS_LI_ICREATE;
+ icp->ic_format.icl_size = 1; /* single vector */
+ icp->ic_format.icl_ag = cpu_to_be32(agno);
+ icp->ic_format.icl_agbno = cpu_to_be32(agbno);
+ icp->ic_format.icl_count = cpu_to_be32(count);
+ icp->ic_format.icl_isize = cpu_to_be32(inode_size);
+ icp->ic_format.icl_length = cpu_to_be32(length);
+ icp->ic_format.icl_gen = cpu_to_be32(generation);
+
+ xfs_trans_add_item(tp, &icp->ic_item);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
new file mode 100644
index 000000000000..88ba8aa0bc41
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2008-2010, Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef XFS_ICREATE_ITEM_H
+#define XFS_ICREATE_ITEM_H 1
+
+/*
+ * on disk log item structure
+ *
+ * Log recovery assumes the first two entries are the type and size and they fit
+ * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
+ * decoding can be done correctly.
+ */
+struct xfs_icreate_log {
+ __uint16_t icl_type; /* type of log format structure */
+ __uint16_t icl_size; /* size of log format structure */
+ __be32 icl_ag; /* ag being allocated in */
+ __be32 icl_agbno; /* start block of inode range */
+ __be32 icl_count; /* number of inodes to initialise */
+ __be32 icl_isize; /* size of inodes */
+ __be32 icl_length; /* length of extent to initialise */
+ __be32 icl_gen; /* inode generation number to use */
+};
+
+/* in memory log item structure */
+struct xfs_icreate_item {
+ struct xfs_log_item ic_item;
+ struct xfs_icreate_log ic_format;
+};
+
+extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+
+void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, unsigned int count,
+ unsigned int inode_size, xfs_agblock_t length,
+ unsigned int generation);
+
+#endif /* XFS_ICREATE_ITEM_H */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4f201656d2d9..9ecfe1e559fc 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -44,6 +44,7 @@
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
+#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
@@ -286,7 +287,7 @@ xfs_ilock_demote(
trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
int
xfs_isilocked(
xfs_inode_t *ip,
@@ -786,6 +787,7 @@ xfs_iformat_btree(
xfs_dinode_t *dip,
int whichfork)
{
+ struct xfs_mount *mp = ip->i_mount;
xfs_bmdr_block_t *dfp;
xfs_ifork_t *ifp;
/* REFERENCED */
@@ -794,7 +796,7 @@ xfs_iformat_btree(
ifp = XFS_IFORK_PTR(ip, whichfork);
dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
- size = XFS_BMAP_BROOT_SPACE(dfp);
+ size = XFS_BMAP_BROOT_SPACE(mp, dfp);
nrecs = be16_to_cpu(dfp->bb_numrecs);
/*
@@ -805,14 +807,14 @@ xfs_iformat_btree(
* blocks.
*/
if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
- XFS_IFORK_MAXEXT(ip, whichfork) ||
+ XFS_IFORK_MAXEXT(ip, whichfork) ||
XFS_BMDR_SPACE_CALC(nrecs) >
- XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
+ XFS_DFORK_SIZE(dip, mp, whichfork) ||
XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
- xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
- (unsigned long long) ip->i_ino);
+ xfs_warn(mp, "corrupt inode %Lu (btree).",
+ (unsigned long long) ip->i_ino);
XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
+ mp, dip);
return XFS_ERROR(EFSCORRUPTED);
}
@@ -823,8 +825,7 @@ xfs_iformat_btree(
* Copy and convert from the on-disk structure
* to the in-memory structure.
*/
- xfs_bmdr_to_bmbt(ip->i_mount, dfp,
- XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+ xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
ifp->if_broot, size);
ifp->if_flags &= ~XFS_IFEXTENTS;
ifp->if_flags |= XFS_IFBROOT;
@@ -866,6 +867,17 @@ xfs_dinode_from_disk(
to->di_dmstate = be16_to_cpu(from->di_dmstate);
to->di_flags = be16_to_cpu(from->di_flags);
to->di_gen = be32_to_cpu(from->di_gen);
+
+ if (to->di_version == 3) {
+ to->di_changecount = be64_to_cpu(from->di_changecount);
+ to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
+ to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+ to->di_flags2 = be64_to_cpu(from->di_flags2);
+ to->di_ino = be64_to_cpu(from->di_ino);
+ to->di_lsn = be64_to_cpu(from->di_lsn);
+ memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &from->di_uuid);
+ }
}
void
@@ -902,6 +914,17 @@ xfs_dinode_to_disk(
to->di_dmstate = cpu_to_be16(from->di_dmstate);
to->di_flags = cpu_to_be16(from->di_flags);
to->di_gen = cpu_to_be32(from->di_gen);
+
+ if (from->di_version == 3) {
+ to->di_changecount = cpu_to_be64(from->di_changecount);
+ to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
+ to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+ to->di_flags2 = cpu_to_be64(from->di_flags2);
+ to->di_ino = cpu_to_be64(from->di_ino);
+ to->di_lsn = cpu_to_be64(from->di_lsn);
+ memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &from->di_uuid);
+ }
}
STATIC uint
@@ -962,8 +985,54 @@ xfs_dic2xflags(
(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
}
+static bool
+xfs_dinode_verify(
+ struct xfs_mount *mp,
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
+ return false;
+
+ /* only version 3 or greater inodes are extensively verified here */
+ if (dip->di_version < 3)
+ return true;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+ offsetof(struct xfs_dinode, di_crc)))
+ return false;
+ if (be64_to_cpu(dip->di_ino) != ip->i_ino)
+ return false;
+ if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ return true;
+}
+
+void
+xfs_dinode_calc_crc(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip)
+{
+ __uint32_t crc;
+
+ if (dip->di_version < 3)
+ return;
+
+ ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
+ crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+ offsetof(struct xfs_dinode, di_crc));
+ dip->di_crc = xfs_end_cksum(crc);
+}
+
/*
* Read the disk inode attributes into the in-core inode structure.
+ *
+ * If we are initialising a new inode and we are not utilising the
+ * XFS_MOUNT_IKEEP inode cluster mode, we can simply build the new inode core
+ * with a random generation number. If we are keeping inodes around, we need to
+ * read the inode cluster to get the existing generation number off disk.
*/
int
xfs_iread(
@@ -983,6 +1052,22 @@ xfs_iread(
if (error)
return error;
+ /* shortcut IO on inode allocation if possible */
+ if ((iget_flags & XFS_IGET_CREATE) &&
+ !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ /* initialise the on-disk inode core */
+ memset(&ip->i_d, 0, sizeof(ip->i_d));
+ ip->i_d.di_magic = XFS_DINODE_MAGIC;
+ ip->i_d.di_gen = prandom_u32();
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ ip->i_d.di_version = 3;
+ ip->i_d.di_ino = ip->i_ino;
+ uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+ } else
+ ip->i_d.di_version = 2;
+ return 0;
+ }
+
/*
* Get pointers to the on-disk inode and the buffer containing it.
*/
@@ -990,17 +1075,13 @@ xfs_iread(
if (error)
return error;
- /*
- * If we got something that isn't an inode it means someone
- * (nfs or dmi) has a stale handle.
- */
- if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) {
-#ifdef DEBUG
- xfs_alert(mp,
- "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
- __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
-#endif /* DEBUG */
- error = XFS_ERROR(EINVAL);
+ /* even unallocated inodes are verified */
+ if (!xfs_dinode_verify(mp, ip, dip)) {
+ xfs_alert(mp, "%s: validation failed for inode %lld failed",
+ __func__, ip->i_ino);
+
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
+ error = XFS_ERROR(EFSCORRUPTED);
goto out_brelse;
}
@@ -1022,10 +1103,20 @@ xfs_iread(
goto out_brelse;
}
} else {
+ /*
+ * Partial initialisation of the in-core inode. Just the bits
+ * that xfs_ialloc won't overwrite or relies on being correct.
+ */
ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
ip->i_d.di_version = dip->di_version;
ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
+
+ if (dip->di_version == 3) {
+ ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
+ uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
+ }
+
/*
* Make sure to pull in the mode here as well in
* case the inode is released without being used.
@@ -1063,17 +1154,16 @@ xfs_iread(
xfs_buf_set_ref(bp, XFS_INO_REF);
/*
- * Use xfs_trans_brelse() to release the buffer containing the
- * on-disk inode, because it was acquired with xfs_trans_read_buf()
- * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal
+ * Use xfs_trans_brelse() to release the buffer containing the on-disk
+ * inode, because it was acquired with xfs_trans_read_buf() in
+ * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
* brelse(). If we're within a transaction, then xfs_trans_brelse()
* will only release the buffer if it is not dirty within the
* transaction. It will be OK to release the buffer in this case,
- * because inodes on disk are never destroyed and we will be
- * locking the new in-core inode before putting it in the hash
- * table where other processes can find it. Thus we don't have
- * to worry about the inode being changed just because we released
- * the buffer.
+ * because inodes on disk are never destroyed and we will be locking the
+ * new in-core inode before putting it in the cache where other
+ * processes can find it. Thus we don't have to worry about the inode
+ * being changed just because we released the buffer.
*/
out_brelse:
xfs_trans_brelse(tp, bp);
@@ -1161,6 +1251,7 @@ xfs_ialloc(
xfs_buf_t **ialloc_context,
xfs_inode_t **ipp)
{
+ struct xfs_mount *mp = tp->t_mountp;
xfs_ino_t ino;
xfs_inode_t *ip;
uint flags;
@@ -1187,7 +1278,7 @@ xfs_ialloc(
* This is because we're setting fields here we need
* to prevent others from looking at until we're done.
*/
- error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
+ error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
XFS_ILOCK_EXCL, &ip);
if (error)
return error;
@@ -1208,7 +1299,7 @@ xfs_ialloc(
* the inode version number now. This way we only do the conversion
* here rather than here and in the flush/logging code.
*/
- if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
+ if (xfs_sb_version_hasnlink(&mp->m_sb) &&
ip->i_d.di_version == 1) {
ip->i_d.di_version = 2;
/*
@@ -1258,6 +1349,19 @@ xfs_ialloc(
ip->i_d.di_dmevmask = 0;
ip->i_d.di_dmstate = 0;
ip->i_d.di_flags = 0;
+
+ if (ip->i_d.di_version == 3) {
+ ASSERT(ip->i_d.di_ino == ino);
+ ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
+ ip->i_d.di_crc = 0;
+ ip->i_d.di_changecount = 1;
+ ip->i_d.di_lsn = 0;
+ ip->i_d.di_flags2 = 0;
+ memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2));
+ ip->i_d.di_crtime = ip->i_d.di_mtime;
+ }
+
+
flags = XFS_ILOG_CORE;
switch (mode & S_IFMT) {
case S_IFIFO:
@@ -1554,6 +1658,10 @@ xfs_iunlink(
dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -1639,6 +1747,10 @@ xfs_iunlink_remove(
dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -1712,6 +1824,10 @@ xfs_iunlink_remove(
dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -1725,6 +1841,10 @@ xfs_iunlink_remove(
last_dip->di_next_unlinked = cpu_to_be32(next_agino);
ASSERT(next_agino != 0);
offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, last_dip);
+
xfs_trans_inode_buf(tp, last_ibp);
xfs_trans_log_buf(tp, last_ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -1928,8 +2048,6 @@ xfs_ifree(
int error;
int delete;
xfs_ino_t first_ino;
- xfs_dinode_t *dip;
- xfs_buf_t *ibp;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(ip->i_d.di_nlink == 0);
@@ -1942,14 +2060,13 @@ xfs_ifree(
* Pull the on-disk inode from the AGI unlinked list.
*/
error = xfs_iunlink_remove(tp, ip);
- if (error != 0) {
+ if (error)
return error;
- }
error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
- if (error != 0) {
+ if (error)
return error;
- }
+
ip->i_d.di_mode = 0; /* mark incore inode as free */
ip->i_d.di_flags = 0;
ip->i_d.di_dmevmask = 0;
@@ -1961,31 +2078,10 @@ xfs_ifree(
* by reincarnations of this inode.
*/
ip->i_d.di_gen++;
-
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error)
- return error;
-
- /*
- * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
- * from picking up this inode when it is reclaimed (its incore state
- * initialzed but not flushed to disk yet). The in-core di_mode is
- * already cleared and a corresponding transaction logged.
- * The hack here just synchronizes the in-core to on-disk
- * di_mode value in advance before the actual inode sync to disk.
- * This is OK because the inode is already unlinked and would never
- * change its di_mode again for this inode generation.
- * This is a temporary hack that would require a proper fix
- * in the future.
- */
- dip->di_mode = 0;
-
- if (delete) {
+ if (delete)
error = xfs_ifree_cluster(ip, tp, first_ino);
- }
return error;
}
@@ -2037,7 +2133,7 @@ xfs_iroot_realloc(
* allocate it now and get out.
*/
if (ifp->if_broot_bytes == 0) {
- new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
+ new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
ifp->if_broot_bytes = (int)new_size;
return;
@@ -2051,9 +2147,9 @@ xfs_iroot_realloc(
*/
cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
new_max = cur_max + rec_diff;
- new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
+ new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
- (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
+ XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
KM_SLEEP | KM_NOFS);
op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
@@ -2061,7 +2157,7 @@ xfs_iroot_realloc(
(int)new_size);
ifp->if_broot_bytes = (int)new_size;
ASSERT(ifp->if_broot_bytes <=
- XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
+ XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip));
memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
return;
}
@@ -2076,7 +2172,7 @@ xfs_iroot_realloc(
new_max = cur_max + rec_diff;
ASSERT(new_max >= 0);
if (new_max > 0)
- new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
+ new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
else
new_size = 0;
if (new_size > 0) {
@@ -2084,7 +2180,8 @@ xfs_iroot_realloc(
/*
* First copy over the btree block header.
*/
- memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
+ memcpy(new_broot, ifp->if_broot,
+ XFS_BMBT_BLOCK_LEN(ip->i_mount));
} else {
new_broot = NULL;
ifp->if_flags &= ~XFS_IFBROOT;
@@ -2114,7 +2211,7 @@ xfs_iroot_realloc(
ifp->if_broot = new_broot;
ifp->if_broot_bytes = (int)new_size;
ASSERT(ifp->if_broot_bytes <=
- XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
+ XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip));
return;
}
@@ -2427,7 +2524,7 @@ xfs_iflush_fork(
ASSERT(ifp->if_broot != NULL);
ASSERT(ifp->if_broot_bytes <=
(XFS_IFORK_SIZE(ip, whichfork) +
- XFS_BROOT_SIZE_ADJ));
+ XFS_BROOT_SIZE_ADJ(ip)));
xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
(xfs_bmdr_block_t *)cp,
XFS_DFORK_SIZE(dip, mp, whichfork));
@@ -2715,20 +2812,18 @@ abort_out:
STATIC int
xfs_iflush_int(
- xfs_inode_t *ip,
- xfs_buf_t *bp)
+ struct xfs_inode *ip,
+ struct xfs_buf *bp)
{
- xfs_inode_log_item_t *iip;
- xfs_dinode_t *dip;
- xfs_mount_t *mp;
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_dinode *dip;
+ struct xfs_mount *mp = ip->i_mount;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(xfs_isiflocked(ip));
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
-
- iip = ip->i_itemp;
- mp = ip->i_mount;
+ ASSERT(iip != NULL && iip->ili_fields != 0);
/* set *dip = inode's place in the buffer */
dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -2789,9 +2884,9 @@ xfs_iflush_int(
}
/*
* bump the flush iteration count, used to detect flushes which
- * postdate a log record during recovery.
+ * postdate a log record during recovery. This is redundant as we now
+ * log every change and hence this can't happen. Still, it doesn't hurt.
*/
-
ip->i_d.di_flushiter++;
/*
@@ -2867,41 +2962,30 @@ xfs_iflush_int(
* need the AIL lock, because it is a 64 bit value that cannot be read
* atomically.
*/
- if (iip != NULL && iip->ili_fields != 0) {
- iip->ili_last_fields = iip->ili_fields;
- iip->ili_fields = 0;
- iip->ili_logged = 1;
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
+ iip->ili_logged = 1;
- xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
- &iip->ili_item.li_lsn);
+ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
- /*
- * Attach the function xfs_iflush_done to the inode's
- * buffer. This will remove the inode from the AIL
- * and unlock the inode's flush lock when the inode is
- * completely written to disk.
- */
- xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
+ /*
+ * Attach the function xfs_iflush_done to the inode's
+ * buffer. This will remove the inode from the AIL
+ * and unlock the inode's flush lock when the inode is
+ * completely written to disk.
+ */
+ xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
- ASSERT(bp->b_fspriv != NULL);
- ASSERT(bp->b_iodone != NULL);
- } else {
- /*
- * We're flushing an inode which is not in the AIL and has
- * not been logged. For this case we can immediately drop
- * the inode flush lock because we can avoid the whole
- * AIL state thing. It's OK to drop the flush lock now,
- * because we've already locked the buffer and to do anything
- * you really need both.
- */
- if (iip != NULL) {
- ASSERT(iip->ili_logged == 0);
- ASSERT(iip->ili_last_fields == 0);
- ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
- }
- xfs_ifunlock(ip);
- }
+ /* update the lsn in the on disk inode if required */
+ if (ip->i_d.di_version == 3)
+ dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);
+
+ /* generate the checksum. */
+ xfs_dinode_calc_crc(mp, dip);
+ ASSERT(bp->b_fspriv != NULL);
+ ASSERT(bp->b_iodone != NULL);
return 0;
corrupt_out:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 237e7f6f2ab3..91129794aaec 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -150,13 +150,38 @@ typedef struct xfs_icdinode {
__uint16_t di_dmstate; /* DMIG state info */
__uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
__uint32_t di_gen; /* generation number */
+
+ /* di_next_unlinked is the only non-core field in the old dinode */
+ xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
+
+ /* start of the extended dinode, writable fields */
+ __uint32_t di_crc; /* CRC of the inode */
+ __uint64_t di_changecount; /* number of attribute changes */
+ xfs_lsn_t di_lsn; /* flush sequence */
+ __uint64_t di_flags2; /* more random flags */
+ __uint8_t di_pad2[16]; /* more padding for future expansion */
+
+ /* fields only written to during inode creation */
+ xfs_ictimestamp_t di_crtime; /* time created */
+ xfs_ino_t di_ino; /* inode number */
+ uuid_t di_uuid; /* UUID of the filesystem */
+
+ /* structure must be padded to 64 bit alignment */
} xfs_icdinode_t;
+static inline uint xfs_icdinode_size(int version)
+{
+ if (version == 3)
+ return sizeof(struct xfs_icdinode);
+ return offsetof(struct xfs_icdinode, di_next_unlinked);
+}
+
/*
* Flags for xfs_ichgtime().
*/
#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
+#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
/*
* Per-fork incore inode flags.
@@ -180,10 +205,11 @@ typedef struct xfs_icdinode {
#define XFS_IFORK_DSIZE(ip) \
(XFS_IFORK_Q(ip) ? \
XFS_IFORK_BOFF(ip) : \
- XFS_LITINO((ip)->i_mount))
+ XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
#define XFS_IFORK_ASIZE(ip) \
(XFS_IFORK_Q(ip) ? \
- XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
+ XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
+ XFS_IFORK_BOFF(ip) : \
0)
#define XFS_IFORK_SIZE(ip,w) \
((w) == XFS_DATA_FORK ? \
@@ -555,6 +581,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
struct xfs_buf **, uint, uint);
int xfs_iread(struct xfs_mount *, struct xfs_trans *,
struct xfs_inode *, uint);
+void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
void xfs_dinode_to_disk(struct xfs_dinode *,
struct xfs_icdinode *);
void xfs_idestroy_fork(struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index f034bd1652f0..f76ff52e43c0 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -179,7 +179,7 @@ xfs_inode_item_format(
nvecs = 1;
vecp->i_addr = &ip->i_d;
- vecp->i_len = sizeof(struct xfs_icdinode);
+ vecp->i_len = xfs_icdinode_size(ip->i_d.di_version);
vecp->i_type = XLOG_REG_TYPE_ICORE;
vecp++;
nvecs++;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d681e34c2950..5e999680094a 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -422,9 +422,12 @@ xfs_attrlist_by_handle(
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
- if (!kbuf)
- goto out_dput;
+ kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL);
+ if (!kbuf) {
+ kbuf = kmem_zalloc_large(al_hreq.buflen);
+ if (!kbuf)
+ goto out_dput;
+ }
cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
@@ -436,7 +439,10 @@ xfs_attrlist_by_handle(
error = -EFAULT;
out_kfree:
- kfree(kbuf);
+ if (is_vmalloc_addr(kbuf))
+ kmem_free_large(kbuf);
+ else
+ kmem_free(kbuf);
out_dput:
dput(dentry);
return error;
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 63b8fc432151..c0c66259cc91 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -373,9 +373,12 @@ xfs_compat_attrlist_by_handle(
return PTR_ERR(dentry);
error = -ENOMEM;
- kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
- if (!kbuf)
- goto out_dput;
+ kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL);
+ if (!kbuf) {
+ kbuf = kmem_zalloc_large(al_hreq.buflen);
+ if (!kbuf)
+ goto out_dput;
+ }
cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
@@ -387,7 +390,10 @@ xfs_compat_attrlist_by_handle(
error = -EFAULT;
out_kfree:
- kfree(kbuf);
+ if (is_vmalloc_addr(kbuf))
+ kmem_free_large(kbuf);
+ else
+ kmem_free(kbuf);
out_dput:
dput(dentry);
return error;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5a30dd899d2b..6a7096422295 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -42,6 +42,8 @@
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
@@ -282,6 +284,15 @@ xfs_iomap_eof_want_preallocate(
return 0;
/*
+ * If the file is smaller than the minimum prealloc and we are using
+ * dynamic preallocation, don't do any preallocation at all as it is
+ * likely this is the only write to the file that is going to be done.
+ */
+ if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
+ XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
+ return 0;
+
+ /*
* If there are any real blocks past eof, then don't
* do any speculative allocation.
*/
@@ -343,6 +354,10 @@ xfs_iomap_eof_prealloc_initial_size(
if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
return 0;
+ /* If the file is small, then use the minimum prealloc */
+ if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
+ return 0;
+
/*
* As we write multiple pages, the offset will always align to the
* start of a page and hence point to a hole at EOF. i.e. if the size is
@@ -362,10 +377,65 @@ xfs_iomap_eof_prealloc_initial_size(
if (imap[0].br_startblock == HOLESTARTBLOCK)
return 0;
if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
- return imap[0].br_blockcount;
+ return imap[0].br_blockcount << 1;
return XFS_B_TO_FSB(mp, offset);
}
+STATIC bool
+xfs_quota_need_throttle(
+ struct xfs_inode *ip,
+ int type,
+ xfs_fsblock_t alloc_blocks)
+{
+ struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+
+ if (!dq || !xfs_this_quota_on(ip->i_mount, type))
+ return false;
+
+ /* no hi watermark, no throttle */
+ if (!dq->q_prealloc_hi_wmark)
+ return false;
+
+ /* under the lo watermark, no throttle */
+ if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark)
+ return false;
+
+ return true;
+}
+
+STATIC void
+xfs_quota_calc_throttle(
+ struct xfs_inode *ip,
+ int type,
+ xfs_fsblock_t *qblocks,
+ int *qshift)
+{
+ int64_t freesp;
+ int shift = 0;
+ struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+
+ /* over hi wmark, squash the prealloc completely */
+ if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
+ *qblocks = 0;
+ return;
+ }
+
+ freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount;
+ if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
+ shift = 2;
+ if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
+ shift += 2;
+ if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
+ shift += 2;
+ }
+
+ /* only overwrite the throttle values if we are more aggressive */
+ if ((freesp >> shift) < (*qblocks >> *qshift)) {
+ *qblocks = freesp;
+ *qshift = shift;
+ }
+}
+
/*
* If we don't have a user specified preallocation size, dynamically increase
* the preallocation size as the size of the file grows. Cap the maximum size
@@ -381,45 +451,89 @@ xfs_iomap_prealloc_size(
int nimaps)
{
xfs_fsblock_t alloc_blocks = 0;
+ int shift = 0;
+ int64_t freesp;
+ xfs_fsblock_t qblocks;
+ int qshift = 0;
alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
imap, nimaps);
- if (alloc_blocks > 0) {
- int shift = 0;
- int64_t freesp;
-
- alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
- rounddown_pow_of_two(alloc_blocks));
-
- xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
- freesp = mp->m_sb.sb_fdblocks;
- if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
- shift = 2;
- if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
- shift++;
- if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
- shift++;
- if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
- shift++;
- if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
- shift++;
- }
- if (shift)
- alloc_blocks >>= shift;
+ if (!alloc_blocks)
+ goto check_writeio;
+ qblocks = alloc_blocks;
- /*
- * If we are still trying to allocate more space than is
- * available, squash the prealloc hard. This can happen if we
- * have a large file on a small filesystem and the above
- * lowspace thresholds are smaller than MAXEXTLEN.
- */
- while (alloc_blocks && alloc_blocks >= freesp)
- alloc_blocks >>= 4;
+ /*
+ * MAXEXTLEN is not a power of two value but we round the prealloc down
+ * to the nearest power of two value after throttling. To prevent the
+ * round down from unconditionally reducing the maximum supported prealloc
+ * size, we round up first, apply appropriate throttling, round down and
+ * cap the value to MAXEXTLEN.
+ */
+ alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
+ alloc_blocks);
+
+ xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+ freesp = mp->m_sb.sb_fdblocks;
+ if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+ shift = 2;
+ if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+ shift++;
}
+ /*
+ * Check each quota to cap the prealloc size and provide a shift
+ * value to throttle with.
+ */
+ if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift);
+ if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift);
+ if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift);
+
+ /*
+ * The final prealloc size is set to the minimum of free space available
+ * in each of the quotas and the overall filesystem.
+ *
+ * The shift throttle value is set to the maximum value as determined by
+ * the global low free space values and per-quota low free space values.
+ */
+ alloc_blocks = MIN(alloc_blocks, qblocks);
+ shift = MAX(shift, qshift);
+
+ if (shift)
+ alloc_blocks >>= shift;
+ /*
+ * rounddown_pow_of_two() returns an undefined result if we pass in
+ * alloc_blocks = 0.
+ */
+ if (alloc_blocks)
+ alloc_blocks = rounddown_pow_of_two(alloc_blocks);
+ if (alloc_blocks > MAXEXTLEN)
+ alloc_blocks = MAXEXTLEN;
+
+ /*
+ * If we are still trying to allocate more space than is
+ * available, squash the prealloc hard. This can happen if we
+ * have a large file on a small filesystem and the above
+ * lowspace thresholds are smaller than MAXEXTLEN.
+ */
+ while (alloc_blocks && alloc_blocks >= freesp)
+ alloc_blocks >>= 4;
+
+check_writeio:
if (alloc_blocks < mp->m_writeio_blocks)
alloc_blocks = mp->m_writeio_blocks;
+ trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
+ mp->m_writeio_blocks);
+
return alloc_blocks;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index d82efaa2ac73..c69bbc493cb0 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -455,6 +455,28 @@ xfs_vn_getattr(
return 0;
}
+static void
+xfs_setattr_mode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct iattr *iattr)
+{
+ struct inode *inode = VFS_I(ip);
+ umode_t mode = iattr->ia_mode;
+
+ ASSERT(tp);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+ mode &= ~S_ISGID;
+
+ ip->i_d.di_mode &= S_IFMT;
+ ip->i_d.di_mode |= mode & ~S_IFMT;
+
+ inode->i_mode &= S_IFMT;
+ inode->i_mode |= mode & ~S_IFMT;
+}
+
int
xfs_setattr_nonsize(
struct xfs_inode *ip,
@@ -606,18 +628,8 @@ xfs_setattr_nonsize(
/*
* Change file access modes.
*/
- if (mask & ATTR_MODE) {
- umode_t mode = iattr->ia_mode;
-
- if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
- mode &= ~S_ISGID;
-
- ip->i_d.di_mode &= S_IFMT;
- ip->i_d.di_mode |= mode & ~S_IFMT;
-
- inode->i_mode &= S_IFMT;
- inode->i_mode |= mode & ~S_IFMT;
- }
+ if (mask & ATTR_MODE)
+ xfs_setattr_mode(tp, ip, iattr);
/*
* Change file access or modified times.
@@ -714,9 +726,8 @@ xfs_setattr_size(
return XFS_ERROR(error);
ASSERT(S_ISREG(ip->i_d.di_mode));
- ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
- ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
- ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
+ ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
+ ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
if (!(flags & XFS_ATTR_NOLOCK)) {
lock_flags |= XFS_IOLOCK_EXCL;
@@ -860,6 +871,12 @@ xfs_setattr_size(
xfs_inode_clear_eofblocks_tag(ip);
}
+ /*
+ * Change file access modes.
+ */
+ if (mask & ATTR_MODE)
+ xfs_setattr_mode(tp, ip, iattr);
+
if (mask & ATTR_CTIME) {
inode->i_ctime = iattr->ia_ctime;
ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
@@ -970,7 +987,8 @@ xfs_fiemap_format(
if (bmv->bmv_oflags & BMV_OF_PREALLOC)
fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
- fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
+ fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
physical = 0; /* no block yet */
}
if (bmv->bmv_oflags & BMV_OF_LAST)
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 2ea7d402188d..bc92c5306a17 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -43,7 +43,7 @@ xfs_internal_inum(
{
return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
(xfs_sb_version_hasquota(&mp->m_sb) &&
- (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
+ xfs_is_quota_inode(&mp->m_sb, ino)));
}
/*
@@ -383,11 +383,13 @@ xfs_bulkstat(
* Also start read-ahead now for this chunk.
*/
if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+ struct blk_plug plug;
/*
* Loop over all clusters in the next chunk.
* Do a readahead if there are any allocated
* inodes in that cluster.
*/
+ blk_start_plug(&plug);
agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
for (chunkidx = 0;
chunkidx < XFS_INODES_PER_CHUNK;
@@ -399,6 +401,7 @@ xfs_bulkstat(
agbno, nbcluster,
&xfs_inode_buf_ops);
}
+ blk_finish_plug(&plug);
irbp->ir_startino = r.ir_startino;
irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index fe7e4df85a7b..800f896a6cc4 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -72,6 +72,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/list_sort.h>
+#include <linux/ratelimit.h>
#include <asm/page.h>
#include <asm/div64.h>
@@ -292,22 +293,34 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
#define ASSERT_ALWAYS(expr) \
(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-#ifndef DEBUG
-#define ASSERT(expr) ((void)0)
+#ifdef DEBUG
+#define ASSERT(expr) \
+ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
#ifndef STATIC
-# define STATIC static noinline
+# define STATIC noinline
#endif
-#else /* DEBUG */
+#else /* !DEBUG */
+
+#ifdef XFS_WARN
#define ASSERT(expr) \
- (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+ (unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__))
#ifndef STATIC
-# define STATIC noinline
+# define STATIC static noinline
+#endif
+
+#else /* !DEBUG && !XFS_WARN */
+
+#define ASSERT(expr) ((void)0)
+
+#ifndef STATIC
+# define STATIC static noinline
#endif
+#endif /* XFS_WARN */
#endif /* DEBUG */
#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index eec226f78a40..d852a2b3e1fd 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1963,6 +1963,10 @@ xlog_write_calc_vec_length(
headers++;
for (lv = log_vector; lv; lv = lv->lv_next) {
+ /* we don't write ordered log vectors */
+ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
+ continue;
+
headers += lv->lv_niovecs;
for (i = 0; i < lv->lv_niovecs; i++) {
@@ -2216,7 +2220,7 @@ xlog_write(
index = 0;
lv = log_vector;
vecp = lv->lv_iovecp;
- while (lv && index < lv->lv_niovecs) {
+ while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
void *ptr;
int log_offset;
@@ -2236,13 +2240,22 @@ xlog_write(
* This loop writes out as many regions as can fit in the amount
* of space which was allocated by xlog_state_get_iclog_space().
*/
- while (lv && index < lv->lv_niovecs) {
- struct xfs_log_iovec *reg = &vecp[index];
+ while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
+ struct xfs_log_iovec *reg;
struct xlog_op_header *ophdr;
int start_rec_copy;
int copy_len;
int copy_off;
+ bool ordered = false;
+
+ /* ordered log vectors have no regions to write */
+ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
+ ASSERT(lv->lv_niovecs == 0);
+ ordered = true;
+ goto next_lv;
+ }
+ reg = &vecp[index];
ASSERT(reg->i_len % sizeof(__int32_t) == 0);
ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
@@ -2302,12 +2315,13 @@ xlog_write(
break;
if (++index == lv->lv_niovecs) {
+next_lv:
lv = lv->lv_next;
index = 0;
if (lv)
vecp = lv->lv_iovecp;
}
- if (record_cnt == 0) {
+ if (record_cnt == 0 && ordered == false) {
if (!lv)
return 0;
break;
@@ -3485,7 +3499,7 @@ xlog_ticket_alloc(
tic->t_curr_res = unit_bytes;
tic->t_cnt = cnt;
tic->t_ocnt = cnt;
- tic->t_tid = random32();
+ tic->t_tid = prandom_u32();
tic->t_clientid = client;
tic->t_flags = XLOG_TIC_INITED;
tic->t_trans_type = 0;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 5caee96059df..fb630e496c12 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -88,7 +88,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
#define XLOG_REG_TYPE_UNMOUNT 17
#define XLOG_REG_TYPE_COMMIT 18
#define XLOG_REG_TYPE_TRANSHDR 19
-#define XLOG_REG_TYPE_MAX 19
+#define XLOG_REG_TYPE_ICREATE 20
+#define XLOG_REG_TYPE_MAX 20
typedef struct xfs_log_iovec {
void *i_addr; /* beginning address of region */
@@ -105,6 +106,8 @@ struct xfs_log_vec {
int lv_buf_len; /* size of formatted buffer */
};
+#define XFS_LOG_VEC_ORDERED (-1)
+
/*
* Structure used to pass callback function and the function's argument
* to the log manager.
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ddc4529d07d3..02b9cf3f8252 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -127,6 +127,7 @@ xlog_cil_prepare_log_vecs(
int index;
int len = 0;
uint niovecs;
+ bool ordered = false;
/* Skip items which aren't dirty in this transaction. */
if (!(lidp->lid_flags & XFS_LID_DIRTY))
@@ -137,14 +138,30 @@ xlog_cil_prepare_log_vecs(
if (!niovecs)
continue;
+ /*
+ * Ordered items need to be tracked but we do not wish to write
+ * them. We need a logvec to track the object, but we do not
+ * need an iovec or buffer to be allocated for copying data.
+ */
+ if (niovecs == XFS_LOG_VEC_ORDERED) {
+ ordered = true;
+ niovecs = 0;
+ }
+
new_lv = kmem_zalloc(sizeof(*new_lv) +
niovecs * sizeof(struct xfs_log_iovec),
- KM_SLEEP);
+ KM_SLEEP|KM_NOFS);
+
+ new_lv->lv_item = lidp->lid_item;
+ new_lv->lv_niovecs = niovecs;
+ if (ordered) {
+ /* track as an ordered logvec */
+ new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ goto next;
+ }
/* The allocated iovec region lies beyond the log vector. */
new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
- new_lv->lv_niovecs = niovecs;
- new_lv->lv_item = lidp->lid_item;
/* build the vector array and calculate it's length */
IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
@@ -165,6 +182,7 @@ xlog_cil_prepare_log_vecs(
}
ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
+next:
if (!ret_lv)
ret_lv = new_lv;
else
@@ -191,8 +209,18 @@ xfs_cil_prepare_item(
if (old) {
/* existing lv on log item, space used is a delta */
- ASSERT(!list_empty(&lv->lv_item->li_cil));
- ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+ ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) ||
+ old->lv_buf_len == XFS_LOG_VEC_ORDERED);
+
+ /*
+ * If the new item is ordered, keep the old one that is already
+ * tracking dirty or ordered regions
+ */
+ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
+ ASSERT(!lv->lv_buf);
+ kmem_free(lv);
+ return;
+ }
*len += lv->lv_buf_len - old->lv_buf_len;
*diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
@@ -201,10 +229,11 @@ xfs_cil_prepare_item(
} else {
/* new lv, must pin the log item */
ASSERT(!lv->lv_item->li_lv);
- ASSERT(list_empty(&lv->lv_item->li_cil));
- *len += lv->lv_buf_len;
- *diff_iovecs += lv->lv_niovecs;
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+ *len += lv->lv_buf_len;
+ *diff_iovecs += lv->lv_niovecs;
+ }
IOP_PIN(lv->lv_item);
}
@@ -259,18 +288,24 @@ xlog_cil_insert_items(
* We can do this safely because the context can't checkpoint until we
* are done so it doesn't matter exactly how we update the CIL.
*/
- for (lv = log_vector; lv; lv = lv->lv_next)
- xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
-
- /* account for space used by new iovec headers */
- len += diff_iovecs * sizeof(xlog_op_header_t);
-
spin_lock(&cil->xc_cil_lock);
+ for (lv = log_vector; lv; ) {
+ struct xfs_log_vec *next = lv->lv_next;
- /* move the items to the tail of the CIL */
- for (lv = log_vector; lv; lv = lv->lv_next)
+ ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil));
+ lv->lv_next = NULL;
+
+ /*
+ * xfs_cil_prepare_item() may free the lv, so move the item on
+ * the CIL first.
+ */
list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
+ xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
+ lv = next;
+ }
+ /* account for space used by new iovec headers */
+ len += diff_iovecs * sizeof(xlog_op_header_t);
ctx->nvecs += diff_iovecs;
/*
@@ -381,9 +416,7 @@ xlog_cil_push(
struct xfs_cil_ctx *new_ctx;
struct xlog_in_core *commit_iclog;
struct xlog_ticket *tic;
- int num_lv;
int num_iovecs;
- int len;
int error = 0;
struct xfs_trans_header thdr;
struct xfs_log_iovec lhdr;
@@ -428,12 +461,9 @@ xlog_cil_push(
* side which is currently locked out by the flush lock.
*/
lv = NULL;
- num_lv = 0;
num_iovecs = 0;
- len = 0;
while (!list_empty(&cil->xc_cil)) {
struct xfs_log_item *item;
- int i;
item = list_first_entry(&cil->xc_cil,
struct xfs_log_item, li_cil);
@@ -444,11 +474,7 @@ xlog_cil_push(
lv->lv_next = item->li_lv;
lv = item->li_lv;
item->li_lv = NULL;
-
- num_lv++;
num_iovecs += lv->lv_niovecs;
- for (i = 0; i < lv->lv_niovecs; i++)
- len += lv->lv_iovecp[i].i_len;
}
/*
@@ -668,10 +694,6 @@ xlog_cil_push_foreground(
* transaction to the checkpoint context so we carry the busy extents through
* to checkpoint completion, and then unlock all the items in the transaction.
*
- * For more specific information about the order of operations in
- * xfs_log_commit_cil() please refer to the comments in
- * xfs_trans_commit_iclog().
- *
* Called with the context lock already held in read mode to lock out
* background commit, returns without it held once background commits are
* allowed again.
@@ -705,6 +727,7 @@ xfs_log_commit_cil(
if (commit_lsn)
*commit_lsn = log->l_cilp->xc_ctx->sequence;
+ /* xlog_cil_insert_items() destroys log_vector list */
xlog_cil_insert_items(log, log_vector, tp->t_ticket);
/* check we didn't blow the reservation */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 16d8d12ea3b4..b9ea262dd1c2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -468,7 +468,6 @@ struct xfs_cil {
* threshold, yet give us plenty of space for aggregation on large logs.
*/
#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
-#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
/*
* ticket grant locks, queues and accounting have their own cachlines
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index d1dba7ce75ae..6fcc910a50b9 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -29,6 +29,7 @@
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
@@ -44,6 +45,15 @@
#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_icreate_item.h"
+
+/* Need all the magic numbers and buffer ops structures from these headers */
+#include "xfs_symlink.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
STATIC int
xlog_find_zeroed(
@@ -1590,10 +1600,53 @@ xlog_recover_add_to_trans(
}
/*
- * Sort the log items in the transaction. Cancelled buffers need
- * to be put first so they are processed before any items that might
- * modify the buffers. If they are cancelled, then the modifications
- * don't need to be replayed.
+ * Sort the log items in the transaction.
+ *
+ * The ordering constraints are defined by the inode allocation and unlink
+ * behaviour. The rules are:
+ *
+ * 1. Every item is only logged once in a given transaction. Hence it
+ * represents the last logged state of the item. Hence ordering is
+ * dependent on the order in which operations need to be performed so
+ * required initial conditions are always met.
+ *
+ * 2. Cancelled buffers are recorded in pass 1 in a separate table and
+ * there's nothing to replay from them so we can simply cull them
+ * from the transaction. However, we can't do that until after we've
+ * replayed all the other items because they may be dependent on the
+ * cancelled buffer and replaying the cancelled buffer can remove it
+ * from the cancelled buffer table. Hence they have to be done last.
+ *
+ * 3. Inode allocation buffers must be replayed before inode items that
+ * read the buffer and replay changes into it. For filesystems using the
+ * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
+ * treated the same as inode allocation buffers as they create and
+ * initialise the buffers directly.
+ *
+ * 4. Inode unlink buffers must be replayed after inode items are replayed.
+ * This ensures that inodes are completely flushed to the inode buffer
+ * in a "free" state before we remove the unlinked inode list pointer.
+ *
+ * Hence the ordering needs to be inode allocation buffers first, inode items
+ * second, inode unlink buffers third and cancelled buffers last.
+ *
+ * But there's a problem with that - we can't tell an inode allocation buffer
+ * apart from a regular buffer, so we can't separate them. We can, however,
+ * tell an inode unlink buffer from the others, and so we can separate them out
+ * from all the other buffers and move them to last.
+ *
+ * Hence, 4 lists, in order from head to tail:
+ * - buffer_list for all buffers except cancelled/inode unlink buffers
+ * - inode_list for all non-buffer items
+ * - inode_buffer_list for inode unlink buffers
+ * - cancel_list for the cancelled buffers
+ *
+ * Note that we add objects to the tail of the lists so that first-to-last
+ * ordering is preserved within the lists. Adding objects to the head of the
+ * list means when we traverse from the head we walk them in last-to-first
+ * order. For cancelled buffers and inode unlink buffers this doesn't matter,
+ * but for all other items there may be specific ordering that we need to
+ * preserve.
*/
STATIC int
xlog_recover_reorder_trans(
@@ -1603,19 +1656,32 @@ xlog_recover_reorder_trans(
{
xlog_recover_item_t *item, *n;
LIST_HEAD(sort_list);
+ LIST_HEAD(cancel_list);
+ LIST_HEAD(buffer_list);
+ LIST_HEAD(inode_buffer_list);
+ LIST_HEAD(inode_list);
list_splice_init(&trans->r_itemq, &sort_list);
list_for_each_entry_safe(item, n, &sort_list, ri_list) {
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
switch (ITEM_TYPE(item)) {
+ case XFS_LI_ICREATE:
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
case XFS_LI_BUF:
- if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
+ if (buf_f->blf_flags & XFS_BLF_CANCEL) {
trace_xfs_log_recover_item_reorder_head(log,
trans, item, pass);
- list_move(&item->ri_list, &trans->r_itemq);
+ list_move(&item->ri_list, &cancel_list);
+ break;
+ }
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+ list_move(&item->ri_list, &inode_buffer_list);
break;
}
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
case XFS_LI_INODE:
case XFS_LI_DQUOT:
case XFS_LI_QUOTAOFF:
@@ -1623,7 +1689,7 @@ xlog_recover_reorder_trans(
case XFS_LI_EFI:
trace_xfs_log_recover_item_reorder_tail(log,
trans, item, pass);
- list_move_tail(&item->ri_list, &trans->r_itemq);
+ list_move_tail(&item->ri_list, &inode_list);
break;
default:
xfs_warn(log->l_mp,
@@ -1634,6 +1700,14 @@ xlog_recover_reorder_trans(
}
}
ASSERT(list_empty(&sort_list));
+ if (!list_empty(&buffer_list))
+ list_splice(&buffer_list, &trans->r_itemq);
+ if (!list_empty(&inode_list))
+ list_splice_tail(&inode_list, &trans->r_itemq);
+ if (!list_empty(&inode_buffer_list))
+ list_splice_tail(&inode_buffer_list, &trans->r_itemq);
+ if (!list_empty(&cancel_list))
+ list_splice_tail(&cancel_list, &trans->r_itemq);
return 0;
}
@@ -1786,6 +1860,13 @@ xlog_recover_do_inode_buffer(
trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
+ /*
+ * Post recovery validation only works properly on CRC enabled
+ * filesystems.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ bp->b_ops = &xfs_inode_buf_ops;
+
inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
for (i = 0; i < inodes_per_buf; i++) {
next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1851,12 +1932,216 @@ xlog_recover_do_inode_buffer(
buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
next_unlinked_offset);
*buffer_nextp = *logged_nextp;
+
+ /*
+ * If necessary, recalculate the CRC in the on-disk inode. We
+ * have to leave the inode in a consistent state for whoever
+ * reads it next....
+ */
+ xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+ xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+
}
return 0;
}
/*
+ * Validate the recovered buffer is of the correct type and attach the
+ * appropriate buffer operations to it for writeback. Magic numbers are in a
+ * few places:
+ * the first 16 bits of the buffer (inode buffer, dquot buffer),
+ * the first 32 bits of the buffer (most blocks),
+ * inside a struct xfs_da_blkinfo at the start of the buffer.
+ */
+static void
+xlog_recovery_validate_buf_type(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_buf_log_format_t *buf_f)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+ __uint32_t magic32;
+ __uint16_t magic16;
+ __uint16_t magicda;
+
+ magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
+ magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
+ magicda = be16_to_cpu(info->magic);
+ switch (xfs_blft_from_flags(buf_f)) {
+ case XFS_BLFT_BTREE_BUF:
+ switch (magic32) {
+ case XFS_ABTB_CRC_MAGIC:
+ case XFS_ABTC_CRC_MAGIC:
+ case XFS_ABTB_MAGIC:
+ case XFS_ABTC_MAGIC:
+ bp->b_ops = &xfs_allocbt_buf_ops;
+ break;
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC:
+ bp->b_ops = &xfs_inobt_buf_ops;
+ break;
+ case XFS_BMAP_CRC_MAGIC:
+ case XFS_BMAP_MAGIC:
+ bp->b_ops = &xfs_bmbt_buf_ops;
+ break;
+ default:
+ xfs_warn(mp, "Bad btree block magic!");
+ ASSERT(0);
+ break;
+ }
+ break;
+ case XFS_BLFT_AGF_BUF:
+ if (magic32 != XFS_AGF_MAGIC) {
+ xfs_warn(mp, "Bad AGF block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_agf_buf_ops;
+ break;
+ case XFS_BLFT_AGFL_BUF:
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ break;
+ if (magic32 != XFS_AGFL_MAGIC) {
+ xfs_warn(mp, "Bad AGFL block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_agfl_buf_ops;
+ break;
+ case XFS_BLFT_AGI_BUF:
+ if (magic32 != XFS_AGI_MAGIC) {
+ xfs_warn(mp, "Bad AGI block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_agi_buf_ops;
+ break;
+ case XFS_BLFT_UDQUOT_BUF:
+ case XFS_BLFT_PDQUOT_BUF:
+ case XFS_BLFT_GDQUOT_BUF:
+#ifdef CONFIG_XFS_QUOTA
+ if (magic16 != XFS_DQUOT_MAGIC) {
+ xfs_warn(mp, "Bad DQUOT block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dquot_buf_ops;
+#else
+ xfs_alert(mp,
+ "Trying to recover dquots without QUOTA support built in!");
+ ASSERT(0);
+#endif
+ break;
+ case XFS_BLFT_DINO_BUF:
+ /*
+ * we get here with inode allocation buffers, not buffers that
+ * track unlinked list changes.
+ */
+ if (magic16 != XFS_DINODE_MAGIC) {
+ xfs_warn(mp, "Bad INODE block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_inode_buf_ops;
+ break;
+ case XFS_BLFT_SYMLINK_BUF:
+ if (magic32 != XFS_SYMLINK_MAGIC) {
+ xfs_warn(mp, "Bad symlink block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_symlink_buf_ops;
+ break;
+ case XFS_BLFT_DIR_BLOCK_BUF:
+ if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
+ magic32 != XFS_DIR3_BLOCK_MAGIC) {
+ xfs_warn(mp, "Bad dir block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ break;
+ case XFS_BLFT_DIR_DATA_BUF:
+ if (magic32 != XFS_DIR2_DATA_MAGIC &&
+ magic32 != XFS_DIR3_DATA_MAGIC) {
+ xfs_warn(mp, "Bad dir data magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ break;
+ case XFS_BLFT_DIR_FREE_BUF:
+ if (magic32 != XFS_DIR2_FREE_MAGIC &&
+ magic32 != XFS_DIR3_FREE_MAGIC) {
+ xfs_warn(mp, "Bad dir3 free magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_free_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAF1_BUF:
+ if (magicda != XFS_DIR2_LEAF1_MAGIC &&
+ magicda != XFS_DIR3_LEAF1_MAGIC) {
+ xfs_warn(mp, "Bad dir leaf1 magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAFN_BUF:
+ if (magicda != XFS_DIR2_LEAFN_MAGIC &&
+ magicda != XFS_DIR3_LEAFN_MAGIC) {
+ xfs_warn(mp, "Bad dir leafn magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ break;
+ case XFS_BLFT_DA_NODE_BUF:
+ if (magicda != XFS_DA_NODE_MAGIC &&
+ magicda != XFS_DA3_NODE_MAGIC) {
+ xfs_warn(mp, "Bad da node magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_LEAF_BUF:
+ if (magicda != XFS_ATTR_LEAF_MAGIC &&
+ magicda != XFS_ATTR3_LEAF_MAGIC) {
+ xfs_warn(mp, "Bad attr leaf magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_RMT_BUF:
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ break;
+ if (magic32 != XFS_ATTR3_RMT_MAGIC) {
+ xfs_warn(mp, "Bad attr remote magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_attr3_rmt_buf_ops;
+ break;
+ case XFS_BLFT_SB_BUF:
+ if (magic32 != XFS_SB_MAGIC) {
+ xfs_warn(mp, "Bad SB block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_sb_buf_ops;
+ break;
+ default:
+ xfs_warn(mp, "Unknown buffer type %d!",
+ xfs_blft_from_flags(buf_f));
+ break;
+ }
+}
+
+/*
* Perform a 'normal' buffer recovery. Each logged region of the
* buffer should be copied over the corresponding region in the
* given buffer. The bitmap in the buf log format structure indicates
@@ -1892,6 +2177,17 @@ xlog_recover_do_reg_buffer(
((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
/*
+ * The dirty regions logged in the buffer, even though
+ * contiguous, may span multiple chunks. This is because the
+ * dirty region may span a physical page boundary in a buffer
+ * and hence be split into two separate vectors for writing into
+ * the log. Hence we need to trim nbits back to the length of
+ * the current region being copied out of the log.
+ */
+ if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
+ nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
+
+ /*
* Do a sanity check if this is a dquot buffer. Just checking
* the first dquot in the buffer should do. XXXThis is
* probably a good thing to do for other buf types also.
@@ -1928,6 +2224,17 @@ xlog_recover_do_reg_buffer(
/* Shouldn't be any more regions */
ASSERT(i == item->ri_total);
+
+ /*
+ * We can only do post recovery validation on items on CRC enabled
+ * filesystems as we need to know when the buffer was written to be able
+ * to determine if we should have replayed the item. If we replay old
+ * metadata over a newer buffer, then it will enter a temporarily
+ * inconsistent state resulting in verification failures. Hence for now
+ * just avoid the verification stage for non-crc filesystems
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xlog_recovery_validate_buf_type(mp, bp, buf_f);
}
/*
@@ -2048,6 +2355,12 @@ xfs_qm_dqcheck(
d->dd_diskdq.d_flags = type;
d->dd_diskdq.d_id = cpu_to_be32(id);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+
return errs;
}
@@ -2213,6 +2526,7 @@ xlog_recover_inode_pass2(
int attr_index;
uint fields;
xfs_icdinode_t *dicp;
+ uint isize;
int need_free = 0;
if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
@@ -2238,7 +2552,7 @@ xlog_recover_inode_pass2(
trace_xfs_log_recover_inode_recover(log, in_f);
bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
- NULL);
+ &xfs_inode_buf_ops);
if (!bp) {
error = ENOMEM;
goto error;
@@ -2349,7 +2663,8 @@ xlog_recover_inode_pass2(
error = EFSCORRUPTED;
goto error;
}
- if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
+ isize = xfs_icdinode_size(dicp->di_version);
+ if (unlikely(item->ri_buf[1].i_len > isize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
XFS_ERRLEVEL_LOW, mp, dicp);
xfs_buf_relse(bp);
@@ -2361,13 +2676,13 @@ xlog_recover_inode_pass2(
}
/* The core is in in-core format */
- xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
+ xfs_dinode_to_disk(dip, dicp);
/* the rest is in on-disk format */
- if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
- memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
- item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
- item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
+ if (item->ri_buf[1].i_len > isize) {
+ memcpy((char *)dip + isize,
+ item->ri_buf[1].i_addr + isize,
+ item->ri_buf[1].i_len - isize);
}
fields = in_f->ilf_fields;
@@ -2451,6 +2766,9 @@ xlog_recover_inode_pass2(
}
write_inode_buffer:
+ /* re-generate the checksum. */
+ xfs_dinode_calc_crc(log->l_mp, dip);
+
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
xfs_buf_delwri_queue(bp, buffer_list);
@@ -2570,6 +2888,10 @@ xlog_recover_dquot_pass2(
}
memcpy(ddq, recddq, item->ri_buf[1].i_len);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
ASSERT(dq_f->qlf_size == 2);
ASSERT(bp->b_target->bt_mount == mp);
@@ -2674,6 +2996,93 @@ xlog_recover_efd_pass2(
}
/*
+ * This routine is called when an inode create format structure is found in a
+ * committed transaction in the log. Its purpose is to initialise the inodes
+ * being allocated on disk. This requires us to get inode cluster buffers that
+ * match the range to be initialised, stamped with inode templates and written
+ * by delayed write so that subsequent modifications will hit the cached buffer
+ * and only need writing out at the end of recovery.
+ */
+STATIC int
+xlog_recover_do_icreate_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ xlog_recover_item_t *item)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_icreate_log *icl;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ unsigned int count;
+ unsigned int isize;
+ xfs_agblock_t length;
+
+ icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+ if (icl->icl_type != XFS_LI_ICREATE) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
+ return EINVAL;
+ }
+
+ if (icl->icl_size != 1) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
+ return EINVAL;
+ }
+
+ agno = be32_to_cpu(icl->icl_ag);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
+ return EINVAL;
+ }
+ agbno = be32_to_cpu(icl->icl_agbno);
+ if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
+ return EINVAL;
+ }
+ isize = be32_to_cpu(icl->icl_isize);
+ if (isize != mp->m_sb.sb_inodesize) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
+ return EINVAL;
+ }
+ count = be32_to_cpu(icl->icl_count);
+ if (!count) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
+ return EINVAL;
+ }
+ length = be32_to_cpu(icl->icl_length);
+ if (!length || length >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
+ return EINVAL;
+ }
+
+ /* existing allocation is fixed value */
+ ASSERT(count == XFS_IALLOC_INODES(mp));
+ ASSERT(length == XFS_IALLOC_BLOCKS(mp));
+ if (count != XFS_IALLOC_INODES(mp) ||
+ length != XFS_IALLOC_BLOCKS(mp)) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+ return EINVAL;
+ }
+
+ /*
+ * Inode buffers can be freed. Do not replay the inode initialisation as
+ * we could be overwriting something written after this inode buffer was
+ * cancelled.
+ *
+ * XXX: we need to iterate all buffers and only init those that are not
+ * cancelled. I think that a more fine grained factoring of
+ * xfs_ialloc_inode_init may be appropriate here to enable this to be
+ * done easily.
+ */
+ if (xlog_check_buffer_cancelled(log,
+ XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
+ return 0;
+
+ xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
+ be32_to_cpu(icl->icl_gen));
+ return 0;
+}
+
+/*
* Free up any resources allocated by the transaction
*
* Remember that EFIs, EFDs, and IUNLINKs are handled later.
@@ -2715,6 +3124,7 @@ xlog_recover_commit_pass1(
case XFS_LI_EFI:
case XFS_LI_EFD:
case XFS_LI_DQUOT:
+ case XFS_LI_ICREATE:
/* nothing to do in pass 1 */
return 0;
default:
@@ -2745,6 +3155,8 @@ xlog_recover_commit_pass2(
return xlog_recover_efd_pass2(log, item);
case XFS_LI_DQUOT:
return xlog_recover_dquot_pass2(log, buffer_list, item);
+ case XFS_LI_ICREATE:
+ return xlog_recover_do_icreate_pass2(log, buffer_list, item);
case XFS_LI_QUOTAOFF:
/* nothing to do in pass2 */
return 0;
@@ -2948,6 +3360,7 @@ xlog_recover_process_efi(
* This will pull the EFI from the AIL and
* free the memory associated with it.
*/
+ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
xfs_efi_release(efip, efip->efi_format.efi_nextents);
return XFS_ERROR(EIO);
}
@@ -3751,6 +4164,25 @@ xlog_recover(
return error;
}
+ /*
+ * Version 5 superblock log feature mask validation. We know the
+ * log is dirty so check if there are any unknown log features
+ * in what we need to recover. If there are unknown features
+ * (e.g. unsupported transactions), then simply reject the
+ * attempt at recovery before touching anything.
+ */
+ if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
+ xfs_warn(log->l_mp,
+"Superblock has unknown incompatible log features (0x%x) enabled.\n"
+"The log can not be fully and/or safely recovered by this kernel.\n"
+"Please recover the log on a kernel that supports the unknown features.",
+ (log->l_mp->m_sb.sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+ return EINVAL;
+ }
+
xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
log->l_mp->m_logname ? log->l_mp->m_logname
: "internal");
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 331cd9f83a7f..9163dc140532 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -93,6 +93,14 @@ xfs_alert_tag(
}
void
+asswarn(char *expr, char *file, int line)
+{
+ xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d",
+ expr, file, line);
+ WARN_ON(1);
+}
+
+void
assfail(char *expr, char *file, int line)
{
xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 56dc0c17f16a..85401155750e 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -30,7 +30,34 @@ void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
}
#endif
+#define xfs_printk_ratelimited(func, dev, fmt, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) \
+ func(dev, fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define xfs_emerg_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__)
+#define xfs_alert_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_alert, dev, fmt, ##__VA_ARGS__)
+#define xfs_crit_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_crit, dev, fmt, ##__VA_ARGS__)
+#define xfs_err_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_err, dev, fmt, ##__VA_ARGS__)
+#define xfs_warn_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_warn, dev, fmt, ##__VA_ARGS__)
+#define xfs_notice_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_notice, dev, fmt, ##__VA_ARGS__)
+#define xfs_info_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_info, dev, fmt, ##__VA_ARGS__)
+#define xfs_debug_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__)
+
extern void assfail(char *expr, char *f, int l);
+extern void asswarn(char *expr, char *f, int l);
extern void xfs_hex_dump(void *p, int length);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3806088a8f77..2b0ba3581656 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,6 +43,8 @@
#include "xfs_utils.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
#ifdef HAVE_PERCPU_SB
@@ -109,6 +111,14 @@ static const struct {
{ offsetof(xfs_sb_t, sb_logsunit), 0 },
{ offsetof(xfs_sb_t, sb_features2), 0 },
{ offsetof(xfs_sb_t, sb_bad_features2), 0 },
+ { offsetof(xfs_sb_t, sb_features_compat), 0 },
+ { offsetof(xfs_sb_t, sb_features_ro_compat), 0 },
+ { offsetof(xfs_sb_t, sb_features_incompat), 0 },
+ { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
+ { offsetof(xfs_sb_t, sb_crc), 0 },
+ { offsetof(xfs_sb_t, sb_pad), 0 },
+ { offsetof(xfs_sb_t, sb_pquotino), 0 },
+ { offsetof(xfs_sb_t, sb_lsn), 0 },
{ sizeof(xfs_sb_t), 0 }
};
@@ -304,7 +314,8 @@ STATIC int
xfs_mount_validate_sb(
xfs_mount_t *mp,
xfs_sb_t *sbp,
- bool check_inprogress)
+ bool check_inprogress,
+ bool check_version)
{
/*
@@ -319,11 +330,63 @@ xfs_mount_validate_sb(
return XFS_ERROR(EWRONGFS);
}
+
if (!xfs_sb_good_version(sbp)) {
xfs_warn(mp, "bad version");
return XFS_ERROR(EWRONGFS);
}
+ if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) &&
+ (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+ XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) {
+ xfs_notice(mp,
+"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n");
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ /*
+ * Version 5 superblock feature mask validation. Reject combinations the
+ * kernel cannot support up front before checking anything else. For
+ * write validation, we don't need to check feature masks.
+ */
+ if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
+ xfs_alert(mp,
+"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
+"Use of these features in this kernel is at your own risk!");
+
+ if (xfs_sb_has_compat_feature(sbp,
+ XFS_SB_FEAT_COMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"Superblock has unknown compatible features (0x%x) enabled.\n"
+"Using a more recent kernel is recommended.",
+ (sbp->sb_features_compat &
+ XFS_SB_FEAT_COMPAT_UNKNOWN));
+ }
+
+ if (xfs_sb_has_ro_compat_feature(sbp,
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ xfs_alert(mp,
+"Superblock has unknown read-only compatible features (0x%x) enabled.",
+ (sbp->sb_features_ro_compat &
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ xfs_warn(mp,
+"Attempted to mount read-only compatible filesystem read-write.\n"
+"Filesystem can only be safely mounted read only.");
+ return XFS_ERROR(EINVAL);
+ }
+ }
+ if (xfs_sb_has_incompat_feature(sbp,
+ XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"Superblock has unknown incompatible features (0x%x) enabled.\n"
+"Filesystem can not be safely mounted by this kernel.",
+ (sbp->sb_features_incompat &
+ XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+ return XFS_ERROR(EINVAL);
+ }
+ }
+
if (unlikely(
sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
xfs_warn(mp,
@@ -506,6 +569,18 @@ out_unwind:
return error;
}
+static void
+xfs_sb_quota_from_disk(struct xfs_sb *sbp)
+{
+ if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
+ sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+ XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
+ if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
+ sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+ XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
+ sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
+}
+
void
xfs_sb_from_disk(
struct xfs_sb *to,
@@ -557,6 +632,43 @@ xfs_sb_from_disk(
to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
to->sb_features2 = be32_to_cpu(from->sb_features2);
to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
+ to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
+ to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
+ to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
+ to->sb_features_log_incompat =
+ be32_to_cpu(from->sb_features_log_incompat);
+ to->sb_pad = 0;
+ to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
+ to->sb_lsn = be64_to_cpu(from->sb_lsn);
+}
+
+static inline void
+xfs_sb_quota_to_disk(
+ xfs_dsb_t *to,
+ xfs_sb_t *from,
+ __int64_t *fields)
+{
+ __uint16_t qflags = from->sb_qflags;
+
+ if (*fields & XFS_SB_QFLAGS) {
+ /*
+ * The in-core version of sb_qflags does not have
+ * XFS_OQUOTA_* flags, whereas the on-disk version
+ * does. So, convert incore XFS_{PG}QUOTA_* flags
+ * to on-disk XFS_OQUOTA_* flags.
+ */
+ qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+ XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
+
+ if (from->sb_qflags &
+ (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+ qflags |= XFS_OQUOTA_ENFD;
+ if (from->sb_qflags &
+ (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+ qflags |= XFS_OQUOTA_CHKD;
+ to->sb_qflags = cpu_to_be16(qflags);
+ *fields &= ~XFS_SB_QFLAGS;
+ }
}
/*
@@ -580,6 +692,7 @@ xfs_sb_to_disk(
if (!fields)
return;
+ xfs_sb_quota_to_disk(to, from, &fields);
while (fields) {
f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
first = xfs_sb_info[f].offset;
@@ -612,13 +725,13 @@ xfs_sb_to_disk(
}
}
-static void
+static int
xfs_sb_verify(
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ bool check_version)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_sb sb;
- int error;
xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
@@ -626,16 +739,47 @@ xfs_sb_verify(
* Only check the in progress field for the primary superblock as
* mkfs.xfs doesn't clear it from secondary superblocks.
*/
- error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
- if (error)
- xfs_buf_ioerror(bp, error);
+ return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
+ check_version);
}
+/*
+ * If the superblock has the CRC feature bit set or the CRC field is non-null,
+ * check that the CRC is valid. We check the CRC field is non-null because a
+ * single bit error could clear the feature bit and unused parts of the
+ * superblock are supposed to be zero. Hence a non-null crc field indicates that
+ * we've potentially lost a feature bit and we should check it anyway.
+ */
static void
xfs_sb_read_verify(
struct xfs_buf *bp)
{
- xfs_sb_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ int error;
+
+ /*
+ * open code the version check to avoid needing to convert the entire
+ * superblock from disk order just to check the version number
+ */
+ if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
+ (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
+ XFS_SB_VERSION_5) ||
+ dsb->sb_crc != 0)) {
+
+ if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
+ offsetof(struct xfs_sb, sb_crc))) {
+ error = EFSCORRUPTED;
+ goto out_error;
+ }
+ }
+ error = xfs_sb_verify(bp, true);
+
+out_error:
+ if (error) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, error);
+ }
}
/*
@@ -648,11 +792,10 @@ static void
xfs_sb_quiet_read_verify(
struct xfs_buf *bp)
{
- struct xfs_sb sb;
+ struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
- xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
- if (sb.sb_magicnum == XFS_SB_MAGIC) {
+ if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
/* XFS filesystem, verify noisily! */
xfs_sb_read_verify(bp);
return;
@@ -663,9 +806,27 @@ xfs_sb_quiet_read_verify(
static void
xfs_sb_write_verify(
- struct xfs_buf *bp)
+ struct xfs_buf *bp)
{
- xfs_sb_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ int error;
+
+ error = xfs_sb_verify(bp, false);
+ if (error) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, error);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+ offsetof(struct xfs_sb, sb_crc));
}
const struct xfs_buf_ops xfs_sb_buf_ops = {
@@ -687,7 +848,8 @@ int
xfs_readsb(xfs_mount_t *mp, int flags)
{
unsigned int sector_size;
- xfs_buf_t *bp;
+ struct xfs_buf *bp;
+ struct xfs_sb *sbp = &mp->m_sb;
int error;
int loud = !(flags & XFS_MFSI_QUIET);
@@ -714,7 +876,7 @@ reread:
if (bp->b_error) {
error = bp->b_error;
if (loud)
- xfs_warn(mp, "SB validate failed");
+ xfs_warn(mp, "SB validate failed with error %d.", error);
goto release_buf;
}
@@ -723,13 +885,14 @@ reread:
*/
xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
+ xfs_sb_quota_from_disk(&mp->m_sb);
/*
* We must be able to do sector-sized and sector-aligned IO.
*/
- if (sector_size > mp->m_sb.sb_sectsize) {
+ if (sector_size > sbp->sb_sectsize) {
if (loud)
xfs_warn(mp, "device supports %u byte sectors (not %u)",
- sector_size, mp->m_sb.sb_sectsize);
+ sector_size, sbp->sb_sectsize);
error = ENOSYS;
goto release_buf;
}
@@ -738,15 +901,18 @@ reread:
* If device sector size is smaller than the superblock size,
* re-read the superblock so the buffer is correctly sized.
*/
- if (sector_size < mp->m_sb.sb_sectsize) {
+ if (sector_size < sbp->sb_sectsize) {
xfs_buf_relse(bp);
- sector_size = mp->m_sb.sb_sectsize;
+ sector_size = sbp->sb_sectsize;
goto reread;
}
/* Initialize per-cpu counters */
xfs_icsb_reinit_counters(mp);
+ /* no need to be quiet anymore, so reset the buf ops */
+ bp->b_ops = &xfs_sb_buf_ops;
+
mp->m_sb_bp = bp;
xfs_buf_unlock(bp);
return 0;
@@ -872,42 +1038,27 @@ xfs_update_alignment(xfs_mount_t *mp)
*/
if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
(BBTOB(mp->m_swidth) & mp->m_blockmask)) {
- if (mp->m_flags & XFS_MOUNT_RETERR) {
- xfs_warn(mp, "alignment check failed: "
- "(sunit/swidth vs. blocksize)");
- return XFS_ERROR(EINVAL);
- }
- mp->m_dalign = mp->m_swidth = 0;
+ xfs_warn(mp,
+ "alignment check failed: sunit/swidth vs. blocksize(%d)",
+ sbp->sb_blocksize);
+ return XFS_ERROR(EINVAL);
} else {
/*
* Convert the stripe unit and width to FSBs.
*/
mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
- if (mp->m_flags & XFS_MOUNT_RETERR) {
- xfs_warn(mp, "alignment check failed: "
- "(sunit/swidth vs. ag size)");
- return XFS_ERROR(EINVAL);
- }
xfs_warn(mp,
- "stripe alignment turned off: sunit(%d)/swidth(%d) "
- "incompatible with agsize(%d)",
- mp->m_dalign, mp->m_swidth,
- sbp->sb_agblocks);
-
- mp->m_dalign = 0;
- mp->m_swidth = 0;
+ "alignment check failed: sunit/swidth vs. agsize(%d)",
+ sbp->sb_agblocks);
+ return XFS_ERROR(EINVAL);
} else if (mp->m_dalign) {
mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
} else {
- if (mp->m_flags & XFS_MOUNT_RETERR) {
- xfs_warn(mp, "alignment check failed: "
- "sunit(%d) less than bsize(%d)",
- mp->m_dalign,
- mp->m_blockmask +1);
- return XFS_ERROR(EINVAL);
- }
- mp->m_swidth = 0;
+ xfs_warn(mp,
+ "alignment check failed: sunit(%d) less than bsize(%d)",
+ mp->m_dalign, sbp->sb_blocksize);
+ return XFS_ERROR(EINVAL);
}
}
@@ -924,6 +1075,10 @@ xfs_update_alignment(xfs_mount_t *mp)
sbp->sb_width = mp->m_swidth;
mp->m_update_flags |= XFS_SB_WIDTH;
}
+ } else {
+ xfs_warn(mp,
+ "cannot change alignment: superblock does not support data alignment");
+ return XFS_ERROR(EINVAL);
}
} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
xfs_sb_version_hasdalign(&mp->m_sb)) {
@@ -1633,6 +1788,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
ASSERT((1LL << f) & XFS_SB_MOD_BITS);
first = xfs_sb_info[f].offset;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
xfs_trans_log_buf(tp, bp, first, last);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index bc907061d392..4e374d4a9189 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -192,8 +192,6 @@ typedef struct xfs_mount {
xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */
xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */
uint m_chsize; /* size of next field */
- struct xfs_chash *m_chash; /* fs private inode per-cluster
- * hash table */
atomic_t m_active_trans; /* number trans frozen */
#ifdef HAVE_PERCPU_SB
xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
@@ -207,7 +205,6 @@ typedef struct xfs_mount {
trimming */
__int64_t m_update_flags; /* sb flags we need to update
on the next remount,rw */
- struct shrinker m_inode_shrink; /* inode reclaim shrinker */
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
@@ -230,8 +227,6 @@ typedef struct xfs_mount {
operations, typically for
disk errors in metadata */
#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
-#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
- user */
#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
allocations */
#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
@@ -392,6 +387,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
#endif /* __KERNEL__ */
+extern void xfs_sb_calc_crc(struct xfs_buf *);
extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
xfs_agnumber_t *);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index e5b5cf973781..7a3e007b49f4 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -41,6 +41,7 @@
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_cksum.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -69,7 +70,7 @@ xfs_qm_dquot_walk(
void *data)
{
struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
uint32_t next_index;
int last_error = 0;
int skipped;
@@ -188,7 +189,7 @@ xfs_qm_dqpurge(
xfs_dqfunlock(dqp);
xfs_dqunlock(dqp);
- radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+ radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
be32_to_cpu(dqp->q_core.d_id));
qi->qi_dquots--;
@@ -298,8 +299,10 @@ xfs_qm_mount_quotas(
*/
if (!XFS_IS_UQUOTA_ON(mp))
mp->m_qflags &= ~XFS_UQUOTA_CHKD;
- if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
- mp->m_qflags &= ~XFS_OQUOTA_CHKD;
+ if (!XFS_IS_GQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_GQUOTA_CHKD;
+ if (!XFS_IS_PQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_PQUOTA_CHKD;
write_changes:
/*
@@ -488,8 +491,7 @@ xfs_qm_need_dqattach(
return false;
if (!XFS_NOT_DQATTACHED(mp, ip))
return false;
- if (ip->i_ino == mp->m_sb.sb_uquotino ||
- ip->i_ino == mp->m_sb.sb_gquotino)
+ if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
return false;
return true;
}
@@ -605,8 +607,7 @@ xfs_qm_dqdetach(
trace_xfs_dquot_dqdetach(ip);
- ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
- ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
+ ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino));
if (ip->i_udquot) {
xfs_qm_dqrele(ip->i_udquot);
ip->i_udquot = NULL;
@@ -617,6 +618,20 @@ xfs_qm_dqdetach(
}
}
+int
+xfs_qm_calc_dquots_per_chunk(
+ struct xfs_mount *mp,
+ unsigned int nbblks) /* basic block units */
+{
+ unsigned int ndquots;
+
+ ASSERT(nbblks > 0);
+ ndquots = BBTOB(nbblks);
+ do_div(ndquots, sizeof(xfs_dqblk_t));
+
+ return ndquots;
+}
+
/*
* This initializes all the quota information that's kept in the
* mount structure
@@ -656,9 +671,8 @@ xfs_qm_init_quotainfo(
/* Precalc some constants */
qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
- ASSERT(qinf->qi_dqchunklen);
- qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen);
- do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t));
+ qinf->qi_dqperchunk = xfs_qm_calc_dquots_per_chunk(mp,
+ qinf->qi_dqchunklen);
mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
@@ -826,7 +840,7 @@ xfs_qm_reset_dqcounts(
xfs_dqid_t id,
uint type)
{
- xfs_disk_dquot_t *ddq;
+ struct xfs_dqblk *dqb;
int j;
trace_xfs_reset_dqcounts(bp, _RET_IP_);
@@ -840,8 +854,12 @@ xfs_qm_reset_dqcounts(
do_div(j, sizeof(xfs_dqblk_t));
ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
#endif
- ddq = bp->b_addr;
+ dqb = bp->b_addr;
for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
+ struct xfs_disk_dquot *ddq;
+
+ ddq = (struct xfs_disk_dquot *)&dqb[j];
+
/*
* Do a sanity check, and if needed, repair the dqblk. Don't
* output any warnings because it's perfectly possible to
@@ -858,7 +876,12 @@ xfs_qm_reset_dqcounts(
ddq->d_bwarns = 0;
ddq->d_iwarns = 0;
ddq->d_rtbwarns = 0;
- ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ xfs_update_cksum((char *)&dqb[j],
+ sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
}
}
@@ -894,15 +917,29 @@ xfs_qm_dqiter_bufs(
XFS_FSB_TO_DADDR(mp, bno),
mp->m_quotainfo->qi_dqchunklen, 0, &bp,
&xfs_dquot_buf_ops);
+
+ /*
+ * CRC and validation errors will return a EFSCORRUPTED here. If
+ * this occurs, re-read without CRC validation so that we can
+ * repair the damage via xfs_qm_reset_dqcounts(). This process
+ * will leave a trace in the log indicating corruption has
+ * been detected.
+ */
+ if (error == EFSCORRUPTED) {
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, bno),
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+ NULL);
+ }
+
if (error)
break;
xfs_qm_reset_dqcounts(mp, bp, firstid, type);
xfs_buf_delwri_queue(bp, buffer_list);
xfs_buf_relse(bp);
- /*
- * goto the next block.
- */
+
+ /* goto the next block. */
bno++;
firstid += mp->m_quotainfo->qi_dqperchunk;
}
@@ -1057,7 +1094,7 @@ xfs_qm_quotacheck_dqadjust(
* There are no timers for the default values set in the root dquot.
*/
if (dqp->q_core.d_id) {
- xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
+ xfs_qm_adjust_dqlimits(mp, dqp);
xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
}
@@ -1115,7 +1152,7 @@ xfs_qm_dqusage_adjust(
* rootino must have its resources accounted for, not so with the quota
* inodes.
*/
- if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
+ if (xfs_is_quota_inode(&mp->m_sb, ino)) {
*res = BULKSTAT_RV_NOTHING;
return XFS_ERROR(EINVAL);
}
@@ -1225,19 +1262,20 @@ int
xfs_qm_quotacheck(
xfs_mount_t *mp)
{
- int done, count, error, error2;
- xfs_ino_t lastino;
- size_t structsz;
- xfs_inode_t *uip, *gip;
- uint flags;
- LIST_HEAD (buffer_list);
+ int done, count, error, error2;
+ xfs_ino_t lastino;
+ size_t structsz;
+ uint flags;
+ LIST_HEAD (buffer_list);
+ struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip;
+ struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip;
count = INT_MAX;
structsz = 1;
lastino = 0;
flags = 0;
- ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
+ ASSERT(uip || gip);
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
xfs_notice(mp, "Quotacheck needed: Please wait.");
@@ -1247,7 +1285,6 @@ xfs_qm_quotacheck(
* their counters to zero. We need a clean slate.
* We don't log our changes till later.
*/
- uip = mp->m_quotainfo->qi_uquotaip;
if (uip) {
error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
&buffer_list);
@@ -1256,14 +1293,14 @@ xfs_qm_quotacheck(
flags |= XFS_UQUOTA_CHKD;
}
- gip = mp->m_quotainfo->qi_gquotaip;
if (gip) {
error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
&buffer_list);
if (error)
goto error_return;
- flags |= XFS_OQUOTA_CHKD;
+ flags |= XFS_IS_GQUOTA_ON(mp) ?
+ XFS_GQUOTA_CHKD : XFS_PQUOTA_CHKD;
}
do {
@@ -1358,15 +1395,13 @@ STATIC int
xfs_qm_init_quotainos(
xfs_mount_t *mp)
{
- xfs_inode_t *uip, *gip;
- int error;
- __int64_t sbflags;
- uint flags;
+ struct xfs_inode *uip = NULL;
+ struct xfs_inode *gip = NULL;
+ int error;
+ __int64_t sbflags = 0;
+ uint flags = 0;
ASSERT(mp->m_quotainfo);
- uip = gip = NULL;
- sbflags = 0;
- flags = 0;
/*
* Get the uquota and gquota inodes
@@ -1375,19 +1410,18 @@ xfs_qm_init_quotainos(
if (XFS_IS_UQUOTA_ON(mp) &&
mp->m_sb.sb_uquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_uquotino > 0);
- if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
- 0, 0, &uip)))
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+ 0, 0, &uip);
+ if (error)
return XFS_ERROR(error);
}
if (XFS_IS_OQUOTA_ON(mp) &&
mp->m_sb.sb_gquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_gquotino > 0);
- if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
- 0, 0, &gip))) {
- if (uip)
- IRELE(uip);
- return XFS_ERROR(error);
- }
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+ 0, 0, &gip);
+ if (error)
+ goto error_rele;
}
} else {
flags |= XFS_QMOPT_SBVERSION;
@@ -1402,10 +1436,11 @@ xfs_qm_init_quotainos(
* temporarily switch to read-write to do this.
*/
if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
- if ((error = xfs_qm_qino_alloc(mp, &uip,
+ error = xfs_qm_qino_alloc(mp, &uip,
sbflags | XFS_SB_UQUOTINO,
- flags | XFS_QMOPT_UQUOTA)))
- return XFS_ERROR(error);
+ flags | XFS_QMOPT_UQUOTA);
+ if (error)
+ goto error_rele;
flags &= ~XFS_QMOPT_SBVERSION;
}
@@ -1414,18 +1449,21 @@ xfs_qm_init_quotainos(
XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
error = xfs_qm_qino_alloc(mp, &gip,
sbflags | XFS_SB_GQUOTINO, flags);
- if (error) {
- if (uip)
- IRELE(uip);
-
- return XFS_ERROR(error);
- }
+ if (error)
+ goto error_rele;
}
mp->m_quotainfo->qi_uquotaip = uip;
mp->m_quotainfo->qi_gquotaip = gip;
return 0;
+
+error_rele:
+ if (uip)
+ IRELE(uip);
+ if (gip)
+ IRELE(gip);
+ return XFS_ERROR(error);
}
STATIC void
@@ -1436,7 +1474,7 @@ xfs_qm_dqfree_one(
struct xfs_quotainfo *qi = mp->m_quotainfo;
mutex_lock(&qi->qi_tree_lock);
- radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+ radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
be32_to_cpu(dqp->q_core.d_id));
qi->qi_dquots--;
@@ -1622,7 +1660,8 @@ xfs_qm_vop_dqalloc(
struct xfs_dquot **O_gdqpp)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_dquot *uq, *gq;
+ struct xfs_dquot *uq = NULL;
+ struct xfs_dquot *gq = NULL;
int error;
uint lockflags;
@@ -1647,7 +1686,6 @@ xfs_qm_vop_dqalloc(
}
}
- uq = gq = NULL;
if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
if (ip->i_d.di_uid != uid) {
/*
@@ -1660,11 +1698,12 @@ xfs_qm_vop_dqalloc(
* holding ilock.
*/
xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+ error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
XFS_DQ_USER,
XFS_QMOPT_DQALLOC |
XFS_QMOPT_DOWARN,
- &uq))) {
+ &uq);
+ if (error) {
ASSERT(error != ENOENT);
return error;
}
@@ -1686,15 +1725,14 @@ xfs_qm_vop_dqalloc(
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
if (ip->i_d.di_gid != gid) {
xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+ error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
XFS_DQ_GROUP,
XFS_QMOPT_DQALLOC |
XFS_QMOPT_DOWARN,
- &gq))) {
- if (uq)
- xfs_qm_dqrele(uq);
+ &gq);
+ if (error) {
ASSERT(error != ENOENT);
- return error;
+ goto error_rele;
}
xfs_dqunlock(gq);
lockflags = XFS_ILOCK_SHARED;
@@ -1706,15 +1744,14 @@ xfs_qm_vop_dqalloc(
} else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
if (xfs_get_projid(ip) != prid) {
xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
+ error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
XFS_DQ_PROJ,
XFS_QMOPT_DQALLOC |
XFS_QMOPT_DOWARN,
- &gq))) {
- if (uq)
- xfs_qm_dqrele(uq);
+ &gq);
+ if (error) {
ASSERT(error != ENOENT);
- return (error);
+ goto error_rele;
}
xfs_dqunlock(gq);
lockflags = XFS_ILOCK_SHARED;
@@ -1737,6 +1774,11 @@ xfs_qm_vop_dqalloc(
else if (gq)
xfs_qm_dqrele(gq);
return 0;
+
+error_rele:
+ if (uq)
+ xfs_qm_dqrele(uq);
+ return error;
}
/*
@@ -1784,29 +1826,31 @@ xfs_qm_vop_chown(
*/
int
xfs_qm_vop_chown_reserve(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_dquot_t *udqp,
- xfs_dquot_t *gdqp,
- uint flags)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ uint flags)
{
- xfs_mount_t *mp = ip->i_mount;
- uint delblks, blkflags, prjflags = 0;
- xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
- int error;
+ struct xfs_mount *mp = ip->i_mount;
+ uint delblks, blkflags, prjflags = 0;
+ struct xfs_dquot *udq_unres = NULL;
+ struct xfs_dquot *gdq_unres = NULL;
+ struct xfs_dquot *udq_delblks = NULL;
+ struct xfs_dquot *gdq_delblks = NULL;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
delblks = ip->i_delayed_blks;
- delblksudq = delblksgdq = unresudq = unresgdq = NULL;
blkflags = XFS_IS_REALTIME_INODE(ip) ?
XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
if (XFS_IS_UQUOTA_ON(mp) && udqp &&
ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
- delblksudq = udqp;
+ udq_delblks = udqp;
/*
* If there are delayed allocation blocks, then we have to
* unreserve those from the old dquot, and add them to the
@@ -1814,7 +1858,7 @@ xfs_qm_vop_chown_reserve(
*/
if (delblks) {
ASSERT(ip->i_udquot);
- unresudq = ip->i_udquot;
+ udq_unres = ip->i_udquot;
}
}
if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
@@ -1825,18 +1869,19 @@ xfs_qm_vop_chown_reserve(
if (prjflags ||
(XFS_IS_GQUOTA_ON(ip->i_mount) &&
ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
- delblksgdq = gdqp;
+ gdq_delblks = gdqp;
if (delblks) {
ASSERT(ip->i_gdquot);
- unresgdq = ip->i_gdquot;
+ gdq_unres = ip->i_gdquot;
}
}
}
- if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
- delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
- flags | blkflags | prjflags)))
- return (error);
+ error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
+ udq_delblks, gdq_delblks, ip->i_d.di_nblocks, 1,
+ flags | blkflags | prjflags);
+ if (error)
+ return error;
/*
* Do the delayed blks reservations/unreservations now. Since, these
@@ -1848,14 +1893,15 @@ xfs_qm_vop_chown_reserve(
/*
* Do the reservations first. Unreservation can't fail.
*/
- ASSERT(delblksudq || delblksgdq);
- ASSERT(unresudq || unresgdq);
- if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
- flags | blkflags | prjflags)))
- return (error);
+ ASSERT(udq_delblks || gdq_delblks);
+ ASSERT(udq_unres || gdq_unres);
+ error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+ udq_delblks, gdq_delblks, (xfs_qcnt_t)delblks, 0,
+ flags | blkflags | prjflags);
+ if (error)
+ return error;
xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
+ udq_unres, gdq_unres, -((xfs_qcnt_t)delblks), 0,
blkflags);
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 44b858b79d71..bdb4f8b95207 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -69,28 +69,62 @@ typedef struct xfs_quotainfo {
struct shrinker qi_shrinker;
} xfs_quotainfo_t;
-#define XFS_DQUOT_TREE(qi, type) \
- ((type & XFS_DQ_USER) ? \
- &((qi)->qi_uquota_tree) : \
- &((qi)->qi_gquota_tree))
+static inline struct radix_tree_root *
+xfs_dquot_tree(
+ struct xfs_quotainfo *qi,
+ int type)
+{
+ switch (type) {
+ case XFS_DQ_USER:
+ return &qi->qi_uquota_tree;
+ case XFS_DQ_GROUP:
+ case XFS_DQ_PROJ:
+ return &qi->qi_gquota_tree;
+ default:
+ ASSERT(0);
+ }
+ return NULL;
+}
+static inline struct xfs_inode *
+xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
+{
+ switch (dqp->dq_flags & XFS_DQ_ALLTYPES) {
+ case XFS_DQ_USER:
+ return dqp->q_mount->m_quotainfo->qi_uquotaip;
+ case XFS_DQ_GROUP:
+ case XFS_DQ_PROJ:
+ return dqp->q_mount->m_quotainfo->qi_gquotaip;
+ default:
+ ASSERT(0);
+ }
+ return NULL;
+}
-extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
-extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
- xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
-extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
-extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
+extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp,
+ unsigned int nbblks);
+extern void xfs_trans_mod_dquot(struct xfs_trans *,
+ struct xfs_dquot *, uint, long);
+extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+ struct xfs_mount *, struct xfs_dquot *,
+ struct xfs_dquot *, long, long, uint);
+extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
+extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
/*
* We keep the usr and grp dquots separately so that locking will be easier
* to do at commit time. All transactions that we know of at this point
* affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
*/
+enum {
+ XFS_QM_TRANS_USR = 0,
+ XFS_QM_TRANS_GRP,
+ XFS_QM_TRANS_DQTYPES
+};
#define XFS_QM_TRANS_MAXDQS 2
-typedef struct xfs_dquot_acct {
- xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
- xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
-} xfs_dquot_acct_t;
+struct xfs_dquot_acct {
+ struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
+};
/*
* Users are allowed to have a usage exceeding their softlimit for
@@ -104,22 +138,23 @@ typedef struct xfs_dquot_acct {
#define XFS_QM_IWARNLIMIT 5
#define XFS_QM_RTBWARNLIMIT 5
-extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern int xfs_qm_quotacheck(xfs_mount_t *);
-extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
+extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
+extern int xfs_qm_quotacheck(struct xfs_mount *);
+extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
/* dquot stuff */
-extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint);
-extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
+extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint);
+extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
/* quota ops */
-extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
- fs_disk_quota_t *);
-extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
- fs_disk_quota_t *);
-extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
+extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
+extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
+ uint, struct fs_disk_quota *);
+extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
+ struct fs_disk_quota *);
+extern int xfs_qm_scall_getqstat(struct xfs_mount *,
+ struct fs_quota_stat *);
+extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
+extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index cf9a34051e07..a08801ae24e2 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -117,11 +117,11 @@ xfs_qm_scall_quotaoff(
}
if (flags & XFS_GQUOTA_ACCT) {
dqtype |= XFS_QMOPT_GQUOTA;
- flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+ flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
inactivate_flags |= XFS_GQUOTA_ACTIVE;
} else if (flags & XFS_PQUOTA_ACCT) {
dqtype |= XFS_QMOPT_PQUOTA;
- flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+ flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD);
inactivate_flags |= XFS_PQUOTA_ACTIVE;
}
@@ -335,14 +335,14 @@ xfs_qm_scall_quotaon(
* quota acct on ondisk without m_qflags' knowing.
*/
if (((flags & XFS_UQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
- (flags & XFS_UQUOTA_ENFD))
- ||
+ (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+ (flags & XFS_UQUOTA_ENFD)) ||
+ ((flags & XFS_GQUOTA_ACCT) == 0 &&
+ (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+ (flags & XFS_GQUOTA_ENFD)) ||
((flags & XFS_PQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
- (flags & XFS_GQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
- (flags & XFS_OQUOTA_ENFD))) {
+ (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
+ (flags & XFS_PQUOTA_ENFD))) {
xfs_debug(mp,
"%s: Can't enforce without acct, flags=%x sbflags=%x\n",
__func__, flags, mp->m_sb.sb_qflags);
@@ -407,11 +407,11 @@ xfs_qm_scall_getqstat(
struct fs_quota_stat *out)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_inode *uip, *gip;
- bool tempuqip, tempgqip;
+ struct xfs_inode *uip = NULL;
+ struct xfs_inode *gip = NULL;
+ bool tempuqip = false;
+ bool tempgqip = false;
- uip = gip = NULL;
- tempuqip = tempgqip = false;
memset(out, 0, sizeof(fs_quota_stat_t));
out->qs_version = FS_QSTAT_VERSION;
@@ -472,15 +472,15 @@ xfs_qm_scall_getqstat(
*/
int
xfs_qm_scall_setqlim(
- xfs_mount_t *mp,
+ struct xfs_mount *mp,
xfs_dqid_t id,
uint type,
fs_disk_quota_t *newlim)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
- xfs_disk_dquot_t *ddq;
- xfs_dquot_t *dqp;
- xfs_trans_t *tp;
+ struct xfs_disk_dquot *ddq;
+ struct xfs_dquot *dqp;
+ struct xfs_trans *tp;
int error;
xfs_qcnt_t hard, soft;
@@ -489,31 +489,36 @@ xfs_qm_scall_setqlim(
if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
return 0;
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
- error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp),
- 0, 0, XFS_DEFAULT_LOG_COUNT);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return (error);
- }
-
/*
* We don't want to race with a quotaoff so take the quotaoff lock.
- * (We don't hold an inode lock, so there's nothing else to stop
- * a quotaoff from happening). (XXXThis doesn't currently happen
- * because we take the vfslock before calling xfs_qm_sysent).
+ * We don't hold an inode lock, so there's nothing else to stop
+ * a quotaoff from happening.
*/
mutex_lock(&q->qi_quotaofflock);
/*
- * Get the dquot (locked), and join it to the transaction.
- * Allocate the dquot if this doesn't exist.
+ * Get the dquot (locked) before we start, as we need to do a
+ * transaction to allocate it if it doesn't exist. Once we have the
+ * dquot, unlock it so we can start the next transaction safely. We hold
+ * a reference to the dquot, so it's safe to do this unlock/lock without
+ * it being reclaimed in the mean time.
*/
- if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
+ if (error) {
ASSERT(error != ENOENT);
goto out_unlock;
}
+ xfs_dqunlock(dqp);
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
+ error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp),
+ 0, 0, XFS_DEFAULT_LOG_COUNT);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out_rele;
+ }
+
+ xfs_dqlock(dqp);
xfs_trans_dqjoin(tp, dqp);
ddq = &dqp->q_core;
@@ -529,6 +534,7 @@ xfs_qm_scall_setqlim(
if (hard == 0 || hard >= soft) {
ddq->d_blk_hardlimit = cpu_to_be64(hard);
ddq->d_blk_softlimit = cpu_to_be64(soft);
+ xfs_dquot_set_prealloc_limits(dqp);
if (id == 0) {
q->qi_bhardlimit = hard;
q->qi_bsoftlimit = soft;
@@ -620,9 +626,10 @@ xfs_qm_scall_setqlim(
xfs_trans_log_dquot(tp, dqp);
error = xfs_trans_commit(tp, 0);
- xfs_qm_dqrele(dqp);
- out_unlock:
+out_rele:
+ xfs_qm_dqrele(dqp);
+out_unlock:
mutex_unlock(&q->qi_quotaofflock);
return error;
}
@@ -769,9 +776,12 @@ xfs_qm_scall_getquota(
* gets turned off. No need to confuse the user level code,
* so return zeroes in that case.
*/
- if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) ||
- (!XFS_IS_OQUOTA_ENFORCED(mp) &&
- (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
+ if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
+ dqp->q_core.d_flags == XFS_DQ_USER) ||
+ (!XFS_IS_GQUOTA_ENFORCED(mp) &&
+ dqp->q_core.d_flags == XFS_DQ_GROUP) ||
+ (!XFS_IS_PQUOTA_ENFORCED(mp) &&
+ dqp->q_core.d_flags == XFS_DQ_PROJ)) {
dst->d_btimer = 0;
dst->d_itimer = 0;
dst->d_rtbtimer = 0;
@@ -779,8 +789,8 @@ xfs_qm_scall_getquota(
#ifdef DEBUG
if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
- (XFS_IS_OQUOTA_ENFORCED(mp) &&
- (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
+ (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) ||
+ (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) &&
dst->d_id != 0) {
if ((dst->d_bcount > dst->d_blk_softlimit) &&
(dst->d_blk_softlimit > 0)) {
@@ -826,16 +836,16 @@ xfs_qm_export_flags(
uflags = 0;
if (flags & XFS_UQUOTA_ACCT)
uflags |= FS_QUOTA_UDQ_ACCT;
- if (flags & XFS_PQUOTA_ACCT)
- uflags |= FS_QUOTA_PDQ_ACCT;
if (flags & XFS_GQUOTA_ACCT)
uflags |= FS_QUOTA_GDQ_ACCT;
+ if (flags & XFS_PQUOTA_ACCT)
+ uflags |= FS_QUOTA_PDQ_ACCT;
if (flags & XFS_UQUOTA_ENFD)
uflags |= FS_QUOTA_UDQ_ENFD;
- if (flags & (XFS_OQUOTA_ENFD)) {
- uflags |= (flags & XFS_GQUOTA_ACCT) ?
- FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
- }
+ if (flags & XFS_GQUOTA_ENFD)
+ uflags |= FS_QUOTA_GDQ_ENFD;
+ if (flags & XFS_PQUOTA_ENFD)
+ uflags |= FS_QUOTA_PDQ_ENFD;
return (uflags);
}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index b50ec5b95d5a..c3483bab9cde 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -77,9 +77,18 @@ typedef struct xfs_disk_dquot {
*/
typedef struct xfs_dqblk {
xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
- char dd_fill[32]; /* filling for posterity */
+ char dd_fill[4]; /* filling for posterity */
+
+ /*
+ * These two are only present on filesystems with the CRC bits set.
+ */
+ __be32 dd_crc; /* checksum */
+ __be64 dd_lsn; /* last modification in log */
+ uuid_t dd_uuid; /* location information */
} xfs_dqblk_t;
+#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
+
/*
* flags for q_flags field in the dquot.
*/
@@ -152,30 +161,42 @@ typedef struct xfs_qoff_logformat {
#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
/*
+ * Conversion to and from the combined OQUOTA flag (if necessary)
+ * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
+ */
+#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
+#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
+#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
+#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
+
+/*
* Quota Accounting/Enforcement flags
*/
#define XFS_ALL_QUOTA_ACCT \
(XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
-#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD)
-#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD)
+#define XFS_ALL_QUOTA_ENFD \
+ (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD \
+ (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
-#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
+#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
/*
* Incore only flags for quotaoff - these bits get cleared when quota(s)
* are in the process of getting turned off. These flags are in m_qflags but
* never in sb_qflags.
*/
-#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */
-#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */
-#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */
+#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
+#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
+#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
#define XFS_ALL_QUOTA_ACTIVE \
- (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
+ (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
/*
* Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
@@ -259,24 +280,23 @@ typedef struct xfs_qoff_logformat {
((XFS_IS_UQUOTA_ON(mp) && \
(mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
(XFS_IS_GQUOTA_ON(mp) && \
- ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
- (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \
+ (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \
(XFS_IS_PQUOTA_ON(mp) && \
- ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
- (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT))))
+ (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
- XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
- XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
+ XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+ XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD)
#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
- XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
- XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
+ XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
+ XFS_PQUOTA_ENFD|XFS_PQUOTA_CHKD)
#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
- XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
- XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\
- XFS_GQUOTA_ACCT)
+ XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+ XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
+ XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
+ XFS_PQUOTA_CHKD)
/*
@@ -380,5 +400,7 @@ extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
xfs_dqid_t, uint, uint, char *);
extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+
#endif /* __KERNEL__ */
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 71926d630527..20e30f93b0c7 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -75,8 +75,10 @@ xfs_fs_set_xstate(
flags |= XFS_GQUOTA_ACCT;
if (uflags & FS_QUOTA_UDQ_ENFD)
flags |= XFS_UQUOTA_ENFD;
- if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
- flags |= XFS_OQUOTA_ENFD;
+ if (uflags & FS_QUOTA_GDQ_ENFD)
+ flags |= XFS_GQUOTA_ENFD;
+ if (uflags & FS_QUOTA_PDQ_ENFD)
+ flags |= XFS_PQUOTA_ENFD;
switch (op) {
case Q_XQUOTAON:
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index a05b45175fb0..78f9e70b80c7 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -32,6 +32,7 @@ struct xfs_mount;
#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */
#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */
#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */
+#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */
#define XFS_SB_VERSION_NUMBITS 0x000f
#define XFS_SB_VERSION_ALLFBITS 0xfff0
#define XFS_SB_VERSION_SASHFBITS 0xf000
@@ -161,6 +162,20 @@ typedef struct xfs_sb {
*/
__uint32_t sb_bad_features2;
+ /* version 5 superblock fields start here */
+
+ /* feature masks */
+ __uint32_t sb_features_compat;
+ __uint32_t sb_features_ro_compat;
+ __uint32_t sb_features_incompat;
+ __uint32_t sb_features_log_incompat;
+
+ __uint32_t sb_crc; /* superblock crc */
+ __uint32_t sb_pad;
+
+ xfs_ino_t sb_pquotino; /* project quota inode */
+ xfs_lsn_t sb_lsn; /* last write sequence */
+
/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -229,7 +244,21 @@ typedef struct xfs_dsb {
* for features2 bits. Easiest just to mark it bad and not use
* it for anything else.
*/
- __be32 sb_bad_features2;
+ __be32 sb_bad_features2;
+
+ /* version 5 superblock fields start here */
+
+ /* feature masks */
+ __be32 sb_features_compat;
+ __be32 sb_features_ro_compat;
+ __be32 sb_features_incompat;
+ __be32 sb_features_log_incompat;
+
+ __le32 sb_crc; /* superblock crc */
+ __be32 sb_pad;
+
+ __be64 sb_pquotino; /* project quota inode */
+ __be64 sb_lsn; /* last write sequence */
/* must be padded to 64 bit alignment */
} xfs_dsb_t;
@@ -250,7 +279,10 @@ typedef enum {
XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
- XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2,
+ XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
+ XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
+ XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
+ XFS_SBS_PQUOTINO, XFS_SBS_LSN,
XFS_SBS_FIELDCOUNT
} xfs_sb_field_t;
@@ -276,6 +308,12 @@ typedef enum {
#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2)
#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2)
+#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
+#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
+#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
+#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
+#define XFS_SB_CRC XFS_SB_MVAL(CRC)
+#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO)
#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
#define XFS_SB_MOD_BITS \
@@ -283,7 +321,9 @@ typedef enum {
XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
- XFS_SB_BAD_FEATURES2)
+ XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
+ XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
+ XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
/*
@@ -325,6 +365,8 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp)
return 1;
}
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+ return 1;
return 0;
}
@@ -365,7 +407,7 @@ static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
{
return sbp->sb_versionnum == XFS_SB_VERSION_2 ||
sbp->sb_versionnum == XFS_SB_VERSION_3 ||
- (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
(sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
}
@@ -373,7 +415,7 @@ static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
{
if (sbp->sb_versionnum == XFS_SB_VERSION_1)
sbp->sb_versionnum = XFS_SB_VERSION_2;
- else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+ else if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4)
sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
else
sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
@@ -382,7 +424,7 @@ static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
{
return sbp->sb_versionnum == XFS_SB_VERSION_3 ||
- (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
(sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
}
@@ -396,13 +438,13 @@ static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
(sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
}
static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
{
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+ if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4)
sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
else
sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) |
@@ -411,13 +453,14 @@ static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT);
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
}
static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
(sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
}
@@ -429,38 +472,42 @@ static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp)
static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT));
}
static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT));
}
static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT));
}
static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
(sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
}
static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
(sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
}
static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT));
}
/*
@@ -475,14 +522,16 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
{
- return xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT);
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
}
static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
{
- return xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT);
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
}
static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
@@ -500,14 +549,79 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
{
- return xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
}
static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
{
- return (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT));
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+
+/*
+ * Extended v5 superblock feature masks. These are to be used for new v5
+ * superblock features only.
+ *
+ * Compat features are new features that old kernels will not notice or affect
+ * and so can mount read-write without issues.
+ *
+ * RO-Compat (read only) are features that old kernels can read but will break
+ * if they write. Hence only read-only mounts of such filesystems are allowed on
+ * kernels that don't support the feature bit.
+ *
+ * InCompat features are features which old kernels will not understand and so
+ * must not mount.
+ *
+ * Log-InCompat features are for changes to log formats or new transactions that
+ * can't be replayed on older kernels. The fields are set when the filesystem is
+ * mounted, and a clean unmount clears the fields.
+ */
+#define XFS_SB_FEAT_COMPAT_ALL 0
+#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL
+/* Report whether any bit of @feature is set in the compat feature mask. */
+static inline bool
+xfs_sb_has_compat_feature(
+	struct xfs_sb	*sbp,
+	__uint32_t	feature)
+{
+	__uint32_t	present = sbp->sb_features_compat & feature;
+
+	return present != 0;
+}
+
+#define XFS_SB_FEAT_RO_COMPAT_ALL 0
+#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
+/* Report whether any bit of @feature is set in the ro-compat feature mask. */
+static inline bool
+xfs_sb_has_ro_compat_feature(
+	struct xfs_sb	*sbp,
+	__uint32_t	feature)
+{
+	__uint32_t	present = sbp->sb_features_ro_compat & feature;
+
+	return present != 0;
+}
+
+#define XFS_SB_FEAT_INCOMPAT_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
+/* Report whether any bit of @feature is set in the incompat feature mask. */
+static inline bool
+xfs_sb_has_incompat_feature(
+	struct xfs_sb	*sbp,
+	__uint32_t	feature)
+{
+	__uint32_t	present = sbp->sb_features_incompat & feature;
+
+	return present != 0;
+}
+
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
+/* Report whether any bit of @feature is set in the log-incompat feature mask. */
+static inline bool
+xfs_sb_has_incompat_log_feature(
+	struct xfs_sb	*sbp,
+	__uint32_t	feature)
+{
+	__uint32_t	present = sbp->sb_features_log_incompat & feature;
+
+	return present != 0;
+}
+
+/*
+ * True when @ino is the user quota inode or the group quota inode recorded
+ * in the superblock.
+ */
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+	if (ino == sbp->sb_uquotino)
+		return true;
+	return ino == sbp->sb_gquotino;
}
/*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ea341cea68cb..1d68ffcdeaa7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -51,6 +51,7 @@
#include "xfs_inode_item.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
+#include "xfs_icreate_item.h"
#include <linux/namei.h>
#include <linux/init.h>
@@ -359,17 +360,17 @@ xfs_parseargs(
} else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
!strcmp(this_char, MNTOPT_PRJQUOTA)) {
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
- XFS_OQUOTA_ENFD);
+ XFS_PQUOTA_ENFD);
} else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+ mp->m_qflags &= ~XFS_PQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
!strcmp(this_char, MNTOPT_GRPQUOTA)) {
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
- XFS_OQUOTA_ENFD);
+ XFS_GQUOTA_ENFD);
} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+ mp->m_qflags &= ~XFS_GQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
xfs_warn(mp,
"delaylog is the default now, option is deprecated.");
@@ -439,20 +440,15 @@ xfs_parseargs(
}
done:
- if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
+ if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) {
/*
* At this point the superblock has not been read
* in, therefore we do not know the block size.
* Before the mount call ends we will convert
* these to FSBs.
*/
- if (dsunit) {
- mp->m_dalign = dsunit;
- mp->m_flags |= XFS_MOUNT_RETERR;
- }
-
- if (dswidth)
- mp->m_swidth = dswidth;
+ mp->m_dalign = dsunit;
+ mp->m_swidth = dswidth;
}
if (mp->m_logbufs != -1 &&
@@ -563,12 +559,12 @@ xfs_showargs(
/* Either project or group quotas can be active, not both */
if (mp->m_qflags & XFS_PQUOTA_ACCT) {
- if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ if (mp->m_qflags & XFS_PQUOTA_ENFD)
seq_puts(m, "," MNTOPT_PRJQUOTA);
else
seq_puts(m, "," MNTOPT_PQUOTANOENF);
} else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
- if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ if (mp->m_qflags & XFS_GQUOTA_ENFD)
seq_puts(m, "," MNTOPT_GRPQUOTA);
else
seq_puts(m, "," MNTOPT_GQUOTANOENF);
@@ -1136,8 +1132,8 @@ xfs_fs_statfs(
spin_unlock(&mp->m_sb_lock);
if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
- (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
+ ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
+ (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
xfs_qm_statvfs(ip, statp);
return 0;
}
@@ -1373,6 +1369,17 @@ xfs_finish_flags(
}
/*
+ * V5 filesystems always use attr2 format for attributes.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ (mp->m_flags & XFS_MOUNT_NOATTR2)) {
+ xfs_warn(mp,
+"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
+ MNTOPT_NOATTR2, MNTOPT_ATTR2);
+ return XFS_ERROR(EINVAL);
+ }
+
+ /*
* mkfs'ed attr2 will turn on attr2 mount unless explicitly
* told by noattr2 to turn it off
*/
@@ -1470,6 +1477,10 @@ xfs_fs_fill_super(
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
+ /* version 5 superblocks support inode version counters. */
+ if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+ sb->s_flags |= MS_I_VERSION;
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1644,9 +1655,15 @@ xfs_init_zones(void)
KM_ZONE_SPREAD, NULL);
if (!xfs_ili_zone)
goto out_destroy_inode_zone;
+ xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
+ "xfs_icr");
+ if (!xfs_icreate_zone)
+ goto out_destroy_ili_zone;
return 0;
+ out_destroy_ili_zone:
+ kmem_zone_destroy(xfs_ili_zone);
out_destroy_inode_zone:
kmem_zone_destroy(xfs_inode_zone);
out_destroy_efi_zone:
@@ -1685,6 +1702,7 @@ xfs_destroy_zones(void)
* destroy caches.
*/
rcu_barrier();
+ kmem_zone_destroy(xfs_icreate_zone);
kmem_zone_destroy(xfs_ili_zone);
kmem_zone_destroy(xfs_inode_zone);
kmem_zone_destroy(xfs_efi_zone);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
new file mode 100644
index 000000000000..e830fb56e27f
--- /dev/null
+++ b/fs/xfs/xfs_symlink.c
@@ -0,0 +1,767 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_itable.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_utils.h"
+#include "xfs_trans_space.h"
+#include "xfs_log_priv.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+
+
+/*
+ * Number of filesystem blocks needed to store a symlink target of @pathlen
+ * bytes. The usable payload of a block is what XFS_SYMLINK_BUF_SPACE()
+ * leaves of one filesystem block, so this is a round-up division by that
+ * payload size rather than a plain byte-to-FSB conversion.
+ */
+int
+xfs_symlink_blocks(
+	struct xfs_mount	*mp,
+	int			pathlen)
+{
+	int	payload = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+	int	rounded = pathlen + payload - 1;
+
+	return rounded / payload;
+}
+
+/*
+ * Stamp a remote symlink header at the start of @bp.
+ *
+ * Without the CRC superblock feature nothing is written and 0 is returned.
+ * Otherwise the header records the magic number, the offset of this chunk
+ * within the path (@offset), the number of path bytes in this buffer
+ * (@size), the filesystem UUID, the owning inode number and the buffer's
+ * block number, and the symlink buffer ops are attached to @bp.
+ *
+ * Returns the header size in bytes, i.e. how far into the buffer the caller
+ * must skip before writing path data.
+ */
+static int
+xfs_symlink_hdr_set(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dsymlink_hdr	*hdr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return 0;
+
+	bp->b_ops = &xfs_symlink_buf_ops;
+
+	hdr = bp->b_addr;
+	hdr->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
+	hdr->sl_offset = cpu_to_be32(offset);
+	hdr->sl_bytes = cpu_to_be32(size);
+	hdr->sl_owner = cpu_to_be64(ino);
+	hdr->sl_blkno = cpu_to_be64(bp->b_bn);
+	uuid_copy(&hdr->sl_uuid, &mp->m_sb.sb_uuid);
+
+	return sizeof(struct xfs_dsymlink_hdr);
+}
+
+/*
+ * Checking of the symlink header is split into two parts. The verifier does
+ * CRC, location and bounds checking; this unpacking-time check compares the
+ * header's path offset, byte count and owner against the values the reader
+ * expects for this chunk.
+ *
+ * Returns true only when all three fields match.
+ */
+bool
+xfs_symlink_hdr_ok(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dsymlink_hdr	*hdr = bp->b_addr;
+
+	return offset == be32_to_cpu(hdr->sl_offset) &&
+	       size == be32_to_cpu(hdr->sl_bytes) &&
+	       ino == be64_to_cpu(hdr->sl_owner);
+}
+
+/*
+ * Structural check of a remote symlink block header, shared by the read and
+ * write verifiers. Fails when the filesystem has no CRC feature, the magic,
+ * UUID or recorded block number do not match this buffer, the recorded
+ * offset plus length reaches MAXPATHLEN, or the owner field is zero.
+ */
+static bool
+xfs_symlink_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dsymlink_hdr	*hdr = bp->b_addr;
+	uint32_t		end;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb) ||
+	    hdr->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC) ||
+	    !uuid_equal(&hdr->sl_uuid, &mp->m_sb.sb_uuid) ||
+	    bp->b_bn != be64_to_cpu(hdr->sl_blkno))
+		return false;
+
+	/* end of this chunk within the path, from the on-disk fields */
+	end = be32_to_cpu(hdr->sl_offset) + be32_to_cpu(hdr->sl_bytes);
+
+	return end < MAXPATHLEN && hdr->sl_owner != 0;
+}
+
+/*
+ * Read-side buffer verifier: on CRC filesystems check the block checksum
+ * and then the header; on failure report corruption and mark the buffer
+ * with EFSCORRUPTED.
+ */
+static void
+xfs_symlink_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+			     offsetof(struct xfs_dsymlink_hdr, sl_crc)) &&
+	    xfs_symlink_verify(bp))
+		return;
+
+	XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+	xfs_buf_ioerror(bp, EFSCORRUPTED);
+}
+
+/*
+ * Write-side buffer verifier: on CRC filesystems validate the header, then
+ * record the log item's LSN in the header (when the buffer has a log item
+ * attached) and recompute the checksum. An invalid header is reported as
+ * corruption and the buffer is marked with EFSCORRUPTED.
+ */
+static void
+xfs_symlink_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dsymlink_hdr	*hdr = bp->b_addr;
+	struct xfs_buf_log_item	*log_item = bp->b_fspriv;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (xfs_symlink_verify(bp)) {
+		if (log_item)
+			hdr->sl_lsn = cpu_to_be64(log_item->bli_item.li_lsn);
+		xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+				 offsetof(struct xfs_dsymlink_hdr, sl_crc));
+		return;
+	}
+
+	XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+	xfs_buf_ioerror(bp, EFSCORRUPTED);
+}
+
+/*
+ * Buffer operations for remote symlink blocks: pairs the read and write
+ * verifiers defined in this file.
+ */
+const struct xfs_buf_ops xfs_symlink_buf_ops = {
+ .verify_read = xfs_symlink_read_verify,
+ .verify_write = xfs_symlink_write_verify,
+};
+
+/*
+ * Copy an inline symlink target from the inode fork @ifp into the block
+ * buffer @bp.
+ *
+ * On non-CRC filesystems the buffer gets no ops and the path is copied to
+ * the start of the buffer. On CRC filesystems the symlink buffer ops are
+ * attached, a header covering the whole path (offset 0, ifp->if_bytes long)
+ * is written first, and the path follows it.
+ */
+void
+xfs_symlink_local_to_remote(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	struct xfs_inode	*ip,
+	struct xfs_ifork	*ifp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	char			*dst = bp->b_addr;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		/*
+		 * As this symlink fits in an inode literal area, it must
+		 * also fit in the smallest buffer the filesystem supports.
+		 */
+		ASSERT(BBTOB(bp->b_length) >=
+				ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
+
+		bp->b_ops = &xfs_symlink_buf_ops;
+		dst += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
+	} else {
+		bp->b_ops = NULL;
+	}
+
+	memcpy(dst, ifp->if_u1.if_data, ifp->if_bytes);
+}
+
+/* ----- Kernel only functions below ----- */
+/*
+ * Read a remote symlink target into @link.
+ *
+ * Maps the blocks backing the symlink, reads each mapped extent through the
+ * symlink buffer verifier, and copies the path bytes of every extent into
+ * @link at the running offset. On CRC filesystems each buffer starts with a
+ * struct xfs_dsymlink_hdr; it is checked against the expected
+ * offset/length/owner and then skipped so that only path data is copied.
+ * @link is NUL-terminated at ip->i_d.di_size on success.
+ *
+ * Returns 0 on success, or an error from the mapping or buffer read, ENOMEM
+ * when no buffer could be obtained, or EFSCORRUPTED when a header does not
+ * match.
+ */
+STATIC int
+xfs_readlink_bmap(
+	struct xfs_inode	*ip,
+	char			*link)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	mval[XFS_SYMLINK_MAPS];
+	struct xfs_buf		*bp;
+	xfs_daddr_t		d;
+	char			*cur_chunk;
+	int			pathlen = ip->i_d.di_size;
+	int			nmaps = XFS_SYMLINK_MAPS;
+	int			byte_cnt;
+	int			n;
+	int			error = 0;
+	int			fsblocks;
+	int			offset;
+
+	fsblocks = xfs_symlink_blocks(mp, pathlen);
+	error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
+	if (error)
+		goto out;
+
+	offset = 0;
+	for (n = 0; n < nmaps; n++) {
+		d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+		byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+
+		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
+				  &xfs_symlink_buf_ops);
+		if (!bp) {
+			error = XFS_ERROR(ENOMEM);
+			goto out;
+		}
+		error = bp->b_error;
+		if (error) {
+			xfs_buf_ioerror_alert(bp, __func__);
+			xfs_buf_relse(bp);
+			goto out;
+		}
+
+		/* path bytes held by this extent, capped at what is left */
+		byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+		if (pathlen < byte_cnt)
+			byte_cnt = pathlen;
+
+		cur_chunk = bp->b_addr;
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			if (!xfs_symlink_hdr_ok(mp, ip->i_ino, offset,
+							byte_cnt, bp)) {
+				error = EFSCORRUPTED;
+				xfs_alert(mp,
+"symlink header does not match required off/len/owner (0x%x/0x%x,0x%llx)",
+					offset, byte_cnt, ip->i_ino);
+				xfs_buf_relse(bp);
+				goto out;
+			}
+
+			/* path data starts after the per-block header */
+			cur_chunk += sizeof(struct xfs_dsymlink_hdr);
+		}
+
+		/*
+		 * Copy from cur_chunk, not bp->b_addr: on CRC filesystems the
+		 * start of the buffer is the header, which must not end up in
+		 * the link text.
+		 */
+		memcpy(link + offset, cur_chunk, byte_cnt);
+
+		pathlen -= byte_cnt;
+		offset += byte_cnt;
+
+		xfs_buf_relse(bp);
+	}
+	ASSERT(pathlen == 0);
+
+	link[ip->i_d.di_size] = '\0';
+	error = 0;
+
+ out:
+	return error;
+}
+
+/*
+ * Read the target of symlink @ip into @link.
+ *
+ * Returns EIO on a shut-down filesystem and EFSCORRUPTED when the recorded
+ * length is negative or exceeds MAXPATHLEN. A zero-length symlink returns 0
+ * without touching @link. Inline targets are copied straight out of the
+ * data fork and NUL-terminated; everything else goes through
+ * xfs_readlink_bmap(). The inode is held ILOCK_SHARED for the duration.
+ */
+int
+xfs_readlink(
+	struct xfs_inode	*ip,
+	char			*link)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fsize_t		pathlen;
+	int			error = 0;
+
+	trace_xfs_readlink(ip);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+	pathlen = ip->i_d.di_size;
+	if (pathlen < 0 || pathlen > MAXPATHLEN) {
+		xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
+			 __func__, (unsigned long long) ip->i_ino,
+			 (long long) pathlen);
+		ASSERT(0);
+		error = XFS_ERROR(EFSCORRUPTED);
+	} else if (pathlen != 0) {
+		if (ip->i_df.if_flags & XFS_IFINLINE) {
+			memcpy(link, ip->i_df.if_u1.if_data, pathlen);
+			link[pathlen] = '\0';
+		} else {
+			error = xfs_readlink_bmap(ip, link);
+		}
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return error;
+}
+
+/*
+ * Create a symlink named @link_name in directory @dp pointing at
+ * @target_path.
+ *
+ * The target is stored inline in the new inode's data fork when it fits,
+ * otherwise in freshly allocated blocks (each prefixed with a symlink
+ * header on CRC filesystems). On success *ipp is set to the new inode and 0
+ * is returned; on failure *ipp stays NULL and an error is returned.
+ *
+ * Errors visible here: EIO on a shut-down filesystem, ENAMETOOLONG when the
+ * target is MAXPATHLEN bytes or longer, EPERM when the directory has
+ * XFS_DIFLAG_NOSYMLINKS set, ENOMEM when a buffer cannot be obtained, plus
+ * whatever the quota, reservation, allocation, directory and commit calls
+ * return.
+ */
+int
+xfs_symlink(
+	struct xfs_inode	*dp,
+	struct xfs_name		*link_name,
+	const char		*target_path,
+	umode_t			mode,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_trans	*tp = NULL;
+	struct xfs_inode	*ip = NULL;
+	int			error = 0;
+	int			pathlen;
+	struct xfs_bmap_free	free_list;
+	xfs_fsblock_t		first_block;
+	bool			unlock_dp_on_error = false;
+	uint			cancel_flags;
+	int			committed;
+	xfs_fileoff_t		first_fsb;
+	xfs_filblks_t		fs_blocks;
+	int			nmaps;
+	struct xfs_bmbt_irec	mval[XFS_SYMLINK_MAPS];
+	xfs_daddr_t		d;
+	const char		*cur_chunk;
+	int			byte_cnt;
+	int			n;
+	xfs_buf_t		*bp;
+	prid_t			prid;
+	struct xfs_dquot	*udqp = NULL;
+	struct xfs_dquot	*gdqp = NULL;
+	uint			resblks;
+
+	*ipp = NULL;
+
+	trace_xfs_symlink(dp, link_name);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	/*
+	 * Check component lengths of the target path name.
+	 */
+	pathlen = strlen(target_path);
+	if (pathlen >= MAXPATHLEN)      /* total string too long */
+		return XFS_ERROR(ENAMETOOLONG);
+
+	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+		prid = xfs_get_projid(dp);
+	else
+		prid = XFS_PROJID_DEFAULT;
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.
+	 */
+	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
+	if (error)
+		goto std_return;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	/*
+	 * The symlink will fit into the inode data fork?
+	 * There can't be any attributes so we get the whole variable part.
+	 */
+	if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version))
+		fs_blocks = 0;
+	else
+		fs_blocks = xfs_symlink_blocks(mp, pathlen);
+	resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
+	error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+	if (error == ENOSPC && fs_blocks == 0) {
+		/* inline symlink: retry without a block reservation */
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+	}
+	if (error) {
+		cancel_flags = 0;
+		goto error_return;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+	unlock_dp_on_error = true;
+
+	/*
+	 * Check whether the directory allows new symlinks or not.
+	 */
+	if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
+		error = XFS_ERROR(EPERM);
+		goto error_return;
+	}
+
+	/*
+	 * Reserve disk quota : blocks and inode.
+	 */
+	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
+	if (error)
+		goto error_return;
+
+	/*
+	 * Check for ability to enter directory entry, if no space reserved.
+	 */
+	error = xfs_dir_canenter(tp, dp, link_name, resblks);
+	if (error)
+		goto error_return;
+	/*
+	 * Initialize the bmap freelist prior to calling either
+	 * bmapi or the directory create code.
+	 */
+	xfs_bmap_init(&free_list, &first_block);
+
+	/*
+	 * Allocate an inode for the symlink.
+	 */
+	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
+			       prid, resblks > 0, &ip, NULL);
+	if (error) {
+		if (error == ENOSPC)
+			goto error_return;
+		goto error1;
+	}
+
+	/*
+	 * An error after we've joined dp to the transaction will result in the
+	 * transaction cancel unlocking dp so don't do it explicitly in the
+	 * error path.
+	 */
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	unlock_dp_on_error = false;
+
+	/*
+	 * Also attach the dquot(s) to it, if applicable.
+	 */
+	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
+
+	if (resblks)
+		resblks -= XFS_IALLOC_SPACE_RES(mp);
+	/*
+	 * If the symlink will fit into the inode, write it inline.
+	 */
+	if (pathlen <= XFS_IFORK_DSIZE(ip)) {
+		xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
+		memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
+		ip->i_d.di_size = pathlen;
+
+		/*
+		 * The inode was initially created in extent format.
+		 */
+		ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+		ip->i_df.if_flags |= XFS_IFINLINE;
+
+		ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+
+	} else {
+		int	offset;
+
+		first_fsb = 0;
+		nmaps = XFS_SYMLINK_MAPS;
+
+		error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
+				  XFS_BMAPI_METADATA, &first_block, resblks,
+				  mval, &nmaps, &free_list);
+		if (error)
+			goto error2;
+
+		if (resblks)
+			resblks -= fs_blocks;
+		ip->i_d.di_size = pathlen;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+		cur_chunk = target_path;
+		offset = 0;
+		for (n = 0; n < nmaps; n++) {
+			char	*buf;
+
+			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+					       BTOBB(byte_cnt), 0);
+			if (!bp) {
+				error = ENOMEM;
+				goto error2;
+			}
+			bp->b_ops = &xfs_symlink_buf_ops;
+
+			/* path bytes this extent can hold, capped at the rest */
+			byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+			byte_cnt = min(byte_cnt, pathlen);
+
+			/* header first (CRC only), then the path chunk */
+			buf = bp->b_addr;
+			buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset,
+						   byte_cnt, bp);
+
+			memcpy(buf, cur_chunk, byte_cnt);
+
+			cur_chunk += byte_cnt;
+			pathlen -= byte_cnt;
+			offset += byte_cnt;
+
+			xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
+							(char *)bp->b_addr);
+		}
+		ASSERT(pathlen == 0);
+	}
+
+	/*
+	 * Create the directory entry for the symlink.
+	 */
+	error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
+					&first_block, &free_list, resblks);
+	if (error)
+		goto error2;
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * symlink transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	if (error) {
+		goto error2;
+	}
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+
+	/*
+	 * Do not report success when the commit failed. The transaction is
+	 * gone at this point, so the cancel-based error labels below must not
+	 * be used; just drop our reference on the new inode and pass the
+	 * error up. *ipp is still NULL.
+	 */
+	if (error) {
+		IRELE(ip);
+		return error;
+	}
+
+	*ipp = ip;
+	return 0;
+
+ error2:
+	IRELE(ip);
+ error1:
+	xfs_bmap_cancel(&free_list);
+	cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+	xfs_trans_cancel(tp, cancel_flags);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+
+	if (unlock_dp_on_error)
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ std_return:
+	return error;
+}
+
+/*
+ * Free a symlink that has blocks associated with it.
+ *
+ * On entry *tpp is the caller's transaction. The function zeroes the inode
+ * size, invalidates the buffers backing the symlink, unmaps the blocks,
+ * finishes/commits the freeing transactions, and on success stores in *tpp
+ * a new transaction that carries an itruncate log reservation and has the
+ * inode joined to it.
+ *
+ * Returns 0 on success or the first error encountered. On the error paths
+ * *tpp is not updated.
+ */
+STATIC int
+xfs_inactive_symlink_rmt(
+ xfs_inode_t *ip,
+ xfs_trans_t **tpp)
+{
+ xfs_buf_t *bp;
+ int committed;
+ int done;
+ int error;
+ xfs_fsblock_t first_block;
+ xfs_bmap_free_t free_list;
+ int i;
+ xfs_mount_t *mp;
+ xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS];
+ int nmaps;
+ xfs_trans_t *ntp;
+ int size;
+ xfs_trans_t *tp;
+
+ tp = *tpp;
+ mp = ip->i_mount;
+ ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
+ /*
+ * We're freeing a symlink that has some
+ * blocks allocated to it. Free the
+ * blocks here. We know that we've got
+ * either 1 or 2 extents and that we can
+ * free them all in one bunmapi call.
+ */
+ /*
+ * NOTE(review): xfs_symlink.h defines XFS_SYMLINK_MAPS as 3 (allowing for
+ * CRC headers), while this assert caps the extent count at 2 - confirm
+ * the bound is still right for CRC-enabled filesystems.
+ */
+ ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
+
+ /*
+ * Lock the inode, fix the size, and join it to the transaction.
+ * Hold it so in the normal path, we still have it locked for
+ * the second transaction. In the error paths we need it
+ * held so the cancel won't rele it, see below.
+ */
+ /* Remember the byte length before zeroing it; it sizes the lookups below. */
+ size = (int)ip->i_d.di_size;
+ ip->i_d.di_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ /*
+ * Find the block(s) so we can inval and unmap them.
+ */
+ done = 0;
+ xfs_bmap_init(&free_list, &first_block);
+ nmaps = ARRAY_SIZE(mval);
+ error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size),
+ mval, &nmaps, 0);
+ if (error)
+ goto error0;
+ /*
+ * Invalidate the block(s). No validation is done.
+ */
+ for (i = 0; i < nmaps; i++) {
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
+ XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
+ if (!bp) {
+ error = ENOMEM;
+ goto error1;
+ }
+ xfs_trans_binval(tp, bp);
+ }
+ /*
+ * Unmap the dead block(s) to the free_list.
+ */
+ /*
+ * NOTE(review): the length passed here is `size`, the byte length saved
+ * from di_size, whereas the xfs_bmapi_read() call above passes a block
+ * count from xfs_symlink_blocks() - confirm which unit xfs_bunmapi()
+ * expects.
+ */
+ if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+ &first_block, &free_list, &done)))
+ goto error1;
+ ASSERT(done);
+ /*
+ * Commit the first transaction. This logs the EFI and the inode.
+ */
+ if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
+ goto error1;
+ /*
+ * The transaction must have been committed, since there were
+ * actually extents freed by xfs_bunmapi. See xfs_bmap_finish.
+ * The new tp has the extent freeing and EFDs.
+ */
+ ASSERT(committed);
+ /*
+ * The first xact was committed, so add the inode to the new one.
+ * Mark it dirty so it will be logged and moved forward in the log as
+ * part of every commit.
+ */
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ /*
+ * Get a new, empty transaction to return to our caller.
+ */
+ ntp = xfs_trans_dup(tp);
+ /*
+ * Commit the transaction containing extent freeing and EFDs.
+ * If we get an error on the commit here or on the reserve below,
+ * we need to unlock the inode since the new transaction doesn't
+ * have the inode attached.
+ */
+ error = xfs_trans_commit(tp, 0);
+ /* From here on tp refers to the duplicated transaction. */
+ tp = ntp;
+ if (error) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ goto error0;
+ }
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(tp->t_ticket);
+
+ /*
+ * Remove the memory for extent descriptions (just bookkeeping).
+ */
+ if (ip->i_df.if_bytes)
+ xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
+ ASSERT(ip->i_df.if_bytes == 0);
+ /*
+ * Put an itruncate log reservation in the new transaction
+ * for our caller.
+ */
+ if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ goto error0;
+ }
+
+ xfs_trans_ijoin(tp, ip, 0);
+ *tpp = tp;
+ return 0;
+
+ /* error1: the bmap free list was initialised and must be cancelled. */
+ error1:
+ xfs_bmap_cancel(&free_list);
+ error0:
+ return error;
+}
+
+/*
+ * xfs_inactive_symlink - free a symlink
+ *
+ * The caller must hold the inode ILOCK_EXCL. Returns EIO on a shut-down
+ * filesystem and EFSCORRUPTED when the recorded length is negative or
+ * larger than MAXPATHLEN. Zero-length symlinks need no work. Inline
+ * symlinks only have their in-core data fork buffer released; remote ones
+ * are handed to xfs_inactive_symlink_rmt().
+ */
+int
+xfs_inactive_symlink(
+	struct xfs_inode	*ip,
+	struct xfs_trans	**tp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	int			pathlen;
+
+	trace_xfs_inactive_symlink(ip);
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	pathlen = (int)ip->i_d.di_size;
+
+	/* Zero length symlinks _can_ exist. */
+	if (pathlen == 0)
+		return 0;
+
+	if (pathlen < 0 || pathlen > MAXPATHLEN) {
+		xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
+			 __func__, (unsigned long long)ip->i_ino, pathlen);
+		ASSERT(0);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	/* remove the remote symlink */
+	if (!(ip->i_df.if_flags & XFS_IFINLINE))
+		return xfs_inactive_symlink_rmt(ip, tp);
+
+	/* inline symlink: only the in-core fork data needs releasing */
+	if (ip->i_df.if_bytes > 0)
+		xfs_idata_realloc(ip, -(ip->i_df.if_bytes), XFS_DATA_FORK);
+	ASSERT(ip->i_df.if_bytes == 0);
+	return 0;
+}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
new file mode 100644
index 000000000000..374394880c01
--- /dev/null
+++ b/fs/xfs/xfs_symlink.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2012 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_SYMLINK_H
+#define __XFS_SYMLINK_H 1
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_buf;
+struct xfs_ifork;
+struct xfs_name;
+
+#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */
+
+struct xfs_dsymlink_hdr {
+ __be32 sl_magic;
+ __be32 sl_offset;
+ __be32 sl_bytes;
+ __be32 sl_crc;
+ uuid_t sl_uuid;
+ __be64 sl_owner;
+ __be64 sl_blkno;
+ __be64 sl_lsn;
+};
+
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 3 extents back from
+ * bmapi when crc headers are taken into account.
+ */
+#define XFS_SYMLINK_MAPS 3
+
+#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \
+ ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+ sizeof(struct xfs_dsymlink_hdr) : 0))
+
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_inode *ip, struct xfs_ifork *ifp);
+
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+
+#ifdef __KERNEL__
+
+int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
+ const char *target_path, umode_t mode, struct xfs_inode **ipp);
+int xfs_readlink(struct xfs_inode *ip, char *link);
+int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
+
+#endif /* __KERNEL__ */
+#endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 2801b5ce6cdb..1743b9f8e23d 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -25,11 +25,11 @@ static struct ctl_table_header *xfs_table_header;
#ifdef CONFIG_PROC_FS
STATIC int
xfs_stats_clear_proc_handler(
- ctl_table *ctl,
- int write,
- void __user *buffer,
- size_t *lenp,
- loff_t *ppos)
+ struct ctl_table *ctl,
+ int write,
+ void __user *buffer,
+ size_t *lenp,
+ loff_t *ppos)
{
int c, ret, *valp = ctl->data;
__uint32_t vn_active;
@@ -55,11 +55,11 @@ xfs_stats_clear_proc_handler(
STATIC int
xfs_panic_mask_proc_handler(
- ctl_table *ctl,
- int write,
- void __user *buffer,
- size_t *lenp,
- loff_t *ppos)
+ struct ctl_table *ctl,
+ int write,
+ void __user *buffer,
+ size_t *lenp,
+ loff_t *ppos)
{
int ret, *valp = ctl->data;
@@ -74,7 +74,7 @@ xfs_panic_mask_proc_handler(
}
#endif /* CONFIG_PROC_FS */
-static ctl_table xfs_table[] = {
+static struct ctl_table xfs_table[] = {
{
.procname = "irix_sgid_inherit",
.data = &xfs_params.sgid_inherit.val,
@@ -227,7 +227,7 @@ static ctl_table xfs_table[] = {
{}
};
-static ctl_table xfs_dir_table[] = {
+static struct ctl_table xfs_dir_table[] = {
{
.procname = "xfs",
.mode = 0555,
@@ -236,7 +236,7 @@ static ctl_table xfs_dir_table[] = {
{}
};
-static ctl_table xfs_root_table[] = {
+static struct ctl_table xfs_root_table[] = {
{
.procname = "fs",
.mode = 0555,
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 624bedd81357..b6e3897c1d9f 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -22,7 +22,6 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
@@ -30,6 +29,7 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_mount.h"
+#include "xfs_da_btree.h"
#include "xfs_ialloc.h"
#include "xfs_itable.h"
#include "xfs_alloc.h"
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 16a812977eab..47910e638c18 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -486,9 +486,12 @@ DEFINE_EVENT(xfs_buf_item_class, name, \
TP_PROTO(struct xfs_buf_log_item *bip), \
TP_ARGS(bip))
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
@@ -508,6 +511,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
DECLARE_EVENT_CLASS(xfs_lock_class,
TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
@@ -571,6 +575,7 @@ DEFINE_INODE_EVENT(xfs_iget_miss);
DEFINE_INODE_EVENT(xfs_getattr);
DEFINE_INODE_EVENT(xfs_setattr);
DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_inactive_symlink);
DEFINE_INODE_EVENT(xfs_alloc_file_space);
DEFINE_INODE_EVENT(xfs_free_file_space);
DEFINE_INODE_EVENT(xfs_readdir);
@@ -619,6 +624,30 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
(char *)__entry->caller_ip)
)
+TRACE_EVENT(xfs_iomap_prealloc_size,
+ TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift,
+ unsigned int writeio_blocks),
+ TP_ARGS(ip, blocks, shift, writeio_blocks),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsblock_t, blocks)
+ __field(int, shift)
+ __field(unsigned int, writeio_blocks)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->blocks = blocks;
+ __entry->shift = shift;
+ __entry->writeio_blocks = writeio_blocks;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d "
+ "m_writeio_blocks %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino,
+ __entry->blocks, __entry->shift, __entry->writeio_blocks)
+)
+
#define DEFINE_IREF_EVENT(name) \
DEFINE_EVENT(xfs_iref_class, name, \
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
@@ -950,14 +979,16 @@ DEFINE_RW_EVENT(xfs_file_splice_read);
DEFINE_RW_EVENT(xfs_file_splice_write);
DECLARE_EVENT_CLASS(xfs_page_class,
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
- TP_ARGS(inode, page, off),
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
+ unsigned int len),
+ TP_ARGS(inode, page, off, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(pgoff_t, pgoff)
__field(loff_t, size)
__field(unsigned long, offset)
+ __field(unsigned int, length)
__field(int, delalloc)
__field(int, unwritten)
),
@@ -971,24 +1002,27 @@ DECLARE_EVENT_CLASS(xfs_page_class,
__entry->pgoff = page_offset(page);
__entry->size = i_size_read(inode);
__entry->offset = off;
+ __entry->length = len;
__entry->delalloc = delalloc;
__entry->unwritten = unwritten;
),
TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
- "delalloc %d unwritten %d",
+ "length %x delalloc %d unwritten %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pgoff,
__entry->size,
__entry->offset,
+ __entry->length,
__entry->delalloc,
__entry->unwritten)
)
#define DEFINE_PAGE_EVENT(name) \
DEFINE_EVENT(xfs_page_class, name, \
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
- TP_ARGS(inode, page, off))
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
+ unsigned int len), \
+ TP_ARGS(inode, page, off, len))
DEFINE_PAGE_EVENT(xfs_writepage);
DEFINE_PAGE_EVENT(xfs_releasepage);
DEFINE_PAGE_EVENT(xfs_invalidatepage);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 2fd7c1ff1d21..35a229981354 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -234,71 +234,93 @@ xfs_calc_remove_reservation(
}
/*
- * For symlink we can modify:
+ * For create, break it into the two cases that the transaction
+ * covers. We start with the modify case - allocation done by modification
+ * of the state of existing inodes - followed by the allocation case.
+ */
+
+/*
+ * For create we can modify:
* the parent directory inode: inode size
* the new inode: inode size
- * the inode btree entry: 1 block
+ * the inode btree entry: block size
+ * the superblock for the nlink flag: sector size
* the directory btree: (max depth + v2) * dir block size
* the directory inode's bmap btree: (max depth + v2) * block size
- * the blocks for the symlink: 1 kB
- * Or in the first xact we allocate some inodes giving:
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ (uint)XFS_FSB_TO_B(mp, 1) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * For create we can allocate some inodes giving:
* the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ * the superblock for the nlink flag: sector size
* the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
* the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
*/
STATIC uint
-xfs_calc_symlink_reservation(
+xfs_calc_create_resv_alloc(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ mp->m_sb.sb_sectsize +
+ xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+__xfs_calc_create_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
- xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(1, 1024)),
- (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(mp->m_in_maxlevels,
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
- XFS_FSB_TO_B(mp, 1))));
+ MAX(xfs_calc_create_resv_alloc(mp),
+ xfs_calc_create_resv_modify(mp));
}
/*
- * For create we can modify:
- * the parent directory inode: inode size
- * the new inode: inode size
- * the inode btree entry: block size
- * the superblock for the nlink flag: sector size
- * the directory btree: (max depth + v2) * dir block size
- * the directory inode's bmap btree: (max depth + v2) * block size
- * Or in the first xact we allocate some inodes giving:
+ * For icreate we can allocate some inodes giving:
* the agi and agf of the ag getting the new inodes: 2 * sectorsize
* the superblock for the nlink flag: sector size
- * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
* the inode btree: max depth * blocksize
* the allocation btrees: 2 trees * (max depth - 1) * block size
*/
STATIC uint
-xfs_calc_create_reservation(
+xfs_calc_icreate_resv_alloc(
struct xfs_mount *mp)
{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ mp->m_sb.sb_sectsize +
+ xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
+{
return XFS_DQUOT_LOGRES(mp) +
- MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
- xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
- (uint)XFS_FSB_TO_B(mp, 1) +
- xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
- XFS_FSB_TO_B(mp, 1))),
- (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
- mp->m_sb.sb_sectsize +
- xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(mp->m_in_maxlevels,
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
- XFS_FSB_TO_B(mp, 1))));
+ MAX(xfs_calc_icreate_resv_alloc(mp),
+ xfs_calc_create_resv_modify(mp));
+}
+
+STATIC uint
+xfs_calc_create_reservation(
+ struct xfs_mount *mp)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return xfs_calc_icreate_reservation(mp);
+ return __xfs_calc_create_reservation(mp);
+
}
/*
@@ -311,6 +333,20 @@ xfs_calc_mkdir_reservation(
return xfs_calc_create_reservation(mp);
}
+
+/*
+ * Making a new symlink is the same as creating a new file, but
+ * with the added blocks for remote symlink data which can be up to 1kB in
+ * length (MAXPATHLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_create_reservation(mp) +
+ xfs_calc_buf_res(1, MAXPATHLEN);
+}
+
/*
* In freeing an inode we can modify:
* the inode being freed: inode size
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index cd29f6171021..2b4946393e30 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -48,6 +48,7 @@ typedef struct xfs_trans_header {
#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
#define XFS_LI_DQUOT 0x123d
#define XFS_LI_QUOTAOFF 0x123e
+#define XFS_LI_ICREATE 0x123f
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -107,7 +108,8 @@ typedef struct xfs_trans_header {
#define XFS_TRANS_SWAPEXT 40
#define XFS_TRANS_SB_COUNT 41
#define XFS_TRANS_CHECKPOINT 42
-#define XFS_TRANS_TYPE_MAX 42
+#define XFS_TRANS_ICREATE 43
+#define XFS_TRANS_TYPE_MAX 43
/* new transaction types need to be reflected in xfs_logprint(8) */
#define XFS_TRANS_TYPES \
@@ -210,23 +212,18 @@ struct xfs_log_item_desc {
/*
* Per-extent log reservation for the allocation btree changes
* involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1) * block size
+ * 2 trees * (2 blocks/level * max depth - 1)
*/
-#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
- ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
/*
* Per-directory log reservation for any directory change.
- * dir blocks: (1 btree block per level + data block + free block) * dblock size
- * bmap btree: (levels + 2) * max depth * block size
+ * dir blocks: (1 btree block per level + data block + free block)
+ * bmap btree: (levels + 2) * max depth
* v2 directory blocks can be fragmented below the dirblksize down to the fsb
* size, so account for that in the DAENTER macros.
*/
-#define XFS_DIROP_LOG_RES(mp) \
- (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
- (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
#define XFS_DIROP_LOG_COUNT(mp) \
(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
@@ -405,7 +402,7 @@ typedef struct xfs_trans {
int64_t t_res_fdblocks_delta; /* on-disk only chg */
int64_t t_frextents_delta;/* superblock freextents chg*/
int64_t t_res_frextents_delta; /* on-disk only chg */
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
int64_t t_ag_freeblks_delta; /* debugging counter */
int64_t t_ag_flist_delta; /* debugging counter */
int64_t t_ag_btree_delta; /* debugging counter */
@@ -433,7 +430,7 @@ typedef struct xfs_trans {
#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res)
#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d)
#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d)
#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d)
@@ -503,6 +500,7 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 3edf5dbee001..aa5a04b844d6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -397,7 +397,6 @@ shutdown_abort:
return XFS_ERROR(EIO);
}
-
/*
* Release the buffer bp which was previously acquired with one of the
* xfs_trans_... buffer allocation routines if the buffer has not
@@ -603,8 +602,14 @@ xfs_trans_log_buf(xfs_trans_t *tp,
tp->t_flags |= XFS_TRANS_DIRTY;
bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
- bip->bli_flags |= XFS_BLI_LOGGED;
- xfs_buf_item_log(bip, first, last);
+
+ /*
+ * If we have an ordered buffer we are not logging any dirty range, but
+ * it still needs to be marked as dirty and as having been logged.
+ */
+ bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+ if (!(bip->bli_flags & XFS_BLI_ORDERED))
+ xfs_buf_item_log(bip, first, last);
}
@@ -659,6 +664,7 @@ xfs_trans_binval(
ASSERT(XFS_BUF_ISSTALE(bp));
ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
+ ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK));
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
@@ -671,6 +677,7 @@ xfs_trans_binval(
bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
bip->__bli_format.blf_flags |= XFS_BLF_CANCEL;
+ bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK;
for (i = 0; i < bip->bli_format_count; i++) {
memset(bip->bli_formats[i].blf_data_map, 0,
(bip->bli_formats[i].blf_map_size * sizeof(uint)));
@@ -702,12 +709,13 @@ xfs_trans_inode_buf(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_INODE_BUF;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
/*
* This call is used to indicate that the buffer is going to
* be staled and was an inode buffer. This means it gets
- * special processing during unpin - where any inodes
+ * special processing during unpin - where any inodes
* associated with the buffer should be removed from ail.
* There is also special processing during recovery,
* any replay of the inodes in the buffer needs to be
@@ -726,6 +734,7 @@ xfs_trans_stale_inode_buf(
bip->bli_flags |= XFS_BLI_STALE_INODE;
bip->bli_item.li_cb = xfs_buf_iodone;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
/*
@@ -749,8 +758,66 @@ xfs_trans_inode_alloc_buf(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+}
+
+/*
+ * Mark the buffer as ordered for this transaction. This means
+ * that the contents of the buffer are not recorded in the transaction
+ * but it is tracked in the AIL as though it was. This allows us
+ * to record logical changes in transactions rather than the physical
+ * changes we make to the buffer without changing writeback ordering
+ * constraints of metadata buffers.
+ */
+void
+xfs_trans_ordered_buf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ bip->bli_flags |= XFS_BLI_ORDERED;
+ trace_xfs_buf_item_ordered(bip);
+}
+
+/*
+ * Set the buffer type for log recovery so that it can identify the buffer and
+ * attach the correct buffer ops after replay. A NULL @tp makes this a no-op.
+ */
+void
+xfs_trans_buf_set_type(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ enum xfs_blft type)
+{
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!tp)
+ return;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ xfs_blft_to_flags(&bip->__bli_format, type);
 }
+void
+xfs_trans_buf_copy_type(
+ struct xfs_buf *dst_bp,
+ struct xfs_buf *src_bp)
+{
+ struct xfs_buf_log_item *sbip = src_bp->b_fspriv;
+ struct xfs_buf_log_item *dbip = dst_bp->b_fspriv;
+ enum xfs_blft type;
+
+ type = xfs_blft_from_flags(&sbip->__bli_format);
+ xfs_blft_to_flags(&dbip->__bli_format, type);
+}
/*
* Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of
@@ -769,14 +836,28 @@ xfs_trans_dquot_buf(
xfs_buf_t *bp,
uint type)
{
- xfs_buf_log_item_t *bip = bp->b_fspriv;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
- ASSERT(bp->b_transp == tp);
- ASSERT(bip != NULL);
ASSERT(type == XFS_BLF_UDQUOT_BUF ||
type == XFS_BLF_PDQUOT_BUF ||
type == XFS_BLF_GDQUOT_BUF);
- ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->__bli_format.blf_flags |= type;
+
+ switch (type) {
+ case XFS_BLF_UDQUOT_BUF:
+ type = XFS_BLFT_UDQUOT_BUF;
+ break;
+ case XFS_BLF_PDQUOT_BUF:
+ type = XFS_BLFT_PDQUOT_BUF;
+ break;
+ case XFS_BLF_GDQUOT_BUF:
+ type = XFS_BLFT_GDQUOT_BUF;
+ break;
+ default:
+ type = XFS_BLFT_UNKNOWN_BUF;
+ break;
+ }
+
+ xfs_trans_buf_set_type(tp, bp, type);
}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 642c2d6e1db1..3ba64d540168 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -103,8 +103,6 @@ xfs_trans_dup_dqinfo(
return;
xfs_trans_alloc_dqinfo(ntp);
- oqa = otp->t_dqinfo->dqa_usrdquots;
- nqa = ntp->t_dqinfo->dqa_usrdquots;
/*
* Because the quota blk reservation is carried forward,
@@ -113,7 +111,9 @@ xfs_trans_dup_dqinfo(
if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
- for (j = 0; j < 2; j++) {
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ oqa = otp->t_dqinfo->dqs[j];
+ nqa = ntp->t_dqinfo->dqs[j];
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
if (oqa[i].qt_dquot == NULL)
break;
@@ -138,8 +138,6 @@ xfs_trans_dup_dqinfo(
oq->qt_ino_res = oq->qt_ino_res_used;
}
- oqa = otp->t_dqinfo->dqa_grpdquots;
- nqa = ntp->t_dqinfo->dqa_grpdquots;
}
}
@@ -157,8 +155,7 @@ xfs_trans_mod_dquot_byino(
if (!XFS_IS_QUOTA_RUNNING(mp) ||
!XFS_IS_QUOTA_ON(mp) ||
- ip->i_ino == mp->m_sb.sb_uquotino ||
- ip->i_ino == mp->m_sb.sb_gquotino)
+ xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
return;
if (tp->t_dqinfo == NULL)
@@ -170,16 +167,18 @@ xfs_trans_mod_dquot_byino(
(void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
}
-STATIC xfs_dqtrx_t *
+STATIC struct xfs_dqtrx *
xfs_trans_get_dqtrx(
- xfs_trans_t *tp,
- xfs_dquot_t *dqp)
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
{
- int i;
- xfs_dqtrx_t *qa;
+ int i;
+ struct xfs_dqtrx *qa;
- qa = XFS_QM_ISUDQ(dqp) ?
- tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
+ if (XFS_QM_ISUDQ(dqp))
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
+ else
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
if (qa[i].qt_dquot == NULL ||
@@ -326,12 +325,12 @@ xfs_trans_dqlockedjoin(
*/
void
xfs_trans_apply_dquot_deltas(
- xfs_trans_t *tp)
+ struct xfs_trans *tp)
{
int i, j;
- xfs_dquot_t *dqp;
- xfs_dqtrx_t *qtrx, *qa;
- xfs_disk_dquot_t *d;
+ struct xfs_dquot *dqp;
+ struct xfs_dqtrx *qtrx, *qa;
+ struct xfs_disk_dquot *d;
long totalbdelta;
long totalrtbdelta;
@@ -339,12 +338,10 @@ xfs_trans_apply_dquot_deltas(
return;
ASSERT(tp->t_dqinfo);
- qa = tp->t_dqinfo->dqa_usrdquots;
- for (j = 0; j < 2; j++) {
- if (qa[0].qt_dquot == NULL) {
- qa = tp->t_dqinfo->dqa_grpdquots;
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ qa = tp->t_dqinfo->dqs[j];
+ if (qa[0].qt_dquot == NULL)
continue;
- }
/*
* Lock all of the dquots and join them to the transaction.
@@ -412,7 +409,7 @@ xfs_trans_apply_dquot_deltas(
* Start/reset the timer(s) if needed.
*/
if (d->d_id) {
- xfs_qm_adjust_dqlimits(tp->t_mountp, d);
+ xfs_qm_adjust_dqlimits(tp->t_mountp, dqp);
xfs_qm_adjust_dqtimers(tp->t_mountp, d);
}
@@ -495,10 +492,6 @@ xfs_trans_apply_dquot_deltas(
ASSERT(dqp->q_res_rtbcount >=
be64_to_cpu(dqp->q_core.d_rtbcount));
}
- /*
- * Do the group quotas next
- */
- qa = tp->t_dqinfo->dqa_grpdquots;
}
}
@@ -521,9 +514,9 @@ xfs_trans_unreserve_and_mod_dquots(
if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
return;
- qa = tp->t_dqinfo->dqa_usrdquots;
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ qa = tp->t_dqinfo->dqs[j];
- for (j = 0; j < 2; j++) {
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
qtrx = &qa[i];
/*
@@ -565,7 +558,6 @@ xfs_trans_unreserve_and_mod_dquots(
xfs_dqunlock(dqp);
}
- qa = tp->t_dqinfo->dqa_grpdquots;
}
}
@@ -640,8 +632,8 @@ xfs_trans_dqresv(
if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
dqp->q_core.d_id &&
((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
- (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
- (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
+ (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
+ (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
if (nblks > 0) {
/*
* dquot is locked already. See if we'd go over the
@@ -748,15 +740,15 @@ error_return:
*/
int
xfs_trans_reserve_quota_bydquots(
- xfs_trans_t *tp,
- xfs_mount_t *mp,
- xfs_dquot_t *udqp,
- xfs_dquot_t *gdqp,
- long nblks,
- long ninos,
- uint flags)
+ struct xfs_trans *tp,
+ struct xfs_mount *mp,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ long nblks,
+ long ninos,
+ uint flags)
{
- int resvd = 0, error;
+ int error;
if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
return 0;
@@ -771,28 +763,24 @@ xfs_trans_reserve_quota_bydquots(
(flags & ~XFS_QMOPT_ENOSPC));
if (error)
return error;
- resvd = 1;
}
if (gdqp) {
error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
- if (error) {
- /*
- * can't do it, so backout previous reservation
- */
- if (resvd) {
- flags |= XFS_QMOPT_FORCE_RES;
- xfs_trans_dqresv(tp, mp, udqp,
- -nblks, -ninos, flags);
- }
- return error;
- }
+ if (error)
+ goto unwind_usr;
}
/*
* Didn't change anything critical, so, no need to log
*/
return 0;
+
+unwind_usr:
+ flags |= XFS_QMOPT_FORCE_RES;
+ if (udqp)
+ xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
+ return error;
}
@@ -816,8 +804,7 @@ xfs_trans_reserve_quota_nblks(
if (XFS_IS_PQUOTA_ON(mp))
flags |= XFS_QMOPT_ENOSPC;
- ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
- ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
+ ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ac6d567704db..53dfe46f3680 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -112,6 +112,17 @@ xfs_trans_log_inode(
ASSERT(ip->i_itemp != NULL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ /*
+ * First time we log the inode in a transaction, bump the inode change
+ * counter if it is configured for this to occur.
+ */
+ if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
+ IS_I_VERSION(VFS_I(ip))) {
+ inode_inc_iversion(VFS_I(ip));
+ ip->i_d.di_changecount = VFS_I(ip)->i_version;
+ flags |= XFS_ILOG_CORE;
+ }
+
tp->t_flags |= XFS_TRANS_DIRTY;
ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 77ad74834baa..42c0ef288aeb 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -48,103 +49,8 @@
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_symlink.h"
-/*
- * The maximum pathlen is 1024 bytes. Since the minimum file system
- * blocksize is 512 bytes, we can get a max of 2 extents back from
- * bmapi.
- */
-#define SYMLINK_MAPS 2
-
-STATIC int
-xfs_readlink_bmap(
- xfs_inode_t *ip,
- char *link)
-{
- xfs_mount_t *mp = ip->i_mount;
- int pathlen = ip->i_d.di_size;
- int nmaps = SYMLINK_MAPS;
- xfs_bmbt_irec_t mval[SYMLINK_MAPS];
- xfs_daddr_t d;
- int byte_cnt;
- int n;
- xfs_buf_t *bp;
- int error = 0;
-
- error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps,
- 0);
- if (error)
- goto out;
-
- for (n = 0; n < nmaps; n++) {
- d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
- byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
-
- bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL);
- if (!bp)
- return XFS_ERROR(ENOMEM);
- error = bp->b_error;
- if (error) {
- xfs_buf_ioerror_alert(bp, __func__);
- xfs_buf_relse(bp);
- goto out;
- }
- if (pathlen < byte_cnt)
- byte_cnt = pathlen;
- pathlen -= byte_cnt;
-
- memcpy(link, bp->b_addr, byte_cnt);
- xfs_buf_relse(bp);
- }
-
- link[ip->i_d.di_size] = '\0';
- error = 0;
-
- out:
- return error;
-}
-
-int
-xfs_readlink(
- xfs_inode_t *ip,
- char *link)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_fsize_t pathlen;
- int error = 0;
-
- trace_xfs_readlink(ip);
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
-
- pathlen = ip->i_d.di_size;
- if (!pathlen)
- goto out;
-
- if (pathlen < 0 || pathlen > MAXPATHLEN) {
- xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
- __func__, (unsigned long long) ip->i_ino,
- (long long) pathlen);
- ASSERT(0);
- error = XFS_ERROR(EFSCORRUPTED);
- goto out;
- }
-
-
- if (ip->i_df.if_flags & XFS_IFINLINE) {
- memcpy(link, ip->i_df.if_u1.if_data, pathlen);
- link[pathlen] = '\0';
- } else {
- error = xfs_readlink_bmap(ip, link);
- }
-
- out:
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return error;
-}
/*
* This is called by xfs_inactive to free any blocks beyond eof
@@ -249,145 +155,6 @@ xfs_free_eofblocks(
return error;
}
-/*
- * Free a symlink that has blocks associated with it.
- */
-STATIC int
-xfs_inactive_symlink_rmt(
- xfs_inode_t *ip,
- xfs_trans_t **tpp)
-{
- xfs_buf_t *bp;
- int committed;
- int done;
- int error;
- xfs_fsblock_t first_block;
- xfs_bmap_free_t free_list;
- int i;
- xfs_mount_t *mp;
- xfs_bmbt_irec_t mval[SYMLINK_MAPS];
- int nmaps;
- xfs_trans_t *ntp;
- int size;
- xfs_trans_t *tp;
-
- tp = *tpp;
- mp = ip->i_mount;
- ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
- /*
- * We're freeing a symlink that has some
- * blocks allocated to it. Free the
- * blocks here. We know that we've got
- * either 1 or 2 extents and that we can
- * free them all in one bunmapi call.
- */
- ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
-
- /*
- * Lock the inode, fix the size, and join it to the transaction.
- * Hold it so in the normal path, we still have it locked for
- * the second transaction. In the error paths we need it
- * held so the cancel won't rele it, see below.
- */
- size = (int)ip->i_d.di_size;
- ip->i_d.di_size = 0;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- /*
- * Find the block(s) so we can inval and unmap them.
- */
- done = 0;
- xfs_bmap_init(&free_list, &first_block);
- nmaps = ARRAY_SIZE(mval);
- error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size),
- mval, &nmaps, 0);
- if (error)
- goto error0;
- /*
- * Invalidate the block(s).
- */
- for (i = 0; i < nmaps; i++) {
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
- XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
- if (!bp) {
- error = ENOMEM;
- goto error1;
- }
- xfs_trans_binval(tp, bp);
- }
- /*
- * Unmap the dead block(s) to the free_list.
- */
- if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
- &first_block, &free_list, &done)))
- goto error1;
- ASSERT(done);
- /*
- * Commit the first transaction. This logs the EFI and the inode.
- */
- if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
- goto error1;
- /*
- * The transaction must have been committed, since there were
- * actually extents freed by xfs_bunmapi. See xfs_bmap_finish.
- * The new tp has the extent freeing and EFDs.
- */
- ASSERT(committed);
- /*
- * The first xact was committed, so add the inode to the new one.
- * Mark it dirty so it will be logged and moved forward in the log as
- * part of every commit.
- */
- xfs_trans_ijoin(tp, ip, 0);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- /*
- * Get a new, empty transaction to return to our caller.
- */
- ntp = xfs_trans_dup(tp);
- /*
- * Commit the transaction containing extent freeing and EFDs.
- * If we get an error on the commit here or on the reserve below,
- * we need to unlock the inode since the new transaction doesn't
- * have the inode attached.
- */
- error = xfs_trans_commit(tp, 0);
- tp = ntp;
- if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- goto error0;
- }
- /*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(tp->t_ticket);
-
- /*
- * Remove the memory for extent descriptions (just bookkeeping).
- */
- if (ip->i_df.if_bytes)
- xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
- ASSERT(ip->i_df.if_bytes == 0);
- /*
- * Put an itruncate log reservation in the new transaction
- * for our caller.
- */
- if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- goto error0;
- }
-
- xfs_trans_ijoin(tp, ip, 0);
- *tpp = tp;
- return 0;
-
- error1:
- xfs_bmap_cancel(&free_list);
- error0:
- return error;
-}
-
int
xfs_release(
xfs_inode_t *ip)
@@ -555,18 +322,9 @@ xfs_inactive(
xfs_trans_ijoin(tp, ip, 0);
if (S_ISLNK(ip->i_d.di_mode)) {
- /*
- * Zero length symlinks _can_ exist.
- */
- if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) {
- error = xfs_inactive_symlink_rmt(ip, &tp);
- if (error)
- goto out_cancel;
- } else if (ip->i_df.if_bytes > 0) {
- xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
- XFS_DATA_FORK);
- ASSERT(ip->i_df.if_bytes == 0);
- }
+ error = xfs_inactive_symlink(ip, &tp);
+ if (error)
+ goto out_cancel;
} else if (truncate) {
ip->i_d.di_size = 0;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -1353,247 +1111,6 @@ xfs_link(
}
int
-xfs_symlink(
- xfs_inode_t *dp,
- struct xfs_name *link_name,
- const char *target_path,
- umode_t mode,
- xfs_inode_t **ipp)
-{
- xfs_mount_t *mp = dp->i_mount;
- xfs_trans_t *tp;
- xfs_inode_t *ip;
- int error;
- int pathlen;
- xfs_bmap_free_t free_list;
- xfs_fsblock_t first_block;
- bool unlock_dp_on_error = false;
- uint cancel_flags;
- int committed;
- xfs_fileoff_t first_fsb;
- xfs_filblks_t fs_blocks;
- int nmaps;
- xfs_bmbt_irec_t mval[SYMLINK_MAPS];
- xfs_daddr_t d;
- const char *cur_chunk;
- int byte_cnt;
- int n;
- xfs_buf_t *bp;
- prid_t prid;
- struct xfs_dquot *udqp, *gdqp;
- uint resblks;
-
- *ipp = NULL;
- error = 0;
- ip = NULL;
- tp = NULL;
-
- trace_xfs_symlink(dp, link_name);
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- /*
- * Check component lengths of the target path name.
- */
- pathlen = strlen(target_path);
- if (pathlen >= MAXPATHLEN) /* total string too long */
- return XFS_ERROR(ENAMETOOLONG);
-
- udqp = gdqp = NULL;
- if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- prid = xfs_get_projid(dp);
- else
- prid = XFS_PROJID_DEFAULT;
-
- /*
- * Make sure that we have allocated dquot(s) on disk.
- */
- error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
- if (error)
- goto std_return;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
- /*
- * The symlink will fit into the inode data fork?
- * There can't be any attributes so we get the whole variable part.
- */
- if (pathlen <= XFS_LITINO(mp))
- fs_blocks = 0;
- else
- fs_blocks = XFS_B_TO_FSB(mp, pathlen);
- resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
- error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
- if (error == ENOSPC && fs_blocks == 0) {
- resblks = 0;
- error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
- }
- if (error) {
- cancel_flags = 0;
- goto error_return;
- }
-
- xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
- unlock_dp_on_error = true;
-
- /*
- * Check whether the directory allows new symlinks or not.
- */
- if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
- error = XFS_ERROR(EPERM);
- goto error_return;
- }
-
- /*
- * Reserve disk quota : blocks and inode.
- */
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
- if (error)
- goto error_return;
-
- /*
- * Check for ability to enter directory entry, if no space reserved.
- */
- error = xfs_dir_canenter(tp, dp, link_name, resblks);
- if (error)
- goto error_return;
- /*
- * Initialize the bmap freelist prior to calling either
- * bmapi or the directory create code.
- */
- xfs_bmap_init(&free_list, &first_block);
-
- /*
- * Allocate an inode for the symlink.
- */
- error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
- prid, resblks > 0, &ip, NULL);
- if (error) {
- if (error == ENOSPC)
- goto error_return;
- goto error1;
- }
-
- /*
- * An error after we've joined dp to the transaction will result in the
- * transaction cancel unlocking dp so don't do it explicitly in the
- * error path.
- */
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- unlock_dp_on_error = false;
-
- /*
- * Also attach the dquot(s) to it, if applicable.
- */
- xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
-
- if (resblks)
- resblks -= XFS_IALLOC_SPACE_RES(mp);
- /*
- * If the symlink will fit into the inode, write it inline.
- */
- if (pathlen <= XFS_IFORK_DSIZE(ip)) {
- xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
- memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
- ip->i_d.di_size = pathlen;
-
- /*
- * The inode was initially created in extent format.
- */
- ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
- ip->i_df.if_flags |= XFS_IFINLINE;
-
- ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
-
- } else {
- first_fsb = 0;
- nmaps = SYMLINK_MAPS;
-
- error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
- XFS_BMAPI_METADATA, &first_block, resblks,
- mval, &nmaps, &free_list);
- if (error)
- goto error2;
-
- if (resblks)
- resblks -= fs_blocks;
- ip->i_d.di_size = pathlen;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- cur_chunk = target_path;
- for (n = 0; n < nmaps; n++) {
- d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
- byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- BTOBB(byte_cnt), 0);
- if (!bp) {
- error = ENOMEM;
- goto error2;
- }
- if (pathlen < byte_cnt) {
- byte_cnt = pathlen;
- }
- pathlen -= byte_cnt;
-
- memcpy(bp->b_addr, cur_chunk, byte_cnt);
- cur_chunk += byte_cnt;
-
- xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
- }
- }
-
- /*
- * Create the directory entry for the symlink.
- */
- error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
- &first_block, &free_list, resblks);
- if (error)
- goto error2;
- xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-
- /*
- * If this is a synchronous mount, make sure that the
- * symlink transaction goes to disk before returning to
- * the user.
- */
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
- xfs_trans_set_sync(tp);
- }
-
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
- goto error2;
- }
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- xfs_qm_dqrele(udqp);
- xfs_qm_dqrele(gdqp);
-
- *ipp = ip;
- return 0;
-
- error2:
- IRELE(ip);
- error1:
- xfs_bmap_cancel(&free_list);
- cancel_flags |= XFS_TRANS_ABORT;
- error_return:
- xfs_trans_cancel(tp, cancel_flags);
- xfs_qm_dqrele(udqp);
- xfs_qm_dqrele(gdqp);
-
- if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- std_return:
- return error;
-}
-
-int
xfs_set_dmattrs(
xfs_inode_t *ip,
u_int evmask,
@@ -1927,7 +1444,7 @@ xfs_free_file_space(
xfs_mount_t *mp;
int nimap;
uint resblks;
- uint rounding;
+ xfs_off_t rounding;
int rt;
xfs_fileoff_t startoffset_fsb;
xfs_trans_t *tp;
@@ -1956,7 +1473,7 @@ xfs_free_file_space(
inode_dio_wait(VFS_I(ip));
}
- rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+ rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
ioffset = offset & ~(rounding - 1);
error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
ioffset, -1);
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 5163022d9808..38c67c34d73f 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -31,8 +31,7 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode *ip);
int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
struct xfs_name *target_name);
-int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
- xfs_off_t *offset, filldir_t filldir);
+int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize);
int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
const char *target_path, umode_t mode, struct xfs_inode **ipp);
int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);