summary | refs | log | tree | commit | diff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_super.c6
-rw-r--r--fs/adfs/super.c4
-rw-r--r--fs/affs/amigaffs.c7
-rw-r--r--fs/affs/bitmap.c6
-rw-r--r--fs/affs/dir.c5
-rw-r--r--fs/affs/super.c19
-rw-r--r--fs/afs/dir.c37
-rw-r--r--fs/afs/fsclient.c3
-rw-r--r--fs/afs/inode.c9
-rw-r--r--fs/afs/internal.h5
-rw-r--r--fs/afs/rxrpc.c2
-rw-r--r--fs/afs/security.c18
-rw-r--r--fs/afs/super.c18
-rw-r--r--fs/afs/write.c8
-rw-r--r--fs/autofs4/root.c17
-rw-r--r--fs/autofs4/waitq.c1
-rw-r--r--fs/befs/ChangeLog2
-rw-r--r--fs/befs/linuxvfs.c4
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/backref.c3
-rw-r--r--fs/btrfs/compression.c150
-rw-r--r--fs/btrfs/compression.h9
-rw-r--r--fs/btrfs/ctree.c74
-rw-r--r--fs/btrfs/ctree.h15
-rw-r--r--fs/btrfs/delayed-inode.c137
-rw-r--r--fs/btrfs/delayed-ref.c2
-rw-r--r--fs/btrfs/delayed-ref.h2
-rw-r--r--fs/btrfs/dev-replace.c30
-rw-r--r--fs/btrfs/dir-item.c108
-rw-r--r--fs/btrfs/disk-io.c97
-rw-r--r--fs/btrfs/disk-io.h3
-rw-r--r--fs/btrfs/export.c5
-rw-r--r--fs/btrfs/extent-tree.c36
-rw-r--r--fs/btrfs/extent_io.c158
-rw-r--r--fs/btrfs/extent_io.h58
-rw-r--r--fs/btrfs/extent_map.c132
-rw-r--r--fs/btrfs/extent_map.h3
-rw-r--r--fs/btrfs/file.c520
-rw-r--r--fs/btrfs/free-space-cache.c16
-rw-r--r--fs/btrfs/inode.c332
-rw-r--r--fs/btrfs/ioctl.c51
-rw-r--r--fs/btrfs/props.c13
-rw-r--r--fs/btrfs/qgroup.c3
-rw-r--r--fs/btrfs/raid56.c119
-rw-r--r--fs/btrfs/ref-verify.c6
-rw-r--r--fs/btrfs/relocation.c3
-rw-r--r--fs/btrfs/root-tree.c7
-rw-r--r--fs/btrfs/scrub.c95
-rw-r--r--fs/btrfs/send.c130
-rw-r--r--fs/btrfs/super.c401
-rw-r--r--fs/btrfs/sysfs.c2
-rw-r--r--fs/btrfs/tests/btrfs-tests.c3
-rw-r--r--fs/btrfs/tests/btrfs-tests.h1
-rw-r--r--fs/btrfs/tests/extent-io-tests.c6
-rw-r--r--fs/btrfs/tests/extent-map-tests.c366
-rw-r--r--fs/btrfs/tests/inode-tests.c29
-rw-r--r--fs/btrfs/transaction.c22
-rw-r--r--fs/btrfs/transaction.h11
-rw-r--r--fs/btrfs/tree-checker.c169
-rw-r--r--fs/btrfs/tree-checker.h14
-rw-r--r--fs/btrfs/tree-log.c60
-rw-r--r--fs/btrfs/volumes.c706
-rw-r--r--fs/btrfs/volumes.h45
-rw-r--r--fs/btrfs/xattr.c7
-rw-r--r--fs/btrfs/zstd.c132
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/ceph/mds_client.c42
-rw-r--r--fs/ceph/super.c8
-rw-r--r--fs/cifs/Kconfig8
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/cifs_debug.c199
-rw-r--r--fs/cifs/cifs_fs_sb.h2
-rw-r--r--fs/cifs/cifsacl.c2
-rw-r--r--fs/cifs/cifsencrypt.c3
-rw-r--r--fs/cifs/cifsfs.c20
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h39
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/cifssmb.c22
-rw-r--r--fs/cifs/connect.c251
-rw-r--r--fs/cifs/file.c43
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/misc.c14
-rw-r--r--fs/cifs/smb1ops.c4
-rw-r--r--fs/cifs/smb2file.c2
-rw-r--r--fs/cifs/smb2misc.c2
-rw-r--r--fs/cifs/smb2ops.c80
-rw-r--r--fs/cifs/smb2pdu.c608
-rw-r--r--fs/cifs/smb2pdu.h60
-rw-r--r--fs/cifs/smb2proto.h3
-rw-r--r--fs/cifs/smbdirect.c2610
-rw-r--r--fs/cifs/smbdirect.h338
-rw-r--r--fs/cifs/transport.c69
-rw-r--r--fs/cifs/xattr.c8
-rw-r--r--fs/coda/inode.c4
-rw-r--r--fs/cramfs/Kconfig1
-rw-r--r--fs/cramfs/inode.c4
-rw-r--r--fs/dcache.c10
-rw-r--r--fs/ecryptfs/main.c8
-rw-r--r--fs/efs/super.c4
-rw-r--r--fs/exec.c16
-rw-r--r--fs/exofs/dir.c9
-rw-r--r--fs/exofs/super.c3
-rw-r--r--fs/ext2/balloc.c4
-rw-r--r--fs/ext2/dir.c9
-rw-r--r--fs/ext2/ialloc.c4
-rw-r--r--fs/ext2/super.c25
-rw-r--r--fs/ext4/dir.c9
-rw-r--r--fs/ext4/extents.c1
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inline.c7
-rw-r--r--fs/ext4/inode.c26
-rw-r--r--fs/ext4/ioctl.c3
-rw-r--r--fs/ext4/namei.c9
-rw-r--r--fs/ext4/super.c55
-rw-r--r--fs/ext4/xattr.c5
-rw-r--r--fs/f2fs/checkpoint.c10
-rw-r--r--fs/f2fs/data.c2
-rw-r--r--fs/f2fs/f2fs.h2
-rw-r--r--fs/f2fs/gc.c2
-rw-r--r--fs/f2fs/recovery.c10
-rw-r--r--fs/f2fs/super.c28
-rw-r--r--fs/fat/dir.c3
-rw-r--r--fs/fat/fatent.c6
-rw-r--r--fs/fat/inode.c19
-rw-r--r--fs/fat/misc.c2
-rw-r--r--fs/fat/namei_msdos.c9
-rw-r--r--fs/fat/namei_vfat.c22
-rw-r--r--fs/fcntl.c1
-rw-r--r--fs/file.c2
-rw-r--r--fs/freevxfs/vxfs_super.c4
-rw-r--r--fs/fs-writeback.c4
-rw-r--r--fs/fuse/inode.c12
-rw-r--r--fs/gfs2/ops_fstype.c16
-rw-r--r--fs/gfs2/super.c10
-rw-r--r--fs/gfs2/trans.c2
-rw-r--r--fs/hfs/mdb.c4
-rw-r--r--fs/hfs/super.c16
-rw-r--r--fs/hfsplus/super.c22
-rw-r--r--fs/hpfs/dir.c1
-rw-r--r--fs/hpfs/dnode.c2
-rw-r--r--fs/hpfs/map.c2
-rw-r--r--fs/hpfs/super.c9
-rw-r--r--fs/hugetlbfs/inode.c4
-rw-r--r--fs/inode.c19
-rw-r--r--fs/isofs/inode.c2
-rw-r--r--fs/jffs2/fs.c4
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jffs2/super.c4
-rw-r--r--fs/jfs/super.c10
-rw-r--r--fs/kernfs/mount.c2
-rw-r--r--fs/libfs.c6
-rw-r--r--fs/lockd/host.c24
-rw-r--r--fs/lockd/mon.c3
-rw-r--r--fs/lockd/svc.c38
-rw-r--r--fs/lockd/svcsubs.c2
-rw-r--r--fs/locks.c2
-rw-r--r--fs/mbcache.c3
-rw-r--r--fs/minix/inode.c4
-rw-r--r--fs/namei.c15
-rw-r--r--fs/namespace.c1
-rw-r--r--fs/ncpfs/inode.c4
-rw-r--r--fs/nfs/client.c11
-rw-r--r--fs/nfs/delegation.c3
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/nfs/fscache-index.c5
-rw-r--r--fs/nfs/inode.c20
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/nfs4client.c17
-rw-r--r--fs/nfs/nfs4proc.c10
-rw-r--r--fs/nfs/nfs4state.c4
-rw-r--r--fs/nfs/nfstrace.h5
-rw-r--r--fs/nfs/super.c22
-rw-r--r--fs/nfs/write.c10
-rw-r--r--fs/nfs_common/grace.c10
-rw-r--r--fs/nfsd/auth.c3
-rw-r--r--fs/nfsd/export.c10
-rw-r--r--fs/nfsd/netns.h3
-rw-r--r--fs/nfsd/nfs4idmap.c4
-rw-r--r--fs/nfsd/nfs4state.c279
-rw-r--r--fs/nfsd/nfsctl.c3
-rw-r--r--fs/nfsd/nfsfh.h3
-rw-r--r--fs/nfsd/nfssvc.c14
-rw-r--r--fs/nilfs2/segment.c2
-rw-r--r--fs/nilfs2/super.c24
-rw-r--r--fs/nilfs2/the_nilfs.c6
-rw-r--r--fs/notify/fsnotify.c2
-rw-r--r--fs/nsfs.c2
-rw-r--r--fs/ntfs/inode.c9
-rw-r--r--fs/ntfs/mft.c6
-rw-r--r--fs/ntfs/super.c32
-rw-r--r--fs/ocfs2/dir.c15
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/ocfs2/inode.c3
-rw-r--r--fs/ocfs2/namei.c3
-rw-r--r--fs/ocfs2/quota_global.c3
-rw-r--r--fs/ocfs2/super.c28
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/openpromfs/inode.c4
-rw-r--r--fs/orangefs/devorangefs-req.c3
-rw-r--r--fs/orangefs/file.c7
-rw-r--r--fs/orangefs/orangefs-kernel.h11
-rw-r--r--fs/orangefs/super.c8
-rw-r--r--fs/orangefs/waitqueue.c4
-rw-r--r--fs/overlayfs/Kconfig10
-rw-r--r--fs/overlayfs/dir.c3
-rw-r--r--fs/overlayfs/namei.c18
-rw-r--r--fs/overlayfs/overlayfs.h2
-rw-r--r--fs/overlayfs/ovl_entry.h2
-rw-r--r--fs/overlayfs/readdir.c7
-rw-r--r--fs/overlayfs/super.c97
-rw-r--r--fs/proc/array.c7
-rw-r--r--fs/proc/base.c5
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc_namespace.c8
-rw-r--r--fs/qnx4/inode.c4
-rw-r--r--fs/qnx6/inode.c4
-rw-r--r--fs/quota/dquot.c22
-rw-r--r--fs/reiserfs/inode.c2
-rw-r--r--fs/reiserfs/journal.c6
-rw-r--r--fs/reiserfs/prints.c4
-rw-r--r--fs/reiserfs/super.c19
-rw-r--r--fs/reiserfs/xattr.c10
-rw-r--r--fs/romfs/super.c4
-rw-r--r--fs/squashfs/super.c4
-rw-r--r--fs/statfs.c6
-rw-r--r--fs/super.c43
-rw-r--r--fs/sysfs/mount.c2
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/sysv/super.c2
-rw-r--r--fs/ubifs/dir.c43
-rw-r--r--fs/ubifs/file.c43
-rw-r--r--fs/ubifs/io.c2
-rw-r--r--fs/ubifs/super.c20
-rw-r--r--fs/ubifs/tnc.c21
-rw-r--r--fs/ubifs/ubifs.h4
-rw-r--r--fs/ubifs/xattr.c2
-rw-r--r--fs/udf/super.c6
-rw-r--r--fs/ufs/balloc.c8
-rw-r--r--fs/ufs/dir.c9
-rw-r--r--fs/ufs/ialloc.c10
-rw-r--r--fs/ufs/inode.c3
-rw-r--r--fs/ufs/super.c33
-rw-r--r--fs/userfaultfd.c20
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c4
-rw-r--r--fs/xfs/libxfs/xfs_attr.c20
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c9
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h3
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c8
-rw-r--r--fs/xfs/libxfs/xfs_defer.c39
-rw-r--r--fs/xfs/libxfs/xfs_defer.h5
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c10
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h1
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c4
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c7
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c52
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c99
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h16
-rw-r--r--fs/xfs/scrub/inode.c14
-rw-r--r--fs/xfs/scrub/quota.c4
-rw-r--r--fs/xfs/scrub/scrub.c1
-rw-r--r--fs/xfs/scrub/trace.c1
-rw-r--r--fs/xfs/xfs_aops.c12
-rw-r--r--fs/xfs/xfs_bmap_item.c23
-rw-r--r--fs/xfs/xfs_bmap_item.h3
-rw-r--r--fs/xfs/xfs_buf.c15
-rw-r--r--fs/xfs/xfs_dquot.c14
-rw-r--r--fs/xfs/xfs_dquot_item.c40
-rw-r--r--fs/xfs/xfs_extfree_item.c2
-rw-r--r--fs/xfs/xfs_fsops.c5
-rw-r--r--fs/xfs/xfs_icache.c40
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_inode.c85
-rw-r--r--fs/xfs/xfs_inode.h3
-rw-r--r--fs/xfs/xfs_inode_item.c3
-rw-r--r--fs/xfs/xfs_iomap.c4
-rw-r--r--fs/xfs/xfs_log.c6
-rw-r--r--fs/xfs/xfs_log_recover.c75
-rw-r--r--fs/xfs/xfs_qm.c50
-rw-r--r--fs/xfs/xfs_refcount_item.c21
-rw-r--r--fs/xfs/xfs_refcount_item.h3
-rw-r--r--fs/xfs/xfs_reflink.c23
-rw-r--r--fs/xfs/xfs_super.c17
-rw-r--r--fs/xfs/xfs_super.h2
-rw-r--r--fs/xfs/xfs_symlink.c15
-rw-r--r--fs/xfs/xfs_trace.c1
-rw-r--r--fs/xfs/xfs_trans_inode.c16
288 files changed, 8259 insertions, 3158 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 8b75463cb211..af03c2a901eb 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -94,13 +94,13 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
if (v9ses->cache)
sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
- sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
+ sb->s_flags |= SB_ACTIVE | SB_DIRSYNC | SB_NOATIME;
if (!v9ses->cache)
- sb->s_flags |= MS_SYNCHRONOUS;
+ sb->s_flags |= SB_SYNCHRONOUS;
#ifdef CONFIG_9P_FS_POSIX_ACL
if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
- sb->s_flags |= MS_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
#endif
return 0;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index c9fdfb112933..cfda2c7caedc 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -213,7 +213,7 @@ static int parse_options(struct super_block *sb, char *options)
static int adfs_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- *flags |= MS_NODIRATIME;
+ *flags |= SB_NODIRATIME;
return parse_options(sb, data);
}
@@ -372,7 +372,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
struct inode *root;
int ret = -EINVAL;
- sb->s_flags |= MS_NODIRATIME;
+ sb->s_flags |= SB_NODIRATIME;
asb = kzalloc(sizeof(*asb), GFP_KERNEL);
if (!asb)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 185d5ab7e986..14a6c1b90c9f 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -10,6 +10,7 @@
*/
#include <linux/math64.h>
+#include <linux/iversion.h>
#include "affs.h"
/*
@@ -60,7 +61,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
affs_brelse(dir_bh);
dir->i_mtime = dir->i_ctime = current_time(dir);
- dir->i_version++;
+ inode_inc_iversion(dir);
mark_inode_dirty(dir);
return 0;
@@ -114,7 +115,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
affs_brelse(bh);
dir->i_mtime = dir->i_ctime = current_time(dir);
- dir->i_version++;
+ inode_inc_iversion(dir);
mark_inode_dirty(dir);
return retval;
@@ -453,7 +454,7 @@ affs_error(struct super_block *sb, const char *function, const char *fmt, ...)
pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf);
if (!sb_rdonly(sb))
pr_warn("Remounting filesystem read-only\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
va_end(args);
}
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index 2b1399611d9e..5ba9ef2742f6 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -250,12 +250,12 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
int i, res = 0;
struct affs_sb_info *sbi = AFFS_SB(sb);
- if (*flags & MS_RDONLY)
+ if (*flags & SB_RDONLY)
return 0;
if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) {
pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id);
- *flags |= MS_RDONLY;
+ *flags |= SB_RDONLY;
return 0;
}
@@ -288,7 +288,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
if (affs_checksum_block(sb, bh)) {
pr_warn("Bitmap %u invalid - mounting %s read only.\n",
bm->bm_key, sb->s_id);
- *flags |= MS_RDONLY;
+ *flags |= SB_RDONLY;
goto out;
}
pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key);
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index a105e77df2c1..d180b46453cf 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -14,6 +14,7 @@
*
*/
+#include <linux/iversion.h>
#include "affs.h"
static int affs_readdir(struct file *, struct dir_context *);
@@ -80,7 +81,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
* we can jump directly to where we left off.
*/
ino = (u32)(long)file->private_data;
- if (ino && file->f_version == inode->i_version) {
+ if (ino && inode_cmp_iversion(inode, file->f_version) == 0) {
pr_debug("readdir() left off=%d\n", ino);
goto inside;
}
@@ -130,7 +131,7 @@ inside:
} while (ino);
}
done:
- file->f_version = inode->i_version;
+ file->f_version = inode_query_iversion(inode);
file->private_data = (void *)(long)ino;
affs_brelse(fh_bh);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 884bedab7266..e602619aed9d 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -21,6 +21,7 @@
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/seq_file.h>
+#include <linux/iversion.h>
#include "affs.h"
static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -102,7 +103,7 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
if (!i)
return NULL;
- i->vfs_inode.i_version = 1;
+ inode_set_iversion(&i->vfs_inode, 1);
i->i_lc = NULL;
i->i_ext_bh = NULL;
i->i_pa_cnt = 0;
@@ -356,7 +357,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = AFFS_SUPER_MAGIC;
sb->s_op = &affs_sops;
- sb->s_flags |= MS_NODIRATIME;
+ sb->s_flags |= SB_NODIRATIME;
sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
if (!sbi)
@@ -466,7 +467,7 @@ got_root:
if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS
|| chksum == MUFS_DCOFS) && !sb_rdonly(sb)) {
pr_notice("Dircache FS - mounting %s read only\n", sb->s_id);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
switch (chksum) {
case MUFS_FS:
@@ -488,7 +489,7 @@ got_root:
/* fall thru */
case FS_OFS:
affs_set_opt(sbi->s_flags, SF_OFS);
- sb->s_flags |= MS_NOEXEC;
+ sb->s_flags |= SB_NOEXEC;
break;
case MUFS_DCOFS:
case MUFS_INTLOFS:
@@ -497,7 +498,7 @@ got_root:
case FS_INTLOFS:
affs_set_opt(sbi->s_flags, SF_INTL);
affs_set_opt(sbi->s_flags, SF_OFS);
- sb->s_flags |= MS_NOEXEC;
+ sb->s_flags |= SB_NOEXEC;
break;
default:
pr_err("Unknown filesystem on device %s: %08X\n",
@@ -513,7 +514,7 @@ got_root:
sig, sig[3] + '0', blocksize);
}
- sb->s_flags |= MS_NODEV | MS_NOSUID;
+ sb->s_flags |= SB_NODEV | SB_NOSUID;
sbi->s_data_blksize = sb->s_blocksize;
if (affs_test_opt(sbi->s_flags, SF_OFS))
@@ -570,7 +571,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
sync_filesystem(sb);
- *flags |= MS_NODIRATIME;
+ *flags |= SB_NODIRATIME;
memcpy(volume, sbi->s_volume, 32);
if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block,
@@ -596,10 +597,10 @@ affs_remount(struct super_block *sb, int *flags, char *data)
memcpy(sbi->s_volume, volume, 32);
spin_unlock(&sbi->symlink_lock);
- if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+ if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
return 0;
- if (*flags & MS_RDONLY)
+ if (*flags & SB_RDONLY)
affs_free_bitmap(sb);
else
res = affs_init_bitmap(sb, flags);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index ff8d5bf4354f..23c7f395d718 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -895,20 +895,38 @@ error:
* However, if we didn't have a callback promise outstanding, or it was
* outstanding on a different server, then it won't break it either...
*/
-static int afs_dir_remove_link(struct dentry *dentry, struct key *key)
+static int afs_dir_remove_link(struct dentry *dentry, struct key *key,
+ unsigned long d_version_before,
+ unsigned long d_version_after)
{
+ bool dir_valid;
int ret = 0;
+ /* There were no intervening changes on the server if the version
+ * number we got back was incremented by exactly 1.
+ */
+ dir_valid = (d_version_after == d_version_before + 1);
+
if (d_really_is_positive(dentry)) {
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
- kdebug("AFS_VNODE_DELETED");
- clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
-
- ret = afs_validate(vnode, key);
- if (ret == -ESTALE)
+ if (dir_valid) {
+ drop_nlink(&vnode->vfs_inode);
+ if (vnode->vfs_inode.i_nlink == 0) {
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ }
ret = 0;
+ } else {
+ clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+ kdebug("AFS_VNODE_DELETED");
+
+ ret = afs_validate(vnode, key);
+ if (ret == -ESTALE)
+ ret = 0;
+ }
_debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
}
@@ -923,6 +941,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
struct afs_fs_cursor fc;
struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
struct key *key;
+ unsigned long d_version = (unsigned long)dentry->d_fsdata;
int ret;
_enter("{%x:%u},{%pd}",
@@ -955,7 +974,9 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
ret = afs_end_vnode_operation(&fc);
if (ret == 0)
- ret = afs_dir_remove_link(dentry, key);
+ ret = afs_dir_remove_link(
+ dentry, key, d_version,
+ (unsigned long)dvnode->status.data_version);
}
error_key:
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index b90ef39ae914..88ec38c2d83c 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/circ_buf.h>
+#include <linux/iversion.h>
#include "internal.h"
#include "afs_fs.h"
@@ -124,7 +125,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
vnode->vfs_inode.i_ctime.tv_sec = status->mtime_client;
vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime;
vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime;
- vnode->vfs_inode.i_version = data_version;
+ inode_set_iversion_raw(&vnode->vfs_inode, data_version);
}
expected_version = status->data_version;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 3415eb7484f6..c7f17c44c7ce 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -21,6 +21,7 @@
#include <linux/sched.h>
#include <linux/mount.h>
#include <linux/namei.h>
+#include <linux/iversion.h>
#include "internal.h"
static const struct inode_operations afs_symlink_inode_operations = {
@@ -89,7 +90,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
inode->i_atime = inode->i_mtime = inode->i_ctime;
inode->i_blocks = 0;
inode->i_generation = vnode->fid.unique;
- inode->i_version = vnode->status.data_version;
+ inode_set_iversion_raw(inode, vnode->status.data_version);
inode->i_mapping->a_ops = &afs_fs_aops;
read_sequnlock_excl(&vnode->cb_lock);
@@ -218,7 +219,7 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
inode->i_ctime.tv_nsec = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime;
inode->i_blocks = 0;
- inode->i_version = 0;
+ inode_set_iversion_raw(inode, 0);
inode->i_generation = 0;
set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
@@ -377,6 +378,10 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
}
read_sequnlock_excl(&vnode->cb_lock);
+
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+ clear_nlink(&vnode->vfs_inode);
+
if (valid)
goto valid;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index e03910cebdd4..804d1f905622 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -441,7 +441,10 @@ enum afs_lock_state {
};
/*
- * AFS inode private data
+ * AFS inode private data.
+ *
+ * Note that afs_alloc_inode() *must* reset anything that could incorrectly
+ * leak from one inode to another.
*/
struct afs_vnode {
struct inode vfs_inode; /* the VFS's inode record */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index ea1460b9b71a..e1126659f043 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -885,7 +885,7 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count,
{
struct afs_net *net = call->net;
enum afs_call_state state;
- u32 remote_abort;
+ u32 remote_abort = 0;
int ret;
_enter("{%s,%zu},,%zu,%d",
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 2b00097101b3..b88b7d45fdaa 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -120,7 +120,7 @@ static void afs_hash_permits(struct afs_permits *permits)
void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
unsigned int cb_break)
{
- struct afs_permits *permits, *xpermits, *replacement, *new = NULL;
+ struct afs_permits *permits, *xpermits, *replacement, *zap, *new = NULL;
afs_access_t caller_access = READ_ONCE(vnode->status.caller_access);
size_t size = 0;
bool changed = false;
@@ -204,7 +204,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
new = kzalloc(sizeof(struct afs_permits) +
sizeof(struct afs_permit) * size, GFP_NOFS);
if (!new)
- return;
+ goto out_put;
refcount_set(&new->usage, 1);
new->nr_permits = size;
@@ -229,8 +229,6 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
afs_hash_permits(new);
- afs_put_permits(permits);
-
/* Now see if the permit list we want is actually already available */
spin_lock(&afs_permits_lock);
@@ -262,11 +260,15 @@ found:
kfree(new);
spin_lock(&vnode->lock);
- if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break) ||
- permits != rcu_access_pointer(vnode->permit_cache))
- goto someone_else_changed_it_unlock;
- rcu_assign_pointer(vnode->permit_cache, replacement);
+ zap = rcu_access_pointer(vnode->permit_cache);
+ if (cb_break == (vnode->cb_break + vnode->cb_interest->server->cb_s_break) &&
+ zap == permits)
+ rcu_assign_pointer(vnode->permit_cache, replacement);
+ else
+ zap = replacement;
spin_unlock(&vnode->lock);
+ afs_put_permits(zap);
+out_put:
afs_put_permits(permits);
return;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 875b5eb02242..1037dd41a622 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -496,10 +496,10 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
if (ret < 0)
goto error_sb;
as = NULL;
- sb->s_flags |= MS_ACTIVE;
+ sb->s_flags |= SB_ACTIVE;
} else {
_debug("reuse");
- ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
+ ASSERTCMP(sb->s_flags, &, SB_ACTIVE);
afs_destroy_sbi(as);
as = NULL;
}
@@ -536,7 +536,9 @@ static void afs_kill_super(struct super_block *sb)
}
/*
- * initialise an inode cache slab element prior to any use
+ * Initialise an inode cache slab element prior to any use. Note that
+ * afs_alloc_inode() *must* reset anything that could incorrectly leak from one
+ * inode to another.
*/
static void afs_i_init_once(void *_vnode)
{
@@ -568,11 +570,21 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
atomic_inc(&afs_count_active_inodes);
+ /* Reset anything that shouldn't leak from one inode to the next. */
memset(&vnode->fid, 0, sizeof(vnode->fid));
memset(&vnode->status, 0, sizeof(vnode->status));
vnode->volume = NULL;
+ vnode->lock_key = NULL;
+ vnode->permit_cache = NULL;
+ vnode->cb_interest = NULL;
+#ifdef CONFIG_AFS_FSCACHE
+ vnode->cache = NULL;
+#endif
+
vnode->flags = 1 << AFS_VNODE_UNSET;
+ vnode->cb_type = 0;
+ vnode->lock_state = AFS_VNODE_LOCK_NONE;
_leave(" = %p", &vnode->vfs_inode);
return &vnode->vfs_inode;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index cb5f8a3df577..9370e2feb999 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -198,7 +198,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
ret = afs_fill_page(vnode, key, pos + copied,
len - copied, page);
if (ret < 0)
- return ret;
+ goto out;
}
SetPageUptodate(page);
}
@@ -206,10 +206,12 @@ int afs_write_end(struct file *file, struct address_space *mapping,
set_page_dirty(page);
if (PageDirty(page))
_debug("dirtied");
+ ret = copied;
+
+out:
unlock_page(page);
put_page(page);
-
- return copied;
+ return ret;
}
/*
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index d79ced925861..82e8f6edfb48 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -281,8 +281,8 @@ static int autofs4_mount_wait(const struct path *path, bool rcu_walk)
pr_debug("waiting for mount name=%pd\n", path->dentry);
status = autofs4_wait(sbi, path, NFY_MOUNT);
pr_debug("mount wait done status=%d\n", status);
- ino->last_used = jiffies;
}
+ ino->last_used = jiffies;
return status;
}
@@ -321,21 +321,16 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
*/
if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
struct dentry *parent = dentry->d_parent;
+ struct autofs_info *ino;
struct dentry *new;
new = d_lookup(parent, &dentry->d_name);
if (!new)
return NULL;
- if (new == dentry)
- dput(new);
- else {
- struct autofs_info *ino;
-
- ino = autofs4_dentry_ino(new);
- ino->last_used = jiffies;
- dput(path->dentry);
- path->dentry = new;
- }
+ ino = autofs4_dentry_ino(new);
+ ino->last_used = jiffies;
+ dput(path->dentry);
+ path->dentry = new;
}
return path->dentry;
}
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 9908ecf7fce0..a0c57c37fa21 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -170,7 +170,6 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
mutex_unlock(&sbi->wq_mutex);
- if (autofs4_write(sbi, pipe, &pkt, pktsz))
switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) {
case 0:
break;
diff --git a/fs/befs/ChangeLog b/fs/befs/ChangeLog
index 75a461cfaca6..16f2dfe8c2f7 100644
--- a/fs/befs/ChangeLog
+++ b/fs/befs/ChangeLog
@@ -365,7 +365,7 @@ Version 0.4 (2001-10-28)
(fs/befs/super.c)
* Tell the kernel to only mount befs read-only.
- By setting the MS_RDONLY flag in befs_read_super().
+ By setting the SB_RDONLY flag in befs_read_super().
Not that it was possible to write before. But now the kernel won't even try.
(fs/befs/super.c)
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index a92355cc453b..ee236231cafa 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -841,7 +841,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
if (!sb_rdonly(sb)) {
befs_warning(sb,
"No write support. Marking filesystem read-only");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
/*
@@ -948,7 +948,7 @@ static int
befs_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- if (!(*flags & MS_RDONLY))
+ if (!(*flags & SB_RDONLY))
return -EINVAL;
return 0;
}
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 6fe881d5cb38..0c4373628eb4 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -19,4 +19,4 @@ btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
- tests/free-space-tree-tests.o
+ tests/free-space-tree-tests.o tests/extent-map-tests.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7d0dc100a09a..e4054e533f6d 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -216,7 +216,8 @@ static int prelim_ref_compare(struct prelim_ref *ref1,
return 0;
}
-void update_share_count(struct share_check *sc, int oldcount, int newcount)
+static void update_share_count(struct share_check *sc, int oldcount,
+ int newcount)
{
if ((!sc) || (oldcount == 0 && newcount < 1))
return;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b35ce16b3df3..07d049c0c20f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -33,7 +33,6 @@
#include <linux/bit_spinlock.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
-#include <linux/sort.h>
#include <linux/log2.h>
#include "ctree.h"
#include "disk-io.h"
@@ -45,6 +44,21 @@
#include "extent_io.h"
#include "extent_map.h"
+static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
+
+const char* btrfs_compress_type2str(enum btrfs_compression_type type)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB:
+ case BTRFS_COMPRESS_LZO:
+ case BTRFS_COMPRESS_ZSTD:
+ case BTRFS_COMPRESS_NONE:
+ return btrfs_compress_types[type];
+ }
+
+ return NULL;
+}
+
static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
@@ -295,7 +309,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
- unsigned long nr_pages)
+ unsigned long nr_pages,
+ unsigned int write_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct bio *bio = NULL;
@@ -327,7 +342,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
bdev = fs_info->fs_devices->latest_bdev;
bio = btrfs_bio_alloc(bdev, first_byte);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ bio->bi_opf = REQ_OP_WRITE | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
refcount_set(&cb->pending_bios, 1);
@@ -347,8 +362,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
page->mapping = NULL;
if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
PAGE_SIZE) {
- bio_get(bio);
-
/*
* inc the count before we submit the bio so
* we know the end IO handler won't happen before
@@ -371,10 +384,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_endio(bio);
}
- bio_put(bio);
-
bio = btrfs_bio_alloc(bdev, first_byte);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ bio->bi_opf = REQ_OP_WRITE | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
bio_add_page(bio, page, PAGE_SIZE, 0);
@@ -388,7 +399,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
first_byte += PAGE_SIZE;
cond_resched();
}
- bio_get(bio);
ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
@@ -404,13 +414,12 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_endio(bio);
}
- bio_put(bio);
return 0;
}
static u64 bio_end_offset(struct bio *bio)
{
- struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ struct bio_vec *last = bio_last_bvec_all(bio);
return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
}
@@ -562,7 +571,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
/* we need the actual starting offset of this extent in the file */
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree,
- page_offset(bio->bi_io_vec->bv_page),
+ page_offset(bio_first_page_all(bio)),
PAGE_SIZE);
read_unlock(&em_tree->lock);
if (!em)
@@ -637,8 +646,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
page->mapping = NULL;
if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
PAGE_SIZE) {
- bio_get(comp_bio);
-
ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
@@ -665,8 +672,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_endio(comp_bio);
}
- bio_put(comp_bio);
-
comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
bio_set_op_attrs(comp_bio, REQ_OP_READ, 0);
comp_bio->bi_private = cb;
@@ -676,7 +681,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
cur_disk_byte += PAGE_SIZE;
}
- bio_get(comp_bio);
ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
@@ -692,7 +696,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_endio(comp_bio);
}
- bio_put(comp_bio);
return 0;
fail2:
@@ -751,6 +754,8 @@ struct heuristic_ws {
u32 sample_size;
/* Buckets store counters for each byte value */
struct bucket_item *bucket;
+ /* Sorting buffer */
+ struct bucket_item *bucket_b;
struct list_head list;
};
@@ -762,6 +767,7 @@ static void free_heuristic_ws(struct list_head *ws)
kvfree(workspace->sample);
kfree(workspace->bucket);
+ kfree(workspace->bucket_b);
kfree(workspace);
}
@@ -781,6 +787,10 @@ static struct list_head *alloc_heuristic_ws(void)
if (!ws->bucket)
goto fail;
+ ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
+ if (!ws->bucket_b)
+ goto fail;
+
INIT_LIST_HEAD(&ws->list);
return &ws->list;
fail:
@@ -1277,13 +1287,103 @@ static u32 shannon_entropy(struct heuristic_ws *ws)
return entropy_sum * 100 / entropy_max;
}
-/* Compare buckets by size, ascending */
-static int bucket_comp_rev(const void *lv, const void *rv)
+#define RADIX_BASE 4U
+#define COUNTERS_SIZE (1U << RADIX_BASE)
+
+/*
+ * Return the 4-bit digit of @num that starts at bit @shift, mirrored
+ * (15 - digit) so that larger digits map to smaller bucket indexes, i.e.
+ * the digits come out in reverse order.
+ */
+static u8 get4bits(u64 num, int shift)
+{
+	const u64 digit = (num >> shift) % COUNTERS_SIZE;
+
+	return (COUNTERS_SIZE - 1) - digit;
+}
+
+/*
+ * Radix sort of bucket items by their count. Digits are taken through
+ * get4bits(), which reverses their order, so the result is descending.
+ *
+ * Use 4 bits as radix base
+ * Use 16 u32 counters for calculating new position in buf array
+ *
+ * Every loop iteration runs two counting passes (@array -> @array_buf, then
+ * @array_buf -> @array), so the sorted data always ends up in @array and the
+ * number of sorted bits has to be a multiple of 2 * RADIX_BASE.
+ *
+ * @array     - array that will be sorted
+ * @array_buf - buffer array to store sorting results
+ *              must be equal in size to @array
+ * @num       - array size
+ */
+static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
+		       int num)
{
- const struct bucket_item *l = (const struct bucket_item *)lv;
- const struct bucket_item *r = (const struct bucket_item *)rv;
+	u64 max_num;
+	u64 buf_num;
+	u32 counters[COUNTERS_SIZE];
+	u32 new_addr;
+	u32 addr;
+	int bitlen;
+	int shift;
+	int i;
- return r->count - l->count;
+	/*
+	 * Try to avoid useless loop iterations for small numbers stored in
+	 * big counters. Example: 48 33 4 ... in 64bit array
+	 */
+	max_num = array[0].count;
+	for (i = 1; i < num; i++) {
+		buf_num = array[i].count;
+		if (buf_num > max_num)
+			max_num = buf_num;
+	}
+
+	/* All counts are zero: nothing to order, and avoid ilog2(0) */
+	if (max_num == 0)
+		return;
+
+	/*
+	 * ilog2() yields the index of the highest set bit, so max_num needs
+	 * ilog2(max_num) + 1 bits. Without the +1, a maximum whose top bit
+	 * index is a multiple of 2 * RADIX_BASE (e.g. 256) would have that
+	 * bit left out of the sort.
+	 */
+	buf_num = ilog2(max_num) + 1;
+	bitlen = ALIGN(buf_num, RADIX_BASE * 2);
+
+	shift = 0;
+	while (shift < bitlen) {
+		/* First pass: @array -> @array_buf on the digit at @shift */
+		memset(counters, 0, sizeof(counters));
+
+		for (i = 0; i < num; i++) {
+			buf_num = array[i].count;
+			addr = get4bits(buf_num, shift);
+			counters[addr]++;
+		}
+
+		/* Turn per-digit counts into end positions (prefix sums) */
+		for (i = 1; i < COUNTERS_SIZE; i++)
+			counters[i] += counters[i - 1];
+
+		/* Walk backwards so equal digits keep their relative order */
+		for (i = num - 1; i >= 0; i--) {
+			buf_num = array[i].count;
+			addr = get4bits(buf_num, shift);
+			counters[addr]--;
+			new_addr = counters[addr];
+			array_buf[new_addr] = array[i];
+		}
+
+		shift += RADIX_BASE;
+
+		/*
+		 * Normal radix expects to move data from a temporary array, to
+		 * the main one. But that requires some CPU time. Avoid that
+		 * by doing another sort iteration to original array instead of
+		 * memcpy()
+		 */
+		memset(counters, 0, sizeof(counters));
+
+		for (i = 0; i < num; i++) {
+			buf_num = array_buf[i].count;
+			addr = get4bits(buf_num, shift);
+			counters[addr]++;
+		}
+
+		for (i = 1; i < COUNTERS_SIZE; i++)
+			counters[i] += counters[i - 1];
+
+		for (i = num - 1; i >= 0; i--) {
+			buf_num = array_buf[i].count;
+			addr = get4bits(buf_num, shift);
+			counters[addr]--;
+			new_addr = counters[addr];
+			array[new_addr] = array_buf[i];
+		}
+
+		shift += RADIX_BASE;
+	}
}
/*
@@ -1313,7 +1413,7 @@ static int byte_core_set_size(struct heuristic_ws *ws)
struct bucket_item *bucket = ws->bucket;
/* Sort in reverse order */
- sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL);
+ radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);
for (i = 0; i < BYTE_CORE_SET_LOW; i++)
coreset_sum += bucket[i].count;
@@ -1528,5 +1628,5 @@ unsigned int btrfs_compress_str2level(const char *str)
if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
return str[5] - '0';
- return 0;
+ return BTRFS_ZLIB_DEFAULT_LEVEL;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index da20755ebf21..677fa4aa0bd7 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -34,6 +34,8 @@
/* Maximum size of data before compression */
#define BTRFS_MAX_UNCOMPRESSED (SZ_128K)
+#define BTRFS_ZLIB_DEFAULT_LEVEL 3
+
struct compressed_bio {
/* number of bios pending for this compressed extent */
refcount_t pending_bios;
@@ -73,7 +75,7 @@ struct compressed_bio {
u32 sums;
};
-void btrfs_init_compress(void);
+void __init btrfs_init_compress(void);
void btrfs_exit_compress(void);
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
@@ -91,7 +93,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
- unsigned long nr_pages);
+ unsigned long nr_pages,
+ unsigned int write_flags);
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
@@ -134,6 +137,8 @@ extern const struct btrfs_compress_op btrfs_zlib_compress;
extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;
+const char* btrfs_compress_type2str(enum btrfs_compression_type type);
+
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 531e0a8645b0..b88a79e69ddf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1032,14 +1032,17 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
ret = btrfs_inc_ref(trans, root, buf, 1);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ return ret;
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_dec_ref(trans, root, buf, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ return ret;
ret = btrfs_inc_ref(trans, root, cow, 1);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ return ret;
}
new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
} else {
@@ -1049,7 +1052,8 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ return ret;
}
if (new_flags != 0) {
int level = btrfs_header_level(buf);
@@ -1068,9 +1072,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ return ret;
ret = btrfs_dec_ref(trans, root, buf, 1);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ return ret;
}
clean_tree_block(fs_info, buf);
*last_ref = 1;
@@ -1801,8 +1807,8 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
* simple bin_search frontend that does the right thing for
* leaves vs nodes
*/
-static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
- int level, int *slot)
+int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
+ int level, int *slot)
{
if (level == 0)
return generic_bin_search(eb,
@@ -1818,12 +1824,6 @@ static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
slot);
}
-int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
- int level, int *slot)
-{
- return bin_search(eb, key, level, slot);
-}
-
static void root_add_used(struct btrfs_root *root, u32 size)
{
spin_lock(&root->accounting_lock);
@@ -2608,7 +2608,7 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
int level, int *prev_cmp, int *slot)
{
if (*prev_cmp != 0) {
- *prev_cmp = bin_search(b, key, level, slot);
+ *prev_cmp = btrfs_bin_search(b, key, level, slot);
return *prev_cmp;
}
@@ -2654,17 +2654,29 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
}
/*
- * look for key in the tree. path is filled in with nodes along the way
- * if key is found, we return zero and you can find the item in the leaf
- * level of the path (level 0)
+ * btrfs_search_slot - look for a key in a tree and perform necessary
+ * modifications to preserve tree invariants.
*
- * If the key isn't found, the path points to the slot where it should
- * be inserted, and 1 is returned. If there are other errors during the
- * search a negative error number is returned.
+ * @trans: Handle of transaction, used when modifying the tree
+ * @p: Holds all btree nodes along the search path
+ * @root: The root node of the tree
+ * @key: The key we are looking for
+ * @ins_len: Indicates purpose of search, for inserts it is 1, for
+ * deletions it's -1. 0 for plain searches
+ * @cow: boolean should CoW operations be performed. Must always be 1
+ * when modifying the tree.
*
- * if ins_len > 0, nodes and leaves will be split as we walk down the
- * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
- * possible)
+ * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
+ * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
+ *
+ * If @key is found, 0 is returned and you can find the item in the leaf level
+ * of the path (level 0)
+ *
+ * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
+ * points to the slot where it should be inserted
+ *
+ * If an error is encountered while searching the tree a negative error number
+ * is returned
*/
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, struct btrfs_path *p,
@@ -2768,6 +2780,8 @@ again:
* contention with the cow code
*/
if (cow) {
+ bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
+
/*
* if we don't really need to cow this block
* then we don't want to set the path blocking,
@@ -2792,9 +2806,13 @@ again:
}
btrfs_set_path_blocking(p);
- err = btrfs_cow_block(trans, root, b,
- p->nodes[level + 1],
- p->slots[level + 1], &b);
+ if (last_level)
+ err = btrfs_cow_block(trans, root, b, NULL, 0,
+ &b);
+ else
+ err = btrfs_cow_block(trans, root, b,
+ p->nodes[level + 1],
+ p->slots[level + 1], &b);
if (err) {
ret = err;
goto done;
@@ -5169,7 +5187,7 @@ again:
while (1) {
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
- sret = bin_search(cur, min_key, level, &slot);
+ sret = btrfs_bin_search(cur, min_key, level, &slot);
/* at the lowest level, we're done, setup the path and exit */
if (level == path->lowest_level) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f7df5536ab61..1a462ab85c49 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -679,7 +679,6 @@ enum btrfs_orphan_cleanup_state {
/* used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
struct list_head hash_list;
- wait_queue_head_t wait;
spinlock_t lock;
};
@@ -2957,7 +2956,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
*/
static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
{
- return fs_info->sb->s_flags & MS_RDONLY || btrfs_fs_closing(fs_info);
+ return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info);
}
static inline void free_fs_info(struct btrfs_fs_info *fs_info)
@@ -3060,15 +3059,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 dir,
const char *name, u16 name_len,
int mod);
-int verify_dir_item(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf, int slot,
- struct btrfs_dir_item *dir_item);
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const char *name,
int name_len);
-bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot,
- unsigned long start, u16 name_len);
/* orphan.c */
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3180,6 +3174,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
int nr);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+ unsigned int extra_bits,
struct extent_state **cached_state, int dedupe);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
@@ -3196,7 +3191,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
-int btrfs_init_cachep(void);
+int __init btrfs_init_cachep(void);
void btrfs_destroy_cachep(void);
long btrfs_ioctl_trans_end(struct file *file);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
@@ -3247,7 +3242,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
struct file *dst_file, u64 dst_loff);
/* file.c */
-int btrfs_auto_defrag_init(void);
+int __init btrfs_auto_defrag_init(void);
void btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
@@ -3282,7 +3277,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
/* sysfs.c */
-int btrfs_init_sysfs(void);
+int __init btrfs_init_sysfs(void);
void btrfs_exit_sysfs(void);
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 5d73f79ded8b..0530f6f2e4ba 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -18,6 +18,7 @@
*/
#include <linux/slab.h>
+#include <linux/iversion.h>
#include "delayed-inode.h"
#include "disk-io.h"
#include "transaction.h"
@@ -87,6 +88,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
spin_lock(&root->inode_lock);
node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+
if (node) {
if (btrfs_inode->delayed_node) {
refcount_inc(&node->refs); /* can be accessed */
@@ -94,9 +96,30 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
spin_unlock(&root->inode_lock);
return node;
}
- btrfs_inode->delayed_node = node;
- /* can be accessed and cached in the inode */
- refcount_add(2, &node->refs);
+
+ /*
+ * It's possible that we're racing into the middle of removing
+ * this node from the radix tree. In this case, the refcount
+ * was zero and it should never go back to one. Just return
+ * NULL like it was never in the radix at all; our release
+ * function is in the process of removing it.
+ *
+ * Some implementations of refcount_inc refuse to bump the
+ * refcount once it has hit zero. If we don't do this dance
+ * here, refcount_inc() may decide to just WARN_ONCE() instead
+ * of actually bumping the refcount.
+ *
+ * If this node is properly in the radix, we want to bump the
+ * refcount twice, once for the inode and once for this get
+ * operation.
+ */
+ if (refcount_inc_not_zero(&node->refs)) {
+ refcount_inc(&node->refs);
+ btrfs_inode->delayed_node = node;
+ } else {
+ node = NULL;
+ }
+
spin_unlock(&root->inode_lock);
return node;
}
@@ -254,17 +277,18 @@ static void __btrfs_release_delayed_node(
mutex_unlock(&delayed_node->mutex);
if (refcount_dec_and_test(&delayed_node->refs)) {
- bool free = false;
struct btrfs_root *root = delayed_node->root;
+
spin_lock(&root->inode_lock);
- if (refcount_read(&delayed_node->refs) == 0) {
- radix_tree_delete(&root->delayed_nodes_tree,
- delayed_node->inode_id);
- free = true;
- }
+ /*
+ * Once our refcount goes to zero, nobody is allowed to bump it
+ * back up. We can delete it now.
+ */
+ ASSERT(refcount_read(&delayed_node->refs) == 0);
+ radix_tree_delete(&root->delayed_nodes_tree,
+ delayed_node->inode_id);
spin_unlock(&root->inode_lock);
- if (free)
- kmem_cache_free(delayed_node_cache, delayed_node);
+ kmem_cache_free(delayed_node_cache, delayed_node);
}
}
@@ -1279,40 +1303,42 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
if (!path)
goto out;
-again:
- if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2)
- goto free_path;
+ do {
+ if (atomic_read(&delayed_root->items) <
+ BTRFS_DELAYED_BACKGROUND / 2)
+ break;
- delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
- if (!delayed_node)
- goto free_path;
+ delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+ if (!delayed_node)
+ break;
- path->leave_spinning = 1;
- root = delayed_node->root;
+ path->leave_spinning = 1;
+ root = delayed_node->root;
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- goto release_path;
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_release_path(path);
+ btrfs_release_prepared_delayed_node(delayed_node);
+ total_done++;
+ continue;
+ }
- block_rsv = trans->block_rsv;
- trans->block_rsv = &root->fs_info->delayed_block_rsv;
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &root->fs_info->delayed_block_rsv;
- __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
+ __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
- trans->block_rsv = block_rsv;
- btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty_nodelay(root->fs_info);
+ trans->block_rsv = block_rsv;
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty_nodelay(root->fs_info);
-release_path:
- btrfs_release_path(path);
- total_done++;
+ btrfs_release_path(path);
+ btrfs_release_prepared_delayed_node(delayed_node);
+ total_done++;
- btrfs_release_prepared_delayed_node(delayed_node);
- if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) ||
- total_done < async_work->nr)
- goto again;
+ } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
+ || total_done < async_work->nr);
-free_path:
btrfs_free_path(path);
out:
wake_up(&delayed_root->wait);
@@ -1325,10 +1351,6 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
{
struct btrfs_async_delayed_work *async_work;
- if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND ||
- btrfs_workqueue_normal_congested(fs_info->delayed_workers))
- return 0;
-
async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
if (!async_work)
return -ENOMEM;
@@ -1364,7 +1386,8 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
{
struct btrfs_delayed_root *delayed_root = fs_info->delayed_root;
- if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+ if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) ||
+ btrfs_workqueue_normal_congested(fs_info->delayed_workers))
return;
if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
@@ -1610,28 +1633,18 @@ void btrfs_readdir_put_delayed_items(struct inode *inode,
int btrfs_should_delete_dir_index(struct list_head *del_list,
u64 index)
{
- struct btrfs_delayed_item *curr, *next;
- int ret;
-
- if (list_empty(del_list))
- return 0;
+ struct btrfs_delayed_item *curr;
+ int ret = 0;
- list_for_each_entry_safe(curr, next, del_list, readdir_list) {
+ list_for_each_entry(curr, del_list, readdir_list) {
if (curr->key.offset > index)
break;
-
- list_del(&curr->readdir_list);
- ret = (curr->key.offset == index);
-
- if (refcount_dec_and_test(&curr->refs))
- kfree(curr);
-
- if (ret)
- return 1;
- else
- continue;
+ if (curr->key.offset == index) {
+ ret = 1;
+ break;
+ }
}
- return 0;
+ return ret;
}
/*
@@ -1700,7 +1713,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
btrfs_set_stack_inode_generation(inode_item,
BTRFS_I(inode)->generation);
- btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
+ btrfs_set_stack_inode_sequence(inode_item,
+ inode_peek_iversion(inode));
btrfs_set_stack_inode_transid(inode_item, trans->transid);
btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
@@ -1754,7 +1768,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
- inode->i_version = btrfs_stack_inode_sequence(inode_item);
+ inode_set_iversion_queried(inode,
+ btrfs_stack_inode_sequence(inode_item));
inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 83be8f9fd906..a1a40cf382e3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -937,7 +937,7 @@ void btrfs_delayed_ref_exit(void)
kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}
-int btrfs_delayed_ref_init(void)
+int __init btrfs_delayed_ref_init(void)
{
btrfs_delayed_ref_head_cachep = kmem_cache_create(
"btrfs_delayed_ref_head",
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index a43af432f859..c4f625e5a691 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -203,7 +203,7 @@ extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
-int btrfs_delayed_ref_init(void);
+int __init btrfs_delayed_ref_init(void);
void btrfs_delayed_ref_exit(void);
static inline struct btrfs_delayed_extent_op *
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7c655f9a7a50..7efbc4d1128b 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -172,7 +172,8 @@ no_valid_dev_replace_entry_found:
dev_replace->tgtdev->commit_bytes_used =
dev_replace->srcdev->commit_bytes_used;
}
- dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
+ set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+ &dev_replace->tgtdev->dev_state);
btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
dev_replace->tgtdev);
}
@@ -304,6 +305,14 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
dev_replace->cursor_left_last_write_of_item;
}
+/*
+ * Return a printable name for @device, for use in log messages.
+ *
+ * A NULL @device, or one with BTRFS_DEV_STATE_MISSING set, is reported as
+ * "<missing disk>". The NULL case matters because this helper is also called
+ * with dev_replace->tgtdev, which the open-coded message it replaces treated
+ * as possibly NULL.
+ *
+ * NOTE(review): the non-missing path returns rcu_str_deref(device->name);
+ * presumably callers must be inside an RCU read-side section, as the
+ * *_in_rcu() message helpers used at the call sites suggest - confirm.
+ */
+static char* btrfs_dev_name(struct btrfs_device *device)
+{
+	if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+		return "<missing disk>";
+	else
+		return rcu_str_deref(device->name);
+}
+
int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
@@ -363,8 +372,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
btrfs_info_in_rcu(fs_info,
"dev_replace from %s (devid %llu) to %s started",
- src_device->missing ? "<missing disk>" :
- rcu_str_deref(src_device->name),
+ btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name));
@@ -538,8 +546,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
} else {
btrfs_err_in_rcu(fs_info,
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
- src_device->missing ? "<missing disk>" :
- rcu_str_deref(src_device->name),
+ btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
btrfs_dev_replace_unlock(dev_replace, 1);
@@ -557,11 +564,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_info_in_rcu(fs_info,
"dev_replace from %s (devid %llu) to %s finished",
- src_device->missing ? "<missing disk>" :
- rcu_str_deref(src_device->name),
+ btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name));
- tgt_device->is_tgtdev_for_dev_replace = 0;
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
tgt_device->devid = src_device->devid;
src_device->devid = BTRFS_DEV_REPLACE_DEVID;
memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
@@ -814,12 +820,10 @@ static int btrfs_dev_replace_kthread(void *data)
progress = btrfs_dev_replace_progress(fs_info);
progress = div_u64(progress, 10);
btrfs_info_in_rcu(fs_info,
- "continuing dev_replace from %s (devid %llu) to %s @%u%%",
- dev_replace->srcdev->missing ? "<missing disk>"
- : rcu_str_deref(dev_replace->srcdev->name),
+ "continuing dev_replace from %s (devid %llu) to target %s @%u%%",
+ btrfs_dev_name(dev_replace->srcdev),
dev_replace->srcdev->devid,
- dev_replace->tgtdev ? rcu_str_deref(dev_replace->tgtdev->name)
- : "<missing target disk>",
+ btrfs_dev_name(dev_replace->tgtdev),
(unsigned int)progress);
btrfs_dev_replace_continue_on_mount(fs_info);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 41cb9196eaa8..cbe421605cd5 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -403,8 +403,6 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
btrfs_dir_data_len(leaf, dir_item);
name_ptr = (unsigned long)(dir_item + 1);
- if (verify_dir_item(fs_info, leaf, path->slots[0], dir_item))
- return NULL;
if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
return dir_item;
@@ -450,109 +448,3 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
}
return ret;
}
-
-int verify_dir_item(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf,
- int slot,
- struct btrfs_dir_item *dir_item)
-{
- u16 namelen = BTRFS_NAME_LEN;
- int ret;
- u8 type = btrfs_dir_type(leaf, dir_item);
-
- if (type >= BTRFS_FT_MAX) {
- btrfs_crit(fs_info, "invalid dir item type: %d", (int)type);
- return 1;
- }
-
- if (type == BTRFS_FT_XATTR)
- namelen = XATTR_NAME_MAX;
-
- if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
- btrfs_crit(fs_info, "invalid dir item name len: %u",
- (unsigned)btrfs_dir_name_len(leaf, dir_item));
- return 1;
- }
-
- namelen = btrfs_dir_name_len(leaf, dir_item);
- ret = btrfs_is_name_len_valid(leaf, slot,
- (unsigned long)(dir_item + 1), namelen);
- if (!ret)
- return 1;
-
- /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
- if ((btrfs_dir_data_len(leaf, dir_item) +
- btrfs_dir_name_len(leaf, dir_item)) >
- BTRFS_MAX_XATTR_SIZE(fs_info)) {
- btrfs_crit(fs_info, "invalid dir item name + data len: %u + %u",
- (unsigned)btrfs_dir_name_len(leaf, dir_item),
- (unsigned)btrfs_dir_data_len(leaf, dir_item));
- return 1;
- }
-
- return 0;
-}
-
-bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot,
- unsigned long start, u16 name_len)
-{
- struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct btrfs_key key;
- u32 read_start;
- u32 read_end;
- u32 item_start;
- u32 item_end;
- u32 size;
- bool ret = true;
-
- ASSERT(start > BTRFS_LEAF_DATA_OFFSET);
-
- read_start = start - BTRFS_LEAF_DATA_OFFSET;
- read_end = read_start + name_len;
- item_start = btrfs_item_offset_nr(leaf, slot);
- item_end = btrfs_item_end_nr(leaf, slot);
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
-
- switch (key.type) {
- case BTRFS_DIR_ITEM_KEY:
- case BTRFS_XATTR_ITEM_KEY:
- case BTRFS_DIR_INDEX_KEY:
- size = sizeof(struct btrfs_dir_item);
- break;
- case BTRFS_INODE_REF_KEY:
- size = sizeof(struct btrfs_inode_ref);
- break;
- case BTRFS_INODE_EXTREF_KEY:
- size = sizeof(struct btrfs_inode_extref);
- break;
- case BTRFS_ROOT_REF_KEY:
- case BTRFS_ROOT_BACKREF_KEY:
- size = sizeof(struct btrfs_root_ref);
- break;
- default:
- ret = false;
- goto out;
- }
-
- if (read_start < item_start) {
- ret = false;
- goto out;
- }
- if (read_end > item_end) {
- ret = false;
- goto out;
- }
-
- /* there shall be item(s) before name */
- if (read_start - item_start < size) {
- ret = false;
- goto out;
- }
-
-out:
- if (!ret)
- btrfs_crit(fs_info, "invalid dir item name len: %u",
- (unsigned int)name_len);
- return ret;
-}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index efce9a2fa9be..ed095202942f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -61,7 +61,8 @@
BTRFS_HEADER_FLAG_RELOC |\
BTRFS_SUPER_FLAG_ERROR |\
BTRFS_SUPER_FLAG_SEEDING |\
- BTRFS_SUPER_FLAG_METADUMP)
+ BTRFS_SUPER_FLAG_METADUMP |\
+ BTRFS_SUPER_FLAG_METADUMP_V2)
static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
@@ -220,7 +221,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
* extents on the btree inode are pretty simple, there's one extent
* that covers the entire device
*/
-static struct extent_map *btree_get_extent(struct btrfs_inode *inode,
+struct extent_map *btree_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset, u64 start, u64 len,
int create)
{
@@ -285,7 +286,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
int verify)
{
u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- char *result = NULL;
+ char result[BTRFS_CSUM_SIZE];
unsigned long len;
unsigned long cur_len;
unsigned long offset = BTRFS_CSUM_SIZE;
@@ -294,7 +295,6 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
unsigned long map_len;
int err;
u32 crc = ~(u32)0;
- unsigned long inline_result;
len = buf->len - offset;
while (len > 0) {
@@ -308,13 +308,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
len -= cur_len;
offset += cur_len;
}
- if (csum_size > sizeof(inline_result)) {
- result = kzalloc(csum_size, GFP_NOFS);
- if (!result)
- return -ENOMEM;
- } else {
- result = (char *)&inline_result;
- }
+ memset(result, 0, BTRFS_CSUM_SIZE);
btrfs_csum_final(crc, result);
@@ -329,15 +323,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
"%s checksum verify failed on %llu wanted %X found %X level %d",
fs_info->sb->s_id, buf->start,
val, found, btrfs_header_level(buf));
- if (result != (char *)&inline_result)
- kfree(result);
return -EUCLEAN;
}
} else {
write_extent_buffer(buf, result, 0, csum_size);
}
- if (result != (char *)&inline_result)
- kfree(result);
+
return 0;
}
@@ -391,7 +382,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
clear_extent_buffer_uptodate(eb);
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state, GFP_NOFS);
+ &cached_state);
if (need_lock)
btrfs_tree_read_unlock_blocking(eb);
return ret;
@@ -455,7 +446,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
while (1) {
ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
- btree_get_extent, mirror_num);
+ mirror_num);
if (!ret) {
if (!verify_parent_transid(io_tree, eb,
parent_transid, 0))
@@ -610,7 +601,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
* that we don't try and read the other copies of this block, just
* return -EIO.
*/
- if (found_level == 0 && btrfs_check_leaf(root, eb)) {
+ if (found_level == 0 && btrfs_check_leaf_full(root, eb)) {
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
}
@@ -1012,7 +1003,7 @@ void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
if (IS_ERR(buf))
return;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
- buf, WAIT_NONE, btree_get_extent, 0);
+ buf, WAIT_NONE, 0);
free_extent_buffer(buf);
}
@@ -1031,7 +1022,7 @@ int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
- btree_get_extent, mirror_num);
+ mirror_num);
if (ret) {
free_extent_buffer(buf);
return ret;
@@ -1243,7 +1234,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root;
struct btrfs_key key;
int ret = 0;
- uuid_le uuid;
+ uuid_le uuid = NULL_UUID_LE;
root = btrfs_alloc_root(fs_info, GFP_KERNEL);
if (!root)
@@ -1284,7 +1275,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_set_root_used(&root->root_item, leaf->len);
btrfs_set_root_last_snapshot(&root->root_item, 0);
btrfs_set_root_dirid(&root->root_item, 0);
- uuid_le_gen(&uuid);
+ if (is_fstree(objectid))
+ uuid_le_gen(&uuid);
memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
root->root_item.drop_level = 0;
@@ -2875,7 +2867,7 @@ retry_root_backup:
goto fail_sysfs;
}
- if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info)) {
+ if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"writeable mount is not allowed due to too many missing devices");
goto fail_sysfs;
@@ -3231,6 +3223,7 @@ static int write_dev_supers(struct btrfs_device *device,
int errors = 0;
u32 crc;
u64 bytenr;
+ int op_flags;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
@@ -3273,13 +3266,10 @@ static int write_dev_supers(struct btrfs_device *device,
* we fua the first super. The others we allow
* to go down lazy.
*/
- if (i == 0) {
- ret = btrfsic_submit_bh(REQ_OP_WRITE,
- REQ_SYNC | REQ_FUA | REQ_META | REQ_PRIO, bh);
- } else {
- ret = btrfsic_submit_bh(REQ_OP_WRITE,
- REQ_SYNC | REQ_META | REQ_PRIO, bh);
- }
+ op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
+ if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
+ op_flags |= REQ_FUA;
+ ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
if (ret)
errors++;
}
@@ -3359,7 +3349,7 @@ static void write_dev_flush(struct btrfs_device *device)
bio->bi_private = &device->flush_wait;
btrfsic_submit_bio(bio);
- device->flush_bio_sent = 1;
+ set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}
/*
@@ -3369,10 +3359,10 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device)
{
struct bio *bio = device->flush_bio;
- if (!device->flush_bio_sent)
+ if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
return BLK_STS_OK;
- device->flush_bio_sent = 0;
+ clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
wait_for_completion_io(&device->flush_wait);
return bio->bi_status;
@@ -3380,7 +3370,7 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device)
static int check_barrier_error(struct btrfs_fs_info *fs_info)
{
- if (!btrfs_check_rw_degradable(fs_info))
+ if (!btrfs_check_rw_degradable(fs_info, NULL))
return -EIO;
return 0;
}
@@ -3396,14 +3386,16 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
int errors_wait = 0;
blk_status_t ret;
+ lockdep_assert_held(&info->fs_devices->device_list_mutex);
/* send down all the barriers */
head = &info->fs_devices->devices;
- list_for_each_entry_rcu(dev, head, dev_list) {
- if (dev->missing)
+ list_for_each_entry(dev, head, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
continue;
if (!dev->bdev)
continue;
- if (!dev->in_fs_metadata || !dev->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
continue;
write_dev_flush(dev);
@@ -3411,14 +3403,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
}
/* wait for all the barriers */
- list_for_each_entry_rcu(dev, head, dev_list) {
- if (dev->missing)
+ list_for_each_entry(dev, head, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
continue;
if (!dev->bdev) {
errors_wait++;
continue;
}
- if (!dev->in_fs_metadata || !dev->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
continue;
ret = wait_dev_flush(dev);
@@ -3510,12 +3503,13 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
}
}
- list_for_each_entry_rcu(dev, head, dev_list) {
+ list_for_each_entry(dev, head, dev_list) {
if (!dev->bdev) {
total_errors++;
continue;
}
- if (!dev->in_fs_metadata || !dev->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
continue;
btrfs_set_stack_device_generation(dev_item, 0);
@@ -3551,10 +3545,11 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
}
total_errors = 0;
- list_for_each_entry_rcu(dev, head, dev_list) {
+ list_for_each_entry(dev, head, dev_list) {
if (!dev->bdev)
continue;
- if (!dev->in_fs_metadata || !dev->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
continue;
ret = wait_dev_supers(dev, max_mirrors);
@@ -3848,7 +3843,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
buf->len,
fs_info->dirty_metadata_batch);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(root, buf)) {
+ /*
+ * btrfs_mark_buffer_dirty() can be called with the item pointers set
+ * but the item data not yet updated, so only check the item pointers
+ * here, not the item data.
+ */
+ if (btrfs_header_level(buf) == 0 &&
+ btrfs_check_leaf_relaxed(root, buf)) {
btrfs_print_leaf(buf);
ASSERT(0);
}
@@ -3906,9 +3907,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "no valid FS found");
ret = -EINVAL;
}
- if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)
- btrfs_warn(fs_info, "unrecognized super flag: %llu",
+ if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
+ btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+ ret = -EINVAL;
+ }
if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
btrfs_err(fs_info, "tree_root level too big: %d >= %d",
btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 7f7c35d6347a..301151a50ac1 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -149,6 +149,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
+struct extent_map *btree_get_extent(struct btrfs_inode *inode,
+ struct page *page, size_t pg_offset, u64 start, u64 len,
+ int create);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int __init btrfs_end_io_wq_init(void);
void btrfs_end_io_wq_exit(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 3aeb5770f896..ddaccad469f8 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -283,11 +283,6 @@ static int btrfs_get_name(struct dentry *parent, char *name,
name_len = btrfs_inode_ref_name_len(leaf, iref);
}
- ret = btrfs_is_name_len_valid(leaf, path->slots[0], name_ptr, name_len);
- if (!ret) {
- btrfs_free_path(path);
- return -EIO;
- }
read_extent_buffer(leaf, name, name_ptr, name_len);
btrfs_free_path(path);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7208ecef7088..05751a677da4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2145,7 +2145,10 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes;
- if (!stripe->dev->can_discard)
+ struct request_queue *req_q;
+
+ req_q = bdev_get_queue(stripe->dev->bdev);
+ if (!blk_queue_discard(req_q))
continue;
ret = btrfs_issue_discard(stripe->dev->bdev,
@@ -2894,7 +2897,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *global_rsv;
u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
- u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
+ unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
u64 num_bytes, num_dirty_bgs_bytes;
int ret = 0;
@@ -3502,13 +3505,6 @@ again:
goto again;
}
- /* We've already setup this transaction, go ahead and exit */
- if (block_group->cache_generation == trans->transid &&
- i_size_read(inode)) {
- dcs = BTRFS_DC_SETUP;
- goto out_put;
- }
-
/*
* We want to set the generation to 0, that way if anything goes wrong
* from here on out we know not to trust this cache when we load up next
@@ -3532,6 +3528,13 @@ again:
}
WARN_ON(ret);
+ /* We've already setup this transaction, go ahead and exit */
+ if (block_group->cache_generation == trans->transid &&
+ i_size_read(inode)) {
+ dcs = BTRFS_DC_SETUP;
+ goto out_put;
+ }
+
if (i_size_read(inode) > 0) {
ret = btrfs_check_trunc_cache_free_space(fs_info,
&fs_info->global_block_rsv);
@@ -4945,12 +4948,12 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
bytes = 0;
else
bytes -= delayed_rsv->size;
+ spin_unlock(&delayed_rsv->lock);
+
if (percpu_counter_compare(&space_info->total_bytes_pinned,
bytes) < 0) {
- spin_unlock(&delayed_rsv->lock);
return -ENOSPC;
}
- spin_unlock(&delayed_rsv->lock);
commit:
trans = btrfs_join_transaction(fs_info->extent_root);
@@ -5738,8 +5741,8 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
* or return if we already have enough space. This will also handle the resreve
* tracepoint for the reserved amount.
*/
-int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
- enum btrfs_reserve_flush_enum flush)
+static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
+ enum btrfs_reserve_flush_enum flush)
{
struct btrfs_root *root = inode->root;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
@@ -5770,7 +5773,7 @@ int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
* This is the same as btrfs_block_rsv_release, except that it handles the
* tracepoint for the reservation.
*/
-void btrfs_inode_rsv_release(struct btrfs_inode *inode)
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
@@ -9206,6 +9209,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
ret = btrfs_del_root(trans, fs_info, &root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
+ err = ret;
goto out_end_trans;
}
@@ -9689,7 +9693,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
* space to fit our block group in.
*/
if (device->total_bytes > device->bytes_used + min_free &&
- !device->is_tgtdev_for_dev_replace) {
+ !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = find_free_dev_extent(trans, device, min_free,
&dev_offset, NULL);
if (!ret)
@@ -10874,7 +10878,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
*trimmed = 0;
/* Not writeable = nothing to do. */
- if (!device->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return 0;
/* No free space = nothing to do. */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 16045ea86fc1..dfeb74a0be77 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -21,6 +21,7 @@
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
+#include "disk-io.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -109,8 +110,6 @@ struct tree_entry {
struct extent_page_data {
struct bio *bio;
struct extent_io_tree *tree;
- get_extent_t *get_extent;
-
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
*/
@@ -139,7 +138,8 @@ static void add_extent_changeset(struct extent_state *state, unsigned bits,
BUG_ON(ret < 0);
}
-static noinline void flush_write_bio(void *data);
+static void flush_write_bio(struct extent_page_data *epd);
+
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
@@ -581,7 +581,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
*
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
-static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
struct extent_state **cached_state,
gfp_t mask, struct extent_changeset *changeset)
@@ -1295,10 +1295,10 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
- struct extent_state **cached, gfp_t mask)
+ struct extent_state **cached)
{
return __clear_extent_bit(tree, start, end, bits, wake, delete,
- cached, mask, NULL);
+ cached, GFP_NOFS, NULL);
}
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1348,7 +1348,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+ EXTENT_LOCKED, 1, 0, NULL);
return 0;
}
return 1;
@@ -1648,7 +1648,7 @@ again:
EXTENT_DELALLOC, 1, cached_state);
if (!ret) {
unlock_extent_cached(tree, delalloc_start, delalloc_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
@@ -1744,7 +1744,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
unsigned long page_ops)
{
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
- NULL, GFP_NOFS);
+ NULL);
__process_pages_contig(inode->i_mapping, locked_page,
start >> PAGE_SHIFT, end >> PAGE_SHIFT,
@@ -1984,7 +1984,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
struct btrfs_bio *bbio = NULL;
int ret;
- ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
+ ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num);
bio = btrfs_io_bio_alloc(1);
@@ -2027,7 +2027,8 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
bio->bi_iter.bi_sector = sector;
dev = bbio->stripes[bbio->mirror_num - 1].dev;
btrfs_put_bbio(bbio);
- if (!dev || !dev->bdev || !dev->writeable) {
+ if (!dev || !dev->bdev ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
@@ -2257,7 +2258,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
return 0;
}
-bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
struct io_failure_record *failrec, int failed_mirror)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2281,7 +2282,7 @@ bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
* a) deliver good data to the caller
* b) correct the bad sectors on disk
*/
- if (failed_bio->bi_vcnt > 1) {
+ if (failed_bio_pages > 1) {
/*
* to fulfill b), we need to know the exact failing sectors, as
* we don't want to rewrite any more than the failed ones. thus,
@@ -2374,6 +2375,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
int read_mode = 0;
blk_status_t status;
int ret;
+ unsigned failed_bio_pages = bio_pages_all(failed_bio);
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2381,13 +2383,13 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
if (ret)
return ret;
- if (!btrfs_check_repairable(inode, failed_bio, failrec,
+ if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
failed_mirror)) {
free_io_failure(failure_tree, tree, failrec);
return -EIO;
}
- if (failed_bio->bi_vcnt > 1)
+ if (failed_bio_pages > 1)
read_mode |= REQ_FAILFAST_DEV;
phy_offset >>= inode->i_sb->s_blocksize_bits;
@@ -2492,7 +2494,7 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
if (uptodate && tree->track_uptodate)
set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
- unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
+ unlock_extent_cached_atomic(tree, start, end, &cached);
}
/*
@@ -2724,7 +2726,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
unsigned long bio_flags)
{
blk_status_t ret = 0;
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
struct page *page = bvec->bv_page;
struct extent_io_tree *tree = bio->bi_private;
u64 start;
@@ -2732,7 +2734,6 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
start = page_offset(page) + bvec->bv_offset;
bio->bi_private = NULL;
- bio_get(bio);
if (tree->ops)
ret = tree->ops->submit_bio_hook(tree->private_data, bio,
@@ -2740,7 +2741,6 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
else
btrfsic_submit_bio(bio);
- bio_put(bio);
return blk_status_to_errno(ret);
}
@@ -2942,8 +2942,7 @@ static int __do_readpage(struct extent_io_tree *tree,
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
- cur + iosize - 1,
- &cached, GFP_NOFS);
+ cur + iosize - 1, &cached);
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
@@ -3036,8 +3035,7 @@ static int __do_readpage(struct extent_io_tree *tree,
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
- cur + iosize - 1,
- &cached, GFP_NOFS);
+ cur + iosize - 1, &cached);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3092,9 +3090,8 @@ out:
static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
struct page *pages[], int nr_pages,
u64 start, u64 end,
- get_extent_t *get_extent,
struct extent_map **em_cached,
- struct bio **bio, int mirror_num,
+ struct bio **bio,
unsigned long *bio_flags,
u64 *prev_em_start)
{
@@ -3115,18 +3112,17 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
}
for (index = 0; index < nr_pages; index++) {
- __do_readpage(tree, pages[index], get_extent, em_cached, bio,
- mirror_num, bio_flags, 0, prev_em_start);
+ __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
+ bio, 0, bio_flags, 0, prev_em_start);
put_page(pages[index]);
}
}
static void __extent_readpages(struct extent_io_tree *tree,
struct page *pages[],
- int nr_pages, get_extent_t *get_extent,
+ int nr_pages,
struct extent_map **em_cached,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags,
+ struct bio **bio, unsigned long *bio_flags,
u64 *prev_em_start)
{
u64 start = 0;
@@ -3146,8 +3142,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
} else {
__do_contiguous_readpages(tree, &pages[first_index],
index - first_index, start,
- end, get_extent, em_cached,
- bio, mirror_num, bio_flags,
+ end, em_cached,
+ bio, bio_flags,
prev_em_start);
start = page_start;
end = start + PAGE_SIZE - 1;
@@ -3158,9 +3154,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
if (end)
__do_contiguous_readpages(tree, &pages[first_index],
index - first_index, start,
- end, get_extent, em_cached, bio,
- mirror_num, bio_flags,
- prev_em_start);
+ end, em_cached, bio,
+ bio_flags, prev_em_start);
}
static int __extent_read_full_page(struct extent_io_tree *tree,
@@ -3253,7 +3248,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
delalloc_start,
delalloc_end,
&page_started,
- nr_written);
+ nr_written, wbc);
/* File system has been set read-only */
if (ret) {
SetPageError(page);
@@ -3375,7 +3370,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page_end, NULL, 1);
break;
}
- em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur,
+ em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
end - cur + 1, 1);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
@@ -3458,10 +3453,9 @@ done:
* and the end_io handler clears the writeback ranges
*/
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+ struct extent_page_data *epd)
{
struct inode *inode = page->mapping->host;
- struct extent_page_data *epd = data;
u64 start = page_offset(page);
u64 page_end = start + PAGE_SIZE - 1;
int ret;
@@ -3895,8 +3889,7 @@ retry:
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @writepage: function called for each page
- * @data: data passed to writepage function
+ * @epd: extent_page_data passed on to __extent_writepage()
*
* If a page is already under I/O, write_cache_pages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
@@ -3908,8 +3901,7 @@ retry:
*/
static int extent_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc,
- writepage_t writepage, void *data,
- void (*flush_fn)(void *))
+ struct extent_page_data *epd)
{
struct inode *inode = mapping->host;
int ret = 0;
@@ -3973,7 +3965,7 @@ retry:
* mapping
*/
if (!trylock_page(page)) {
- flush_fn(data);
+ flush_write_bio(epd);
lock_page(page);
}
@@ -3984,7 +3976,7 @@ retry:
if (wbc->sync_mode != WB_SYNC_NONE) {
if (PageWriteback(page))
- flush_fn(data);
+ flush_write_bio(epd);
wait_on_page_writeback(page);
}
@@ -3994,7 +3986,7 @@ retry:
continue;
}
- ret = (*writepage)(page, wbc, data);
+ ret = __extent_writepage(page, wbc, epd);
if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
unlock_page(page);
@@ -4042,7 +4034,7 @@ retry:
return ret;
}
-static void flush_epd_write_bio(struct extent_page_data *epd)
+static void flush_write_bio(struct extent_page_data *epd)
{
if (epd->bio) {
int ret;
@@ -4053,37 +4045,28 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
}
}
-static noinline void flush_write_bio(void *data)
-{
- struct extent_page_data *epd = data;
- flush_epd_write_bio(epd);
-}
-
-int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent,
- struct writeback_control *wbc)
+int extent_write_full_page(struct page *page, struct writeback_control *wbc)
{
int ret;
struct extent_page_data epd = {
.bio = NULL,
- .tree = tree,
- .get_extent = get_extent,
+ .tree = &BTRFS_I(page->mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
ret = __extent_writepage(page, wbc, &epd);
- flush_epd_write_bio(&epd);
+ flush_write_bio(&epd);
return ret;
}
-int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
- u64 start, u64 end, get_extent_t *get_extent,
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode)
{
int ret = 0;
struct address_space *mapping = inode->i_mapping;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct page *page;
unsigned long nr_pages = (end - start + PAGE_SIZE) >>
PAGE_SHIFT;
@@ -4091,7 +4074,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
struct extent_page_data epd = {
.bio = NULL,
.tree = tree,
- .get_extent = get_extent,
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
};
@@ -4117,34 +4099,30 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
start += PAGE_SIZE;
}
- flush_epd_write_bio(&epd);
+ flush_write_bio(&epd);
return ret;
}
int extent_writepages(struct extent_io_tree *tree,
struct address_space *mapping,
- get_extent_t *get_extent,
struct writeback_control *wbc)
{
int ret = 0;
struct extent_page_data epd = {
.bio = NULL,
.tree = tree,
- .get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
- ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd,
- flush_write_bio);
- flush_epd_write_bio(&epd);
+ ret = extent_write_cache_pages(mapping, wbc, &epd);
+ flush_write_bio(&epd);
return ret;
}
int extent_readpages(struct extent_io_tree *tree,
struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages,
- get_extent_t get_extent)
+ struct list_head *pages, unsigned nr_pages)
{
struct bio *bio = NULL;
unsigned page_idx;
@@ -4170,13 +4148,13 @@ int extent_readpages(struct extent_io_tree *tree,
pagepool[nr++] = page;
if (nr < ARRAY_SIZE(pagepool))
continue;
- __extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
- &bio, 0, &bio_flags, &prev_em_start);
+ __extent_readpages(tree, pagepool, nr, &em_cached, &bio,
+ &bio_flags, &prev_em_start);
nr = 0;
}
if (nr)
- __extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
- &bio, 0, &bio_flags, &prev_em_start);
+ __extent_readpages(tree, pagepool, nr, &em_cached, &bio,
+ &bio_flags, &prev_em_start);
if (em_cached)
free_extent_map(em_cached);
@@ -4209,7 +4187,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
clear_extent_bit(tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING,
- 1, 1, &cached_state, GFP_NOFS);
+ 1, 1, &cached_state);
return 0;
}
@@ -4234,9 +4212,9 @@ static int try_release_extent_state(struct extent_map_tree *map,
* at this point we can safely clear everything except the
* locked bit and the nodatasum bit
*/
- ret = clear_extent_bit(tree, start, end,
+ ret = __clear_extent_bit(tree, start, end,
~(EXTENT_LOCKED | EXTENT_NODATASUM),
- 0, 0, NULL, mask);
+ 0, 0, NULL, mask, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
@@ -4302,9 +4280,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
* This maps until we find something past 'last'
*/
static struct extent_map *get_extent_skip_holes(struct inode *inode,
- u64 offset,
- u64 last,
- get_extent_t *get_extent)
+ u64 offset, u64 last)
{
u64 sectorsize = btrfs_inode_sectorsize(inode);
struct extent_map *em;
@@ -4318,15 +4294,14 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0);
+ em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
+ len, 0);
if (IS_ERR_OR_NULL(em))
return em;
/* if this isn't a hole return it */
- if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
- em->block_start != EXTENT_MAP_HOLE) {
+ if (em->block_start != EXTENT_MAP_HOLE)
return em;
- }
/* this is a hole, advance to the next extent */
offset = extent_map_end(em);
@@ -4451,7 +4426,7 @@ static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info,
}
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len, get_extent_t *get_extent)
+ __u64 start, __u64 len)
{
int ret = 0;
u64 off = start;
@@ -4533,8 +4508,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
&cached_state);
- em = get_extent_skip_holes(inode, start, last_for_get_extent,
- get_extent);
+ em = get_extent_skip_holes(inode, start, last_for_get_extent);
if (!em)
goto out;
if (IS_ERR(em)) {
@@ -4622,8 +4596,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
/* now scan forward to see if this is really the last extent. */
- em = get_extent_skip_holes(inode, off, last_for_get_extent,
- get_extent);
+ em = get_extent_skip_holes(inode, off, last_for_get_extent);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -4647,7 +4620,7 @@ out_free:
out:
btrfs_free_path(path);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
- &cached_state, GFP_NOFS);
+ &cached_state);
return ret;
}
@@ -5263,8 +5236,7 @@ int extent_buffer_uptodate(struct extent_buffer *eb)
}
int read_extent_buffer_pages(struct extent_io_tree *tree,
- struct extent_buffer *eb, int wait,
- get_extent_t *get_extent, int mirror_num)
+ struct extent_buffer *eb, int wait, int mirror_num)
{
unsigned long i;
struct page *page;
@@ -5324,7 +5296,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
ClearPageError(page);
err = __extent_read_full_page(tree, page,
- get_extent, &bio,
+ btree_get_extent, &bio,
mirror_num, &bio_flags,
REQ_META);
if (err) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4a8861379d3e..a7a850abd600 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -116,7 +116,8 @@ struct extent_io_ops {
*/
int (*fill_delalloc)(void *private_data, struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written);
+ unsigned long *nr_written,
+ struct writeback_control *wbc);
int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
@@ -299,19 +300,29 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
- struct extent_state **cached, gfp_t mask);
+ struct extent_state **cached);
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached, gfp_t mask,
+ struct extent_changeset *changeset);
static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
- GFP_NOFS);
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
}
static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached, gfp_t mask)
+ u64 end, struct extent_state **cached)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ GFP_NOFS, NULL);
+}
+
+static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
+ u64 start, u64 end, struct extent_state **cached)
{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- mask);
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ GFP_ATOMIC, NULL);
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
@@ -322,8 +333,7 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
if (bits & EXTENT_LOCKED)
wake = 1;
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL,
- GFP_NOFS);
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
}
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -339,10 +349,10 @@ static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
}
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state, gfp_t mask)
+ u64 end, struct extent_state **cached_state)
{
- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, mask);
+ return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+ cached_state, GFP_NOFS, NULL);
}
static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
@@ -357,7 +367,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL);
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -365,10 +375,11 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state);
static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state)
+ u64 end, unsigned int extra_bits,
+ struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
NULL, cached_state, GFP_NOFS);
}
@@ -399,24 +410,19 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state **cached_state);
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset);
-int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent,
- struct writeback_control *wbc);
-int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
- u64 start, u64 end, get_extent_t *get_extent,
+int extent_write_full_page(struct page *page, struct writeback_control *wbc);
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
int extent_writepages(struct extent_io_tree *tree,
struct address_space *mapping,
- get_extent_t *get_extent,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
int extent_readpages(struct extent_io_tree *tree,
struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages,
- get_extent_t get_extent);
+ struct list_head *pages, unsigned nr_pages);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len, get_extent_t *get_extent);
+ __u64 start, __u64 len);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -435,7 +441,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_PAGE_LOCK 2
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, int wait,
- get_extent_t *get_extent, int mirror_num);
+ int mirror_num);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
static inline unsigned long num_extent_pages(u64 start, u64 len)
@@ -538,7 +544,7 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
u64 end);
int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
struct io_failure_record **failrec_ret);
-bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
struct io_failure_record *failrec, int fail_mirror);
struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
struct io_failure_record *failrec,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2e348fb0b280..d3bd02105d1c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -454,3 +454,135 @@ void replace_extent_mapping(struct extent_map_tree *tree,
setup_extent_mapping(tree, new, modified);
}
+
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+ struct rb_node *next;
+
+ next = rb_next(&em->rb_node);
+ if (!next)
+ return NULL;
+ return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+ struct rb_node *prev;
+
+ prev = rb_prev(&em->rb_node);
+ if (!prev)
+ return NULL;
+ return container_of(prev, struct extent_map, rb_node);
+}
+
+/* helper for btrfs_get_extent. Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
+ * and an extent that you want to insert, deal with overlap and insert
+ * the best fitted new extent into the tree.
+ */
+static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
+ struct extent_map *existing,
+ struct extent_map *em,
+ u64 map_start)
+{
+ struct extent_map *prev;
+ struct extent_map *next;
+ u64 start;
+ u64 end;
+ u64 start_diff;
+
+ BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+
+ if (existing->start > map_start) {
+ next = existing;
+ prev = prev_extent_map(next);
+ } else {
+ prev = existing;
+ next = next_extent_map(prev);
+ }
+
+ start = prev ? extent_map_end(prev) : em->start;
+ start = max_t(u64, start, em->start);
+ end = next ? next->start : extent_map_end(em);
+ end = min_t(u64, end, extent_map_end(em));
+ start_diff = start - em->start;
+ em->start = start;
+ em->len = end - start;
+ if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+ !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ em->block_start += start_diff;
+ em->block_len = em->len;
+ }
+ return add_extent_mapping(em_tree, em, 0);
+}
+
+/**
+ * btrfs_add_extent_mapping - add extent mapping into em_tree
+ * @em_tree - the extent tree into which we want to insert the extent mapping
+ * @em_in - extent we are inserting
+ * @start - start of the logical range btrfs_get_extent() is requesting
+ * @len - length of the logical range btrfs_get_extent() is requesting
+ *
+ * Note that @em_in's range may be different from [start, start+len),
+ * but the two ranges must overlap.
+ *
+ * Insert @em_in into @em_tree. In case there is an overlapping range, handle
+ * the -EEXIST by either:
+ * a) Returning the existing extent in @em_in if @start is within the
+ * existing em.
+ * b) Merging the existing extent with @em_in passed in.
+ *
+ * Return 0 on success, otherwise -EEXIST.
+ *
+ */
+int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+ struct extent_map **em_in, u64 start, u64 len)
+{
+ int ret;
+ struct extent_map *em = *em_in;
+
+ ret = add_extent_mapping(em_tree, em, 0);
+ /* it is possible that someone inserted the extent into the tree
+ * while we had the lock dropped. It is also possible that
+ * an overlapping map exists in the tree
+ */
+ if (ret == -EEXIST) {
+ struct extent_map *existing;
+
+ ret = 0;
+
+ existing = search_extent_mapping(em_tree, start, len);
+ /*
+ * existing will always be non-NULL, since there must be
+ * an extent causing the -EEXIST.
+ */
+ if (start >= existing->start &&
+ start < extent_map_end(existing)) {
+ free_extent_map(em);
+ *em_in = existing;
+ ret = 0;
+ } else {
+ u64 orig_start = em->start;
+ u64 orig_len = em->len;
+
+ /*
+ * The existing extent map is the one nearest to
+ * the [start, start + len) range which overlaps
+ */
+ ret = merge_extent_mapping(em_tree, existing,
+ em, start);
+ if (ret) {
+ free_extent_map(em);
+ *em_in = NULL;
+ WARN_ONCE(ret,
+"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
+ ret, existing->start, existing->len,
+ orig_start, orig_len);
+ }
+ free_extent_map(existing);
+ }
+ }
+
+ ASSERT(ret == 0 || ret == -EEXIST);
+ return ret;
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 64365bbc9b16..b29f77bc0732 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -13,7 +13,6 @@
/* bits for the flags field */
#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
#define EXTENT_FLAG_COMPRESSED 1
-#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
@@ -92,4 +91,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
+int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+ struct extent_map **em_in, u64 start, u64 len);
#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f80254d82f40..41ab9073d1d4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -31,6 +31,7 @@
#include <linux/slab.h>
#include <linux/btrfs.h>
#include <linux/uio.h>
+#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -477,6 +478,47 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
}
}
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+ const u64 start,
+ const u64 len,
+ struct extent_state **cached_state)
+{
+ u64 search_start = start;
+ const u64 end = start + len - 1;
+
+ while (search_start < end) {
+ const u64 search_len = end - search_start + 1;
+ struct extent_map *em;
+ u64 em_len;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, search_start,
+ search_len, 0);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ if (em->block_start != EXTENT_MAP_HOLE)
+ goto next;
+
+ em_len = em->len;
+ if (em->start < search_start)
+ em_len -= search_start - em->start;
+ if (em_len > search_len)
+ em_len = search_len;
+
+ ret = set_extent_bit(&inode->io_tree, search_start,
+ search_start + em_len - 1,
+ EXTENT_DELALLOC_NEW,
+ NULL, cached_state, GFP_NOFS);
+next:
+ search_start = extent_map_end(em);
+ free_extent_map(em);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
/*
* after copy_from_user, pages need to be dirtied and we need to make
* sure holes are created between the current EOF and the start of
@@ -497,14 +539,34 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
u64 end_of_last_block;
u64 end_pos = pos + write_bytes;
loff_t isize = i_size_read(inode);
+ unsigned int extra_bits = 0;
start_pos = pos & ~((u64) fs_info->sectorsize - 1);
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
end_of_last_block = start_pos + num_bytes - 1;
+
+ if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (start_pos >= isize &&
+ !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
+ /*
+ * There can't be any extents following eof in this case
+ * so just set the delalloc new bit for the range
+ * directly.
+ */
+ extra_bits |= EXTENT_DELALLOC_NEW;
+ } else {
+ err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
+ start_pos,
+ num_bytes, cached);
+ if (err)
+ return err;
+ }
+ }
+
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
- cached, 0);
+ extra_bits, cached, 0);
if (err)
return err;
@@ -1404,47 +1466,6 @@ fail:
}
-static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
- const u64 start,
- const u64 len,
- struct extent_state **cached_state)
-{
- u64 search_start = start;
- const u64 end = start + len - 1;
-
- while (search_start < end) {
- const u64 search_len = end - search_start + 1;
- struct extent_map *em;
- u64 em_len;
- int ret = 0;
-
- em = btrfs_get_extent(inode, NULL, 0, search_start,
- search_len, 0);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- if (em->block_start != EXTENT_MAP_HOLE)
- goto next;
-
- em_len = em->len;
- if (em->start < search_start)
- em_len -= search_start - em->start;
- if (em_len > search_len)
- em_len = search_len;
-
- ret = set_extent_bit(&inode->io_tree, search_start,
- search_start + em_len - 1,
- EXTENT_DELALLOC_NEW,
- NULL, cached_state, GFP_NOFS);
-next:
- search_start = extent_map_end(em);
- free_extent_map(em);
- if (ret)
- return ret;
- }
- return 0;
-}
-
/*
* This function locks the extent and properly waits for data=ordered extents
* to finish before allowing the pages to be modified if need.
@@ -1473,10 +1494,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
+ round_up(pos + write_bytes - start_pos,
fs_info->sectorsize) - 1;
- if (start_pos < inode->vfs_inode.i_size ||
- (inode->flags & BTRFS_INODE_PREALLOC)) {
+ if (start_pos < inode->vfs_inode.i_size) {
struct btrfs_ordered_extent *ordered;
- unsigned int clear_bits;
lock_extent_bits(&inode->io_tree, start_pos, last_pos,
cached_state);
@@ -1486,7 +1505,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
ordered->file_offset + ordered->len > start_pos &&
ordered->file_offset <= last_pos) {
unlock_extent_cached(&inode->io_tree, start_pos,
- last_pos, cached_state, GFP_NOFS);
+ last_pos, cached_state);
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
put_page(pages[i]);
@@ -1498,19 +1517,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
if (ordered)
btrfs_put_ordered_extent(ordered);
- ret = btrfs_find_new_delalloc_bytes(inode, start_pos,
- last_pos - start_pos + 1,
- cached_state);
- clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG;
- if (ret)
- clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED;
- clear_extent_bit(&inode->io_tree, start_pos,
- last_pos, clear_bits,
- (clear_bits & EXTENT_LOCKED) ? 1 : 0,
- 0, cached_state, GFP_NOFS);
- if (ret)
- return ret;
+ clear_extent_bit(&inode->io_tree, start_pos, last_pos,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ 0, 0, cached_state);
*lockstart = start_pos;
*lockend = last_pos;
ret = 1;
@@ -1746,11 +1756,10 @@ again:
if (copied > 0)
ret = btrfs_dirty_pages(inode, pages, dirty_pages,
- pos, copied, NULL);
+ pos, copied, &cached_state);
if (extents_locked)
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state,
- GFP_NOFS);
+ lockstart, lockend, &cached_state);
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
btrfs_drop_pages(pages, num_pages);
@@ -2010,10 +2019,19 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
{
int ret;
+ struct blk_plug plug;
+ /*
+ * This is only called in fsync, which would do synchronous writes, so
+ * a plug can merge adjacent IOs as much as possible. Esp. in case of
+ * multiple disks using raid profile, a large IO can be split to
+ * several segments of stripe length (currently 64K).
+ */
+ blk_start_plug(&plug);
atomic_inc(&BTRFS_I(inode)->sync_writers);
ret = btrfs_fdatawrite_range(inode, start, end);
atomic_dec(&BTRFS_I(inode)->sync_writers);
+ blk_finish_plug(&plug);
return ret;
}
@@ -2048,6 +2066,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
len = (u64)end - (u64)start + 1;
trace_btrfs_sync_file(file, datasync);
+ btrfs_init_log_ctx(&ctx, inode);
+
/*
* We write the dirty pages in the range and wait until they complete
* out of the ->i_mutex. If so, we can flush the dirty pages by
@@ -2194,8 +2214,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
trans->sync = true;
- btrfs_init_log_ctx(&ctx, inode);
-
ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
@@ -2253,6 +2271,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = btrfs_end_transaction(trans);
}
out:
+ ASSERT(list_empty(&ctx.list));
err = file_check_and_advance_wb_err(file);
if (!ret)
ret = err;
@@ -2440,6 +2459,46 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
return ret;
}
+static int btrfs_punch_hole_lock_range(struct inode *inode,
+ const u64 lockstart,
+ const u64 lockend,
+ struct extent_state **cached_state)
+{
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+ int ret;
+
+ truncate_pagecache_range(inode, lockstart, lockend);
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
+ ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+ /*
+ * We need to make sure we have no ordered extents in this range
+ * and nobody raced in and read a page in this range, if we did
+ * we need to try again.
+ */
+ if ((!ordered ||
+ (ordered->file_offset + ordered->len <= lockstart ||
+ ordered->file_offset > lockend)) &&
+ !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, cached_state);
+ ret = btrfs_wait_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2556,38 +2615,11 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- while (1) {
- struct btrfs_ordered_extent *ordered;
-
- truncate_pagecache_range(inode, lockstart, lockend);
-
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
-
- /*
- * We need to make sure we have no ordered extents in this range
- * and nobody raced in and read a page in this range, if we did
- * we need to try again.
- */
- if ((!ordered ||
- (ordered->file_offset + ordered->len <= lockstart ||
- ordered->file_offset > lockend)) &&
- !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- break;
- }
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state, GFP_NOFS);
- ret = btrfs_wait_ordered_range(inode, lockstart,
- lockend - lockstart + 1);
- if (ret) {
- inode_unlock(inode);
- return ret;
- }
+ ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+ &cached_state);
+ if (ret) {
+ inode_unlock(inode);
+ goto out_only_mutex;
}
path = btrfs_alloc_path();
@@ -2732,7 +2764,7 @@ out_free:
btrfs_free_block_rsv(fs_info, rsv);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state, GFP_NOFS);
+ &cached_state);
out_only_mutex:
if (!updated_inode && truncated_block && !ret && !err) {
/*
@@ -2796,6 +2828,234 @@ insert:
return 0;
}
+static int btrfs_fallocate_update_isize(struct inode *inode,
+ const u64 end,
+ const int mode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+ int ret2;
+
+ if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
+ return 0;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ inode->i_ctime = current_time(inode);
+ i_size_write(inode, end);
+ btrfs_ordered_update_i_size(inode, end, NULL);
+ ret = btrfs_update_inode(trans, root, inode);
+ ret2 = btrfs_end_transaction(trans);
+
+ return ret ? ret : ret2;
+}
+
+enum {
+ RANGE_BOUNDARY_WRITTEN_EXTENT = 0,
+ RANGE_BOUNDARY_PREALLOC_EXTENT = 1,
+ RANGE_BOUNDARY_HOLE = 2,
+};
+
+static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+ u64 offset)
+{
+ const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ struct extent_map *em;
+ int ret;
+
+ offset = round_down(offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ if (em->block_start == EXTENT_MAP_HOLE)
+ ret = RANGE_BOUNDARY_HOLE;
+ else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
+ else
+ ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
+
+ free_extent_map(em);
+ return ret;
+}
+
+static int btrfs_zero_range(struct inode *inode,
+ loff_t offset,
+ loff_t len,
+ const int mode)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct extent_map *em;
+ struct extent_changeset *data_reserved = NULL;
+ int ret;
+ u64 alloc_hint = 0;
+ const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ u64 alloc_start = round_down(offset, sectorsize);
+ u64 alloc_end = round_up(offset + len, sectorsize);
+ u64 bytes_to_reserve = 0;
+ bool space_reserved = false;
+
+ inode_dio_wait(inode);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+ alloc_start, alloc_end - alloc_start, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ /*
+ * Avoid hole punching and extent allocation for some cases. More cases
+ * could be considered, but these are unlikely to be common and we keep things
+ * as simple as possible for now. Also, intentionally, if the target
+ * range contains one or more prealloc extents together with regular
+ * extents and holes, we drop all the existing extents and allocate a
+ * new prealloc extent, so that we get a larger contiguous disk extent.
+ */
+ if (em->start <= alloc_start &&
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ const u64 em_end = em->start + em->len;
+
+ if (em_end >= offset + len) {
+ /*
+ * The whole range is already a prealloc extent,
+ * do nothing except updating the inode's i_size if
+ * needed.
+ */
+ free_extent_map(em);
+ ret = btrfs_fallocate_update_isize(inode, offset + len,
+ mode);
+ goto out;
+ }
+ /*
+ * Part of the range is already a prealloc extent, so operate
+ * only on the remaining part of the range.
+ */
+ alloc_start = em_end;
+ ASSERT(IS_ALIGNED(alloc_start, sectorsize));
+ len = offset + len - alloc_start;
+ offset = alloc_start;
+ alloc_hint = em->block_start + em->len;
+ }
+ free_extent_map(em);
+
+ if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
+ BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+ alloc_start, sectorsize, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ free_extent_map(em);
+ ret = btrfs_fallocate_update_isize(inode, offset + len,
+ mode);
+ goto out;
+ }
+ if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
+ free_extent_map(em);
+ ret = btrfs_truncate_block(inode, offset, len, 0);
+ if (!ret)
+ ret = btrfs_fallocate_update_isize(inode,
+ offset + len,
+ mode);
+ return ret;
+ }
+ free_extent_map(em);
+ alloc_start = round_down(offset, sectorsize);
+ alloc_end = alloc_start + sectorsize;
+ goto reserve_space;
+ }
+
+ alloc_start = round_up(offset, sectorsize);
+ alloc_end = round_down(offset + len, sectorsize);
+
+ /*
+ * For unaligned ranges, check the pages at the boundaries, they might
+ * map to an extent, in which case we need to partially zero them, or
+ * they might map to a hole, in which case we need our allocation range
+ * to cover them.
+ */
+ if (!IS_ALIGNED(offset, sectorsize)) {
+ ret = btrfs_zero_range_check_range_boundary(inode, offset);
+ if (ret < 0)
+ goto out;
+ if (ret == RANGE_BOUNDARY_HOLE) {
+ alloc_start = round_down(offset, sectorsize);
+ ret = 0;
+ } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
+ ret = btrfs_truncate_block(inode, offset, 0, 0);
+ if (ret)
+ goto out;
+ } else {
+ ret = 0;
+ }
+ }
+
+ if (!IS_ALIGNED(offset + len, sectorsize)) {
+ ret = btrfs_zero_range_check_range_boundary(inode,
+ offset + len);
+ if (ret < 0)
+ goto out;
+ if (ret == RANGE_BOUNDARY_HOLE) {
+ alloc_end = round_up(offset + len, sectorsize);
+ ret = 0;
+ } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
+ ret = btrfs_truncate_block(inode, offset + len, 0, 1);
+ if (ret)
+ goto out;
+ } else {
+ ret = 0;
+ }
+ }
+
+reserve_space:
+ if (alloc_start < alloc_end) {
+ struct extent_state *cached_state = NULL;
+ const u64 lockstart = alloc_start;
+ const u64 lockend = alloc_end - 1;
+
+ bytes_to_reserve = alloc_end - alloc_start;
+ ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+ bytes_to_reserve);
+ if (ret < 0)
+ goto out;
+ space_reserved = true;
+ ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+ alloc_start, bytes_to_reserve);
+ if (ret)
+ goto out;
+ ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+ &cached_state);
+ if (ret)
+ goto out;
+ ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
+ alloc_end - alloc_start,
+ i_blocksize(inode),
+ offset + len, &alloc_hint);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
+ /* btrfs_prealloc_file_range releases reserved space on error */
+ if (ret) {
+ space_reserved = false;
+ goto out;
+ }
+ }
+ ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
+ out:
+ if (ret && space_reserved)
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ alloc_start, bytes_to_reserve);
+ extent_changeset_free(data_reserved);
+
+ return ret;
+}
+
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
@@ -2821,7 +3081,8 @@ static long btrfs_fallocate(struct file *file, int mode,
cur_offset = alloc_start;
/* Make sure we aren't being given some crap mode */
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -2832,10 +3093,12 @@ static long btrfs_fallocate(struct file *file, int mode,
*
* For qgroup space, it will be checked later.
*/
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
- alloc_end - alloc_start);
- if (ret < 0)
- return ret;
+ if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+ ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+ alloc_end - alloc_start);
+ if (ret < 0)
+ return ret;
+ }
inode_lock(inode);
@@ -2877,6 +3140,12 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret)
goto out;
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = btrfs_zero_range(inode, offset, len, mode);
+ inode_unlock(inode);
+ return ret;
+ }
+
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
@@ -2886,15 +3155,15 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
locked_end, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode,
- alloc_end - 1);
+ ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
+
if (ordered &&
ordered->file_offset + ordered->len > alloc_start &&
ordered->file_offset < alloc_end) {
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
alloc_start, locked_end,
- &cached_state, GFP_KERNEL);
+ &cached_state);
/*
* we can't wait on the range with the transaction
* running or with the extent lock held
@@ -2912,7 +3181,7 @@ static long btrfs_fallocate(struct file *file, int mode,
/* First, check if we exceed the qgroup limit */
INIT_LIST_HEAD(&reserve_list);
- while (1) {
+ while (cur_offset < alloc_end) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
if (IS_ERR(em)) {
@@ -2948,8 +3217,6 @@ static long btrfs_fallocate(struct file *file, int mode,
}
free_extent_map(em);
cur_offset = last_byte;
- if (cur_offset >= alloc_end)
- break;
}
/*
@@ -2972,37 +3239,18 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret < 0)
goto out_unlock;
- if (actual_end > inode->i_size &&
- !(mode & FALLOC_FL_KEEP_SIZE)) {
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(inode)->root;
-
- /*
- * We didn't need to allocate any more space, but we
- * still extended the size of the file so we need to
- * update i_size and the inode item.
- */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- } else {
- inode->i_ctime = current_time(inode);
- i_size_write(inode, actual_end);
- btrfs_ordered_update_i_size(inode, actual_end, NULL);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret)
- btrfs_end_transaction(trans);
- else
- ret = btrfs_end_transaction(trans);
- }
- }
+ /*
+ * We didn't need to allocate any more space, but we still extended the
+ * size of the file so we need to update i_size and the inode item.
+ */
+ ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state, GFP_KERNEL);
+ &cached_state);
out:
inode_unlock(inode);
/* Let go of our reservation. */
- if (ret != 0)
+ if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
btrfs_free_reserved_data_space(inode, data_reserved,
alloc_start, alloc_end - cur_offset);
extent_changeset_free(data_reserved);
@@ -3071,7 +3319,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
*offset = min_t(loff_t, start, inode->i_size);
}
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state, GFP_NOFS);
+ &cached_state);
return ret;
}
@@ -3135,7 +3383,7 @@ void btrfs_auto_defrag_exit(void)
kmem_cache_destroy(btrfs_inode_defrag_cachep);
}
-int btrfs_auto_defrag_init(void)
+int __init btrfs_auto_defrag_init(void)
{
btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
sizeof(struct inode_defrag), 0,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cdc9f4015ec3..014f3c090231 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -993,8 +993,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
- GFP_NOFS);
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
goto fail;
}
leaf = path->nodes[0];
@@ -1008,7 +1007,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
inode->i_size - 1,
EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
- NULL, GFP_NOFS);
+ NULL);
btrfs_release_path(path);
goto fail;
}
@@ -1105,8 +1104,7 @@ static int flush_dirty_cache(struct inode *inode)
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
- GFP_NOFS);
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
return ret;
}
@@ -1127,8 +1125,7 @@ cleanup_write_cache_enospc(struct inode *inode,
{
io_ctl_drop_pages(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, cached_state,
- GFP_NOFS);
+ i_size_read(inode) - 1, cached_state);
}
static int __btrfs_wait_cache_io(struct btrfs_root *root,
@@ -1264,7 +1261,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
/* Lock all pages first so we can lock the extent safely. */
ret = io_ctl_prepare_pages(io_ctl, inode, 0);
if (ret)
- goto out;
+ goto out_unlock;
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
&cached_state);
@@ -1322,7 +1319,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl_drop_pages(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+ i_size_read(inode) - 1, &cached_state);
/*
* at this point the pages are under IO and we're happy,
@@ -1358,6 +1355,7 @@ out_nospc_locked:
out_nospc:
cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
+out_unlock:
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b93fe05a39c7..53ca025655fc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -43,6 +43,7 @@
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
+#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -378,6 +379,7 @@ struct async_cow {
struct page *locked_page;
u64 start;
u64 end;
+ unsigned int write_flags;
struct list_head extents;
struct btrfs_work work;
};
@@ -535,9 +537,14 @@ again:
*
* If the compression fails for any reason, we set the pages
* dirty again later on.
+ *
+ * Note that the remaining part is redirtied, the start pointer
+ * has moved, the end is the original one.
*/
- extent_range_clear_dirty_for_io(inode, start, end);
- redirty = 1;
+ if (!redirty) {
+ extent_range_clear_dirty_for_io(inode, start, end);
+ redirty = 1;
+ }
/* Compression level is applied here and only here */
ret = btrfs_compress_pages(
@@ -764,11 +771,10 @@ retry:
* all those pages down to the drive.
*/
if (!page_started && !ret)
- extent_write_locked_range(io_tree,
- inode, async_extent->start,
+ extent_write_locked_range(inode,
+ async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
- btrfs_get_extent,
WB_SYNC_ALL);
else if (ret)
unlock_page(async_cow->locked_page);
@@ -857,7 +863,8 @@ retry:
async_extent->ram_size,
ins.objectid,
ins.offset, async_extent->pages,
- async_extent->nr_pages)) {
+ async_extent->nr_pages,
+ async_cow->write_flags)) {
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct page *p = async_extent->pages[0];
const u64 start = async_extent->start;
@@ -1191,7 +1198,8 @@ static noinline void async_cow_free(struct btrfs_work *work)
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written)
+ unsigned long *nr_written,
+ unsigned int write_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct async_cow *async_cow;
@@ -1200,7 +1208,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
u64 cur_end;
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
- 1, 0, NULL, GFP_NOFS);
+ 1, 0, NULL);
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
BUG_ON(!async_cow); /* -ENOMEM */
@@ -1208,6 +1216,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->root = root;
async_cow->locked_page = locked_page;
async_cow->start = start;
+ async_cow->write_flags = write_flags;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
!btrfs_test_opt(fs_info, FORCE_COMPRESS))
@@ -1577,11 +1586,13 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
*/
static int run_delalloc_range(void *private_data, struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written)
+ unsigned long *nr_written,
+ struct writeback_control *wbc)
{
struct inode *inode = private_data;
int ret;
int force_cow = need_force_cow(inode, start, end);
+ unsigned int write_flags = wbc_to_write_flags(wbc);
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1596,7 +1607,8 @@ static int run_delalloc_range(void *private_data, struct page *locked_page,
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags);
ret = cow_file_range_async(inode, locked_page, start, end,
- page_started, nr_written);
+ page_started, nr_written,
+ write_flags);
}
if (ret)
btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
@@ -1944,7 +1956,21 @@ static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
/*
* extent_io.c submission hook. This does the right thing for csum calculation
- * on write, or reading the csums from the tree before a read
+ * on write, or reading the csums from the tree before a read.
+ *
+ * Rules about async/sync submit,
+ * a) read: sync submit
+ *
+ * b) write without checksum: sync submit
+ *
+ * c) write with checksum:
+ * c-1) if bio is issued by fsync: sync submit
+ * (sync_writers != 0)
+ *
+ * c-2) if root is reloc root: sync submit
+ * (only in case of buffered IO)
+ *
+ * c-3) otherwise: async submit
*/
static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
int mirror_num, unsigned long bio_flags,
@@ -2016,20 +2042,21 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum *sum;
list_for_each_entry(sum, list, list) {
- trans->adding_csums = 1;
+ trans->adding_csums = true;
btrfs_csum_file_blocks(trans,
BTRFS_I(inode)->root->fs_info->csum_root, sum);
- trans->adding_csums = 0;
+ trans->adding_csums = false;
}
return 0;
}
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+ unsigned int extra_bits,
struct extent_state **cached_state, int dedupe)
{
WARN_ON((end & (PAGE_SIZE - 1)) == 0);
return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
- cached_state);
+ extra_bits, cached_state);
}
/* see btrfs_writepage_start_hook for details on why this is required */
@@ -2074,7 +2101,7 @@ again:
PAGE_SIZE);
if (ordered) {
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
- page_end, &cached_state, GFP_NOFS);
+ page_end, &cached_state);
unlock_page(page);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
@@ -2090,14 +2117,21 @@ again:
goto out;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state,
- 0);
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
+ &cached_state, 0);
+ if (ret) {
+ mapping_set_error(page->mapping, ret);
+ end_extent_writepage(page, ret, page_start, page_end);
+ ClearPageChecked(page);
+ goto out;
+ }
+
ClearPageChecked(page);
set_page_dirty(page);
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
out_page:
unlock_page(page);
put_page(page);
@@ -2689,7 +2723,7 @@ out_free_path:
btrfs_end_transaction(trans);
out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
- &cached, GFP_NOFS);
+ &cached);
iput(inode);
return ret;
}
@@ -2978,7 +3012,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
clear_extent_bit(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
- EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
+ EXTENT_DEFRAG, 0, 0, &cached_state);
}
if (nolock)
@@ -2997,6 +3031,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
BUG_ON(compress_type);
+ btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
+ ordered_extent->len);
ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
ordered_extent->file_offset,
ordered_extent->file_offset +
@@ -3046,7 +3082,7 @@ out:
ordered_extent->len - 1,
clear_bits,
(clear_bits & EXTENT_LOCKED) ? 1 : 0,
- 0, &cached_state, GFP_NOFS);
+ 0, &cached_state);
}
if (trans)
@@ -3060,7 +3096,7 @@ out:
else
start = ordered_extent->file_offset;
end = ordered_extent->file_offset + ordered_extent->len - 1;
- clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
+ clear_extent_uptodate(io_tree, start, end, NULL);
/* Drop the cache for the part of the extent we didn't write. */
btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
@@ -3767,7 +3803,8 @@ static int btrfs_read_locked_inode(struct inode *inode)
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
- inode->i_version = btrfs_inode_sequence(leaf, inode_item);
+ inode_set_iversion_queried(inode,
+ btrfs_inode_sequence(leaf, inode_item));
inode->i_generation = BTRFS_I(inode)->generation;
inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -3935,7 +3972,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
&token);
btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
&token);
- btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+ btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
+ &token);
btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
@@ -4734,8 +4772,8 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
u64 block_start;
u64 block_end;
- if ((offset & (blocksize - 1)) == 0 &&
- (!len || ((len & (blocksize - 1)) == 0)))
+ if (IS_ALIGNED(offset, blocksize) &&
+ (!len || IS_ALIGNED(len, blocksize)))
goto out;
block_start = round_down(from, blocksize);
@@ -4777,7 +4815,7 @@ again:
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
unlock_extent_cached(io_tree, block_start, block_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
unlock_page(page);
put_page(page);
btrfs_start_ordered_extent(inode, ordered, 1);
@@ -4788,13 +4826,13 @@ again:
clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, &cached_state, GFP_NOFS);
+ 0, 0, &cached_state);
- ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
+ ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state, 0);
if (ret) {
unlock_extent_cached(io_tree, block_start, block_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
goto out_unlock;
}
@@ -4813,8 +4851,7 @@ again:
}
ClearPageChecked(page);
set_page_dirty(page);
- unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
- GFP_NOFS);
+ unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
out_unlock:
if (ret)
@@ -4915,7 +4952,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
if (!ordered)
break;
unlock_extent_cached(io_tree, hole_start, block_end - 1,
- &cached_state, GFP_NOFS);
+ &cached_state);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}
@@ -4980,8 +5017,7 @@ next:
break;
}
free_extent_map(em);
- unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
- GFP_NOFS);
+ unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
return err;
}
@@ -5224,8 +5260,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 1,
- &cached_state, GFP_NOFS);
+ EXTENT_DEFRAG, 1, 1, &cached_state);
cond_resched();
spin_lock(&io_tree->lock);
@@ -5438,6 +5473,14 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
goto out_err;
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
+ if (location->type != BTRFS_INODE_ITEM_KEY &&
+ location->type != BTRFS_ROOT_ITEM_KEY) {
+ btrfs_warn(root->fs_info,
+"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
+ __func__, name, btrfs_ino(BTRFS_I(dir)),
+ location->objectid, location->type, location->offset);
+ goto out_err;
+ }
out:
btrfs_free_path(path);
return ret;
@@ -5754,8 +5797,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return inode;
}
- BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
-
index = srcu_read_lock(&fs_info->subvol_srcu);
ret = fixup_tree_root_location(fs_info, dir, dentry,
&location, &sub_root);
@@ -5878,7 +5919,6 @@ static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_file_private *private = file->private_data;
struct btrfs_dir_item *di;
@@ -5946,9 +5986,6 @@ again:
if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
goto next;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
- if (verify_dir_item(fs_info, leaf, slot, di))
- goto next;
-
name_len = btrfs_dir_name_len(leaf, di);
if ((total_len + sizeof(struct dir_entry) + name_len) >=
PAGE_SIZE) {
@@ -6088,19 +6125,20 @@ static int btrfs_update_time(struct inode *inode, struct timespec *now,
int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ bool dirty = flags & ~S_VERSION;
if (btrfs_root_readonly(root))
return -EROFS;
if (flags & S_VERSION)
- inode_inc_iversion(inode);
+ dirty |= inode_maybe_inc_iversion(inode, dirty);
if (flags & S_CTIME)
inode->i_ctime = *now;
if (flags & S_MTIME)
inode->i_mtime = *now;
if (flags & S_ATIME)
inode->i_atime = *now;
- return btrfs_dirty_inode(inode);
+ return dirty ? btrfs_dirty_inode(inode) : 0;
}
/*
@@ -6281,7 +6319,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
}
/*
* index_cnt is ignored for everything but a dir,
- * btrfs_get_inode_index_count has an explanation for the magic
+ * btrfs_set_inode_index_count has an explanation for the magic
* number
*/
BTRFS_I(inode)->index_cnt = 2;
@@ -6544,7 +6582,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
out_unlock:
btrfs_end_transaction(trans);
- btrfs_balance_delayed_items(fs_info);
btrfs_btree_balance_dirty(fs_info);
if (drop_inode) {
inode_dec_link_count(inode);
@@ -6625,7 +6662,6 @@ out_unlock:
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_balance_delayed_items(fs_info);
btrfs_btree_balance_dirty(fs_info);
return err;
@@ -6700,7 +6736,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
}
- btrfs_balance_delayed_items(fs_info);
fail:
if (trans)
btrfs_end_transaction(trans);
@@ -6778,7 +6813,6 @@ out_fail:
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_balance_delayed_items(fs_info);
btrfs_btree_balance_dirty(fs_info);
return err;
@@ -6787,68 +6821,6 @@ out_fail_inode:
goto out_fail;
}
-/* Find next extent map of a given extent map, caller needs to ensure locks */
-static struct extent_map *next_extent_map(struct extent_map *em)
-{
- struct rb_node *next;
-
- next = rb_next(&em->rb_node);
- if (!next)
- return NULL;
- return container_of(next, struct extent_map, rb_node);
-}
-
-static struct extent_map *prev_extent_map(struct extent_map *em)
-{
- struct rb_node *prev;
-
- prev = rb_prev(&em->rb_node);
- if (!prev)
- return NULL;
- return container_of(prev, struct extent_map, rb_node);
-}
-
-/* helper for btfs_get_extent. Given an existing extent in the tree,
- * the existing extent is the nearest extent to map_start,
- * and an extent that you want to insert, deal with overlap and insert
- * the best fitted new extent into the tree.
- */
-static int merge_extent_mapping(struct extent_map_tree *em_tree,
- struct extent_map *existing,
- struct extent_map *em,
- u64 map_start)
-{
- struct extent_map *prev;
- struct extent_map *next;
- u64 start;
- u64 end;
- u64 start_diff;
-
- BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
-
- if (existing->start > map_start) {
- next = existing;
- prev = prev_extent_map(next);
- } else {
- prev = existing;
- next = next_extent_map(prev);
- }
-
- start = prev ? extent_map_end(prev) : em->start;
- start = max_t(u64, start, em->start);
- end = next ? next->start : extent_map_end(em);
- end = min_t(u64, end, extent_map_end(em));
- start_diff = start - em->start;
- em->start = start;
- em->len = end - start;
- if (em->block_start < EXTENT_MAP_LAST_BYTE &&
- !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- em->block_start += start_diff;
- em->block_len -= start_diff;
- }
- return add_extent_mapping(em_tree, em, 0);
-}
-
static noinline int uncompress_inline(struct btrfs_path *path,
struct page *page,
size_t pg_offset, u64 extent_offset,
@@ -6923,10 +6895,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct extent_map *em = NULL;
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_io_tree *io_tree = &inode->io_tree;
- struct btrfs_trans_handle *trans = NULL;
const bool new_inline = !page || create;
-again:
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em)
@@ -6965,8 +6935,7 @@ again:
path->reada = READA_FORWARD;
}
- ret = btrfs_lookup_file_extent(trans, root, path,
- objectid, start, trans != NULL);
+ ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
err = ret;
goto out;
@@ -7067,7 +7036,7 @@ next:
em->orig_block_len = em->len;
em->orig_start = em->start;
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
- if (create == 0 && !PageUptodate(page)) {
+ if (!PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, page, pg_offset,
@@ -7088,25 +7057,6 @@ next:
kunmap(page);
}
flush_dcache_page(page);
- } else if (create && PageUptodate(page)) {
- BUG();
- if (!trans) {
- kunmap(page);
- free_extent_map(em);
- em = NULL;
-
- btrfs_release_path(path);
- trans = btrfs_join_transaction(root);
-
- if (IS_ERR(trans))
- return ERR_CAST(trans);
- goto again;
- }
- map = kmap(page);
- write_extent_buffer(leaf, map + pg_offset, ptr,
- copy_size);
- kunmap(page);
- btrfs_mark_buffer_dirty(leaf);
}
set_extent_uptodate(io_tree, em->start,
extent_map_end(em) - 1, NULL, GFP_NOFS);
@@ -7118,7 +7068,6 @@ not_found:
em->len = len;
not_found_em:
em->block_start = EXTENT_MAP_HOLE;
- set_bit(EXTENT_FLAG_VACANCY, &em->flags);
insert:
btrfs_release_path(path);
if (em->start > start || extent_map_end(em) <= start) {
@@ -7131,62 +7080,13 @@ insert:
err = 0;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
- /* it is possible that someone inserted the extent into the tree
- * while we had the lock dropped. It is also possible that
- * an overlapping map exists in the tree
- */
- if (ret == -EEXIST) {
- struct extent_map *existing;
-
- ret = 0;
-
- existing = search_extent_mapping(em_tree, start, len);
- /*
- * existing will always be non-NULL, since there must be
- * extent causing the -EEXIST.
- */
- if (existing->start == em->start &&
- extent_map_end(existing) >= extent_map_end(em) &&
- em->block_start == existing->block_start) {
- /*
- * The existing extent map already encompasses the
- * entire extent map we tried to add.
- */
- free_extent_map(em);
- em = existing;
- err = 0;
-
- } else if (start >= extent_map_end(existing) ||
- start <= existing->start) {
- /*
- * The existing extent map is the one nearest to
- * the [start, start + len) range which overlaps
- */
- err = merge_extent_mapping(em_tree, existing,
- em, start);
- free_extent_map(existing);
- if (err) {
- free_extent_map(em);
- em = NULL;
- }
- } else {
- free_extent_map(em);
- em = existing;
- err = 0;
- }
- }
+ err = btrfs_add_extent_mapping(em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
trace_btrfs_get_extent(root, inode, em);
btrfs_free_path(path);
- if (trans) {
- ret = btrfs_end_transaction(trans);
- if (!err)
- err = ret;
- }
if (err) {
free_extent_map(em);
return ERR_PTR(err);
@@ -7308,7 +7208,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
em->block_start = EXTENT_MAP_DELALLOC;
em->block_len = found;
}
- } else if (hole_em) {
+ } else {
return hole_em;
}
out:
@@ -7625,7 +7525,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
break;
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state, GFP_NOFS);
+ cached_state);
if (ordered) {
/*
@@ -7910,7 +7810,7 @@ unlock:
if (lockstart < lockend) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, unlock_bits, 1, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
} else {
free_extent_state(cached_state);
}
@@ -7921,7 +7821,7 @@ unlock:
unlock_err:
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+ unlock_bits, 1, 0, &cached_state);
err:
if (dio_data)
current->journal_info = dio_data;
@@ -7937,15 +7837,12 @@ static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
- bio_get(bio);
-
ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
if (ret)
- goto err;
+ return ret;
ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
-err:
- bio_put(bio);
+
return ret;
}
@@ -7999,6 +7896,7 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
int segs;
int ret;
blk_status_t status;
+ struct bio_vec bvec;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -8014,8 +7912,9 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
}
segs = bio_segments(failed_bio);
+ bio_get_first_bvec(failed_bio, &bvec);
if (segs > 1 ||
- (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode)))
+ (bvec.bv_len > btrfs_inode_sectorsize(inode)))
read_mode |= REQ_FAILFAST_DEV;
isector = start - btrfs_io_bio(failed_bio)->logical;
@@ -8058,7 +7957,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
ASSERT(bio->bi_vcnt == 1);
io_tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
- ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
+ ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));
done->uptodate = 1;
ASSERT(!bio_flagged(bio, BIO_CLONED));
@@ -8148,7 +8047,7 @@ static void btrfs_retry_endio(struct bio *bio)
uptodate = 1;
ASSERT(bio->bi_vcnt == 1);
- ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
+ ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));
io_tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@ -8444,11 +8343,10 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
bool write = bio_op(bio) == REQ_OP_WRITE;
blk_status_t ret;
+ /* Check btrfs_submit_bio_hook() for rules about async submit. */
if (async_submit)
async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
- bio_get(bio);
-
if (!write) {
ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
if (ret)
@@ -8481,7 +8379,6 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
map:
ret = btrfs_map_bio(fs_info, bio, 0, 0);
err:
- bio_put(bio);
return ret;
}
@@ -8838,7 +8735,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
+ return extent_fiemap(inode, fieinfo, start, len);
}
int btrfs_readpage(struct file *file, struct page *page)
@@ -8850,7 +8747,6 @@ int btrfs_readpage(struct file *file, struct page *page)
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
- struct extent_io_tree *tree;
struct inode *inode = page->mapping->host;
int ret;
@@ -8869,8 +8765,7 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
redirty_page_for_writepage(wbc, page);
return AOP_WRITEPAGE_ACTIVATE;
}
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ ret = extent_write_full_page(page, wbc);
btrfs_add_delayed_iput(inode);
return ret;
}
@@ -8881,7 +8776,7 @@ static int btrfs_writepages(struct address_space *mapping,
struct extent_io_tree *tree;
tree = &BTRFS_I(mapping->host)->io_tree;
- return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
+ return extent_writepages(tree, mapping, wbc);
}
static int
@@ -8890,8 +8785,7 @@ btrfs_readpages(struct file *file, struct address_space *mapping,
{
struct extent_io_tree *tree;
tree = &BTRFS_I(mapping->host)->io_tree;
- return extent_readpages(tree, mapping, pages, nr_pages,
- btrfs_get_extent);
+ return extent_readpages(tree, mapping, pages, nr_pages);
}
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
@@ -8962,8 +8856,7 @@ again:
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 0, &cached_state,
- GFP_NOFS);
+ EXTENT_DEFRAG, 1, 0, &cached_state);
/*
* whoever cleared the private bit is responsible
* for the finish_ordered_io
@@ -9020,7 +8913,7 @@ again:
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
- &cached_state, GFP_NOFS);
+ &cached_state);
__btrfs_releasepage(page, GFP_NOFS);
}
@@ -9121,7 +9014,7 @@ again:
PAGE_SIZE);
if (ordered) {
unlock_extent_cached(io_tree, page_start, page_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
unlock_page(page);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
@@ -9148,13 +9041,13 @@ again:
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, &cached_state, GFP_NOFS);
+ 0, 0, &cached_state);
- ret = btrfs_set_extent_delalloc(inode, page_start, end,
+ ret = btrfs_set_extent_delalloc(inode, page_start, end, 0,
&cached_state, 0);
if (ret) {
unlock_extent_cached(io_tree, page_start, page_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
ret = VM_FAULT_SIGBUS;
goto out_unlock;
}
@@ -9180,7 +9073,7 @@ again:
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
- unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
+ unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
out_unlock:
if (!ret) {
@@ -9405,7 +9298,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_inode *ei;
struct inode *inode;
- ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
+ ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
@@ -9557,7 +9450,7 @@ void btrfs_destroy_cachep(void)
kmem_cache_destroy(btrfs_free_space_cachep);
}
-int btrfs_init_cachep(void)
+int __init btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
@@ -10672,7 +10565,6 @@ out:
btrfs_end_transaction(trans);
if (ret)
iput(inode);
- btrfs_balance_delayed_items(fs_info);
btrfs_btree_balance_dirty(fs_info);
return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fd172a93d11a..111ee282b777 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -43,6 +43,7 @@
#include <linux/uuid.h>
#include <linux/btrfs.h>
#include <linux/uaccess.h>
+#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -307,12 +308,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
ip->flags |= BTRFS_INODE_COMPRESS;
ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
- if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
- comp = "lzo";
- else if (fs_info->compress_type == BTRFS_COMPRESS_ZLIB)
- comp = "zlib";
- else
- comp = "zstd";
+ comp = btrfs_compress_type2str(fs_info->compress_type);
+ if (!comp || comp[0] == 0)
+ comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
+
ret = btrfs_set_prop(inode, "btrfs.compression",
comp, strlen(comp), 0);
if (ret)
@@ -979,7 +978,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
/* get the big lock and read metadata off disk */
lock_extent_bits(io_tree, start, end, &cached);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
- unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
+ unlock_extent_cached(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
@@ -1130,7 +1129,7 @@ again:
ordered = btrfs_lookup_ordered_extent(inode,
page_start);
unlock_extent_cached(tree, page_start, page_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
if (!ordered)
break;
@@ -1172,7 +1171,7 @@ again:
if (!i_done || ret)
goto out;
- if (!(inode->i_sb->s_flags & MS_ACTIVE))
+ if (!(inode->i_sb->s_flags & SB_ACTIVE))
goto out;
/*
@@ -1190,7 +1189,7 @@ again:
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
if (i_done != page_cnt) {
spin_lock(&BTRFS_I(inode)->lock);
@@ -1206,8 +1205,7 @@ again:
&cached_state);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state,
- GFP_NOFS);
+ page_start, page_end - 1, &cached_state);
for (i = 0; i < i_done; i++) {
clear_page_dirty_for_io(pages[i]);
@@ -1333,7 +1331,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
* make sure we stop running if someone unmounts
* the FS
*/
- if (!(inode->i_sb->s_flags & MS_ACTIVE))
+ if (!(inode->i_sb->s_flags & SB_ACTIVE))
break;
if (btrfs_defrag_cancelled(fs_info)) {
@@ -1503,7 +1501,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
goto out_free;
}
- if (!device->writeable) {
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
btrfs_info(fs_info,
"resizer unable to apply on readonly device %llu",
devid);
@@ -1528,7 +1526,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
}
}
- if (device->is_tgtdev_for_dev_replace) {
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = -EPERM;
goto out_free;
}
@@ -2206,7 +2204,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
if (!path)
return -ENOMEM;
- ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
+ ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];
key.objectid = tree_id;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -2675,14 +2673,12 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto out;
}
- mutex_lock(&fs_info->volume_mutex);
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
} else {
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
- mutex_unlock(&fs_info->volume_mutex);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
if (!ret) {
@@ -2726,9 +2722,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
}
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- mutex_lock(&fs_info->volume_mutex);
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
- mutex_unlock(&fs_info->volume_mutex);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
@@ -2753,16 +2747,16 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
if (!fi_args)
return -ENOMEM;
- mutex_lock(&fs_devices->device_list_mutex);
+ rcu_read_lock();
fi_args->num_devices = fs_devices->num_devices;
- memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (device->devid > fi_args->max_id)
fi_args->max_id = device->devid;
}
- mutex_unlock(&fs_devices->device_list_mutex);
+ rcu_read_unlock();
+ memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
fi_args->nodesize = fs_info->nodesize;
fi_args->sectorsize = fs_info->sectorsize;
fi_args->clone_alignment = fs_info->sectorsize;
@@ -2779,7 +2773,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
{
struct btrfs_ioctl_dev_info_args *di_args;
struct btrfs_device *dev;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int ret = 0;
char *s_uuid = NULL;
@@ -2790,7 +2783,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
if (!btrfs_is_empty_uuid(di_args->uuid))
s_uuid = di_args->uuid;
- mutex_lock(&fs_devices->device_list_mutex);
+ rcu_read_lock();
dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
if (!dev) {
@@ -2805,17 +2798,15 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
if (dev->name) {
struct rcu_string *name;
- rcu_read_lock();
name = rcu_dereference(dev->name);
- strncpy(di_args->path, name->str, sizeof(di_args->path));
- rcu_read_unlock();
+ strncpy(di_args->path, name->str, sizeof(di_args->path) - 1);
di_args->path[sizeof(di_args->path) - 1] = 0;
} else {
di_args->path[0] = '\0';
}
out:
- mutex_unlock(&fs_devices->device_list_mutex);
+ rcu_read_unlock();
if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
ret = -EFAULT;
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index f6a05f836629..b30a056963ab 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -164,7 +164,6 @@ static int iterate_object_props(struct btrfs_root *root,
size_t),
void *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *name_buf = NULL;
char *value_buf = NULL;
@@ -215,12 +214,6 @@ static int iterate_object_props(struct btrfs_root *root,
name_ptr = (unsigned long)(di + 1);
data_ptr = name_ptr + name_len;
- if (verify_dir_item(fs_info, leaf,
- path->slots[0], di)) {
- ret = -EIO;
- goto out;
- }
-
if (name_len <= XATTR_BTRFS_PREFIX_LEN ||
memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX,
name_ptr,
@@ -430,11 +423,11 @@ static const char *prop_compression_extract(struct inode *inode)
{
switch (BTRFS_I(inode)->prop_compress) {
case BTRFS_COMPRESS_ZLIB:
- return "zlib";
case BTRFS_COMPRESS_LZO:
- return "lzo";
case BTRFS_COMPRESS_ZSTD:
- return "zstd";
+ return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress);
+ default:
+ break;
}
return NULL;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 168fd03ca3ac..9e61dd624f7b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2883,8 +2883,7 @@ cleanup:
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(&reserved->range_changed, &uiter)))
clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
- unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
- GFP_NOFS);
+ unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
extent_changeset_release(reserved);
return ret;
}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index a7f79254ecca..dec0907dfb8a 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -231,7 +231,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
cur = h + i;
INIT_LIST_HEAD(&cur->hash_list);
spin_lock_init(&cur->lock);
- init_waitqueue_head(&cur->wait);
}
x = cmpxchg(&info->stripe_hash_table, NULL, table);
@@ -595,14 +594,31 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
* bio list here, anyone else that wants to
* change this stripe needs to do their own rmw.
*/
- if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
- cur->operation == BTRFS_RBIO_PARITY_SCRUB)
+ if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
return 0;
- if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
- cur->operation == BTRFS_RBIO_REBUILD_MISSING)
+ if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
return 0;
+ if (last->operation == BTRFS_RBIO_READ_REBUILD) {
+ int fa = last->faila;
+ int fb = last->failb;
+ int cur_fa = cur->faila;
+ int cur_fb = cur->failb;
+
+ if (last->faila >= last->failb) {
+ fa = last->failb;
+ fb = last->faila;
+ }
+
+ if (cur->faila >= cur->failb) {
+ cur_fa = cur->failb;
+ cur_fb = cur->faila;
+ }
+
+ if (fa != cur_fa || fb != cur_fb)
+ return 0;
+ }
return 1;
}
@@ -670,7 +686,6 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
struct btrfs_raid_bio *cur;
struct btrfs_raid_bio *pending;
unsigned long flags;
- DEFINE_WAIT(wait);
struct btrfs_raid_bio *freeit = NULL;
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
@@ -816,15 +831,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
}
goto done_nolock;
- /*
- * The barrier for this waitqueue_active is not needed,
- * we're protected by h->lock and can't miss a wakeup.
- */
- } else if (waitqueue_active(&h->wait)) {
- spin_unlock(&rbio->bio_list_lock);
- spin_unlock_irqrestore(&h->lock, flags);
- wake_up(&h->wait);
- goto done_nolock;
}
}
done:
@@ -858,10 +864,17 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
kfree(rbio);
}
-static void free_raid_bio(struct btrfs_raid_bio *rbio)
+static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
- unlock_stripe(rbio);
- __free_raid_bio(rbio);
+ struct bio *next;
+
+ while (cur) {
+ next = cur->bi_next;
+ cur->bi_next = NULL;
+ cur->bi_status = err;
+ bio_endio(cur);
+ cur = next;
+ }
}
/*
@@ -871,20 +884,26 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio)
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
struct bio *cur = bio_list_get(&rbio->bio_list);
- struct bio *next;
+ struct bio *extra;
if (rbio->generic_bio_cnt)
btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
- free_raid_bio(rbio);
+ /*
+ * At this moment, rbio->bio_list is empty, however since rbio does not
+ * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
+ * hash list, rbio may be merged with others so that rbio->bio_list
+ * becomes non-empty.
+ * Once unlock_stripe() is done, rbio->bio_list will not be updated any
+ * more and we can call bio_endio() on all queued bios.
+ */
+ unlock_stripe(rbio);
+ extra = bio_list_get(&rbio->bio_list);
+ __free_raid_bio(rbio);
- while (cur) {
- next = cur->bi_next;
- cur->bi_next = NULL;
- cur->bi_status = err;
- bio_endio(cur);
- cur = next;
- }
+ rbio_endio_bio_list(cur, err);
+ if (extra)
+ rbio_endio_bio_list(extra, err);
}
/*
@@ -1435,14 +1454,13 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
*/
static void set_bio_pages_uptodate(struct bio *bio)
{
- struct bio_vec bvec;
- struct bvec_iter iter;
+ struct bio_vec *bvec;
+ int i;
- if (bio_flagged(bio, BIO_CLONED))
- bio->bi_iter = btrfs_io_bio(bio)->iter;
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment(bvec, bio, iter)
- SetPageUptodate(bvec.bv_page);
+ bio_for_each_segment_all(bvec, bio, i)
+ SetPageUptodate(bvec->bv_page);
}
/*
@@ -1969,7 +1987,22 @@ cleanup:
cleanup_io:
if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
- if (err == BLK_STS_OK)
+ /*
+ * - In case of two failures, where rbio->failb != -1:
+ *
+ * Do not cache this rbio since the above read reconstruction
+ * (raid6_datap_recov() or raid6_2data_recov()) may have
+ * changed some content of stripes which are not identical to
+ * on-disk content any more, otherwise, a later write/recover
+ * may steal stripe_pages from this rbio and end up with
+ * corruptions or rebuild failures.
+ *
+ * - In case of single failure, where rbio->failb == -1:
+ *
+ * Cache this rbio iff the above read reconstruction is
+ * executed without problems.
+ */
+ if (err == BLK_STS_OK && rbio->failb < 0)
cache_rbio_pages(rbio);
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -2170,11 +2203,21 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
}
/*
- * reconstruct from the q stripe if they are
- * asking for mirror 3
+ * Loop retry:
+ * for 'mirror_num == 2', reconstruct from all other stripes.
+ * for 'mirror_num > 2', select a stripe to fail on every retry.
*/
- if (mirror_num == 3)
- rbio->failb = rbio->real_stripes - 2;
+ if (mirror_num > 2) {
+ /*
+ * 'mirror_num == 3' is to fail the p stripe and
+ * reconstruct from the q stripe. 'mirror_num > 3' is to
+ * fail a data stripe and reconstruct from the p+q stripes.
+ */
+ rbio->failb = rbio->real_stripes - (mirror_num - 1);
+ ASSERT(rbio->failb > 0);
+ if (rbio->failb <= rbio->faila)
+ rbio->failb--;
+ }
ret = lock_stripe_add(rbio);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 34878699d363..171f3cce30e6 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -606,8 +606,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
}
/* Walk up to the next node that needs to be processed */
-static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
- int *level)
+static int walk_up_tree(struct btrfs_path *path, int *level)
{
int l;
@@ -984,7 +983,6 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_path *path;
- struct btrfs_root *root;
struct extent_buffer *eb;
u64 bytenr = 0, num_bytes = 0;
int ret, level;
@@ -1014,7 +1012,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
&bytenr, &num_bytes);
if (ret)
break;
- ret = walk_up_tree(root, path, &level);
+ ret = walk_up_tree(path, &level);
if (ret < 0)
break;
if (ret > 0) {
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4cf2eb67eba6..f0c3f00e97cb 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3268,7 +3268,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
nr++;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0);
+ btrfs_set_extent_delalloc(inode, page_start, page_end, 0, NULL,
+ 0);
set_page_dirty(page);
unlock_extent(&BTRFS_I(inode)->io_tree,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 3338407ef0f0..aab0194efe46 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -387,13 +387,6 @@ again:
WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
ptr = (unsigned long)(ref + 1);
- ret = btrfs_is_name_len_valid(leaf, path->slots[0], ptr,
- name_len);
- if (!ret) {
- err = -EIO;
- goto out;
- }
-
WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
*sequence = btrfs_root_ref_sequence(leaf, ref);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b2f871d80982..ec56f33feea9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -301,6 +301,11 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_put_ctx(struct scrub_ctx *sctx);
+static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+{
+ return page->recover &&
+ (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
@@ -1323,15 +1328,34 @@ nodatasum_case:
* could happen otherwise that a correct page would be
* overwritten by a bad one).
*/
- for (mirror_index = 0;
- mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].page_count > 0;
- mirror_index++) {
+ for (mirror_index = 0; ;mirror_index++) {
struct scrub_block *sblock_other;
if (mirror_index == failed_mirror_index)
continue;
- sblock_other = sblocks_for_recheck + mirror_index;
+
+ /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
+ if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
+ if (mirror_index >= BTRFS_MAX_MIRRORS)
+ break;
+ if (!sblocks_for_recheck[mirror_index].page_count)
+ break;
+
+ sblock_other = sblocks_for_recheck + mirror_index;
+ } else {
+ struct scrub_recover *r = sblock_bad->pagev[0]->recover;
+ int max_allowed = r->bbio->num_stripes -
+ r->bbio->num_tgtdevs;
+
+ if (mirror_index >= max_allowed)
+ break;
+ if (!sblocks_for_recheck[1].page_count)
+ break;
+
+ ASSERT(failed_mirror_index == 0);
+ sblock_other = sblocks_for_recheck + 1;
+ sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
+ }
/* build and submit the bios, check checksums */
scrub_recheck_block(fs_info, sblock_other, 0);
@@ -1666,49 +1690,32 @@ leave_nomem:
return 0;
}
-struct scrub_bio_ret {
- struct completion event;
- blk_status_t status;
-};
-
static void scrub_bio_wait_endio(struct bio *bio)
{
- struct scrub_bio_ret *ret = bio->bi_private;
-
- ret->status = bio->bi_status;
- complete(&ret->event);
-}
-
-static inline int scrub_is_page_on_raid56(struct scrub_page *page)
-{
- return page->recover &&
- (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ complete(bio->bi_private);
}
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct bio *bio,
struct scrub_page *page)
{
- struct scrub_bio_ret done;
+ DECLARE_COMPLETION_ONSTACK(done);
int ret;
+ int mirror_num;
- init_completion(&done.event);
- done.status = 0;
bio->bi_iter.bi_sector = page->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
+ mirror_num = page->sblock->pagev[0]->mirror_num;
ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
page->recover->map_length,
- page->mirror_num, 0);
+ mirror_num, 0);
if (ret)
return ret;
- wait_for_completion_io(&done.event);
- if (done.status)
- return -EIO;
-
- return 0;
+ wait_for_completion_io(&done);
+ return blk_status_to_errno(bio->bi_status);
}
/*
@@ -2535,7 +2542,7 @@ leave_nomem:
}
WARN_ON(sblock->page_count == 0);
- if (dev->missing) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
/*
* This case should only be hit for RAID 5/6 device replace. See
* the comment in scrub_missing_raid56_pages() for details.
@@ -2870,7 +2877,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
- if (dev->missing) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
scrub_parity_mark_sectors_error(sparity, logical, len);
return 0;
}
@@ -4112,12 +4119,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info, devid, NULL, NULL);
- if (!dev || (dev->missing && !is_dev_replace)) {
+ if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
+ !is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return -ENODEV;
}
- if (!is_dev_replace && !readonly && !dev->writeable) {
+ if (!is_dev_replace && !readonly &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
rcu_read_lock();
name = rcu_dereference(dev->name);
@@ -4128,14 +4137,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
}
mutex_lock(&fs_info->scrub_lock);
- if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return -EIO;
}
btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
- if (dev->scrub_device ||
+ if (dev->scrub_ctx ||
(!is_dev_replace &&
btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
@@ -4160,7 +4170,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return PTR_ERR(sctx);
}
sctx->readonly = readonly;
- dev->scrub_device = sctx;
+ dev->scrub_ctx = sctx;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/*
@@ -4195,7 +4205,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
memcpy(progress, &sctx->stat, sizeof(*progress));
mutex_lock(&fs_info->scrub_lock);
- dev->scrub_device = NULL;
+ dev->scrub_ctx = NULL;
scrub_workers_put(fs_info);
mutex_unlock(&fs_info->scrub_lock);
@@ -4252,16 +4262,16 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
struct scrub_ctx *sctx;
mutex_lock(&fs_info->scrub_lock);
- sctx = dev->scrub_device;
+ sctx = dev->scrub_ctx;
if (!sctx) {
mutex_unlock(&fs_info->scrub_lock);
return -ENOTCONN;
}
atomic_inc(&sctx->cancel_req);
- while (dev->scrub_device) {
+ while (dev->scrub_ctx) {
mutex_unlock(&fs_info->scrub_lock);
wait_event(fs_info->scrub_pause_wait,
- dev->scrub_device == NULL);
+ dev->scrub_ctx == NULL);
mutex_lock(&fs_info->scrub_lock);
}
mutex_unlock(&fs_info->scrub_lock);
@@ -4278,7 +4288,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info, devid, NULL, NULL);
if (dev)
- sctx = dev->scrub_device;
+ sctx = dev->scrub_ctx;
if (sctx)
memcpy(progress, &sctx->stat, sizeof(*progress));
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -4478,8 +4488,7 @@ static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
free_extent_map(em);
out_unlock:
- unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
- GFP_NOFS);
+ unlock_extent_cached(io_tree, lockstart, lockend, &cached_state);
return ret;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index c10e4c70f02d..f306c608dc28 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1059,12 +1059,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
}
}
- ret = btrfs_is_name_len_valid(eb, path->slots[0],
- (unsigned long)(di + 1), name_len + data_len);
- if (!ret) {
- ret = -EIO;
- goto out;
- }
if (name_len + data_len > buf_len) {
buf_len = name_len + data_len;
if (is_vmalloc_addr(buf)) {
@@ -3521,7 +3515,40 @@ out:
}
/*
- * Check if ino ino1 is an ancestor of inode ino2 in the given root.
+ * Check if inode ino2, or any of its ancestors, is inode ino1.
+ * Return 1 if true, 0 if false and < 0 on error.
+ */
+static int check_ino_in_path(struct btrfs_root *root,
+ const u64 ino1,
+ const u64 ino1_gen,
+ const u64 ino2,
+ const u64 ino2_gen,
+ struct fs_path *fs_path)
+{
+ u64 ino = ino2;
+
+ if (ino1 == ino2)
+ return ino1_gen == ino2_gen;
+
+ while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+ u64 parent;
+ u64 parent_gen;
+ int ret;
+
+ fs_path_reset(fs_path);
+ ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
+ if (ret < 0)
+ return ret;
+ if (parent == ino1)
+ return parent_gen == ino1_gen;
+ ino = parent;
+ }
+ return 0;
+}
+
+/*
+ * Check if inode ino1 is an ancestor of inode ino2 in the given root for any
+ * possible path (in case ino2 is not a directory and has multiple hard links).
* Return 1 if true, 0 if false and < 0 on error.
*/
static int is_ancestor(struct btrfs_root *root,
@@ -3530,36 +3557,91 @@ static int is_ancestor(struct btrfs_root *root,
const u64 ino2,
struct fs_path *fs_path)
{
- u64 ino = ino2;
- bool free_path = false;
+ bool free_fs_path = false;
int ret = 0;
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
if (!fs_path) {
fs_path = fs_path_alloc();
if (!fs_path)
return -ENOMEM;
- free_path = true;
+ free_fs_path = true;
}
- while (ino > BTRFS_FIRST_FREE_OBJECTID) {
- u64 parent;
- u64 parent_gen;
+ path = alloc_path_for_send();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
- fs_path_reset(fs_path);
- ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
- if (ret < 0) {
- if (ret == -ENOENT && ino == ino2)
- ret = 0;
- goto out;
+ key.objectid = ino2;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ u32 cur_offset = 0;
+ u32 item_size;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+ continue;
}
- if (parent == ino1) {
- ret = parent_gen == ino1_gen ? 1 : 0;
- goto out;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino2)
+ break;
+ if (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ item_size = btrfs_item_size_nr(leaf, slot);
+ while (cur_offset < item_size) {
+ u64 parent;
+ u64 parent_gen;
+
+ if (key.type == BTRFS_INODE_EXTREF_KEY) {
+ unsigned long ptr;
+ struct btrfs_inode_extref *extref;
+
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ extref = (struct btrfs_inode_extref *)
+ (ptr + cur_offset);
+ parent = btrfs_inode_extref_parent(leaf,
+ extref);
+ cur_offset += sizeof(*extref);
+ cur_offset += btrfs_inode_extref_name_len(leaf,
+ extref);
+ } else {
+ parent = key.offset;
+ cur_offset = item_size;
+ }
+
+ ret = get_inode_info(root, parent, NULL, &parent_gen,
+ NULL, NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ ret = check_ino_in_path(root, ino1, ino1_gen,
+ parent, parent_gen, fs_path);
+ if (ret)
+ goto out;
}
- ino = parent;
+ path->slots[0]++;
}
+ ret = 0;
out:
- if (free_path)
+ btrfs_free_path(path);
+ if (free_fs_path)
fs_path_free(fs_path);
return ret;
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 65af029559b5..6e71a2a78363 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -61,12 +61,21 @@
#include "tests/btrfs-tests.h"
#include "qgroup.h"
-#include "backref.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
static const struct super_operations btrfs_super_ops;
+
+/*
+ * Types for mounting the default subvolume and a subvolume explicitly
+ * requested by subvol=/path. That way the callchain is straightforward and we
+ * don't have to play tricks with the mount options and recursive calls to
+ * btrfs_mount.
+ *
+ * The new btrfs_root_fs_type also serves as a tag for the bdev_holder.
+ */
static struct file_system_type btrfs_fs_type;
+static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
@@ -98,30 +107,6 @@ const char *btrfs_decode_error(int errno)
return errstr;
}
-/* btrfs handle error by forcing the filesystem readonly */
-static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
-{
- struct super_block *sb = fs_info->sb;
-
- if (sb_rdonly(sb))
- return;
-
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
- sb->s_flags |= MS_RDONLY;
- btrfs_info(fs_info, "forced readonly");
- /*
- * Note that a running device replace operation is not
- * canceled here although there is no way to update
- * the progress. It would add the risk of a deadlock,
- * therefore the canceling is omitted. The only penalty
- * is that some I/O remains active until the procedure
- * completes. The next time when the filesystem is
- * mounted writeable again, the device replace
- * operation continues.
- */
- }
-}
-
/*
* __btrfs_handle_fs_error decodes expected errors from the caller and
* invokes the approciate error response.
@@ -137,7 +122,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
/*
* Special case: if the error is EROFS, and we're already
- * under MS_RDONLY, then it is safe here.
+ * under SB_RDONLY, then it is safe here.
*/
if (errno == -EROFS && sb_rdonly(sb))
return;
@@ -168,8 +153,23 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
/* Don't go through full error handling during mount */
- if (sb->s_flags & MS_BORN)
- btrfs_handle_error(fs_info);
+ if (!(sb->s_flags & SB_BORN))
+ return;
+
+ if (sb_rdonly(sb))
+ return;
+
+ /* btrfs handles errors by forcing the filesystem readonly */
+ sb->s_flags |= SB_RDONLY;
+ btrfs_info(fs_info, "forced readonly");
+ /*
+ * Note that a running device replace operation is not canceled here
+ * although there is no way to update the progress. It would add the
+ * risk of a deadlock, therefore the canceling is omitted. The only
+ * penalty is that some I/O remains active until the procedure
+ * completes. The next time when the filesystem is mounted writeable
+ * again, the device replace operation continues.
+ */
}
#ifdef CONFIG_PRINTK
@@ -405,7 +405,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags)
{
substring_t args[MAX_OPT_ARGS];
- char *p, *num, *orig = NULL;
+ char *p, *num;
u64 cache_gen;
int intarg;
int ret = 0;
@@ -428,16 +428,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
if (!options)
goto check;
- /*
- * strsep changes the string, duplicate it because parse_options
- * gets called twice
- */
- options = kstrdup(options, GFP_KERNEL);
- if (!options)
- return -ENOMEM;
-
- orig = options;
-
while ((p = strsep(&options, ",")) != NULL) {
int token;
if (!*p)
@@ -454,7 +444,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_subvolrootid:
case Opt_device:
/*
- * These are parsed by btrfs_parse_early_options
+ * These are parsed by btrfs_parse_subvol_options
+ * and btrfs_parse_early_options
* and can be happily ignored here.
*/
break;
@@ -507,9 +498,18 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
token == Opt_compress_force ||
strncmp(args[0].from, "zlib", 4) == 0) {
compress_type = "zlib";
+
info->compress_type = BTRFS_COMPRESS_ZLIB;
- info->compress_level =
- btrfs_compress_str2level(args[0].from);
+ info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
+ /*
+ * args[0] contains uninitialized data since
+ * for these tokens we don't expect any
+ * parameter.
+ */
+ if (token != Opt_compress &&
+ token != Opt_compress_force)
+ info->compress_level =
+ btrfs_compress_str2level(args[0].from);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -625,7 +625,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_acl:
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- info->sb->s_flags |= MS_POSIXACL;
+ info->sb->s_flags |= SB_POSIXACL;
break;
#else
btrfs_err(info, "support for ACL not compiled in!");
@@ -633,7 +633,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
goto out;
#endif
case Opt_noacl:
- info->sb->s_flags &= ~MS_POSIXACL;
+ info->sb->s_flags &= ~SB_POSIXACL;
break;
case Opt_notreelog:
btrfs_set_and_info(info, NOTREELOG,
@@ -851,7 +851,7 @@ check:
/*
* Extra check for current option against current flag
*/
- if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
+ if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) {
btrfs_err(info,
"nologreplay must be used with ro mount option");
ret = -EINVAL;
@@ -868,7 +868,6 @@ out:
btrfs_info(info, "disk space caching is enabled");
if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
btrfs_info(info, "using free space tree");
- kfree(orig);
return ret;
}
@@ -879,11 +878,60 @@ out:
* only when we need to allocate a new super block.
*/
static int btrfs_parse_early_options(const char *options, fmode_t flags,
- void *holder, char **subvol_name, u64 *subvol_objectid,
- struct btrfs_fs_devices **fs_devices)
+ void *holder, struct btrfs_fs_devices **fs_devices)
{
substring_t args[MAX_OPT_ARGS];
char *device_name, *opts, *orig, *p;
+ int error = 0;
+
+ if (!options)
+ return 0;
+
+ /*
+ * strsep changes the string, duplicate it because btrfs_parse_options
+ * gets called later
+ */
+ opts = kstrdup(options, GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+ orig = opts;
+
+ while ((p = strsep(&opts, ",")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ if (token == Opt_device) {
+ device_name = match_strdup(&args[0]);
+ if (!device_name) {
+ error = -ENOMEM;
+ goto out;
+ }
+ error = btrfs_scan_one_device(device_name,
+ flags, holder, fs_devices);
+ kfree(device_name);
+ if (error)
+ goto out;
+ }
+ }
+
+out:
+ kfree(orig);
+ return error;
+}
+
+/*
+ * Parse mount options that are related to subvolume id
+ *
+ * The value is later passed to mount_subvol()
+ */
+static int btrfs_parse_subvol_options(const char *options, fmode_t flags,
+ char **subvol_name, u64 *subvol_objectid)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *opts, *orig, *p;
char *num = NULL;
int error = 0;
@@ -891,8 +939,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
return 0;
/*
- * strsep changes the string, duplicate it because parse_options
- * gets called twice
+ * strsep changes the string, duplicate it because
+ * btrfs_parse_early_options gets called later
*/
opts = kstrdup(options, GFP_KERNEL);
if (!opts)
@@ -931,18 +979,6 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
case Opt_subvolrootid:
pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n");
break;
- case Opt_device:
- device_name = match_strdup(&args[0]);
- if (!device_name) {
- error = -ENOMEM;
- goto out;
- }
- error = btrfs_scan_one_device(device_name,
- flags, holder, fs_devices);
- kfree(device_name);
- if (error)
- goto out;
- break;
default:
break;
}
@@ -1147,7 +1183,7 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- sb->s_flags |= MS_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
#endif
sb->s_flags |= SB_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
@@ -1180,7 +1216,7 @@ static int btrfs_fill_super(struct super_block *sb,
}
cleancache_init_fs(sb);
- sb->s_flags |= MS_ACTIVE;
+ sb->s_flags |= SB_ACTIVE;
return 0;
fail_close:
@@ -1234,7 +1270,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
- char *compress_type;
+ const char *compress_type;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1250,12 +1286,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
num_online_cpus() + 2, 8))
seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
if (btrfs_test_opt(info, COMPRESS)) {
- if (info->compress_type == BTRFS_COMPRESS_ZLIB)
- compress_type = "zlib";
- else if (info->compress_type == BTRFS_COMPRESS_LZO)
- compress_type = "lzo";
- else
- compress_type = "zstd";
+ compress_type = btrfs_compress_type2str(info->compress_type);
if (btrfs_test_opt(info, FORCE_COMPRESS))
seq_printf(seq, ",compress-force=%s", compress_type);
else
@@ -1277,7 +1308,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(info, DISCARD))
seq_puts(seq, ",discard");
- if (!(info->sb->s_flags & MS_POSIXACL))
+ if (!(info->sb->s_flags & SB_POSIXACL))
seq_puts(seq, ",noacl");
if (btrfs_test_opt(info, SPACE_CACHE))
seq_puts(seq, ",space_cache");
@@ -1356,86 +1387,12 @@ static inline int is_subvolume_inode(struct inode *inode)
return 0;
}
-/*
- * This will add subvolid=0 to the argument string while removing any subvol=
- * and subvolid= arguments to make sure we get the top-level root for path
- * walking to the subvol we want.
- */
-static char *setup_root_args(char *args)
-{
- char *buf, *dst, *sep;
-
- if (!args)
- return kstrdup("subvolid=0", GFP_KERNEL);
-
- /* The worst case is that we add ",subvolid=0" to the end. */
- buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1,
- GFP_KERNEL);
- if (!buf)
- return NULL;
-
- while (1) {
- sep = strchrnul(args, ',');
- if (!strstarts(args, "subvol=") &&
- !strstarts(args, "subvolid=")) {
- memcpy(dst, args, sep - args);
- dst += sep - args;
- *dst++ = ',';
- }
- if (*sep)
- args = sep + 1;
- else
- break;
- }
- strcpy(dst, "subvolid=0");
-
- return buf;
-}
-
static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
- int flags, const char *device_name,
- char *data)
+ const char *device_name, struct vfsmount *mnt)
{
struct dentry *root;
- struct vfsmount *mnt = NULL;
- char *newargs;
int ret;
- newargs = setup_root_args(data);
- if (!newargs) {
- root = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs);
- if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) {
- if (flags & MS_RDONLY) {
- mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY,
- device_name, newargs);
- } else {
- mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY,
- device_name, newargs);
- if (IS_ERR(mnt)) {
- root = ERR_CAST(mnt);
- mnt = NULL;
- goto out;
- }
-
- down_write(&mnt->mnt_sb->s_umount);
- ret = btrfs_remount(mnt->mnt_sb, &flags, NULL);
- up_write(&mnt->mnt_sb->s_umount);
- if (ret < 0) {
- root = ERR_PTR(ret);
- goto out;
- }
- }
- }
- if (IS_ERR(mnt)) {
- root = ERR_CAST(mnt);
- mnt = NULL;
- goto out;
- }
-
if (!subvol_name) {
if (!subvol_objectid) {
ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
@@ -1491,7 +1448,6 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
out:
mntput(mnt);
- kfree(newargs);
kfree(subvol_name);
return root;
}
@@ -1549,11 +1505,11 @@ static int setup_security_options(struct btrfs_fs_info *fs_info,
/*
* Find a superblock for the given device / mount point.
*
- * Note: This is based on get_sb_bdev from fs/super.c with a few additions
- * for multiple device setup. Make sure to keep it in sync.
+ * Note: This is based on mount_bdev from fs/super.c with a few additions
+ * for multiple device setup. Make sure to keep it in sync.
*/
-static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
- const char *device_name, void *data)
+static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
+ int flags, const char *device_name, void *data)
{
struct block_device *bdev = NULL;
struct super_block *s;
@@ -1561,27 +1517,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
struct btrfs_fs_info *fs_info = NULL;
struct security_mnt_opts new_sec_opts;
fmode_t mode = FMODE_READ;
- char *subvol_name = NULL;
- u64 subvol_objectid = 0;
int error = 0;
- if (!(flags & MS_RDONLY))
+ if (!(flags & SB_RDONLY))
mode |= FMODE_WRITE;
error = btrfs_parse_early_options(data, mode, fs_type,
- &subvol_name, &subvol_objectid,
&fs_devices);
if (error) {
- kfree(subvol_name);
return ERR_PTR(error);
}
- if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
- /* mount_subvol() will free subvol_name. */
- return mount_subvol(subvol_name, subvol_objectid, flags,
- device_name, data);
- }
-
security_init_mnt_opts(&new_sec_opts);
if (data) {
error = parse_security_options(data, &new_sec_opts);
@@ -1619,13 +1565,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if (error)
goto error_fs_info;
- if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+ if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
error = -EACCES;
goto error_close_devices;
}
bdev = fs_devices->latest_bdev;
- s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC,
+ s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
fs_info);
if (IS_ERR(s)) {
error = PTR_ERR(s);
@@ -1635,7 +1581,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if (s->s_root) {
btrfs_close_devices(fs_devices);
free_fs_info(fs_info);
- if ((flags ^ s->s_flags) & MS_RDONLY)
+ if ((flags ^ s->s_flags) & SB_RDONLY)
error = -EBUSY;
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
@@ -1665,6 +1611,84 @@ error_sec_opts:
return ERR_PTR(error);
}
+/*
+ * Mount function which is called by VFS layer.
+ *
+ * In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
+ * which needs vfsmount* of device's root (/). This means device's root has to
+ * be mounted internally in any case.
+ *
+ * Operation flow:
+ * 1. Parse subvol id related options for later use in mount_subvol().
+ *
+ * 2. Mount device's root (/) by calling vfs_kern_mount().
+ *
+ * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
+ * first place. In order to avoid calling btrfs_mount() again, we use
+ * different file_system_type which is not registered to VFS by
+ * register_filesystem() (btrfs_root_fs_type). As a result,
+ * btrfs_mount_root() is called. The return value will be used by
+ * mount_subtree() in mount_subvol().
+ *
+ * 3. Call mount_subvol() to get the dentry of subvolume. Since there is
+ * "btrfs subvolume set-default", mount_subvol() is called always.
+ */
+static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
+		const char *device_name, void *data)
+{
+	struct vfsmount *mnt_root;
+	struct dentry *root;
+	fmode_t mode = FMODE_READ;
+	char *subvol_name = NULL;
+	u64 subvol_objectid = 0;
+	int error = 0;
+
+	if (!(flags & SB_RDONLY))
+		mode |= FMODE_WRITE;
+
+	error = btrfs_parse_subvol_options(data, mode,
+					  &subvol_name, &subvol_objectid);
+	if (error) {
+		kfree(subvol_name);
+		return ERR_PTR(error);
+	}
+
+	/* mount device's root (/) */
+	mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
+	if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
+		/*
+		 * -EBUSY: retry with the opposite ro/rw flag; if the caller
+		 * asked for rw, remount rw once the ro mount succeeded.
+		 */
+		if (flags & SB_RDONLY) {
+			mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
+				flags & ~SB_RDONLY, device_name, data);
+		} else {
+			mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
+				flags | SB_RDONLY, device_name, data);
+			if (IS_ERR(mnt_root)) {
+				root = ERR_CAST(mnt_root);
+				goto out_free_name;
+			}
+
+			down_write(&mnt_root->mnt_sb->s_umount);
+			error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
+			up_write(&mnt_root->mnt_sb->s_umount);
+			if (error < 0) {
+				root = ERR_PTR(error);
+				mntput(mnt_root);
+				goto out_free_name;
+			}
+		}
+	}
+	if (IS_ERR(mnt_root)) {
+		root = ERR_CAST(mnt_root);
+		goto out_free_name;
+	}
+
+	/* mount_subvol() will free subvol_name and mnt_root */
+	return mount_subvol(subvol_name, subvol_objectid, device_name, mnt_root);
+
+out_free_name:
+	/*
+	 * mount_subvol() was never reached, so subvol_name (possibly
+	 * allocated by btrfs_parse_subvol_options()) is still owned here.
+	 */
+	kfree(subvol_name);
+	return root;
+}
+
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
int new_pool_size, int old_pool_size)
{
@@ -1702,11 +1726,11 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
{
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
(!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
- (flags & MS_RDONLY))) {
+ (flags & SB_RDONLY))) {
/* wait for any defraggers to finish */
wait_event(fs_info->transaction_wait,
(atomic_read(&fs_info->defrag_running) == 0));
- if (flags & MS_RDONLY)
+ if (flags & SB_RDONLY)
sync_filesystem(fs_info->sb);
}
}
@@ -1766,10 +1790,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
btrfs_resize_thread_pool(fs_info,
fs_info->thread_pool_size, old_thread_pool_size);
- if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+ if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
- if (*flags & MS_RDONLY) {
+ if (*flags & SB_RDONLY) {
/*
* this also happens on 'umount -rf' or on shutdown, when
* the filesystem is busy.
@@ -1781,10 +1805,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
/* avoid complains from lockdep et al. */
up(&fs_info->uuid_tree_rescan_sem);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
/*
- * Setting MS_RDONLY will put the cleaner thread to
+ * Setting SB_RDONLY will put the cleaner thread to
* sleep at the next loop if it's already active.
* If it's already asleep, we'll leave unused block
* groups on disk until we're mounted read-write again
@@ -1811,7 +1835,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
}
- if (!btrfs_check_rw_degradable(fs_info)) {
+ if (!btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"too many missing devices, writeable remount is not allowed");
ret = -EACCES;
@@ -1856,7 +1880,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
}
}
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
}
@@ -1866,9 +1890,9 @@ out:
return 0;
restore:
- /* We've hit an error - don't reset MS_RDONLY */
+ /* We've hit an error - don't reset SB_RDONLY */
if (sb_rdonly(sb))
- old_flags |= MS_RDONLY;
+ old_flags |= SB_RDONLY;
sb->s_flags = old_flags;
fs_info->mount_opt = old_opts;
fs_info->compress_type = old_compress_type;
@@ -1963,8 +1987,10 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
- if (!device->in_fs_metadata || !device->bdev ||
- device->is_tgtdev_for_dev_replace)
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state) ||
+ !device->bdev ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
continue;
if (i >= nr_devices)
@@ -2165,6 +2191,15 @@ static struct file_system_type btrfs_fs_type = {
.kill_sb = btrfs_kill_super,
.fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
};
+
+/*
+ * Second file_system_type that is passed to vfs_kern_mount() from
+ * btrfs_mount(); its ->mount is btrfs_mount_root() so that btrfs_mount() is
+ * not entered again. It shares the name and kill_sb callback of
+ * btrfs_fs_type.
+ */
+static struct file_system_type btrfs_root_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "btrfs",
+ .mount = btrfs_mount_root,
+ .kill_sb = btrfs_kill_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+};
+
MODULE_ALIAS_FS("btrfs");
static int btrfs_control_open(struct inode *inode, struct file *file)
@@ -2198,11 +2233,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
ret = btrfs_scan_one_device(vol->name, FMODE_READ,
- &btrfs_fs_type, &fs_devices);
+ &btrfs_root_fs_type, &fs_devices);
break;
case BTRFS_IOC_DEVICES_READY:
ret = btrfs_scan_one_device(vol->name, FMODE_READ,
- &btrfs_fs_type, &fs_devices);
+ &btrfs_root_fs_type, &fs_devices);
if (ret)
break;
ret = !(fs_devices->num_devices == fs_devices->total_devices);
@@ -2260,7 +2295,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
while (cur_devices) {
head = &cur_devices->devices;
list_for_each_entry(dev, head, dev_list) {
- if (dev->missing)
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
continue;
if (!dev->name)
continue;
@@ -2315,7 +2350,7 @@ static struct miscdevice btrfs_misc = {
MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
MODULE_ALIAS("devname:btrfs-control");
-static int btrfs_interface_init(void)
+static int __init btrfs_interface_init(void)
{
return misc_register(&btrfs_misc);
}
@@ -2325,7 +2360,7 @@ static void btrfs_interface_exit(void)
misc_deregister(&btrfs_misc);
}
-static void btrfs_print_mod_info(void)
+static void __init btrfs_print_mod_info(void)
{
pr_info("Btrfs loaded, crc32c=%s"
#ifdef CONFIG_BTRFS_DEBUG
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a28bba801264..a8bafed931f4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -897,7 +897,7 @@ static int btrfs_init_debugfs(void)
return 0;
}
-int btrfs_init_sysfs(void)
+int __init btrfs_init_sysfs(void)
{
int ret;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index d3f25376a0f8..9786d8cd0aa6 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -277,6 +277,9 @@ int btrfs_run_sanity_tests(void)
goto out;
}
}
+ ret = btrfs_test_extent_map();
+ if (ret)
+ goto out;
out:
btrfs_destroy_test_fs();
return ret;
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 266f1e3d1784..bc0615bac3cc 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -33,6 +33,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize);
int btrfs_test_inodes(u32 sectorsize, u32 nodesize);
int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
+int btrfs_test_extent_map(void);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index d06b1c931d05..2e7f64a3b22b 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -114,7 +114,7 @@ static int test_find_delalloc(u32 sectorsize)
* |--- delalloc ---|
* |--- search ---|
*/
- set_extent_delalloc(&tmp, 0, sectorsize - 1, NULL);
+ set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -145,7 +145,7 @@ static int test_find_delalloc(u32 sectorsize)
test_msg("Couldn't find the locked page\n");
goto out_bits;
}
- set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, NULL);
+ set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -200,7 +200,7 @@ static int test_find_delalloc(u32 sectorsize)
*
* We are re-using our test_start from above since it works out well.
*/
- set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL);
+ set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
new file mode 100644
index 000000000000..70c993f01670
--- /dev/null
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright (C) 2017 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/types.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+
+/*
+ * Empty @em_tree: as long as the rb-tree is not empty, take its first extent
+ * map, remove it from the tree and hand it to free_extent_map().
+ *
+ * With CONFIG_BTRFS_DEBUG, an em whose refcount is not 1 after removal is
+ * reported as a leak and its refcount is forced to 1 before
+ * free_extent_map() is called.
+ */
+static void free_extent_map_tree(struct extent_map_tree *em_tree)
+{
+ struct extent_map *em;
+ struct rb_node *node;
+
+ while (!RB_EMPTY_ROOT(&em_tree->map)) {
+ node = rb_first(&em_tree->map);
+ em = rb_entry(node, struct extent_map, rb_node);
+ remove_extent_mapping(em_tree, em);
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (refcount_read(&em->refs) != 1) {
+ test_msg(
+"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d\n",
+ em->start, em->len, em->block_start,
+ em->block_len, refcount_read(&em->refs));
+
+ refcount_set(&em->refs, 1);
+ }
+#endif
+ free_extent_map(em);
+ }
+}
+
+/*
+ * Test scenario:
+ *
+ * Suppose that no extent map has been loaded into memory yet, there is a file
+ * extent [0, 16K), followed by another file extent [16K, 20K), two dio reads
+ * are entering btrfs_get_extent() concurrently, t1 is reading [8K, 16K), t2 is
+ * reading [0, 8K)
+ *
+ * t1 t2
+ * btrfs_get_extent() btrfs_get_extent()
+ * -> lookup_extent_mapping() ->lookup_extent_mapping()
+ * -> add_extent_mapping(0, 16K)
+ * -> return em
+ * ->add_extent_mapping(0, 16K)
+ * -> #handle -EEXIST
+ *
+ * The test pre-populates the tree with [0, 16K) and [16K, 20K), then calls
+ * btrfs_add_extent_mapping() for [0, 8K). A non-zero return, or a returned em
+ * that is not [0, 16K), is reported through test_msg().
+ */
+static void test_case_1(struct extent_map_tree *em_tree)
+{
+ struct extent_map *em;
+ u64 start = 0;
+ u64 len = SZ_8K;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em)
+ /* Skip the test on error. */
+ return;
+
+ /* Add [0, 16K) */
+ em->start = 0;
+ em->len = SZ_16K;
+ em->block_start = 0;
+ em->block_len = SZ_16K;
+ ret = add_extent_mapping(em_tree, em, 0);
+ ASSERT(ret == 0);
+ free_extent_map(em);
+
+ /* Add [16K, 20K) following [0, 16K) */
+ em = alloc_extent_map();
+ if (!em)
+ goto out;
+
+ em->start = SZ_16K;
+ em->len = SZ_4K;
+ em->block_start = SZ_32K; /* avoid merging */
+ em->block_len = SZ_4K;
+ ret = add_extent_mapping(em_tree, em, 0);
+ ASSERT(ret == 0);
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em)
+ goto out;
+
+ /* Add [0, 8K), should return [0, 16K) instead. */
+ em->start = start;
+ em->len = len;
+ em->block_start = start;
+ em->block_len = len;
+ ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len);
+ if (ret)
+ test_msg("case1 [%llu %llu]: ret %d\n", start, start + len, ret);
+ if (em &&
+ (em->start != 0 || extent_map_end(em) != SZ_16K ||
+ em->block_start != 0 || em->block_len != SZ_16K))
+ test_msg(
+"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n",
+ start, start + len, ret, em->start, em->len,
+ em->block_start, em->block_len);
+ free_extent_map(em);
+out:
+ /* Empty the tree and release the extent maps it still holds. */
+ free_extent_map_tree(em_tree);
+}
+
+/*
+ * Test scenario:
+ *
+ * Reading an inline extent ends up with -EEXIST, i.e. an inline extent is
+ * read, the page cache is discarded and the extent is read again.
+ *
+ * The test pre-populates the tree with an inline em [0, 1K) and a regular em
+ * [4K, 8K), then calls btrfs_add_extent_mapping() for the inline [0, 1K)
+ * again. A non-zero return, or a returned em that is not the inline
+ * [0, 1K) one, is reported through test_msg().
+ */
+static void test_case_2(struct extent_map_tree *em_tree)
+{
+ struct extent_map *em;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em)
+ /* Skip the test on error. */
+ return;
+
+ /* Add [0, 1K) */
+ em->start = 0;
+ em->len = SZ_1K;
+ em->block_start = EXTENT_MAP_INLINE;
+ em->block_len = (u64)-1;
+ ret = add_extent_mapping(em_tree, em, 0);
+ ASSERT(ret == 0);
+ free_extent_map(em);
+
+ /* Add [4K, 8K) following [0, 1K) */
+ em = alloc_extent_map();
+ if (!em)
+ goto out;
+
+ em->start = SZ_4K;
+ em->len = SZ_4K;
+ em->block_start = SZ_4K;
+ em->block_len = SZ_4K;
+ ret = add_extent_mapping(em_tree, em, 0);
+ ASSERT(ret == 0);
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em)
+ goto out;
+
+ /* Add the inline [0, 1K) again, should return the same range. */
+ em->start = 0;
+ em->len = SZ_1K;
+ em->block_start = EXTENT_MAP_INLINE;
+ em->block_len = (u64)-1;
+ ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len);
+ if (ret)
+ test_msg("case2 [0 1K]: ret %d\n", ret);
+ if (em &&
+ (em->start != 0 || extent_map_end(em) != SZ_1K ||
+ em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1))
+ test_msg(
+"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n",
+ ret, em->start, em->len, em->block_start,
+ em->block_len);
+ free_extent_map(em);
+out:
+ /* Empty the tree and release the extent maps it still holds. */
+ free_extent_map_tree(em_tree);
+}
+
+/*
+ * Helper for test_case_3: the tree holds only [4K, 8K); try to add an em
+ * describing [0, 16K) for a lookup of [@start, @start + 4K).
+ *
+ * A non-zero return from btrfs_add_extent_mapping(), or a returned em that
+ * does not cover the requested range or whose start/len differ from its
+ * block_start/block_len, is reported through test_msg().
+ */
+static void __test_case_3(struct extent_map_tree *em_tree, u64 start)
+{
+ struct extent_map *em;
+ u64 len = SZ_4K;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em)
+ /* Skip this test on error. */
+ return;
+
+ /* Add [4K, 8K) */
+ em->start = SZ_4K;
+ em->len = SZ_4K;
+ em->block_start = SZ_4K;
+ em->block_len = SZ_4K;
+ ret = add_extent_mapping(em_tree, em, 0);
+ ASSERT(ret == 0);
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em)
+ goto out;
+
+ /* Add [0, 16K), requested range is [start, start + len) */
+ em->start = 0;
+ em->len = SZ_16K;
+ em->block_start = 0;
+ em->block_len = SZ_16K;
+ ret = btrfs_add_extent_mapping(em_tree, &em, start, len);
+ if (ret)
+ test_msg("case3 [0x%llx 0x%llx): ret %d\n",
+ start, start + len, ret);
+ /*
+ * Since bytes within em are contiguous, em->block_start is identical to
+ * em->start.
+ */
+ if (em &&
+ (start < em->start || start + len > extent_map_end(em) ||
+ em->start != em->block_start || em->len != em->block_len))
+ test_msg(
+"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n",
+ start, start + len, ret, em->start, em->len,
+ em->block_start, em->block_len);
+ free_extent_map(em);
+out:
+ /* Empty the tree and release the extent maps it still holds. */
+ free_extent_map_tree(em_tree);
+}
+
+/*
+ * Test scenario:
+ *
+ * Suppose that no extent map has been loaded into memory yet.
+ * There is a file extent [0, 16K), two jobs are running concurrently
+ * against it, t1 is buffered writing to [4K, 8K) and t2 is doing dio
+ * read from [0, 4K) or [8K, 12K) or [12K, 16K).
+ *
+ * t1 goes ahead of t2 and adds em [4K, 8K) into tree.
+ *
+ * t1 t2
+ * cow_file_range() btrfs_get_extent()
+ * -> lookup_extent_mapping()
+ * -> add_extent_mapping()
+ * -> add_extent_mapping()
+ *
+ * Each of the three possible t2 ranges is exercised by one __test_case_3()
+ * call, identified by its start offset.
+ */
+static void test_case_3(struct extent_map_tree *em_tree)
+{
+ __test_case_3(em_tree, 0);
+ __test_case_3(em_tree, SZ_8K);
+ __test_case_3(em_tree, (12 * 1024ULL));
+}
+
+/*
+ * Helper for test_case_4: the tree holds [0, 8K) and [8K, 32K); try to add an
+ * em describing [0, 32K) for a lookup of [@start, @start + 4K).
+ *
+ * A non-zero return from btrfs_add_extent_mapping(), or a returned em that
+ * does not cover the requested range, is reported through test_msg().
+ */
+static void __test_case_4(struct extent_map_tree *em_tree, u64 start)
+{
+	struct extent_map *em;
+	u64 len = SZ_4K;
+	int ret;
+
+	em = alloc_extent_map();
+	if (!em)
+		/* Skip this test on error. */
+		return;
+
+	/* Add [0K, 8K) */
+	em->start = 0;
+	em->len = SZ_8K;
+	em->block_start = 0;
+	em->block_len = SZ_8K;
+	ret = add_extent_mapping(em_tree, em, 0);
+	ASSERT(ret == 0);
+	free_extent_map(em);
+
+	em = alloc_extent_map();
+	if (!em)
+		goto out;
+
+	/* Add [8K, 32K): starts at 8K and is 24K long */
+	em->start = SZ_8K;
+	em->len = 24 * 1024ULL;
+	em->block_start = SZ_16K; /* avoid merging */
+	em->block_len = 24 * 1024ULL;
+	ret = add_extent_mapping(em_tree, em, 0);
+	ASSERT(ret == 0);
+	free_extent_map(em);
+
+	em = alloc_extent_map();
+	if (!em)
+		goto out;
+	/* Add [0K, 32K), requested range is [start, start + len) */
+	em->start = 0;
+	em->len = SZ_32K;
+	em->block_start = 0;
+	em->block_len = SZ_32K;
+	ret = btrfs_add_extent_mapping(em_tree, &em, start, len);
+	/* The messages print the range end (start + len), as case3 does. */
+	if (ret)
+		test_msg("case4 [0x%llx 0x%llx): ret %d\n",
+			 start, start + len, ret);
+	if (em &&
+	    (start < em->start || start + len > extent_map_end(em)))
+		test_msg(
+"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n",
+			 start, start + len, ret, em->start, em->len,
+			 em->block_start, em->block_len);
+	free_extent_map(em);
+out:
+	/* Empty the tree and release the extent maps it still holds. */
+	free_extent_map_tree(em_tree);
+}
+
+/*
+ * Test scenario:
+ *
+ * Suppose that no extent map has been loaded into memory yet.
+ * There is a file extent [0, 32K), two jobs are running concurrently
+ * against it, t1 is doing dio write to [8K, 32K) and t2 is doing dio
+ * read from [0, 4K) or [4K, 8K).
+ *
+ * t1 goes ahead of t2 and splits em [0, 32K) to em [0K, 8K) and [8K, 32K).
+ *
+ * t1 t2
+ * btrfs_get_blocks_direct() btrfs_get_blocks_direct()
+ * -> btrfs_get_extent() -> btrfs_get_extent()
+ * -> lookup_extent_mapping()
+ * -> add_extent_mapping() -> lookup_extent_mapping()
+ * # load [0, 32K)
+ * -> btrfs_new_extent_direct()
+ * -> btrfs_drop_extent_cache()
+ * # split [0, 32K)
+ * -> add_extent_mapping()
+ * # add [8K, 32K)
+ * -> add_extent_mapping()
+ * # handle -EEXIST when adding
+ * # [0, 32K)
+ *
+ * Each of the two possible t2 ranges is exercised by one __test_case_4()
+ * call, identified by its start offset.
+ */
+static void test_case_4(struct extent_map_tree *em_tree)
+{
+ __test_case_4(em_tree, 0);
+ __test_case_4(em_tree, SZ_4K);
+}
+
+/*
+ * Entry point of the extent_map self-tests: allocates one extent_map_tree,
+ * runs test_case_1() through test_case_4() on it and frees it.
+ *
+ * Always returns 0: the cases report problems through test_msg() only, and a
+ * failed tree allocation skips the tests.
+ */
+int btrfs_test_extent_map(void)
+{
+	struct extent_map_tree *em_tree;
+
+	test_msg("Running extent_map tests\n");
+
+	em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL);
+	if (!em_tree)
+		/* Skip the test on error. */
+		return 0;
+
+	extent_map_tree_init(em_tree);
+
+	test_case_1(em_tree);
+	test_case_2(em_tree);
+	test_case_3(em_tree);
+	test_case_4(em_tree);
+
+	kfree(em_tree);
+	return 0;
+}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index f797642c013d..13420cd19ef0 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -288,10 +288,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_msg("Expected a hole, got %llu\n", em->block_start);
goto out;
}
- if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
- test_msg("Vacancy flag wasn't set properly\n");
- goto out;
- }
free_extent_map(em);
btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
@@ -968,7 +964,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
btrfs_test_inode_set_ops(inode);
/* [BTRFS_MAX_EXTENT_SIZE] */
- ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1,
+ ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0,
NULL, 0);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
@@ -984,7 +980,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
- NULL, 0);
+ 0, NULL, 0);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -1001,8 +997,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
EXTENT_DELALLOC | EXTENT_DIRTY |
- EXTENT_UPTODATE, 0, 0,
- NULL, GFP_KERNEL);
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
goto out;
@@ -1018,7 +1013,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1)
+ sectorsize - 1,
- NULL, 0);
+ 0, NULL, 0);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -1036,7 +1031,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = btrfs_set_extent_delalloc(inode,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize,
(BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
- NULL, 0);
+ 0, NULL, 0);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -1053,7 +1048,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
*/
ret = btrfs_set_extent_delalloc(inode,
BTRFS_MAX_EXTENT_SIZE + sectorsize,
- BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0);
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -1070,8 +1065,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_UPTODATE, 0, 0,
- NULL, GFP_KERNEL);
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
goto out;
@@ -1089,7 +1083,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
*/
ret = btrfs_set_extent_delalloc(inode,
BTRFS_MAX_EXTENT_SIZE + sectorsize,
- BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0);
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -1104,8 +1098,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/* Empty */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_UPTODATE, 0, 0,
- NULL, GFP_KERNEL);
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
goto out;
@@ -1121,8 +1114,7 @@ out:
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_UPTODATE, 0, 0,
- NULL, GFP_KERNEL);
+ EXTENT_UPTODATE, 0, 0, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
@@ -1134,7 +1126,6 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
int ret;
set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
- set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
test_msg("Running btrfs_get_extent tests\n");
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5a8c2649af2f..04f07144b45c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -495,8 +495,8 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
if (current->journal_info) {
WARN_ON(type & TRANS_EXTWRITERS);
h = current->journal_info;
- h->use_count++;
- WARN_ON(h->use_count > 2);
+ refcount_inc(&h->use_count);
+ WARN_ON(refcount_read(&h->use_count) > 2);
h->orig_rsv = h->block_rsv;
h->block_rsv = NULL;
goto got_it;
@@ -567,7 +567,7 @@ again:
h->transid = cur_trans->transid;
h->transaction = cur_trans;
h->root = root;
- h->use_count = 1;
+ refcount_set(&h->use_count, 1);
h->fs_info = root->fs_info;
h->type = type;
@@ -837,8 +837,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
int err = 0;
int must_run_delayed_refs = 0;
- if (trans->use_count > 1) {
- trans->use_count--;
+ if (refcount_read(&trans->use_count) > 1) {
+ refcount_dec(&trans->use_count);
trans->block_rsv = trans->orig_rsv;
return 0;
}
@@ -1016,8 +1016,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
* it's safe to do it (through clear_btree_io_tree()).
*/
err = clear_extent_bit(dirty_pages, start, end,
- EXTENT_NEED_WAIT,
- 0, 0, &cached_state, GFP_NOFS);
+ EXTENT_NEED_WAIT, 0, 0, &cached_state);
if (err == -ENOMEM)
err = 0;
if (!err)
@@ -1869,7 +1868,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
struct btrfs_transaction *cur_trans = trans->transaction;
DEFINE_WAIT(wait);
- WARN_ON(trans->use_count > 1);
+ WARN_ON(refcount_read(&trans->use_count) > 1);
btrfs_abort_transaction(trans, err);
@@ -2266,16 +2265,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
}
ret = write_all_supers(fs_info, 0);
- if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
- goto scrub_continue;
- }
-
/*
* the super is written, we can safely allow the tree-loggers
* to go about their business
*/
mutex_unlock(&fs_info->tree_log_mutex);
+ if (ret)
+ goto scrub_continue;
btrfs_finish_extent_commit(trans, fs_info);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index c55e44560103..6beee072b1bd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -58,6 +58,7 @@ struct btrfs_transaction {
/* Be protected by fs_info->trans_lock when we want to change it. */
enum btrfs_trans_state state;
+ int aborted;
struct list_head list;
struct extent_io_tree dirty_pages;
unsigned long start_time;
@@ -70,7 +71,6 @@ struct btrfs_transaction {
struct list_head dirty_bgs;
struct list_head io_bgs;
struct list_head dropped_roots;
- u64 num_dirty_bgs;
/*
* we need to make sure block group deletion doesn't race with
@@ -79,11 +79,11 @@ struct btrfs_transaction {
*/
struct mutex cache_write_mutex;
spinlock_t dirty_bgs_lock;
+ unsigned int num_dirty_bgs;
/* Protected by spin lock fs_info->unused_bgs_lock. */
struct list_head deleted_bgs;
spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
- int aborted;
struct btrfs_fs_info *fs_info;
};
@@ -111,20 +111,19 @@ struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
u64 chunk_bytes_reserved;
- unsigned long use_count;
- unsigned long blocks_reserved;
unsigned long delayed_ref_updates;
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
+ refcount_t use_count;
+ unsigned int type;
short aborted;
- short adding_csums;
+ bool adding_csums;
bool allocating_chunk;
bool can_flush_pending_bgs;
bool reloc_reserved;
bool sync;
bool dirty;
- unsigned int type;
struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
struct list_head new_bgs;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 114fc5f0ecc5..c3c8d48f6618 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -30,6 +30,7 @@
#include "tree-checker.h"
#include "disk-io.h"
#include "compression.h"
+#include "hash.h"
/*
* Error message should follow the following format:
@@ -223,6 +224,142 @@ static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf,
}
/*
+ * Customized reporter for dir_item, the only important new info is
+ * key->objectid, which represents the inode number.
+ *
+ * Logs at critical level, prefixing the caller's printf-style message with
+ * the root objectid, block bytenr, slot and inode number.
+ */
+__printf(4, 5)
+static void dir_item_err(const struct btrfs_root *root,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(root->fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid,
+ btrfs_header_bytenr(eb), slot, key.objectid, &vaf);
+ va_end(args);
+}
+
+/*
+ * Validate every btrfs_dir_item packed into the item at @slot of @leaf.
+ *
+ * Walks the item from offset 0 up to its size. For each entry it checks the
+ * dir type, the name/data lengths, that the entry stays inside the item and,
+ * for DIR_ITEM/XATTR_ITEM keys, that key->offset equals the hash of the name.
+ *
+ * Returns 0 if all entries pass, or -EUCLEAN (after reporting through
+ * dir_item_err()) on the first violation.
+ */
+static int check_dir_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_dir_item *di;
+ u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 cur = 0;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ while (cur < item_size) {
+ u32 name_len;
+ u32 data_len;
+ u32 max_name_len;
+ u32 total_size;
+ u32 name_hash;
+ u8 dir_type;
+
+ /* header itself should not cross item boundary */
+ if (cur + sizeof(*di) > item_size) {
+ dir_item_err(root, leaf, slot,
+ "dir item header crosses item boundary, have %zu boundary %u",
+ cur + sizeof(*di), item_size);
+ return -EUCLEAN;
+ }
+
+ /* dir type must be below BTRFS_FT_MAX and agree with the key type */
+ dir_type = btrfs_dir_type(leaf, di);
+ if (dir_type >= BTRFS_FT_MAX) {
+ dir_item_err(root, leaf, slot,
+ "invalid dir item type, have %u expect [0, %u)",
+ dir_type, BTRFS_FT_MAX);
+ return -EUCLEAN;
+ }
+
+ if (key->type == BTRFS_XATTR_ITEM_KEY &&
+ dir_type != BTRFS_FT_XATTR) {
+ dir_item_err(root, leaf, slot,
+ "invalid dir item type for XATTR key, have %u expect %u",
+ dir_type, BTRFS_FT_XATTR);
+ return -EUCLEAN;
+ }
+ if (dir_type == BTRFS_FT_XATTR &&
+ key->type != BTRFS_XATTR_ITEM_KEY) {
+ dir_item_err(root, leaf, slot,
+ "xattr dir type found for non-XATTR key");
+ return -EUCLEAN;
+ }
+ if (dir_type == BTRFS_FT_XATTR)
+ max_name_len = XATTR_NAME_MAX;
+ else
+ max_name_len = BTRFS_NAME_LEN;
+
+ /* Name/data length check */
+ name_len = btrfs_dir_name_len(leaf, di);
+ data_len = btrfs_dir_data_len(leaf, di);
+ if (name_len > max_name_len) {
+ dir_item_err(root, leaf, slot,
+ "dir item name len too long, have %u max %u",
+ name_len, max_name_len);
+ return -EUCLEAN;
+ }
+ if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
+ dir_item_err(root, leaf, slot,
+ "dir item name and data len too long, have %u max %u",
+ name_len + data_len,
+ BTRFS_MAX_XATTR_SIZE(root->fs_info));
+ return -EUCLEAN;
+ }
+
+ if (data_len && dir_type != BTRFS_FT_XATTR) {
+ dir_item_err(root, leaf, slot,
+ "dir item with invalid data len, have %u expect 0",
+ data_len);
+ return -EUCLEAN;
+ }
+
+ total_size = sizeof(*di) + name_len + data_len;
+
+ /* header and name/data should not cross item boundary */
+ if (cur + total_size > item_size) {
+ dir_item_err(root, leaf, slot,
+ "dir item data crosses item boundary, have %u boundary %u",
+ cur + total_size, item_size);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Special check for XATTR/DIR_ITEM, as key->offset is name
+ * hash, should match its name
+ */
+ if (key->type == BTRFS_DIR_ITEM_KEY ||
+ key->type == BTRFS_XATTR_ITEM_KEY) {
+ char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)];
+
+ read_extent_buffer(leaf, namebuf,
+ (unsigned long)(di + 1), name_len);
+ name_hash = btrfs_name_hash(namebuf, name_len);
+ if (key->offset != name_hash) {
+ dir_item_err(root, leaf, slot,
+ "name hash mismatch with key, have 0x%016x expect 0x%016llx",
+ name_hash, key->offset);
+ return -EUCLEAN;
+ }
+ }
+ cur += total_size;
+ di = (struct btrfs_dir_item *)((void *)di + total_size);
+ }
+ return 0;
+}
+
+/*
* Common point to switch the item-specific validation.
*/
static int check_leaf_item(struct btrfs_root *root,
@@ -238,11 +375,17 @@ static int check_leaf_item(struct btrfs_root *root,
case BTRFS_EXTENT_CSUM_KEY:
ret = check_csum_item(root, leaf, key, slot);
break;
+ case BTRFS_DIR_ITEM_KEY:
+ case BTRFS_DIR_INDEX_KEY:
+ case BTRFS_XATTR_ITEM_KEY:
+ ret = check_dir_item(root, leaf, key, slot);
+ break;
}
return ret;
}
-int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf)
+static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
+ bool check_item_data)
{
struct btrfs_fs_info *fs_info = root->fs_info;
/* No valid key type is 0, so all key should be larger than this key */
@@ -361,10 +504,15 @@ int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf)
return -EUCLEAN;
}
- /* Check if the item size and content meet other criteria */
- ret = check_leaf_item(root, leaf, &key, slot);
- if (ret < 0)
- return ret;
+ if (check_item_data) {
+ /*
+ * Check if the item size and content meet other
+ * criteria
+ */
+ ret = check_leaf_item(root, leaf, &key, slot);
+ if (ret < 0)
+ return ret;
+ }
prev_key.objectid = key.objectid;
prev_key.type = key.type;
@@ -374,6 +522,17 @@ int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf)
return 0;
}
+/* Run check_leaf() with check_item_data set, so item contents are checked too. */
+int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf)
+{
+ return check_leaf(root, leaf, true);
+}
+
+/* Run check_leaf() with check_item_data clear, skipping check_leaf_item(). */
+int btrfs_check_leaf_relaxed(struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ return check_leaf(root, leaf, false);
+}
+
int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node)
{
unsigned long nr = btrfs_header_nritems(node);
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 96c486e95d70..3d53e8d6fda0 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -20,7 +20,19 @@
#include "ctree.h"
#include "extent_io.h"
-int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf);
+/*
+ * Comprehensive leaf checker.
+ * Will check not only the item pointers, but also every possible member
+ * in item data.
+ */
+int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf);
+
+/*
+ * Less strict leaf checker.
+ * Will only check item pointers, not reading item data.
+ */
+int btrfs_check_leaf_relaxed(struct btrfs_root *root,
+ struct extent_buffer *leaf);
int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index aa7c71cff575..afadaadab18e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
+#include <linux/iversion.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
@@ -1173,19 +1174,15 @@ next:
return 0;
}
-static int extref_get_fields(struct extent_buffer *eb, int slot,
- unsigned long ref_ptr, u32 *namelen, char **name,
- u64 *index, u64 *parent_objectid)
+static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+ u32 *namelen, char **name, u64 *index,
+ u64 *parent_objectid)
{
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)ref_ptr;
*namelen = btrfs_inode_extref_name_len(eb, extref);
- if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name,
- *namelen))
- return -EIO;
-
*name = kmalloc(*namelen, GFP_NOFS);
if (*name == NULL)
return -ENOMEM;
@@ -1200,19 +1197,14 @@ static int extref_get_fields(struct extent_buffer *eb, int slot,
return 0;
}
-static int ref_get_fields(struct extent_buffer *eb, int slot,
- unsigned long ref_ptr, u32 *namelen, char **name,
- u64 *index)
+static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+ u32 *namelen, char **name, u64 *index)
{
struct btrfs_inode_ref *ref;
ref = (struct btrfs_inode_ref *)ref_ptr;
*namelen = btrfs_inode_ref_name_len(eb, ref);
- if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1),
- *namelen))
- return -EIO;
-
*name = kmalloc(*namelen, GFP_NOFS);
if (*name == NULL)
return -ENOMEM;
@@ -1287,8 +1279,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
while (ref_ptr < ref_end) {
if (log_ref_ver) {
- ret = extref_get_fields(eb, slot, ref_ptr, &namelen,
- &name, &ref_index, &parent_objectid);
+ ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+ &ref_index, &parent_objectid);
/*
* parent object can change from one array
* item to another.
@@ -1300,8 +1292,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
} else {
- ret = ref_get_fields(eb, slot, ref_ptr, &namelen,
- &name, &ref_index);
+ ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
+ &ref_index);
}
if (ret)
goto out;
@@ -1835,7 +1827,6 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
u32 item_size = btrfs_item_size_nr(eb, slot);
struct btrfs_dir_item *di;
@@ -1848,8 +1839,6 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
- if (verify_dir_item(fs_info, eb, slot, di))
- return -EIO;
name_len = btrfs_dir_name_len(eb, di);
ret = replay_one_name(trans, root, path, eb, di, key);
if (ret < 0)
@@ -2024,11 +2013,6 @@ again:
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
- if (verify_dir_item(fs_info, eb, slot, di)) {
- ret = -EIO;
- goto out;
- }
-
name_len = btrfs_dir_name_len(eb, di);
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
@@ -2109,7 +2093,6 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const u64 ino)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key search_key;
struct btrfs_path *log_path;
int i;
@@ -2151,11 +2134,6 @@ process_leaf:
u32 this_len = sizeof(*di) + name_len + data_len;
char *name;
- ret = verify_dir_item(fs_info, path->nodes[0], i, di);
- if (ret) {
- ret = -EIO;
- goto out;
- }
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
ret = -ENOMEM;
@@ -3609,7 +3587,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
&token);
- btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+ btrfs_set_token_inode_sequence(leaf, item,
+ inode_peek_iversion(inode), &token);
btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
@@ -4102,7 +4081,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
if (ordered_io_err) {
ctx->io_err = -EIO;
- return 0;
+ return ctx->io_err;
}
btrfs_init_map_token(&token);
@@ -4572,12 +4551,6 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
this_len = sizeof(*extref) + this_name_len;
}
- ret = btrfs_is_name_len_valid(eb, slot, name_ptr,
- this_name_len);
- if (!ret) {
- ret = -EIO;
- goto out;
- }
if (this_name_len > name_len) {
char *new_name;
@@ -5432,11 +5405,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct dentry *parent,
const loff_t start,
const loff_t end,
- int exists_only,
+ int inode_only,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
struct dentry *old_parent = NULL;
int ret = 0;
@@ -5602,7 +5574,7 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
int ret;
ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)),
- parent, start, end, 0, ctx);
+ parent, start, end, LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -5865,6 +5837,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
return 0;
return btrfs_log_inode_parent(trans, root, inode, parent, 0,
- LLONG_MAX, 1, NULL);
+ LLONG_MAX, LOG_INODE_EXISTS, NULL);
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f1ecb938ba4d..b5036bd69e6a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -145,6 +145,71 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
struct btrfs_bio **bbio_ret,
int mirror_num, int need_raid_map);
+/*
+ * Device locking
+ * ==============
+ *
+ * There are several mutexes that protect manipulation of devices and low-level
+ * structures like chunks but not block groups, extents or files
+ *
+ * uuid_mutex (global lock)
+ * ------------------------
+ * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
+ * the SCAN_DEV ioctl registration or from mount either implicitly (the first
+ * device) or requested by the device= mount option
+ *
+ * the mutex can be very coarse and can cover long-running operations
+ *
+ * protects: updates to fs_devices counters like missing devices, rw devices,
+ * seeding, structure cloning, opening/closing devices at mount/umount time
+ *
+ * global::fs_devs - add, remove, updates to the global list
+ *
+ * does not protect: manipulation of the fs_devices::devices list!
+ *
+ * btrfs_device::name - renames (write side), read is RCU
+ *
+ * fs_devices::device_list_mutex (per-fs, with RCU)
+ * ------------------------------------------------
+ * protects updates to fs_devices::devices, i.e. adding and deleting
+ *
+ * simple list traversal with read-only actions can be done with RCU protection
+ *
+ * may be used to exclude some operations from running concurrently without any
+ * modifications to the list (see write_all_supers)
+ *
+ * volume_mutex
+ * ------------
+ * coarse lock owned by a mounted filesystem; used to exclude some operations
+ * that cannot run in parallel and affect the higher-level properties of the
+ * filesystem like: device add/delete/resize/replace, or balance
+ *
+ * balance_mutex
+ * -------------
+ * protects balance structures (status, state) and context accessed from
+ * several places (internally, ioctl)
+ *
+ * chunk_mutex
+ * -----------
+ * protects chunks, adding or removing during allocation, trim or when a new
+ * device is added/removed
+ *
+ * cleaner_mutex
+ * -------------
+ * a big lock that is held by the cleaner thread and prevents running subvolume
+ * cleaning together with relocation or delayed iputs
+ *
+ *
+ * Lock nesting
+ * ============
+ *
+ * uuid_mutex
+ * volume_mutex
+ * device_list_mutex
+ * chunk_mutex
+ * balance_mutex
+ */
+
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
@@ -180,6 +245,13 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
return fs_devs;
}
+static void free_device(struct btrfs_device *device)
+{
+ rcu_string_free(device->name);
+ bio_put(device->flush_bio);
+ kfree(device);
+}
+
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device;
@@ -188,8 +260,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
device = list_entry(fs_devices->devices.next,
struct btrfs_device, dev_list);
list_del(&device->dev_list);
- rcu_string_free(device->name);
- kfree(device);
+ free_device(device);
}
kfree(fs_devices);
}
@@ -219,6 +290,11 @@ void btrfs_cleanup_fs_uuids(void)
}
}
+/*
+ * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
+ * Returned struct is not linked onto any lists and must be destroyed using
+ * free_device.
+ */
static struct btrfs_device *__alloc_device(void)
{
struct btrfs_device *dev;
@@ -236,7 +312,6 @@ static struct btrfs_device *__alloc_device(void)
kfree(dev);
return ERR_PTR(-ENOMEM);
}
- bio_get(dev->flush_bio);
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
@@ -244,7 +319,6 @@ static struct btrfs_device *__alloc_device(void)
spin_lock_init(&dev->io_lock);
- spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
@@ -530,45 +604,42 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
-
-static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
+/*
+ * Search for and remove all stale devices (devices which are not mounted).
+ * When both inputs are NULL, it will search and release all stale devices.
+ * path: Optional. When provided, it will release only the unmounted devices
+ * matching this path.
+ * skip_dev: Optional. Will skip this device when searching for the stale
+ * devices.
+ */
+static void btrfs_free_stale_devices(const char *path,
+ struct btrfs_device *skip_dev)
{
- struct btrfs_fs_devices *fs_devs;
- struct btrfs_device *dev;
+ struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
+ struct btrfs_device *dev, *tmp_dev;
- if (!cur_dev->name)
- return;
-
- list_for_each_entry(fs_devs, &fs_uuids, list) {
- int del = 1;
+ list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {
if (fs_devs->opened)
continue;
- if (fs_devs->seeding)
- continue;
- list_for_each_entry(dev, &fs_devs->devices, dev_list) {
+ list_for_each_entry_safe(dev, tmp_dev,
+ &fs_devs->devices, dev_list) {
+ int not_found = 0;
- if (dev == cur_dev)
+ if (skip_dev && skip_dev == dev)
continue;
- if (!dev->name)
+ if (path && !dev->name)
continue;
- /*
- * Todo: This won't be enough. What if the same device
- * comes back (with new uuid and) with its mapper path?
- * But for now, this does help as mostly an admin will
- * either use mapper or non mapper path throughout.
- */
rcu_read_lock();
- del = strcmp(rcu_str_deref(dev->name),
- rcu_str_deref(cur_dev->name));
+ if (path)
+ not_found = strcmp(rcu_str_deref(dev->name),
+ path);
rcu_read_unlock();
- if (!del)
- break;
- }
+ if (not_found)
+ continue;
- if (!del) {
/* delete the stale device */
if (fs_devs->num_devices == 1) {
btrfs_sysfs_remove_fsid(fs_devs);
@@ -577,37 +648,99 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
} else {
fs_devs->num_devices--;
list_del(&dev->dev_list);
- rcu_string_free(dev->name);
- kfree(dev);
+ free_device(dev);
}
- break;
}
}
}
+static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device *device, fmode_t flags,
+ void *holder)
+{
+ struct request_queue *q;
+ struct block_device *bdev;
+ struct buffer_head *bh;
+ struct btrfs_super_block *disk_super;
+ u64 devid;
+ int ret;
+
+ if (device->bdev)
+ return -EINVAL;
+ if (!device->name)
+ return -EINVAL;
+
+ ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+ &bdev, &bh);
+ if (ret)
+ return ret;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = btrfs_stack_device_id(&disk_super->dev_item);
+ if (devid != device->devid)
+ goto error_brelse;
+
+ if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
+ goto error_brelse;
+
+ device->generation = btrfs_super_generation(disk_super);
+
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ fs_devices->seeding = 1;
+ } else {
+ if (bdev_read_only(bdev))
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ else
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ }
+
+ q = bdev_get_queue(bdev);
+ if (!blk_queue_nonrot(q))
+ fs_devices->rotating = 1;
+
+ device->bdev = bdev;
+ clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ device->mode = flags;
+
+ fs_devices->open_devices++;
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
+ fs_devices->rw_devices++;
+ list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
+ }
+ brelse(bh);
+
+ return 0;
+
+error_brelse:
+ brelse(bh);
+ blkdev_put(bdev, flags);
+
+ return -EINVAL;
+}
+
/*
* Add new device to list of registered devices
*
* Returns:
- * 1 - first time device is seen
- * 0 - device already known
- * < 0 - error
+ * device pointer which was just added or updated when successful
+ * error pointer when failed
*/
-static noinline int device_list_add(const char *path,
- struct btrfs_super_block *disk_super,
- u64 devid, struct btrfs_fs_devices **fs_devices_ret)
+static noinline struct btrfs_device *device_list_add(const char *path,
+ struct btrfs_super_block *disk_super)
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices;
struct rcu_string *name;
- int ret = 0;
u64 found_transid = btrfs_super_generation(disk_super);
+ u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
fs_devices = alloc_fs_devices(disk_super->fsid);
if (IS_ERR(fs_devices))
- return PTR_ERR(fs_devices);
+ return ERR_CAST(fs_devices);
list_add(&fs_devices->list, &fs_uuids);
@@ -619,19 +752,19 @@ static noinline int device_list_add(const char *path,
if (!device) {
if (fs_devices->opened)
- return -EBUSY;
+ return ERR_PTR(-EBUSY);
device = btrfs_alloc_device(NULL, &devid,
disk_super->dev_item.uuid);
if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
- return PTR_ERR(device);
+ return device;
}
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
- kfree(device);
- return -ENOMEM;
+ free_device(device);
+ return ERR_PTR(-ENOMEM);
}
rcu_assign_pointer(device->name, name);
@@ -640,8 +773,16 @@ static noinline int device_list_add(const char *path,
fs_devices->num_devices++;
mutex_unlock(&fs_devices->device_list_mutex);
- ret = 1;
device->fs_devices = fs_devices;
+ btrfs_free_stale_devices(path, device);
+
+ if (disk_super->label[0])
+ pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
+ disk_super->label, devid, found_transid, path);
+ else
+ pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
+ disk_super->fsid, devid, found_transid, path);
+
} else if (!device->name || strcmp(device->name->str, path)) {
/*
* When FS is already mounted.
@@ -677,17 +818,17 @@ static noinline int device_list_add(const char *path,
* with larger generation number or the last-in if
* generation are equal.
*/
- return -EEXIST;
+ return ERR_PTR(-EEXIST);
}
name = rcu_string_strdup(path, GFP_NOFS);
if (!name)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
rcu_string_free(device->name);
rcu_assign_pointer(device->name, name);
- if (device->missing) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
fs_devices->missing_devices--;
- device->missing = 0;
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
}
@@ -700,16 +841,9 @@ static noinline int device_list_add(const char *path,
if (!fs_devices->opened)
device->generation = found_transid;
- /*
- * if there is new btrfs on an already registered device,
- * then remove the stale device entry.
- */
- if (ret > 0)
- btrfs_free_stale_device(device);
-
- *fs_devices_ret = fs_devices;
+ fs_devices->total_devices = btrfs_super_num_devices(disk_super);
- return ret;
+ return device;
}
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
@@ -742,7 +876,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
name = rcu_string_strdup(orig_dev->name->str,
GFP_KERNEL);
if (!name) {
- kfree(device);
+ free_device(device);
goto error;
}
rcu_assign_pointer(device->name, name);
@@ -769,10 +903,12 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
- if (device->in_fs_metadata) {
- if (!device->is_tgtdev_for_dev_replace &&
- (!latest_dev ||
- device->generation > latest_dev->generation)) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state)) {
+ if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+ &device->dev_state) &&
+ (!latest_dev ||
+ device->generation > latest_dev->generation)) {
latest_dev = device;
}
continue;
@@ -789,7 +925,8 @@ again:
* not, which means whether this device is
* used or whether it should be removed.
*/
- if (step == 0 || device->is_tgtdev_for_dev_replace) {
+ if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+ &device->dev_state)) {
continue;
}
}
@@ -798,16 +935,16 @@ again:
device->bdev = NULL;
fs_devices->open_devices--;
}
- if (device->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
- device->writeable = 0;
- if (!device->is_tgtdev_for_dev_replace)
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+ &device->dev_state))
fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
- rcu_string_free(device->name);
- kfree(device);
+ free_device(device);
}
if (fs_devices->seed) {
@@ -820,35 +957,25 @@ again:
mutex_unlock(&uuid_mutex);
}
-static void __free_device(struct work_struct *work)
-{
- struct btrfs_device *device;
-
- device = container_of(work, struct btrfs_device, rcu_work);
- rcu_string_free(device->name);
- bio_put(device->flush_bio);
- kfree(device);
-}
-
-static void free_device(struct rcu_head *head)
+static void free_device_rcu(struct rcu_head *head)
{
struct btrfs_device *device;
device = container_of(head, struct btrfs_device, rcu);
-
- INIT_WORK(&device->rcu_work, __free_device);
- schedule_work(&device->rcu_work);
+ free_device(device);
}
static void btrfs_close_bdev(struct btrfs_device *device)
{
- if (device->bdev && device->writeable) {
+ if (!device->bdev)
+ return;
+
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
sync_blockdev(device->bdev);
invalidate_bdev(device->bdev);
}
- if (device->bdev)
- blkdev_put(device->bdev, device->mode);
+ blkdev_put(device->bdev, device->mode);
}
static void btrfs_prepare_close_one_device(struct btrfs_device *device)
@@ -860,13 +987,13 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device)
if (device->bdev)
fs_devices->open_devices--;
- if (device->writeable &&
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
list_del_init(&device->dev_alloc_list);
fs_devices->rw_devices--;
}
- if (device->missing)
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
fs_devices->missing_devices--;
new_device = btrfs_alloc_device(NULL, &device->devid,
@@ -912,7 +1039,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
struct btrfs_device, dev_list);
list_del(&device->dev_list);
btrfs_close_bdev(device);
- call_rcu(&device->rcu, free_device);
+ call_rcu(&device->rcu, free_device_rcu);
}
WARN_ON(fs_devices->open_devices);
@@ -942,93 +1069,32 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
__btrfs_close_devices(fs_devices);
free_fs_devices(fs_devices);
}
- /*
- * Wait for rcu kworkers under __btrfs_close_devices
- * to finish all blkdev_puts so device is really
- * free when umount is done.
- */
- rcu_barrier();
return ret;
}
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
- struct request_queue *q;
- struct block_device *bdev;
struct list_head *head = &fs_devices->devices;
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
- struct buffer_head *bh;
- struct btrfs_super_block *disk_super;
- u64 devid;
- int seeding = 1;
int ret = 0;
flags |= FMODE_EXCL;
list_for_each_entry(device, head, dev_list) {
- if (device->bdev)
- continue;
- if (!device->name)
- continue;
-
/* Just open everything we can; ignore failures here */
- if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev, &bh))
+ if (btrfs_open_one_device(fs_devices, device, flags, holder))
continue;
- disk_super = (struct btrfs_super_block *)bh->b_data;
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- if (devid != device->devid)
- goto error_brelse;
-
- if (memcmp(device->uuid, disk_super->dev_item.uuid,
- BTRFS_UUID_SIZE))
- goto error_brelse;
-
- device->generation = btrfs_super_generation(disk_super);
if (!latest_dev ||
device->generation > latest_dev->generation)
latest_dev = device;
-
- if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
- device->writeable = 0;
- } else {
- device->writeable = !bdev_read_only(bdev);
- seeding = 0;
- }
-
- q = bdev_get_queue(bdev);
- if (blk_queue_discard(q))
- device->can_discard = 1;
- if (!blk_queue_nonrot(q))
- fs_devices->rotating = 1;
-
- device->bdev = bdev;
- device->in_fs_metadata = 0;
- device->mode = flags;
-
- fs_devices->open_devices++;
- if (device->writeable &&
- device->devid != BTRFS_DEV_REPLACE_DEVID) {
- fs_devices->rw_devices++;
- list_add(&device->dev_alloc_list,
- &fs_devices->alloc_list);
- }
- brelse(bh);
- continue;
-
-error_brelse:
- brelse(bh);
- blkdev_put(bdev, flags);
- continue;
}
if (fs_devices->open_devices == 0) {
ret = -EINVAL;
goto out;
}
- fs_devices->seeding = seeding;
fs_devices->opened = 1;
fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
@@ -1112,12 +1178,10 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret)
{
struct btrfs_super_block *disk_super;
+ struct btrfs_device *device;
struct block_device *bdev;
struct page *page;
- int ret = -EINVAL;
- u64 devid;
- u64 transid;
- u64 total_devices;
+ int ret = 0;
u64 bytenr;
/*
@@ -1136,26 +1200,16 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
goto error;
}
- if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
+ if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
+ ret = -EINVAL;
goto error_bdev_put;
-
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- transid = btrfs_super_generation(disk_super);
- total_devices = btrfs_super_num_devices(disk_super);
-
- ret = device_list_add(path, disk_super, devid, fs_devices_ret);
- if (ret > 0) {
- if (disk_super->label[0]) {
- pr_info("BTRFS: device label %s ", disk_super->label);
- } else {
- pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
- }
-
- pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
- ret = 0;
}
- if (!ret && fs_devices_ret)
- (*fs_devices_ret)->total_devices = total_devices;
+
+ device = device_list_add(path, disk_super);
+ if (IS_ERR(device))
+ ret = PTR_ERR(device);
+ else
+ *fs_devices_ret = device->fs_devices;
btrfs_release_disk_super(page);
@@ -1181,7 +1235,8 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
*length = 0;
- if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
+ if (start >= device->total_bytes ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
return 0;
path = btrfs_alloc_path();
@@ -1359,7 +1414,8 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
max_hole_size = 0;
again:
- if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
+ if (search_start >= search_end ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = -ENOSPC;
goto out;
}
@@ -1566,8 +1622,8 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
- WARN_ON(!device->in_fs_metadata);
- WARN_ON(device->is_tgtdev_for_dev_replace);
+ WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
+ WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1657,7 +1713,7 @@ error:
* the device information is stored in the chunk root
* the btrfs_device struct should be fully filled in
*/
-static int btrfs_add_device(struct btrfs_trans_handle *trans,
+static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
struct btrfs_device *device)
{
@@ -1750,20 +1806,24 @@ static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
key.offset = device->devid;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- goto out;
-
- if (ret > 0) {
- ret = -ENOENT;
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
goto out;
}
ret = btrfs_del_item(trans, root, path);
- if (ret)
- goto out;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ }
+
out:
btrfs_free_path(path);
- btrfs_commit_transaction(trans);
+ if (!ret)
+ ret = btrfs_commit_transaction(trans);
return ret;
}
@@ -1809,7 +1869,8 @@ static struct btrfs_device * btrfs_find_next_active_device(
list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
if (next_device != device &&
- !next_device->missing && next_device->bdev)
+ !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
+ && next_device->bdev)
return next_device;
}
@@ -1850,6 +1911,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 num_devices;
int ret = 0;
+ mutex_lock(&fs_info->volume_mutex);
mutex_lock(&uuid_mutex);
num_devices = fs_info->fs_devices->num_devices;
@@ -1869,17 +1931,18 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (ret)
goto out;
- if (device->is_tgtdev_for_dev_replace) {
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
goto out;
}
- if (device->writeable && fs_info->fs_devices->rw_devices == 1) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+ fs_info->fs_devices->rw_devices == 1) {
ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
goto out;
}
- if (device->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_del_init(&device->dev_alloc_list);
device->fs_devices->rw_devices--;
@@ -1901,7 +1964,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (ret)
goto error_undo;
- device->in_fs_metadata = 0;
+ clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(fs_info, device);
/*
@@ -1921,7 +1984,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
device->fs_devices->num_devices--;
device->fs_devices->total_devices--;
- if (device->missing)
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
device->fs_devices->missing_devices--;
btrfs_assign_next_active_device(fs_info, device, NULL);
@@ -1941,11 +2004,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
* the devices list. All that's left is to zero out the old
* supers and free the device.
*/
- if (device->writeable)
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
btrfs_scratch_superblocks(device->bdev, device->name->str);
btrfs_close_bdev(device);
- call_rcu(&device->rcu, free_device);
+ call_rcu(&device->rcu, free_device_rcu);
if (cur_devices->open_devices == 0) {
struct btrfs_fs_devices *fs_devices;
@@ -1964,10 +2027,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
out:
mutex_unlock(&uuid_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
return ret;
error_undo:
- if (device->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_add(&device->dev_alloc_list,
&fs_info->fs_devices->alloc_list);
@@ -1993,12 +2057,12 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
fs_devices = srcdev->fs_devices;
list_del_rcu(&srcdev->dev_list);
- list_del_rcu(&srcdev->dev_alloc_list);
+ list_del(&srcdev->dev_alloc_list);
fs_devices->num_devices--;
- if (srcdev->missing)
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
fs_devices->missing_devices--;
- if (srcdev->writeable)
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
fs_devices->rw_devices--;
if (srcdev->bdev)
@@ -2010,13 +2074,13 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
- if (srcdev->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
/* zero out the old super if it is writable */
btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
}
btrfs_close_bdev(srcdev);
- call_rcu(&srcdev->rcu, free_device);
+ call_rcu(&srcdev->rcu, free_device_rcu);
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
@@ -2075,7 +2139,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
btrfs_close_bdev(tgtdev);
- call_rcu(&tgtdev->rcu, free_device);
+ call_rcu(&tgtdev->rcu, free_device_rcu);
}
static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
@@ -2120,7 +2184,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
* is held by the caller.
*/
list_for_each_entry(tmp, devices, dev_list) {
- if (tmp->in_fs_metadata && !tmp->bdev) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &tmp->dev_state) && !tmp->bdev) {
*device = tmp;
break;
}
@@ -2349,24 +2414,19 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
- kfree(device);
ret = -ENOMEM;
- goto error;
+ goto error_free_device;
}
rcu_assign_pointer(device->name, name);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- rcu_string_free(device->name);
- kfree(device);
ret = PTR_ERR(trans);
- goto error;
+ goto error_free_device;
}
q = bdev_get_queue(bdev);
- if (blk_queue_discard(q))
- device->can_discard = 1;
- device->writeable = 1;
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = trans->transid;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
@@ -2377,14 +2437,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->commit_total_bytes = device->total_bytes;
device->fs_info = fs_info;
device->bdev = bdev;
- device->in_fs_metadata = 1;
- device->is_tgtdev_for_dev_replace = 0;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
ret = btrfs_prepare_sprout(fs_info);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -2439,7 +2499,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
}
- ret = btrfs_add_device(trans, fs_info, device);
+ ret = btrfs_add_dev_item(trans, fs_info, device);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
@@ -2497,11 +2557,11 @@ error_sysfs:
btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
error_trans:
if (seeding_dev)
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
if (trans)
btrfs_end_transaction(trans);
- rcu_string_free(device->name);
- kfree(device);
+error_free_device:
+ free_device(device);
error:
blkdev_put(bdev, FMODE_EXCL);
if (seeding_dev && !unlocked) {
@@ -2516,7 +2576,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
- struct request_queue *q;
struct btrfs_device *device;
struct block_device *bdev;
struct list_head *devices;
@@ -2567,17 +2626,14 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
- kfree(device);
+ free_device(device);
ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
- q = bdev_get_queue(bdev);
- if (blk_queue_discard(q))
- device->can_discard = 1;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- device->writeable = 1;
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = 0;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
@@ -2590,8 +2646,8 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
- device->in_fs_metadata = 1;
- device->is_tgtdev_for_dev_replace = 1;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
@@ -2619,7 +2675,7 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
tgtdev->io_align = sectorsize;
tgtdev->sector_size = sectorsize;
tgtdev->fs_info = fs_info;
- tgtdev->in_fs_metadata = 1;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tgtdev->dev_state);
}
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2677,7 +2733,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
u64 old_total;
u64 diff;
- if (!device->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return -EACCES;
new_size = round_down(new_size, fs_info->sectorsize);
@@ -2687,7 +2743,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
if (new_size <= device->total_bytes ||
- device->is_tgtdev_for_dev_replace) {
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
mutex_unlock(&fs_info->chunk_mutex);
return -EINVAL;
}
@@ -3031,6 +3087,48 @@ error:
return ret;
}
+/*
+ * If @chunk_offset is a data block group and data_sinfo->bytes_used is zero,
+ * force-allocate a data chunk. Return 1 if a chunk was allocated, 0 if none
+ * was needed, <0 if joining the transaction or the allocation failed.
+ */
+static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
+ u64 chunk_offset)
+{
+ struct btrfs_block_group_cache *cache;
+ u64 bytes_used;
+ u64 chunk_type;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ ASSERT(cache); /* a failed lookup is asserted against, not returned as an error */
+ chunk_type = cache->flags; /* copy the flags out before the cache is put */
+ btrfs_put_block_group(cache);
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
+ spin_lock(&fs_info->data_sinfo->lock);
+ bytes_used = fs_info->data_sinfo->bytes_used; /* sampled under data_sinfo->lock */
+ spin_unlock(&fs_info->data_sinfo->lock);
+
+ if (!bytes_used) {
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_force_chunk_alloc(trans, fs_info,
+ BTRFS_BLOCK_GROUP_DATA);
+ btrfs_end_transaction(trans); /* ended whether or not the allocation succeeded */
+ if (ret < 0)
+ return ret;
+
+ return 1;
+ }
+ }
+ return 0;
+}
+
static int insert_balance_item(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl)
{
@@ -3489,7 +3587,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
u32 count_meta = 0;
u32 count_sys = 0;
int chunk_reserved = 0;
- u64 bytes_used = 0;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
@@ -3497,10 +3594,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
old_size = btrfs_device_get_total_bytes(device);
size_to_free = div_factor(old_size, 1);
size_to_free = min_t(u64, size_to_free, SZ_1M);
- if (!device->writeable ||
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
btrfs_device_get_total_bytes(device) -
btrfs_device_get_bytes_used(device) > size_to_free ||
- device->is_tgtdev_for_dev_replace)
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
continue;
ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -3648,28 +3745,21 @@ again:
goto loop;
}
- ASSERT(fs_info->data_sinfo);
- spin_lock(&fs_info->data_sinfo->lock);
- bytes_used = fs_info->data_sinfo->bytes_used;
- spin_unlock(&fs_info->data_sinfo->lock);
-
- if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
- !chunk_reserved && !bytes_used) {
- trans = btrfs_start_transaction(chunk_root, 0);
- if (IS_ERR(trans)) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
- ret = PTR_ERR(trans);
- goto error;
- }
-
- ret = btrfs_force_chunk_alloc(trans, fs_info,
- BTRFS_BLOCK_GROUP_DATA);
- btrfs_end_transaction(trans);
+ if (!chunk_reserved) {
+ /*
+ * We may be relocating the only data chunk we have,
+ * which could potentially end up losing the data's
+ * raid profile, so let's allocate an empty one in
+ * advance.
+ */
+ ret = btrfs_may_alloc_data_chunk(fs_info,
+ found_key.offset);
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
+ } else if (ret == 1) {
+ chunk_reserved = 1;
}
- chunk_reserved = 1;
}
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
@@ -4368,7 +4458,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
new_size = round_down(new_size, fs_info->sectorsize);
diff = round_down(old_size - new_size, fs_info->sectorsize);
- if (device->is_tgtdev_for_dev_replace)
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
return -EINVAL;
path = btrfs_alloc_path();
@@ -4380,7 +4470,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, new_size);
- if (device->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes -= diff;
atomic64_sub(diff, &fs_info->free_chunk_space);
}
@@ -4432,6 +4522,18 @@ again:
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
btrfs_release_path(path);
+ /*
+ * We may be relocating the only data chunk we have,
+ * which could potentially end up losing the data's
+ * raid profile, so let's allocate an empty one in
+ * advance.
+ */
+ ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ goto done;
+ }
+
ret = btrfs_relocate_chunk(fs_info, chunk_offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
@@ -4505,7 +4607,7 @@ done:
if (ret) {
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, old_size);
- if (device->writeable)
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
device->fs_devices->total_rw_bytes += diff;
atomic64_add(diff, &fs_info->free_chunk_space);
mutex_unlock(&fs_info->chunk_mutex);
@@ -4665,14 +4767,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
u64 max_avail;
u64 dev_offset;
- if (!device->writeable) {
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
WARN(1, KERN_ERR
"BTRFS: read-only device in alloc_list\n");
continue;
}
- if (!device->in_fs_metadata ||
- device->is_tgtdev_for_dev_replace)
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state) ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
continue;
if (device->total_bytes > device->bytes_used)
@@ -5020,12 +5123,13 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
- if (map->stripes[i].dev->missing) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING,
+ &map->stripes[i].dev->dev_state)) {
miss_ndevs++;
continue;
}
-
- if (!map->stripes[i].dev->writeable) {
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
+ &map->stripes[i].dev->dev_state)) {
readonly = 1;
goto end;
}
@@ -5091,7 +5195,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
ret = 2;
else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- ret = 3;
+ /*
+ * There could be two corrupted data stripes, we need
+ * to loop retry in order to rebuild the correct data.
+ *
+ * Fail a stripe at a time on every retry except the
+ * stripe under reconstruction.
+ */
+ ret = map->num_stripes;
else
ret = 1;
free_extent_map(em);
@@ -5991,15 +6102,14 @@ static void btrfs_end_bio(struct bio *bio)
dev = bbio->stripes[stripe_index].dev;
if (dev->bdev) {
if (bio_op(bio) == REQ_OP_WRITE)
- btrfs_dev_stat_inc(dev,
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
else
- btrfs_dev_stat_inc(dev,
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_READ_ERRS);
if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc(dev,
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
- btrfs_dev_stat_print_on_error(dev);
}
}
}
@@ -6049,16 +6159,15 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
- if (device->missing || !device->bdev) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
+ !device->bdev) {
bio_io_error(bio);
return;
}
/* don't bother with additional async steps for reads, right now */
if (bio_op(bio) == REQ_OP_READ) {
- bio_get(bio);
btrfsic_submit_bio(bio);
- bio_put(bio);
return;
}
@@ -6195,7 +6304,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
dev = bbio->stripes[dev_nr].dev;
if (!dev || !dev->bdev ||
- (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) {
+ (bio_op(first_bio) == REQ_OP_WRITE &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
bbio_error(bbio, first_bio, logical);
continue;
}
@@ -6244,7 +6354,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
device->fs_devices = fs_devices;
fs_devices->num_devices++;
- device->missing = 1;
+ set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices++;
return device;
@@ -6260,8 +6370,8 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
* is generated.
*
* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
- * on error. Returned struct is not linked onto any lists and can be
- * destroyed with kfree() right away.
+ * on error. Returned struct is not linked onto any lists and must be
+ * destroyed with free_device.
*/
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
@@ -6284,7 +6394,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
ret = find_next_devid(fs_info, &tmp);
if (ret) {
- kfree(dev);
+ free_device(dev);
return ERR_PTR(ret);
}
}
@@ -6463,7 +6573,9 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
}
btrfs_report_missing_device(fs_info, devid, uuid, false);
}
- map->stripes[i].dev->in_fs_metadata = 1;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &(map->stripes[i].dev->dev_state));
+
}
write_lock(&map_tree->map_tree.lock);
@@ -6492,7 +6604,7 @@ static void fill_device_from_item(struct extent_buffer *leaf,
device->io_width = btrfs_device_io_width(leaf, dev_item);
device->sector_size = btrfs_device_sector_size(leaf, dev_item);
WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
- device->is_tgtdev_for_dev_replace = 0;
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
ptr = btrfs_device_uuid(dev_item);
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -6604,7 +6716,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
dev_uuid, false);
}
- if(!device->bdev && !device->missing) {
+ if (!device->bdev &&
+ !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
/*
* this happens when a device that was properly setup
* in the device info lists suddenly goes bad.
@@ -6612,12 +6725,13 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
* device->missing to one here
*/
device->fs_devices->missing_devices++;
- device->missing = 1;
+ set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
/* Move the device to its own fs_devices */
if (device->fs_devices != fs_devices) {
- ASSERT(device->missing);
+ ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
+ &device->dev_state));
list_move(&device->dev_list, &fs_devices->devices);
device->fs_devices->num_devices--;
@@ -6631,15 +6745,16 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
}
if (device->fs_devices != fs_info->fs_devices) {
- BUG_ON(device->writeable);
+ BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
if (device->generation !=
btrfs_device_generation(leaf, dev_item))
return -EINVAL;
}
fill_device_from_item(leaf, dev_item, device);
- device->in_fs_metadata = 1;
- if (device->writeable && !device->is_tgtdev_for_dev_replace) {
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+ !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
device->fs_devices->total_rw_bytes += device->total_bytes;
atomic64_add(device->total_bytes - device->bytes_used,
&fs_info->free_chunk_space);
@@ -6771,10 +6886,13 @@ out_short_read:
/*
* Check if all chunks in the fs are OK for read-write degraded mount
*
+ * If the @failing_dev is specified, it's accounted as missing.
+ *
* Return true if all chunks meet the minimal RW mount requirements.
* Return false if any chunk doesn't meet the minimal RW mount requirements.
*/
-bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
+bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *failing_dev)
{
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
@@ -6802,12 +6920,16 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
- if (!dev || !dev->bdev || dev->missing ||
+ if (!dev || !dev->bdev ||
+ test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
dev->last_flush_error)
missing++;
+ else if (failing_dev && failing_dev == dev)
+ missing++;
}
if (missing > max_tolerated) {
- btrfs_warn(fs_info,
+ if (!failing_dev)
+ btrfs_warn(fs_info,
"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
em->start, missing, max_tolerated);
free_extent_map(em);
@@ -7078,10 +7200,24 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
+ stats_cnt = atomic_read(&device->dev_stats_ccnt);
+ if (!device->dev_stats_valid || stats_cnt == 0)
continue;
- stats_cnt = atomic_read(&device->dev_stats_ccnt);
+
+ /*
+ * There is a LOAD-LOAD control dependency between the value of
+ * dev_stats_ccnt and updating the on-disk values which requires
+ * reading the in-memory counters. Such control dependencies
+ * require explicit read memory barriers.
+ *
+ * This memory barrier pairs with smp_mb__before_atomic in
+ * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
+ * barrier implied by atomic_xchg in
+ * btrfs_dev_stat_read_and_reset
+ */
+ smp_rmb();
+
ret = update_dev_stat_item(trans, fs_info, device);
if (!ret)
atomic_sub(stats_cnt, &device->dev_stats_ccnt);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ff15208344a7..28c28eeadff3 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -47,6 +47,12 @@ struct btrfs_pending_bios {
#define btrfs_device_data_ordered_init(device) do { } while (0)
#endif
+#define BTRFS_DEV_STATE_WRITEABLE (0)
+#define BTRFS_DEV_STATE_IN_FS_METADATA (1)
+#define BTRFS_DEV_STATE_MISSING (2)
+#define BTRFS_DEV_STATE_REPLACE_TGT (3)
+#define BTRFS_DEV_STATE_FLUSH_SENT (4)
+
struct btrfs_device {
struct list_head dev_list;
struct list_head dev_alloc_list;
@@ -69,11 +75,7 @@ struct btrfs_device {
/* the mode sent to blkdev_get */
fmode_t mode;
- int writeable;
- int in_fs_metadata;
- int missing;
- int can_discard;
- int is_tgtdev_for_dev_replace;
+ unsigned long dev_state;
blk_status_t last_flush_error;
int flush_bio_sent;
@@ -129,14 +131,12 @@ struct btrfs_device {
struct completion flush_wait;
/* per-device scrub information */
- struct scrub_ctx *scrub_device;
+ struct scrub_ctx *scrub_ctx;
struct btrfs_work work;
struct rcu_head rcu;
- struct work_struct rcu_work;
/* readahead state */
- spinlock_t reada_lock;
atomic_t reada_in_flight;
u64 reada_next;
struct reada_zone *reada_curr_zone;
@@ -489,15 +489,16 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 chunk_offset);
-static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
-{
- return atomic_read(&dev->dev_stats_ccnt);
-}
-
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
{
atomic_inc(dev->dev_stat_values + index);
+ /*
+ * This memory barrier orders stores updating statistics before stores
+ * updating dev_stats_ccnt.
+ *
+ * It pairs with smp_rmb() in btrfs_run_dev_stats().
+ */
smp_mb__before_atomic();
atomic_inc(&dev->dev_stats_ccnt);
}
@@ -514,7 +515,13 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
int ret;
ret = atomic_xchg(dev->dev_stat_values + index, 0);
- smp_mb__before_atomic();
+ /*
+ * atomic_xchg implies a full memory barrier as per atomic_t.txt:
+ * - RMW operations that have a return value are fully ordered;
+ *
+ * This implicit memory barrier is paired with the smp_rmb in
+ * btrfs_run_dev_stats
+ */
atomic_inc(&dev->dev_stats_ccnt);
return ret;
}
@@ -523,6 +530,12 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
int index, unsigned long val)
{
atomic_set(dev->dev_stat_values + index, val);
+ /*
+ * This memory barrier orders stores updating statistics before stores
+ * updating dev_stats_ccnt.
+ *
+ * It pairs with smp_rmb() in btrfs_run_dev_stats().
+ */
smp_mb__before_atomic();
atomic_inc(&dev->dev_stats_ccnt);
}
@@ -540,7 +553,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
struct list_head *btrfs_get_fs_uuids(void);
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
-
-bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info);
+bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *failing_dev);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 2c7e53f9ff1b..de7d072c78ef 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -23,6 +23,7 @@
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/posix_acl_xattr.h>
+#include <linux/iversion.h>
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
@@ -267,7 +268,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
struct btrfs_key key;
struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
int ret = 0;
@@ -336,11 +336,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
u32 this_len = sizeof(*di) + name_len + data_len;
unsigned long name_ptr = (unsigned long)(di + 1);
- if (verify_dir_item(fs_info, leaf, slot, di)) {
- ret = -EIO;
- goto err;
- }
-
total_size += name_len + 1;
/*
* We are just looking for how big our buffer needs to
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 17f2dd8fddb8..01a4eab602a3 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -43,6 +43,8 @@ struct workspace {
size_t size;
char *buf;
struct list_head list;
+ ZSTD_inBuffer in_buf;
+ ZSTD_outBuffer out_buf;
};
static void zstd_free_workspace(struct list_head *ws)
@@ -94,8 +96,6 @@ static int zstd_compress_pages(struct list_head *ws,
int nr_pages = 0;
struct page *in_page = NULL; /* The current page to read */
struct page *out_page = NULL; /* The current page to write to */
- ZSTD_inBuffer in_buf = { NULL, 0, 0 };
- ZSTD_outBuffer out_buf = { NULL, 0, 0 };
unsigned long tot_in = 0;
unsigned long tot_out = 0;
unsigned long len = *total_out;
@@ -118,9 +118,9 @@ static int zstd_compress_pages(struct list_head *ws,
/* map in the first page of input data */
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- in_buf.src = kmap(in_page);
- in_buf.pos = 0;
- in_buf.size = min_t(size_t, len, PAGE_SIZE);
+ workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
/* Allocate and map in the output buffer */
@@ -130,14 +130,15 @@ static int zstd_compress_pages(struct list_head *ws,
goto out;
}
pages[nr_pages++] = out_page;
- out_buf.dst = kmap(out_page);
- out_buf.pos = 0;
- out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
while (1) {
size_t ret2;
- ret2 = ZSTD_compressStream(stream, &out_buf, &in_buf);
+ ret2 = ZSTD_compressStream(stream, &workspace->out_buf,
+ &workspace->in_buf);
if (ZSTD_isError(ret2)) {
pr_debug("BTRFS: ZSTD_compressStream returned %d\n",
ZSTD_getErrorCode(ret2));
@@ -146,22 +147,22 @@ static int zstd_compress_pages(struct list_head *ws,
}
/* Check to see if we are making it bigger */
- if (tot_in + in_buf.pos > 8192 &&
- tot_in + in_buf.pos <
- tot_out + out_buf.pos) {
+ if (tot_in + workspace->in_buf.pos > 8192 &&
+ tot_in + workspace->in_buf.pos <
+ tot_out + workspace->out_buf.pos) {
ret = -E2BIG;
goto out;
}
/* We've reached the end of our output range */
- if (out_buf.pos >= max_out) {
- tot_out += out_buf.pos;
+ if (workspace->out_buf.pos >= max_out) {
+ tot_out += workspace->out_buf.pos;
ret = -E2BIG;
goto out;
}
/* Check if we need more output space */
- if (out_buf.pos == out_buf.size) {
+ if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
kunmap(out_page);
@@ -176,19 +177,20 @@ static int zstd_compress_pages(struct list_head *ws,
goto out;
}
pages[nr_pages++] = out_page;
- out_buf.dst = kmap(out_page);
- out_buf.pos = 0;
- out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = min_t(size_t, max_out,
+ PAGE_SIZE);
}
/* We've reached the end of the input */
- if (in_buf.pos >= len) {
- tot_in += in_buf.pos;
+ if (workspace->in_buf.pos >= len) {
+ tot_in += workspace->in_buf.pos;
break;
}
/* Check if we need more input */
- if (in_buf.pos == in_buf.size) {
+ if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
kunmap(in_page);
put_page(in_page);
@@ -196,15 +198,15 @@ static int zstd_compress_pages(struct list_head *ws,
start += PAGE_SIZE;
len -= PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- in_buf.src = kmap(in_page);
- in_buf.pos = 0;
- in_buf.size = min_t(size_t, len, PAGE_SIZE);
+ workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
}
}
while (1) {
size_t ret2;
- ret2 = ZSTD_endStream(stream, &out_buf);
+ ret2 = ZSTD_endStream(stream, &workspace->out_buf);
if (ZSTD_isError(ret2)) {
pr_debug("BTRFS: ZSTD_endStream returned %d\n",
ZSTD_getErrorCode(ret2));
@@ -212,11 +214,11 @@ static int zstd_compress_pages(struct list_head *ws,
goto out;
}
if (ret2 == 0) {
- tot_out += out_buf.pos;
+ tot_out += workspace->out_buf.pos;
break;
}
- if (out_buf.pos >= max_out) {
- tot_out += out_buf.pos;
+ if (workspace->out_buf.pos >= max_out) {
+ tot_out += workspace->out_buf.pos;
ret = -E2BIG;
goto out;
}
@@ -235,9 +237,9 @@ static int zstd_compress_pages(struct list_head *ws,
goto out;
}
pages[nr_pages++] = out_page;
- out_buf.dst = kmap(out_page);
- out_buf.pos = 0;
- out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
if (tot_out >= tot_in) {
@@ -273,8 +275,6 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long total_out = 0;
- ZSTD_inBuffer in_buf = { NULL, 0, 0 };
- ZSTD_outBuffer out_buf = { NULL, 0, 0 };
stream = ZSTD_initDStream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
@@ -284,18 +284,19 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- in_buf.src = kmap(pages_in[page_in_index]);
- in_buf.pos = 0;
- in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
- out_buf.dst = workspace->buf;
- out_buf.pos = 0;
- out_buf.size = PAGE_SIZE;
+ workspace->out_buf.dst = workspace->buf;
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = PAGE_SIZE;
while (1) {
size_t ret2;
- ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf);
+ ret2 = ZSTD_decompressStream(stream, &workspace->out_buf,
+ &workspace->in_buf);
if (ZSTD_isError(ret2)) {
pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
ZSTD_getErrorCode(ret2));
@@ -303,38 +304,38 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
buf_start = total_out;
- total_out += out_buf.pos;
- out_buf.pos = 0;
+ total_out += workspace->out_buf.pos;
+ workspace->out_buf.pos = 0;
- ret = btrfs_decompress_buf2page(out_buf.dst, buf_start,
- total_out, disk_start, orig_bio);
+ ret = btrfs_decompress_buf2page(workspace->out_buf.dst,
+ buf_start, total_out, disk_start, orig_bio);
if (ret == 0)
break;
- if (in_buf.pos >= srclen)
+ if (workspace->in_buf.pos >= srclen)
break;
/* Check if we've hit the end of a frame */
if (ret2 == 0)
break;
- if (in_buf.pos == in_buf.size) {
+ if (workspace->in_buf.pos == workspace->in_buf.size) {
kunmap(pages_in[page_in_index++]);
if (page_in_index >= total_pages_in) {
- in_buf.src = NULL;
+ workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- in_buf.src = kmap(pages_in[page_in_index]);
- in_buf.pos = 0;
- in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
}
ret = 0;
zero_fill_bio(orig_bio);
done:
- if (in_buf.src)
+ if (workspace->in_buf.src)
kunmap(pages_in[page_in_index]);
return ret;
}
@@ -348,8 +349,6 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
ZSTD_DStream *stream;
int ret = 0;
size_t ret2;
- ZSTD_inBuffer in_buf = { NULL, 0, 0 };
- ZSTD_outBuffer out_buf = { NULL, 0, 0 };
unsigned long total_out = 0;
unsigned long pg_offset = 0;
char *kaddr;
@@ -364,16 +363,17 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
destlen = min_t(size_t, destlen, PAGE_SIZE);
- in_buf.src = data_in;
- in_buf.pos = 0;
- in_buf.size = srclen;
+ workspace->in_buf.src = data_in;
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = srclen;
- out_buf.dst = workspace->buf;
- out_buf.pos = 0;
- out_buf.size = PAGE_SIZE;
+ workspace->out_buf.dst = workspace->buf;
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = PAGE_SIZE;
ret2 = 1;
- while (pg_offset < destlen && in_buf.pos < in_buf.size) {
+ while (pg_offset < destlen
+ && workspace->in_buf.pos < workspace->in_buf.size) {
unsigned long buf_start;
unsigned long buf_offset;
unsigned long bytes;
@@ -384,7 +384,8 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
ret = -EIO;
goto finish;
}
- ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf);
+ ret2 = ZSTD_decompressStream(stream, &workspace->out_buf,
+ &workspace->in_buf);
if (ZSTD_isError(ret2)) {
pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
ZSTD_getErrorCode(ret2));
@@ -393,8 +394,8 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
}
buf_start = total_out;
- total_out += out_buf.pos;
- out_buf.pos = 0;
+ total_out += workspace->out_buf.pos;
+ workspace->out_buf.pos = 0;
if (total_out <= start_byte)
continue;
@@ -405,10 +406,11 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
buf_offset = 0;
bytes = min_t(unsigned long, destlen - pg_offset,
- out_buf.size - buf_offset);
+ workspace->out_buf.size - buf_offset);
kaddr = kmap_atomic(dest_page);
- memcpy(kaddr + pg_offset, out_buf.dst + buf_offset, bytes);
+ memcpy(kaddr + pg_offset, workspace->out_buf.dst + buf_offset,
+ bytes);
kunmap_atomic(kaddr);
pg_offset += bytes;
diff --git a/fs/buffer.c b/fs/buffer.c
index 0736a6a2e2f0..8b26295a56fe 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3014,7 +3014,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
void guard_bio_eod(int op, struct bio *bio)
{
sector_t maxsector;
- struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
unsigned truncated_bytes;
struct hd_struct *part;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index ab69dcb70e8a..1b468250e947 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1440,6 +1440,29 @@ static int __close_session(struct ceph_mds_client *mdsc,
return request_close_session(mdsc, session);
}
+static bool drop_negative_children(struct dentry *dentry)
+{ /* Shrink @dentry's children when none of them is positive; report whether that held. */
+ struct dentry *child;
+ bool all_negative = true; /* also the result returned for a non-directory dentry */
+
+ if (!d_is_dir(dentry))
+ goto out;
+
+ spin_lock(&dentry->d_lock); /* d_lock is held while d_subdirs is walked */
+ list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+ if (d_really_is_positive(child)) {
+ all_negative = false; /* a single positive child is enough to stop */
+ break;
+ }
+ }
+ spin_unlock(&dentry->d_lock);
+
+ if (all_negative)
+ shrink_dcache_parent(dentry); /* called after d_lock has been released */
+out:
+ return all_negative; /* true: not a directory, or no positive child was found */
+}
+
/*
* Trim old(er) caps.
*
@@ -1490,16 +1513,27 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
if ((used | wanted) & ~oissued & mine)
goto out; /* we need these caps */
- session->s_trim_caps--;
if (oissued) {
/* we aren't the only cap.. just remove us */
__ceph_remove_cap(cap, true);
+ session->s_trim_caps--;
} else {
+ struct dentry *dentry;
/* try dropping referring dentries */
spin_unlock(&ci->i_ceph_lock);
- d_prune_aliases(inode);
- dout("trim_caps_cb %p cap %p pruned, count now %d\n",
- inode, cap, atomic_read(&inode->i_count));
+ dentry = d_find_any_alias(inode);
+ if (dentry && drop_negative_children(dentry)) {
+ int count;
+ dput(dentry);
+ d_prune_aliases(inode);
+ count = atomic_read(&inode->i_count);
+ if (count == 1)
+ session->s_trim_caps--;
+ dout("trim_caps_cb %p cap %p pruned, count now %d\n",
+ inode, cap, count);
+ } else {
+ dput(dentry);
+ }
return 0;
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index fe9fbb3f13f7..a62d2a9841dc 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -331,11 +331,11 @@ static int parse_fsopt_token(char *c, void *private)
break;
#ifdef CONFIG_CEPH_FS_POSIX_ACL
case Opt_acl:
- fsopt->sb_flags |= MS_POSIXACL;
+ fsopt->sb_flags |= SB_POSIXACL;
break;
#endif
case Opt_noacl:
- fsopt->sb_flags &= ~MS_POSIXACL;
+ fsopt->sb_flags &= ~SB_POSIXACL;
break;
default:
BUG_ON(token);
@@ -520,7 +520,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",nopoolperm");
#ifdef CONFIG_CEPH_FS_POSIX_ACL
- if (fsopt->sb_flags & MS_POSIXACL)
+ if (fsopt->sb_flags & SB_POSIXACL)
seq_puts(m, ",acl");
else
seq_puts(m, ",noacl");
@@ -988,7 +988,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
dout("ceph_mount\n");
#ifdef CONFIG_CEPH_FS_POSIX_ACL
- flags |= MS_POSIXACL;
+ flags |= SB_POSIXACL;
#endif
err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
if (err < 0) {
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index d5b2e12b5d02..c71971c01c63 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -196,6 +196,14 @@ config CIFS_SMB311
This dialect includes improved security negotiation features.
If unsure, say N
+config CIFS_SMB_DIRECT
+ bool "SMB Direct support (Experimental)"
+ depends on CIFS=m && INFINIBAND || CIFS=y && INFINIBAND=y
+ help
+ Enables SMB Direct experimental support for SMB 3.0, 3.02 and 3.1.1.
+ SMB Direct allows transferring SMB packets over RDMA. If unsure,
+ say N.
+
config CIFS_FSCACHE
bool "Provide CIFS client caching support"
depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 7134f182720b..7e4a1e2f0696 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -19,3 +19,5 @@ cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
+
+cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index cbb9534b89b4..c7a863219fa3 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -30,6 +30,9 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifsfs.h"
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
void
cifs_dump_mem(char *label, void *data, int length)
@@ -107,6 +110,32 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
}
#ifdef CONFIG_PROC_FS
+static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
+{ /* Write a one-share debug summary of @tcon into seq_file @m, ending with a newline. */
+ __u32 dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
+
+ seq_printf(m, "%s Mounts: %d ", tcon->treeName, tcon->tc_count);
+ if (tcon->nativeFileSystem) /* only printed when the field is set */
+ seq_printf(m, "Type: %s ", tcon->nativeFileSystem);
+ seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x\n\tPathComponentMax: %d Status: %d",
+ le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
+ le32_to_cpu(tcon->fsAttrInfo.Attributes),
+ le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
+ tcon->tidStatus);
+ if (dev_type == FILE_DEVICE_DISK) /* name the two known device types, otherwise print the raw value */
+ seq_puts(m, " type: DISK ");
+ else if (dev_type == FILE_DEVICE_CD_ROM)
+ seq_puts(m, " type: CDROM ");
+ else
+ seq_printf(m, " type: %d ", dev_type);
+ if (tcon->ses->server->ops->dump_share_caps) /* optional ops callback; called only when provided */
+ tcon->ses->server->ops->dump_share_caps(m, tcon);
+
+ if (tcon->need_reconnect) /* flag shares that are marked as needing a reconnect */
+ seq_puts(m, "\tDISCONNECTED ");
+ seq_putc(m, '\n');
+}
+
static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
{
struct list_head *tmp1, *tmp2, *tmp3;
@@ -115,7 +144,6 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
struct cifs_ses *ses;
struct cifs_tcon *tcon;
int i, j;
- __u32 dev_type;
seq_puts(m,
"Display Internal CIFS Data Structures for Debugging\n"
@@ -152,6 +180,72 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
list_for_each(tmp1, &cifs_tcp_ses_list) {
server = list_entry(tmp1, struct TCP_Server_Info,
tcp_ses_list);
+
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (!server->rdma)
+ goto skip_rdma;
+
+ seq_printf(m, "\nSMBDirect (in hex) protocol version: %x "
+ "transport status: %x",
+ server->smbd_conn->protocol,
+ server->smbd_conn->transport_status);
+ seq_printf(m, "\nConn receive_credit_max: %x "
+ "send_credit_target: %x max_send_size: %x",
+ server->smbd_conn->receive_credit_max,
+ server->smbd_conn->send_credit_target,
+ server->smbd_conn->max_send_size);
+ seq_printf(m, "\nConn max_fragmented_recv_size: %x "
+ "max_fragmented_send_size: %x max_receive_size:%x",
+ server->smbd_conn->max_fragmented_recv_size,
+ server->smbd_conn->max_fragmented_send_size,
+ server->smbd_conn->max_receive_size);
+ seq_printf(m, "\nConn keep_alive_interval: %x "
+ "max_readwrite_size: %x rdma_readwrite_threshold: %x",
+ server->smbd_conn->keep_alive_interval,
+ server->smbd_conn->max_readwrite_size,
+ server->smbd_conn->rdma_readwrite_threshold);
+ seq_printf(m, "\nDebug count_get_receive_buffer: %x "
+ "count_put_receive_buffer: %x count_send_empty: %x",
+ server->smbd_conn->count_get_receive_buffer,
+ server->smbd_conn->count_put_receive_buffer,
+ server->smbd_conn->count_send_empty);
+ seq_printf(m, "\nRead Queue count_reassembly_queue: %x "
+ "count_enqueue_reassembly_queue: %x "
+ "count_dequeue_reassembly_queue: %x "
+ "fragment_reassembly_remaining: %x "
+ "reassembly_data_length: %x "
+ "reassembly_queue_length: %x",
+ server->smbd_conn->count_reassembly_queue,
+ server->smbd_conn->count_enqueue_reassembly_queue,
+ server->smbd_conn->count_dequeue_reassembly_queue,
+ server->smbd_conn->fragment_reassembly_remaining,
+ server->smbd_conn->reassembly_data_length,
+ server->smbd_conn->reassembly_queue_length);
+ seq_printf(m, "\nCurrent Credits send_credits: %x "
+ "receive_credits: %x receive_credit_target: %x",
+ atomic_read(&server->smbd_conn->send_credits),
+ atomic_read(&server->smbd_conn->receive_credits),
+ server->smbd_conn->receive_credit_target);
+ seq_printf(m, "\nPending send_pending: %x send_payload_pending:"
+ " %x smbd_send_pending: %x smbd_recv_pending: %x",
+ atomic_read(&server->smbd_conn->send_pending),
+ atomic_read(&server->smbd_conn->send_payload_pending),
+ server->smbd_conn->smbd_send_pending,
+ server->smbd_conn->smbd_recv_pending);
+ seq_printf(m, "\nReceive buffers count_receive_queue: %x "
+ "count_empty_packet_queue: %x",
+ server->smbd_conn->count_receive_queue,
+ server->smbd_conn->count_empty_packet_queue);
+ seq_printf(m, "\nMR responder_resources: %x "
+ "max_frmr_depth: %x mr_type: %x",
+ server->smbd_conn->responder_resources,
+ server->smbd_conn->max_frmr_depth,
+ server->smbd_conn->mr_type);
+ seq_printf(m, "\nMR mr_ready_count: %x mr_used_count: %x",
+ atomic_read(&server->smbd_conn->mr_ready_count),
+ atomic_read(&server->smbd_conn->mr_used_count));
+skip_rdma:
+#endif
seq_printf(m, "\nNumber of credits: %d", server->credits);
i++;
list_for_each(tmp2, &server->smb_ses_list) {
@@ -176,6 +270,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
ses->ses_count, ses->serverOS, ses->serverNOS,
ses->capabilities, ses->status);
}
+ if (server->rdma)
+ seq_printf(m, "RDMA\n\t");
seq_printf(m, "TCP status: %d\n\tLocal Users To "
"Server: %d SecMode: 0x%x Req On Wire: %d",
server->tcpStatus, server->srv_count,
@@ -189,35 +285,19 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_puts(m, "\n\tShares:");
j = 0;
+
+ seq_printf(m, "\n\t%d) IPC: ", j);
+ if (ses->tcon_ipc)
+ cifs_debug_tcon(m, ses->tcon_ipc);
+ else
+ seq_puts(m, "none\n");
+
list_for_each(tmp3, &ses->tcon_list) {
tcon = list_entry(tmp3, struct cifs_tcon,
tcon_list);
++j;
- dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
- seq_printf(m, "\n\t%d) %s Mounts: %d ", j,
- tcon->treeName, tcon->tc_count);
- if (tcon->nativeFileSystem) {
- seq_printf(m, "Type: %s ",
- tcon->nativeFileSystem);
- }
- seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
- "\n\tPathComponentMax: %d Status: %d",
- le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
- le32_to_cpu(tcon->fsAttrInfo.Attributes),
- le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
- tcon->tidStatus);
- if (dev_type == FILE_DEVICE_DISK)
- seq_puts(m, " type: DISK ");
- else if (dev_type == FILE_DEVICE_CD_ROM)
- seq_puts(m, " type: CDROM ");
- else
- seq_printf(m, " type: %d ", dev_type);
- if (server->ops->dump_share_caps)
- server->ops->dump_share_caps(m, tcon);
-
- if (tcon->need_reconnect)
- seq_puts(m, "\tDISCONNECTED ");
- seq_putc(m, '\n');
+ seq_printf(m, "\n\t%d) ", j);
+ cifs_debug_tcon(m, tcon);
}
seq_puts(m, "\n\tMIDs:\n");
@@ -374,6 +454,45 @@ static const struct file_operations cifs_stats_proc_fops = {
};
#endif /* STATS */
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#define PROC_FILE_DEFINE(name) /* emits <name>_write, <name>_proc_show, <name>_open and cifs_<name>_proc_fops for the variable <name> */ \
+static ssize_t name##_write(struct file *file, const char __user *buffer, \
+ size_t count, loff_t *ppos) \
+{ \
+ int rc; \
+ rc = kstrtoint_from_user(buffer, count, 10, & name); /* parse a base-10 integer from user space into <name> */ \
+ if (rc) \
+ return rc; \
+ return count; /* on success report the whole write as consumed */ \
+} \
+static int name##_proc_show(struct seq_file *m, void *v) \
+{ \
+ seq_printf(m, "%d\n", name ); /* current value of <name>, one line */ \
+ return 0; \
+} \
+static int name##_open(struct inode *inode, struct file *file) \
+{ \
+ return single_open(file, name##_proc_show, NULL); \
+} \
+\
+static const struct file_operations cifs_##name##_proc_fops = { \
+ .open = name##_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+ .write = name##_write, \
+}
+
+PROC_FILE_DEFINE(rdma_readwrite_threshold); /* one instantiation per variable; each <name> is declared outside this hunk */
+PROC_FILE_DEFINE(smbd_max_frmr_depth);
+PROC_FILE_DEFINE(smbd_keep_alive_interval);
+PROC_FILE_DEFINE(smbd_max_receive_size);
+PROC_FILE_DEFINE(smbd_max_fragmented_recv_size);
+PROC_FILE_DEFINE(smbd_max_send_size);
+PROC_FILE_DEFINE(smbd_send_credit_target);
+PROC_FILE_DEFINE(smbd_receive_credit_max);
+#endif
+
static struct proc_dir_entry *proc_fs_cifs;
static const struct file_operations cifsFYI_proc_fops;
static const struct file_operations cifs_lookup_cache_proc_fops;
@@ -401,6 +520,24 @@ cifs_proc_init(void)
&cifs_security_flags_proc_fops);
proc_create("LookupCacheEnabled", 0, proc_fs_cifs,
&cifs_lookup_cache_proc_fops);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ proc_create("rdma_readwrite_threshold", 0, proc_fs_cifs,
+ &cifs_rdma_readwrite_threshold_proc_fops);
+ proc_create("smbd_max_frmr_depth", 0, proc_fs_cifs,
+ &cifs_smbd_max_frmr_depth_proc_fops);
+ proc_create("smbd_keep_alive_interval", 0, proc_fs_cifs,
+ &cifs_smbd_keep_alive_interval_proc_fops);
+ proc_create("smbd_max_receive_size", 0, proc_fs_cifs,
+ &cifs_smbd_max_receive_size_proc_fops);
+ proc_create("smbd_max_fragmented_recv_size", 0, proc_fs_cifs,
+ &cifs_smbd_max_fragmented_recv_size_proc_fops);
+ proc_create("smbd_max_send_size", 0, proc_fs_cifs,
+ &cifs_smbd_max_send_size_proc_fops);
+ proc_create("smbd_send_credit_target", 0, proc_fs_cifs,
+ &cifs_smbd_send_credit_target_proc_fops);
+ proc_create("smbd_receive_credit_max", 0, proc_fs_cifs,
+ &cifs_smbd_receive_credit_max_proc_fops);
+#endif
}
void
@@ -418,6 +555,16 @@ cifs_proc_clean(void)
remove_proc_entry("SecurityFlags", proc_fs_cifs);
remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ remove_proc_entry("rdma_readwrite_threshold", proc_fs_cifs);
+ remove_proc_entry("smbd_max_frmr_depth", proc_fs_cifs);
+ remove_proc_entry("smbd_keep_alive_interval", proc_fs_cifs);
+ remove_proc_entry("smbd_max_receive_size", proc_fs_cifs);
+ remove_proc_entry("smbd_max_fragmented_recv_size", proc_fs_cifs);
+ remove_proc_entry("smbd_max_send_size", proc_fs_cifs);
+ remove_proc_entry("smbd_send_credit_target", proc_fs_cifs);
+ remove_proc_entry("smbd_receive_credit_max", proc_fs_cifs);
+#endif
remove_proc_entry("fs/cifs", NULL);
}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index cbd216b57239..350fa55a1bf7 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -42,7 +42,7 @@
#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */
#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */
#define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */
-#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */
+#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of SB_POSIXACL in mnt_cifs_flags */
#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */
#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */
#define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index b98436f5c7c7..13a8a77322c9 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -1125,7 +1125,7 @@ out:
return rc;
}
-/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
+/* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */
int
cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
struct inode *inode, const char *path,
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 68abbb0db608..f2b0a7f124da 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -325,9 +325,8 @@ int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
{
int i;
int rc;
- char password_with_pad[CIFS_ENCPWD_SIZE];
+ char password_with_pad[CIFS_ENCPWD_SIZE] = {0};
- memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
if (password)
strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8c8b75d33f31..a7be591d8e18 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -125,7 +125,7 @@ cifs_read_super(struct super_block *sb)
tcon = cifs_sb_master_tcon(cifs_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL)
- sb->s_flags |= MS_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files)
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -327,6 +327,8 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
default:
seq_puts(s, "(unknown)");
}
+ if (server->rdma)
+ seq_puts(s, ",rdma");
}
static void
@@ -497,7 +499,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",cifsacl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
seq_puts(s, ",dynperm");
- if (root->d_sb->s_flags & MS_POSIXACL)
+ if (root->d_sb->s_flags & SB_POSIXACL)
seq_puts(s, ",acl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
seq_puts(s, ",mfsymlinks");
@@ -573,7 +575,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)
static int cifs_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- *flags |= MS_NODIRATIME;
+ *flags |= SB_NODIRATIME;
return 0;
}
@@ -708,7 +710,7 @@ cifs_do_mount(struct file_system_type *fs_type,
rc = cifs_mount(cifs_sb, volume_info);
if (rc) {
- if (!(flags & MS_SILENT))
+ if (!(flags & SB_SILENT))
cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
rc);
root = ERR_PTR(rc);
@@ -720,7 +722,7 @@ cifs_do_mount(struct file_system_type *fs_type,
mnt_data.flags = flags;
/* BB should we make this contingent on mount parm? */
- flags |= MS_NODIRATIME | MS_NOATIME;
+ flags |= SB_NODIRATIME | SB_NOATIME;
sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data);
if (IS_ERR(sb)) {
@@ -739,7 +741,7 @@ cifs_do_mount(struct file_system_type *fs_type,
goto out_super;
}
- sb->s_flags |= MS_ACTIVE;
+ sb->s_flags |= SB_ACTIVE;
}
root = cifs_get_root(volume_info, sb);
@@ -1068,6 +1070,7 @@ const struct file_operations cifs_file_ops = {
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
@@ -1086,6 +1089,7 @@ const struct file_operations cifs_file_strict_ops = {
.flush = cifs_flush,
.mmap = cifs_file_strict_mmap,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
@@ -1105,6 +1109,7 @@ const struct file_operations cifs_file_direct_ops = {
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
.clone_file_range = cifs_clone_file_range,
@@ -1122,6 +1127,7 @@ const struct file_operations cifs_file_nobrl_ops = {
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
@@ -1139,6 +1145,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
.flush = cifs_flush,
.mmap = cifs_file_strict_mmap,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
@@ -1157,6 +1164,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
.clone_file_range = cifs_clone_file_range,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 5a10e566f0e6..013ba2aed8d9 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -149,5 +149,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.10"
+#define CIFS_VERSION "2.11"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e185b2853eab..48f7c197cd2d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -64,8 +64,8 @@
#define RFC1001_NAME_LEN 15
#define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1)
-/* currently length of NIP6_FMT */
-#define SERVER_NAME_LENGTH 40
+/* maximum length of ip addr as a string (including ipv6 and sctp) */
+#define SERVER_NAME_LENGTH 80
#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1)
/* echo interval in seconds */
@@ -230,8 +230,14 @@ struct smb_version_operations {
__u64 (*get_next_mid)(struct TCP_Server_Info *);
/* data offset from read response message */
unsigned int (*read_data_offset)(char *);
- /* data length from read response message */
- unsigned int (*read_data_length)(char *);
+ /*
+ * Data length from read response message
+ * When in_remaining is true, the returned data length is in
+ * message field DataRemaining for out-of-band data read (e.g. through
+ * Memory Registration RDMA write in SMBD).
+ * Otherwise, the returned data length is in message field DataLength.
+ */
+ unsigned int (*read_data_length)(char *, bool in_remaining);
/* map smb to linux error */
int (*map_error)(char *, bool);
/* find mid corresponding to the response message */
@@ -532,6 +538,7 @@ struct smb_vol {
bool nopersistent:1;
bool resilient:1; /* noresilient not required since not fored for CA */
bool domainauto:1;
+ bool rdma:1;
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -559,8 +566,8 @@ struct smb_vol {
CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \
CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID)
-#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \
- MS_NODEV | MS_SYNCHRONOUS)
+#define CIFS_MS_MASK (SB_RDONLY | SB_MANDLOCK | SB_NOEXEC | SB_NOSUID | \
+ SB_NODEV | SB_SYNCHRONOUS)
struct cifs_mnt_data {
struct cifs_sb_info *cifs_sb;
@@ -648,6 +655,10 @@ struct TCP_Server_Info {
bool sec_kerberos; /* supports plain Kerberos */
bool sec_mskerberos; /* supports legacy MS Kerberos */
bool large_buf; /* is current buffer large? */
+ /* use SMBD connection instead of socket */
+ bool rdma;
+ /* points to the SMBD connection when RDMA is used instead of a socket */
+ struct smbd_connection *smbd_conn;
struct delayed_work echo; /* echo ping workqueue job */
char *smallbuf; /* pointer to current "small" buffer */
char *bigbuf; /* pointer to current "big" buffer */
@@ -822,12 +833,12 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
struct cifs_ses {
struct list_head smb_ses_list;
struct list_head tcon_list;
+ struct cifs_tcon *tcon_ipc;
struct mutex session_mutex;
struct TCP_Server_Info *server; /* pointer to server info */
int ses_count; /* reference counter */
enum statusEnum status;
unsigned overrideSecFlg; /* if non-zero override global sec flags */
- __u32 ipc_tid; /* special tid for connection to IPC share */
char *serverOS; /* name of operating system underlying server */
char *serverNOS; /* name of network operating system of server */
char *serverDomain; /* security realm of server */
@@ -835,8 +846,7 @@ struct cifs_ses {
kuid_t linux_uid; /* overriding owner of files on the mount */
kuid_t cred_uid; /* owner of credentials */
unsigned int capabilities;
- char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for
- TCP names - will ipv6 and sctp addresses fit? */
+ char serverName[SERVER_NAME_LEN_WITH_NULL];
char *user_name; /* must not be null except during init of sess
and after mount option parsing we fill it */
char *domainName;
@@ -931,7 +941,9 @@ struct cifs_tcon {
FILE_SYSTEM_DEVICE_INFO fsDevInfo;
FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */
FILE_SYSTEM_UNIX_INFO fsUnixInfo;
- bool ipc:1; /* set if connection to IPC$ eg for RPC/PIPES */
+ bool ipc:1; /* set if connection to IPC$ share (always also pipe) */
+ bool pipe:1; /* set if connection to pipe share */
+ bool print:1; /* set if connection to printer share */
bool retry:1;
bool nocase:1;
bool seal:1; /* transport encryption for this mounted share */
@@ -944,7 +956,6 @@ struct cifs_tcon {
bool need_reopen_files:1; /* need to reopen tcon file handles */
bool use_resilient:1; /* use resilient instead of durable handles */
bool use_persistent:1; /* use persistent instead of durable handles */
- bool print:1; /* set if connection to printer share */
__le32 capabilities;
__u32 share_flags;
__u32 maximal_access;
@@ -1147,6 +1158,9 @@ struct cifs_readdata {
struct cifs_readdata *rdata,
struct iov_iter *iter);
struct kvec iov[2];
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ struct smbd_mr *mr;
+#endif
unsigned int pagesz;
unsigned int tailsz;
unsigned int credits;
@@ -1169,6 +1183,9 @@ struct cifs_writedata {
pid_t pid;
unsigned int bytes;
int result;
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ struct smbd_mr *mr;
+#endif
unsigned int pagesz;
unsigned int tailsz;
unsigned int credits;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 4143c9dec463..93d565186698 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -106,6 +106,10 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
struct kvec *, int /* nvec to send */,
int * /* type of buf returned */, const int flags,
struct kvec * /* resp vec */);
+extern int smb2_send_recv(const unsigned int xid, struct cifs_ses *pses,
+ struct kvec *pkvec, int nvec_to_send,
+ int *pbuftype, const int flags,
+ struct kvec *presp);
extern int SendReceiveBlockingLock(const unsigned int xid,
struct cifs_tcon *ptcon,
struct smb_hdr *in_buf ,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 35dc5bf01ee2..4e0922d24eb2 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -43,6 +43,7 @@
#include "cifs_unicode.h"
#include "cifs_debug.h"
#include "fscache.h"
+#include "smbdirect.h"
#ifdef CONFIG_CIFS_POSIX
static struct {
@@ -1454,6 +1455,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
struct cifs_readdata *rdata = mid->callback_data;
char *buf = server->smallbuf;
unsigned int buflen = get_rfc1002_length(buf) + 4;
+ bool use_rdma_mr = false;
cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n",
__func__, mid->mid, rdata->offset, rdata->bytes);
@@ -1542,8 +1544,11 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
rdata->iov[0].iov_base, server->total_read);
/* how much data is in the response? */
- data_len = server->ops->read_data_length(buf);
- if (data_offset + data_len > buflen) {
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ use_rdma_mr = rdata->mr;
+#endif
+ data_len = server->ops->read_data_length(buf, use_rdma_mr);
+ if (!use_rdma_mr && (data_offset + data_len > buflen)) {
/* data_len is corrupt -- discard frame */
rdata->result = -EIO;
return cifs_readv_discard(server, mid);
@@ -1923,6 +1928,12 @@ cifs_writedata_release(struct kref *refcount)
{
struct cifs_writedata *wdata = container_of(refcount,
struct cifs_writedata, refcount);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (wdata->mr) {
+ smbd_deregister_mr(wdata->mr);
+ wdata->mr = NULL;
+ }
+#endif
if (wdata->cfile)
cifsFileInfo_put(wdata->cfile);
@@ -4822,10 +4833,11 @@ CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses,
*target_nodes = NULL;
cifs_dbg(FYI, "In GetDFSRefer the path %s\n", search_name);
- if (ses == NULL)
+ if (ses == NULL || ses->tcon_ipc == NULL)
return -ENODEV;
+
getDFSRetry:
- rc = smb_init(SMB_COM_TRANSACTION2, 15, NULL, (void **) &pSMB,
+ rc = smb_init(SMB_COM_TRANSACTION2, 15, ses->tcon_ipc, (void **) &pSMB,
(void **) &pSMBr);
if (rc)
return rc;
@@ -4833,7 +4845,7 @@ getDFSRetry:
/* server pointer checked in called function,
but should never be null here anyway */
pSMB->hdr.Mid = get_next_mid(ses->server);
- pSMB->hdr.Tid = ses->ipc_tid;
+ pSMB->hdr.Tid = ses->tcon_ipc->tid;
pSMB->hdr.Uid = ses->Suid;
if (ses->capabilities & CAP_STATUS32)
pSMB->hdr.Flags2 |= SMBFLG2_ERR_STATUS;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0bfc2280436d..a726f524fb84 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -44,7 +44,6 @@
#include <net/ipv6.h>
#include <linux/parser.h>
#include <linux/bvec.h>
-
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -56,6 +55,7 @@
#include "rfc1002pdu.h"
#include "fscache.h"
#include "smb2proto.h"
+#include "smbdirect.h"
#define CIFS_PORT 445
#define RFC1001_PORT 139
@@ -92,7 +92,7 @@ enum {
Opt_multiuser, Opt_sloppy, Opt_nosharesock,
Opt_persistent, Opt_nopersistent,
Opt_resilient, Opt_noresilient,
- Opt_domainauto,
+ Opt_domainauto, Opt_rdma,
/* Mount options which take numeric value */
Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -183,6 +183,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_resilient, "resilienthandles"},
{ Opt_noresilient, "noresilienthandles"},
{ Opt_domainauto, "domainauto"},
+ { Opt_rdma, "rdma"},
{ Opt_backupuid, "backupuid=%s" },
{ Opt_backupgid, "backupgid=%s" },
@@ -353,11 +354,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
list_for_each(tmp, &server->smb_ses_list) {
ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
ses->need_reconnect = true;
- ses->ipc_tid = 0;
list_for_each(tmp2, &ses->tcon_list) {
tcon = list_entry(tmp2, struct cifs_tcon, tcon_list);
tcon->need_reconnect = true;
}
+ if (ses->tcon_ipc)
+ ses->tcon_ipc->need_reconnect = true;
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -405,7 +407,10 @@ cifs_reconnect(struct TCP_Server_Info *server)
/* we should try only the port we connected to before */
mutex_lock(&server->srv_mutex);
- rc = generic_ip_connect(server);
+ if (cifs_rdma_enabled(server))
+ rc = smbd_reconnect(server);
+ else
+ rc = generic_ip_connect(server);
if (rc) {
cifs_dbg(FYI, "reconnect error %d\n", rc);
mutex_unlock(&server->srv_mutex);
@@ -538,8 +543,10 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
if (server_unresponsive(server))
return -ECONNABORTED;
-
- length = sock_recvmsg(server->ssocket, smb_msg, 0);
+ if (cifs_rdma_enabled(server) && server->smbd_conn)
+ length = smbd_recv(server->smbd_conn, smb_msg);
+ else
+ length = sock_recvmsg(server->ssocket, smb_msg, 0);
if (server->tcpStatus == CifsExiting)
return -ESHUTDOWN;
@@ -700,7 +707,10 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
wake_up_all(&server->request_q);
/* give those requests time to exit */
msleep(125);
-
+ if (cifs_rdma_enabled(server) && server->smbd_conn) {
+ smbd_destroy(server->smbd_conn);
+ server->smbd_conn = NULL;
+ }
if (server->ssocket) {
sock_release(server->ssocket);
server->ssocket = NULL;
@@ -1550,6 +1560,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_domainauto:
vol->domainauto = true;
break;
+ case Opt_rdma:
+ vol->rdma = true;
+ break;
/* Numeric Values */
case Opt_backupuid:
@@ -1707,7 +1720,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
tmp_end++;
if (!(tmp_end < end && tmp_end[1] == delim)) {
/* No it is not. Set the password to NULL */
- kfree(vol->password);
+ kzfree(vol->password);
vol->password = NULL;
break;
}
@@ -1745,7 +1758,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
options = end;
}
- kfree(vol->password);
+ kzfree(vol->password);
/* Now build new password string */
temp_len = strlen(value);
vol->password = kzalloc(temp_len+1, GFP_KERNEL);
@@ -1951,6 +1964,19 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
goto cifs_parse_mount_err;
}
+ if (vol->rdma && vol->vals->protocol_id < SMB30_PROT_ID) {
+ cifs_dbg(VFS, "SMB Direct requires Version >=3.0\n");
+ goto cifs_parse_mount_err;
+ }
+
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (vol->rdma && vol->sign) {
+ cifs_dbg(VFS, "Currently SMB direct doesn't support signing."
+ " This is being fixed\n");
+ goto cifs_parse_mount_err;
+ }
+#endif
+
#ifndef CONFIG_KEYS
/* Muliuser mounts require CONFIG_KEYS support */
if (vol->multiuser) {
@@ -2162,6 +2188,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
if (server->echo_interval != vol->echo_interval * HZ)
return 0;
+ if (server->rdma != vol->rdma)
+ return 0;
+
return 1;
}
@@ -2260,6 +2289,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
tcp_ses->noblocksnd = volume_info->noblocksnd;
tcp_ses->noautotune = volume_info->noautotune;
tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
+ tcp_ses->rdma = volume_info->rdma;
tcp_ses->in_flight = 0;
tcp_ses->credits = 1;
init_waitqueue_head(&tcp_ses->response_q);
@@ -2297,13 +2327,29 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
tcp_ses->echo_interval = volume_info->echo_interval * HZ;
else
tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ;
-
+ if (tcp_ses->rdma) {
+#ifndef CONFIG_CIFS_SMB_DIRECT
+ cifs_dbg(VFS, "CONFIG_CIFS_SMB_DIRECT is not enabled\n");
+ rc = -ENOENT;
+ goto out_err_crypto_release;
+#endif
+ tcp_ses->smbd_conn = smbd_get_connection(
+ tcp_ses, (struct sockaddr *)&volume_info->dstaddr);
+ if (tcp_ses->smbd_conn) {
+ cifs_dbg(VFS, "RDMA transport established\n");
+ rc = 0;
+ goto smbd_connected;
+ } else {
+ rc = -ENOENT;
+ goto out_err_crypto_release;
+ }
+ }
rc = ip_connect(tcp_ses);
if (rc < 0) {
cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n");
goto out_err_crypto_release;
}
-
+smbd_connected:
/*
* since we're in a cifs function already, we know that
* this will succeed. No need for try_module_get().
@@ -2381,6 +2427,93 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
return 1;
}
+/**
+ * cifs_setup_ipc - helper to set up the IPC tcon for the session
+ *
+ * A new tree connection to the server's IPC$ share is made and stored in
+ * ses->tcon_ipc. The IPC tcon has the same lifetime as the session.
+ */
+static int
+cifs_setup_ipc(struct cifs_ses *ses, struct smb_vol *volume_info)
+{
+ int rc = 0, xid;
+ struct cifs_tcon *tcon;
+ struct nls_table *nls_codepage;
+ char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0};
+ bool seal = false;
+
+ /*
+ * If the mount request that resulted in the creation of the
+ * session requires encryption, force IPC to be encrypted too.
+ */
+ if (volume_info->seal) {
+ if (ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)
+ seal = true;
+ else {
+ cifs_dbg(VFS,
+ "IPC: server doesn't support encryption\n");
+ return -EOPNOTSUPP;
+ }
+ }
+
+ tcon = tconInfoAlloc();
+ if (tcon == NULL)
+ return -ENOMEM;
+
+ snprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->serverName); /* UNC of the IPC$ share, built from ses->serverName */
+
+ /* cannot fail */
+ nls_codepage = load_nls_default();
+
+ xid = get_xid();
+ tcon->ses = ses;
+ tcon->ipc = true;
+ tcon->seal = seal;
+ rc = ses->server->ops->tree_connect(xid, ses, unc, tcon, nls_codepage);
+ free_xid(xid);
+
+ if (rc) {
+ cifs_dbg(VFS, "failed to connect to IPC (rc=%d)\n", rc);
+ tconInfoFree(tcon); /* ses->tcon_ipc is not assigned on this path */
+ goto out;
+ }
+
+ cifs_dbg(FYI, "IPC tcon rc = %d ipc tid = %d\n", rc, tcon->tid);
+
+ ses->tcon_ipc = tcon;
+out: /* reached on success and on tree_connect failure; rc carries the result */
+ unload_nls(nls_codepage);
+ return rc;
+}
+
+/**
+ * cifs_free_ipc - helper to disconnect and free the session IPC tcon
+ *
+ * Needs to be called every time a session is destroyed.
+ */
+static int
+cifs_free_ipc(struct cifs_ses *ses)
+{
+ int rc = 0, xid;
+ struct cifs_tcon *tcon = ses->tcon_ipc;
+
+ if (tcon == NULL)
+ return 0; /* no IPC tcon attached to this session */
+
+ if (ses->server->ops->tree_disconnect) { /* disconnect is skipped when the op is not provided */
+ xid = get_xid();
+ rc = ses->server->ops->tree_disconnect(xid, tcon);
+ free_xid(xid);
+ }
+
+ if (rc)
+ cifs_dbg(FYI, "failed to disconnect IPC tcon (rc=%d)\n", rc);
+
+ tconInfoFree(tcon); /* freed even when the disconnect failed; the failure is only logged */
+ ses->tcon_ipc = NULL;
+ return rc; /* tree_disconnect status, or 0 if it was not called */
+}
+
static struct cifs_ses *
cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
{
@@ -2421,6 +2554,8 @@ cifs_put_smb_ses(struct cifs_ses *ses)
ses->status = CifsExiting;
spin_unlock(&cifs_tcp_ses_lock);
+ cifs_free_ipc(ses);
+
if (ses->status == CifsExiting && server->ops->logoff) {
xid = get_xid();
rc = server->ops->logoff(xid, ses);
@@ -2569,6 +2704,13 @@ cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
}
#endif /* CONFIG_KEYS */
+/**
+ * cifs_get_smb_ses - get a session matching @volume_info data from @server
+ *
+ * This function assumes it is being called from cifs_mount() where we
+ * already got a server reference (server refcount +1). See
+ * cifs_get_tcon() for refcount explanations.
+ */
static struct cifs_ses *
cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
{
@@ -2665,6 +2807,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
spin_unlock(&cifs_tcp_ses_lock);
free_xid(xid);
+
+ cifs_setup_ipc(ses, volume_info);
+
return ses;
get_ses_fail:
@@ -2709,8 +2854,16 @@ void
cifs_put_tcon(struct cifs_tcon *tcon)
{
unsigned int xid;
- struct cifs_ses *ses = tcon->ses;
+ struct cifs_ses *ses;
+
+ /*
+ * IPC tcons share the lifetime of their session and are
+ * destroyed in the session put function
+ */
+ if (tcon == NULL || tcon->ipc)
+ return;
+ ses = tcon->ses;
cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
spin_lock(&cifs_tcp_ses_lock);
if (--tcon->tc_count > 0) {
@@ -2731,6 +2884,26 @@ cifs_put_tcon(struct cifs_tcon *tcon)
cifs_put_smb_ses(ses);
}
+/**
+ * cifs_get_tcon - get a tcon matching @volume_info data from @ses
+ *
+ * - tcon refcount is the number of mount points using the tcon.
+ * - ses refcount is the number of tcons using the session.
+ *
+ * 1. This function assumes it is being called from cifs_mount() where
+ * we already got a session reference (ses refcount +1).
+ *
+ * 2. Since we're in the context of adding a mount point, the end
+ * result should be either:
+ *
+ * a) a new tcon already allocated with refcount=1 (1 mount point) and
+ * its session refcount incremented (1 new tcon). This +1 was
+ * already done in (1).
+ *
+ * b) an existing tcon with refcount+1 (add a mount point to it) and
+ * identical ses refcount (no new tcon). Because of (1) we need to
+ * decrement the ses refcount.
+ */
static struct cifs_tcon *
cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
{
@@ -2739,8 +2912,11 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
tcon = cifs_find_tcon(ses, volume_info);
if (tcon) {
+ /*
+ * tcon has refcount already incremented but we need to
+ * decrement extra ses reference gotten by caller (case b)
+ */
cifs_dbg(FYI, "Found match on UNC path\n");
- /* existing tcon already has a reference */
cifs_put_smb_ses(ses);
return tcon;
}
@@ -2986,39 +3162,17 @@ get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path,
const struct nls_table *nls_codepage, unsigned int *num_referrals,
struct dfs_info3_param **referrals, int remap)
{
- char *temp_unc;
int rc = 0;
- if (!ses->server->ops->tree_connect || !ses->server->ops->get_dfs_refer)
+ if (!ses->server->ops->get_dfs_refer)
return -ENOSYS;
*num_referrals = 0;
*referrals = NULL;
- if (ses->ipc_tid == 0) {
- temp_unc = kmalloc(2 /* for slashes */ +
- strnlen(ses->serverName, SERVER_NAME_LEN_WITH_NULL * 2)
- + 1 + 4 /* slash IPC$ */ + 2, GFP_KERNEL);
- if (temp_unc == NULL)
- return -ENOMEM;
- temp_unc[0] = '\\';
- temp_unc[1] = '\\';
- strcpy(temp_unc + 2, ses->serverName);
- strcpy(temp_unc + 2 + strlen(ses->serverName), "\\IPC$");
- rc = ses->server->ops->tree_connect(xid, ses, temp_unc, NULL,
- nls_codepage);
- cifs_dbg(FYI, "Tcon rc = %d ipc_tid = %d\n", rc, ses->ipc_tid);
- kfree(temp_unc);
- }
- if (rc == 0)
- rc = ses->server->ops->get_dfs_refer(xid, ses, old_path,
- referrals, num_referrals,
- nls_codepage, remap);
- /*
- * BB - map targetUNCs to dfs_info3 structures, here or in
- * ses->server->ops->get_dfs_refer.
- */
-
+ rc = ses->server->ops->get_dfs_refer(xid, ses, old_path,
+ referrals, num_referrals,
+ nls_codepage, remap);
return rc;
}
@@ -3783,7 +3937,7 @@ try_mount_again:
tcon->unix_ext = 0; /* server does not support them */
/* do not care if a following call succeed - informational */
- if (!tcon->ipc && server->ops->qfs_tcon)
+ if (!tcon->pipe && server->ops->qfs_tcon)
server->ops->qfs_tcon(xid, tcon);
cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info);
@@ -3913,8 +4067,7 @@ out:
}
/*
- * Issue a TREE_CONNECT request. Note that for IPC$ shares, that the tcon
- * pointer may be NULL.
+ * Issue a TREE_CONNECT request.
*/
int
CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
@@ -3950,7 +4103,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
pSMB->AndXCommand = 0xFF;
pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO);
bcc_ptr = &pSMB->Password[0];
- if (!tcon || (ses->server->sec_mode & SECMODE_USER)) {
+ if (tcon->pipe || (ses->server->sec_mode & SECMODE_USER)) {
pSMB->PasswordLength = cpu_to_le16(1); /* minimum */
*bcc_ptr = 0; /* password is null byte */
bcc_ptr++; /* skip password */
@@ -4022,7 +4175,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
0);
/* above now done in SendReceive */
- if ((rc == 0) && (tcon != NULL)) {
+ if (rc == 0) {
bool is_unicode;
tcon->tidStatus = CifsGood;
@@ -4042,7 +4195,8 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
(bcc_ptr[2] == 'C')) {
cifs_dbg(FYI, "IPC connection\n");
- tcon->ipc = 1;
+ tcon->ipc = true;
+ tcon->pipe = true;
}
} else if (length == 2) {
if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) {
@@ -4069,9 +4223,6 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
else
tcon->Flags = 0;
cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags);
- } else if ((rc == 0) && tcon == NULL) {
- /* all we need to save for IPC$ connection */
- ses->ipc_tid = smb_buffer_response->Tid;
}
cifs_buf_release(smb_buffer);
@@ -4235,7 +4386,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
reset_cifs_unix_caps(0, tcon, NULL, vol_info);
out:
kfree(vol_info->username);
- kfree(vol_info->password);
+ kzfree(vol_info->password);
kfree(vol_info);
return tcon;
@@ -4387,7 +4538,7 @@ cifs_prune_tlinks(struct work_struct *work)
struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
prune_tlinks.work);
struct rb_root *root = &cifs_sb->tlink_tree;
- struct rb_node *node = rb_first(root);
+ struct rb_node *node;
struct rb_node *tmp;
struct tcon_link *tlink;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index df9f682708c6..7cee97b93a61 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -42,7 +42,7 @@
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
#include "fscache.h"
-
+#include "smbdirect.h"
static inline int cifs_convert_flags(unsigned int flags)
{
@@ -2902,7 +2902,12 @@ cifs_readdata_release(struct kref *refcount)
{
struct cifs_readdata *rdata = container_of(refcount,
struct cifs_readdata, refcount);
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (rdata->mr) {
+ smbd_deregister_mr(rdata->mr);
+ rdata->mr = NULL;
+ }
+#endif
if (rdata->cfile)
cifsFileInfo_put(rdata->cfile);
@@ -3031,6 +3036,10 @@ uncached_fill_pages(struct TCP_Server_Info *server,
}
if (iter)
result = copy_page_from_iter(page, 0, n, iter);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ else if (rdata->mr)
+ result = n;
+#endif
else
result = cifs_read_page_from_socket(server, page, n);
if (result < 0)
@@ -3471,20 +3480,18 @@ static const struct vm_operations_struct cifs_file_vm_ops = {
int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
{
- int rc, xid;
+ int xid, rc = 0;
struct inode *inode = file_inode(file);
xid = get_xid();
- if (!CIFS_CACHE_READ(CIFS_I(inode))) {
+ if (!CIFS_CACHE_READ(CIFS_I(inode)))
rc = cifs_zap_mapping(inode);
- if (rc)
- return rc;
- }
-
- rc = generic_file_mmap(file, vma);
- if (rc == 0)
+ if (!rc)
+ rc = generic_file_mmap(file, vma);
+ if (!rc)
vma->vm_ops = &cifs_file_vm_ops;
+
free_xid(xid);
return rc;
}
@@ -3494,16 +3501,16 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
int rc, xid;
xid = get_xid();
+
rc = cifs_revalidate_file(file);
- if (rc) {
+ if (rc)
cifs_dbg(FYI, "Validation prior to mmap failed, error=%d\n",
rc);
- free_xid(xid);
- return rc;
- }
- rc = generic_file_mmap(file, vma);
- if (rc == 0)
+ if (!rc)
+ rc = generic_file_mmap(file, vma);
+ if (!rc)
vma->vm_ops = &cifs_file_vm_ops;
+
free_xid(xid);
return rc;
}
@@ -3600,6 +3607,10 @@ readpages_fill_pages(struct TCP_Server_Info *server,
if (iter)
result = copy_page_from_iter(page, 0, n, iter);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ else if (rdata->mr)
+ result = n;
+#endif
else
result = cifs_read_page_from_socket(server, page, n);
if (result < 0)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 7c732cb44164..8f9a8cc7cc62 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -985,7 +985,7 @@ retry_iget5_locked:
}
cifs_fattr_to_inode(inode, fattr);
- if (sb->s_flags & MS_NOATIME)
+ if (sb->s_flags & SB_NOATIME)
inode->i_flags |= S_NOATIME | S_NOCMTIME;
if (inode->i_state & I_NEW) {
inode->i_ino = hash;
@@ -1049,7 +1049,7 @@ iget_no_retry:
tcon->resource_id = CIFS_I(inode)->uniqueid;
#endif
- if (rc && tcon->ipc) {
+ if (rc && tcon->pipe) {
cifs_dbg(FYI, "ipc connection - fake read inode\n");
spin_lock(&inode->i_lock);
inode->i_mode |= S_IFDIR;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index eea93ac15ef0..a0dbced4a45c 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -98,14 +98,11 @@ sesInfoFree(struct cifs_ses *buf_to_free)
kfree(buf_to_free->serverOS);
kfree(buf_to_free->serverDomain);
kfree(buf_to_free->serverNOS);
- if (buf_to_free->password) {
- memset(buf_to_free->password, 0, strlen(buf_to_free->password));
- kfree(buf_to_free->password);
- }
+ kzfree(buf_to_free->password);
kfree(buf_to_free->user_name);
kfree(buf_to_free->domainName);
- kfree(buf_to_free->auth_key.response);
- kfree(buf_to_free);
+ kzfree(buf_to_free->auth_key.response);
+ kzfree(buf_to_free);
}
struct cifs_tcon *
@@ -136,10 +133,7 @@ tconInfoFree(struct cifs_tcon *buf_to_free)
}
atomic_dec(&tconInfoAllocCount);
kfree(buf_to_free->nativeFileSystem);
- if (buf_to_free->password) {
- memset(buf_to_free->password, 0, strlen(buf_to_free->password));
- kfree(buf_to_free->password);
- }
+ kzfree(buf_to_free->password);
kfree(buf_to_free);
}
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index a723df3e0197..3d495e440c87 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -87,9 +87,11 @@ cifs_read_data_offset(char *buf)
}
static unsigned int
-cifs_read_data_length(char *buf)
+cifs_read_data_length(char *buf, bool in_remaining)
{
READ_RSP *rsp = (READ_RSP *)buf;
+ /* It's a bug reading remaining data for SMB1 packets */
+ WARN_ON(in_remaining);
return (le16_to_cpu(rsp->DataLengthHigh) << 16) +
le16_to_cpu(rsp->DataLength);
}
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index b4b1f0305f29..12af5dba742b 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -74,7 +74,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
nr_ioctl_req.Reserved = 0;
rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid,
fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY,
- true /* is_fsctl */, false /* use_ipc */,
+ true /* is_fsctl */,
(char *)&nr_ioctl_req, sizeof(nr_ioctl_req),
NULL, NULL /* no return info */);
if (rc == -EOPNOTSUPP) {
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 7b08a1446a7f..76d03abaa38c 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -578,7 +578,7 @@ smb2_is_valid_lease_break(char *buffer)
bool
smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
{
- struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer;
+ struct smb2_oplock_break_rsp *rsp = (struct smb2_oplock_break_rsp *)buffer;
struct list_head *tmp, *tmp1, *tmp2;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index e06740436b92..eb68e2fcc500 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -32,6 +32,7 @@
#include "smb2status.h"
#include "smb2glob.h"
#include "cifs_ioctl.h"
+#include "smbdirect.h"
static int
change_conf(struct TCP_Server_Info *server)
@@ -250,7 +251,11 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
/* start with specified wsize, or default */
wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (server->rdma)
+ wsize = min_t(unsigned int,
+ wsize, server->smbd_conn->max_readwrite_size);
+#endif
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
@@ -266,6 +271,11 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
/* start with specified rsize, or default */
rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (server->rdma)
+ rsize = min_t(unsigned int,
+ rsize, server->smbd_conn->max_readwrite_size);
+#endif
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
@@ -283,7 +293,6 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
- false /* use_ipc */,
NULL /* no data input */, 0 /* no data input */,
(char **)&out_buf, &ret_data_len);
if (rc != 0)
@@ -782,7 +791,6 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */,
- false /* use_ipc */,
NULL, 0 /* no input */,
(char **)&res_key, &ret_data_len);
@@ -848,8 +856,7 @@ smb2_copychunk_range(const unsigned int xid,
/* Request server copy to target from src identified by key */
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
- true /* is_fsctl */, false /* use_ipc */,
- (char *)pcchunk,
+ true /* is_fsctl */, (char *)pcchunk,
sizeof(struct copychunk_ioctl), (char **)&retbuf,
&ret_data_len);
if (rc == 0) {
@@ -947,9 +954,13 @@ smb2_read_data_offset(char *buf)
}
static unsigned int
-smb2_read_data_length(char *buf)
+smb2_read_data_length(char *buf, bool in_remaining)
{
struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf;
+
+ if (in_remaining)
+ return le32_to_cpu(rsp->DataRemaining);
+
return le32_to_cpu(rsp->DataLength);
}
@@ -1006,7 +1017,7 @@ static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, FSCTL_SET_SPARSE,
- true /* is_fctl */, false /* use_ipc */,
+ true /* is_fctl */,
&setsparse, 1, NULL, NULL);
if (rc) {
tcon->broken_sparse_sup = true;
@@ -1077,7 +1088,7 @@ smb2_duplicate_extents(const unsigned int xid,
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid,
FSCTL_DUPLICATE_EXTENTS_TO_FILE,
- true /* is_fsctl */, false /* use_ipc */,
+ true /* is_fsctl */,
(char *)&dup_ext_buf,
sizeof(struct duplicate_extents_to_file),
NULL,
@@ -1112,7 +1123,7 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon,
return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
FSCTL_SET_INTEGRITY_INFORMATION,
- true /* is_fsctl */, false /* use_ipc */,
+ true /* is_fsctl */,
(char *)&integr_info,
sizeof(struct fsctl_set_integrity_information_req),
NULL,
@@ -1132,7 +1143,7 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
FSCTL_SRV_ENUMERATE_SNAPSHOTS,
- true /* is_fsctl */, false /* use_ipc */,
+ true /* is_fsctl */,
NULL, 0 /* no input data */,
(char **)&retbuf,
&ret_data_len);
@@ -1351,16 +1362,20 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
cifs_dbg(FYI, "smb2_get_dfs_refer path <%s>\n", search_name);
/*
- * Use any tcon from the current session. Here, the first one.
+ * Try to use the IPC tcon, otherwise just use any tcon of the session
*/
- spin_lock(&cifs_tcp_ses_lock);
- tcon = list_first_entry_or_null(&ses->tcon_list, struct cifs_tcon,
- tcon_list);
- if (tcon)
- tcon->tc_count++;
- spin_unlock(&cifs_tcp_ses_lock);
+ tcon = ses->tcon_ipc;
+ if (tcon == NULL) {
+ spin_lock(&cifs_tcp_ses_lock);
+ tcon = list_first_entry_or_null(&ses->tcon_list,
+ struct cifs_tcon,
+ tcon_list);
+ if (tcon)
+ tcon->tc_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+ }
- if (!tcon) {
+ if (tcon == NULL) {
cifs_dbg(VFS, "session %p has no tcon available for a dfs referral request\n",
ses);
rc = -ENOTCONN;
@@ -1389,24 +1404,16 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
memcpy(dfs_req->RequestFileName, utf16_path, utf16_path_len);
do {
- /* try first with IPC */
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_DFS_GET_REFERRALS,
- true /* is_fsctl */, true /* use_ipc */,
+ true /* is_fsctl */,
(char *)dfs_req, dfs_req_size,
(char **)&dfs_rsp, &dfs_rsp_size);
- if (rc == -ENOTCONN) {
- /* try with normal tcon */
- rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
- FSCTL_DFS_GET_REFERRALS,
- true /* is_fsctl */, false /*use_ipc*/,
- (char *)dfs_req, dfs_req_size,
- (char **)&dfs_rsp, &dfs_rsp_size);
- }
} while (rc == -EAGAIN);
if (rc) {
- cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc);
+ if (rc != -ENOENT)
+ cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc);
goto out;
}
@@ -1420,7 +1427,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
}
out:
- if (tcon) {
+ if (tcon && !tcon->ipc) {
+ /* ipc tcons are not refcounted */
spin_lock(&cifs_tcp_ses_lock);
tcon->tc_count--;
spin_unlock(&cifs_tcp_ses_lock);
@@ -1712,8 +1720,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
- true /* is_fctl */, false /* use_ipc */,
- (char *)&fsctl_buf,
+ true /* is_fctl */, (char *)&fsctl_buf,
sizeof(struct file_zero_data_information), NULL, NULL);
free_xid(xid);
return rc;
@@ -1747,8 +1754,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
- true /* is_fctl */, false /* use_ipc */,
- (char *)&fsctl_buf,
+ true /* is_fctl */, (char *)&fsctl_buf,
sizeof(struct file_zero_data_information), NULL, NULL);
free_xid(xid);
return rc;
@@ -2410,6 +2416,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
struct iov_iter iter;
struct kvec iov;
int length;
+ bool use_rdma_mr = false;
if (shdr->Command != SMB2_READ) {
cifs_dbg(VFS, "only big read responses are supported\n");
@@ -2436,7 +2443,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
}
data_offset = server->ops->read_data_offset(buf) + 4;
- data_len = server->ops->read_data_length(buf);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ use_rdma_mr = rdata->mr;
+#endif
+ data_len = server->ops->read_data_length(buf, use_rdma_mr);
if (data_offset < server->vals->read_rsp_size) {
/*
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 5331631386a2..63778ac22fd9 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -48,6 +48,7 @@
#include "smb2glob.h"
#include "cifspdu.h"
#include "cifs_spnego.h"
+#include "smbdirect.h"
/*
* The following table defines the expected "StructureSize" of SMB2 requests
@@ -319,54 +320,16 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf,
*total_len = parmsize + sizeof(struct smb2_sync_hdr);
}
-/* init request without RFC1001 length at the beginning */
-static int
-smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
- void **request_buf, unsigned int *total_len)
-{
- int rc;
- struct smb2_sync_hdr *shdr;
-
- rc = smb2_reconnect(smb2_command, tcon);
- if (rc)
- return rc;
-
- /* BB eventually switch this to SMB2 specific small buf size */
- *request_buf = cifs_small_buf_get();
- if (*request_buf == NULL) {
- /* BB should we add a retry in here if not a writepage? */
- return -ENOMEM;
- }
-
- shdr = (struct smb2_sync_hdr *)(*request_buf);
-
- fill_small_buf(smb2_command, tcon, shdr, total_len);
-
- if (tcon != NULL) {
-#ifdef CONFIG_CIFS_STATS2
- uint16_t com_code = le16_to_cpu(smb2_command);
-
- cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_sent[com_code]);
-#endif
- cifs_stats_inc(&tcon->num_smbs_sent);
- }
-
- return rc;
-}
-
/*
* Allocate and return pointer to an SMB request hdr, and set basic
* SMB information in the SMB header. If the return code is zero, this
- * function must have filled in request_buf pointer. The returned buffer
- * has RFC1001 length at the beginning.
+ * function must have filled in request_buf pointer.
*/
static int
-small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon,
- void **request_buf)
+smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
+ void **request_buf, unsigned int *total_len)
{
int rc;
- unsigned int total_len;
- struct smb2_pdu *pdu;
rc = smb2_reconnect(smb2_command, tcon);
if (rc)
@@ -379,12 +342,9 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon,
return -ENOMEM;
}
- pdu = (struct smb2_pdu *)(*request_buf);
-
- fill_small_buf(smb2_command, tcon, get_sync_hdr(pdu), &total_len);
-
- /* Note this is only network field converted to big endian */
- pdu->hdr.smb2_buf_length = cpu_to_be32(total_len);
+ fill_small_buf(smb2_command, tcon,
+ (struct smb2_sync_hdr *)(*request_buf),
+ total_len);
if (tcon != NULL) {
#ifdef CONFIG_CIFS_STATS2
@@ -398,8 +358,8 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon,
}
#ifdef CONFIG_CIFS_SMB311
-/* offset is sizeof smb2_negotiate_req - 4 but rounded up to 8 bytes */
-#define OFFSET_OF_NEG_CONTEXT 0x68 /* sizeof(struct smb2_negotiate_req) - 4 */
+/* offset is sizeof smb2_negotiate_req but rounded up to 8 bytes */
+#define OFFSET_OF_NEG_CONTEXT 0x68 /* sizeof(struct smb2_negotiate_req) */
#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1)
@@ -427,23 +387,25 @@ build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt)
}
static void
-assemble_neg_contexts(struct smb2_negotiate_req *req)
+assemble_neg_contexts(struct smb2_negotiate_req *req,
+ unsigned int *total_len)
{
-
- /* +4 is to account for the RFC1001 len field */
- char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT + 4;
+ char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT;
build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt);
/* Add 2 to size to round to 8 byte boundary */
+
pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context);
build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt);
req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT);
req->NegotiateContextCount = cpu_to_le16(2);
- inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context)
- + sizeof(struct smb2_encryption_neg_context)); /* calculate hash */
+
+ *total_len += 4 + sizeof(struct smb2_preauth_neg_context)
+ + sizeof(struct smb2_encryption_neg_context);
}
#else
-static void assemble_neg_contexts(struct smb2_negotiate_req *req)
+static void assemble_neg_contexts(struct smb2_negotiate_req *req,
+ unsigned int *total_len)
{
return;
}
@@ -477,6 +439,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
int blob_offset, blob_length;
char *security_blob;
int flags = CIFS_NEG_OP;
+ unsigned int total_len;
cifs_dbg(FYI, "Negotiate protocol\n");
@@ -485,30 +448,30 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
return -EIO;
}
- rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_NEGOTIATE, NULL, (void **) &req, &total_len);
if (rc)
return rc;
- req->hdr.sync_hdr.SessionId = 0;
+ req->sync_hdr.SessionId = 0;
if (strcmp(ses->server->vals->version_string,
SMB3ANY_VERSION_STRING) == 0) {
req->Dialects[0] = cpu_to_le16(SMB30_PROT_ID);
req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID);
req->DialectCount = cpu_to_le16(2);
- inc_rfc1001_len(req, 4);
+ total_len += 4;
} else if (strcmp(ses->server->vals->version_string,
SMBDEFAULT_VERSION_STRING) == 0) {
req->Dialects[0] = cpu_to_le16(SMB21_PROT_ID);
req->Dialects[1] = cpu_to_le16(SMB30_PROT_ID);
req->Dialects[2] = cpu_to_le16(SMB302_PROT_ID);
req->DialectCount = cpu_to_le16(3);
- inc_rfc1001_len(req, 6);
+ total_len += 6;
} else {
/* otherwise send specific dialect */
req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
req->DialectCount = cpu_to_le16(1);
- inc_rfc1001_len(req, 2);
+ total_len += 2;
}
/* only one of SMB2 signing flags may be set in SMB2 request */
@@ -528,13 +491,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
memcpy(req->ClientGUID, server->client_guid,
SMB2_CLIENT_GUID_SIZE);
if (ses->server->vals->protocol_id == SMB311_PROT_ID)
- assemble_neg_contexts(req);
+ assemble_neg_contexts(req, &total_len);
}
iov[0].iov_base = (char *)req;
- /* 4 for rfc1002 length field */
- iov[0].iov_len = get_rfc1002_length(req) + 4;
+ iov[0].iov_len = total_len;
- rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
+ rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_negotiate_rsp *)rsp_iov.iov_base;
/*
@@ -654,6 +616,11 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
cifs_dbg(FYI, "validate negotiate\n");
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (tcon->ses->server->rdma)
+ return 0;
+#endif
+
/*
* validation ioctl must be signed, so no point sending this if we
* can not sign it (ie are not known user). Even if signing is not
@@ -713,7 +680,6 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
- false /* use_ipc */,
(char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req),
(char **)&pneg_rsp, &rsplen);
@@ -733,8 +699,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
}
/* check validate negotiate info response matches what we got earlier */
- if (pneg_rsp->Dialect !=
- cpu_to_le16(tcon->ses->server->vals->protocol_id))
+ if (pneg_rsp->Dialect != cpu_to_le16(tcon->ses->server->dialect))
goto vneg_out;
if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode))
@@ -806,20 +771,22 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
struct cifs_ses *ses = sess_data->ses;
struct smb2_sess_setup_req *req;
struct TCP_Server_Info *server = ses->server;
+ unsigned int total_len;
- rc = small_smb2_init(SMB2_SESSION_SETUP, NULL, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_SESSION_SETUP, NULL, (void **) &req,
+ &total_len);
if (rc)
return rc;
/* First session, not a reauthenticate */
- req->hdr.sync_hdr.SessionId = 0;
+ req->sync_hdr.SessionId = 0;
/* if reconnect, we need to send previous sess id, otherwise it is 0 */
req->PreviousSessionId = sess_data->previous_session;
req->Flags = 0; /* MBZ */
/* to enable echos and oplocks */
- req->hdr.sync_hdr.CreditRequest = cpu_to_le16(3);
+ req->sync_hdr.CreditRequest = cpu_to_le16(3);
/* only one of SMB2 signing flags may be set in SMB2 request */
if (server->sign)
@@ -833,8 +800,8 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
req->Channel = 0; /* MBZ */
sess_data->iov[0].iov_base = (char *)req;
- /* 4 for rfc1002 length field and 1 for pad */
- sess_data->iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+ /* 1 for pad */
+ sess_data->iov[0].iov_len = total_len - 1;
/*
* This variable will be used to clear the buffer
* allocated above in case of any error in the calling function.
@@ -860,18 +827,15 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data)
/* Testing shows that buffer offset must be at location of Buffer[0] */
req->SecurityBufferOffset =
- cpu_to_le16(sizeof(struct smb2_sess_setup_req) -
- 1 /* pad */ - 4 /* rfc1001 len */);
+ cpu_to_le16(sizeof(struct smb2_sess_setup_req) - 1 /* pad */);
req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len);
- inc_rfc1001_len(req, sess_data->iov[1].iov_len - 1 /* pad */);
-
/* BB add code to build os and lm fields */
- rc = SendReceive2(sess_data->xid, sess_data->ses,
- sess_data->iov, 2,
- &sess_data->buf0_type,
- CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov);
+ rc = smb2_send_recv(sess_data->xid, sess_data->ses,
+ sess_data->iov, 2,
+ &sess_data->buf0_type,
+ CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov);
cifs_small_buf_release(sess_data->iov[0].iov_base);
memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec));
@@ -1092,7 +1056,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
goto out;
req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base;
- req->hdr.sync_hdr.SessionId = ses->Suid;
+ req->sync_hdr.SessionId = ses->Suid;
rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses,
sess_data->nls_cp);
@@ -1202,6 +1166,10 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
int rc = 0;
struct TCP_Server_Info *server;
int flags = 0;
+ unsigned int total_len;
+ struct kvec iov[1];
+ struct kvec rsp_iov;
+ int resp_buf_type;
cifs_dbg(FYI, "disconnect session %p\n", ses);
@@ -1214,19 +1182,24 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
if (ses->need_reconnect)
goto smb2_session_already_dead;
- rc = small_smb2_init(SMB2_LOGOFF, NULL, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_LOGOFF, NULL, (void **) &req, &total_len);
if (rc)
return rc;
/* since no tcon, smb2_init can not do this, so do here */
- req->hdr.sync_hdr.SessionId = ses->Suid;
+ req->sync_hdr.SessionId = ses->Suid;
if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
flags |= CIFS_TRANSFORM_REQ;
else if (server->sign)
- req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
- rc = SendReceiveNoRsp(xid, ses, (char *) req, flags);
+ flags |= CIFS_NO_RESP;
+
+ iov[0].iov_base = (char *)req;
+ iov[0].iov_len = total_len;
+
+ rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
/*
* No tcon so can't do
@@ -1265,6 +1238,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
int unc_path_len;
__le16 *unc_path = NULL;
int flags = 0;
+ unsigned int total_len;
cifs_dbg(FYI, "TCON\n");
@@ -1283,40 +1257,30 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
}
/* SMB2 TREE_CONNECT request must be called with TreeId == 0 */
- if (tcon)
- tcon->tid = 0;
+ tcon->tid = 0;
- rc = small_smb2_init(SMB2_TREE_CONNECT, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_TREE_CONNECT, tcon, (void **) &req,
+ &total_len);
if (rc) {
kfree(unc_path);
return rc;
}
- if (tcon == NULL) {
- if ((ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA))
- flags |= CIFS_TRANSFORM_REQ;
-
- /* since no tcon, smb2_init can not do this, so do here */
- req->hdr.sync_hdr.SessionId = ses->Suid;
- if (ses->server->sign)
- req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
- } else if (encryption_required(tcon))
+ if (encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
iov[0].iov_base = (char *)req;
- /* 4 for rfc1002 length field and 1 for pad */
- iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+ /* 1 for pad */
+ iov[0].iov_len = total_len - 1;
/* Testing shows that buffer offset must be at location of Buffer[0] */
req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req)
- - 1 /* pad */ - 4 /* do not count rfc1001 len field */);
+ - 1 /* pad */);
req->PathLength = cpu_to_le16(unc_path_len - 2);
iov[1].iov_base = unc_path;
iov[1].iov_len = unc_path_len;
- inc_rfc1001_len(req, unc_path_len - 1 /* pad */);
-
- rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov);
+ rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base;
@@ -1328,21 +1292,16 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
goto tcon_error_exit;
}
- if (tcon == NULL) {
- ses->ipc_tid = rsp->hdr.sync_hdr.TreeId;
- goto tcon_exit;
- }
-
switch (rsp->ShareType) {
case SMB2_SHARE_TYPE_DISK:
cifs_dbg(FYI, "connection to disk share\n");
break;
case SMB2_SHARE_TYPE_PIPE:
- tcon->ipc = true;
+ tcon->pipe = true;
cifs_dbg(FYI, "connection to pipe share\n");
break;
case SMB2_SHARE_TYPE_PRINT:
- tcon->ipc = true;
+ tcon->print = true;
cifs_dbg(FYI, "connection to printer\n");
break;
default:
@@ -1389,6 +1348,10 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
int rc = 0;
struct cifs_ses *ses = tcon->ses;
int flags = 0;
+ unsigned int total_len;
+ struct kvec iov[1];
+ struct kvec rsp_iov;
+ int resp_buf_type;
cifs_dbg(FYI, "Tree Disconnect\n");
@@ -1398,14 +1361,20 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
return 0;
- rc = small_smb2_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req,
+ &total_len);
if (rc)
return rc;
if (encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- rc = SendReceiveNoRsp(xid, ses, (char *)req, flags);
+ flags |= CIFS_NO_RESP;
+
+ iov[0].iov_base = (char *)req;
+ iov[0].iov_len = total_len;
+
+ rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
if (rc)
cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE);
@@ -1505,11 +1474,10 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
if (!req->CreateContextsOffset)
req->CreateContextsOffset = cpu_to_le32(
- sizeof(struct smb2_create_req) - 4 +
+ sizeof(struct smb2_create_req) +
iov[num - 1].iov_len);
le32_add_cpu(&req->CreateContextsLength,
server->vals->create_lease_size);
- inc_rfc1001_len(&req->hdr, server->vals->create_lease_size);
*num_iovec = num + 1;
return 0;
}
@@ -1589,10 +1557,9 @@ add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec,
iov[num].iov_len = sizeof(struct create_durable_v2);
if (!req->CreateContextsOffset)
req->CreateContextsOffset =
- cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ cpu_to_le32(sizeof(struct smb2_create_req) +
iov[1].iov_len);
le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_v2));
- inc_rfc1001_len(&req->hdr, sizeof(struct create_durable_v2));
*num_iovec = num + 1;
return 0;
}
@@ -1613,12 +1580,10 @@ add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec,
iov[num].iov_len = sizeof(struct create_durable_handle_reconnect_v2);
if (!req->CreateContextsOffset)
req->CreateContextsOffset =
- cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ cpu_to_le32(sizeof(struct smb2_create_req) +
iov[1].iov_len);
le32_add_cpu(&req->CreateContextsLength,
sizeof(struct create_durable_handle_reconnect_v2));
- inc_rfc1001_len(&req->hdr,
- sizeof(struct create_durable_handle_reconnect_v2));
*num_iovec = num + 1;
return 0;
}
@@ -1649,10 +1614,9 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec,
iov[num].iov_len = sizeof(struct create_durable);
if (!req->CreateContextsOffset)
req->CreateContextsOffset =
- cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ cpu_to_le32(sizeof(struct smb2_create_req) +
iov[1].iov_len);
le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable));
- inc_rfc1001_len(&req->hdr, sizeof(struct create_durable));
*num_iovec = num + 1;
return 0;
}
@@ -1723,6 +1687,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
__u32 file_attributes = 0;
char *dhc_buf = NULL, *lc_buf = NULL;
int flags = 0;
+ unsigned int total_len;
cifs_dbg(FYI, "create/open\n");
@@ -1731,7 +1696,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
else
return -EIO;
- rc = small_smb2_init(SMB2_CREATE, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_CREATE, tcon, (void **) &req, &total_len);
+
if (rc)
return rc;
@@ -1752,12 +1718,10 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK);
iov[0].iov_base = (char *)req;
- /* 4 for rfc1002 length field */
- iov[0].iov_len = get_rfc1002_length(req) + 4;
/* -1 since last byte is buf[0] which is sent below (path) */
- iov[0].iov_len--;
+ iov[0].iov_len = total_len - 1;
- req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - 4);
+ req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req));
/* [MS-SMB2] 2.2.13 NameOffset:
* If SMB2_FLAGS_DFS_OPERATIONS is set in the Flags field of
@@ -1770,7 +1734,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
if (tcon->share_flags & SHI1005_FLAGS_DFS) {
int name_len;
- req->hdr.sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
+ req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
&name_len,
tcon->treeName, path);
@@ -1797,8 +1761,6 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
iov[1].iov_len = uni_path_len;
iov[1].iov_base = path;
- /* -1 since last byte is buf[0] which was counted in smb2_buf_len */
- inc_rfc1001_len(req, uni_path_len - 1);
if (!server->oplocks)
*oplock = SMB2_OPLOCK_LEVEL_NONE;
@@ -1836,7 +1798,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
dhc_buf = iov[n_iov-1].iov_base;
}
- rc = SendReceive2(xid, ses, iov, n_iov, &resp_buftype, flags, &rsp_iov);
+ rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags,
+ &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_create_rsp *)rsp_iov.iov_base;
@@ -1877,7 +1840,7 @@ creat_exit:
*/
int
SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
- u64 volatile_fid, u32 opcode, bool is_fsctl, bool use_ipc,
+ u64 volatile_fid, u32 opcode, bool is_fsctl,
char *in_data, u32 indatalen,
char **out_data, u32 *plen /* returned data len */)
{
@@ -1891,6 +1854,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
int n_iov;
int rc = 0;
int flags = 0;
+ unsigned int total_len;
cifs_dbg(FYI, "SMB2 IOCTL\n");
@@ -1909,20 +1873,10 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if (!ses || !(ses->server))
return -EIO;
- rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len);
if (rc)
return rc;
- if (use_ipc) {
- if (ses->ipc_tid == 0) {
- cifs_small_buf_release(req);
- return -ENOTCONN;
- }
-
- cifs_dbg(FYI, "replacing tid 0x%x with IPC tid 0x%x\n",
- req->hdr.sync_hdr.TreeId, ses->ipc_tid);
- req->hdr.sync_hdr.TreeId = ses->ipc_tid;
- }
if (encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -1934,7 +1888,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
req->InputCount = cpu_to_le32(indatalen);
/* do not set InputOffset if no input data */
req->InputOffset =
- cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4);
+ cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer));
iov[1].iov_base = in_data;
iov[1].iov_len = indatalen;
n_iov = 2;
@@ -1969,21 +1923,20 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
* but if input data passed to ioctl, we do not
* want to double count this, so we do not send
* the dummy one byte of data in iovec[0] if sending
- * input data (in iovec[1]). We also must add 4 bytes
- * in first iovec to allow for rfc1002 length field.
+ * input data (in iovec[1]).
*/
if (indatalen) {
- iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
- inc_rfc1001_len(req, indatalen - 1);
+ iov[0].iov_len = total_len - 1;
} else
- iov[0].iov_len = get_rfc1002_length(req) + 4;
+ iov[0].iov_len = total_len;
/* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
- req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
- rc = SendReceive2(xid, ses, iov, n_iov, &resp_buftype, flags, &rsp_iov);
+ rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags,
+ &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base;
@@ -2052,7 +2005,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
FSCTL_SET_COMPRESSION, true /* is_fsctl */,
- false /* use_ipc */,
(char *)&fsctl_input /* data input */,
2 /* in data len */, &ret_data /* out data */, NULL);
@@ -2073,13 +2025,14 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
int resp_buftype;
int rc = 0;
int flags = 0;
+ unsigned int total_len;
cifs_dbg(FYI, "Close\n");
if (!ses || !(ses->server))
return -EIO;
- rc = small_smb2_init(SMB2_CLOSE, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_CLOSE, tcon, (void **) &req, &total_len);
if (rc)
return rc;
@@ -2090,10 +2043,9 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
req->VolatileFileId = volatile_fid;
iov[0].iov_base = (char *)req;
- /* 4 for rfc1002 length field */
- iov[0].iov_len = get_rfc1002_length(req) + 4;
+ iov[0].iov_len = total_len;
- rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
+ rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_close_rsp *)rsp_iov.iov_base;
@@ -2180,13 +2132,15 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
int resp_buftype;
struct cifs_ses *ses = tcon->ses;
int flags = 0;
+ unsigned int total_len;
cifs_dbg(FYI, "Query Info\n");
if (!ses || !(ses->server))
return -EIO;
- rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req,
+ &total_len);
if (rc)
return rc;
@@ -2203,15 +2157,14 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
* We do not use the input buffer (do not send extra byte)
*/
req->InputBufferOffset = 0;
- inc_rfc1001_len(req, -1);
req->OutputBufferLength = cpu_to_le32(output_len);
iov[0].iov_base = (char *)req;
- /* 4 for rfc1002 length field */
- iov[0].iov_len = get_rfc1002_length(req) + 4;
+ /* 1 for Buffer */
+ iov[0].iov_len = total_len - 1;
- rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
+ rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
@@ -2338,6 +2291,10 @@ void smb2_reconnect_server(struct work_struct *work)
tcon_exist = true;
}
}
+ if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) {
+ list_add_tail(&ses->tcon_ipc->rlist, &tmp_list);
+ tcon_exist = true;
+ }
}
/*
* Get the reference to server struct to be sure that the last call of
@@ -2376,6 +2333,8 @@ SMB2_echo(struct TCP_Server_Info *server)
struct kvec iov[2];
struct smb_rqst rqst = { .rq_iov = iov,
.rq_nvec = 2 };
+ unsigned int total_len;
+ __be32 rfc1002_marker;
cifs_dbg(FYI, "In echo request\n");
@@ -2385,17 +2344,17 @@ SMB2_echo(struct TCP_Server_Info *server)
return rc;
}
- rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req);
+ rc = smb2_plain_req_init(SMB2_ECHO, NULL, (void **)&req, &total_len);
if (rc)
return rc;
- req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1);
+ req->sync_hdr.CreditRequest = cpu_to_le16(1);
- /* 4 for rfc1002 length field */
iov[0].iov_len = 4;
- iov[0].iov_base = (char *)req;
- iov[1].iov_len = get_rfc1002_length(req);
- iov[1].iov_base = (char *)req + 4;
+ rfc1002_marker = cpu_to_be32(total_len);
+ iov[0].iov_base = &rfc1002_marker;
+ iov[1].iov_len = total_len;
+ iov[1].iov_base = (char *)req;
rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL,
server, CIFS_ECHO_OP);
@@ -2417,13 +2376,14 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
int resp_buftype;
int rc = 0;
int flags = 0;
+ unsigned int total_len;
cifs_dbg(FYI, "Flush\n");
if (!ses || !(ses->server))
return -EIO;
- rc = small_smb2_init(SMB2_FLUSH, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_FLUSH, tcon, (void **) &req, &total_len);
if (rc)
return rc;
@@ -2434,10 +2394,9 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
req->VolatileFileId = volatile_fid;
iov[0].iov_base = (char *)req;
- /* 4 for rfc1002 length field */
- iov[0].iov_len = get_rfc1002_length(req) + 4;
+ iov[0].iov_len = total_len;
- rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
+ rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
if (rc != 0)
@@ -2453,18 +2412,21 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
*/
static int
smb2_new_read_req(void **buf, unsigned int *total_len,
- struct cifs_io_parms *io_parms, unsigned int remaining_bytes,
- int request_type)
+ struct cifs_io_parms *io_parms, struct cifs_readdata *rdata,
+ unsigned int remaining_bytes, int request_type)
{
int rc = -EACCES;
struct smb2_read_plain_req *req = NULL;
struct smb2_sync_hdr *shdr;
+ struct TCP_Server_Info *server;
rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, (void **) &req,
total_len);
if (rc)
return rc;
- if (io_parms->tcon->ses->server == NULL)
+
+ server = io_parms->tcon->ses->server;
+ if (server == NULL)
return -ECONNABORTED;
shdr = &req->sync_hdr;
@@ -2478,7 +2440,40 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
req->MinimumCount = 0;
req->Length = cpu_to_le32(io_parms->length);
req->Offset = cpu_to_le64(io_parms->offset);
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ /*
+ * If we want to do an RDMA write, fill in and append
+ * smbd_buffer_descriptor_v1 to the end of read request
+ */
+ if (server->rdma && rdata &&
+ rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) {
+
+ struct smbd_buffer_descriptor_v1 *v1;
+ bool need_invalidate =
+ io_parms->tcon->ses->server->dialect == SMB30_PROT_ID;
+
+ rdata->mr = smbd_register_mr(
+ server->smbd_conn, rdata->pages,
+ rdata->nr_pages, rdata->tailsz,
+ true, need_invalidate);
+ if (!rdata->mr)
+ return -ENOBUFS;
+
+ req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE;
+ if (need_invalidate)
+ req->Channel = SMB2_CHANNEL_RDMA_V1;
+ req->ReadChannelInfoOffset =
+ cpu_to_le16(offsetof(struct smb2_read_plain_req, Buffer));
+ req->ReadChannelInfoLength =
+ cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1));
+ v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
+ v1->offset = cpu_to_le64(rdata->mr->mr->iova);
+ v1->token = cpu_to_le32(rdata->mr->mr->rkey);
+ v1->length = cpu_to_le32(rdata->mr->mr->length);
+
+ *total_len += sizeof(*v1) - 1;
+ }
+#endif
if (request_type & CHAINED_REQUEST) {
if (!(request_type & END_OF_CHAIN)) {
/* next 8-byte aligned request */
@@ -2557,7 +2552,17 @@ smb2_readv_callback(struct mid_q_entry *mid)
if (rdata->result != -ENODATA)
rdata->result = -EIO;
}
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ /*
+ * If this rdata has memory registered, the MR can be freed.
+ * The MR needs to be freed as soon as I/O finishes to prevent deadlock,
+ * because MRs are limited in number and are needed for future I/Os
+ */
+ if (rdata->mr) {
+ smbd_deregister_mr(rdata->mr);
+ rdata->mr = NULL;
+ }
+#endif
if (rdata->result)
cifs_stats_fail_inc(tcon, SMB2_READ_HE);
@@ -2592,7 +2597,8 @@ smb2_async_readv(struct cifs_readdata *rdata)
server = io_parms.tcon->ses->server;
- rc = smb2_new_read_req((void **) &buf, &total_len, &io_parms, 0, 0);
+ rc = smb2_new_read_req(
+ (void **) &buf, &total_len, &io_parms, rdata, 0, 0);
if (rc) {
if (rc == -EAGAIN && rdata->credits) {
/* credits was reset by reconnect */
@@ -2650,55 +2656,48 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
struct smb2_read_plain_req *req = NULL;
struct smb2_read_rsp *rsp = NULL;
struct smb2_sync_hdr *shdr;
- struct kvec iov[2];
+ struct kvec iov[1];
struct kvec rsp_iov;
unsigned int total_len;
- __be32 req_len;
- struct smb_rqst rqst = { .rq_iov = iov,
- .rq_nvec = 2 };
int flags = CIFS_LOG_ERROR;
struct cifs_ses *ses = io_parms->tcon->ses;
*nbytes = 0;
- rc = smb2_new_read_req((void **)&req, &total_len, io_parms, 0, 0);
+ rc = smb2_new_read_req((void **)&req, &total_len, io_parms, NULL, 0, 0);
if (rc)
return rc;
if (encryption_required(io_parms->tcon))
flags |= CIFS_TRANSFORM_REQ;
- req_len = cpu_to_be32(total_len);
-
- iov[0].iov_base = &req_len;
- iov[0].iov_len = sizeof(__be32);
- iov[1].iov_base = req;
- iov[1].iov_len = total_len;
+ iov[0].iov_base = (char *)req;
+ iov[0].iov_len = total_len;
- rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
+ rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_read_rsp *)rsp_iov.iov_base;
- shdr = get_sync_hdr(rsp);
- if (shdr->Status == STATUS_END_OF_FILE) {
+ if (rc) {
+ if (rc != -ENODATA) {
+ cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
+ cifs_dbg(VFS, "Send error in read = %d\n", rc);
+ }
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
- return 0;
+ return rc == -ENODATA ? 0 : rc;
}
- if (rc) {
- cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
- cifs_dbg(VFS, "Send error in read = %d\n", rc);
- } else {
- *nbytes = le32_to_cpu(rsp->DataLength);
- if ((*nbytes > CIFS_MAX_MSGSIZE) ||
- (*nbytes > io_parms->length)) {
- cifs_dbg(FYI, "bad length %d for count %d\n",
- *nbytes, io_parms->length);
- rc = -EIO;
- *nbytes = 0;
- }
+ *nbytes = le32_to_cpu(rsp->DataLength);
+ if ((*nbytes > CIFS_MAX_MSGSIZE) ||
+ (*nbytes > io_parms->length)) {
+ cifs_dbg(FYI, "bad length %d for count %d\n",
+ *nbytes, io_parms->length);
+ rc = -EIO;
+ *nbytes = 0;
}
+ shdr = get_sync_hdr(rsp);
+
if (*buf) {
memcpy(*buf, (char *)shdr + rsp->DataOffset, *nbytes);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
@@ -2755,7 +2754,19 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->result = -EIO;
break;
}
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ /*
+ * If this wdata has memory registered, the MR can be freed.
+ * The number of MRs available is limited, so it is important to
+ * recover a used MR as soon as I/O is finished. Holding an MR longer,
+ * into later I/O processing, can result in I/O deadlock due to the
+ * lack of an MR to send the request on I/O retry
+ */
+ if (wdata->mr) {
+ smbd_deregister_mr(wdata->mr);
+ wdata->mr = NULL;
+ }
+#endif
if (wdata->result)
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
@@ -2776,8 +2787,10 @@ smb2_async_writev(struct cifs_writedata *wdata,
struct TCP_Server_Info *server = tcon->ses->server;
struct kvec iov[2];
struct smb_rqst rqst = { };
+ unsigned int total_len;
+ __be32 rfc1002_marker;
- rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len);
if (rc) {
if (rc == -EAGAIN && wdata->credits) {
/* credits was reset by reconnect */
@@ -2793,7 +2806,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
if (encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- shdr = get_sync_hdr(req);
+ shdr = (struct smb2_sync_hdr *)req;
shdr->ProcessId = cpu_to_le32(wdata->cfile->pid);
req->PersistentFileId = wdata->cfile->fid.persistent_fid;
@@ -2802,16 +2815,51 @@ smb2_async_writev(struct cifs_writedata *wdata,
req->WriteChannelInfoLength = 0;
req->Channel = 0;
req->Offset = cpu_to_le64(wdata->offset);
- /* 4 for rfc1002 length field */
req->DataOffset = cpu_to_le16(
- offsetof(struct smb2_write_req, Buffer) - 4);
+ offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ /*
+ * If we want to do a server RDMA read, fill in and append
+ * smbd_buffer_descriptor_v1 to the end of write request
+ */
+ if (server->rdma && wdata->bytes >=
+ server->smbd_conn->rdma_readwrite_threshold) {
+
+ struct smbd_buffer_descriptor_v1 *v1;
+ bool need_invalidate = server->dialect == SMB30_PROT_ID;
+
+ wdata->mr = smbd_register_mr(
+ server->smbd_conn, wdata->pages,
+ wdata->nr_pages, wdata->tailsz,
+ false, need_invalidate);
+ if (!wdata->mr) {
+ rc = -ENOBUFS;
+ goto async_writev_out;
+ }
+ req->Length = 0;
+ req->DataOffset = 0;
+ req->RemainingBytes =
+ cpu_to_le32((wdata->nr_pages-1)*PAGE_SIZE + wdata->tailsz);
+ req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE;
+ if (need_invalidate)
+ req->Channel = SMB2_CHANNEL_RDMA_V1;
+ req->WriteChannelInfoOffset =
+ cpu_to_le16(offsetof(struct smb2_write_req, Buffer));
+ req->WriteChannelInfoLength =
+ cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1));
+ v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
+ v1->offset = cpu_to_le64(wdata->mr->mr->iova);
+ v1->token = cpu_to_le32(wdata->mr->mr->rkey);
+ v1->length = cpu_to_le32(wdata->mr->mr->length);
+ }
+#endif
/* 4 for rfc1002 length field and 1 for Buffer */
iov[0].iov_len = 4;
- iov[0].iov_base = req;
- iov[1].iov_len = get_rfc1002_length(req) - 1;
- iov[1].iov_base = (char *)req + 4;
+ rfc1002_marker = cpu_to_be32(total_len - 1 + wdata->bytes);
+ iov[0].iov_base = &rfc1002_marker;
+ iov[1].iov_len = total_len - 1;
+ iov[1].iov_base = (char *)req;
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
@@ -2819,13 +2867,22 @@ smb2_async_writev(struct cifs_writedata *wdata,
rqst.rq_npages = wdata->nr_pages;
rqst.rq_pagesz = wdata->pagesz;
rqst.rq_tailsz = wdata->tailsz;
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (wdata->mr) {
+ iov[1].iov_len += sizeof(struct smbd_buffer_descriptor_v1);
+ rqst.rq_npages = 0;
+ }
+#endif
cifs_dbg(FYI, "async write at %llu %u bytes\n",
wdata->offset, wdata->bytes);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ /* For RDMA read, I/O size is in RemainingBytes not in Length */
+ if (!wdata->mr)
+ req->Length = cpu_to_le32(wdata->bytes);
+#else
req->Length = cpu_to_le32(wdata->bytes);
-
- inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */);
+#endif
if (wdata->credits) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
@@ -2869,13 +2926,15 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
int resp_buftype;
struct kvec rsp_iov;
int flags = 0;
+ unsigned int total_len;
*nbytes = 0;
if (n_vec < 1)
return rc;
- rc = small_smb2_init(SMB2_WRITE, io_parms->tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_WRITE, io_parms->tcon, (void **) &req,
+ &total_len);
if (rc)
return rc;
@@ -2885,7 +2944,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
if (encryption_required(io_parms->tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->hdr.sync_hdr.ProcessId = cpu_to_le32(io_parms->pid);
+ req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid);
req->PersistentFileId = io_parms->persistent_fid;
req->VolatileFileId = io_parms->volatile_fid;
@@ -2894,20 +2953,16 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
req->Channel = 0;
req->Length = cpu_to_le32(io_parms->length);
req->Offset = cpu_to_le64(io_parms->offset);
- /* 4 for rfc1002 length field */
req->DataOffset = cpu_to_le16(
- offsetof(struct smb2_write_req, Buffer) - 4);
+ offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
iov[0].iov_base = (char *)req;
- /* 4 for rfc1002 length field and 1 for Buffer */
- iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
-
- /* length of entire message including data to be written */
- inc_rfc1001_len(req, io_parms->length - 1 /* Buffer */);
+ /* 1 for Buffer */
+ iov[0].iov_len = total_len - 1;
- rc = SendReceive2(xid, io_parms->tcon->ses, iov, n_vec + 1,
- &resp_buftype, flags, &rsp_iov);
+ rc = smb2_send_recv(xid, io_parms->tcon->ses, iov, n_vec + 1,
+ &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
@@ -2984,13 +3039,15 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int output_size = CIFSMaxBufSize;
size_t info_buf_size;
int flags = 0;
+ unsigned int total_len;
if (ses && (ses->server))
server = ses->server;
else
return -EIO;
- rc = small_smb2_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req,
+ &total_len);
if (rc)
return rc;
@@ -3022,7 +3079,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
memcpy(bufptr, &asteriks, len);
req->FileNameOffset =
- cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1 - 4);
+ cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1);
req->FileNameLength = cpu_to_le16(len);
/*
* BB could be 30 bytes or so longer if we used SMB2 specific
@@ -3033,15 +3090,13 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
req->OutputBufferLength = cpu_to_le32(output_size);
iov[0].iov_base = (char *)req;
- /* 4 for RFC1001 length and 1 for Buffer */
- iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+ /* 1 for Buffer */
+ iov[0].iov_len = total_len - 1;
iov[1].iov_base = (char *)(req->Buffer);
iov[1].iov_len = len;
- inc_rfc1001_len(req, len - 1 /* Buffer */);
-
- rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov);
+ rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base;
@@ -3110,6 +3165,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int i;
struct cifs_ses *ses = tcon->ses;
int flags = 0;
+ unsigned int total_len;
if (!ses || !(ses->server))
return -EIO;
@@ -3121,7 +3177,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
if (!iov)
return -ENOMEM;
- rc = small_smb2_init(SMB2_SET_INFO, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_SET_INFO, tcon, (void **) &req, &total_len);
if (rc) {
kfree(iov);
return rc;
@@ -3130,7 +3186,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
if (encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->hdr.sync_hdr.ProcessId = cpu_to_le32(pid);
+ req->sync_hdr.ProcessId = cpu_to_le32(pid);
req->InfoType = info_type;
req->FileInfoClass = info_class;
@@ -3138,27 +3194,25 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
req->VolatileFileId = volatile_fid;
req->AdditionalInformation = cpu_to_le32(additional_info);
- /* 4 for RFC1001 length and 1 for Buffer */
req->BufferOffset =
- cpu_to_le16(sizeof(struct smb2_set_info_req) - 1 - 4);
+ cpu_to_le16(sizeof(struct smb2_set_info_req) - 1);
req->BufferLength = cpu_to_le32(*size);
- inc_rfc1001_len(req, *size - 1 /* Buffer */);
-
memcpy(req->Buffer, *data, *size);
+ total_len += *size;
iov[0].iov_base = (char *)req;
- /* 4 for RFC1001 length */
- iov[0].iov_len = get_rfc1002_length(req) + 4;
+ /* 1 for Buffer */
+ iov[0].iov_len = total_len - 1;
for (i = 1; i < num; i++) {
- inc_rfc1001_len(req, size[i]);
le32_add_cpu(&req->BufferLength, size[i]);
iov[i].iov_base = (char *)data[i];
iov[i].iov_len = size[i];
}
- rc = SendReceive2(xid, ses, iov, num, &resp_buftype, flags, &rsp_iov);
+ rc = smb2_send_recv(xid, ses, iov, num, &resp_buftype, flags,
+ &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base;
@@ -3310,11 +3364,17 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
__u8 oplock_level)
{
int rc;
- struct smb2_oplock_break *req = NULL;
+ struct smb2_oplock_break_req *req = NULL;
+ struct cifs_ses *ses = tcon->ses;
int flags = CIFS_OBREAK_OP;
+ unsigned int total_len;
+ struct kvec iov[1];
+ struct kvec rsp_iov;
+ int resp_buf_type;
cifs_dbg(FYI, "SMB2_oplock_break\n");
- rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req,
+ &total_len);
if (rc)
return rc;
@@ -3324,9 +3384,14 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
req->VolatileFid = volatile_fid;
req->PersistentFid = persistent_fid;
req->OplockLevel = oplock_level;
- req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1);
+ req->sync_hdr.CreditRequest = cpu_to_le16(1);
+
+ flags |= CIFS_NO_RESP;
- rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, flags);
+ iov[0].iov_base = (char *)req;
+ iov[0].iov_len = total_len;
+
+ rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
if (rc) {
@@ -3355,13 +3420,15 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
{
int rc;
struct smb2_query_info_req *req;
+ unsigned int total_len;
cifs_dbg(FYI, "Query FSInfo level %d\n", level);
if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
return -EIO;
- rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req,
+ &total_len);
if (rc)
return rc;
@@ -3369,15 +3436,14 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
req->FileInfoClass = level;
req->PersistentFileId = persistent_fid;
req->VolatileFileId = volatile_fid;
- /* 4 for rfc1002 length field and 1 for pad */
+ /* 1 for pad */
req->InputBufferOffset =
- cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4);
+ cpu_to_le16(sizeof(struct smb2_query_info_req) - 1);
req->OutputBufferLength = cpu_to_le32(
outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - 4);
iov->iov_base = (char *)req;
- /* 4 for rfc1002 length field */
- iov->iov_len = get_rfc1002_length(req) + 4;
+ iov->iov_len = total_len;
return 0;
}
@@ -3403,7 +3469,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
if (encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov);
+ rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(iov.iov_base);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
@@ -3459,7 +3525,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
if (encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov);
+ rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(iov.iov_base);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
@@ -3505,34 +3571,33 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
int resp_buf_type;
unsigned int count;
int flags = CIFS_NO_RESP;
+ unsigned int total_len;
cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock);
- rc = small_smb2_init(SMB2_LOCK, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_LOCK, tcon, (void **) &req, &total_len);
if (rc)
return rc;
if (encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->hdr.sync_hdr.ProcessId = cpu_to_le32(pid);
+ req->sync_hdr.ProcessId = cpu_to_le32(pid);
req->LockCount = cpu_to_le16(num_lock);
req->PersistentFileId = persist_fid;
req->VolatileFileId = volatile_fid;
count = num_lock * sizeof(struct smb2_lock_element);
- inc_rfc1001_len(req, count - sizeof(struct smb2_lock_element));
iov[0].iov_base = (char *)req;
- /* 4 for rfc1002 length field and count for all locks */
- iov[0].iov_len = get_rfc1002_length(req) + 4 - count;
+ iov[0].iov_len = total_len - sizeof(struct smb2_lock_element);
iov[1].iov_base = (char *)buf;
iov[1].iov_len = count;
cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
- rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, flags,
- &rsp_iov);
+ rc = smb2_send_recv(xid, tcon->ses, iov, 2, &resp_buf_type, flags,
+ &rsp_iov);
cifs_small_buf_release(req);
if (rc) {
cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc);
@@ -3565,24 +3630,35 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
{
int rc;
struct smb2_lease_ack *req = NULL;
+ struct cifs_ses *ses = tcon->ses;
int flags = CIFS_OBREAK_OP;
+ unsigned int total_len;
+ struct kvec iov[1];
+ struct kvec rsp_iov;
+ int resp_buf_type;
cifs_dbg(FYI, "SMB2_lease_break\n");
- rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
+ rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req,
+ &total_len);
if (rc)
return rc;
if (encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1);
+ req->sync_hdr.CreditRequest = cpu_to_le16(1);
req->StructureSize = cpu_to_le16(36);
- inc_rfc1001_len(req, 12);
+ total_len += 12;
memcpy(req->LeaseKey, lease_key, 16);
req->LeaseState = lease_state;
- rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, flags);
+ flags |= CIFS_NO_RESP;
+
+ iov[0].iov_base = (char *)req;
+ iov[0].iov_len = total_len;
+
+ rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
if (rc) {
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index c2ec934be968..6eb9f9691ed4 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -195,7 +195,7 @@ struct smb2_symlink_err_rsp {
#define SMB2_CLIENT_GUID_SIZE 16
struct smb2_negotiate_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 36 */
__le16 DialectCount;
__le16 SecurityMode;
@@ -282,7 +282,7 @@ struct smb2_negotiate_rsp {
#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04
struct smb2_sess_setup_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 25 */
__u8 Flags;
__u8 SecurityMode;
@@ -308,7 +308,7 @@ struct smb2_sess_setup_rsp {
} __packed;
struct smb2_logoff_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 4 */
__le16 Reserved;
} __packed;
@@ -323,7 +323,7 @@ struct smb2_logoff_rsp {
#define SMB2_SHAREFLAG_CLUSTER_RECONNECT 0x0001
struct smb2_tree_connect_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 9 */
__le16 Reserved; /* Flags in SMB3.1.1 */
__le16 PathOffset;
@@ -375,7 +375,7 @@ struct smb2_tree_connect_rsp {
#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
struct smb2_tree_disconnect_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 4 */
__le16 Reserved;
} __packed;
@@ -496,7 +496,7 @@ struct smb2_tree_disconnect_rsp {
#define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9
struct smb2_create_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 57 */
__u8 SecurityFlags;
__u8 RequestedOplockLevel;
@@ -753,7 +753,7 @@ struct duplicate_extents_to_file {
} __packed;
struct smb2_ioctl_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 57 */
__u16 Reserved;
__le32 CtlCode;
@@ -789,7 +789,7 @@ struct smb2_ioctl_rsp {
/* Currently defined values for close flags */
#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
struct smb2_close_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 24 */
__le16 Flags;
__le32 Reserved;
@@ -812,7 +812,7 @@ struct smb2_close_rsp {
} __packed;
struct smb2_flush_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 24 */
__le16 Reserved1;
__le32 Reserved2;
@@ -830,9 +830,9 @@ struct smb2_flush_rsp {
#define SMB2_READFLAG_READ_UNBUFFERED 0x01
/* Channel field for read and write: exactly one of following flags can be set*/
-#define SMB2_CHANNEL_NONE 0x00000000
-#define SMB2_CHANNEL_RDMA_V1 0x00000001 /* SMB3 or later */
-#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000002 /* SMB3.02 or later */
+#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000)
+#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) /* SMB3 or later */
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */
/* SMB2 read request without RFC1001 length at the beginning */
struct smb2_read_plain_req {
@@ -847,8 +847,8 @@ struct smb2_read_plain_req {
__le32 MinimumCount;
__le32 Channel; /* MBZ except for SMB3 or later */
__le32 RemainingBytes;
- __le16 ReadChannelInfoOffset; /* Reserved MBZ */
- __le16 ReadChannelInfoLength; /* Reserved MBZ */
+ __le16 ReadChannelInfoOffset;
+ __le16 ReadChannelInfoLength;
__u8 Buffer[1];
} __packed;
@@ -868,7 +868,7 @@ struct smb2_read_rsp {
#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */
struct smb2_write_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 49 */
__le16 DataOffset; /* offset from start of SMB2 header to write data */
__le32 Length;
@@ -877,8 +877,8 @@ struct smb2_write_req {
__u64 VolatileFileId; /* opaque endianness */
__le32 Channel; /* Reserved MBZ */
__le32 RemainingBytes;
- __le16 WriteChannelInfoOffset; /* Reserved MBZ */
- __le16 WriteChannelInfoLength; /* Reserved MBZ */
+ __le16 WriteChannelInfoOffset;
+ __le16 WriteChannelInfoLength;
__le32 Flags;
__u8 Buffer[1];
} __packed;
@@ -907,7 +907,7 @@ struct smb2_lock_element {
} __packed;
struct smb2_lock_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 48 */
__le16 LockCount;
__le32 Reserved;
@@ -924,7 +924,7 @@ struct smb2_lock_rsp {
} __packed;
struct smb2_echo_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 4 */
__u16 Reserved;
} __packed;
@@ -942,7 +942,7 @@ struct smb2_echo_rsp {
#define SMB2_REOPEN 0x10
struct smb2_query_directory_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 33 */
__u8 FileInformationClass;
__u8 Flags;
@@ -989,7 +989,7 @@ struct smb2_query_directory_rsp {
#define SL_INDEX_SPECIFIED 0x00000004
struct smb2_query_info_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 41 */
__u8 InfoType;
__u8 FileInfoClass;
@@ -1013,7 +1013,7 @@ struct smb2_query_info_rsp {
} __packed;
struct smb2_set_info_req {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 33 */
__u8 InfoType;
__u8 FileInfoClass;
@@ -1031,7 +1031,19 @@ struct smb2_set_info_rsp {
__le16 StructureSize; /* Must be 2 */
} __packed;
-struct smb2_oplock_break {
+/* oplock break without an rfc1002 header */
+struct smb2_oplock_break_req {
+ struct smb2_sync_hdr sync_hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __u8 OplockLevel;
+ __u8 Reserved;
+ __le32 Reserved2;
+ __u64 PersistentFid;
+ __u64 VolatileFid;
+} __packed;
+
+/* oplock break with an rfc1002 header */
+struct smb2_oplock_break_rsp {
struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 24 */
__u8 OplockLevel;
@@ -1057,7 +1069,7 @@ struct smb2_lease_break {
} __packed;
struct smb2_lease_ack {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 36 */
__le16 Reserved;
__le32 Flags;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index e9ab5227e7a8..05287b01f596 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -125,8 +125,7 @@ extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms,
struct smb2_err_rsp **err_buf);
extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
- bool is_fsctl, bool use_ipc,
- char *in_data, u32 indatalen,
+ bool is_fsctl, char *in_data, u32 indatalen,
char **out_data, u32 *plen /* returned data len */);
extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
new file mode 100644
index 000000000000..5130492847eb
--- /dev/null
+++ b/fs/cifs/smbdirect.c
@@ -0,0 +1,2610 @@
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ *
+ * Author(s): Long Li <longli@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include "smbdirect.h"
+#include "cifs_debug.h"
+
+static struct smbd_response *get_empty_queue_buffer(
+ struct smbd_connection *info);
+static struct smbd_response *get_receive_buffer(
+ struct smbd_connection *info);
+static void put_receive_buffer(
+ struct smbd_connection *info,
+ struct smbd_response *response);
+static int allocate_receive_buffers(struct smbd_connection *info, int num_buf);
+static void destroy_receive_buffers(struct smbd_connection *info);
+
+static void put_empty_packet(
+ struct smbd_connection *info, struct smbd_response *response);
+static void enqueue_reassembly(
+ struct smbd_connection *info,
+ struct smbd_response *response, int data_length);
+static struct smbd_response *_get_first_reassembly(
+ struct smbd_connection *info);
+
+static int smbd_post_recv(
+ struct smbd_connection *info,
+ struct smbd_response *response);
+
+static int smbd_post_send_empty(struct smbd_connection *info);
+static int smbd_post_send_data(
+ struct smbd_connection *info,
+ struct kvec *iov, int n_vec, int remaining_data_length);
+static int smbd_post_send_page(struct smbd_connection *info,
+ struct page *page, unsigned long offset,
+ size_t size, int remaining_data_length);
+
+static void destroy_mr_list(struct smbd_connection *info);
+static int allocate_mr_list(struct smbd_connection *info);
+
+/* SMBD version number */
+#define SMBD_V1 0x0100
+
+/* Port numbers for SMBD transport */
+#define SMB_PORT 445
+#define SMBD_PORT 5445
+
+/* Address lookup and resolve timeout in ms */
+#define RDMA_RESOLVE_TIMEOUT 5000
+
+/* SMBD negotiation timeout in seconds */
+#define SMBD_NEGOTIATE_TIMEOUT 120
+
+/* SMBD minimum receive size and fragmented size as defined in [MS-SMBD] */
+#define SMBD_MIN_RECEIVE_SIZE 128
+#define SMBD_MIN_FRAGMENTED_SIZE 131072
+
+/*
+ * Default maximum number of RDMA read/write outstanding on this connection
+ * This value is possibly decreased during QP creation on hardware limit
+ */
+#define SMBD_CM_RESPONDER_RESOURCES 32
+
+/* Maximum number of retries on data transfer operations */
+#define SMBD_CM_RETRY 6
+/* No need to retry on Receiver Not Ready since SMBD manages credits */
+#define SMBD_CM_RNR_RETRY 0
+
+/*
+ * User configurable initial values per SMBD transport connection
+ * as defined in [MS-SMBD] 3.1.1.1
+ * Those may change after a SMBD negotiation
+ */
+/* The local peer's maximum number of credits to grant to the peer */
+int smbd_receive_credit_max = 255;
+
+/* The remote peer's credit request of local peer */
+int smbd_send_credit_target = 255;
+
+/* The maximum single message size can be sent to remote peer */
+int smbd_max_send_size = 1364;
+
+/* The maximum fragmented upper-layer payload receive size supported */
+int smbd_max_fragmented_recv_size = 1024 * 1024;
+
+/* The maximum single-message size which can be received */
+int smbd_max_receive_size = 8192;
+
+/* The timeout to initiate send of a keepalive message on idle */
+int smbd_keep_alive_interval = 120;
+
+/*
+ * User configurable initial values for RDMA transport
+ * The actual values used may be lower and are limited to hardware capabilities
+ */
+/* Default maximum number of SGEs in a RDMA write/read */
+int smbd_max_frmr_depth = 2048;
+
+/* If payload is less than this many bytes, use RDMA send/recv not read/write */
+int rdma_readwrite_threshold = 4096;
+
+/* Transport logging functions
+ * Log messages are grouped into classes. They can be OR'ed to define the actual
+ * logging level via module parameter smbd_logging_class
+ * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
+ * log_rdma_event()
+ */
+#define LOG_OUTGOING 0x1
+#define LOG_INCOMING 0x2
+#define LOG_READ 0x4
+#define LOG_WRITE 0x8
+#define LOG_RDMA_SEND 0x10
+#define LOG_RDMA_RECV 0x20
+#define LOG_KEEP_ALIVE 0x40
+#define LOG_RDMA_EVENT 0x80
+#define LOG_RDMA_MR 0x100
+static unsigned int smbd_logging_class;
+module_param(smbd_logging_class, uint, 0644);
+MODULE_PARM_DESC(smbd_logging_class,
+ "Logging class for SMBD transport 0x0 to 0x100");
+
+#define ERR 0x0
+#define INFO 0x1
+static unsigned int smbd_logging_level = ERR;
+module_param(smbd_logging_level, uint, 0644);
+MODULE_PARM_DESC(smbd_logging_level,
+ "Logging level for SMBD transport, 0 (default): error, 1: info");
+
+#define log_rdma(level, class, fmt, args...) \
+do { \
+ if (level <= smbd_logging_level || class & smbd_logging_class) \
+ cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
+} while (0)
+
+#define log_outgoing(level, fmt, args...) \
+ log_rdma(level, LOG_OUTGOING, fmt, ##args)
+#define log_incoming(level, fmt, args...) \
+ log_rdma(level, LOG_INCOMING, fmt, ##args)
+#define log_read(level, fmt, args...) log_rdma(level, LOG_READ, fmt, ##args)
+#define log_write(level, fmt, args...) log_rdma(level, LOG_WRITE, fmt, ##args)
+#define log_rdma_send(level, fmt, args...) \
+ log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
+#define log_rdma_recv(level, fmt, args...) \
+ log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
+#define log_keep_alive(level, fmt, args...) \
+ log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
+#define log_rdma_event(level, fmt, args...) \
+ log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
+#define log_rdma_mr(level, fmt, args...) \
+ log_rdma(level, LOG_RDMA_MR, fmt, ##args)
+
+/*
+ * Destroy the transport and related RDMA and memory resources
+ * Need to go through all the pending counters and make sure no one is using
+ * the transport while it is destroyed
+ */
+static void smbd_destroy_rdma_work(struct work_struct *work)
+{
+ struct smbd_response *response;
+ struct smbd_connection *info =
+ container_of(work, struct smbd_connection, destroy_work);
+ unsigned long flags;
+
+ log_rdma_event(INFO, "destroying qp\n");
+ ib_drain_qp(info->id->qp);
+ rdma_destroy_qp(info->id);
+
+ /* Unblock all I/O waiting on the send queue */
+ wake_up_interruptible_all(&info->wait_send_queue);
+
+ log_rdma_event(INFO, "cancelling idle timer\n");
+ cancel_delayed_work_sync(&info->idle_timer_work);
+ log_rdma_event(INFO, "cancelling send immediate work\n");
+ cancel_delayed_work_sync(&info->send_immediate_work);
+
+ log_rdma_event(INFO, "wait for all send to finish\n");
+ wait_event(info->wait_smbd_send_pending,
+ info->smbd_send_pending == 0);
+
+ log_rdma_event(INFO, "wait for all recv to finish\n");
+ wake_up_interruptible(&info->wait_reassembly_queue);
+ wait_event(info->wait_smbd_recv_pending,
+ info->smbd_recv_pending == 0);
+
+ log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
+ wait_event(info->wait_send_pending,
+ atomic_read(&info->send_pending) == 0);
+ wait_event(info->wait_send_payload_pending,
+ atomic_read(&info->send_payload_pending) == 0);
+
+ log_rdma_event(INFO, "freeing mr list\n");
+ wake_up_interruptible_all(&info->wait_mr);
+ wait_event(info->wait_for_mr_cleanup,
+ atomic_read(&info->mr_used_count) == 0);
+ destroy_mr_list(info);
+
+ /* It's not possible for upper layer to get to reassembly */
+ log_rdma_event(INFO, "drain the reassembly queue\n");
+ do {
+ spin_lock_irqsave(&info->reassembly_queue_lock, flags);
+ response = _get_first_reassembly(info);
+ if (response) {
+ list_del(&response->list);
+ spin_unlock_irqrestore(
+ &info->reassembly_queue_lock, flags);
+ put_receive_buffer(info, response);
+ }
+ } while (response);
+ spin_unlock_irqrestore(&info->reassembly_queue_lock, flags);
+ info->reassembly_data_length = 0;
+
+ log_rdma_event(INFO, "free receive buffers\n");
+ wait_event(info->wait_receive_queues,
+ info->count_receive_queue + info->count_empty_packet_queue
+ == info->receive_credit_max);
+ destroy_receive_buffers(info);
+
+ ib_free_cq(info->send_cq);
+ ib_free_cq(info->recv_cq);
+ ib_dealloc_pd(info->pd);
+ rdma_destroy_id(info->id);
+
+ /* free mempools */
+ mempool_destroy(info->request_mempool);
+ kmem_cache_destroy(info->request_cache);
+
+ mempool_destroy(info->response_mempool);
+ kmem_cache_destroy(info->response_cache);
+
+ info->transport_status = SMBD_DESTROYED;
+ wake_up_all(&info->wait_destroy);
+}
+
+static int smbd_process_disconnected(struct smbd_connection *info)
+{
+ schedule_work(&info->destroy_work);
+ return 0;
+}
+
+static void smbd_disconnect_rdma_work(struct work_struct *work)
+{
+ struct smbd_connection *info =
+ container_of(work, struct smbd_connection, disconnect_work);
+
+ if (info->transport_status == SMBD_CONNECTED) {
+ info->transport_status = SMBD_DISCONNECTING;
+ rdma_disconnect(info->id);
+ }
+}
+
+static void smbd_disconnect_rdma_connection(struct smbd_connection *info)
+{
+ queue_work(info->workqueue, &info->disconnect_work);
+}
+
+/* Upcall from RDMA CM */
+static int smbd_conn_upcall(
+ struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+ struct smbd_connection *info = id->context;
+
+ log_rdma_event(INFO, "event=%d status=%d\n",
+ event->event, event->status);
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ info->ri_rc = 0;
+ complete(&info->ri_done);
+ break;
+
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ info->ri_rc = -EHOSTUNREACH;
+ complete(&info->ri_done);
+ break;
+
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ info->ri_rc = -ENETUNREACH;
+ complete(&info->ri_done);
+ break;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ log_rdma_event(INFO, "connected event=%d\n", event->event);
+ info->transport_status = SMBD_CONNECTED;
+ wake_up_interruptible(&info->conn_wait);
+ break;
+
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ case RDMA_CM_EVENT_UNREACHABLE:
+ case RDMA_CM_EVENT_REJECTED:
+ log_rdma_event(INFO, "connecting failed event=%d\n", event->event);
+ info->transport_status = SMBD_DISCONNECTED;
+ wake_up_interruptible(&info->conn_wait);
+ break;
+
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ case RDMA_CM_EVENT_DISCONNECTED:
+ /* This happens when we fail the negotiation */
+ if (info->transport_status == SMBD_NEGOTIATE_FAILED) {
+ info->transport_status = SMBD_DISCONNECTED;
+ wake_up(&info->conn_wait);
+ break;
+ }
+
+ info->transport_status = SMBD_DISCONNECTED;
+ smbd_process_disconnected(info);
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+/* Upcall from RDMA QP: log the async event, disconnect on CQ/QP fatal errors */
+static void
+smbd_qp_async_error_upcall(struct ib_event *event, void *context)
+{
+ struct smbd_connection *info = context;
+
+ log_rdma_event(ERR, "%s on device %s info %p\n",
+ ib_event_msg(event->event), event->device->name, info);
+
+ switch (event->event) {
+ case IB_EVENT_CQ_ERR:
+ case IB_EVENT_QP_FATAL:
+ smbd_disconnect_rdma_connection(info);
+ break;
+ default:
+ break;
+ }
+}
+
+static inline void *smbd_request_payload(struct smbd_request *request)
+{
+ return (void *)request->packet;
+}
+
+static inline void *smbd_response_payload(struct smbd_response *response)
+{
+ return (void *)response->packet;
+}
+
+/* Called when a RDMA send is done */
+static void send_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ int i;
+ struct smbd_request *request =
+ container_of(wc->wr_cqe, struct smbd_request, cqe);
+
+ log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n",
+ request, wc->status);
+
+ if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
+ log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n",
+ wc->status, wc->opcode);
+ smbd_disconnect_rdma_connection(request->info);
+ }
+
+ for (i = 0; i < request->num_sge; i++)
+ ib_dma_unmap_single(request->info->id->device,
+ request->sge[i].addr,
+ request->sge[i].length,
+ DMA_TO_DEVICE);
+
+ if (request->has_payload) {
+ if (atomic_dec_and_test(&request->info->send_payload_pending))
+ wake_up(&request->info->wait_send_payload_pending);
+ } else {
+ if (atomic_dec_and_test(&request->info->send_pending))
+ wake_up(&request->info->wait_send_pending);
+ }
+
+ mempool_free(request, request->info->request_mempool);
+}
+
+static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp)
+{
+ log_rdma_event(INFO, "resp message min_version %u max_version %u "
+ "negotiated_version %u credits_requested %u "
+ "credits_granted %u status %u max_readwrite_size %u "
+ "preferred_send_size %u max_receive_size %u "
+ "max_fragmented_size %u\n",
+ resp->min_version, resp->max_version, resp->negotiated_version,
+ resp->credits_requested, resp->credits_granted, resp->status,
+ resp->max_readwrite_size, resp->preferred_send_size,
+ resp->max_receive_size, resp->max_fragmented_size);
+}
+
+/*
+ * Process a negotiation response message, according to [MS-SMBD]3.1.5.7
+ * response, packet_length: the received message and its length in bytes
+ * return value: true if negotiation is a success, false if failed
+ */
+static bool process_negotiation_response(
+ struct smbd_response *response, int packet_length)
+{
+ struct smbd_connection *info = response->info;
+ struct smbd_negotiate_resp *packet = smbd_response_payload(response);
+
+ if (packet_length < sizeof(struct smbd_negotiate_resp)) {
+ log_rdma_event(ERR,
+ "error: packet_length=%d\n", packet_length);
+ return false;
+ }
+
+ if (le16_to_cpu(packet->negotiated_version) != SMBD_V1) {
+ log_rdma_event(ERR, "error: negotiated_version=%x\n",
+ le16_to_cpu(packet->negotiated_version));
+ return false;
+ }
+ info->protocol = le16_to_cpu(packet->negotiated_version);
+
+ if (packet->credits_requested == 0) {
+ log_rdma_event(ERR, "error: credits_requested==0\n");
+ return false;
+ }
+ info->receive_credit_target = le16_to_cpu(packet->credits_requested);
+
+ if (packet->credits_granted == 0) {
+ log_rdma_event(ERR, "error: credits_granted==0\n");
+ return false;
+ }
+ atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted));
+
+ atomic_set(&info->receive_credits, 0);
+
+ if (le32_to_cpu(packet->preferred_send_size) > info->max_receive_size) {
+ log_rdma_event(ERR, "error: preferred_send_size=%d\n",
+ le32_to_cpu(packet->preferred_send_size));
+ return false;
+ }
+ info->max_receive_size = le32_to_cpu(packet->preferred_send_size);
+
+ if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
+ log_rdma_event(ERR, "error: max_receive_size=%d\n",
+ le32_to_cpu(packet->max_receive_size));
+ return false;
+ }
+ /* Compare as u32: min_t(int) turns a peer value > INT_MAX negative */
+ info->max_send_size = min_t(u32, info->max_send_size,
+ le32_to_cpu(packet->max_receive_size));
+
+ if (le32_to_cpu(packet->max_fragmented_size) <
+ SMBD_MIN_FRAGMENTED_SIZE) {
+ log_rdma_event(ERR, "error: max_fragmented_size=%d\n",
+ le32_to_cpu(packet->max_fragmented_size));
+ return false;
+ }
+ info->max_fragmented_send_size =
+ le32_to_cpu(packet->max_fragmented_size);
+ info->rdma_readwrite_threshold =
+ rdma_readwrite_threshold > info->max_fragmented_send_size ?
+ info->max_fragmented_send_size :
+ rdma_readwrite_threshold;
+
+ info->max_readwrite_size = min_t(u32,
+ le32_to_cpu(packet->max_readwrite_size),
+ info->max_frmr_depth * PAGE_SIZE);
+ info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE;
+
+ return true;
+}
+
+/*
+ * Check and schedule to send an immediate packet
+ * This is used to extend credits to remote peer to keep the transport busy
+ */
+static void check_and_send_immediate(struct smbd_connection *info)
+{
+ if (info->transport_status != SMBD_CONNECTED)
+ return;
+
+ info->send_immediate = true;
+
+ /*
+ * Promptly send a packet if our peer is running low on receive
+ * credits
+ */
+ if (atomic_read(&info->receive_credits) <
+ info->receive_credit_target - 1)
+ queue_delayed_work(
+ info->workqueue, &info->send_immediate_work, 0);
+}
+
+static void smbd_post_send_credits(struct work_struct *work)
+{
+ int ret = 0;
+ int use_receive_queue = 1;
+ int rc;
+ struct smbd_response *response;
+ struct smbd_connection *info =
+ container_of(work, struct smbd_connection,
+ post_send_credits_work);
+
+ if (info->transport_status != SMBD_CONNECTED) {
+ wake_up(&info->wait_receive_queues);
+ return;
+ }
+
+ if (info->receive_credit_target >
+ atomic_read(&info->receive_credits)) {
+ while (true) {
+ if (use_receive_queue)
+ response = get_receive_buffer(info);
+ else
+ response = get_empty_queue_buffer(info);
+ if (!response) {
+ /* now switch to empty packet queue */
+ if (use_receive_queue) {
+ use_receive_queue = 0;
+ continue;
+ } else
+ break;
+ }
+
+ response->type = SMBD_TRANSFER_DATA;
+ response->first_segment = false;
+ rc = smbd_post_recv(info, response);
+ if (rc) {
+ log_rdma_recv(ERR,
+ "post_recv failed rc=%d\n", rc);
+ put_receive_buffer(info, response);
+ break;
+ }
+
+ ret++;
+ }
+ }
+
+ spin_lock(&info->lock_new_credits_offered);
+ info->new_credits_offered += ret;
+ spin_unlock(&info->lock_new_credits_offered);
+
+ atomic_add(ret, &info->receive_credits);
+
+ /* Check if we can post new receive and grant credits to peer */
+ check_and_send_immediate(info);
+}
+
+static void smbd_recv_done_work(struct work_struct *work)
+{
+ struct smbd_connection *info =
+ container_of(work, struct smbd_connection, recv_done_work);
+
+ /*
+ * We may have new send credits granted from remote peer
+ * If any sender is blocked on lack of credits, unblock it
+ */
+ if (atomic_read(&info->send_credits))
+ wake_up_interruptible(&info->wait_send_queue);
+
+ /*
+ * Check if we need to send something to remote peer to
+ * grant more credits or respond to KEEP_ALIVE packet
+ */
+ check_and_send_immediate(info);
+}
+
+/* Called from softirq, when recv is done */
+static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct smbd_data_transfer *data_transfer;
+ struct smbd_response *response =
+ container_of(wc->wr_cqe, struct smbd_response, cqe);
+ struct smbd_connection *info = response->info;
+ int data_length = 0;
+
+ log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d "
+ "byte_len=%d pkey_index=%x\n",
+ response, response->type, wc->status, wc->opcode,
+ wc->byte_len, wc->pkey_index);
+
+ if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
+ log_rdma_recv(INFO, "wc->status=%d opcode=%d\n",
+ wc->status, wc->opcode);
+ smbd_disconnect_rdma_connection(info);
+ goto error;
+ }
+
+ ib_dma_sync_single_for_cpu(
+ wc->qp->device,
+ response->sge.addr,
+ response->sge.length,
+ DMA_FROM_DEVICE);
+
+ switch (response->type) {
+ /* SMBD negotiation response */
+ case SMBD_NEGOTIATE_RESP:
+ dump_smbd_negotiate_resp(smbd_response_payload(response));
+ info->full_packet_received = true;
+ info->negotiate_done =
+ process_negotiation_response(response, wc->byte_len);
+ complete(&info->negotiate_completion);
+ break;
+
+ /* SMBD data transfer packet */
+ case SMBD_TRANSFER_DATA:
+ data_transfer = smbd_response_payload(response);
+ data_length = le32_to_cpu(data_transfer->data_length);
+
+ /*
+ * If this is a packet with data payload, place the data in
+ * reassembly queue and wake up the reading thread
+ */
+ if (data_length) {
+ if (info->full_packet_received)
+ response->first_segment = true;
+
+ if (le32_to_cpu(data_transfer->remaining_data_length))
+ info->full_packet_received = false;
+ else
+ info->full_packet_received = true;
+
+ enqueue_reassembly(
+ info,
+ response,
+ data_length);
+ } else
+ put_empty_packet(info, response);
+
+ if (data_length)
+ wake_up_interruptible(&info->wait_reassembly_queue);
+
+ atomic_dec(&info->receive_credits);
+ info->receive_credit_target =
+ le16_to_cpu(data_transfer->credits_requested);
+ atomic_add(le16_to_cpu(data_transfer->credits_granted),
+ &info->send_credits);
+
+ log_incoming(INFO, "data flags %d data_offset %d "
+ "data_length %d remaining_data_length %d\n",
+ le16_to_cpu(data_transfer->flags),
+ le32_to_cpu(data_transfer->data_offset),
+ le32_to_cpu(data_transfer->data_length),
+ le32_to_cpu(data_transfer->remaining_data_length));
+
+ /* Send a KEEP_ALIVE response right away if requested */
+ info->keep_alive_requested = KEEP_ALIVE_NONE;
+ if (le16_to_cpu(data_transfer->flags) &
+ SMB_DIRECT_RESPONSE_REQUESTED) {
+ info->keep_alive_requested = KEEP_ALIVE_PENDING;
+ }
+
+ queue_work(info->workqueue, &info->recv_done_work);
+ return;
+
+ default:
+ log_rdma_recv(ERR,
+ "unexpected response type=%d\n", response->type);
+ }
+
+error:
+ put_receive_buffer(info, response);
+}
+
+static struct rdma_cm_id *smbd_create_id(
+ struct smbd_connection *info,
+ struct sockaddr *dstaddr, int port)
+{
+ struct rdma_cm_id *id;
+ int rc;
+ __be16 *sport;
+
+ id = rdma_create_id(&init_net, smbd_conn_upcall, info,
+ RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(id)) {
+ rc = PTR_ERR(id);
+ log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc);
+ return id;
+ }
+
+ if (dstaddr->sa_family == AF_INET6)
+ sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
+ else
+ sport = &((struct sockaddr_in *)dstaddr)->sin_port;
+
+ *sport = htons(port);
+
+ init_completion(&info->ri_done);
+ info->ri_rc = -ETIMEDOUT;
+
+ rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
+ RDMA_RESOLVE_TIMEOUT);
+ if (rc) {
+ log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
+ goto out;
+ }
+ wait_for_completion_interruptible_timeout(
+ &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
+ rc = info->ri_rc;
+ if (rc) {
+ log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
+ goto out;
+ }
+
+ info->ri_rc = -ETIMEDOUT;
+ rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
+ if (rc) {
+ log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
+ goto out;
+ }
+ wait_for_completion_interruptible_timeout(
+ &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
+ rc = info->ri_rc;
+ if (rc) {
+ log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
+ goto out;
+ }
+
+ return id;
+
+out:
+ rdma_destroy_id(id);
+ return ERR_PTR(rc);
+}
+
+/*
+ * Test if FRWR (Fast Registration Work Requests) is supported on the device
+ * This implementation requires FRWR on RDMA read/write
+ * return value: true if it is supported
+ */
+static bool frwr_is_supported(struct ib_device_attr *attrs)
+{
+ if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
+ return false;
+ if (attrs->max_fast_reg_page_list_len == 0)
+ return false;
+ return true;
+}
+
+static int smbd_ia_open(
+ struct smbd_connection *info,
+ struct sockaddr *dstaddr, int port)
+{
+ int rc;
+
+ info->id = smbd_create_id(info, dstaddr, port);
+ if (IS_ERR(info->id)) {
+ rc = PTR_ERR(info->id);
+ goto out1;
+ }
+
+ if (!frwr_is_supported(&info->id->device->attrs)) {
+ log_rdma_event(ERR,
+ "Fast Registration Work Requests "
+ "(FRWR) is not supported\n");
+ log_rdma_event(ERR,
+ "Device capability flags = %llx "
+ "max_fast_reg_page_list_len = %u\n",
+ info->id->device->attrs.device_cap_flags,
+ info->id->device->attrs.max_fast_reg_page_list_len);
+ rc = -EPROTONOSUPPORT;
+ goto out2;
+ }
+ info->max_frmr_depth = min_t(int,
+ smbd_max_frmr_depth,
+ info->id->device->attrs.max_fast_reg_page_list_len);
+ info->mr_type = IB_MR_TYPE_MEM_REG;
+ if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+ info->mr_type = IB_MR_TYPE_SG_GAPS;
+
+ info->pd = ib_alloc_pd(info->id->device, 0);
+ if (IS_ERR(info->pd)) {
+ rc = PTR_ERR(info->pd);
+ log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
+ goto out2;
+ }
+
+ return 0;
+
+out2:
+ rdma_destroy_id(info->id);
+ info->id = NULL;
+
+out1:
+ return rc;
+}
+
+/*
+ * Send a negotiation request message to the peer
+ * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3
+ * After negotiation, the transport is connected and ready for
+ * carrying upper layer SMB payload
+ */
+static int smbd_post_send_negotiate_req(struct smbd_connection *info)
+{
+ struct ib_send_wr send_wr, *send_wr_fail;
+ int rc = -ENOMEM;
+ struct smbd_request *request;
+ struct smbd_negotiate_req *packet;
+
+ request = mempool_alloc(info->request_mempool, GFP_KERNEL);
+ if (!request)
+ return rc;
+
+ request->info = info;
+
+ packet = smbd_request_payload(request);
+ packet->min_version = cpu_to_le16(SMBD_V1);
+ packet->max_version = cpu_to_le16(SMBD_V1);
+ packet->reserved = 0;
+ packet->credits_requested = cpu_to_le16(info->send_credit_target);
+ packet->preferred_send_size = cpu_to_le32(info->max_send_size);
+ packet->max_receive_size = cpu_to_le32(info->max_receive_size);
+ packet->max_fragmented_size =
+ cpu_to_le32(info->max_fragmented_recv_size);
+
+ request->num_sge = 1;
+ request->sge[0].addr = ib_dma_map_single(
+ info->id->device, (void *)packet,
+ sizeof(*packet), DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
+ rc = -EIO;
+ goto dma_mapping_failed;
+ }
+
+ request->sge[0].length = sizeof(*packet);
+ request->sge[0].lkey = info->pd->local_dma_lkey;
+
+ ib_dma_sync_single_for_device(
+ info->id->device, request->sge[0].addr,
+ request->sge[0].length, DMA_TO_DEVICE);
+
+ request->cqe.done = send_done;
+
+ send_wr.next = NULL;
+ send_wr.wr_cqe = &request->cqe;
+ send_wr.sg_list = request->sge;
+ send_wr.num_sge = request->num_sge;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = IB_SEND_SIGNALED;
+
+ log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n",
+ request->sge[0].addr,
+ request->sge[0].length, request->sge[0].lkey);
+
+ request->has_payload = false;
+ atomic_inc(&info->send_pending);
+ rc = ib_post_send(info->id->qp, &send_wr, &send_wr_fail);
+ if (!rc)
+ return 0;
+
+ /* if we reach here, post send failed */
+ log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
+ atomic_dec(&info->send_pending);
+ ib_dma_unmap_single(info->id->device, request->sge[0].addr,
+ request->sge[0].length, DMA_TO_DEVICE);
+
+dma_mapping_failed:
+ mempool_free(request, info->request_mempool);
+ return rc;
+}
+
+/*
+ * Extend the credits to remote peer
+ * This implements [MS-SMBD] 3.1.5.9
+ * The idea is that we should extend credits to remote peer as quickly as
+ * it's allowed, to maintain data flow.
+ * The credits accumulated in new_credits_offered are taken and the counter
+ * is reset to zero, both under lock_new_credits_offered.
+ * return value: the new credits being granted.
+ */
+static int manage_credits_prior_sending(struct smbd_connection *info)
+{
+	int granted;
+
+	spin_lock(&info->lock_new_credits_offered);
+	granted = info->new_credits_offered;
+	info->new_credits_offered = 0;
+	spin_unlock(&info->lock_new_credits_offered);
+
+	return granted;
+}
+
+/*
+ * Check if we need to send a KEEP_ALIVE message
+ * When a keep alive is pending, mark it as sent and tell the caller to set
+ * SMB_DIRECT_RESPONSE_REQUESTED in the message flags so the peer sends back
+ * a response.
+ * return value:
+ * 1 if SMB_DIRECT_RESPONSE_REQUESTED needs to be set
+ * 0: otherwise
+ */
+static int manage_keep_alive_before_sending(struct smbd_connection *info)
+{
+	if (info->keep_alive_requested != KEEP_ALIVE_PENDING)
+		return 0;
+
+	info->keep_alive_requested = KEEP_ALIVE_SENT;
+	return 1;
+}
+
+/*
+ * Build and prepare the SMBD packet header
+ * This function waits for an available send credit and builds a SMBD packet
+ * header. The caller can then optionally append payload to the packet after
+ * the header
+ * input values
+ * size: the size of the payload
+ * remaining_data_length: remaining data to send if this is part of a
+ * fragmented packet
+ * output values
+ * request_out: the request allocated from this function, written only on
+ * success
+ * return values: 0 on success, otherwise actual error code returned:
+ * the error from the interrupted wait, -ENOENT if the transport is not
+ * connected, -ENOMEM if no request could be allocated, -EIO if the header
+ * could not be DMA mapped. The send credit taken here is given back on the
+ * -ENOMEM and -EIO paths.
+ */
+static int smbd_create_header(struct smbd_connection *info,
+		int size, int remaining_data_length,
+		struct smbd_request **request_out)
+{
+	struct smbd_request *request;
+	struct smbd_data_transfer *packet;
+	int header_length;
+	int rc;
+
+	/* Wait for send credits. A SMBD packet needs one credit */
+	rc = wait_event_interruptible(info->wait_send_queue,
+		atomic_read(&info->send_credits) > 0 ||
+		info->transport_status != SMBD_CONNECTED);
+	if (rc)
+		return rc;
+
+	if (info->transport_status != SMBD_CONNECTED) {
+		log_outgoing(ERR, "disconnected not sending\n");
+		return -ENOENT;
+	}
+	atomic_dec(&info->send_credits);
+
+	request = mempool_alloc(info->request_mempool, GFP_KERNEL);
+	if (!request) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	request->info = info;
+
+	/* Fill in the packet header */
+	packet = smbd_request_payload(request);
+	packet->credits_requested = cpu_to_le16(info->send_credit_target);
+	packet->credits_granted =
+		cpu_to_le16(manage_credits_prior_sending(info));
+	info->send_immediate = false;
+
+	packet->flags = 0;
+	if (manage_keep_alive_before_sending(info))
+		packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);
+
+	packet->reserved = 0;
+	if (!size)
+		packet->data_offset = 0;
+	else
+		packet->data_offset = cpu_to_le32(24);
+	packet->data_length = cpu_to_le32(size);
+	packet->remaining_data_length = cpu_to_le32(remaining_data_length);
+	packet->padding = 0;
+
+	log_outgoing(INFO, "credits_requested=%d credits_granted=%d "
+		"data_offset=%d data_length=%d remaining_data_length=%d\n",
+		le16_to_cpu(packet->credits_requested),
+		le16_to_cpu(packet->credits_granted),
+		le32_to_cpu(packet->data_offset),
+		le32_to_cpu(packet->data_length),
+		le32_to_cpu(packet->remaining_data_length));
+
+	/* Map the packet to DMA */
+	header_length = sizeof(struct smbd_data_transfer);
+	/* If this is a packet without payload, don't send padding */
+	if (!size)
+		header_length = offsetof(struct smbd_data_transfer, padding);
+
+	/*
+	 * The header is only ever read by the device. Map it DMA_TO_DEVICE so
+	 * the direction agrees with the sync in smbd_post_send and the unmap
+	 * in smbd_destroy_header; the DMA API requires map and unmap to use
+	 * the same direction.
+	 */
+	request->num_sge = 1;
+	request->sge[0].addr = ib_dma_map_single(info->id->device,
+						 (void *)packet,
+						 header_length,
+						 DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
+		mempool_free(request, info->request_mempool);
+		rc = -EIO;
+		goto err;
+	}
+
+	request->sge[0].length = header_length;
+	request->sge[0].lkey = info->pd->local_dma_lkey;
+
+	*request_out = request;
+	return 0;
+
+err:
+	/* Give back the send credit taken above */
+	atomic_inc(&info->send_credits);
+	return rc;
+}
+
+/*
+ * Release a request built by smbd_create_header
+ * Unmaps the header in sge[0], returns the request to request_mempool and
+ * gives one send credit back. smbd_post_send_sgl calls this on its failure
+ * path.
+ */
+static void smbd_destroy_header(struct smbd_connection *info,
+		struct smbd_request *request)
+{
+	ib_dma_unmap_single(info->id->device, request->sge[0].addr,
+			    request->sge[0].length, DMA_TO_DEVICE);
+
+	mempool_free(request, info->request_mempool);
+
+	atomic_inc(&info->send_credits);
+}
+
+/*
+ * Post the send request
+ * request: the request to post; all entries in request->sge must already be
+ * DMA mapped
+ * has_payload: true if the packet carries upper layer payload. This selects
+ * which pending counter (send_payload_pending or send_pending) accounts for
+ * the request.
+ * return value: 0 on success, otherwise the error from ib_post_send. On
+ * failure the pending counter is rolled back and its waiters are woken if it
+ * drops to zero; unmapping and freeing the request is left to the caller.
+ */
+static int smbd_post_send(struct smbd_connection *info,
+		struct smbd_request *request, bool has_payload)
+{
+	struct ib_send_wr send_wr, *send_wr_fail;
+	int rc, i;
+
+	for (i = 0; i < request->num_sge; i++) {
+		/* Log the entry being synced, not always the header entry */
+		log_rdma_send(INFO,
+			"rdma_request sge[%d] addr=%llu length=%u\n",
+			i, request->sge[i].addr, request->sge[i].length);
+		ib_dma_sync_single_for_device(
+			info->id->device,
+			request->sge[i].addr,
+			request->sge[i].length,
+			DMA_TO_DEVICE);
+	}
+
+	request->cqe.done = send_done;
+
+	send_wr.next = NULL;
+	send_wr.wr_cqe = &request->cqe;
+	send_wr.sg_list = request->sge;
+	send_wr.num_sge = request->num_sge;
+	send_wr.opcode = IB_WR_SEND;
+	send_wr.send_flags = IB_SEND_SIGNALED;
+
+	/* Account for the request before posting it */
+	if (has_payload) {
+		request->has_payload = true;
+		atomic_inc(&info->send_payload_pending);
+	} else {
+		request->has_payload = false;
+		atomic_inc(&info->send_pending);
+	}
+
+	rc = ib_post_send(info->id->qp, &send_wr, &send_wr_fail);
+	if (rc) {
+		log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
+		if (has_payload) {
+			if (atomic_dec_and_test(&info->send_payload_pending))
+				wake_up(&info->wait_send_payload_pending);
+		} else {
+			if (atomic_dec_and_test(&info->send_pending))
+				wake_up(&info->wait_send_pending);
+		}
+	} else
+		/* Reset timer for idle connection after packet is sent */
+		mod_delayed_work(info->workqueue, &info->idle_timer_work,
+			info->keep_alive_interval*HZ);
+
+	return rc;
+}
+
+/*
+ * Build a SMBD packet from a scatterlist and post it
+ * sgl: the payload pages, or NULL for a packet without payload
+ * data_length: the size of the payload described by sgl
+ * remaining_data_length: remaining data to send after this packet
+ * The header goes to sge[0]; each scatterlist entry is mapped to the next
+ * sge in order.
+ * return value: 0 on success. On failure every payload page mapped here is
+ * unmapped, the header is destroyed and the error code is returned (-EIO if
+ * a page could not be DMA mapped).
+ */
+static int smbd_post_send_sgl(struct smbd_connection *info,
+	struct scatterlist *sgl, int data_length, int remaining_data_length)
+{
+	int num_sgs;
+	int i, rc;
+	struct smbd_request *request;
+	struct scatterlist *sg;
+
+	rc = smbd_create_header(
+		info, data_length, remaining_data_length, &request);
+	if (rc)
+		return rc;
+
+	num_sgs = sgl ? sg_nents(sgl) : 0;
+	for_each_sg(sgl, sg, num_sgs, i) {
+		/*
+		 * The payload is only read by the device, and it is synced
+		 * and unmapped as DMA_TO_DEVICE, so map it the same way.
+		 */
+		request->sge[i+1].addr =
+			ib_dma_map_page(info->id->device, sg_page(sg),
+			       sg->offset, sg->length, DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(
+				info->id->device, request->sge[i+1].addr)) {
+			rc = -EIO;
+			request->sge[i+1].addr = 0;
+			goto dma_mapping_failure;
+		}
+		request->sge[i+1].length = sg->length;
+		request->sge[i+1].lkey = info->pd->local_dma_lkey;
+		request->num_sge++;
+	}
+
+	rc = smbd_post_send(info, request, data_length);
+	if (!rc)
+		return 0;
+
+dma_mapping_failure:
+	/* sge[0] is the header; smbd_destroy_header takes care of it */
+	for (i = 1; i < request->num_sge; i++)
+		if (request->sge[i].addr)
+			ib_dma_unmap_page(info->id->device,
+					  request->sge[i].addr,
+					  request->sge[i].length,
+					  DMA_TO_DEVICE);
+	smbd_destroy_header(info, request);
+	return rc;
+}
+
+/*
+ * Send a page
+ * page: the page to send
+ * offset: offset in the page to send
+ * size: length in the page to send
+ * remaining_data_length: remaining data to send in this payload
+ * return value: whatever smbd_post_send_sgl returns for the one-entry
+ * scatterlist built here
+ */
+static int smbd_post_send_page(struct smbd_connection *info, struct page *page,
+		unsigned long offset, size_t size, int remaining_data_length)
+{
+	struct scatterlist single;
+	int rc;
+
+	/* Wrap the page in a scatterlist with exactly one entry */
+	sg_init_table(&single, 1);
+	sg_set_page(&single, page, size, offset);
+
+	rc = smbd_post_send_sgl(info, &single, size, remaining_data_length);
+	return rc;
+}
+
+/*
+ * Send an empty message
+ * Empty message is used to extend credits to peer or for keep alive
+ * while there is no upper layer payload to send at the time
+ * count_send_empty is bumped on every call, before the send is attempted
+ * return value: the result of posting a packet with no payload
+ */
+static int smbd_post_send_empty(struct smbd_connection *info)
+{
+	int rc;
+
+	info->count_send_empty++;
+	rc = smbd_post_send_sgl(info, NULL, 0, 0);
+
+	return rc;
+}
+
+/*
+ * Send a data buffer
+ * iov: the iov array describing the data buffers
+ * n_vec: number of iov array
+ * remaining_data_length: remaining data to send following this packet
+ * in segmented SMBD packet
+ * return value: 0 on success, -EINVAL if n_vec is not positive, -ENOMEM if
+ * the buffers don't fit in one packet, otherwise the error from
+ * smbd_post_send_sgl
+ */
+static int smbd_post_send_data(
+	struct smbd_connection *info, struct kvec *iov, int n_vec,
+	int remaining_data_length)
+{
+	int i;
+	u32 data_length = 0;
+	struct scatterlist sgl[SMBDIRECT_MAX_SGE];
+
+	/* sg_init_table() needs at least one entry to place the end marker */
+	if (n_vec <= 0)
+		return -EINVAL;
+
+	/*
+	 * The packet header always occupies sge[0] of the request, so only
+	 * SMBDIRECT_MAX_SGE - 1 entries are left for data. The QP is created
+	 * with max_send_sge = SMBDIRECT_MAX_SGE.
+	 */
+	if (n_vec > SMBDIRECT_MAX_SGE - 1) {
+		cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
+		return -ENOMEM;
+	}
+
+	sg_init_table(sgl, n_vec);
+	for (i = 0; i < n_vec; i++) {
+		data_length += iov[i].iov_len;
+		sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len);
+	}
+
+	return smbd_post_send_sgl(info, sgl, data_length, remaining_data_length);
+}
+
+/*
+ * Post a receive request to the transport
+ * The remote peer can only send data when a receive request is posted
+ * The interaction is controlled by send/receive credit system
+ * response: the receive buffer to map and post
+ * return value: 0 on success, -EIO if the buffer could not be DMA mapped,
+ * otherwise the error from ib_post_recv (the buffer is unmapped again
+ * before returning in that case)
+ */
+static int smbd_post_recv(
+		struct smbd_connection *info, struct smbd_response *response)
+{
+	struct ib_recv_wr recv_wr, *recv_wr_fail = NULL;
+	int rc;
+
+	response->sge.addr = ib_dma_map_single(
+				info->id->device, response->packet,
+				info->max_receive_size, DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(info->id->device, response->sge.addr))
+		return -EIO;
+
+	response->sge.length = info->max_receive_size;
+	response->sge.lkey = info->pd->local_dma_lkey;
+
+	response->cqe.done = recv_done;
+
+	recv_wr.wr_cqe = &response->cqe;
+	recv_wr.next = NULL;
+	recv_wr.sg_list = &response->sge;
+	recv_wr.num_sge = 1;
+
+	rc = ib_post_recv(info->id->qp, &recv_wr, &recv_wr_fail);
+	if (!rc)
+		return 0;
+
+	/* Posting failed: undo the mapping before reporting the error */
+	ib_dma_unmap_single(info->id->device, response->sge.addr,
+			    response->sge.length, DMA_FROM_DEVICE);
+
+	log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
+
+	return rc;
+}
+
+/*
+ * Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2
+ * Posts one receive for the negotiate response, sends the negotiate request
+ * and waits up to SMBD_NEGOTIATE_TIMEOUT seconds for negotiate_completion.
+ * return value: 0 if info->negotiate_done is set after the wait, -ENOMEM if
+ * no receive buffer is available, the error from posting the receive or the
+ * request, -ETIMEDOUT on timeout, -EINTR if interrupted, -ENOTCONN otherwise
+ */
+static int smbd_negotiate(struct smbd_connection *info)
+{
+	int rc;
+	unsigned long flags;
+	struct smbd_response *response = get_receive_buffer(info);
+
+	/* get_receive_buffer returns NULL when the receive queue is empty */
+	if (!response) {
+		log_rdma_event(ERR, "no receive buffer for negotiate\n");
+		return -ENOMEM;
+	}
+
+	response->type = SMBD_NEGOTIATE_RESP;
+	rc = smbd_post_recv(info, response);
+	log_rdma_event(INFO,
+		"smbd_post_recv rc=%d iov.addr=%llx iov.length=%x "
+		"iov.lkey=%x\n",
+		rc, response->sge.addr,
+		response->sge.length, response->sge.lkey);
+	if (rc) {
+		/*
+		 * The buffer was never handed to the hardware and
+		 * smbd_post_recv left it unmapped. Put it back on the receive
+		 * queue so destroy_receive_buffers can free it; don't use
+		 * put_receive_buffer here as that would unmap it again.
+		 */
+		spin_lock_irqsave(&info->receive_queue_lock, flags);
+		list_add_tail(&response->list, &info->receive_queue);
+		info->count_receive_queue++;
+		info->count_put_receive_buffer++;
+		spin_unlock_irqrestore(&info->receive_queue_lock, flags);
+		return rc;
+	}
+
+	init_completion(&info->negotiate_completion);
+	info->negotiate_done = false;
+	rc = smbd_post_send_negotiate_req(info);
+	if (rc)
+		return rc;
+
+	rc = wait_for_completion_interruptible_timeout(
+		&info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ);
+	log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc);
+
+	if (info->negotiate_done)
+		return 0;
+
+	if (rc == 0)
+		rc = -ETIMEDOUT;
+	else if (rc == -ERESTARTSYS)
+		rc = -EINTR;
+	else
+		rc = -ENOTCONN;
+
+	return rc;
+}
+
+/*
+ * Queue a response on the empty packet queue
+ * The response is appended to empty_packet_queue under its lock, and
+ * post_send_credits_work is then scheduled on the connection workqueue
+ */
+static void put_empty_packet(
+		struct smbd_connection *info, struct smbd_response *response)
+{
+	spinlock_t *lock = &info->empty_packet_queue_lock;
+
+	spin_lock(lock);
+	list_add_tail(&response->list, &info->empty_packet_queue);
+	info->count_empty_packet_queue++;
+	spin_unlock(lock);
+
+	queue_work(info->workqueue, &info->post_send_credits_work);
+}
+
+/*
+ * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
+ * This is a queue for reassembling upper layer payload and present to upper
+ * layer. All the incoming payload go to the reassembly queue, regardless of
+ * if reassembly is required. The upper layer code reads from the queue for all
+ * incoming payloads.
+ * Put a received packet to the reassembly queue
+ * response: the packet received
+ * data_length: the size of payload in this packet
+ * The list, reassembly_queue_length and reassembly_data_length are updated
+ * under reassembly_queue_lock; the two count_* statistics are bumped after
+ * the lock is dropped.
+ */
+static void enqueue_reassembly(
+ struct smbd_connection *info,
+ struct smbd_response *response,
+ int data_length)
+{
+ spin_lock(&info->reassembly_queue_lock);
+ list_add_tail(&response->list, &info->reassembly_queue);
+ info->reassembly_queue_length++;
+ /*
+ * Make sure reassembly_data_length is updated after list and
+ * reassembly_queue_length are updated. On the dequeue side
+ * reassembly_data_length is checked without a lock to determine
+ * if reassembly_queue_length and list is up to date
+ * (smbd_recv_buf issues the matching virt_rmb() after that check)
+ */
+ virt_wmb();
+ info->reassembly_data_length += data_length;
+ spin_unlock(&info->reassembly_queue_lock);
+ info->count_reassembly_queue++;
+ info->count_enqueue_reassembly_queue++;
+}
+
+/*
+ * Peek at the entry at the front of the reassembly queue
+ * The entry is not removed from the queue
+ * Caller is responsible for locking
+ * return value: the first entry if any, NULL if queue is empty
+ */
+static struct smbd_response *_get_first_reassembly(struct smbd_connection *info)
+{
+	if (list_empty(&info->reassembly_queue))
+		return NULL;
+
+	return list_first_entry(&info->reassembly_queue,
+				struct smbd_response, list);
+}
+
+/*
+ * Take the first response off the empty packet queue
+ * The queue is accessed under empty_packet_queue_lock with interrupts
+ * disabled
+ * return value: the removed entry, NULL if the queue is empty
+ */
+static struct smbd_response *get_empty_queue_buffer(
+		struct smbd_connection *info)
+{
+	struct smbd_response *entry = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&info->empty_packet_queue_lock, flags);
+	if (list_empty(&info->empty_packet_queue))
+		goto unlock;
+
+	entry = list_first_entry(&info->empty_packet_queue,
+				 struct smbd_response, list);
+	list_del(&entry->list);
+	info->count_empty_packet_queue--;
+
+unlock:
+	spin_unlock_irqrestore(&info->empty_packet_queue_lock, flags);
+
+	return entry;
+}
+
+/*
+ * Get a receive buffer
+ * For each remote send, we need to post a receive. The receive buffers are
+ * pre-allocated in advance.
+ * The buffer is removed from receive_queue under receive_queue_lock with
+ * interrupts disabled; count_receive_queue and count_get_receive_buffer are
+ * updated only when a buffer is actually handed out.
+ * return value: the receive buffer, NULL if none is available
+ */
+static struct smbd_response *get_receive_buffer(struct smbd_connection *info)
+{
+	struct smbd_response *buffer = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&info->receive_queue_lock, flags);
+	if (list_empty(&info->receive_queue))
+		goto unlock;
+
+	buffer = list_first_entry(&info->receive_queue,
+				  struct smbd_response, list);
+	list_del(&buffer->list);
+	info->count_receive_queue--;
+	info->count_get_receive_buffer++;
+
+unlock:
+	spin_unlock_irqrestore(&info->receive_queue_lock, flags);
+
+	return buffer;
+}
+
+/*
+ * Return a receive buffer
+ * The buffer is DMA unmapped and appended to receive_queue. Upon returning
+ * of a receive buffer, we can post new receive and extend more receive
+ * credits to remote peer; post_send_credits_work is queued for that right
+ * after the buffer is back on the queue.
+ */
+static void put_receive_buffer(
+		struct smbd_connection *info, struct smbd_response *response)
+{
+	unsigned long irq_flags;
+
+	/* The buffer is expected to be mapped when it gets here */
+	ib_dma_unmap_single(info->id->device,
+			    response->sge.addr, response->sge.length,
+			    DMA_FROM_DEVICE);
+
+	spin_lock_irqsave(&info->receive_queue_lock, irq_flags);
+	list_add_tail(&response->list, &info->receive_queue);
+	info->count_receive_queue++;
+	info->count_put_receive_buffer++;
+	spin_unlock_irqrestore(&info->receive_queue_lock, irq_flags);
+
+	queue_work(info->workqueue, &info->post_send_credits_work);
+}
+
+/*
+ * Preallocate all receive buffer on transport establishment
+ * Also initializes the reassembly, receive and empty packet queues with
+ * their locks and counters, and the wait_receive_queues wait queue.
+ * num_buf: number of receive buffers to allocate from response_mempool
+ * return value: 0 on success; -ENOMEM if an allocation fails, in which case
+ * the buffers allocated so far are freed again
+ */
+static int allocate_receive_buffers(struct smbd_connection *info, int num_buf)
+{
+	struct smbd_response *response;
+	int i;
+
+	INIT_LIST_HEAD(&info->reassembly_queue);
+	spin_lock_init(&info->reassembly_queue_lock);
+	info->reassembly_data_length = 0;
+	info->reassembly_queue_length = 0;
+
+	INIT_LIST_HEAD(&info->receive_queue);
+	spin_lock_init(&info->receive_queue_lock);
+	info->count_receive_queue = 0;
+
+	INIT_LIST_HEAD(&info->empty_packet_queue);
+	spin_lock_init(&info->empty_packet_queue_lock);
+	info->count_empty_packet_queue = 0;
+
+	init_waitqueue_head(&info->wait_receive_queues);
+
+	for (i = 0; i < num_buf; i++) {
+		response = mempool_alloc(info->response_mempool, GFP_KERNEL);
+		if (!response)
+			break;
+
+		response->info = info;
+		list_add_tail(&response->list, &info->receive_queue);
+		info->count_receive_queue++;
+	}
+
+	/* The loop ran to completion: every buffer was allocated */
+	if (i >= num_buf)
+		return 0;
+
+	/* An allocation failed: give back what was queued so far */
+	while (!list_empty(&info->receive_queue)) {
+		response = list_first_entry(&info->receive_queue,
+					    struct smbd_response, list);
+		list_del(&response->list);
+		info->count_receive_queue--;
+
+		mempool_free(response, info->response_mempool);
+	}
+	return -ENOMEM;
+}
+
+/*
+ * Free all receive buffers currently sitting on the receive queue and on
+ * the empty packet queue back to response_mempool
+ */
+static void destroy_receive_buffers(struct smbd_connection *info)
+{
+	struct smbd_response *response;
+
+	for (response = get_receive_buffer(info); response;
+	     response = get_receive_buffer(info))
+		mempool_free(response, info->response_mempool);
+
+	for (response = get_empty_queue_buffer(info); response;
+	     response = get_empty_queue_buffer(info))
+		mempool_free(response, info->response_mempool);
+}
+
+/*
+ * Check and send an immediate or keep alive packet
+ * The condition to send those packets are defined in [MS-SMBD] 3.1.1.1
+ * Connection.KeepaliveRequested and Connection.SendImmediate
+ * The idea is to extend credits to server as soon as it becomes available
+ * work: the send_immediate_work delayed work embedded in the connection
+ * The result of posting the empty message is not checked.
+ */
+static void send_immediate_work(struct work_struct *work)
+{
+	struct smbd_connection *info;
+
+	info = container_of(work, struct smbd_connection,
+			    send_immediate_work.work);
+
+	/* Nothing to do unless a keep alive or an immediate send is due */
+	if (info->keep_alive_requested != KEEP_ALIVE_PENDING &&
+	    !info->send_immediate)
+		return;
+
+	log_keep_alive(INFO, "send an empty message\n");
+	smbd_post_send_empty(info);
+}
+
+/*
+ * Implement idle connection timer [MS-SMBD] 3.1.6.2
+ * work: the idle_timer_work delayed work embedded in the connection
+ * If keep_alive_requested is anything other than KEEP_ALIVE_NONE when the
+ * timer fires, the connection is disconnected. Otherwise an empty message
+ * is sent and the timer is armed again for keep_alive_interval seconds.
+ */
+static void idle_connection_timer(struct work_struct *work)
+{
+	struct smbd_connection *info;
+
+	info = container_of(work, struct smbd_connection,
+			    idle_timer_work.work);
+
+	if (info->keep_alive_requested == KEEP_ALIVE_NONE) {
+		log_keep_alive(INFO, "about to send an empty idle message\n");
+		smbd_post_send_empty(info);
+
+		/* Setup the next idle timeout work */
+		queue_delayed_work(info->workqueue, &info->idle_timer_work,
+				   info->keep_alive_interval*HZ);
+		return;
+	}
+
+	log_keep_alive(ERR,
+		"error status info->keep_alive_requested=%d\n",
+		info->keep_alive_requested);
+	smbd_disconnect_rdma_connection(info);
+}
+
+/*
+ * Destroy this SMBD connection, called from upper layer
+ * Starts the disconnect, blocks (uninterruptibly) until transport_status
+ * becomes SMBD_DESTROYED, then destroys the workqueue and frees info.
+ * info must not be used by the caller after this returns.
+ */
+void smbd_destroy(struct smbd_connection *info)
+{
+ log_rdma_event(INFO, "destroying rdma session\n");
+
+ /* Kick off the disconnection process */
+ smbd_disconnect_rdma_connection(info);
+
+ log_rdma_event(INFO, "wait for transport being destroyed\n");
+ wait_event(info->wait_destroy,
+ info->transport_status == SMBD_DESTROYED);
+
+ destroy_workqueue(info->workqueue);
+ kfree(info);
+}
+
+/*
+ * Reconnect this SMBD connection, called from upper layer
+ * The old connection is torn down and freed first, then a new one is
+ * created to server->dstaddr and stored in server->smbd_conn (which is left
+ * NULL if the new connection can't be established).
+ * return value: 0 on success, or actual error code
+ * (-EINVAL if there is no connection to reconnect, -ENOENT if creating the
+ * new connection fails)
+ */
+int smbd_reconnect(struct TCP_Server_Info *server)
+{
+ log_rdma_event(INFO, "reconnecting rdma session\n");
+
+ if (!server->smbd_conn) {
+ log_rdma_event(ERR, "rdma session already destroyed\n");
+ return -EINVAL;
+ }
+
+ /*
+ * This is possible if transport is disconnected and we haven't received
+ * notification from RDMA, but upper layer has detected timeout
+ */
+ if (server->smbd_conn->transport_status == SMBD_CONNECTED) {
+ log_rdma_event(INFO, "disconnecting transport\n");
+ smbd_disconnect_rdma_connection(server->smbd_conn);
+ }
+
+ /* wait until the transport is destroyed */
+ wait_event(server->smbd_conn->wait_destroy,
+ server->smbd_conn->transport_status == SMBD_DESTROYED);
+
+ destroy_workqueue(server->smbd_conn->workqueue);
+ kfree(server->smbd_conn);
+
+ log_rdma_event(INFO, "creating rdma session\n");
+ server->smbd_conn = smbd_get_connection(
+ server, (struct sockaddr *) &server->dstaddr);
+
+ return server->smbd_conn ? 0 : -ENOENT;
+}
+
+/*
+ * Release what allocate_caches_and_workqueue set up
+ * The queued receive buffers are freed first, then the workqueue is
+ * destroyed, then each mempool is destroyed before the slab cache backing it
+ */
+static void destroy_caches_and_workqueue(struct smbd_connection *info)
+{
+	destroy_receive_buffers(info);
+	destroy_workqueue(info->workqueue);
+
+	/* response side */
+	mempool_destroy(info->response_mempool);
+	kmem_cache_destroy(info->response_cache);
+
+	/* request side */
+	mempool_destroy(info->request_mempool);
+	kmem_cache_destroy(info->request_cache);
+}
+
+#define MAX_NAME_LEN 80
+/*
+ * Allocate the per-connection slab caches, mempools, workqueue and receive
+ * buffers
+ * Request objects are sized for a smbd_request plus a smbd_data_transfer
+ * header; response objects for a smbd_response plus max_receive_size bytes.
+ * The request mempool reserves send_credit_target objects and the response
+ * mempool receive_credit_max objects.
+ * return value: 0 on success, -ENOMEM on any failure (everything allocated
+ * up to that point is released again)
+ */
+static int allocate_caches_and_workqueue(struct smbd_connection *info)
+{
+	char name[MAX_NAME_LEN];
+	int rc;
+
+	snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
+	info->request_cache = kmem_cache_create(name,
+			sizeof(struct smbd_request) +
+				sizeof(struct smbd_data_transfer),
+			0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!info->request_cache)
+		return -ENOMEM;
+
+	info->request_mempool = mempool_create(info->send_credit_target,
+			mempool_alloc_slab, mempool_free_slab,
+			info->request_cache);
+	if (!info->request_mempool)
+		goto free_request_cache;
+
+	snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
+	info->response_cache = kmem_cache_create(name,
+			sizeof(struct smbd_response) +
+				info->max_receive_size,
+			0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!info->response_cache)
+		goto free_request_mempool;
+
+	info->response_mempool = mempool_create(info->receive_credit_max,
+			mempool_alloc_slab, mempool_free_slab,
+			info->response_cache);
+	if (!info->response_mempool)
+		goto free_response_cache;
+
+	snprintf(name, MAX_NAME_LEN, "smbd_%p", info);
+	info->workqueue = create_workqueue(name);
+	if (!info->workqueue)
+		goto free_response_mempool;
+
+	rc = allocate_receive_buffers(info, info->receive_credit_max);
+	if (rc) {
+		log_rdma_event(ERR, "failed to allocate receive buffers\n");
+		goto free_workqueue;
+	}
+
+	return 0;
+
+	/* Unwind in the reverse order of allocation */
+free_workqueue:
+	destroy_workqueue(info->workqueue);
+free_response_mempool:
+	mempool_destroy(info->response_mempool);
+free_response_cache:
+	kmem_cache_destroy(info->response_cache);
+free_request_mempool:
+	mempool_destroy(info->request_mempool);
+free_request_cache:
+	kmem_cache_destroy(info->request_cache);
+	return -ENOMEM;
+}
+
+/*
+ * Create a SMBD connection, called by upper layer
+ * server: the upper layer server structure (not referenced in this function)
+ * dstaddr: the address of the remote peer
+ * port: the port to connect to
+ * return value: the new connection on success, NULL on any failure
+ *
+ * The steps are, in order: open the RDMA interface, check the configured
+ * credit limits against the device limits, allocate the send and receive
+ * CQs, create the QP, connect, allocate caches/workqueue/receive buffers,
+ * start the idle timer, negotiate and finally allocate the MR list. The
+ * error labels at the bottom unwind these steps in reverse order.
+ */
+static struct smbd_connection *_smbd_get_connection(
+ struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
+{
+ int rc;
+ struct smbd_connection *info;
+ struct rdma_conn_param conn_param;
+ struct ib_qp_init_attr qp_attr;
+ struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
+ struct ib_port_immutable port_immutable;
+ u32 ird_ord_hdr[2];
+
+ info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
+ if (!info)
+ return NULL;
+
+ info->transport_status = SMBD_CONNECTING;
+ rc = smbd_ia_open(info, dstaddr, port);
+ if (rc) {
+ log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
+ goto create_id_failed;
+ }
+
+ if (smbd_send_credit_target > info->id->device->attrs.max_cqe ||
+ smbd_send_credit_target > info->id->device->attrs.max_qp_wr) {
+ log_rdma_event(ERR,
+ "consider lowering send_credit_target = %d. "
+ "Possible CQE overrun, device "
+ "reporting max_cpe %d max_qp_wr %d\n",
+ smbd_send_credit_target,
+ info->id->device->attrs.max_cqe,
+ info->id->device->attrs.max_qp_wr);
+ goto config_failed;
+ }
+
+ if (smbd_receive_credit_max > info->id->device->attrs.max_cqe ||
+ smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) {
+ log_rdma_event(ERR,
+ "consider lowering receive_credit_max = %d. "
+ "Possible CQE overrun, device "
+ "reporting max_cpe %d max_qp_wr %d\n",
+ smbd_receive_credit_max,
+ info->id->device->attrs.max_cqe,
+ info->id->device->attrs.max_qp_wr);
+ goto config_failed;
+ }
+
+ info->receive_credit_max = smbd_receive_credit_max;
+ info->send_credit_target = smbd_send_credit_target;
+ info->max_send_size = smbd_max_send_size;
+ info->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
+ info->max_receive_size = smbd_max_receive_size;
+ info->keep_alive_interval = smbd_keep_alive_interval;
+
+ if (info->id->device->attrs.max_sge < SMBDIRECT_MAX_SGE) {
+ log_rdma_event(ERR, "warning: device max_sge = %d too small\n",
+ info->id->device->attrs.max_sge);
+ log_rdma_event(ERR, "Queue Pair creation may fail\n");
+ }
+
+ info->send_cq = NULL;
+ info->recv_cq = NULL;
+ info->send_cq = ib_alloc_cq(info->id->device, info,
+ info->send_credit_target, 0, IB_POLL_SOFTIRQ);
+ if (IS_ERR(info->send_cq)) {
+ info->send_cq = NULL;
+ goto alloc_cq_failed;
+ }
+
+ info->recv_cq = ib_alloc_cq(info->id->device, info,
+ info->receive_credit_max, 0, IB_POLL_SOFTIRQ);
+ if (IS_ERR(info->recv_cq)) {
+ info->recv_cq = NULL;
+ goto alloc_cq_failed;
+ }
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.event_handler = smbd_qp_async_error_upcall;
+ qp_attr.qp_context = info;
+ qp_attr.cap.max_send_wr = info->send_credit_target;
+ qp_attr.cap.max_recv_wr = info->receive_credit_max;
+ qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE;
+ qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE;
+ qp_attr.cap.max_inline_data = 0;
+ qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ qp_attr.qp_type = IB_QPT_RC;
+ qp_attr.send_cq = info->send_cq;
+ qp_attr.recv_cq = info->recv_cq;
+ qp_attr.port_num = ~0;
+
+ rc = rdma_create_qp(info->id, info->pd, &qp_attr);
+ if (rc) {
+ log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
+ goto create_qp_failed;
+ }
+
+ memset(&conn_param, 0, sizeof(conn_param));
+ conn_param.initiator_depth = 0;
+
+ conn_param.responder_resources = /* the smaller of the device limit and SMBD_CM_RESPONDER_RESOURCES */
+ info->id->device->attrs.max_qp_rd_atom
+ < SMBD_CM_RESPONDER_RESOURCES ?
+ info->id->device->attrs.max_qp_rd_atom :
+ SMBD_CM_RESPONDER_RESOURCES;
+ info->responder_resources = conn_param.responder_resources;
+ log_rdma_mr(INFO, "responder_resources=%d\n",
+ info->responder_resources);
+
+ /* Need to send IRD/ORD in private data for iWARP */
+ info->id->device->get_port_immutable(
+ info->id->device, info->id->port_num, &port_immutable);
+ if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
+ ird_ord_hdr[0] = info->responder_resources;
+ ird_ord_hdr[1] = 1;
+ conn_param.private_data = ird_ord_hdr;
+ conn_param.private_data_len = sizeof(ird_ord_hdr);
+ } else {
+ conn_param.private_data = NULL;
+ conn_param.private_data_len = 0;
+ }
+
+ conn_param.retry_count = SMBD_CM_RETRY;
+ conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
+ conn_param.flow_control = 0;
+ init_waitqueue_head(&info->wait_destroy);
+
+ /* NOTE(review): the %pI4 format below treats dstaddr as IPv4 - confirm */
+ log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
+ &addr_in->sin_addr, port);
+
+ init_waitqueue_head(&info->conn_wait);
+ rc = rdma_connect(info->id, &conn_param);
+ if (rc) {
+ log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
+ goto rdma_connect_failed;
+ }
+
+ wait_event_interruptible( /* result ignored: an interrupted wait leaves the status at SMBD_CONNECTING and fails the check below */
+ info->conn_wait, info->transport_status != SMBD_CONNECTING);
+
+ if (info->transport_status != SMBD_CONNECTED) {
+ log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
+ goto rdma_connect_failed;
+ }
+
+ log_rdma_event(INFO, "rdma_connect connected\n");
+
+ rc = allocate_caches_and_workqueue(info);
+ if (rc) {
+ log_rdma_event(ERR, "cache allocation failed\n");
+ goto allocate_cache_failed;
+ }
+
+ init_waitqueue_head(&info->wait_send_queue);
+ init_waitqueue_head(&info->wait_reassembly_queue);
+
+ INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
+ INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work);
+ queue_delayed_work(info->workqueue, &info->idle_timer_work,
+ info->keep_alive_interval*HZ);
+
+ init_waitqueue_head(&info->wait_smbd_send_pending);
+ info->smbd_send_pending = 0;
+
+ init_waitqueue_head(&info->wait_smbd_recv_pending);
+ info->smbd_recv_pending = 0;
+
+ init_waitqueue_head(&info->wait_send_pending);
+ atomic_set(&info->send_pending, 0);
+
+ init_waitqueue_head(&info->wait_send_payload_pending);
+ atomic_set(&info->send_payload_pending, 0);
+
+ INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
+ INIT_WORK(&info->destroy_work, smbd_destroy_rdma_work);
+ INIT_WORK(&info->recv_done_work, smbd_recv_done_work);
+ INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
+ info->new_credits_offered = 0;
+ spin_lock_init(&info->lock_new_credits_offered);
+
+ rc = smbd_negotiate(info);
+ if (rc) {
+ log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
+ goto negotiation_failed;
+ }
+
+ rc = allocate_mr_list(info);
+ if (rc) {
+ log_rdma_mr(ERR, "memory registration allocation failed\n");
+ goto allocate_mr_failed;
+ }
+
+ return info;
+
+allocate_mr_failed:
+ /* At this point, we need to do a full transport shutdown */
+ smbd_destroy(info);
+ return NULL;
+
+negotiation_failed:
+ cancel_delayed_work_sync(&info->idle_timer_work);
+ destroy_caches_and_workqueue(info);
+ info->transport_status = SMBD_NEGOTIATE_FAILED;
+ init_waitqueue_head(&info->conn_wait);
+ rdma_disconnect(info->id);
+ wait_event(info->conn_wait,
+ info->transport_status == SMBD_DISCONNECTED);
+
+allocate_cache_failed:
+rdma_connect_failed:
+ rdma_destroy_qp(info->id);
+
+create_qp_failed:
+alloc_cq_failed:
+ if (info->send_cq)
+ ib_free_cq(info->send_cq);
+ if (info->recv_cq)
+ ib_free_cq(info->recv_cq);
+
+config_failed:
+ ib_dealloc_pd(info->pd);
+ rdma_destroy_id(info->id);
+
+create_id_failed:
+ kfree(info);
+ return NULL;
+}
+
+/*
+ * Create a SMBD connection to dstaddr, called by upper layer
+ * SMBD_PORT is tried first; if that fails SMB_PORT is tried once
+ * return value: the connection, or NULL if no attempt succeeded
+ */
+struct smbd_connection *smbd_get_connection(
+	struct TCP_Server_Info *server, struct sockaddr *dstaddr)
+{
+	struct smbd_connection *conn;
+	int port = SMBD_PORT;
+
+	for (;;) {
+		conn = _smbd_get_connection(server, dstaddr, port);
+
+		/* Done on success, or once the fallback port was tried */
+		if (conn || port != SMBD_PORT)
+			return conn;
+
+		/* Try SMB_PORT if SMBD_PORT doesn't work */
+		port = SMB_PORT;
+	}
+}
+
+/*
+ * Receive data from receive reassembly queue
+ * All the incoming data packets are placed in reassembly queue
+ * buf: the buffer to read data into
+ * size: the length of data to read
+ * return value: actual data read, or -ENODEV if the transport is not
+ * connected or the wait for more data was interrupted
+ * The call sleeps until at least size bytes are queued, so a successful
+ * return is never a partial read. When size is 4 and the entry at the front
+ * of the queue is the first segment of a payload, the 4 bytes returned are a
+ * synthesized RFC1002 length and nothing is consumed from the queue.
+ * Note: this implementation copies the data from reassembly queue to receive
+ * buffers used by upper layer. This is not the optimal code path. A better way
+ * to do it is to not have upper layer allocate its receive buffers but rather
+ * borrow the buffer from reassembly queue, and return it after data is
+ * consumed. But this will require more changes to upper layer code, and also
+ * need to consider packet boundaries while they still being reassembled.
+ */
+static int smbd_recv_buf(struct smbd_connection *info, char *buf,
+ unsigned int size)
+{
+ struct smbd_response *response;
+ struct smbd_data_transfer *data_transfer;
+ int to_copy, to_read, data_read, offset;
+ u32 data_length, remaining_data_length, data_offset;
+ int rc;
+
+again:
+ if (info->transport_status != SMBD_CONNECTED) {
+ log_read(ERR, "disconnected\n");
+ return -ENODEV;
+ }
+
+ /*
+ * No need to hold the reassembly queue lock all the time as we are
+ * the only one reading from the front of the queue. The transport
+ * may add more entries to the back of the queue at the same time
+ */
+ log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size,
+ info->reassembly_data_length);
+ if (info->reassembly_data_length >= size) {
+ int queue_length;
+ int queue_removed = 0;
+
+ /*
+ * Need to make sure reassembly_data_length is read before
+ * reading reassembly_queue_length and calling
+ * _get_first_reassembly. This call is lock free
+ * as we never read at the end of the queue which are being
+ * updated in SOFTIRQ as more data is received
+ */
+ virt_rmb();
+ queue_length = info->reassembly_queue_length;
+ data_read = 0;
+ to_read = size;
+ offset = info->first_entry_offset;
+ while (data_read < size) {
+ response = _get_first_reassembly(info);
+ data_transfer = smbd_response_payload(response);
+ data_length = le32_to_cpu(data_transfer->data_length);
+ remaining_data_length =
+ le32_to_cpu(
+ data_transfer->remaining_data_length);
+ data_offset = le32_to_cpu(data_transfer->data_offset);
+
+ /*
+ * The upper layer expects RFC1002 length at the
+ * beginning of the payload. Return it to indicate
+ * the total length of the packet. This minimize the
+ * change to upper layer packet processing logic. This
+ * will be eventually remove when an intermediate
+ * transport layer is added
+ */
+ if (response->first_segment && size == 4) {
+ unsigned int rfc1002_len =
+ data_length + remaining_data_length;
+ *((__be32 *)buf) = cpu_to_be32(rfc1002_len);
+ data_read = 4;
+ response->first_segment = false;
+ log_read(INFO, "returning rfc1002 length %d\n",
+ rfc1002_len);
+ goto read_rfc1002_done;
+ }
+
+ to_copy = min_t(int, data_length - offset, to_read);
+ memcpy(
+ buf + data_read,
+ (char *)data_transfer + data_offset + offset,
+ to_copy);
+
+ /* move on to the next buffer? */
+ if (to_copy == data_length - offset) {
+ queue_length--;
+ /*
+ * No need to lock if we are not at the
+ * end of the queue
+ */
+ if (!queue_length)
+ spin_lock_irq(
+ &info->reassembly_queue_lock);
+ list_del(&response->list);
+ queue_removed++;
+ if (!queue_length)
+ spin_unlock_irq(
+ &info->reassembly_queue_lock);
+
+ info->count_reassembly_queue--;
+ info->count_dequeue_reassembly_queue++;
+ put_receive_buffer(info, response);
+ offset = 0;
+ log_read(INFO, "put_receive_buffer offset=0\n");
+ } else
+ offset += to_copy;
+
+ to_read -= to_copy;
+ data_read += to_copy;
+
+ log_read(INFO, "_get_first_reassembly memcpy %d bytes "
+ "data_transfer_length-offset=%d after that "
+ "to_read=%d data_read=%d offset=%d\n",
+ to_copy, data_length - offset,
+ to_read, data_read, offset);
+ }
+
+ spin_lock_irq(&info->reassembly_queue_lock);
+ info->reassembly_data_length -= data_read;
+ info->reassembly_queue_length -= queue_removed;
+ spin_unlock_irq(&info->reassembly_queue_lock);
+
+ info->first_entry_offset = offset;
+ log_read(INFO, "returning to thread data_read=%d "
+ "reassembly_data_length=%d first_entry_offset=%d\n",
+ data_read, info->reassembly_data_length,
+ info->first_entry_offset);
+read_rfc1002_done:
+ return data_read;
+ }
+
+ log_read(INFO, "wait_event on more data\n");
+ rc = wait_event_interruptible(
+ info->wait_reassembly_queue,
+ info->reassembly_data_length >= size ||
+ info->transport_status != SMBD_CONNECTED);
+ /* Don't return any data if interrupted */
+ if (rc)
+ return -ENODEV;
+
+ goto again;
+}
+
+/*
+ * Receive a page from receive reassembly queue
+ * page: the page to read data into
+ * to_read: the length of data to read
+ * return value: the result of smbd_recv_buf() for the mapped page, or the
+ * negative error from wait_event_interruptible() if the wait was interrupted
+ */
+static int smbd_recv_page(struct smbd_connection *info,
+		struct page *page, unsigned int to_read)
+{
+	int ret;
+	char *to_address;
+
+	/* make sure we have the page ready for read */
+	ret = wait_event_interruptible(
+		info->wait_reassembly_queue,
+		info->reassembly_data_length >= to_read ||
+			info->transport_status != SMBD_CONNECTED);
+	/*
+	 * Report the interruption to the caller instead of 0, which is
+	 * indistinguishable from "no data was read".
+	 */
+	if (ret)
+		return ret;
+
+	/* now we can read from reassembly queue and not sleep */
+	to_address = kmap_atomic(page);
+
+	log_read(INFO, "reading from page=%p address=%p to_read=%d\n",
+		page, to_address, to_read);
+
+	ret = smbd_recv_buf(info, to_address, to_read);
+	kunmap_atomic(to_address);
+
+	return ret;
+}
+
+/*
+ * Receive data from transport
+ * msg: msghdr whose iterator describes the destination; the first segment
+ *      of an ITER_KVEC or ITER_BVEC iterator is used
+ * return: total bytes read, 0, or a negative error (-EIO when the iterator
+ * type is neither of the above). SMB Direct will not do partial read.
+ */
+int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
+{
+	unsigned int len;
+	int nread;
+
+	/* account for this receive while it is in flight */
+	info->smbd_recv_pending++;
+
+	if (msg->msg_iter.type == (READ | ITER_KVEC)) {
+		char *dst = msg->msg_iter.kvec->iov_base;
+
+		len = msg->msg_iter.kvec->iov_len;
+		nread = smbd_recv_buf(info, dst, len);
+	} else if (msg->msg_iter.type == (READ | ITER_BVEC)) {
+		struct page *dst_page = msg->msg_iter.bvec->bv_page;
+
+		len = msg->msg_iter.bvec->bv_len;
+		nread = smbd_recv_page(info, dst_page, len);
+	} else {
+		/* It's a bug in upper layer to get there */
+		cifs_dbg(VFS, "CIFS: invalid msg type %d\n",
+			msg->msg_iter.type);
+		nread = -EIO;
+	}
+
+	info->smbd_recv_pending--;
+	wake_up(&info->wait_smbd_recv_pending);
+
+	/* SMBDirect will read it all or nothing */
+	if (nread > 0)
+		msg->msg_iter.count = 0;
+
+	return nread;
+}
+
+/*
+ * Send data to transport
+ * Each rqst is transported as a SMBDirect payload
+ * rqst: the data to write
+ * return value: 0 if successfully write, otherwise error code
+ *
+ * The first 4 bytes of rq_iov[0] (the RFC1002 length) are skipped. The
+ * remaining iovs are grouped into sends of at most max_iov_size payload
+ * bytes; a single iov larger than that is split into several sends. Pages,
+ * if any, are sent after the iovs. On every path the function waits for
+ * send_payload_pending to reach 0 before returning.
+ */
+int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
+{
+	struct kvec vec;
+	int nvecs;
+	int size;
+	int buflen = 0, remaining_data_length;
+	int start, i, j;
+	int max_iov_size =
+		info->max_send_size - sizeof(struct smbd_data_transfer);
+	struct kvec iov[SMBDIRECT_MAX_SGE];
+	int rc;
+
+	info->smbd_send_pending++;
+	if (info->transport_status != SMBD_CONNECTED) {
+		rc = -ENODEV;
+		goto done;
+	}
+
+	/*
+	 * This usually means a configuration error
+	 * We use RDMA read/write for packet size > rdma_readwrite_threshold
+	 * as long as it's properly configured we should never get into this
+	 * situation
+	 */
+	if (rqst->rq_nvec + rqst->rq_npages > SMBDIRECT_MAX_SGE) {
+		log_write(ERR, "maximum send segment %x exceeding %x\n",
+			 rqst->rq_nvec + rqst->rq_npages, SMBDIRECT_MAX_SGE);
+		rc = -EINVAL;
+		goto done;
+	}
+
+	/*
+	 * Remove the RFC1002 length defined in MS-SMB2 section 2.1
+	 * It is used only for TCP transport
+	 * In future we may want to add a transport layer under protocol
+	 * layer so this will only be issued to TCP transport
+	 */
+	iov[0].iov_base = (char *)rqst->rq_iov[0].iov_base + 4;
+	iov[0].iov_len = rqst->rq_iov[0].iov_len - 4;
+	buflen += iov[0].iov_len;
+
+	/* total up iov array first */
+	for (i = 1; i < rqst->rq_nvec; i++) {
+		iov[i].iov_base = rqst->rq_iov[i].iov_base;
+		iov[i].iov_len = rqst->rq_iov[i].iov_len;
+		buflen += iov[i].iov_len;
+	}
+
+	/* add in the page array if there is one */
+	if (rqst->rq_npages) {
+		buflen += rqst->rq_pagesz * (rqst->rq_npages - 1);
+		buflen += rqst->rq_tailsz;
+	}
+
+	if (buflen + sizeof(struct smbd_data_transfer) >
+		info->max_fragmented_send_size) {
+		log_write(ERR, "payload size %d > max size %d\n",
+			buflen, info->max_fragmented_send_size);
+		rc = -EINVAL;
+		goto done;
+	}
+
+	/* counts down as data is posted; passed along with every send */
+	remaining_data_length = buflen;
+
+	log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d "
+		"rq_tailsz=%d buflen=%d\n",
+		rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz,
+		rqst->rq_tailsz, buflen);
+
+	/* skip iov[0] if nothing is left of it after dropping the length */
+	start = i = iov[0].iov_len ? 0 : 1;
+	buflen = 0;
+	while (true) {
+		buflen += iov[i].iov_len;
+		if (buflen > max_iov_size) {
+			if (i > start) {
+				/* send iov[start..i-1]; iov[i] is retried */
+				remaining_data_length -=
+					(buflen-iov[i].iov_len);
+				log_write(INFO, "sending iov[] from start=%d "
+					"i=%d nvecs=%d "
+					"remaining_data_length=%d\n",
+					start, i, i-start,
+					remaining_data_length);
+				rc = smbd_post_send_data(
+					info, &iov[start], i-start,
+					remaining_data_length);
+				if (rc)
+					goto done;
+			} else {
+				/* iov[start] is too big, break it */
+				nvecs = (buflen+max_iov_size-1)/max_iov_size;
+				log_write(INFO, "iov[%d] iov_base=%p buflen=%d"
+					" break to %d vectors\n",
+					start, iov[start].iov_base,
+					buflen, nvecs);
+				for (j = 0; j < nvecs; j++) {
+					vec.iov_base =
+						(char *)iov[start].iov_base +
+						j*max_iov_size;
+					vec.iov_len = max_iov_size;
+					if (j == nvecs-1)
+						vec.iov_len =
+							buflen -
+							max_iov_size*(nvecs-1);
+					remaining_data_length -= vec.iov_len;
+					log_write(INFO,
+						"sending vec j=%d iov_base=%p"
+						" iov_len=%zu "
+						"remaining_data_length=%d\n",
+						j, vec.iov_base, vec.iov_len,
+						remaining_data_length);
+					rc = smbd_post_send_data(
+						info, &vec, 1,
+						remaining_data_length);
+					if (rc)
+						goto done;
+				}
+				i++;
+				/*
+				 * The iov just split may have been the last
+				 * one; stop here so the next pass does not
+				 * read iov[] past rq_nvec.
+				 */
+				if (i == rqst->rq_nvec)
+					break;
+			}
+			start = i;
+			buflen = 0;
+		} else {
+			i++;
+			if (i == rqst->rq_nvec) {
+				/* send out all remaining vecs */
+				remaining_data_length -= buflen;
+				log_write(INFO,
+					"sending iov[] from start=%d i=%d "
+					"nvecs=%d remaining_data_length=%d\n",
+					start, i, i-start,
+					remaining_data_length);
+				rc = smbd_post_send_data(info, &iov[start],
+					i-start, remaining_data_length);
+				if (rc)
+					goto done;
+				break;
+			}
+		}
+		log_write(INFO, "looping i=%d buflen=%d\n", i, buflen);
+	}
+
+	/* now sending pages if there are any */
+	for (i = 0; i < rqst->rq_npages; i++) {
+		/* every page is rq_pagesz long except the last (rq_tailsz) */
+		buflen = (i == rqst->rq_npages-1) ?
+			rqst->rq_tailsz : rqst->rq_pagesz;
+		nvecs = (buflen + max_iov_size - 1) / max_iov_size;
+		log_write(INFO, "sending pages buflen=%d nvecs=%d\n",
+			buflen, nvecs);
+		for (j = 0; j < nvecs; j++) {
+			size = max_iov_size;
+			if (j == nvecs-1)
+				size = buflen - j*max_iov_size;
+			remaining_data_length -= size;
+			log_write(INFO, "sending pages i=%d offset=%d size=%d"
+				" remaining_data_length=%d\n",
+				i, j*max_iov_size, size, remaining_data_length);
+			rc = smbd_post_send_page(
+				info, rqst->rq_pages[i], j*max_iov_size,
+				size, remaining_data_length);
+			if (rc)
+				goto done;
+		}
+	}
+
+done:
+	/*
+	 * As an optimization, we don't wait for individual I/O to finish
+	 * before sending the next one.
+	 * Send them all and wait for pending send count to get to 0
+	 * that means all the I/Os have been out and we are good to return
+	 */
+
+	wait_event(info->wait_send_payload_pending,
+		atomic_read(&info->send_payload_pending) == 0);
+
+	info->smbd_send_pending--;
+	wake_up(&info->wait_smbd_send_pending);
+
+	return rc;
+}
+
+/*
+ * Completion handler for the IB_WR_REG_MR work request posted by
+ * smbd_register_mr(). A non-zero completion status is logged and the
+ * connection owning the MR is disconnected; success needs no action.
+ */
+static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct smbd_mr *failed_mr;
+
+	if (!wc->status)
+		return;
+
+	log_rdma_mr(ERR, "status=%d\n", wc->status);
+	/* the cqe is embedded in the smbd_mr that issued the request */
+	failed_mr = container_of(wc->wr_cqe, struct smbd_mr, cqe);
+	smbd_disconnect_rdma_connection(failed_mr->conn);
+}
+
+/*
+ * The work queue function that recovers MRs
+ * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used
+ * again. Both calls are slow, so finish them in a workqueue. This will not
+ * block I/O path.
+ * There is one workqueue that recovers MRs, there is no need to lock as the
+ * I/O requests calling smbd_register_mr will never update the links in the
+ * mr_list.
+ *
+ * MR_INVALIDATED entries are DMA-unmapped and marked MR_READY.
+ * MR_ERROR entries are deregistered and re-allocated; if either step fails
+ * the connection is disconnected and the entry is left in MR_ERROR so it is
+ * never handed out by get_mr().
+ */
+static void smbd_mr_recovery_work(struct work_struct *work)
+{
+	struct smbd_connection *info =
+		container_of(work, struct smbd_connection, mr_recovery_work);
+	struct smbd_mr *smbdirect_mr;
+	int rc;
+
+	list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
+		if (smbdirect_mr->state == MR_INVALIDATED ||
+			smbdirect_mr->state == MR_ERROR) {
+
+			if (smbdirect_mr->state == MR_INVALIDATED) {
+				ib_dma_unmap_sg(
+					info->id->device, smbdirect_mr->sgl,
+					smbdirect_mr->sgl_count,
+					smbdirect_mr->dir);
+				smbdirect_mr->state = MR_READY;
+			} else if (smbdirect_mr->state == MR_ERROR) {
+
+				/* recover this MR entry */
+				rc = ib_dereg_mr(smbdirect_mr->mr);
+				if (rc) {
+					log_rdma_mr(ERR,
+						"ib_dereg_mr faield rc=%x\n",
+						rc);
+					smbd_disconnect_rdma_connection(info);
+					/* not recovered, don't mark ready */
+					continue;
+				}
+
+				smbdirect_mr->mr = ib_alloc_mr(
+					info->pd, info->mr_type,
+					info->max_frmr_depth);
+				if (IS_ERR(smbdirect_mr->mr)) {
+					log_rdma_mr(ERR,
+						"ib_alloc_mr failed mr_type=%x "
+						"max_frmr_depth=%x\n",
+						info->mr_type,
+						info->max_frmr_depth);
+					smbd_disconnect_rdma_connection(info);
+					/*
+					 * ->mr holds an error pointer; it
+					 * must not become available for I/O
+					 */
+					continue;
+				}
+
+				smbdirect_mr->state = MR_READY;
+			}
+			/* smbdirect_mr->state is updated by this function
+			 * and is read and updated by I/O issuing CPUs trying
+			 * to get a MR, the call to atomic_inc_return
+			 * implicates a memory barrier and guarantees this
+			 * value is updated before waking up any calls to
+			 * get_mr() from the I/O issuing CPUs
+			 */
+			if (atomic_inc_return(&info->mr_ready_count) == 1)
+				wake_up_interruptible(&info->wait_mr);
+		}
+	}
+}
+
+/*
+ * Release every MR on info->mr_list: deregister the MR and free its
+ * scatterlist and the list entry. The recovery work is cancelled first.
+ */
+static void destroy_mr_list(struct smbd_connection *info)
+{
+	struct smbd_mr *entry, *next;
+
+	cancel_work_sync(&info->mr_recovery_work);
+
+	list_for_each_entry_safe(entry, next, &info->mr_list, list) {
+		/*
+		 * MR_INVALIDATED entries are unmapped here, the same step
+		 * smbd_mr_recovery_work() performs for that state.
+		 */
+		if (entry->state == MR_INVALIDATED)
+			ib_dma_unmap_sg(info->id->device, entry->sgl,
+				entry->sgl_count, entry->dir);
+
+		ib_dereg_mr(entry->mr);
+		kfree(entry->sgl);
+		kfree(entry);
+	}
+}
+
+/*
+ * Allocate MRs used for RDMA read/write
+ * Twice responder_resources entries are created; each gets an ib_mr and a
+ * scatterlist of max_frmr_depth entries and starts out as MR_READY.
+ * All MRs are kept in mr_list. The MR can be recovered after it's used
+ * Recovery is done in smbd_mr_recovery_work. The content of list entry changes
+ * as MRs are used and recovered for I/O, but the list links will not change
+ * return value: 0 on success; -ENOMEM on any failure, after releasing the
+ * entries created so far
+ */
+static int allocate_mr_list(struct smbd_connection *info)
+{
+	struct smbd_mr *entry, *next;
+	int n;
+
+	INIT_LIST_HEAD(&info->mr_list);
+	init_waitqueue_head(&info->wait_mr);
+	spin_lock_init(&info->mr_list_lock);
+	atomic_set(&info->mr_ready_count, 0);
+	atomic_set(&info->mr_used_count, 0);
+	init_waitqueue_head(&info->wait_for_mr_cleanup);
+
+	/* Allocate more MRs (2x) than hardware responder_resources */
+	for (n = 0; n < info->responder_resources * 2; n++) {
+		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+		if (!entry)
+			goto out;
+
+		entry->mr = ib_alloc_mr(info->pd, info->mr_type,
+					info->max_frmr_depth);
+		if (IS_ERR(entry->mr)) {
+			log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x "
+				"max_frmr_depth=%x\n",
+				info->mr_type, info->max_frmr_depth);
+			goto out;
+		}
+
+		entry->sgl = kcalloc(info->max_frmr_depth,
+				     sizeof(struct scatterlist),
+				     GFP_KERNEL);
+		if (!entry->sgl) {
+			log_rdma_mr(ERR, "failed to allocate sgl\n");
+			/* this entry is not on the list yet; undo its MR */
+			ib_dereg_mr(entry->mr);
+			goto out;
+		}
+
+		entry->state = MR_READY;
+		entry->conn = info;
+
+		list_add_tail(&entry->list, &info->mr_list);
+		atomic_inc(&info->mr_ready_count);
+	}
+
+	INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
+	return 0;
+
+out:
+	/* the entry being built (may be NULL) was never linked */
+	kfree(entry);
+
+	list_for_each_entry_safe(entry, next, &info->mr_list, list) {
+		ib_dereg_mr(entry->mr);
+		kfree(entry->sgl);
+		kfree(entry);
+	}
+	return -ENOMEM;
+}
+
+/*
+ * Get a MR from mr_list. This function waits until there is at least one
+ * MR available in the list. It may access the list while the
+ * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
+ * as they never modify the same places. However, there may be several CPUs
+ * issuing I/O trying to get MR at the same time, mr_list_lock is used to
+ * protect this situation.
+ * return value: an MR moved from MR_READY to MR_REGISTERED, or NULL if the
+ * wait was interrupted or the transport is no longer connected
+ */
+static struct smbd_mr *get_mr(struct smbd_connection *info)
+{
+	struct smbd_mr *candidate;
+	int rc;
+
+	for (;;) {
+		rc = wait_event_interruptible(info->wait_mr,
+			atomic_read(&info->mr_ready_count) ||
+			info->transport_status != SMBD_CONNECTED);
+		if (rc) {
+			log_rdma_mr(ERR,
+				"wait_event_interruptible rc=%x\n", rc);
+			return NULL;
+		}
+
+		if (info->transport_status != SMBD_CONNECTED) {
+			log_rdma_mr(ERR, "info->transport_status=%x\n",
+				info->transport_status);
+			return NULL;
+		}
+
+		spin_lock(&info->mr_list_lock);
+		list_for_each_entry(candidate, &info->mr_list, list) {
+			if (candidate->state != MR_READY)
+				continue;
+
+			/* claim it under the lock, then fix up counters */
+			candidate->state = MR_REGISTERED;
+			spin_unlock(&info->mr_list_lock);
+			atomic_dec(&info->mr_ready_count);
+			atomic_inc(&info->mr_used_count);
+			return candidate;
+		}
+		spin_unlock(&info->mr_list_lock);
+
+		/*
+		 * It is possible that we could fail to get MR because other
+		 * processes may try to acquire a MR at the same time. If this
+		 * is the case, retry it.
+		 */
+	}
+}
+
+/*
+ * Register memory for RDMA read/write
+ * pages[]: the list of pages to register memory with
+ * num_pages: the number of pages to register
+ * tailsz: if non-zero, the bytes to register in the last page
+ * writing: true if this is a RDMA write (SMB read), false for RDMA read
+ * need_invalidate: true if this MR needs to be locally invalidated after I/O
+ * return value: the MR registered, NULL if failed.
+ *
+ * Fails up front if num_pages exceeds max_frmr_depth. May wait in get_mr()
+ * until an MR is available. On any later failure the MR is marked MR_ERROR
+ * and mr_used_count is dropped.
+ */
+struct smbd_mr *smbd_register_mr(
+ struct smbd_connection *info, struct page *pages[], int num_pages,
+ int tailsz, bool writing, bool need_invalidate)
+{
+ struct smbd_mr *smbdirect_mr;
+ int rc, i;
+ enum dma_data_direction dir;
+ struct ib_reg_wr *reg_wr;
+ struct ib_send_wr *bad_wr;
+
+ if (num_pages > info->max_frmr_depth) {
+ log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
+ num_pages, info->max_frmr_depth);
+ return NULL;
+ }
+
+ smbdirect_mr = get_mr(info);
+ if (!smbdirect_mr) {
+ log_rdma_mr(ERR, "get_mr returning NULL\n");
+ return NULL;
+ }
+ smbdirect_mr->need_invalidate = need_invalidate;
+ smbdirect_mr->sgl_count = num_pages;
+ sg_init_table(smbdirect_mr->sgl, num_pages);
+
+ /* all pages but the last are registered in full */
+ for (i = 0; i < num_pages - 1; i++)
+ sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0);
+
+ /* the last page covers tailsz bytes, or a full page if tailsz is 0 */
+ sg_set_page(&smbdirect_mr->sgl[i], pages[i],
+ tailsz ? tailsz : PAGE_SIZE, 0);
+
+ dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+ smbdirect_mr->dir = dir;
+ rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir);
+ /* a return of 0 is treated as a mapping failure */
+ if (!rc) {
+ log_rdma_mr(INFO, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
+ num_pages, dir, rc);
+ goto dma_map_error;
+ }
+
+ rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages,
+ NULL, PAGE_SIZE);
+ /* anything other than all num_pages entries mapped is a failure */
+ if (rc != num_pages) {
+ log_rdma_mr(INFO,
+ "ib_map_mr_sg failed rc = %x num_pages = %x\n",
+ rc, num_pages);
+ goto map_mr_error;
+ }
+
+ /* bump the rkey before building the registration work request */
+ ib_update_fast_reg_key(smbdirect_mr->mr,
+ ib_inc_rkey(smbdirect_mr->mr->rkey));
+ reg_wr = &smbdirect_mr->wr;
+ reg_wr->wr.opcode = IB_WR_REG_MR;
+ smbdirect_mr->cqe.done = register_mr_done;
+ reg_wr->wr.wr_cqe = &smbdirect_mr->cqe;
+ reg_wr->wr.num_sge = 0;
+ reg_wr->wr.send_flags = IB_SEND_SIGNALED;
+ reg_wr->mr = smbdirect_mr->mr;
+ reg_wr->key = smbdirect_mr->mr->rkey;
+ reg_wr->access = writing ?
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+ IB_ACCESS_REMOTE_READ;
+
+ /*
+ * There is no need for waiting for completion on ib_post_send
+ * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
+ * on the next ib_post_send when we actually send I/O to remote peer
+ */
+ rc = ib_post_send(info->id->qp, &reg_wr->wr, &bad_wr);
+ if (!rc)
+ return smbdirect_mr;
+
+ log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
+ rc, reg_wr->key);
+
+ /* If all failed, attempt to recover this MR by setting it MR_ERROR */
+map_mr_error:
+ ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl,
+ smbdirect_mr->sgl_count, smbdirect_mr->dir);
+
+dma_map_error:
+ /*
+ * NOTE(review): mr_recovery_work is not queued on this path; an MR_ERROR
+ * entry appears to stay unrecovered until something else queues the
+ * work (e.g. smbd_deregister_mr) - confirm this is intended.
+ */
+ smbdirect_mr->state = MR_ERROR;
+ if (atomic_dec_and_test(&info->mr_used_count))
+ wake_up(&info->wait_for_mr_cleanup);
+
+ return NULL;
+}
+
+/*
+ * Completion handler for the IB_WR_LOCAL_INV work request posted by
+ * smbd_deregister_mr(). Records the outcome in the MR state and completes
+ * invalidate_done so the waiter can continue.
+ */
+static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct smbd_mr *inv_mr = container_of(wc->wr_cqe, struct smbd_mr, cqe);
+
+	/* assume success, then downgrade to MR_ERROR on a bad status */
+	inv_mr->state = MR_INVALIDATED;
+	if (wc->status != IB_WC_SUCCESS) {
+		log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
+		inv_mr->state = MR_ERROR;
+	}
+
+	complete(&inv_mr->invalidate_done);
+}
+
+/*
+ * Deregister a MR after I/O is done
+ * This function may wait if remote invalidation is not used
+ * and we have to locally invalidate the buffer to prevent data is being
+ * modified by remote peer after upper layer consumes it
+ * return value: 0, or the ib_post_send() error if the local invalidate
+ * could not be posted (the connection is disconnected in that case)
+ */
+int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
+{
+	struct smbd_connection *conn = smbdirect_mr->conn;
+	struct ib_send_wr *inv, *failed_wr;
+	int rc = 0;
+
+	if (!smbdirect_mr->need_invalidate) {
+		/*
+		 * For remote invalidation, just set it to MR_INVALIDATED
+		 * and defer to mr_recovery_work to recover the MR for next use
+		 */
+		smbdirect_mr->state = MR_INVALIDATED;
+	} else {
+		/* Need to finish local invalidation before returning */
+		inv = &smbdirect_mr->inv_wr;
+		inv->opcode = IB_WR_LOCAL_INV;
+		smbdirect_mr->cqe.done = local_inv_done;
+		inv->wr_cqe = &smbdirect_mr->cqe;
+		inv->num_sge = 0;
+		inv->ex.invalidate_rkey = smbdirect_mr->mr->rkey;
+		inv->send_flags = IB_SEND_SIGNALED;
+
+		init_completion(&smbdirect_mr->invalidate_done);
+		rc = ib_post_send(conn->id->qp, inv, &failed_wr);
+		if (rc) {
+			log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
+			smbd_disconnect_rdma_connection(conn);
+			goto done;
+		}
+		/* local_inv_done() sets the MR state and completes this */
+		wait_for_completion(&smbdirect_mr->invalidate_done);
+		smbdirect_mr->need_invalidate = false;
+	}
+
+	/*
+	 * Schedule the work to do MR recovery for future I/Os
+	 * MR recovery is slow and we don't want it to block the current I/O
+	 */
+	queue_work(conn->workqueue, &conn->mr_recovery_work);
+
+done:
+	if (atomic_dec_and_test(&conn->mr_used_count))
+		wake_up(&conn->wait_for_mr_cleanup);
+
+	return rc;
+}
diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h
new file mode 100644
index 000000000000..f9038daea194
--- /dev/null
+++ b/fs/cifs/smbdirect.h
@@ -0,0 +1,338 @@
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ *
+ * Author(s): Long Li <longli@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+#ifndef _SMBDIRECT_H
+#define _SMBDIRECT_H
+
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#define cifs_rdma_enabled(server) ((server)->rdma)
+
+#include "cifsglob.h"
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/mempool.h>
+
+extern int rdma_readwrite_threshold;
+extern int smbd_max_frmr_depth;
+extern int smbd_keep_alive_interval;
+extern int smbd_max_receive_size;
+extern int smbd_max_fragmented_recv_size;
+extern int smbd_max_send_size;
+extern int smbd_send_credit_target;
+extern int smbd_receive_credit_max;
+
+enum keep_alive_status {
+ KEEP_ALIVE_NONE,
+ KEEP_ALIVE_PENDING,
+ KEEP_ALIVE_SENT,
+};
+
+enum smbd_connection_status {
+ SMBD_CREATED,
+ SMBD_CONNECTING,
+ SMBD_CONNECTED,
+ SMBD_NEGOTIATE_FAILED,
+ SMBD_DISCONNECTING,
+ SMBD_DISCONNECTED,
+ SMBD_DESTROYED
+};
+
+/*
+ * The context for the SMBDirect transport
+ * Everything related to the transport is here. It has several logical parts
+ * 1. RDMA related structures
+ * 2. SMBDirect connection parameters
+ * 3. Memory registrations
+ * 4. Receive and reassembly queues for data receive path
+ * 5. mempools for allocating packets
+ */
+struct smbd_connection {
+ enum smbd_connection_status transport_status;
+
+ /* RDMA related */
+ struct rdma_cm_id *id;
+ struct ib_qp_init_attr qp_attr;
+ struct ib_pd *pd;
+ struct ib_cq *send_cq, *recv_cq;
+ struct ib_device_attr dev_attr;
+ int ri_rc;
+ struct completion ri_done;
+ wait_queue_head_t conn_wait;
+ wait_queue_head_t wait_destroy;
+
+ struct completion negotiate_completion;
+ bool negotiate_done;
+
+ struct work_struct destroy_work;
+ struct work_struct disconnect_work;
+ struct work_struct recv_done_work;
+ struct work_struct post_send_credits_work;
+
+ spinlock_t lock_new_credits_offered;
+ int new_credits_offered;
+
+ /* Connection parameters defined in [MS-SMBD] 3.1.1.1 */
+ int receive_credit_max;
+ int send_credit_target;
+ int max_send_size;
+ int max_fragmented_recv_size;
+ int max_fragmented_send_size;
+ int max_receive_size;
+ int keep_alive_interval;
+ int max_readwrite_size;
+ enum keep_alive_status keep_alive_requested;
+ int protocol;
+ atomic_t send_credits;
+ atomic_t receive_credits;
+ int receive_credit_target;
+ int fragment_reassembly_remaining;
+
+ /* Memory registrations */
+ /* Maximum number of RDMA read/write outstanding on this connection */
+ int responder_resources;
+ /* Maximum number of SGEs in a RDMA write/read */
+ int max_frmr_depth;
+ /*
+ * If payload is less than or equal to the threshold,
+ * use RDMA send/recv to send upper layer I/O.
+ * If payload is more than the threshold,
+ * use RDMA read/write through memory registration for I/O.
+ */
+ int rdma_readwrite_threshold;
+ enum ib_mr_type mr_type;
+ struct list_head mr_list;
+ spinlock_t mr_list_lock;
+ /* The number of available MRs ready for memory registration */
+ atomic_t mr_ready_count;
+ atomic_t mr_used_count;
+ wait_queue_head_t wait_mr;
+ struct work_struct mr_recovery_work;
+ /* Used by transport to wait until all MRs are returned */
+ wait_queue_head_t wait_for_mr_cleanup;
+
+ /* Activity accounting */
+ /* Pending requests issued from upper layer */
+ int smbd_send_pending;
+ wait_queue_head_t wait_smbd_send_pending;
+
+ int smbd_recv_pending;
+ wait_queue_head_t wait_smbd_recv_pending;
+
+ atomic_t send_pending;
+ wait_queue_head_t wait_send_pending;
+ atomic_t send_payload_pending;
+ wait_queue_head_t wait_send_payload_pending;
+
+ /* Receive queue */
+ struct list_head receive_queue;
+ int count_receive_queue;
+ spinlock_t receive_queue_lock;
+
+ struct list_head empty_packet_queue;
+ int count_empty_packet_queue;
+ spinlock_t empty_packet_queue_lock;
+
+ wait_queue_head_t wait_receive_queues;
+
+ /* Reassembly queue */
+ struct list_head reassembly_queue;
+ spinlock_t reassembly_queue_lock;
+ wait_queue_head_t wait_reassembly_queue;
+
+ /* total data length of reassembly queue */
+ int reassembly_data_length;
+ int reassembly_queue_length;
+ /* the offset to first buffer in reassembly queue */
+ int first_entry_offset;
+
+ bool send_immediate;
+
+ wait_queue_head_t wait_send_queue;
+
+ /*
+ * Indicate if we have received a full packet on the connection
+ * This is used to identify the first SMBD packet of a assembled
+ * payload (SMB packet) in reassembly queue so we can return a
+ * RFC1002 length to upper layer to indicate the length of the SMB
+ * packet received
+ */
+ bool full_packet_received;
+
+ struct workqueue_struct *workqueue;
+ struct delayed_work idle_timer_work;
+ struct delayed_work send_immediate_work;
+
+ /* Memory pool for preallocating buffers */
+ /* request pool for RDMA send */
+ struct kmem_cache *request_cache;
+ mempool_t *request_mempool;
+
+ /* response pool for RDMA receive */
+ struct kmem_cache *response_cache;
+ mempool_t *response_mempool;
+
+ /* for debug purposes */
+ unsigned int count_get_receive_buffer;
+ unsigned int count_put_receive_buffer;
+ unsigned int count_reassembly_queue;
+ unsigned int count_enqueue_reassembly_queue;
+ unsigned int count_dequeue_reassembly_queue;
+ unsigned int count_send_empty;
+};
+
+enum smbd_message_type {
+ SMBD_NEGOTIATE_RESP,
+ SMBD_TRANSFER_DATA,
+};
+
+#define SMB_DIRECT_RESPONSE_REQUESTED 0x0001
+
+/* SMBD negotiation request packet [MS-SMBD] 2.2.1 */
+struct smbd_negotiate_req {
+ __le16 min_version;
+ __le16 max_version;
+ __le16 reserved;
+ __le16 credits_requested;
+ __le32 preferred_send_size;
+ __le32 max_receive_size;
+ __le32 max_fragmented_size;
+} __packed;
+
+/* SMBD negotiation response packet [MS-SMBD] 2.2.2 */
+struct smbd_negotiate_resp {
+ __le16 min_version;
+ __le16 max_version;
+ __le16 negotiated_version;
+ __le16 reserved;
+ __le16 credits_requested;
+ __le16 credits_granted;
+ __le32 status;
+ __le32 max_readwrite_size;
+ __le32 preferred_send_size;
+ __le32 max_receive_size;
+ __le32 max_fragmented_size;
+} __packed;
+
+/* SMBD data transfer packet with payload [MS-SMBD] 2.2.3 */
+struct smbd_data_transfer {
+ __le16 credits_requested;
+ __le16 credits_granted;
+ __le16 flags;
+ __le16 reserved;
+ __le32 remaining_data_length;
+ __le32 data_offset;
+ __le32 data_length;
+ __le32 padding;
+ __u8 buffer[];
+} __packed;
+
+/* The packet fields for a registered RDMA buffer */
+struct smbd_buffer_descriptor_v1 {
+ __le64 offset;
+ __le32 token;
+ __le32 length;
+} __packed;
+
+/* Default maximum number of SGEs in a RDMA send/recv */
+#define SMBDIRECT_MAX_SGE 16
+/* The context for a SMBD request */
+struct smbd_request {
+ struct smbd_connection *info;
+ struct ib_cqe cqe;
+
+ /* true if this request carries upper layer payload */
+ bool has_payload;
+
+ /* the SGE entries for this packet */
+ struct ib_sge sge[SMBDIRECT_MAX_SGE];
+ int num_sge;
+
+ /* SMBD packet header follows this structure */
+ u8 packet[];
+};
+
+/* The context for a SMBD response */
+struct smbd_response {
+ struct smbd_connection *info;
+ struct ib_cqe cqe;
+ struct ib_sge sge;
+
+ enum smbd_message_type type;
+
+ /* Link to receive queue or reassembly queue */
+ struct list_head list;
+
+ /* Indicate if this is the 1st packet of a payload */
+ bool first_segment;
+
+ /* SMBD packet header and payload follows this structure */
+ u8 packet[];
+};
+
+/* Create a SMBDirect session */
+struct smbd_connection *smbd_get_connection(
+ struct TCP_Server_Info *server, struct sockaddr *dstaddr);
+
+/* Reconnect SMBDirect session */
+int smbd_reconnect(struct TCP_Server_Info *server);
+/* Destroy SMBDirect session */
+void smbd_destroy(struct smbd_connection *info);
+
+/* Interface for carrying upper layer I/O through send/recv */
+int smbd_recv(struct smbd_connection *info, struct msghdr *msg);
+int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst);
+
+/* Life-cycle state of a struct smbd_mr on the connection's mr_list */
+enum mr_state {
+ /* available; get_mr() hands out entries in this state */
+ MR_READY,
+ /* claimed by get_mr() for an I/O */
+ MR_REGISTERED,
+ /* I/O done; the recovery work unmaps it and returns it to MR_READY */
+ MR_INVALIDATED,
+ /* registration or invalidation failed; the recovery work re-allocates it */
+ MR_ERROR
+};
+
+/* One memory registration used for RDMA read/write */
+struct smbd_mr {
+ /* the connection this MR was allocated for */
+ struct smbd_connection *conn;
+ /* link in conn->mr_list */
+ struct list_head list;
+ enum mr_state state;
+ /* obtained from ib_alloc_mr() */
+ struct ib_mr *mr;
+ /* scatterlist array, allocated with max_frmr_depth entries */
+ struct scatterlist *sgl;
+ /* number of sgl entries in use, set by smbd_register_mr() */
+ int sgl_count;
+ /* DMA direction the sgl was mapped with */
+ enum dma_data_direction dir;
+ /* wr is used to register, inv_wr to locally invalidate */
+ union {
+ struct ib_reg_wr wr;
+ struct ib_send_wr inv_wr;
+ };
+ /* completion entry shared by the registration and invalidation requests */
+ struct ib_cqe cqe;
+ /* true if smbd_deregister_mr() must post a local invalidate */
+ bool need_invalidate;
+ /* completed by local_inv_done() */
+ struct completion invalidate_done;
+};
+
+/* Interfaces to register and deregister MR for RDMA read/write */
+struct smbd_mr *smbd_register_mr(
+ struct smbd_connection *info, struct page *pages[], int num_pages,
+ int tailsz, bool writing, bool need_invalidate);
+int smbd_deregister_mr(struct smbd_mr *mr);
+
+#else
+#define cifs_rdma_enabled(server) 0
+struct smbd_connection {};
+static inline void *smbd_get_connection(
+ struct TCP_Server_Info *server, struct sockaddr *dstaddr) {return NULL;}
+static inline int smbd_reconnect(struct TCP_Server_Info *server) {return -1; }
+static inline void smbd_destroy(struct smbd_connection *info) {}
+static inline int smbd_recv(struct smbd_connection *info, struct msghdr *msg) {return -1; }
+static inline int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) {return -1; }
+#endif
+
+#endif
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7efbab013957..9779b3292d8e 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -37,6 +37,10 @@
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
+#include "smbdirect.h"
+
+/* Max number of iovectors we can use off the stack when sending requests. */
+#define CIFS_MAX_IOV_SIZE 8
void
cifs_wake_up_task(struct mid_q_entry *mid)
@@ -229,7 +233,10 @@ __smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
struct socket *ssocket = server->ssocket;
struct msghdr smb_msg;
int val = 1;
-
+ if (cifs_rdma_enabled(server) && server->smbd_conn) {
+ rc = smbd_send(server->smbd_conn, rqst);
+ goto smbd_done;
+ }
if (ssocket == NULL)
return -ENOTSOCK;
@@ -298,7 +305,7 @@ uncork:
*/
server->tcpStatus = CifsNeedReconnect;
}
-
+smbd_done:
if (rc < 0 && rc != -EINTR)
cifs_dbg(VFS, "Error %d sending data on socket to server\n",
rc);
@@ -803,12 +810,16 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
const int flags, struct kvec *resp_iov)
{
struct smb_rqst rqst;
- struct kvec *new_iov;
+ struct kvec s_iov[CIFS_MAX_IOV_SIZE], *new_iov;
int rc;
- new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), GFP_KERNEL);
- if (!new_iov)
- return -ENOMEM;
+ if (n_vec + 1 > CIFS_MAX_IOV_SIZE) {
+ new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1),
+ GFP_KERNEL);
+ if (!new_iov)
+ return -ENOMEM;
+ } else
+ new_iov = s_iov;
/* 1st iov is a RFC1001 length followed by the rest of the packet */
memcpy(new_iov + 1, iov, (sizeof(struct kvec) * n_vec));
@@ -823,7 +834,51 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
rqst.rq_nvec = n_vec + 1;
rc = cifs_send_recv(xid, ses, &rqst, resp_buf_type, flags, resp_iov);
- kfree(new_iov);
+ if (n_vec + 1 > CIFS_MAX_IOV_SIZE)
+ kfree(new_iov);
+ return rc;
+}
+
+/* Like SendReceive2 but iov[0] does not contain an rfc1002 header */
+int
+smb2_send_recv(const unsigned int xid, struct cifs_ses *ses,
+ struct kvec *iov, int n_vec, int *resp_buf_type /* ret */,
+ const int flags, struct kvec *resp_iov)
+{
+ struct smb_rqst rqst;
+ struct kvec s_iov[CIFS_MAX_IOV_SIZE], *new_iov;
+ int rc;
+ int i;
+ __u32 count;
+ __be32 rfc1002_marker;
+
+ if (n_vec + 1 > CIFS_MAX_IOV_SIZE) {
+ new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1),
+ GFP_KERNEL);
+ if (!new_iov)
+ return -ENOMEM;
+ } else
+ new_iov = s_iov;
+
+ /* 1st iov is an RFC1002 Session Message length */
+ memcpy(new_iov + 1, iov, (sizeof(struct kvec) * n_vec));
+
+ count = 0;
+ for (i = 1; i < n_vec + 1; i++)
+ count += new_iov[i].iov_len;
+
+ rfc1002_marker = cpu_to_be32(count);
+
+ new_iov[0].iov_base = &rfc1002_marker;
+ new_iov[0].iov_len = 4;
+
+ memset(&rqst, 0, sizeof(struct smb_rqst));
+ rqst.rq_iov = new_iov;
+ rqst.rq_nvec = n_vec + 1;
+
+ rc = cifs_send_recv(xid, ses, &rqst, resp_buf_type, flags, resp_iov);
+ if (n_vec + 1 > CIFS_MAX_IOV_SIZE)
+ kfree(new_iov);
return rc;
}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 52f975d848a0..316af84674f1 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -117,7 +117,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
#ifdef CONFIG_CIFS_POSIX
if (!value)
goto out;
- if (sb->s_flags & MS_POSIXACL)
+ if (sb->s_flags & SB_POSIXACL)
rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
value, (const int)size,
ACL_TYPE_ACCESS, cifs_sb->local_nls,
@@ -129,7 +129,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
#ifdef CONFIG_CIFS_POSIX
if (!value)
goto out;
- if (sb->s_flags & MS_POSIXACL)
+ if (sb->s_flags & SB_POSIXACL)
rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
value, (const int)size,
ACL_TYPE_DEFAULT, cifs_sb->local_nls,
@@ -266,7 +266,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
case XATTR_ACL_ACCESS:
#ifdef CONFIG_CIFS_POSIX
- if (sb->s_flags & MS_POSIXACL)
+ if (sb->s_flags & SB_POSIXACL)
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
value, size, ACL_TYPE_ACCESS,
cifs_sb->local_nls,
@@ -276,7 +276,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
case XATTR_ACL_DEFAULT:
#ifdef CONFIG_CIFS_POSIX
- if (sb->s_flags & MS_POSIXACL)
+ if (sb->s_flags & SB_POSIXACL)
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
value, size, ACL_TYPE_DEFAULT,
cifs_sb->local_nls,
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6f0a6a4d5faa..97424cf206c0 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -96,7 +96,7 @@ void coda_destroy_inodecache(void)
static int coda_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- *flags |= MS_NOATIME;
+ *flags |= SB_NOATIME;
return 0;
}
@@ -188,7 +188,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
mutex_unlock(&vc->vc_mutex);
sb->s_fs_info = vc;
- sb->s_flags |= MS_NOATIME;
+ sb->s_flags |= SB_NOATIME;
sb->s_blocksize = 4096; /* XXXXX what do we put here?? */
sb->s_blocksize_bits = 12;
sb->s_magic = CODA_SUPER_MAGIC;
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig
index f937082f3244..58e2fe40b2a0 100644
--- a/fs/cramfs/Kconfig
+++ b/fs/cramfs/Kconfig
@@ -34,6 +34,7 @@ config CRAMFS_BLOCKDEV
config CRAMFS_MTD
bool "Support CramFs image directly mapped in physical memory"
depends on CRAMFS && MTD
+ depends on CRAMFS=m || MTD=y
default y if !CRAMFS_BLOCKDEV
help
This option allows the CramFs driver to load data directly from
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 9a2ab419ba62..017b0ab19bc4 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -505,7 +505,7 @@ static void cramfs_kill_sb(struct super_block *sb)
static int cramfs_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- *flags |= MS_RDONLY;
+ *flags |= SB_RDONLY;
return 0;
}
@@ -592,7 +592,7 @@ static int cramfs_finalize_super(struct super_block *sb,
struct inode *root;
/* Set it all up.. */
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
sb->s_op = &cramfs_ops;
root = get_cramfs_inode(sb, cramfs_root, 0);
if (IS_ERR(root))
diff --git a/fs/dcache.c b/fs/dcache.c
index 5c7df1df81ff..379dce86f001 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1636,8 +1636,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dname[name->len] = 0;
/* Make sure we always see the terminating NUL character */
- smp_wmb();
- dentry->d_name.name = dname;
+ smp_store_release(&dentry->d_name.name, dname); /* ^^^ */
dentry->d_lockref.count = 1;
dentry->d_flags = 0;
@@ -3047,17 +3046,14 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
* retry it again when a d_move() does happen. So any garbage in the buffer
* due to mismatched pointer and length will be discarded.
*
- * Data dependency barrier is needed to make sure that we see that terminating
- * NUL. Alpha strikes again, film at 11...
+ * Load acquire is needed to make sure that we see that terminating NUL.
*/
static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
{
- const char *dname = READ_ONCE(name->name);
+ const char *dname = smp_load_acquire(&name->name); /* ^^^ */
u32 dlen = READ_ONCE(name->len);
char *p;
- smp_read_barrier_depends();
-
*buflen -= dlen + 1;
if (*buflen < 0)
return -ENAMETOOLONG;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index f2677c90d96e..025d66a705db 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -560,8 +560,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
* Set the POSIX ACL flag based on whether they're enabled in the lower
* mount.
*/
- s->s_flags = flags & ~MS_POSIXACL;
- s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL;
+ s->s_flags = flags & ~SB_POSIXACL;
+ s->s_flags |= path.dentry->d_sb->s_flags & SB_POSIXACL;
/**
* Force a read-only eCryptfs mount when:
@@ -569,7 +569,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
* 2) The ecryptfs_encrypted_view mount option is specified
*/
if (sb_rdonly(path.dentry->d_sb) || mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
- s->s_flags |= MS_RDONLY;
+ s->s_flags |= SB_RDONLY;
s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
s->s_blocksize = path.dentry->d_sb->s_blocksize;
@@ -602,7 +602,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
ecryptfs_set_dentry_private(s->s_root, root_info);
root_info->lower_path = path;
- s->s_flags |= MS_ACTIVE;
+ s->s_flags |= SB_ACTIVE;
return dget(s->s_root);
out_free:
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 65b59009555b..6ffb7ba1547a 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -116,7 +116,7 @@ static void destroy_inodecache(void)
static int efs_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- *flags |= MS_RDONLY;
+ *flags |= SB_RDONLY;
return 0;
}
@@ -311,7 +311,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
#ifdef DEBUG
pr_info("forcing read-only mode\n");
#endif
- s->s_flags |= MS_RDONLY;
+ s->s_flags |= SB_RDONLY;
}
s->s_op = &efs_superblock_operations;
s->s_export_op = &efs_export_ops;
diff --git a/fs/exec.c b/fs/exec.c
index 1d6243d9f2b6..7eb8d21bcab9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1216,15 +1216,14 @@ killed:
return -EAGAIN;
}
-char *get_task_comm(char *buf, struct task_struct *tsk)
+char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
- /* buf must be at least sizeof(tsk->comm) in size */
task_lock(tsk);
- strncpy(buf, tsk->comm, sizeof(tsk->comm));
+ strncpy(buf, tsk->comm, buf_size);
task_unlock(tsk);
return buf;
}
-EXPORT_SYMBOL_GPL(get_task_comm);
+EXPORT_SYMBOL_GPL(__get_task_comm);
/*
* These functions flushes out all traces of the currently running executable
@@ -1350,9 +1349,14 @@ void setup_new_exec(struct linux_binprm * bprm)
current->sas_ss_sp = current->sas_ss_size = 0;
- /* Figure out dumpability. */
+ /*
+ * Figure out dumpability. Note that this checking only of current
+ * is wrong, but userspace depends on it. This should be testing
+ * bprm->secureexec instead.
+ */
if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
- bprm->secureexec)
+ !(uid_eq(current_euid(), current_uid()) &&
+ gid_eq(current_egid(), current_gid())))
set_dumpable(current->mm, suid_dumpable);
else
set_dumpable(current->mm, SUID_DUMP_USER);
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 98233a97b7b8..c5a53fcc43ea 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -31,6 +31,7 @@
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <linux/iversion.h>
#include "exofs.h"
static inline unsigned exofs_chunk_size(struct inode *inode)
@@ -60,7 +61,7 @@ static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
struct inode *dir = mapping->host;
int err = 0;
- dir->i_version++;
+ inode_inc_iversion(dir);
if (!PageUptodate(page))
SetPageUptodate(page);
@@ -241,7 +242,7 @@ exofs_readdir(struct file *file, struct dir_context *ctx)
unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
- int need_revalidate = (file->f_version != inode->i_version);
+ bool need_revalidate = inode_cmp_iversion(inode, file->f_version);
if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
return 0;
@@ -264,8 +265,8 @@ exofs_readdir(struct file *file, struct dir_context *ctx)
chunk_mask);
ctx->pos = (n<<PAGE_SHIFT) + offset;
}
- file->f_version = inode->i_version;
- need_revalidate = 0;
+ file->f_version = inode_query_iversion(inode);
+ need_revalidate = false;
}
de = (struct exofs_dir_entry *)(kaddr + offset);
limit = kaddr + exofs_last_byte(inode, n) -
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 819624cfc8da..7e244093c0e5 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -38,6 +38,7 @@
#include <linux/module.h>
#include <linux/exportfs.h>
#include <linux/slab.h>
+#include <linux/iversion.h>
#include "exofs.h"
@@ -159,7 +160,7 @@ static struct inode *exofs_alloc_inode(struct super_block *sb)
if (!oi)
return NULL;
- oi->vfs_inode.i_version = 1;
+ inode_set_iversion(&oi->vfs_inode, 1);
return &oi->vfs_inode;
}
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index e1b3724bebf2..33db13365c5e 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -548,7 +548,7 @@ do_more:
}
mark_buffer_dirty(bitmap_bh);
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
sync_dirty_buffer(bitmap_bh);
group_adjust_blocks(sb, block_group, desc, bh2, group_freed);
@@ -1424,7 +1424,7 @@ allocated:
percpu_counter_sub(&sbi->s_freeblocks_counter, num);
mark_buffer_dirty(bitmap_bh);
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
sync_dirty_buffer(bitmap_bh);
*errp = 0;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 987647986f47..4111085a129f 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -26,6 +26,7 @@
#include <linux/buffer_head.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
+#include <linux/iversion.h>
typedef struct ext2_dir_entry_2 ext2_dirent;
@@ -92,7 +93,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
struct inode *dir = mapping->host;
int err = 0;
- dir->i_version++;
+ inode_inc_iversion(dir);
block_write_end(NULL, mapping, pos, len, len, page, NULL);
if (pos+len > dir->i_size) {
@@ -293,7 +294,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
unsigned char *types = NULL;
- int need_revalidate = file->f_version != inode->i_version;
+ bool need_revalidate = inode_cmp_iversion(inode, file->f_version);
if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
return 0;
@@ -319,8 +320,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
offset = ext2_validate_entry(kaddr, offset, chunk_mask);
ctx->pos = (n<<PAGE_SHIFT) + offset;
}
- file->f_version = inode->i_version;
- need_revalidate = 0;
+ file->f_version = inode_query_iversion(inode);
+ need_revalidate = false;
}
de = (ext2_dirent *)(kaddr+offset);
limit = kaddr + ext2_last_byte(inode, n) - EXT2_DIR_REC_LEN(1);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index a1fc3dabca41..6484199b35d1 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -145,7 +145,7 @@ void ext2_free_inode (struct inode * inode)
else
ext2_release_inode(sb, block_group, is_directory);
mark_buffer_dirty(bitmap_bh);
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
sync_dirty_buffer(bitmap_bh);
brelse(bitmap_bh);
@@ -517,7 +517,7 @@ repeat_in_this_group:
goto fail;
got:
mark_buffer_dirty(bitmap_bh);
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
sync_dirty_buffer(bitmap_bh);
brelse(bitmap_bh);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index e2b6be03e69b..554c98b8a93a 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -33,6 +33,7 @@
#include <linux/quotaops.h>
#include <linux/uaccess.h>
#include <linux/dax.h>
+#include <linux/iversion.h>
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
@@ -75,7 +76,7 @@ void ext2_error(struct super_block *sb, const char *function,
if (test_opt(sb, ERRORS_RO)) {
ext2_msg(sb, KERN_CRIT,
"error: remounting filesystem read-only");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
}
@@ -184,7 +185,7 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
if (!ei)
return NULL;
ei->i_block_alloc_info = NULL;
- ei->vfs_inode.i_version = 1;
+ inode_set_iversion(&ei->vfs_inode, 1);
#ifdef CONFIG_QUOTA
memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
#endif
@@ -656,7 +657,7 @@ static int ext2_setup_super (struct super_block * sb,
ext2_msg(sb, KERN_ERR,
"error: revision level too high, "
"forcing read-only mode");
- res = MS_RDONLY;
+ res = SB_RDONLY;
}
if (read_only)
return res;
@@ -924,9 +925,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_resuid = opts.s_resuid;
sbi->s_resgid = opts.s_resgid;
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
- MS_POSIXACL : 0);
+ SB_POSIXACL : 0);
sb->s_iflags |= SB_I_CGROUPWB;
if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
@@ -1178,7 +1179,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
ext2_msg(sb, KERN_WARNING,
"warning: mounting ext3 filesystem as ext2");
if (ext2_setup_super (sb, es, sb_rdonly(sb)))
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
ext2_write_super(sb);
return 0;
@@ -1341,9 +1342,9 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
"dax flag with busy inodes while remounting");
new_opts.s_mount_opt ^= EXT2_MOUNT_DAX;
}
- if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+ if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out_set;
- if (*flags & MS_RDONLY) {
+ if (*flags & SB_RDONLY) {
if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
!(sbi->s_mount_state & EXT2_VALID_FS))
goto out_set;
@@ -1379,7 +1380,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
*/
sbi->s_mount_state = le16_to_cpu(es->s_state);
if (!ext2_setup_super (sb, es, 0))
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
spin_unlock(&sbi->s_lock);
ext2_write_super(sb);
@@ -1392,8 +1393,8 @@ out_set:
sbi->s_mount_opt = new_opts.s_mount_opt;
sbi->s_resuid = new_opts.s_resuid;
sbi->s_resgid = new_opts.s_resgid;
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+ ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0);
spin_unlock(&sbi->s_lock);
return 0;
@@ -1569,7 +1570,7 @@ out:
return err;
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
- inode->i_version++;
+ inode_inc_iversion(inode);
inode->i_mtime = inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
return len - towrite;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d5babc9f222b..afda0a0499ce 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -25,6 +25,7 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/slab.h>
+#include <linux/iversion.h>
#include "ext4.h"
#include "xattr.h"
@@ -208,7 +209,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (file->f_version != inode->i_version) {
+ if (inode_cmp_iversion(inode, file->f_version)) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ext4_dir_entry_2 *)
(bh->b_data + i);
@@ -227,7 +228,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
offset = i;
ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- file->f_version = inode->i_version;
+ file->f_version = inode_query_iversion(inode);
}
while (ctx->pos < inode->i_size
@@ -568,10 +569,10 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
* cached entries.
*/
if ((!info->curr_node) ||
- (file->f_version != inode->i_version)) {
+ inode_cmp_iversion(inode, file->f_version)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
- file->f_version = inode->i_version;
+ file->f_version = inode_query_iversion(inode);
ret = ext4_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 07bca11749d4..c941251ac0c0 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4722,6 +4722,7 @@ retry:
EXT4_INODE_EOFBLOCKS);
}
ext4_mark_inode_dirty(handle, inode);
+ ext4_update_inode_fsync_trans(handle, inode, 1);
ret2 = ext4_journal_stop(handle);
if (ret2)
break;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b4267d72f249..b32cf263750d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -816,6 +816,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
+ if (IS_ERR(p))
+ return ERR_CAST(p);
if (p) {
int acl_size = p->a_count * sizeof(ext4_acl_entry);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 1367553c43bb..a8b987b71173 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -14,6 +14,7 @@
#include <linux/iomap.h>
#include <linux/fiemap.h>
+#include <linux/iversion.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -1042,7 +1043,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
*/
dir->i_mtime = dir->i_ctime = current_time(dir);
ext4_update_dx_flag(dir);
- dir->i_version++;
+ inode_inc_iversion(dir);
return 1;
}
@@ -1494,7 +1495,7 @@ int ext4_read_inline_dir(struct file *file,
* dirent right now. Scan from the start of the inline
* dir to make sure.
*/
- if (file->f_version != inode->i_version) {
+ if (inode_cmp_iversion(inode, file->f_version)) {
for (i = 0; i < extra_size && i < offset;) {
/*
* "." is with offset 0 and
@@ -1526,7 +1527,7 @@ int ext4_read_inline_dir(struct file *file,
}
offset = i;
ctx->pos = offset;
- file->f_version = inode->i_version;
+ file->f_version = inode_query_iversion(inode);
}
while (ctx->pos < extra_size) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0992d76f7ab1..0eff5b761c6e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,6 +39,7 @@
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/iomap.h>
+#include <linux/iversion.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -149,6 +150,15 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
*/
int ext4_inode_is_fast_symlink(struct inode *inode)
{
+ if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ int ea_blocks = EXT4_I(inode)->i_file_acl ?
+ EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
+
+ if (ext4_has_inline_data(inode))
+ return 0;
+
+ return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
+ }
return S_ISLNK(inode->i_mode) && inode->i_size &&
(inode->i_size < EXT4_N_BLOCKS * 4);
}
@@ -2742,7 +2752,7 @@ static int ext4_writepages(struct address_space *mapping,
* If the filesystem has aborted, it is read-only, so return
* right away instead of dumping stack traces later on that
* will obscure the real source of the problem. We test
- * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
+ * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because
* the latter could be true if the filesystem is mounted
* read-only, and in that case, ext4_writepages should
* *never* be called, so if that ever happens, we would want
@@ -4873,12 +4883,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
- inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+ u64 ivers = le32_to_cpu(raw_inode->i_disk_version);
+
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- inode->i_version |=
+ ivers |=
(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
}
+ inode_set_iversion_queried(inode, ivers);
}
ret = 0;
@@ -5164,11 +5176,13 @@ static int ext4_do_update_inode(handle_t *handle,
}
if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
- raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+ u64 ivers = inode_peek_iversion(inode);
+
+ raw_inode->i_disk_version = cpu_to_le32(ivers);
if (ei->i_extra_isize) {
if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
raw_inode->i_version_hi =
- cpu_to_le32(inode->i_version >> 32);
+ cpu_to_le32(ivers >> 32);
raw_inode->i_extra_isize =
cpu_to_le16(ei->i_extra_isize);
}
@@ -5183,7 +5197,7 @@ static int ext4_do_update_inode(handle_t *handle,
ext4_inode_csum_set(inode, raw_inode, ei);
spin_unlock(&ei->i_raw_lock);
- if (inode->i_sb->s_flags & MS_LAZYTIME)
+ if (inode->i_sb->s_flags & SB_LAZYTIME)
ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
bh->b_data);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 1eec25014f62..7e99ad02f1ba 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -19,6 +19,7 @@
#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/delay.h>
+#include <linux/iversion.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include <linux/fsmap.h>
@@ -144,7 +145,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
i_gid_write(inode_bl, 0);
inode_bl->i_flags = 0;
ei_bl->i_flags = 0;
- inode_bl->i_version = 1;
+ inode_set_iversion(inode_bl, 1);
i_size_write(inode_bl, 0);
inode_bl->i_mode = S_IFREG;
if (ext4_has_feature_extents(sb)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 798b3ac680db..6660686e505a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -34,6 +34,7 @@
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
+#include <linux/iversion.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@@ -1399,6 +1400,10 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
"falling back\n"));
}
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
+ if (!nblocks) {
+ ret = NULL;
+ goto cleanup_and_exit;
+ }
start = EXT4_I(dir)->i_dir_start_lookup;
if (start >= nblocks)
start = 0;
@@ -2955,7 +2960,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
"empty directory '%.*s' has too many links (%u)",
dentry->d_name.len, dentry->d_name.name,
inode->i_nlink);
- inode->i_version++;
+ inode_inc_iversion(inode);
clear_nlink(inode);
/* There's no need to set i_disksize: the fact that i_nlink is
* zero will ensure that the right thing happens during any
@@ -3361,7 +3366,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
ent->de->inode = cpu_to_le32(ino);
if (ext4_has_feature_filetype(ent->dir->i_sb))
ent->de->file_type = file_type;
- ent->dir->i_version++;
+ inode_inc_iversion(ent->dir);
ent->dir->i_ctime = ent->dir->i_mtime =
current_time(ent->dir);
ext4_mark_inode_dirty(handle, ent->dir);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0556cd036b69..5de959fb0244 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -40,6 +40,7 @@
#include <linux/dax.h>
#include <linux/cleancache.h>
#include <linux/uaccess.h>
+#include <linux/iversion.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -422,7 +423,7 @@ static void ext4_handle_error(struct super_block *sb)
* before ->s_flags update
*/
smp_wmb();
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
if (test_opt(sb, ERRORS_PANIC)) {
if (EXT4_SB(sb)->s_journal &&
@@ -635,7 +636,7 @@ void __ext4_abort(struct super_block *sb, const char *function,
* before ->s_flags update
*/
smp_wmb();
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
if (EXT4_SB(sb)->s_journal)
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
save_error_info(sb, function, line);
@@ -967,7 +968,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
if (!ei)
return NULL;
- ei->vfs_inode.i_version = 1;
+ inode_set_iversion(&ei->vfs_inode, 1);
spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
@@ -1682,10 +1683,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
sb->s_flags |= SB_I_VERSION;
return 1;
case Opt_lazytime:
- sb->s_flags |= MS_LAZYTIME;
+ sb->s_flags |= SB_LAZYTIME;
return 1;
case Opt_nolazytime:
- sb->s_flags &= ~MS_LAZYTIME;
+ sb->s_flags &= ~SB_LAZYTIME;
return 1;
}
@@ -2116,7 +2117,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
ext4_msg(sb, KERN_ERR, "revision level too high, "
"forcing read-only mode");
- res = MS_RDONLY;
+ res = SB_RDONLY;
}
if (read_only)
goto done;
@@ -2429,7 +2430,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
/* don't clear list on RO mount w/ errors */
- if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
+ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
"clearing orphan list.\n");
es->s_last_orphan = 0;
@@ -2438,19 +2439,19 @@ static void ext4_orphan_cleanup(struct super_block *sb,
return;
}
- if (s_flags & MS_RDONLY) {
+ if (s_flags & SB_RDONLY) {
ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
}
#ifdef CONFIG_QUOTA
/* Needed for iput() to work correctly and not trash data */
- sb->s_flags |= MS_ACTIVE;
+ sb->s_flags |= SB_ACTIVE;
/*
* Turn on quotas which were not enabled for read-only mounts if
* filesystem has quota feature, so that they are updated correctly.
*/
- if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) {
+ if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
int ret = ext4_enable_quotas(sb);
if (!ret)
@@ -2539,7 +2540,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
}
}
#endif
- sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+ sb->s_flags = s_flags; /* Restore SB_RDONLY status */
}
/*
@@ -2741,7 +2742,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
if (ext4_has_feature_readonly(sb)) {
ext4_msg(sb, KERN_INFO, "filesystem is read-only");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
return 1;
}
@@ -3623,8 +3624,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_iflags |= SB_I_CGROUPWB;
}
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+ (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
(ext4_has_compat_features(sb) ||
@@ -4199,7 +4200,7 @@ no_journal:
}
if (ext4_setup_super(sb, es, sb_rdonly(sb)))
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
/* determine the minimum size of new large inodes, if present */
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
@@ -4693,7 +4694,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
* the clock is set in the future, and this will cause e2fsck
* to complain and force a full file system check.
*/
- if (!(sb->s_flags & MS_RDONLY))
+ if (!(sb->s_flags & SB_RDONLY))
es->s_wtime = cpu_to_le32(get_seconds());
if (sb->s_bdev->bd_part)
es->s_kbytes_written =
@@ -5047,8 +5048,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
ext4_abort(sb, "Abort forced by user");
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+ (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
es = sbi->s_es;
@@ -5057,16 +5058,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
}
- if (*flags & MS_LAZYTIME)
- sb->s_flags |= MS_LAZYTIME;
+ if (*flags & SB_LAZYTIME)
+ sb->s_flags |= SB_LAZYTIME;
- if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) {
+ if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
err = -EROFS;
goto restore_opts;
}
- if (*flags & MS_RDONLY) {
+ if (*flags & SB_RDONLY) {
err = sync_filesystem(sb);
if (err < 0)
goto restore_opts;
@@ -5078,7 +5079,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
* First of all, the unconditional stuff we have to do
* to disable replay of the journal when we next remount
*/
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
/*
* OK, test if we are remounting a valid rw partition
@@ -5140,7 +5141,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
ext4_clear_journal_err(sb, es);
sbi->s_mount_state = le16_to_cpu(es->s_state);
if (!ext4_setup_super(sb, es, 0))
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
if (ext4_has_feature_mmp(sb))
if (ext4_multi_mount_protect(sb,
le64_to_cpu(es->s_mmp_block))) {
@@ -5164,7 +5165,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
ext4_setup_system_zone(sb);
- if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
+ if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY))
ext4_commit_super(sb, 1);
#ifdef CONFIG_QUOTA
@@ -5182,7 +5183,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
#endif
- *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
+ *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
kfree(orig_data);
return 0;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 218a7ba57819..63656dbafdc4 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -56,6 +56,7 @@
#include <linux/slab.h>
#include <linux/mbcache.h>
#include <linux/quotaops.h>
+#include <linux/iversion.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
@@ -294,13 +295,13 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
{
return ((u64)ea_inode->i_ctime.tv_sec << 32) |
- ((u32)ea_inode->i_version);
+ (u32) inode_peek_iversion_raw(ea_inode);
}
static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
{
ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32);
- ea_inode->i_version = (u32)ref_count;
+ inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff);
}
static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index dd2e73e10857..4aa69bc1c70a 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -617,17 +617,17 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
return 0;
- if (s_flags & MS_RDONLY) {
+ if (s_flags & SB_RDONLY) {
f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs");
- sbi->sb->s_flags &= ~MS_RDONLY;
+ sbi->sb->s_flags &= ~SB_RDONLY;
}
#ifdef CONFIG_QUOTA
/* Needed for iput() to work correctly and not trash data */
- sbi->sb->s_flags |= MS_ACTIVE;
+ sbi->sb->s_flags |= SB_ACTIVE;
/* Turn on quotas so that they are updated correctly */
- quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY);
+ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY);
#endif
start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
@@ -658,7 +658,7 @@ out:
if (quota_enabled)
f2fs_quota_off_umount(sbi->sb);
#endif
- sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+ sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
return err;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 516fa0d3ff9c..455f086cce3d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -56,7 +56,7 @@ static void f2fs_read_end_io(struct bio *bio)
int i;
#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) {
+ if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) {
f2fs_show_injection_info(FAULT_IO);
bio->bi_status = BLK_STS_IOERR;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index f4e094e816c6..6abf26c31d01 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -2378,7 +2378,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
static inline int f2fs_readonly(struct super_block *sb)
{
- return sb->s_flags & MS_RDONLY;
+ return sb->s_flags & SB_RDONLY;
}
static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 5d5bba462f26..d844dcb80570 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1005,7 +1005,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
cpc.reason = __get_cp_reason(sbi);
gc_more:
- if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) {
+ if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
ret = -EINVAL;
goto stop;
}
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 92c57ace1939..b3a14b0429f2 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -598,16 +598,16 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
int quota_enabled;
#endif
- if (s_flags & MS_RDONLY) {
+ if (s_flags & SB_RDONLY) {
f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs");
- sbi->sb->s_flags &= ~MS_RDONLY;
+ sbi->sb->s_flags &= ~SB_RDONLY;
}
#ifdef CONFIG_QUOTA
/* Needed for iput() to work correctly and not trash data */
- sbi->sb->s_flags |= MS_ACTIVE;
+ sbi->sb->s_flags |= SB_ACTIVE;
/* Turn on quotas so that they are updated correctly */
- quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY);
+ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY);
#endif
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
@@ -671,7 +671,7 @@ out:
if (quota_enabled)
f2fs_quota_off_umount(sbi->sb);
#endif
- sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+ sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
return ret ? ret: err;
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a6c5dd450002..708155d9c2e4 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -534,10 +534,10 @@ static int parse_options(struct super_block *sb, char *options)
#endif
break;
case Opt_lazytime:
- sb->s_flags |= MS_LAZYTIME;
+ sb->s_flags |= SB_LAZYTIME;
break;
case Opt_nolazytime:
- sb->s_flags &= ~MS_LAZYTIME;
+ sb->s_flags &= ~SB_LAZYTIME;
break;
#ifdef CONFIG_QUOTA
case Opt_quota:
@@ -1168,7 +1168,7 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, INLINE_DENTRY);
set_opt(sbi, EXTENT_CACHE);
set_opt(sbi, NOHEAP);
- sbi->sb->s_flags |= MS_LAZYTIME;
+ sbi->sb->s_flags |= SB_LAZYTIME;
set_opt(sbi, FLUSH_MERGE);
if (f2fs_sb_mounted_blkzoned(sbi->sb)) {
set_opt_mode(sbi, F2FS_MOUNT_LFS);
@@ -1236,7 +1236,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
#endif
/* recover superblocks we couldn't write due to previous RO mount */
- if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
+ if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
err = f2fs_commit_super(sbi, false);
f2fs_msg(sb, KERN_INFO,
"Try to recover all the superblocks, ret: %d", err);
@@ -1255,17 +1255,17 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* Previous and new state of filesystem is RO,
* so skip checking GC and FLUSH_MERGE conditions.
*/
- if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
+ if (f2fs_readonly(sb) && (*flags & SB_RDONLY))
goto skip;
#ifdef CONFIG_QUOTA
- if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) {
+ if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) {
err = dquot_suspend(sb, -1);
if (err < 0)
goto restore_opts;
} else {
/* dquot_resume needs RW */
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
if (sb_any_quota_suspended(sb)) {
dquot_resume(sb, -1);
} else if (f2fs_sb_has_quota_ino(sb)) {
@@ -1288,7 +1288,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* or if background_gc = off is passed in mount
* option. Also sync the filesystem.
*/
- if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
+ if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) {
if (sbi->gc_thread) {
stop_gc_thread(sbi);
need_restart_gc = true;
@@ -1300,7 +1300,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
need_stop_gc = true;
}
- if (*flags & MS_RDONLY) {
+ if (*flags & SB_RDONLY) {
writeback_inodes_sb(sb, WB_REASON_SYNC);
sync_inodes_sb(sb);
@@ -1314,7 +1314,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* We stop issue flush thread if FS is mounted as RO
* or if flush_merge is not passed in mount option.
*/
- if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
+ if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
clear_opt(sbi, FLUSH_MERGE);
destroy_flush_cmd_control(sbi, false);
} else {
@@ -1329,8 +1329,8 @@ skip:
kfree(s_qf_names[i]);
#endif
/* Update the POSIXACL Flag */
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+ (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
return 0;
restore_gc:
@@ -2472,8 +2472,8 @@ try_onemore:
sb->s_export_op = &f2fs_export_ops;
sb->s_magic = F2FS_SUPER_MAGIC;
sb->s_time_gran = 1;
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+ (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
/* init f2fs-specific super block info */
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index b833ffeee1e1..8e100c3bf72c 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/uaccess.h>
+#include <linux/iversion.h>
#include "fat.h"
/*
@@ -1055,7 +1056,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
brelse(bh);
if (err)
return err;
- dir->i_version++;
+ inode_inc_iversion(dir);
if (nr_slots) {
/*
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 48b2336692f9..bac10de678cc 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -392,7 +392,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
set_buffer_uptodate(c_bh);
mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
err = sync_dirty_buffer(c_bh);
brelse(c_bh);
if (err)
@@ -597,7 +597,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
}
if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) {
- if (sb->s_flags & MS_SYNCHRONOUS) {
+ if (sb->s_flags & SB_SYNCHRONOUS) {
err = fat_sync_bhs(bhs, nr_bhs);
if (err)
goto error;
@@ -612,7 +612,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
fat_collect_bhs(bhs, &nr_bhs, &fatent);
} while (cluster != FAT_ENT_EOF);
- if (sb->s_flags & MS_SYNCHRONOUS) {
+ if (sb->s_flags & SB_SYNCHRONOUS) {
err = fat_sync_bhs(bhs, nr_bhs);
if (err)
goto error;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 30c52394a7ad..ffbbf0520d9e 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -20,6 +20,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <asm/unaligned.h>
+#include <linux/iversion.h>
#include "fat.h"
#ifndef CONFIG_FAT_DEFAULT_IOCHARSET
@@ -507,7 +508,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
MSDOS_I(inode)->i_pos = 0;
inode->i_uid = sbi->options.fs_uid;
inode->i_gid = sbi->options.fs_gid;
- inode->i_version++;
+ inode_inc_iversion(inode);
inode->i_generation = get_seconds();
if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) {
@@ -590,7 +591,7 @@ struct inode *fat_build_inode(struct super_block *sb,
goto out;
}
inode->i_ino = iunique(sb, MSDOS_ROOT_INO);
- inode->i_version = 1;
+ inode_set_iversion(inode, 1);
err = fat_fill_inode(inode, de);
if (err) {
iput(inode);
@@ -779,14 +780,14 @@ static void __exit fat_destroy_inodecache(void)
static int fat_remount(struct super_block *sb, int *flags, char *data)
{
- int new_rdonly;
+ bool new_rdonly;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
- *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME);
+ *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME);
sync_filesystem(sb);
/* make sure we update state on remount. */
- new_rdonly = *flags & MS_RDONLY;
+ new_rdonly = *flags & SB_RDONLY;
if (new_rdonly != sb_rdonly(sb)) {
if (new_rdonly)
fat_set_state(sb, 0, 0);
@@ -1352,7 +1353,7 @@ out:
if (opts->unicode_xlate)
opts->utf8 = 0;
if (opts->nfs == FAT_NFS_NOSTALE_RO) {
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
sb->s_export_op = &fat_export_ops_nostale;
}
@@ -1377,7 +1378,7 @@ static int fat_read_root(struct inode *inode)
MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO;
inode->i_uid = sbi->options.fs_uid;
inode->i_gid = sbi->options.fs_gid;
- inode->i_version++;
+ inode_inc_iversion(inode);
inode->i_generation = 0;
inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO);
inode->i_op = sbi->dir_ops;
@@ -1608,7 +1609,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
return -ENOMEM;
sb->s_fs_info = sbi;
- sb->s_flags |= MS_NODIRATIME;
+ sb->s_flags |= SB_NODIRATIME;
sb->s_magic = MSDOS_SUPER_MAGIC;
sb->s_op = &fat_sops;
sb->s_export_op = &fat_export_ops;
@@ -1828,7 +1829,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
if (!root_inode)
goto out_fail;
root_inode->i_ino = MSDOS_ROOT_INO;
- root_inode->i_version = 1;
+ inode_set_iversion(root_inode, 1);
error = fat_read_root(root_inode);
if (error < 0) {
iput(root_inode);
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index acc3aa30ee54..f9bdc1e01c98 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -33,7 +33,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
if (opts->errors == FAT_ERRORS_PANIC)
panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
else if (opts->errors == FAT_ERRORS_RO && !sb_rdonly(sb)) {
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
fat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
}
}
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 7d6a105d601b..582ca731a6c9 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -7,6 +7,7 @@
*/
#include <linux/module.h>
+#include <linux/iversion.h>
#include "fat.h"
/* Characters that are undesirable in an MS-DOS file name */
@@ -480,7 +481,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
} else
mark_inode_dirty(old_inode);
- old_dir->i_version++;
+ inode_inc_iversion(old_dir);
old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
if (IS_DIRSYNC(old_dir))
(void)fat_sync_inode(old_dir);
@@ -508,7 +509,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
goto out;
new_i_pos = sinfo.i_pos;
}
- new_dir->i_version++;
+ inode_inc_iversion(new_dir);
fat_detach(old_inode);
fat_attach(old_inode, new_i_pos);
@@ -540,7 +541,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
old_sinfo.bh = NULL;
if (err)
goto error_dotdot;
- old_dir->i_version++;
+ inode_inc_iversion(old_dir);
old_dir->i_ctime = old_dir->i_mtime = ts;
if (IS_DIRSYNC(old_dir))
(void)fat_sync_inode(old_dir);
@@ -646,7 +647,7 @@ static void setup(struct super_block *sb)
{
MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations;
sb->s_d_op = &msdos_dentry_operations;
- sb->s_flags |= MS_NOATIME;
+ sb->s_flags |= SB_NOATIME;
}
static int msdos_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 02c066663a3a..cefea792cde8 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -20,7 +20,7 @@
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/kernel.h>
-
+#include <linux/iversion.h>
#include "fat.h"
static inline unsigned long vfat_d_version(struct dentry *dentry)
@@ -46,7 +46,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
{
int ret = 1;
spin_lock(&dentry->d_lock);
- if (vfat_d_version(dentry) != d_inode(dentry->d_parent)->i_version)
+ if (inode_cmp_iversion(d_inode(dentry->d_parent), vfat_d_version(dentry)))
ret = 0;
spin_unlock(&dentry->d_lock);
return ret;
@@ -759,7 +759,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
if (!inode)
- vfat_d_version_set(dentry, dir->i_version);
+ vfat_d_version_set(dentry, inode_query_iversion(dir));
return d_splice_alias(inode, dentry);
error:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -781,7 +781,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
if (err)
goto out;
- dir->i_version++;
+ inode_inc_iversion(dir);
inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
brelse(sinfo.bh);
@@ -789,7 +789,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
err = PTR_ERR(inode);
goto out;
}
- inode->i_version++;
+ inode_inc_iversion(inode);
inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
@@ -823,7 +823,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
clear_nlink(inode);
inode->i_mtime = inode->i_atime = current_time(inode);
fat_detach(inode);
- vfat_d_version_set(dentry, dir->i_version);
+ vfat_d_version_set(dentry, inode_query_iversion(dir));
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -849,7 +849,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
clear_nlink(inode);
inode->i_mtime = inode->i_atime = current_time(inode);
fat_detach(inode);
- vfat_d_version_set(dentry, dir->i_version);
+ vfat_d_version_set(dentry, inode_query_iversion(dir));
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -875,7 +875,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo);
if (err)
goto out_free;
- dir->i_version++;
+ inode_inc_iversion(dir);
inc_nlink(dir);
inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
@@ -885,7 +885,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
/* the directory was completed, just return a error */
goto out;
}
- inode->i_version++;
+ inode_inc_iversion(inode);
set_nlink(inode, 2);
inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
@@ -951,7 +951,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out;
new_i_pos = sinfo.i_pos;
}
- new_dir->i_version++;
+ inode_inc_iversion(new_dir);
fat_detach(old_inode);
fat_attach(old_inode, new_i_pos);
@@ -979,7 +979,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
old_sinfo.bh = NULL;
if (err)
goto error_dotdot;
- old_dir->i_version++;
+ inode_inc_iversion(old_dir);
old_dir->i_ctime = old_dir->i_mtime = ts;
if (IS_DIRSYNC(old_dir))
(void)fat_sync_inode(old_dir);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 0522e283a4f4..c17369659f4a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -737,6 +737,7 @@ static void send_sigio_to_task(struct task_struct *p,
delivered even if we can't queue. Failure to
queue in this case _should_ be reported; we fall
back to SIGIO in that case. --sct */
+ clear_siginfo(&si);
si.si_signo = signum;
si.si_errno = 0;
si.si_code = reason;
diff --git a/fs/file.c b/fs/file.c
index 3b080834b870..fc0eeb812e2c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -391,7 +391,7 @@ static struct fdtable *close_files(struct files_struct * files)
struct file * file = xchg(&fdt->fd[i], NULL);
if (file) {
filp_close(file, files);
- cond_resched_rcu_qs();
+ cond_resched();
}
}
i++;
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 455ce5b77e9b..f989efa051a0 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -116,7 +116,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
static int vxfs_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- *flags |= MS_RDONLY;
+ *flags |= SB_RDONLY;
return 0;
}
@@ -220,7 +220,7 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
int ret = -EINVAL;
u32 j;
- sbp->s_flags |= MS_RDONLY;
+ sbp->s_flags |= SB_RDONLY;
infp = kzalloc(sizeof(*infp), GFP_KERNEL);
if (!infp) {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 08f5debd07d1..d4d04fee568a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -126,7 +126,7 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb)
* inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
* @inode: inode to be moved
* @wb: target bdi_writeback
- * @head: one of @wb->b_{dirty|io|more_io}
+ * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
*
* Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
* Returns %true if @inode is the first occupant of the !dirty_time IO
@@ -490,7 +490,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
- if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+ if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
inode->i_state & (I_WB_SWITCH | I_FREEING) ||
inode_to_wb(inode) == isw->new_wb) {
spin_unlock(&inode->i_lock);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2f504d615d92..624f18bbfd2b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -130,7 +130,7 @@ static void fuse_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
- if (inode->i_sb->s_flags & MS_ACTIVE) {
+ if (inode->i_sb->s_flags & SB_ACTIVE) {
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
@@ -141,7 +141,7 @@ static void fuse_evict_inode(struct inode *inode)
static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- if (*flags & MS_MANDLOCK)
+ if (*flags & SB_MANDLOCK)
return -EINVAL;
return 0;
@@ -1056,10 +1056,10 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
int is_bdev = sb->s_bdev != NULL;
err = -EINVAL;
- if (sb->s_flags & MS_MANDLOCK)
+ if (sb->s_flags & SB_MANDLOCK)
goto err;
- sb->s_flags &= ~(MS_NOSEC | SB_I_VERSION);
+ sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
if (!parse_fuse_opt(data, &d, is_bdev))
goto err;
@@ -1109,9 +1109,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
goto err_dev_free;
/* Handle umasking inside the fuse code */
- if (sb->s_flags & MS_POSIXACL)
+ if (sb->s_flags & SB_POSIXACL)
fc->dont_mask = 1;
- sb->s_flags |= MS_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
fc->default_permissions = d.default_permissions;
fc->allow_other = d.allow_other;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a3711f543405..ad55eb86a250 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1065,15 +1065,15 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
sdp->sd_args = *args;
if (sdp->sd_args.ar_spectator) {
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
set_bit(SDF_RORECOVERY, &sdp->sd_flags);
}
if (sdp->sd_args.ar_posix_acl)
- sb->s_flags |= MS_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
if (sdp->sd_args.ar_nobarrier)
set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
- sb->s_flags |= MS_NOSEC;
+ sb->s_flags |= SB_NOSEC;
sb->s_magic = GFS2_MAGIC;
sb->s_op = &gfs2_super_ops;
sb->s_d_op = &gfs2_dops;
@@ -1257,7 +1257,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
struct gfs2_args args;
struct gfs2_sbd *sdp;
- if (!(flags & MS_RDONLY))
+ if (!(flags & SB_RDONLY))
mode |= FMODE_WRITE;
bdev = blkdev_get_by_path(dev_name, mode, fs_type);
@@ -1313,15 +1313,15 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
if (s->s_root) {
error = -EBUSY;
- if ((flags ^ s->s_flags) & MS_RDONLY)
+ if ((flags ^ s->s_flags) & SB_RDONLY)
goto error_super;
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
sb_set_blocksize(s, block_size(bdev));
- error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
+ error = fill_super(s, &args, flags & SB_SILENT ? 1 : 0);
if (error)
goto error_super;
- s->s_flags |= MS_ACTIVE;
+ s->s_flags |= SB_ACTIVE;
bdev->bd_super = s;
}
@@ -1365,7 +1365,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
pr_warn("gfs2 mount does not exist\n");
return ERR_CAST(s);
}
- if ((flags ^ s->s_flags) & MS_RDONLY) {
+ if ((flags ^ s->s_flags) & SB_RDONLY) {
deactivate_locked_super(s);
return ERR_PTR(-EBUSY);
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 9cb5c9a97d69..d81d46e19726 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1256,10 +1256,10 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
return -EINVAL;
if (sdp->sd_args.ar_spectator)
- *flags |= MS_RDONLY;
+ *flags |= SB_RDONLY;
- if ((sb->s_flags ^ *flags) & MS_RDONLY) {
- if (*flags & MS_RDONLY)
+ if ((sb->s_flags ^ *flags) & SB_RDONLY) {
+ if (*flags & SB_RDONLY)
error = gfs2_make_fs_ro(sdp);
else
error = gfs2_make_fs_rw(sdp);
@@ -1269,9 +1269,9 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
sdp->sd_args = args;
if (sdp->sd_args.ar_posix_acl)
- sb->s_flags |= MS_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
else
- sb->s_flags &= ~MS_POSIXACL;
+ sb->s_flags &= ~SB_POSIXACL;
if (sdp->sd_args.ar_nobarrier)
set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
else
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index a85ca8b2c9ba..ca8b72d0a831 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -117,7 +117,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
kfree(tr);
up_read(&sdp->sd_log_flush_lock);
- if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
+ if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS)
gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
if (alloced)
sb_end_intwrite(sdp->sd_vfs);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 894994d2c885..460281b1299e 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -204,11 +204,11 @@ int hfs_mdb_get(struct super_block *sb)
attrib = mdb->drAtrb;
if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. mounting read-only.\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) {
pr_warn("filesystem is marked locked, mounting read-only.\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
if (!sb_rdonly(sb)) {
/* Mark the volume uncleanly unmounted in case we crash */
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 7e0d65e9586c..173876782f73 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -114,18 +114,18 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
static int hfs_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- *flags |= MS_NODIRATIME;
- if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+ *flags |= SB_NODIRATIME;
+ if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
return 0;
- if (!(*flags & MS_RDONLY)) {
+ if (!(*flags & SB_RDONLY)) {
if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n");
- sb->s_flags |= MS_RDONLY;
- *flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
+ *flags |= SB_RDONLY;
} else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) {
pr_warn("filesystem is marked locked, leaving read-only.\n");
- sb->s_flags |= MS_RDONLY;
- *flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
+ *flags |= SB_RDONLY;
}
}
return 0;
@@ -407,7 +407,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &hfs_super_operations;
sb->s_xattr = hfs_xattr_handlers;
- sb->s_flags |= MS_NODIRATIME;
+ sb->s_flags |= SB_NODIRATIME;
mutex_init(&sbi->bitmap_lock);
res = hfs_mdb_get(sb);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index e5bb2de2262a..1d458b716957 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -329,9 +329,9 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+ if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
return 0;
- if (!(*flags & MS_RDONLY)) {
+ if (!(*flags & SB_RDONLY)) {
struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
int force = 0;
@@ -340,20 +340,20 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n");
- sb->s_flags |= MS_RDONLY;
- *flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
+ *flags |= SB_RDONLY;
} else if (force) {
/* nothing */
} else if (vhdr->attributes &
cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
pr_warn("filesystem is marked locked, leaving read-only.\n");
- sb->s_flags |= MS_RDONLY;
- *flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
+ *flags |= SB_RDONLY;
} else if (vhdr->attributes &
cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
pr_warn("filesystem is marked journaled, leaving read-only.\n");
- sb->s_flags |= MS_RDONLY;
- *flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
+ *flags |= SB_RDONLY;
}
}
return 0;
@@ -455,16 +455,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. mounting read-only.\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
} else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
/* nothing */
} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
pr_warn("Filesystem is marked locked, mounting read-only.\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
} else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) &&
!sb_rdonly(sb)) {
pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
err = -EINVAL;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 8d6b7e35faf9..c83ece7facc5 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -150,7 +150,6 @@ static int hpfs_readdir(struct file *file, struct dir_context *ctx)
if (unlikely(ret < 0))
goto out;
ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
- file->f_version = inode->i_version;
}
next_pos = ctx->pos;
if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) {
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index 3b834563b1f1..a4ad18afbdec 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -419,7 +419,6 @@ int hpfs_add_dirent(struct inode *i,
c = 1;
goto ret;
}
- i->i_version++;
c = hpfs_add_to_dnode(i, dno, name, namelen, new_de, 0);
ret:
return c;
@@ -726,7 +725,6 @@ int hpfs_remove_dirent(struct inode *i, dnode_secno dno, struct hpfs_dirent *de,
return 2;
}
}
- i->i_version++;
for_all_poss(i, hpfs_pos_del, (t = get_pos(dnode, de)) + 1, 1);
hpfs_delete_de(i->i_sb, dnode, de);
hpfs_mark_4buffers_dirty(qbh);
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index e0e60b148400..7c49f1ef0c85 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -288,7 +288,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno,
goto bail;
}
if (((31 + de->namelen + de->down*4 + 3) & ~3) != le16_to_cpu(de->length)) {
- if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & MS_RDONLY) goto ok;
+ if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & SB_RDONLY) goto ok;
hpfs_error(s, "namelen does not match dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp);
goto bail;
}
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 1516fb4e28f4..f2c3ebcd309c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -78,7 +78,7 @@ void hpfs_error(struct super_block *s, const char *fmt, ...)
else {
pr_cont("; remounting read-only\n");
mark_dirty(s, 0);
- s->s_flags |= MS_RDONLY;
+ s->s_flags |= SB_RDONLY;
}
} else if (sb_rdonly(s))
pr_cont("; going on - but anything won't be destroyed because it's read-only\n");
@@ -235,7 +235,6 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb)
ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
- ei->vfs_inode.i_version = 1;
return &ei->vfs_inode;
}
@@ -457,7 +456,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
sync_filesystem(s);
- *flags |= MS_NOATIME;
+ *flags |= SB_NOATIME;
hpfs_lock(s);
uid = sbi->sb_uid; gid = sbi->sb_gid;
@@ -488,7 +487,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk;
sbi->sb_err = errs; sbi->sb_timeshift = timeshift;
- if (!(*flags & MS_RDONLY)) mark_dirty(s, 1);
+ if (!(*flags & SB_RDONLY)) mark_dirty(s, 1);
hpfs_unlock(s);
return 0;
@@ -614,7 +613,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
goto bail4;
}
- s->s_flags |= MS_NOATIME;
+ s->s_flags |= SB_NOATIME;
/* Fill superblock stuff */
s->s_magic = HPFS_SUPER_MAGIC;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 1e76730aac0d..8a85f3f53446 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -639,11 +639,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
/*
- * page_put due to reference from alloc_huge_page()
* unlock_page because locked by add_to_page_cache()
+ * page_put due to reference from alloc_huge_page()
*/
- put_page(page);
unlock_page(page);
+ put_page(page);
}
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
diff --git a/fs/inode.c b/fs/inode.c
index fd401028a309..e2ca0f4b5151 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,6 +18,7 @@
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
+#include <linux/iversion.h>
#include <trace/events/writeback.h>
#include "internal.h"
@@ -416,7 +417,7 @@ void inode_add_lru(struct inode *inode)
{
if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
I_FREEING | I_WILL_FREE)) &&
- !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
+ !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
inode_lru_list_add(inode);
}
@@ -595,7 +596,7 @@ static void dispose_list(struct list_head *head)
* @sb: superblock to operate on
*
* Make sure that no inodes with zero refcount are retained. This is
- * called by superblock shutdown after having MS_ACTIVE flag removed,
+ * called by superblock shutdown after having SB_ACTIVE flag removed,
* so any inode reaching zero refcount during or after that call will
* be immediately evicted.
*/
@@ -1492,7 +1493,7 @@ static void iput_final(struct inode *inode)
else
drop = generic_drop_inode(inode);
- if (!drop && (sb->s_flags & MS_ACTIVE)) {
+ if (!drop && (sb->s_flags & SB_ACTIVE)) {
inode_add_lru(inode);
spin_unlock(&inode->i_lock);
return;
@@ -1634,17 +1635,21 @@ static int relatime_need_update(const struct path *path, struct inode *inode,
int generic_update_time(struct inode *inode, struct timespec *time, int flags)
{
int iflags = I_DIRTY_TIME;
+ bool dirty = false;
if (flags & S_ATIME)
inode->i_atime = *time;
if (flags & S_VERSION)
- inode_inc_iversion(inode);
+ dirty = inode_maybe_inc_iversion(inode, false);
if (flags & S_CTIME)
inode->i_ctime = *time;
if (flags & S_MTIME)
inode->i_mtime = *time;
+ if ((flags & (S_ATIME | S_CTIME | S_MTIME)) &&
+ !(inode->i_sb->s_flags & SB_LAZYTIME))
+ dirty = true;
- if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
+ if (dirty)
iflags |= I_DIRTY_SYNC;
__mark_inode_dirty(inode, iflags);
return 0;
@@ -1691,7 +1696,7 @@ bool __atime_needs_update(const struct path *path, struct inode *inode,
if (IS_NOATIME(inode))
return false;
- if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
+ if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
return false;
if (mnt->mnt_flags & MNT_NOATIME)
@@ -1863,7 +1868,7 @@ int file_update_time(struct file *file)
if (!timespec_equal(&inode->i_ctime, &now))
sync_it |= S_CTIME;
- if (IS_I_VERSION(inode))
+ if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
sync_it |= S_VERSION;
if (!sync_it)
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 447a24d77b89..bc258a4402f6 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -114,7 +114,7 @@ static void destroy_inodecache(void)
static int isofs_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- if (!(*flags & MS_RDONLY))
+ if (!(*flags & SB_RDONLY))
return -EROFS;
return 0;
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index e96c6b05e43e..d8c274d39ddb 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -409,10 +409,10 @@ int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data)
mutex_unlock(&c->alloc_sem);
}
- if (!(*flags & MS_RDONLY))
+ if (!(*flags & SB_RDONLY))
jffs2_start_garbage_collect_thread(c);
- *flags |= MS_NOATIME;
+ *flags |= SB_NOATIME;
return 0;
}
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 824e61ede465..c2fbec19c616 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -59,7 +59,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
}
-#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY)
+#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & SB_RDONLY)
#define SECTOR_ADDR(x) ( (((unsigned long)(x) / c->sector_size) * c->sector_size) )
#ifndef CONFIG_JFFS2_FS_WRITEBUFFER
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 153f1c6eb169..f60dee7faf03 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -301,10 +301,10 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &jffs2_super_operations;
sb->s_export_op = &jffs2_export_ops;
- sb->s_flags = sb->s_flags | MS_NOATIME;
+ sb->s_flags = sb->s_flags | SB_NOATIME;
sb->s_xattr = jffs2_xattr_handlers;
#ifdef CONFIG_JFFS2_FS_POSIX_ACL
- sb->s_flags |= MS_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
#endif
ret = jffs2_do_fill_super(sb, data, silent);
return ret;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2f7b3af5b8b7..90373aebfdca 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -87,7 +87,7 @@ static void jfs_handle_error(struct super_block *sb)
else if (sbi->flag & JFS_ERR_REMOUNT_RO) {
jfs_err("ERROR: (device %s): remounting filesystem as read-only",
sb->s_id);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
/* nothing is done for continue beyond marking the superblock dirty */
@@ -477,7 +477,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
return rc;
}
- if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) {
+ if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) {
/*
* Invalidate any previously read metadata. fsck may have
* changed the on-disk data since we mounted r/o
@@ -488,12 +488,12 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
ret = jfs_mount_rw(sb, 1);
/* mark the fs r/w for quota activity */
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
dquot_resume(sb, -1);
return ret;
}
- if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) {
+ if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) {
rc = dquot_suspend(sb, -1);
if (rc < 0)
return rc;
@@ -545,7 +545,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
sbi->flag = flag;
#ifdef CONFIG_JFS_POSIX_ACL
- sb->s_flags |= MS_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
#endif
if (newLVSize) {
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 95a7c88baed9..26dd9a50f383 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -335,7 +335,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
deactivate_locked_super(sb);
return ERR_PTR(error);
}
- sb->s_flags |= MS_ACTIVE;
+ sb->s_flags |= SB_ACTIVE;
mutex_lock(&kernfs_mutex);
list_add(&info->node, &root->supers);
diff --git a/fs/libfs.c b/fs/libfs.c
index 3aabe553fc45..7ff3cb904acd 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -246,7 +246,7 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name,
struct inode *root;
struct qstr d_name = QSTR_INIT(name, strlen(name));
- s = sget_userns(fs_type, NULL, set_anon_super, MS_KERNMOUNT|MS_NOUSER,
+ s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER,
&init_user_ns, NULL);
if (IS_ERR(s))
return ERR_CAST(s);
@@ -277,7 +277,7 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name,
d_instantiate(dentry, root);
s->s_root = dentry;
s->s_d_op = dops;
- s->s_flags |= MS_ACTIVE;
+ s->s_flags |= SB_ACTIVE;
return dget(s->s_root);
Enomem:
@@ -578,7 +578,7 @@ int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *c
spin_lock(&pin_fs_lock);
if (unlikely(!*mount)) {
spin_unlock(&pin_fs_lock);
- mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL);
+ mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
spin_lock(&pin_fs_lock);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 0d4e590e0549..826a89184f90 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -578,8 +578,10 @@ static void nlm_complain_hosts(struct net *net)
if (ln->nrhosts == 0)
return;
- printk(KERN_WARNING "lockd: couldn't shutdown host module for net %p!\n", net);
- dprintk("lockd: %lu hosts left in net %p:\n", ln->nrhosts, net);
+ pr_warn("lockd: couldn't shutdown host module for net %x!\n",
+ net->ns.inum);
+ dprintk("lockd: %lu hosts left in net %x:\n", ln->nrhosts,
+ net->ns.inum);
} else {
if (nrhosts == 0)
return;
@@ -590,9 +592,9 @@ static void nlm_complain_hosts(struct net *net)
for_each_host(host, chain, nlm_server_hosts) {
if (net && host->net != net)
continue;
- dprintk(" %s (cnt %d use %d exp %ld net %p)\n",
+ dprintk(" %s (cnt %d use %d exp %ld net %x)\n",
host->h_name, atomic_read(&host->h_count),
- host->h_inuse, host->h_expires, host->net);
+ host->h_inuse, host->h_expires, host->net->ns.inum);
}
}
@@ -605,7 +607,8 @@ nlm_shutdown_hosts_net(struct net *net)
mutex_lock(&nlm_host_mutex);
/* First, make all hosts eligible for gc */
- dprintk("lockd: nuking all hosts in net %p...\n", net);
+ dprintk("lockd: nuking all hosts in net %x...\n",
+ net ? net->ns.inum : 0);
for_each_host(host, chain, nlm_server_hosts) {
if (net && host->net != net)
continue;
@@ -618,9 +621,8 @@ nlm_shutdown_hosts_net(struct net *net)
/* Then, perform a garbage collection pass */
nlm_gc_hosts(net);
- mutex_unlock(&nlm_host_mutex);
-
nlm_complain_hosts(net);
+ mutex_unlock(&nlm_host_mutex);
}
/*
@@ -646,7 +648,8 @@ nlm_gc_hosts(struct net *net)
struct hlist_node *next;
struct nlm_host *host;
- dprintk("lockd: host garbage collection for net %p\n", net);
+ dprintk("lockd: host garbage collection for net %x\n",
+ net ? net->ns.inum : 0);
for_each_host(host, chain, nlm_server_hosts) {
if (net && host->net != net)
continue;
@@ -662,9 +665,10 @@ nlm_gc_hosts(struct net *net)
if (atomic_read(&host->h_count) || host->h_inuse
|| time_before(jiffies, host->h_expires)) {
dprintk("nlm_gc_hosts skipping %s "
- "(cnt %d use %d exp %ld net %p)\n",
+ "(cnt %d use %d exp %ld net %x)\n",
host->h_name, atomic_read(&host->h_count),
- host->h_inuse, host->h_expires, host->net);
+ host->h_inuse, host->h_expires,
+ host->net->ns.inum);
continue;
}
nlm_destroy_host_locked(host);
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 9fbbd11f9ecb..96cfb2967ac7 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -110,7 +110,8 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
clnt = nsm_create(host->net, host->nodename);
if (IS_ERR(clnt)) {
dprintk("lockd: failed to create NSM upcall transport, "
- "status=%ld, net=%p\n", PTR_ERR(clnt), host->net);
+ "status=%ld, net=%x\n", PTR_ERR(clnt),
+ host->net->ns.inum);
return PTR_ERR(clnt);
}
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index a8e3777c94dc..9c36d614bf89 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -57,6 +57,9 @@ static struct task_struct *nlmsvc_task;
static struct svc_rqst *nlmsvc_rqst;
unsigned long nlmsvc_timeout;
+atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0);
+DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq);
+
unsigned int lockd_net_id;
/*
@@ -259,7 +262,7 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net)
if (error < 0)
goto err_bind;
set_grace_period(net);
- dprintk("lockd_up_net: per-net data created; net=%p\n", net);
+ dprintk("%s: per-net data created; net=%x\n", __func__, net->ns.inum);
return 0;
err_bind:
@@ -274,12 +277,15 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
if (ln->nlmsvc_users) {
if (--ln->nlmsvc_users == 0) {
nlm_shutdown_hosts_net(net);
+ cancel_delayed_work_sync(&ln->grace_period_end);
+ locks_end_grace(&ln->lockd_manager);
svc_shutdown_net(serv, net);
- dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
+ dprintk("%s: per-net data destroyed; net=%x\n",
+ __func__, net->ns.inum);
}
} else {
- printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n",
- nlmsvc_task, net);
+ pr_err("%s: no users! task=%p, net=%x\n",
+ __func__, nlmsvc_task, net->ns.inum);
BUG();
}
}
@@ -290,7 +296,8 @@ static int lockd_inetaddr_event(struct notifier_block *this,
struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
struct sockaddr_in sin;
- if (event != NETDEV_DOWN)
+ if ((event != NETDEV_DOWN) ||
+ !atomic_inc_not_zero(&nlm_ntf_refcnt))
goto out;
if (nlmsvc_rqst) {
@@ -301,6 +308,8 @@ static int lockd_inetaddr_event(struct notifier_block *this,
svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
(struct sockaddr *)&sin);
}
+ atomic_dec(&nlm_ntf_refcnt);
+ wake_up(&nlm_ntf_wq);
out:
return NOTIFY_DONE;
@@ -317,7 +326,8 @@ static int lockd_inet6addr_event(struct notifier_block *this,
struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
struct sockaddr_in6 sin6;
- if (event != NETDEV_DOWN)
+ if ((event != NETDEV_DOWN) ||
+ !atomic_inc_not_zero(&nlm_ntf_refcnt))
goto out;
if (nlmsvc_rqst) {
@@ -329,6 +339,8 @@ static int lockd_inet6addr_event(struct notifier_block *this,
svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
(struct sockaddr *)&sin6);
}
+ atomic_dec(&nlm_ntf_refcnt);
+ wake_up(&nlm_ntf_wq);
out:
return NOTIFY_DONE;
@@ -345,10 +357,12 @@ static void lockd_unregister_notifiers(void)
#if IS_ENABLED(CONFIG_IPV6)
unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
#endif
+ wait_event(nlm_ntf_wq, atomic_read(&nlm_ntf_refcnt) == 0);
}
static void lockd_svc_exit_thread(void)
{
+ atomic_dec(&nlm_ntf_refcnt);
lockd_unregister_notifiers();
svc_exit_thread(nlmsvc_rqst);
}
@@ -373,6 +387,7 @@ static int lockd_start_svc(struct svc_serv *serv)
goto out_rqst;
}
+ atomic_inc(&nlm_ntf_refcnt);
svc_sock_update_bufs(serv);
serv->sv_maxconn = nlm_max_connections;
@@ -676,6 +691,17 @@ static int lockd_init_net(struct net *net)
static void lockd_exit_net(struct net *net)
{
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
+
+ WARN_ONCE(!list_empty(&ln->lockd_manager.list),
+ "net %x %s: lockd_manager.list is not empty\n",
+ net->ns.inum, __func__);
+ WARN_ONCE(!list_empty(&ln->nsm_handles),
+ "net %x %s: nsm_handles list is not empty\n",
+ net->ns.inum, __func__);
+ WARN_ONCE(delayed_work_pending(&ln->grace_period_end),
+ "net %x %s: grace_period_end was not cancelled\n",
+ net->ns.inum, __func__);
}
static struct pernet_operations lockd_net_ops = {
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index a563ddbc19e6..4ec3d6e03e76 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -370,7 +370,7 @@ nlmsvc_mark_resources(struct net *net)
{
struct nlm_host hint;
- dprintk("lockd: nlmsvc_mark_resources for net %p\n", net);
+ dprintk("lockd: %s for net %x\n", __func__, net ? net->ns.inum : 0);
hint.net = net;
nlm_traverse_files(&hint, nlmsvc_mark_host, NULL);
}
diff --git a/fs/locks.c b/fs/locks.c
index 1bd71c4d663a..21b4dfa289ee 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -141,7 +141,7 @@
static inline bool is_remote_lock(struct file *filp)
{
- return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK));
+ return likely(!(filp->f_path.dentry->d_sb->s_flags & SB_NOREMOTELOCK));
}
static bool lease_breaking(struct file_lock *fl)
diff --git a/fs/mbcache.c b/fs/mbcache.c
index d818fd236787..b8b8b9ced9f8 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -269,6 +269,9 @@ static unsigned long mb_cache_count(struct shrinker *shrink,
struct mb_cache *cache = container_of(shrink, struct mb_cache,
c_shrink);
+ /* Unlikely, but not impossible */
+ if (unlikely(cache->c_entry_count < 0))
+ return 0;
return cache->c_entry_count;
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index b6829d679643..72e308c3e66b 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -125,9 +125,9 @@ static int minix_remount (struct super_block * sb, int * flags, char * data)
sync_filesystem(sb);
ms = sbi->s_ms;
- if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+ if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
return 0;
- if (*flags & MS_RDONLY) {
+ if (*flags & SB_RDONLY) {
if (ms->s_state & MINIX_VALID_FS ||
!(sbi->s_mount_state & MINIX_VALID_FS))
return 0;
diff --git a/fs/namei.c b/fs/namei.c
index f47118ed36e7..4e3fc58dae72 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1129,18 +1129,9 @@ static int follow_automount(struct path *path, struct nameidata *nd,
* of the daemon to instantiate them before they can be used.
*/
if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
- LOOKUP_OPEN | LOOKUP_CREATE |
- LOOKUP_AUTOMOUNT))) {
- /* Positive dentry that isn't meant to trigger an
- * automount, EISDIR will allow it to be used,
- * otherwise there's no mount here "now" so return
- * ENOENT.
- */
- if (path->dentry->d_inode)
- return -EISDIR;
- else
- return -ENOENT;
- }
+ LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
+ path->dentry->d_inode)
+ return -EISDIR;
nd->total_link_count++;
if (nd->total_link_count >= 40)
diff --git a/fs/namespace.c b/fs/namespace.c
index e158ec6b527b..9d1374ab6e06 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2826,6 +2826,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
SB_DIRSYNC |
SB_SILENT |
SB_POSIXACL |
+ SB_LAZYTIME |
SB_I_VERSION);
if (flags & MS_REMOUNT)
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 129f1937fa2c..41de88cdc053 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -103,7 +103,7 @@ static void destroy_inodecache(void)
static int ncp_remount(struct super_block *sb, int *flags, char* data)
{
sync_filesystem(sb);
- *flags |= MS_NODIRATIME;
+ *flags |= SB_NODIRATIME;
return 0;
}
@@ -547,7 +547,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
else
default_bufsize = 1024;
- sb->s_flags |= MS_NODIRATIME; /* probably even noatime */
+ sb->s_flags |= SB_NODIRATIME; /* probably even noatime */
sb->s_maxbytes = 0xFFFFFFFFU;
sb->s_blocksize = 1024; /* Eh... Is this correct? */
sb->s_blocksize_bits = 10;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 0ac2fb1c6b63..b9129e2befea 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -291,12 +291,23 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
const struct sockaddr *sap = data->addr;
struct nfs_net *nn = net_generic(data->net, nfs_net_id);
+again:
list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
/* Don't match clients that failed to initialise properly */
if (clp->cl_cons_state < 0)
continue;
+ /* If a client is still initializing then we need to wait */
+ if (clp->cl_cons_state > NFS_CS_READY) {
+ refcount_inc(&clp->cl_count);
+ spin_unlock(&nn->nfs_client_lock);
+ nfs_wait_client_init_complete(clp);
+ nfs_put_client(clp);
+ spin_lock(&nn->nfs_client_lock);
+ goto again;
+ }
+
/* Different NFS versions cannot share the same nfs_client */
if (clp->rpc_ops != data->nfs_mod->rpc_ops)
continue;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index ade44ca0c66c..d8b47624fee2 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -12,6 +12,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/iversion.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
@@ -347,7 +348,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
nfs4_stateid_copy(&delegation->stateid, &res->delegation);
delegation->type = res->delegation_type;
delegation->pagemod_limit = res->pagemod_limit;
- delegation->change_attr = inode->i_version;
+ delegation->change_attr = inode_peek_iversion_raw(inode);
delegation->cred = get_rpccred(cred);
delegation->inode = inode;
delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e51ae52ed14f..2f3f86726f5b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1256,7 +1256,7 @@ static int nfs_dentry_delete(const struct dentry *dentry)
/* Unhash it, so that ->d_iput() would be called */
return 1;
}
- if (!(dentry->d_sb->s_flags & MS_ACTIVE)) {
+ if (!(dentry->d_sb->s_flags & SB_ACTIVE)) {
/* Unhash it, so that ancestors of killed async unlink
* files will be cleaned up during umount */
return 1;
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 3025fe8584a0..0ee4b93d36ea 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -16,6 +16,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_fs_sb.h>
#include <linux/in6.h>
+#include <linux/iversion.h>
#include "internal.h"
#include "fscache.h"
@@ -211,7 +212,7 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
auxdata.ctime = nfsi->vfs_inode.i_ctime;
if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
- auxdata.change_attr = nfsi->vfs_inode.i_version;
+ auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
if (bufmax > sizeof(auxdata))
bufmax = sizeof(auxdata);
@@ -243,7 +244,7 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
auxdata.ctime = nfsi->vfs_inode.i_ctime;
if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
- auxdata.change_attr = nfsi->vfs_inode.i_version;
+ auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
if (memcmp(data, &auxdata, datalen) != 0)
return FSCACHE_CHECKAUX_OBSOLETE;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 38b93d54c02e..93552c482992 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -38,8 +38,8 @@
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/freezer.h>
-
#include <linux/uaccess.h>
+#include <linux/iversion.h>
#include "nfs4_fs.h"
#include "callback.h"
@@ -483,7 +483,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
memset(&inode->i_atime, 0, sizeof(inode->i_atime));
memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
- inode->i_version = 0;
+ inode_set_iversion_raw(inode, 0);
inode->i_size = 0;
clear_nlink(inode);
inode->i_uid = make_kuid(&init_user_ns, -2);
@@ -508,7 +508,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
else if (nfs_server_capable(inode, NFS_CAP_CTIME))
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
- inode->i_version = fattr->change_attr;
+ inode_set_iversion_raw(inode, fattr->change_attr);
else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_PAGECACHE);
@@ -752,7 +752,7 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
* Note that we only have to check the vfsmount flags here:
* - NFS always sets S_NOATIME by so checking it would give a
* bogus result
- * - NFS never sets MS_NOATIME or MS_NODIRATIME so there is
+ * - NFS never sets SB_NOATIME or SB_NODIRATIME so there is
* no point in checking those.
*/
if ((path->mnt->mnt_flags & MNT_NOATIME) ||
@@ -1289,8 +1289,8 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
&& (fattr->valid & NFS_ATTR_FATTR_CHANGE)
- && inode->i_version == fattr->pre_change_attr) {
- inode->i_version = fattr->change_attr;
+ && !inode_cmp_iversion_raw(inode, fattr->pre_change_attr)) {
+ inode_set_iversion_raw(inode, fattr->change_attr);
if (S_ISDIR(inode->i_mode))
nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
ret |= NFS_INO_INVALID_ATTR;
@@ -1348,7 +1348,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if (!nfs_file_has_buffered_writers(nfsi)) {
/* Verify a few of the more important attributes */
- if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr)
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode_cmp_iversion_raw(inode, fattr->change_attr))
invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE;
if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
@@ -1642,7 +1642,7 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa
}
if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
(fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
- fattr->pre_change_attr = inode->i_version;
+ fattr->pre_change_attr = inode_peek_iversion_raw(inode);
fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
}
if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
@@ -1778,7 +1778,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* More cache consistency checks */
if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
- if (inode->i_version != fattr->change_attr) {
+ if (inode_cmp_iversion_raw(inode, fattr->change_attr)) {
dprintk("NFS: change_attr change on server for file %s/%ld\n",
inode->i_sb->s_id, inode->i_ino);
/* Could it be a race with writeback? */
@@ -1790,7 +1790,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (S_ISDIR(inode->i_mode))
nfs_force_lookup_revalidate(inode);
}
- inode->i_version = fattr->change_attr;
+ inode_set_iversion_raw(inode, fattr->change_attr);
}
} else {
nfsi->cache_validity |= save_cache_validity;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5ab17fd4700a..8357ff69962f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -10,7 +10,7 @@
#include <linux/nfs_page.h>
#include <linux/wait_bit.h>
-#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
+#define NFS_MS_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS)
extern const struct export_operations nfs_export_ops;
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 12bbab0becb4..65a7e5da508c 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -404,15 +404,19 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
if (error < 0)
goto error;
- if (!nfs4_has_session(clp))
- nfs_mark_client_ready(clp, NFS_CS_READY);
-
error = nfs4_discover_server_trunking(clp, &old);
if (error < 0)
goto error;
- if (clp != old)
+ if (clp != old) {
clp->cl_preserve_clid = true;
+ /*
+ * Mark the client as having failed initialization so other
+ * processes walking the nfs_client_list in nfs_match_client()
+ * won't try to use it.
+ */
+ nfs_mark_client_ready(clp, -EPERM);
+ }
nfs_put_client(clp);
clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags);
return old;
@@ -539,6 +543,9 @@ int nfs40_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+ if (pos == new)
+ goto found;
+
status = nfs4_match_client(pos, new, &prev, nn);
if (status < 0)
goto out_unlock;
@@ -559,6 +566,7 @@ int nfs40_walk_client_list(struct nfs_client *new,
* way that a SETCLIENTID_CONFIRM to pos can succeed is
* if new and pos point to the same server:
*/
+found:
refcount_inc(&pos->cl_count);
spin_unlock(&nn->nfs_client_lock);
@@ -572,6 +580,7 @@ int nfs40_walk_client_list(struct nfs_client *new,
case 0:
nfs4_swap_callback_idents(pos, new);
pos->cl_confirm = new->cl_confirm;
+ nfs_mark_client_ready(pos, NFS_CS_READY);
prev = NULL;
*result = pos;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 56fa5a16e097..17a03f2c4330 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -54,6 +54,7 @@
#include <linux/xattr.h>
#include <linux/utsname.h>
#include <linux/freezer.h>
+#include <linux/iversion.h>
#include "nfs4_fs.h"
#include "delegation.h"
@@ -1045,16 +1046,16 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo,
spin_lock(&dir->i_lock);
nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
- if (cinfo->atomic && cinfo->before == dir->i_version) {
+ if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(dir)) {
nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
nfsi->attrtimeo_timestamp = jiffies;
} else {
nfs_force_lookup_revalidate(dir);
- if (cinfo->before != dir->i_version)
+ if (cinfo->before != inode_peek_iversion_raw(dir))
nfsi->cache_validity |= NFS_INO_INVALID_ACCESS |
NFS_INO_INVALID_ACL;
}
- dir->i_version = cinfo->after;
+ inode_set_iversion_raw(dir, cinfo->after);
nfsi->read_cache_jiffies = timestamp;
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
nfs_fscache_invalidate(dir);
@@ -2454,7 +2455,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
data->file_created = true;
else if (o_res->cinfo.before != o_res->cinfo.after)
data->file_created = true;
- if (data->file_created || dir->i_version != o_res->cinfo.after)
+ if (data->file_created ||
+ inode_peek_iversion_raw(dir) != o_res->cinfo.after)
update_changeattr(dir, &o_res->cinfo,
o_res->f_attr->time_start);
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 54fd56d715a8..e4f4a09ed9f4 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -71,8 +71,8 @@ const nfs4_stateid zero_stateid = {
};
const nfs4_stateid invalid_stateid = {
{
- .seqid = cpu_to_be32(0xffffffffU),
- .other = { 0 },
+ /* Funky initialiser keeps older gcc versions happy */
+ .data = { 0xff, 0xff, 0xff, 0xff, 0 },
},
.type = NFS4_INVALID_STATEID_TYPE,
};
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 093290c42d7c..610d89d8942e 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -9,6 +9,7 @@
#define _TRACE_NFS_H
#include <linux/tracepoint.h>
+#include <linux/iversion.h>
#define nfs_show_file_type(ftype) \
__print_symbolic(ftype, \
@@ -61,7 +62,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event,
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
- __entry->version = inode->i_version;
+ __entry->version = inode_peek_iversion_raw(inode);
),
TP_printk(
@@ -100,7 +101,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done,
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
__entry->type = nfs_umode_to_dtype(inode->i_mode);
- __entry->version = inode->i_version;
+ __entry->version = inode_peek_iversion_raw(inode);
__entry->size = i_size_read(inode);
__entry->nfsi_flags = nfsi->flags;
__entry->cache_validity = nfsi->cache_validity;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 43cadb28db6e..29bacdc56f6a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -813,9 +813,9 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
*/
seq_printf(m, "\n\topts:\t");
seq_puts(m, sb_rdonly(root->d_sb) ? "ro" : "rw");
- seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
- seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : "");
- seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
+ seq_puts(m, root->d_sb->s_flags & SB_SYNCHRONOUS ? ",sync" : "");
+ seq_puts(m, root->d_sb->s_flags & SB_NOATIME ? ",noatime" : "");
+ seq_puts(m, root->d_sb->s_flags & SB_NODIRATIME ? ",nodiratime" : "");
nfs_show_mount_options(m, nfss, 1);
seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
@@ -2296,11 +2296,11 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
/*
* noac is a special case. It implies -o sync, but that's not
* necessarily reflected in the mtab options. do_remount_sb
- * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the
+ * will clear SB_SYNCHRONOUS if -o sync wasn't specified in the
* remount options, so we have to explicitly reset it.
*/
if (data->flags & NFS_MOUNT_NOAC)
- *flags |= MS_SYNCHRONOUS;
+ *flags |= SB_SYNCHRONOUS;
/* compare new mount options with old ones */
error = nfs_compare_remount_data(nfss, data);
@@ -2349,7 +2349,7 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
/* The VFS shouldn't apply the umask to mode bits. We will do
* so ourselves when necessary.
*/
- sb->s_flags |= MS_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
sb->s_time_gran = 1;
sb->s_export_op = &nfs_export_ops;
}
@@ -2379,7 +2379,7 @@ static void nfs_clone_super(struct super_block *sb,
/* The VFS shouldn't apply the umask to mode bits. We will do
* so ourselves when necessary.
*/
- sb->s_flags |= MS_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
}
nfs_initialise_sb(sb);
@@ -2600,11 +2600,11 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
/* -o noac implies -o sync */
if (server->flags & NFS_MOUNT_NOAC)
- sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+ sb_mntdata.mntflags |= SB_SYNCHRONOUS;
if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL)
- if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS)
- sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+ if (mount_info->cloned->sb->s_flags & SB_SYNCHRONOUS)
+ sb_mntdata.mntflags |= SB_SYNCHRONOUS;
/* Get a superblock - note that we may end up sharing one that already exists */
s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata);
@@ -2641,7 +2641,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
if (error)
goto error_splat_root;
- s->s_flags |= MS_ACTIVE;
+ s->s_flags |= SB_ACTIVE;
out:
return mntroot;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5b5f464f6f2a..12b2d477836b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -23,6 +23,7 @@
#include <linux/export.h>
#include <linux/freezer.h>
#include <linux/wait.h>
+#include <linux/iversion.h>
#include <linux/uaccess.h>
@@ -753,11 +754,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
*/
spin_lock(&mapping->private_lock);
if (!nfs_have_writebacks(inode) &&
- NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) {
- spin_lock(&inode->i_lock);
- inode->i_version++;
- spin_unlock(&inode->i_lock);
- }
+ NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
+ inode_inc_iversion_raw(inode);
if (likely(!PageSwapCache(req->wb_page))) {
set_bit(PG_MAPPED, &req->wb_flags);
SetPagePrivate(req->wb_page);
@@ -1890,6 +1888,8 @@ int nfs_commit_inode(struct inode *inode, int how)
if (res)
error = nfs_generic_commit_list(inode, &head, how, &cinfo);
nfs_commit_end(cinfo.mds);
+ if (res == 0)
+ return res;
if (error < 0)
goto out_error;
if (!may_wait)
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index 897b299db55e..5be08f02a76b 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -30,7 +30,11 @@ locks_start_grace(struct net *net, struct lock_manager *lm)
struct list_head *grace_list = net_generic(net, grace_net_id);
spin_lock(&grace_lock);
- list_add(&lm->list, grace_list);
+ if (list_empty(&lm->list))
+ list_add(&lm->list, grace_list);
+ else
+ WARN(1, "double list_add attempt detected in net %x %s\n",
+ net->ns.inum, (net == &init_net) ? "(init_net)" : "");
spin_unlock(&grace_lock);
}
EXPORT_SYMBOL_GPL(locks_start_grace);
@@ -104,7 +108,9 @@ grace_exit_net(struct net *net)
{
struct list_head *grace_list = net_generic(net, grace_net_id);
- BUG_ON(!list_empty(grace_list));
+ WARN_ONCE(!list_empty(grace_list),
+ "net %x %s: grace_list is not empty\n",
+ net->ns.inum, __func__);
}
static struct pernet_operations grace_net_ops = {
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 697f8ae7792d..fdf2aad73470 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -61,6 +61,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
else
gi->gid[i] = rqgi->gid[i];
}
+
+ /* Each thread allocates its own gi, no race */
+ groups_sort(gi);
} else {
gi = get_group_info(rqgi);
}
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 46b48dbbdd32..8ceb25a10ea0 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -232,7 +232,7 @@ static struct cache_head *expkey_alloc(void)
return NULL;
}
-static struct cache_detail svc_expkey_cache_template = {
+static const struct cache_detail svc_expkey_cache_template = {
.owner = THIS_MODULE,
.hash_size = EXPKEY_HASHMAX,
.name = "nfsd.fh",
@@ -748,7 +748,7 @@ static struct cache_head *svc_export_alloc(void)
return NULL;
}
-static struct cache_detail svc_export_cache_template = {
+static const struct cache_detail svc_export_cache_template = {
.owner = THIS_MODULE,
.hash_size = EXPORT_HASHMAX,
.name = "nfsd.export",
@@ -1230,7 +1230,7 @@ nfsd_export_init(struct net *net)
int rv;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- dprintk("nfsd: initializing export module (net: %p).\n", net);
+ dprintk("nfsd: initializing export module (net: %x).\n", net->ns.inum);
nn->svc_export_cache = cache_create_net(&svc_export_cache_template, net);
if (IS_ERR(nn->svc_export_cache))
@@ -1278,7 +1278,7 @@ nfsd_export_shutdown(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- dprintk("nfsd: shutting down export module (net: %p).\n", net);
+ dprintk("nfsd: shutting down export module (net: %x).\n", net->ns.inum);
cache_unregister_net(nn->svc_expkey_cache, net);
cache_unregister_net(nn->svc_export_cache, net);
@@ -1286,5 +1286,5 @@ nfsd_export_shutdown(struct net *net)
cache_destroy_net(nn->svc_export_cache, net);
svcauth_unix_purge(net);
- dprintk("nfsd: export shutdown complete (net: %p).\n", net);
+ dprintk("nfsd: export shutdown complete (net: %x).\n", net->ns.inum);
}
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 1c91391f4805..36358d435cb0 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -119,6 +119,9 @@ struct nfsd_net {
u32 clverifier_counter;
struct svc_serv *nfsd_serv;
+
+ wait_queue_head_t ntf_wq;
+ atomic_t ntf_refcnt;
};
/* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6b9b6cca469f..a5bb76593ce7 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -178,7 +178,7 @@ static struct ent *idtoname_lookup(struct cache_detail *, struct ent *);
static struct ent *idtoname_update(struct cache_detail *, struct ent *,
struct ent *);
-static struct cache_detail idtoname_cache_template = {
+static const struct cache_detail idtoname_cache_template = {
.owner = THIS_MODULE,
.hash_size = ENT_HASHMAX,
.name = "nfs4.idtoname",
@@ -341,7 +341,7 @@ static struct ent *nametoid_update(struct cache_detail *, struct ent *,
struct ent *);
static int nametoid_parse(struct cache_detail *, char *, int);
-static struct cache_detail nametoid_cache_template = {
+static const struct cache_detail nametoid_cache_template = {
.owner = THIS_MODULE,
.hash_size = ENT_HASHMAX,
.name = "nfs4.nametoid",
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b82817767b9d..b29b5a185a2c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -63,12 +63,16 @@ static const stateid_t zero_stateid = {
static const stateid_t currentstateid = {
.si_generation = 1,
};
+static const stateid_t close_stateid = {
+ .si_generation = 0xffffffffU,
+};
static u64 current_sessionid = 1;
#define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t)))
#define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t)))
#define CURRENT_STATEID(stateid) (!memcmp((stateid), &currentstateid, sizeof(stateid_t)))
+#define CLOSE_STATEID(stateid) (!memcmp((stateid), &close_stateid, sizeof(stateid_t)))
/* forward declarations */
static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
@@ -83,6 +87,11 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
*/
static DEFINE_SPINLOCK(state_lock);
+enum nfsd4_st_mutex_lock_subclass {
+ OPEN_STATEID_MUTEX = 0,
+ LOCK_STATEID_MUTEX = 1,
+};
+
/*
* A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for
* the refcount on the open stateid to drop.
@@ -3562,7 +3571,9 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
/* ignore lock owners */
if (local->st_stateowner->so_is_open_owner == 0)
continue;
- if (local->st_stateowner == &oo->oo_owner) {
+ if (local->st_stateowner != &oo->oo_owner)
+ continue;
+ if (local->st_stid.sc_type == NFS4_OPEN_STID) {
ret = local;
refcount_inc(&ret->st_stid.sc_count);
break;
@@ -3571,6 +3582,52 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
return ret;
}
+static __be32
+nfsd4_verify_open_stid(struct nfs4_stid *s)
+{
+ __be32 ret = nfs_ok;
+
+ switch (s->sc_type) {
+ default:
+ break;
+ case NFS4_CLOSED_STID:
+ case NFS4_CLOSED_DELEG_STID:
+ ret = nfserr_bad_stateid;
+ break;
+ case NFS4_REVOKED_DELEG_STID:
+ ret = nfserr_deleg_revoked;
+ }
+ return ret;
+}
+
+/* Lock the stateid st_mutex, and deal with races with CLOSE */
+static __be32
+nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp)
+{
+ __be32 ret;
+
+ mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX);
+ ret = nfsd4_verify_open_stid(&stp->st_stid);
+ if (ret != nfs_ok)
+ mutex_unlock(&stp->st_mutex);
+ return ret;
+}
+
+static struct nfs4_ol_stateid *
+nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
+{
+ struct nfs4_ol_stateid *stp;
+ for (;;) {
+ spin_lock(&fp->fi_lock);
+ stp = nfsd4_find_existing_open(fp, open);
+ spin_unlock(&fp->fi_lock);
+ if (!stp || nfsd4_lock_ol_stateid(stp) == nfs_ok)
+ break;
+ nfs4_put_stid(&stp->st_stid);
+ }
+ return stp;
+}
+
static struct nfs4_openowner *
alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
struct nfsd4_compound_state *cstate)
@@ -3613,8 +3670,9 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open)
stp = open->op_stp;
/* We are moving these outside of the spinlocks to avoid the warnings */
mutex_init(&stp->st_mutex);
- mutex_lock(&stp->st_mutex);
+ mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX);
+retry:
spin_lock(&oo->oo_owner.so_client->cl_lock);
spin_lock(&fp->fi_lock);
@@ -3639,7 +3697,11 @@ out_unlock:
spin_unlock(&fp->fi_lock);
spin_unlock(&oo->oo_owner.so_client->cl_lock);
if (retstp) {
- mutex_lock(&retstp->st_mutex);
+ /* Handle races with CLOSE */
+ if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) {
+ nfs4_put_stid(&retstp->st_stid);
+ goto retry;
+ }
/* To keep mutex tracking happy */
mutex_unlock(&stp->st_mutex);
stp = retstp;
@@ -4449,6 +4511,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
struct nfs4_ol_stateid *stp = NULL;
struct nfs4_delegation *dp = NULL;
__be32 status;
+ bool new_stp = false;
/*
* Lookup file; if found, lookup stateid and check open request,
@@ -4460,9 +4523,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
status = nfs4_check_deleg(cl, open, &dp);
if (status)
goto out;
- spin_lock(&fp->fi_lock);
- stp = nfsd4_find_existing_open(fp, open);
- spin_unlock(&fp->fi_lock);
+ stp = nfsd4_find_and_lock_existing_open(fp, open);
} else {
open->op_file = NULL;
status = nfserr_bad_stateid;
@@ -4470,35 +4531,31 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
goto out;
}
+ if (!stp) {
+ stp = init_open_stateid(fp, open);
+ if (!open->op_stp)
+ new_stp = true;
+ }
+
/*
* OPEN the file, or upgrade an existing OPEN.
* If truncate fails, the OPEN fails.
+ *
+ * stp is already locked.
*/
- if (stp) {
+ if (!new_stp) {
/* Stateid was found, this is an OPEN upgrade */
- mutex_lock(&stp->st_mutex);
status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
if (status) {
mutex_unlock(&stp->st_mutex);
goto out;
}
} else {
- /* stp is returned locked. */
- stp = init_open_stateid(fp, open);
- /* See if we lost the race to some other thread */
- if (stp->st_access_bmap != 0) {
- status = nfs4_upgrade_open(rqstp, fp, current_fh,
- stp, open);
- if (status) {
- mutex_unlock(&stp->st_mutex);
- goto out;
- }
- goto upgrade_out;
- }
status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
if (status) {
- mutex_unlock(&stp->st_mutex);
+ stp->st_stid.sc_type = NFS4_CLOSED_STID;
release_open_stateid(stp);
+ mutex_unlock(&stp->st_mutex);
goto out;
}
@@ -4507,7 +4564,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
if (stp->st_clnt_odstate == open->op_odstate)
open->op_odstate = NULL;
}
-upgrade_out:
+
nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid);
mutex_unlock(&stp->st_mutex);
@@ -4734,7 +4791,7 @@ nfs4_laundromat(struct nfsd_net *nn)
spin_unlock(&nn->blocked_locks_lock);
while (!list_empty(&reaplist)) {
- nbl = list_first_entry(&nn->blocked_locks_lru,
+ nbl = list_first_entry(&reaplist,
struct nfsd4_blocked_lock, nbl_lru);
list_del_init(&nbl->nbl_lru);
posix_unblock_lock(&nbl->nbl_lock);
@@ -4855,6 +4912,18 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s
return nfserr_old_stateid;
}
+static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_stid *s, bool has_session)
+{
+ __be32 ret;
+
+ spin_lock(&s->sc_lock);
+ ret = nfsd4_verify_open_stid(s);
+ if (ret == nfs_ok)
+ ret = check_stateid_generation(in, &s->sc_stateid, has_session);
+ spin_unlock(&s->sc_lock);
+ return ret;
+}
+
static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols)
{
if (ols->st_stateowner->so_is_open_owner &&
@@ -4868,7 +4937,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
struct nfs4_stid *s;
__be32 status = nfserr_bad_stateid;
- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
+ CLOSE_STATEID(stateid))
return status;
/* Client debugging aid. */
if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) {
@@ -4883,7 +4953,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
s = find_stateid_locked(cl, stateid);
if (!s)
goto out_unlock;
- status = check_stateid_generation(stateid, &s->sc_stateid, 1);
+ status = nfsd4_stid_check_stateid_generation(stateid, s, 1);
if (status)
goto out_unlock;
switch (s->sc_type) {
@@ -4926,7 +4996,8 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
else if (typemask & NFS4_DELEG_STID)
typemask |= NFS4_REVOKED_DELEG_STID;
- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
+ CLOSE_STATEID(stateid))
return nfserr_bad_stateid;
status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn);
if (status == nfserr_stale_clientid) {
@@ -5044,7 +5115,7 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
&s, nn);
if (status)
return status;
- status = check_stateid_generation(stateid, &s->sc_stateid,
+ status = nfsd4_stid_check_stateid_generation(stateid, s,
nfsd4_has_session(cstate));
if (status)
goto out;
@@ -5098,7 +5169,9 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s)
struct nfs4_ol_stateid *stp = openlockstateid(s);
__be32 ret;
- mutex_lock(&stp->st_mutex);
+ ret = nfsd4_lock_ol_stateid(stp);
+ if (ret)
+ goto out_put_stid;
ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
if (ret)
@@ -5109,11 +5182,13 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s)
lockowner(stp->st_stateowner)))
goto out;
+ stp->st_stid.sc_type = NFS4_CLOSED_STID;
release_lock_stateid(stp);
ret = nfs_ok;
out:
mutex_unlock(&stp->st_mutex);
+out_put_stid:
nfs4_put_stid(s);
return ret;
}
@@ -5133,6 +5208,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
s = find_stateid_locked(cl, stateid);
if (!s)
goto out_unlock;
+ spin_lock(&s->sc_lock);
switch (s->sc_type) {
case NFS4_DELEG_STID:
ret = nfserr_locks_held;
@@ -5144,11 +5220,13 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
ret = nfserr_locks_held;
break;
case NFS4_LOCK_STID:
+ spin_unlock(&s->sc_lock);
refcount_inc(&s->sc_count);
spin_unlock(&cl->cl_lock);
ret = nfsd4_free_lock_stateid(stateid, s);
goto out;
case NFS4_REVOKED_DELEG_STID:
+ spin_unlock(&s->sc_lock);
dp = delegstateid(s);
list_del_init(&dp->dl_recall_lru);
spin_unlock(&cl->cl_lock);
@@ -5157,6 +5235,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
/* Default falls through and returns nfserr_bad_stateid */
}
+ spin_unlock(&s->sc_lock);
out_unlock:
spin_unlock(&cl->cl_lock);
out:
@@ -5179,15 +5258,9 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
status = nfsd4_check_seqid(cstate, sop, seqid);
if (status)
return status;
- if (stp->st_stid.sc_type == NFS4_CLOSED_STID
- || stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID)
- /*
- * "Closed" stateid's exist *only* to return
- * nfserr_replay_me from the previous step, and
- * revoked delegations are kept only for free_stateid.
- */
- return nfserr_bad_stateid;
- mutex_lock(&stp->st_mutex);
+ status = nfsd4_lock_ol_stateid(stp);
+ if (status != nfs_ok)
+ return status;
status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
if (status == nfs_ok)
status = nfs4_check_fh(current_fh, &stp->st_stid);
@@ -5367,7 +5440,6 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
bool unhashed;
LIST_HEAD(reaplist);
- s->st_stid.sc_type = NFS4_CLOSED_STID;
spin_lock(&clp->cl_lock);
unhashed = unhash_open_stateid(s, &reaplist);
@@ -5407,10 +5479,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfsd4_bump_seqid(cstate, status);
if (status)
goto out;
+
+ stp->st_stid.sc_type = NFS4_CLOSED_STID;
nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
- mutex_unlock(&stp->st_mutex);
nfsd4_close_open_stateid(stp);
+ mutex_unlock(&stp->st_mutex);
+
+ /* See RFC5661 section 18.2.4 */
+ if (stp->st_stid.sc_client->cl_minorversion)
+ memcpy(&close->cl_stateid, &close_stateid,
+ sizeof(close->cl_stateid));
/* put reference from nfs4_preprocess_seqid_op */
nfs4_put_stid(&stp->st_stid);
@@ -5436,7 +5515,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
dp = delegstateid(s);
- status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate));
+ status = nfsd4_stid_check_stateid_generation(stateid, &dp->dl_stid, nfsd4_has_session(cstate));
if (status)
goto put_stateid;
@@ -5642,14 +5721,41 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
return ret;
}
-static void
+static struct nfs4_ol_stateid *
+find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp)
+{
+ struct nfs4_ol_stateid *lst;
+ struct nfs4_client *clp = lo->lo_owner.so_client;
+
+ lockdep_assert_held(&clp->cl_lock);
+
+ list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) {
+ if (lst->st_stid.sc_type != NFS4_LOCK_STID)
+ continue;
+ if (lst->st_stid.sc_file == fp) {
+ refcount_inc(&lst->st_stid.sc_count);
+ return lst;
+ }
+ }
+ return NULL;
+}
+
+static struct nfs4_ol_stateid *
init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
struct nfs4_file *fp, struct inode *inode,
struct nfs4_ol_stateid *open_stp)
{
struct nfs4_client *clp = lo->lo_owner.so_client;
+ struct nfs4_ol_stateid *retstp;
- lockdep_assert_held(&clp->cl_lock);
+ mutex_init(&stp->st_mutex);
+ mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX);
+retry:
+ spin_lock(&clp->cl_lock);
+ spin_lock(&fp->fi_lock);
+ retstp = find_lock_stateid(lo, fp);
+ if (retstp)
+ goto out_unlock;
refcount_inc(&stp->st_stid.sc_count);
stp->st_stid.sc_type = NFS4_LOCK_STID;
@@ -5659,29 +5765,22 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
stp->st_access_bmap = 0;
stp->st_deny_bmap = open_stp->st_deny_bmap;
stp->st_openstp = open_stp;
- mutex_init(&stp->st_mutex);
list_add(&stp->st_locks, &open_stp->st_locks);
list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
- spin_lock(&fp->fi_lock);
list_add(&stp->st_perfile, &fp->fi_stateids);
+out_unlock:
spin_unlock(&fp->fi_lock);
-}
-
-static struct nfs4_ol_stateid *
-find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp)
-{
- struct nfs4_ol_stateid *lst;
- struct nfs4_client *clp = lo->lo_owner.so_client;
-
- lockdep_assert_held(&clp->cl_lock);
-
- list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) {
- if (lst->st_stid.sc_file == fp) {
- refcount_inc(&lst->st_stid.sc_count);
- return lst;
+ spin_unlock(&clp->cl_lock);
+ if (retstp) {
+ if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) {
+ nfs4_put_stid(&retstp->st_stid);
+ goto retry;
}
+ /* To keep mutex tracking happy */
+ mutex_unlock(&stp->st_mutex);
+ stp = retstp;
}
- return NULL;
+ return stp;
}
static struct nfs4_ol_stateid *
@@ -5694,26 +5793,25 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi,
struct nfs4_openowner *oo = openowner(ost->st_stateowner);
struct nfs4_client *clp = oo->oo_owner.so_client;
+ *new = false;
spin_lock(&clp->cl_lock);
lst = find_lock_stateid(lo, fi);
- if (lst == NULL) {
- spin_unlock(&clp->cl_lock);
- ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid);
- if (ns == NULL)
- return NULL;
-
- spin_lock(&clp->cl_lock);
- lst = find_lock_stateid(lo, fi);
- if (likely(!lst)) {
- lst = openlockstateid(ns);
- init_lock_stateid(lst, lo, fi, inode, ost);
- ns = NULL;
- *new = true;
- }
- }
spin_unlock(&clp->cl_lock);
- if (ns)
+ if (lst != NULL) {
+ if (nfsd4_lock_ol_stateid(lst) == nfs_ok)
+ goto out;
+ nfs4_put_stid(&lst->st_stid);
+ }
+ ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid);
+ if (ns == NULL)
+ return NULL;
+
+ lst = init_lock_stateid(openlockstateid(ns), lo, fi, inode, ost);
+ if (lst == openlockstateid(ns))
+ *new = true;
+ else
nfs4_put_stid(ns);
+out:
return lst;
}
@@ -5750,7 +5848,6 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
struct nfs4_lockowner *lo;
struct nfs4_ol_stateid *lst;
unsigned int strhashval;
- bool hashed;
lo = find_lockowner_str(cl, &lock->lk_new_owner);
if (!lo) {
@@ -5766,25 +5863,12 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
goto out;
}
-retry:
lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
if (lst == NULL) {
status = nfserr_jukebox;
goto out;
}
- mutex_lock(&lst->st_mutex);
-
- /* See if it's still hashed to avoid race with FREE_STATEID */
- spin_lock(&cl->cl_lock);
- hashed = !list_empty(&lst->st_perfile);
- spin_unlock(&cl->cl_lock);
-
- if (!hashed) {
- mutex_unlock(&lst->st_mutex);
- nfs4_put_stid(&lst->st_stid);
- goto retry;
- }
status = nfs_ok;
*plst = lst;
out:
@@ -5990,14 +6074,16 @@ out:
seqid_mutating_err(ntohl(status)))
lock_sop->lo_owner.so_seqid++;
- mutex_unlock(&lock_stp->st_mutex);
-
/*
* If this is a new, never-before-used stateid, and we are
* returning an error, then just go ahead and release it.
*/
- if (status && new)
+ if (status && new) {
+ lock_stp->st_stid.sc_type = NFS4_CLOSED_STID;
release_lock_stateid(lock_stp);
+ }
+
+ mutex_unlock(&lock_stp->st_mutex);
nfs4_put_stid(&lock_stp->st_stid);
}
@@ -7017,6 +7103,10 @@ static int nfs4_state_create_net(struct net *net)
INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]);
nn->conf_name_tree = RB_ROOT;
nn->unconf_name_tree = RB_ROOT;
+ nn->boot_time = get_seconds();
+ nn->grace_ended = false;
+ nn->nfsd4_manager.block_opens = true;
+ INIT_LIST_HEAD(&nn->nfsd4_manager.list);
INIT_LIST_HEAD(&nn->client_lru);
INIT_LIST_HEAD(&nn->close_lru);
INIT_LIST_HEAD(&nn->del_recall_lru);
@@ -7074,9 +7164,6 @@ nfs4_state_start_net(struct net *net)
ret = nfs4_state_create_net(net);
if (ret)
return ret;
- nn->boot_time = get_seconds();
- nn->grace_ended = false;
- nn->nfsd4_manager.block_opens = true;
locks_start_grace(net, &nn->nfsd4_manager);
nfsd4_client_tracking_init(net);
printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n",
@@ -7153,7 +7240,7 @@ nfs4_state_shutdown_net(struct net *net)
spin_unlock(&nn->blocked_locks_lock);
while (!list_empty(&reaplist)) {
- nbl = list_first_entry(&nn->blocked_locks_lru,
+ nbl = list_first_entry(&reaplist,
struct nfsd4_blocked_lock, nbl_lru);
list_del_init(&nbl->nbl_lru);
posix_unblock_lock(&nbl->nbl_lock);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6493df6b1bd5..d107b4426f7e 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1241,6 +1241,9 @@ static __net_init int nfsd_init_net(struct net *net)
nn->nfsd4_grace = 90;
nn->clverifier_counter = prandom_u32();
nn->clientid_counter = prandom_u32();
+
+ atomic_set(&nn->ntf_refcnt, 0);
+ init_waitqueue_head(&nn->ntf_wq);
return 0;
out_idmap_error:
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 43f31cf49bae..b8444189223b 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -11,6 +11,7 @@
#include <linux/crc32.h>
#include <linux/sunrpc/svc.h>
#include <uapi/linux/nfsd/nfsfh.h>
+#include <linux/iversion.h>
static inline __u32 ino_t_to_u32(ino_t ino)
{
@@ -259,7 +260,7 @@ static inline u64 nfsd4_change_attribute(struct inode *inode)
chattr = inode->i_ctime.tv_sec;
chattr <<= 30;
chattr += inode->i_ctime.tv_nsec;
- chattr += inode->i_version;
+ chattr += inode_query_iversion(inode);
return chattr;
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 33117d4ffce0..89cb484f1cfb 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -335,7 +335,8 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct sockaddr_in sin;
- if (event != NETDEV_DOWN)
+ if ((event != NETDEV_DOWN) ||
+ !atomic_inc_not_zero(&nn->ntf_refcnt))
goto out;
if (nn->nfsd_serv) {
@@ -344,6 +345,8 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
sin.sin_addr.s_addr = ifa->ifa_local;
svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin);
}
+ atomic_dec(&nn->ntf_refcnt);
+ wake_up(&nn->ntf_wq);
out:
return NOTIFY_DONE;
@@ -363,7 +366,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this,
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct sockaddr_in6 sin6;
- if (event != NETDEV_DOWN)
+ if ((event != NETDEV_DOWN) ||
+ !atomic_inc_not_zero(&nn->ntf_refcnt))
goto out;
if (nn->nfsd_serv) {
@@ -374,7 +378,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this,
sin6.sin6_scope_id = ifa->idev->dev->ifindex;
svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
}
-
+ atomic_dec(&nn->ntf_refcnt);
+ wake_up(&nn->ntf_wq);
out:
return NOTIFY_DONE;
}
@@ -391,6 +396,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ atomic_dec(&nn->ntf_refcnt);
/* check if the notifier still has clients */
if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
@@ -398,6 +404,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
#endif
}
+ wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0);
/*
* write_ports can create the server without actually starting
@@ -517,6 +524,7 @@ int nfsd_create_serv(struct net *net)
register_inet6addr_notifier(&nfsd_inet6addr_notifier);
#endif
}
+ atomic_inc(&nn->ntf_refcnt);
ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */
return 0;
}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index f572538dcc4f..9f3ffba41533 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1979,7 +1979,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
struct the_nilfs *nilfs)
{
struct nilfs_inode_info *ii, *n;
- int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE);
+ int during_mount = !(sci->sc_super->s_flags & SB_ACTIVE);
int defer_iput = false;
spin_lock(&nilfs->ns_inode_lock);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 3ce20cd44a20..3073b646e1ba 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -141,7 +141,7 @@ void __nilfs_error(struct super_block *sb, const char *function,
if (nilfs_test_opt(nilfs, ERRORS_RO)) {
printk(KERN_CRIT "Remounting filesystem read-only\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
}
@@ -869,7 +869,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
/* FS independent flags */
#ifdef NILFS_ATIME_DISABLE
- sb->s_flags |= MS_NOATIME;
+ sb->s_flags |= SB_NOATIME;
#endif
nilfs_set_default_options(sb, sbp);
@@ -1133,7 +1133,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
goto restore_opts;
}
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL);
err = -EINVAL;
@@ -1143,12 +1143,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+ if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
- if (*flags & MS_RDONLY) {
+ if (*flags & SB_RDONLY) {
/* Shutting down log writer */
nilfs_detach_log_writer(sb);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
/*
* Remounting a valid RW partition RDONLY, so set
@@ -1178,7 +1178,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
root = NILFS_I(d_inode(sb->s_root))->i_root;
err = nilfs_attach_log_writer(sb, root);
@@ -1212,7 +1212,7 @@ static int nilfs_parse_snapshot_option(const char *option,
const char *msg = NULL;
int err;
- if (!(sd->flags & MS_RDONLY)) {
+ if (!(sd->flags & SB_RDONLY)) {
msg = "read-only option is not specified";
goto parse_error;
}
@@ -1286,7 +1286,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
struct dentry *root_dentry;
int err, s_new = false;
- if (!(flags & MS_RDONLY))
+ if (!(flags & SB_RDONLY))
mode |= FMODE_WRITE;
sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type);
@@ -1327,14 +1327,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev);
sb_set_blocksize(s, block_size(sd.bdev));
- err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+ err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0);
if (err)
goto failed_super;
- s->s_flags |= MS_ACTIVE;
+ s->s_flags |= SB_ACTIVE;
} else if (!sd.cno) {
if (nilfs_tree_is_busy(s->s_root)) {
- if ((flags ^ s->s_flags) & MS_RDONLY) {
+ if ((flags ^ s->s_flags) & SB_RDONLY) {
nilfs_msg(s, KERN_ERR,
"the device already has a %s mount.",
sb_rdonly(s) ? "read-only" : "read/write");
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index afebb5067cec..1a85317e83f0 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -220,7 +220,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
if (!valid_fs) {
nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs");
- if (s_flags & MS_RDONLY) {
+ if (s_flags & SB_RDONLY) {
nilfs_msg(sb, KERN_INFO,
"recovery required for readonly filesystem");
nilfs_msg(sb, KERN_INFO,
@@ -286,7 +286,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
if (valid_fs)
goto skip_recovery;
- if (s_flags & MS_RDONLY) {
+ if (s_flags & SB_RDONLY) {
__u64 features;
if (nilfs_test_opt(nilfs, NORECOVERY)) {
@@ -309,7 +309,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
err = -EROFS;
goto failed_unload;
}
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
} else if (nilfs_test_opt(nilfs, NORECOVERY)) {
nilfs_msg(sb, KERN_ERR,
"recovery cancelled because norecovery option was specified for a read/write mount");
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 81d8959b6aef..219b269c737e 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -67,7 +67,7 @@ void fsnotify_unmount_inodes(struct super_block *sb)
/*
* If i_count is zero, the inode cannot have any watches and
- * doing an __iget/iput with MS_ACTIVE clear would actually
+ * doing an __iget/iput with SB_ACTIVE clear would actually
* evict all inodes with zero i_count from icache which is
* unnecessarily violent and may in fact be illegal to do.
*/
diff --git a/fs/nsfs.c b/fs/nsfs.c
index ef243e14b6eb..7c6f76d29f56 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -255,5 +255,5 @@ void __init nsfs_init(void)
nsfs_mnt = kern_mount(&nsfs);
if (IS_ERR(nsfs_mnt))
panic("can't set nsfs up\n");
- nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER;
+ nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER;
}
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 7c410f879412..1c1ee489284b 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -560,13 +560,6 @@ static int ntfs_read_locked_inode(struct inode *vi)
ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
/* Setup the generic vfs inode parts now. */
-
- /*
- * This is for checking whether an inode has changed w.r.t. a file so
- * that the file can be updated if necessary (compare with f_version).
- */
- vi->i_version = 1;
-
vi->i_uid = vol->uid;
vi->i_gid = vol->gid;
vi->i_mode = 0;
@@ -1240,7 +1233,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
base_ni = NTFS_I(base_vi);
/* Just mirror the values from the base inode. */
- vi->i_version = base_vi->i_version;
vi->i_uid = base_vi->i_uid;
vi->i_gid = base_vi->i_gid;
set_nlink(vi, base_vi->i_nlink);
@@ -1507,7 +1499,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
ni = NTFS_I(vi);
base_ni = NTFS_I(base_vi);
/* Just mirror the values from the base inode. */
- vi->i_version = base_vi->i_version;
vi->i_uid = base_vi->i_uid;
vi->i_gid = base_vi->i_gid;
set_nlink(vi, base_vi->i_nlink);
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index ee8392aee9f6..2831f495a674 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2641,12 +2641,6 @@ mft_rec_already_initialized:
goto undo_mftbmp_alloc;
}
vi->i_ino = bit;
- /*
- * This is for checking whether an inode has changed w.r.t. a
- * file so that the file can be updated if necessary (compare
- * with f_version).
- */
- vi->i_version = 1;
/* The owner and group come from the ntfs volume. */
vi->i_uid = vol->uid;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 3f70f041dbe9..bb7159f697f2 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -473,7 +473,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
#ifndef NTFS_RW
/* For read-only compiled driver, enforce read-only flag. */
- *flags |= MS_RDONLY;
+ *flags |= SB_RDONLY;
#else /* NTFS_RW */
/*
* For the read-write compiled driver, if we are remounting read-write,
@@ -487,7 +487,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
* When remounting read-only, mark the volume clean if no volume errors
* have occurred.
*/
- if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) {
+ if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) {
static const char *es = ". Cannot remount read-write.";
/* Remounting read-write. */
@@ -548,7 +548,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
NVolSetErrors(vol);
return -EROFS;
}
- } else if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) {
+ } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) {
/* Remounting read-only. */
if (!NVolErrors(vol)) {
if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
@@ -1799,7 +1799,7 @@ static bool load_system_files(ntfs_volume *vol)
es3);
goto iput_mirr_err_out;
}
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
ntfs_error(sb, "%s. Mounting read-only%s",
!vol->mftmirr_ino ? es1 : es2, es3);
} else
@@ -1937,7 +1937,7 @@ get_ctx_vol_failed:
es1, es2);
goto iput_vol_err_out;
}
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
} else
ntfs_warning(sb, "%s. Will not be able to remount "
@@ -1974,7 +1974,7 @@ get_ctx_vol_failed:
}
goto iput_logfile_err_out;
}
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
} else
ntfs_warning(sb, "%s. Will not be able to remount "
@@ -2019,7 +2019,7 @@ get_ctx_vol_failed:
es1, es2);
goto iput_root_err_out;
}
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
} else
ntfs_warning(sb, "%s. Will not be able to remount "
@@ -2042,7 +2042,7 @@ get_ctx_vol_failed:
goto iput_root_err_out;
}
ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
/*
* Do not set NVolErrors() because ntfs_remount() might manage
* to set the dirty flag in which case all would be well.
@@ -2055,7 +2055,7 @@ get_ctx_vol_failed:
* If (still) a read-write mount, set the NT4 compatibility flag on
* newer NTFS version volumes.
*/
- if (!(sb->s_flags & MS_RDONLY) && (vol->major_ver > 1) &&
+ if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) &&
ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
static const char *es1 = "Failed to set NT4 compatibility flag";
static const char *es2 = ". Run chkdsk.";
@@ -2069,7 +2069,7 @@ get_ctx_vol_failed:
goto iput_root_err_out;
}
ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
NVolSetErrors(vol);
}
#endif
@@ -2087,7 +2087,7 @@ get_ctx_vol_failed:
goto iput_root_err_out;
}
ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
NVolSetErrors(vol);
}
#endif /* NTFS_RW */
@@ -2128,7 +2128,7 @@ get_ctx_vol_failed:
es1, es2);
goto iput_quota_err_out;
}
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
} else
ntfs_warning(sb, "%s. Will not be able to remount "
@@ -2150,7 +2150,7 @@ get_ctx_vol_failed:
goto iput_quota_err_out;
}
ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
NVolSetErrors(vol);
}
/*
@@ -2171,7 +2171,7 @@ get_ctx_vol_failed:
es1, es2);
goto iput_usnjrnl_err_out;
}
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
} else
ntfs_warning(sb, "%s. Will not be able to remount "
@@ -2194,7 +2194,7 @@ get_ctx_vol_failed:
goto iput_usnjrnl_err_out;
}
ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
NVolSetErrors(vol);
}
#endif /* NTFS_RW */
@@ -2728,7 +2728,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
lockdep_off();
ntfs_debug("Entering.");
#ifndef NTFS_RW
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
#endif /* ! NTFS_RW */
/* Allocate a new ntfs_volume and place it in sb->s_fs_info. */
sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index febe6312ceff..32f9c72dff17 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -42,6 +42,7 @@
#include <linux/highmem.h>
#include <linux/quotaops.h>
#include <linux/sort.h>
+#include <linux/iversion.h>
#include <cluster/masklog.h>
@@ -1174,7 +1175,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
le16_add_cpu(&pde->rec_len,
le16_to_cpu(de->rec_len));
de->inode = 0;
- dir->i_version++;
+ inode_inc_iversion(dir);
ocfs2_journal_dirty(handle, bh);
goto bail;
}
@@ -1729,7 +1730,7 @@ int __ocfs2_add_entry(handle_t *handle,
if (ocfs2_dir_indexed(dir))
ocfs2_recalc_free_list(dir, handle, lookup);
- dir->i_version++;
+ inode_inc_iversion(dir);
ocfs2_journal_dirty(handle, insert_bh);
retval = 0;
goto bail;
@@ -1775,7 +1776,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (*f_version != inode->i_version) {
+ if (inode_cmp_iversion(inode, *f_version)) {
for (i = 0; i < i_size_read(inode) && i < offset; ) {
de = (struct ocfs2_dir_entry *)
(data->id_data + i);
@@ -1791,7 +1792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
i += le16_to_cpu(de->rec_len);
}
ctx->pos = offset = i;
- *f_version = inode->i_version;
+ *f_version = inode_query_iversion(inode);
}
de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
@@ -1869,7 +1870,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (*f_version != inode->i_version) {
+ if (inode_cmp_iversion(inode, *f_version)) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ocfs2_dir_entry *) (bh->b_data + i);
/* It's too expensive to do a full
@@ -1886,7 +1887,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
offset = i;
ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- *f_version = inode->i_version;
+ *f_version = inode_query_iversion(inode);
}
while (ctx->pos < i_size_read(inode)
@@ -1940,7 +1941,7 @@ static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
*/
int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
{
- u64 version = inode->i_version;
+ u64 version = inode_query_iversion(inode);
ocfs2_dir_foreach_blk(inode, &version, ctx, true);
return 0;
}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index dc455d45a66a..a1d051055472 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -227,7 +227,7 @@ int ocfs2_should_update_atime(struct inode *inode,
return 0;
if ((inode->i_flags & S_NOATIME) ||
- ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
+ ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
return 0;
/*
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 1a1e0078ab38..d51b80edd972 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -28,6 +28,7 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
+#include <linux/iversion.h>
#include <asm/byteorder.h>
@@ -302,7 +303,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
- inode->i_version = 1;
+ inode_set_iversion(inode, 1);
inode->i_generation = le32_to_cpu(fe->i_generation);
inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
inode->i_mode = le16_to_cpu(fe->i_mode);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 3b0a10d9b36f..c801eddc4bf3 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -41,6 +41,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/quotaops.h>
+#include <linux/iversion.h>
#include <cluster/masklog.h>
@@ -1520,7 +1521,7 @@ static int ocfs2_rename(struct inode *old_dir,
mlog_errno(status);
goto bail;
}
- new_dir->i_version++;
+ inode_inc_iversion(new_dir);
if (S_ISDIR(new_inode->i_mode))
ocfs2_set_links_count(newfe, 0);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b39d14cbfa34..7a922190a8c7 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -12,6 +12,7 @@
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/llist.h>
+#include <linux/iversion.h>
#include <cluster/masklog.h>
@@ -289,7 +290,7 @@ out:
mlog_errno(err);
return err;
}
- gqinode->i_version++;
+ inode_inc_iversion(gqinode);
ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
return len;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 040bbb6a6e4b..80efa5699fb0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -675,9 +675,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
}
/* We're going to/from readonly mode. */
- if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) {
+ if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
/* Disable quota accounting before remounting RO */
- if (*flags & MS_RDONLY) {
+ if (*flags & SB_RDONLY) {
ret = ocfs2_susp_quotas(osb, 0);
if (ret < 0)
goto out;
@@ -691,8 +691,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
goto unlock_osb;
}
- if (*flags & MS_RDONLY) {
- sb->s_flags |= MS_RDONLY;
+ if (*flags & SB_RDONLY) {
+ sb->s_flags |= SB_RDONLY;
osb->osb_flags |= OCFS2_OSB_SOFT_RO;
} else {
if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
@@ -709,14 +709,14 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
ret = -EINVAL;
goto unlock_osb;
}
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
}
trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
unlock_osb:
spin_unlock(&osb->osb_lock);
/* Enable quota accounting after remounting RW */
- if (!ret && !(*flags & MS_RDONLY)) {
+ if (!ret && !(*flags & SB_RDONLY)) {
if (sb_any_quota_suspended(sb))
ret = ocfs2_susp_quotas(osb, 1);
else
@@ -724,7 +724,7 @@ unlock_osb:
if (ret < 0) {
/* Return back changes... */
spin_lock(&osb->osb_lock);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
osb->osb_flags |= OCFS2_OSB_SOFT_RO;
spin_unlock(&osb->osb_lock);
goto out;
@@ -744,9 +744,9 @@ unlock_osb:
if (!ocfs2_is_hard_readonly(osb))
ocfs2_set_journal_params(osb);
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
- MS_POSIXACL : 0);
+ SB_POSIXACL : 0);
}
out:
return ret;
@@ -1057,10 +1057,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = OCFS2_SUPER_MAGIC;
- sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
- ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+ sb->s_flags = (sb->s_flags & ~(SB_POSIXACL | SB_NOSEC)) |
+ ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0);
- /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
+ /* Hard readonly mode only if: bdev_read_only, SB_RDONLY,
* heartbeat=none */
if (bdev_read_only(sb->s_bdev)) {
if (!sb_rdonly(sb)) {
@@ -2057,7 +2057,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
sb->s_xattr = ocfs2_xattr_handlers;
sb->s_time_gran = 1;
- sb->s_flags |= MS_NOATIME;
+ sb->s_flags |= SB_NOATIME;
/* this is needed to support O_LARGEFILE */
cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
@@ -2568,7 +2568,7 @@ static int ocfs2_handle_error(struct super_block *sb)
return rv;
pr_crit("OCFS2: File system is now read-only.\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
ocfs2_set_ro_flag(osb, 0);
}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 5fdf269ba82e..c5898c59d411 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -901,7 +901,7 @@ static int ocfs2_xattr_list_entry(struct super_block *sb,
case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS:
case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT:
- if (!(sb->s_flags & MS_POSIXACL))
+ if (!(sb->s_flags & SB_POSIXACL))
return 0;
break;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 13215f26e321..2200662a9bf1 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -369,7 +369,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
static int openprom_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- *flags |= MS_NOATIME;
+ *flags |= SB_NOATIME;
return 0;
}
@@ -386,7 +386,7 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent)
struct op_inode_info *oi;
int ret;
- s->s_flags |= MS_NOATIME;
+ s->s_flags |= SB_NOATIME;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = OPENPROM_SUPER_MAGIC;
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index ded456f17de6..c584ad8d023c 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -162,7 +162,7 @@ static ssize_t orangefs_devreq_read(struct file *file,
struct orangefs_kernel_op_s *op, *temp;
__s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION;
static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
- struct orangefs_kernel_op_s *cur_op = NULL;
+ struct orangefs_kernel_op_s *cur_op;
unsigned long ret;
/* We do not support blocking IO. */
@@ -186,6 +186,7 @@ static ssize_t orangefs_devreq_read(struct file *file,
return -EAGAIN;
restart:
+ cur_op = NULL;
/* Get next op (if any) from top of list. */
spin_lock(&orangefs_request_list_lock);
list_for_each_entry_safe(op, temp, &orangefs_request_list, list) {
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 1668fd645c45..0d228cd087e6 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -452,7 +452,7 @@ ssize_t orangefs_inode_read(struct inode *inode,
static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
- loff_t pos = *(&iocb->ki_pos);
+ loff_t pos = iocb->ki_pos;
ssize_t rc = 0;
BUG_ON(iocb->private);
@@ -492,9 +492,6 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
}
}
- if (file->f_pos > i_size_read(file->f_mapping->host))
- orangefs_i_size_write(file->f_mapping->host, file->f_pos);
-
rc = generic_write_checks(iocb, iter);
if (rc <= 0) {
@@ -508,7 +505,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
* pos to the end of the file, so we will wait till now to set
* pos...
*/
- pos = *(&iocb->ki_pos);
+ pos = iocb->ki_pos;
rc = do_readv_writev(ORANGEFS_IO_WRITE,
file,
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 97adf7d100b5..2595453fe737 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -533,17 +533,6 @@ do { \
sys_attr.mask = ORANGEFS_ATTR_SYS_ALL_SETABLE; \
} while (0)
-static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size)
-{
-#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
- inode_lock(inode);
-#endif
- i_size_write(inode, i_size);
-#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
- inode_unlock(inode);
-#endif
-}
-
static inline void orangefs_set_timeout(struct dentry *dentry)
{
unsigned long time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 366750eef201..36f1390b5ed7 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -40,7 +40,7 @@ static int orangefs_show_options(struct seq_file *m, struct dentry *root)
{
struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(root->d_sb);
- if (root->d_sb->s_flags & MS_POSIXACL)
+ if (root->d_sb->s_flags & SB_POSIXACL)
seq_puts(m, ",acl");
if (orangefs_sb->flags & ORANGEFS_OPT_INTR)
seq_puts(m, ",intr");
@@ -60,7 +60,7 @@ static int parse_mount_options(struct super_block *sb, char *options,
* Force any potential flags that might be set from the mount
* to zero, ie, initialize to unset.
*/
- sb->s_flags &= ~MS_POSIXACL;
+ sb->s_flags &= ~SB_POSIXACL;
orangefs_sb->flags &= ~ORANGEFS_OPT_INTR;
orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK;
@@ -73,7 +73,7 @@ static int parse_mount_options(struct super_block *sb, char *options,
token = match_token(p, tokens, args);
switch (token) {
case Opt_acl:
- sb->s_flags |= MS_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
break;
case Opt_intr:
orangefs_sb->flags |= ORANGEFS_OPT_INTR;
@@ -507,7 +507,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
ret = orangefs_fill_sb(sb,
&new_op->downcall.resp.fs_mount, data,
- flags & MS_SILENT ? 1 : 0);
+ flags & SB_SILENT ? 1 : 0);
if (ret) {
d = ERR_PTR(ret);
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index 835c6e148afc..0577d6dba8c8 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -29,10 +29,10 @@ static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s
*/
void purge_waiting_ops(void)
{
- struct orangefs_kernel_op_s *op;
+ struct orangefs_kernel_op_s *op, *tmp;
spin_lock(&orangefs_request_list_lock);
- list_for_each_entry(op, &orangefs_request_list, list) {
+ list_for_each_entry_safe(op, tmp, &orangefs_request_list, list) {
gossip_debug(GOSSIP_WAIT_DEBUG,
"pvfs2-client-core: purging op tag %llu %s\n",
llu(op->tag),
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index cbfc196e5dc5..5ac415466861 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -24,6 +24,16 @@ config OVERLAY_FS_REDIRECT_DIR
an overlay which has redirects on a kernel that doesn't support this
feature will have unexpected results.
+config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW
+ bool "Overlayfs: follow redirects even if redirects are turned off"
+ default y
+ depends on OVERLAY_FS
+ help
+ Disable this to get a possibly more secure configuration, but that
+ might not be backward compatible with previous kernels.
+
+ For more information, see Documentation/filesystems/overlayfs.txt
+
config OVERLAY_FS_INDEX
bool "Overlayfs: turn on inodes index feature by default"
depends on OVERLAY_FS
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index e13921824c70..f9788bc116a8 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -887,7 +887,8 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir)
spin_unlock(&dentry->d_lock);
} else {
kfree(redirect);
- pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err);
+ pr_warn_ratelimited("overlayfs: failed to set redirect (%i)\n",
+ err);
/* Fall back to userspace copy-up */
err = -EXDEV;
}
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 625ed8066570..beb945e1963c 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -435,7 +435,7 @@ int ovl_verify_index(struct dentry *index, struct ovl_path *lower,
/* Check if index is orphan and don't warn before cleaning it */
if (d_inode(index)->i_nlink == 1 &&
- ovl_get_nlink(index, origin.dentry, 0) == 0)
+ ovl_get_nlink(origin.dentry, index, 0) == 0)
err = -ENOENT;
dput(origin.dentry);
@@ -681,6 +681,22 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (d.stop)
break;
+ /*
+ * Following redirects can have security consequences: it's like
+ * a symlink into the lower layer without the permission checks.
+ * This is only a problem if the upper layer is untrusted (e.g.
+ * comes from a USB drive). This can allow a non-readable file
+ * or directory to become readable.
+ *
+ * Following redirects only when redirects are enabled closes
+ * this attack vector when following is not necessary.
+ */
+ err = -EPERM;
+ if (d.redirect && !ofs->config.redirect_follow) {
+ pr_warn_ratelimited("overlay: refusing to follow redirect for (%pd2)\n", dentry);
+ goto out_put;
+ }
+
if (d.redirect && d.redirect[0] == '/' && poe != roe) {
poe = roe;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 13eab09a6b6f..b489099ccd49 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -180,7 +180,7 @@ static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode)
{
struct dentry *ret = vfs_tmpfile(dentry, mode, 0);
- int err = IS_ERR(ret) ? PTR_ERR(ret) : 0;
+ int err = PTR_ERR_OR_ZERO(ret);
pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
return ret;
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 752bab645879..9d0bc03bf6e4 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -14,6 +14,8 @@ struct ovl_config {
char *workdir;
bool default_permissions;
bool redirect_dir;
+ bool redirect_follow;
+ const char *redirect_mode;
bool index;
};
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 0daa4354fec4..8c98578d27a1 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -499,7 +499,7 @@ out:
return err;
fail:
- pr_warn_ratelimited("overlay: failed to look up (%s) for ino (%i)\n",
+ pr_warn_ratelimited("overlayfs: failed to look up (%s) for ino (%i)\n",
p->name, err);
goto out;
}
@@ -663,7 +663,10 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
return PTR_ERR(rdt.cache);
}
- return iterate_dir(od->realfile, &rdt.ctx);
+ err = iterate_dir(od->realfile, &rdt.ctx);
+ ctx->pos = rdt.ctx.pos;
+
+ return err;
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index be03578181d2..76440feb79f6 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -33,6 +33,13 @@ module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644);
MODULE_PARM_DESC(ovl_redirect_dir_def,
"Default to on or off for the redirect_dir feature");
+static bool ovl_redirect_always_follow =
+ IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW);
+module_param_named(redirect_always_follow, ovl_redirect_always_follow,
+ bool, 0644);
+MODULE_PARM_DESC(redirect_always_follow,
+ "Follow redirects even if redirect_dir feature is turned off");
+
static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX);
module_param_named(index, ovl_index_def, bool, 0644);
MODULE_PARM_DESC(ovl_index_def,
@@ -232,6 +239,7 @@ static void ovl_free_fs(struct ovl_fs *ofs)
kfree(ofs->config.lowerdir);
kfree(ofs->config.upperdir);
kfree(ofs->config.workdir);
+ kfree(ofs->config.redirect_mode);
if (ofs->creator_cred)
put_cred(ofs->creator_cred);
kfree(ofs);
@@ -244,6 +252,7 @@ static void ovl_put_super(struct super_block *sb)
ovl_free_fs(ofs);
}
+/* Sync real dirty inodes in upper filesystem (if it exists) */
static int ovl_sync_fs(struct super_block *sb, int wait)
{
struct ovl_fs *ofs = sb->s_fs_info;
@@ -252,14 +261,24 @@ static int ovl_sync_fs(struct super_block *sb, int wait)
if (!ofs->upper_mnt)
return 0;
- upper_sb = ofs->upper_mnt->mnt_sb;
- if (!upper_sb->s_op->sync_fs)
+
+ /*
+ * If this is a sync(2) call or an emergency sync, all the super blocks
+ * will be iterated, including upper_sb, so no need to do anything.
+ *
+ * If this is a syncfs(2) call, then we do need to call
+ * sync_filesystem() on upper_sb, but it is enough to do so when
+ * called with wait == 1.
+ */
+ if (!wait)
return 0;
- /* real inodes have already been synced by sync_filesystem(ovl_sb) */
+ upper_sb = ofs->upper_mnt->mnt_sb;
+
down_read(&upper_sb->s_umount);
- ret = upper_sb->s_op->sync_fs(upper_sb, wait);
+ ret = sync_filesystem(upper_sb);
up_read(&upper_sb->s_umount);
+
return ret;
}
@@ -295,6 +314,11 @@ static bool ovl_force_readonly(struct ovl_fs *ofs)
return (!ofs->upper_mnt || !ofs->workdir);
}
+static const char *ovl_redirect_mode_def(void)
+{
+ return ovl_redirect_dir_def ? "on" : "off";
+}
+
/**
* ovl_show_options
*
@@ -313,12 +337,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
}
if (ofs->config.default_permissions)
seq_puts(m, ",default_permissions");
- if (ofs->config.redirect_dir != ovl_redirect_dir_def)
- seq_printf(m, ",redirect_dir=%s",
- ofs->config.redirect_dir ? "on" : "off");
+ if (strcmp(ofs->config.redirect_mode, ovl_redirect_mode_def()) != 0)
+ seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode);
if (ofs->config.index != ovl_index_def)
- seq_printf(m, ",index=%s",
- ofs->config.index ? "on" : "off");
+ seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off");
return 0;
}
@@ -326,7 +348,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data)
{
struct ovl_fs *ofs = sb->s_fs_info;
- if (!(*flags & MS_RDONLY) && ovl_force_readonly(ofs))
+ if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs))
return -EROFS;
return 0;
@@ -348,8 +370,7 @@ enum {
OPT_UPPERDIR,
OPT_WORKDIR,
OPT_DEFAULT_PERMISSIONS,
- OPT_REDIRECT_DIR_ON,
- OPT_REDIRECT_DIR_OFF,
+ OPT_REDIRECT_DIR,
OPT_INDEX_ON,
OPT_INDEX_OFF,
OPT_ERR,
@@ -360,8 +381,7 @@ static const match_table_t ovl_tokens = {
{OPT_UPPERDIR, "upperdir=%s"},
{OPT_WORKDIR, "workdir=%s"},
{OPT_DEFAULT_PERMISSIONS, "default_permissions"},
- {OPT_REDIRECT_DIR_ON, "redirect_dir=on"},
- {OPT_REDIRECT_DIR_OFF, "redirect_dir=off"},
+ {OPT_REDIRECT_DIR, "redirect_dir=%s"},
{OPT_INDEX_ON, "index=on"},
{OPT_INDEX_OFF, "index=off"},
{OPT_ERR, NULL}
@@ -390,10 +410,37 @@ static char *ovl_next_opt(char **s)
return sbegin;
}
+static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode)
+{
+ if (strcmp(mode, "on") == 0) {
+ config->redirect_dir = true;
+ /*
+ * Does not make sense to have redirect creation without
+ * redirect following.
+ */
+ config->redirect_follow = true;
+ } else if (strcmp(mode, "follow") == 0) {
+ config->redirect_follow = true;
+ } else if (strcmp(mode, "off") == 0) {
+ if (ovl_redirect_always_follow)
+ config->redirect_follow = true;
+ } else if (strcmp(mode, "nofollow") != 0) {
+ pr_err("overlayfs: bad mount option \"redirect_dir=%s\"\n",
+ mode);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int ovl_parse_opt(char *opt, struct ovl_config *config)
{
char *p;
+ config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL);
+ if (!config->redirect_mode)
+ return -ENOMEM;
+
while ((p = ovl_next_opt(&opt)) != NULL) {
int token;
substring_t args[MAX_OPT_ARGS];
@@ -428,12 +475,11 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
config->default_permissions = true;
break;
- case OPT_REDIRECT_DIR_ON:
- config->redirect_dir = true;
- break;
-
- case OPT_REDIRECT_DIR_OFF:
- config->redirect_dir = false;
+ case OPT_REDIRECT_DIR:
+ kfree(config->redirect_mode);
+ config->redirect_mode = match_strdup(&args[0]);
+ if (!config->redirect_mode)
+ return -ENOMEM;
break;
case OPT_INDEX_ON:
@@ -458,7 +504,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
config->workdir = NULL;
}
- return 0;
+ return ovl_parse_redirect_mode(config, config->redirect_mode);
}
#define OVL_WORKDIR_NAME "work"
@@ -1160,7 +1206,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!cred)
goto out_err;
- ofs->config.redirect_dir = ovl_redirect_dir_def;
ofs->config.index = ovl_index_def;
err = ovl_parse_opt((char *) data, &ofs->config);
if (err)
@@ -1190,7 +1235,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
goto out_err;
if (!ofs->workdir)
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
sb->s_stack_depth = ofs->upper_mnt->mnt_sb->s_stack_depth;
sb->s_time_gran = ofs->upper_mnt->mnt_sb->s_time_gran;
@@ -1203,7 +1248,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
/* If the upper fs is nonexistent, we mark overlayfs r/o too */
if (!ofs->upper_mnt)
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
else if (ofs->upper_mnt->mnt_sb != ofs->same_sb)
ofs->same_sb = NULL;
@@ -1213,7 +1258,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
goto out_free_oe;
if (!ofs->indexdir)
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
/* Show index=off/on in /proc/mounts for any of the reasons above */
@@ -1227,7 +1272,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &ovl_super_operations;
sb->s_xattr = ovl_xattr_handlers;
sb->s_fs_info = ofs;
- sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK;
+ sb->s_flags |= SB_POSIXACL | SB_NOREMOTELOCK;
err = -ENOMEM;
root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 79375fc115d2..d67a72dcb92c 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -430,8 +430,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
* safe because the task has stopped executing permanently.
*/
if (permitted && (task->flags & PF_DUMPCORE)) {
- eip = KSTK_EIP(task);
- esp = KSTK_ESP(task);
+ if (try_get_task_stack(task)) {
+ eip = KSTK_EIP(task);
+ esp = KSTK_ESP(task);
+ put_task_stack(task);
+ }
}
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 31934cb9dfc8..60316b52d659 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -443,8 +443,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
save_stack_trace_tsk(task, &trace);
for (i = 0; i < trace.nr_entries; i++) {
- seq_printf(m, "[<%pK>] %pB\n",
- (void *)entries[i], (void *)entries[i]);
+ seq_printf(m, "[<0>] %pB\n", (void *)entries[i]);
}
unlock_trace(task);
}
@@ -2269,7 +2268,7 @@ static int show_timer(struct seq_file *m, void *v)
notify = timer->it_sigev_notify;
seq_printf(m, "ID: %d\n", timer->it_id);
- seq_printf(m, "signal: %d/%p\n",
+ seq_printf(m, "signal: %d/%px\n",
timer->sigq->info.si_signo,
timer->sigq->info.si_value.sival_ptr);
seq_printf(m, "notify: %s/%s.%d\n",
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 225f541f7078..dd0f82622427 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -483,7 +483,7 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
/* User space would break if executables or devices appear on proc */
s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
- s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
+ s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = PROC_SUPER_MAGIC;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4e42aba97f2e..ede8e64974be 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -91,7 +91,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
{
struct pid_namespace *ns;
- if (flags & MS_KERNMOUNT) {
+ if (flags & SB_KERNMOUNT) {
ns = data;
data = NULL;
} else {
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 7b635d173213..b786840facd9 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -45,10 +45,10 @@ struct proc_fs_info {
static int show_sb_opts(struct seq_file *m, struct super_block *sb)
{
static const struct proc_fs_info fs_info[] = {
- { MS_SYNCHRONOUS, ",sync" },
- { MS_DIRSYNC, ",dirsync" },
- { MS_MANDLOCK, ",mand" },
- { MS_LAZYTIME, ",lazytime" },
+ { SB_SYNCHRONOUS, ",sync" },
+ { SB_DIRSYNC, ",dirsync" },
+ { SB_MANDLOCK, ",mand" },
+ { SB_LAZYTIME, ",lazytime" },
{ 0, NULL }
};
const struct proc_fs_info *fs_infop;
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 3a67cfb142d8..3d46fe302fcb 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -47,7 +47,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
sync_filesystem(sb);
qs = qnx4_sb(sb);
qs->Version = QNX4_VERSION;
- *flags |= MS_RDONLY;
+ *flags |= SB_RDONLY;
return 0;
}
@@ -199,7 +199,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
s->s_op = &qnx4_sops;
s->s_magic = QNX4_SUPER_MAGIC;
- s->s_flags |= MS_RDONLY; /* Yup, read-only yet */
+ s->s_flags |= SB_RDONLY; /* Yup, read-only yet */
/* Check the superblock signature. Since the qnx4 code is
dangerous, we should leave as quickly as possible
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 1192422a1c56..4aeb26bcb4d0 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -56,7 +56,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root)
static int qnx6_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- *flags |= MS_RDONLY;
+ *flags |= SB_RDONLY;
return 0;
}
@@ -427,7 +427,7 @@ mmi_success:
}
s->s_op = &qnx6_sops;
s->s_magic = QNX6_SUPER_MAGIC;
- s->s_flags |= MS_RDONLY; /* Yup, read-only yet */
+ s->s_flags |= SB_RDONLY; /* Yup, read-only yet */
/* ease the later tree level calculations */
sbi = QNX6_SB(s);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 39f1b0b0c76f..020c597ef9b6 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -941,12 +941,13 @@ static int dqinit_needed(struct inode *inode, int type)
}
/* This routine is guarded by s_umount semaphore */
-static void add_dquot_ref(struct super_block *sb, int type)
+static int add_dquot_ref(struct super_block *sb, int type)
{
struct inode *inode, *old_inode = NULL;
#ifdef CONFIG_QUOTA_DEBUG
int reserved = 0;
#endif
+ int err = 0;
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
@@ -966,7 +967,11 @@ static void add_dquot_ref(struct super_block *sb, int type)
reserved = 1;
#endif
iput(old_inode);
- __dquot_initialize(inode, type);
+ err = __dquot_initialize(inode, type);
+ if (err) {
+ iput(inode);
+ goto out;
+ }
/*
* We hold a reference to 'inode' so it couldn't have been
@@ -981,7 +986,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
}
spin_unlock(&sb->s_inode_list_lock);
iput(old_inode);
-
+out:
#ifdef CONFIG_QUOTA_DEBUG
if (reserved) {
quota_error(sb, "Writes happened before quota was turned on "
@@ -989,6 +994,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
"Please run quotacheck(8)");
}
#endif
+ return err;
}
/*
@@ -2379,10 +2385,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
dqopt->flags |= dquot_state_flag(flags, type);
spin_unlock(&dq_state_lock);
- add_dquot_ref(sb, type);
-
- return 0;
+ error = add_dquot_ref(sb, type);
+ if (error)
+ dquot_disable(sb, type, flags);
+ return error;
out_file_init:
dqopt->files[type] = NULL;
iput(inode);
@@ -2985,7 +2992,8 @@ static int __init dquot_init(void)
pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
" %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
- register_shrinker(&dqcache_shrinker);
+ if (register_shrinker(&dqcache_shrinker))
+ panic("Cannot register dquot shrinker");
return 0;
}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 11a48affa882..b13fc024d2ee 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2106,7 +2106,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
journal_end(th);
goto out_inserted_sd;
}
- } else if (inode->i_sb->s_flags & MS_POSIXACL) {
+ } else if (inode->i_sb->s_flags & SB_POSIXACL) {
reiserfs_warning(inode->i_sb, "jdm-13090",
"ACLs aren't enabled in the fs, "
"but vfs thinks they are!");
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 69ff280bdfe8..70057359fbaf 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1960,7 +1960,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
/*
* Cancel flushing of old commits. Note that neither of these works
* will be requeued because superblock is being shutdown and doesn't
- * have MS_ACTIVE set.
+ * have SB_ACTIVE set.
*/
reiserfs_cancel_old_flush(sb);
/* wait for all commits to finish */
@@ -4302,7 +4302,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
* Avoid queueing work when sb is being shut down. Transaction
* will be flushed on journal shutdown.
*/
- if (sb->s_flags & MS_ACTIVE)
+ if (sb->s_flags & SB_ACTIVE)
queue_delayed_work(REISERFS_SB(sb)->commit_wq,
&journal->j_work, HZ / 10);
}
@@ -4393,7 +4393,7 @@ void reiserfs_abort_journal(struct super_block *sb, int errno)
if (!journal->j_errno)
journal->j_errno = errno;
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
set_bit(J_ABORTED, &journal->j_state);
#ifdef CONFIG_REISERFS_CHECK
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 64f49cafbc5b..7e288d97adcb 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -390,7 +390,7 @@ void __reiserfs_error(struct super_block *sb, const char *id,
return;
reiserfs_info(sb, "Remounting filesystem read-only\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
reiserfs_abort_journal(sb, -EIO);
}
@@ -409,7 +409,7 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id,
error_buf);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
reiserfs_abort_journal(sb, errno);
}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 5464ec517702..1fc934d24459 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -121,7 +121,7 @@ void reiserfs_schedule_old_flush(struct super_block *s)
* Avoid scheduling flush when sb is being shut down. It can race
* with journal shutdown and free still queued delayed work.
*/
- if (sb_rdonly(s) || !(s->s_flags & MS_ACTIVE))
+ if (sb_rdonly(s) || !(s->s_flags & SB_ACTIVE))
return;
spin_lock(&sbi->old_work_lock);
@@ -252,11 +252,11 @@ static int finish_unfinished(struct super_block *s)
#ifdef CONFIG_QUOTA
/* Needed for iput() to work correctly and not trash data */
- if (s->s_flags & MS_ACTIVE) {
+ if (s->s_flags & SB_ACTIVE) {
ms_active_set = 0;
} else {
ms_active_set = 1;
- s->s_flags |= MS_ACTIVE;
+ s->s_flags |= SB_ACTIVE;
}
/* Turn on quotas so that they are updated correctly */
for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
@@ -411,7 +411,7 @@ static int finish_unfinished(struct super_block *s)
reiserfs_write_lock(s);
if (ms_active_set)
/* Restore the flag back */
- s->s_flags &= ~MS_ACTIVE;
+ s->s_flags &= ~SB_ACTIVE;
#endif
pathrelse(&path);
if (done)
@@ -1521,7 +1521,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
goto out_err_unlock;
}
- if (*mount_flags & MS_RDONLY) {
+ if (*mount_flags & SB_RDONLY) {
reiserfs_write_unlock(s);
reiserfs_xattr_init(s, *mount_flags);
/* remount read-only */
@@ -1567,7 +1567,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
/* now it is safe to call journal_begin */
- s->s_flags &= ~MS_RDONLY;
+ s->s_flags &= ~SB_RDONLY;
err = journal_begin(&th, s, 10);
if (err)
goto out_err_unlock;
@@ -1575,7 +1575,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
/* Mount a partition which is read-only, read-write */
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
- s->s_flags &= ~MS_RDONLY;
+ s->s_flags &= ~SB_RDONLY;
set_sb_umount_state(rs, REISERFS_ERROR_FS);
if (!old_format_only(s))
set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
@@ -1590,7 +1590,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
goto out_err_unlock;
reiserfs_write_unlock(s);
- if (!(*mount_flags & MS_RDONLY)) {
+ if (!(*mount_flags & SB_RDONLY)) {
dquot_resume(s, -1);
reiserfs_write_lock(s);
finish_unfinished(s);
@@ -2055,7 +2055,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) {
SWARN(silent, s, "clm-7000",
"Detected readonly device, marking FS readonly");
- s->s_flags |= MS_RDONLY;
+ s->s_flags |= SB_RDONLY;
}
args.objectid = REISERFS_ROOT_OBJECTID;
args.dirid = REISERFS_ROOT_PARENT_OBJECTID;
@@ -2591,7 +2591,6 @@ out:
return err;
if (inode->i_size < off + len - towrite)
i_size_write(inode, off + len - towrite);
- inode->i_version++;
inode->i_mtime = inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
return len - towrite;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 46492fb37a4c..5dbf5324bdda 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -959,7 +959,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
/*
* We need to take a copy of the mount flags since things like
- * MS_RDONLY don't get set until *after* we're called.
+ * SB_RDONLY don't get set until *after* we're called.
* mount_flags != mount_options
*/
int reiserfs_xattr_init(struct super_block *s, int mount_flags)
@@ -971,7 +971,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
if (err)
goto error;
- if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) {
+ if (d_really_is_negative(privroot) && !(mount_flags & SB_RDONLY)) {
inode_lock(d_inode(s->s_root));
err = create_privroot(REISERFS_SB(s)->priv_root);
inode_unlock(d_inode(s->s_root));
@@ -999,11 +999,11 @@ error:
clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt);
}
- /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */
+ /* The super_block SB_POSIXACL must mirror the (no)acl mount option. */
if (reiserfs_posixacl(s))
- s->s_flags |= MS_POSIXACL;
+ s->s_flags |= SB_POSIXACL;
else
- s->s_flags &= ~MS_POSIXACL;
+ s->s_flags &= ~SB_POSIXACL;
return err;
}
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 0186fe6d39f3..8f06fd1f3d69 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -451,7 +451,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
static int romfs_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- *flags |= MS_RDONLY;
+ *flags |= SB_RDONLY;
return 0;
}
@@ -502,7 +502,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_maxbytes = 0xFFFFFFFF;
sb->s_magic = ROMFS_MAGIC;
- sb->s_flags |= MS_RDONLY | MS_NOATIME;
+ sb->s_flags |= SB_RDONLY | SB_NOATIME;
sb->s_op = &romfs_super_ops;
#ifdef CONFIG_ROMFS_ON_MTD
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index cf01e15a7b16..8a73b97217c8 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -195,7 +195,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
(u64) le64_to_cpu(sblk->id_table_start));
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
sb->s_op = &squashfs_super_ops;
err = -ENOMEM;
@@ -373,7 +373,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
static int squashfs_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
- *flags |= MS_RDONLY;
+ *flags |= SB_RDONLY;
return 0;
}
diff --git a/fs/statfs.c b/fs/statfs.c
index b072a8bab71a..5b2a24f0f263 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -35,11 +35,11 @@ static int flags_by_mnt(int mnt_flags)
static int flags_by_sb(int s_flags)
{
int flags = 0;
- if (s_flags & MS_SYNCHRONOUS)
+ if (s_flags & SB_SYNCHRONOUS)
flags |= ST_SYNCHRONOUS;
- if (s_flags & MS_MANDLOCK)
+ if (s_flags & SB_MANDLOCK)
flags |= ST_MANDLOCK;
- if (s_flags & MS_RDONLY)
+ if (s_flags & SB_RDONLY)
flags |= ST_RDONLY;
return flags;
}
diff --git a/fs/super.c b/fs/super.c
index d4e33e8f1e6f..06bd25d90ba5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -191,6 +191,24 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
INIT_LIST_HEAD(&s->s_mounts);
s->s_user_ns = get_user_ns(user_ns);
+ init_rwsem(&s->s_umount);
+ lockdep_set_class(&s->s_umount, &type->s_umount_key);
+ /*
+ * sget() can have s_umount recursion.
+ *
+ * When it cannot find a suitable sb, it allocates a new
+ * one (this one), and tries again to find a suitable old
+ * one.
+ *
+ * In case that succeeds, it will acquire the s_umount
+ * lock of the old one. Since these are clearly distinct
+ * locks, and this object isn't exposed yet, there's no
+ * risk of deadlocks.
+ *
+ * Annotate this by putting this lock in a different
+ * subclass.
+ */
+ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
if (security_sb_alloc(s))
goto fail;
@@ -218,25 +236,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
goto fail;
if (list_lru_init_memcg(&s->s_inode_lru))
goto fail;
-
- init_rwsem(&s->s_umount);
- lockdep_set_class(&s->s_umount, &type->s_umount_key);
- /*
- * sget() can have s_umount recursion.
- *
- * When it cannot find a suitable sb, it allocates a new
- * one (this one), and tries again to find a suitable old
- * one.
- *
- * In case that succeeds, it will acquire the s_umount
- * lock of the old one. Since these are clearly distrinct
- * locks, and this object isn't exposed yet, there's no
- * risk of deadlocks.
- *
- * Annotate this by putting this lock in a different
- * subclass.
- */
- down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
s->s_count = 1;
atomic_set(&s->s_active, 1);
mutex_init(&s->s_vfs_rename_mutex);
@@ -518,7 +517,11 @@ retry:
hlist_add_head(&s->s_instances, &type->fs_supers);
spin_unlock(&sb_lock);
get_filesystem(type);
- register_shrinker(&s->s_shrink);
+ err = register_shrinker(&s->s_shrink);
+ if (err) {
+ deactivate_locked_super(s);
+ s = ERR_PTR(err);
+ }
return s;
}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 20b8f82e115b..fb49510c5dcf 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -30,7 +30,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
void *ns;
bool new_sb;
- if (!(flags & MS_KERNMOUNT)) {
+ if (!(flags & SB_KERNMOUNT)) {
if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
return ERR_PTR(-EPERM);
}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3c47b7d5d4cf..bec9f79adb25 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -63,7 +63,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data)
sync_filesystem(sb);
if (sbi->s_forced_ro)
- *flags |= MS_RDONLY;
+ *flags |= SB_RDONLY;
return 0;
}
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 0d56e486b392..89765ddfb738 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -333,7 +333,7 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
/* set up enough so that it can read an inode */
sb->s_op = &sysv_sops;
if (sbi->s_forced_ro)
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
if (sbi->s_truncate)
sb->s_d_op = &sysv_dentry_operations;
root_inode = sysv_iget(sb, SYSV_ROOT_INO);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 417fe0b29f23..a2ea4856e67b 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -220,20 +220,9 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
- if (ubifs_crypt_is_encrypted(dir)) {
- err = fscrypt_get_encryption_info(dir);
-
- /*
- * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
- * created while the directory was encrypted and we
- * have access to the key.
- */
- if (fscrypt_has_encryption_key(dir))
- fscrypt_set_encrypted_dentry(dentry);
- fscrypt_set_d_op(dentry);
- if (err && err != -ENOKEY)
- return ERR_PTR(err);
- }
+ err = fscrypt_prepare_lookup(dir, dentry, flags);
+ if (err)
+ return ERR_PTR(err);
err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm);
if (err)
@@ -743,9 +732,9 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
ubifs_assert(inode_is_locked(dir));
ubifs_assert(inode_is_locked(inode));
- if (ubifs_crypt_is_encrypted(dir) &&
- !fscrypt_has_permitted_context(dir, inode))
- return -EPERM;
+ err = fscrypt_prepare_link(old_dentry, dir, dentry);
+ if (err)
+ return err;
err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
if (err)
@@ -1353,12 +1342,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlink)
ubifs_assert(inode_is_locked(new_inode));
- if (old_dir != new_dir) {
- if (ubifs_crypt_is_encrypted(new_dir) &&
- !fscrypt_has_permitted_context(new_dir, old_inode))
- return -EPERM;
- }
-
if (unlink && is_dir) {
err = ubifs_check_dir_empty(new_inode);
if (err)
@@ -1573,13 +1556,6 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
ubifs_assert(fst_inode && snd_inode);
- if ((ubifs_crypt_is_encrypted(old_dir) ||
- ubifs_crypt_is_encrypted(new_dir)) &&
- (old_dir != new_dir) &&
- (!fscrypt_has_permitted_context(new_dir, fst_inode) ||
- !fscrypt_has_permitted_context(old_dir, snd_inode)))
- return -EPERM;
-
err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &fst_nm);
if (err)
return err;
@@ -1624,12 +1600,19 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
+ int err;
+
if (flags & ~(RENAME_NOREPLACE | RENAME_WHITEOUT | RENAME_EXCHANGE))
return -EINVAL;
ubifs_assert(inode_is_locked(old_dir));
ubifs_assert(inode_is_locked(new_dir));
+ err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry,
+ flags);
+ if (err)
+ return err;
+
if (flags & RENAME_EXCHANGE)
return ubifs_xrename(old_dir, old_dentry, new_dir, new_dentry);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index a02aa59d1e24..9fe194a4fa9b 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1284,13 +1284,9 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
return err;
- if (ubifs_crypt_is_encrypted(inode) && (attr->ia_valid & ATTR_SIZE)) {
- err = fscrypt_get_encryption_info(inode);
- if (err)
- return err;
- if (!fscrypt_has_encryption_key(inode))
- return -ENOKEY;
- }
+ err = fscrypt_prepare_setattr(dentry, attr);
+ if (err)
+ return err;
if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size)
/* Truncation to a smaller size */
@@ -1406,7 +1402,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time,
if (flags & S_MTIME)
inode->i_mtime = *time;
- if (!(inode->i_sb->s_flags & MS_LAZYTIME))
+ if (!(inode->i_sb->s_flags & SB_LAZYTIME))
iflags |= I_DIRTY_SYNC;
release = ui->dirty;
@@ -1629,35 +1625,6 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static int ubifs_file_open(struct inode *inode, struct file *filp)
-{
- int ret;
- struct dentry *dir;
- struct ubifs_info *c = inode->i_sb->s_fs_info;
-
- if (ubifs_crypt_is_encrypted(inode)) {
- ret = fscrypt_get_encryption_info(inode);
- if (ret)
- return -EACCES;
- if (!fscrypt_has_encryption_key(inode))
- return -ENOKEY;
- }
-
- dir = dget_parent(file_dentry(filp));
- if (ubifs_crypt_is_encrypted(d_inode(dir)) &&
- !fscrypt_has_permitted_context(d_inode(dir), inode)) {
- ubifs_err(c, "Inconsistent encryption contexts: %lu/%lu",
- (unsigned long) d_inode(dir)->i_ino,
- (unsigned long) inode->i_ino);
- dput(dir);
- ubifs_ro_mode(c, -EPERM);
- return -EPERM;
- }
- dput(dir);
-
- return 0;
-}
-
static const char *ubifs_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
@@ -1746,7 +1713,7 @@ const struct file_operations ubifs_file_operations = {
.unlocked_ioctl = ubifs_ioctl,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .open = ubifs_file_open,
+ .open = fscrypt_file_open,
#ifdef CONFIG_COMPAT
.compat_ioctl = ubifs_compat_ioctl,
#endif
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 3be28900bf37..fe77e9625e84 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -84,7 +84,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
if (!c->ro_error) {
c->ro_error = 1;
c->no_chk_data_crc = 0;
- c->vfs_sb->s_flags |= MS_RDONLY;
+ c->vfs_sb->s_flags |= SB_RDONLY;
ubifs_warn(c, "switched to read-only mode, error %d", err);
dump_stack();
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 7503e7cdf870..0beb285b143d 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -968,7 +968,7 @@ static int parse_standard_option(const char *option)
pr_notice("UBIFS: parse %s\n", option);
if (!strcmp(option, "sync"))
- return MS_SYNCHRONOUS;
+ return SB_SYNCHRONOUS;
return 0;
}
@@ -1160,8 +1160,8 @@ static int mount_ubifs(struct ubifs_info *c)
size_t sz;
c->ro_mount = !!sb_rdonly(c->vfs_sb);
- /* Suppress error messages while probing if MS_SILENT is set */
- c->probing = !!(c->vfs_sb->s_flags & MS_SILENT);
+ /* Suppress error messages while probing if SB_SILENT is set */
+ c->probing = !!(c->vfs_sb->s_flags & SB_SILENT);
err = init_constants_early(c);
if (err)
@@ -1852,7 +1852,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
return err;
}
- if (c->ro_mount && !(*flags & MS_RDONLY)) {
+ if (c->ro_mount && !(*flags & SB_RDONLY)) {
if (c->ro_error) {
ubifs_msg(c, "cannot re-mount R/W due to prior errors");
return -EROFS;
@@ -1864,7 +1864,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
err = ubifs_remount_rw(c);
if (err)
return err;
- } else if (!c->ro_mount && (*flags & MS_RDONLY)) {
+ } else if (!c->ro_mount && (*flags & SB_RDONLY)) {
if (c->ro_error) {
ubifs_msg(c, "cannot re-mount R/O due to prior errors");
return -EROFS;
@@ -2117,7 +2117,7 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
*/
ubi = open_ubi(name, UBI_READONLY);
if (IS_ERR(ubi)) {
- if (!(flags & MS_SILENT))
+ if (!(flags & SB_SILENT))
pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
current->pid, name, (int)PTR_ERR(ubi));
return ERR_CAST(ubi);
@@ -2143,18 +2143,18 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
kfree(c);
/* A new mount point for already mounted UBIFS */
dbg_gen("this ubi volume is already mounted");
- if (!!(flags & MS_RDONLY) != c1->ro_mount) {
+ if (!!(flags & SB_RDONLY) != c1->ro_mount) {
err = -EBUSY;
goto out_deact;
}
} else {
- err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
+ err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
if (err)
goto out_deact;
/* We do not support atime */
- sb->s_flags |= MS_ACTIVE;
+ sb->s_flags |= SB_ACTIVE;
#ifndef CONFIG_UBIFS_ATIME_SUPPORT
- sb->s_flags |= MS_NOATIME;
+ sb->s_flags |= SB_NOATIME;
#else
ubifs_msg(c, "full atime support is enabled.");
#endif
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 0a213dcba2a1..ba3d0e0f8615 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1890,35 +1890,28 @@ static int search_dh_cookie(struct ubifs_info *c, const union ubifs_key *key,
union ubifs_key *dkey;
for (;;) {
- if (!err) {
- err = tnc_next(c, &znode, n);
- if (err)
- goto out;
- }
-
zbr = &znode->zbranch[*n];
dkey = &zbr->key;
if (key_inum(c, dkey) != key_inum(c, key) ||
key_type(c, dkey) != key_type(c, key)) {
- err = -ENOENT;
- goto out;
+ return -ENOENT;
}
err = tnc_read_hashed_node(c, zbr, dent);
if (err)
- goto out;
+ return err;
if (key_hash(c, key) == key_hash(c, dkey) &&
le32_to_cpu(dent->cookie) == cookie) {
*zn = znode;
- goto out;
+ return 0;
}
- }
-
-out:
- return err;
+ err = tnc_next(c, &znode, n);
+ if (err)
+ return err;
+ }
}
static int do_lookup_dh(struct ubifs_info *c, const union ubifs_key *key,
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 63c7468147eb..5ee7af879cc4 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1201,7 +1201,7 @@ struct ubifs_debug_info;
* @need_recovery: %1 if the file-system needs recovery
* @replaying: %1 during journal replay
* @mounting: %1 while mounting
- * @probing: %1 while attempting to mount if MS_SILENT mount flag is set
+ * @probing: %1 while attempting to mount if SB_SILENT mount flag is set
* @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
* @replay_list: temporary list used during journal replay
* @replay_buds: list of buds to replay
@@ -1850,7 +1850,7 @@ __printf(2, 3)
void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...);
/*
* A conditional variant of 'ubifs_err()' which doesn't output anything
- * if probing (ie. MS_SILENT set).
+ * if probing (ie. SB_SILENT set).
*/
#define ubifs_errc(c, fmt, ...) \
do { \
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 5ddc89d564fd..759f1a209dbb 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -381,8 +381,6 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
if (buf) {
/* If @buf is %NULL we are supposed to return the length */
if (ui->data_len > size) {
- ubifs_err(c, "buffer size %zd, xattr len %d",
- size, ui->data_len);
err = -ERANGE;
goto out_iput;
}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index f80e0a0f24d3..f73239a9a97d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -650,7 +650,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
sync_filesystem(sb);
if (lvidiu) {
int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
- if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
+ if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & SB_RDONLY))
return -EACCES;
}
@@ -673,10 +673,10 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
sbi->s_dmode = uopt.dmode;
write_unlock(&sbi->s_cred_lock);
- if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb))
+ if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out_unlock;
- if (*flags & MS_RDONLY)
+ if (*flags & SB_RDONLY)
udf_close_lvid(sb);
else
udf_open_lvid(sb);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index b5cd79065ef9..e727ee07dbe4 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -115,7 +115,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
ubh_mark_buffer_dirty (USPI_UBH(uspi));
ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
ubh_sync_block(UCPI_UBH(ucpi));
ufs_mark_sb_dirty(sb);
@@ -205,7 +205,7 @@ do_more:
ubh_mark_buffer_dirty (USPI_UBH(uspi));
ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
ubh_sync_block(UCPI_UBH(ucpi));
if (overflow) {
@@ -567,7 +567,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
ubh_mark_buffer_dirty (USPI_UBH(uspi));
ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
ubh_sync_block(UCPI_UBH(ucpi));
ufs_mark_sb_dirty(sb);
@@ -688,7 +688,7 @@ cg_found:
succed:
ubh_mark_buffer_dirty (USPI_UBH(uspi));
ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
ubh_sync_block(UCPI_UBH(ucpi));
ufs_mark_sb_dirty(sb);
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 2edc1755b7c5..50dfce000864 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -20,6 +20,7 @@
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/swap.h>
+#include <linux/iversion.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -47,7 +48,7 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len)
struct inode *dir = mapping->host;
int err = 0;
- dir->i_version++;
+ inode_inc_iversion(dir);
block_write_end(NULL, mapping, pos, len, len, page, NULL);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
@@ -428,7 +429,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
- int need_revalidate = file->f_version != inode->i_version;
+ bool need_revalidate = inode_cmp_iversion(inode, file->f_version);
unsigned flags = UFS_SB(sb)->s_flags;
UFSD("BEGIN\n");
@@ -455,8 +456,8 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
ctx->pos = (n<<PAGE_SHIFT) + offset;
}
- file->f_version = inode->i_version;
- need_revalidate = 0;
+ file->f_version = inode_query_iversion(inode);
+ need_revalidate = false;
}
de = (struct ufs_dir_entry *)(kaddr+offset);
limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 916b4a428933..e1ef0f0a1353 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -112,7 +112,7 @@ void ufs_free_inode (struct inode * inode)
ubh_mark_buffer_dirty (USPI_UBH(uspi));
ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
ubh_sync_block(UCPI_UBH(ucpi));
ufs_mark_sb_dirty(sb);
@@ -146,14 +146,14 @@ static void ufs2_init_inodes_chunk(struct super_block *sb,
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
unlock_buffer(bh);
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
sync_dirty_buffer(bh);
brelse(bh);
}
fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb);
ubh_mark_buffer_dirty(UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
ubh_sync_block(UCPI_UBH(ucpi));
UFSD("EXIT\n");
@@ -284,7 +284,7 @@ cg_found:
}
ubh_mark_buffer_dirty (USPI_UBH(uspi));
ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
ubh_sync_block(UCPI_UBH(ucpi));
ufs_mark_sb_dirty(sb);
@@ -330,7 +330,7 @@ cg_found:
ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, ts.tv_nsec);
mark_buffer_dirty(bh);
unlock_buffer(bh);
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
sync_dirty_buffer(bh);
brelse(bh);
}
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index afb601c0dda0..c843ec858cf7 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -36,6 +36,7 @@
#include <linux/mm.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/iversion.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -693,7 +694,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino)
if (err)
goto bad_inode;
- inode->i_version++;
+ inode_inc_iversion(inode);
ufsi->i_lastfrag =
(inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
ufsi->i_dir_start_lookup = 0;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 6440003f8ddc..b6ba80e05bff 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -88,6 +88,7 @@
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
+#include <linux/iversion.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -282,7 +283,7 @@ void ufs_error (struct super_block * sb, const char * function,
usb1->fs_clean = UFS_FSBAD;
ubh_mark_buffer_dirty(USPI_UBH(uspi));
ufs_mark_sb_dirty(sb);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
va_start(args, fmt);
vaf.fmt = fmt;
@@ -320,7 +321,7 @@ void ufs_panic (struct super_block * sb, const char * function,
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
pr_crit("panic (device %s): %s: %pV\n",
sb->s_id, function, &vaf);
va_end(args);
@@ -905,7 +906,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
if (!sb_rdonly(sb)) {
if (!silent)
pr_info("ufstype=old is supported read-only\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
break;
@@ -921,7 +922,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
if (!sb_rdonly(sb)) {
if (!silent)
pr_info("ufstype=nextstep is supported read-only\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
break;
@@ -937,7 +938,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
if (!sb_rdonly(sb)) {
if (!silent)
pr_info("ufstype=nextstep-cd is supported read-only\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
break;
@@ -953,7 +954,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
if (!sb_rdonly(sb)) {
if (!silent)
pr_info("ufstype=openstep is supported read-only\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
break;
@@ -968,7 +969,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
if (!sb_rdonly(sb)) {
if (!silent)
pr_info("ufstype=hp is supported read-only\n");
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
break;
default:
@@ -1125,21 +1126,21 @@ magic_found:
break;
case UFS_FSACTIVE:
pr_err("%s(): fs is active\n", __func__);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
break;
case UFS_FSBAD:
pr_err("%s(): fs is bad\n", __func__);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
break;
default:
pr_err("%s(): can't grok fs_clean 0x%x\n",
__func__, usb1->fs_clean);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
break;
}
} else {
pr_err("%s(): fs needs fsck\n", __func__);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
}
/*
@@ -1328,7 +1329,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
return -EINVAL;
}
- if ((bool)(*mount_flags & MS_RDONLY) == sb_rdonly(sb)) {
+ if ((bool)(*mount_flags & SB_RDONLY) == sb_rdonly(sb)) {
UFS_SB(sb)->s_mount_opt = new_mount_opt;
mutex_unlock(&UFS_SB(sb)->s_lock);
return 0;
@@ -1337,7 +1338,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
/*
* fs was mouted as rw, remounting ro
*/
- if (*mount_flags & MS_RDONLY) {
+ if (*mount_flags & SB_RDONLY) {
ufs_put_super_internal(sb);
usb1->fs_time = cpu_to_fs32(sb, get_seconds());
if ((flags & UFS_ST_MASK) == UFS_ST_SUN
@@ -1346,7 +1347,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
ufs_set_fs_state(sb, usb1, usb3,
UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
ubh_mark_buffer_dirty (USPI_UBH(uspi));
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
} else {
/*
* fs was mounted as ro, remounting rw
@@ -1370,7 +1371,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
mutex_unlock(&UFS_SB(sb)->s_lock);
return -EPERM;
}
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
#endif
}
UFS_SB(sb)->s_mount_opt = new_mount_opt;
@@ -1440,7 +1441,7 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
if (!ei)
return NULL;
- ei->vfs_inode.i_version = 1;
+ inode_set_iversion(&ei->vfs_inode, 1);
seqlock_init(&ei->meta_lock);
mutex_init(&ei->truncate_mutex);
return &ei->vfs_inode;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index ac9a4e65ca49..41a75f9f23fd 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -570,11 +570,14 @@ out:
static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
struct userfaultfd_wait_queue *ewq)
{
+ struct userfaultfd_ctx *release_new_ctx;
+
if (WARN_ON_ONCE(current->flags & PF_EXITING))
goto out;
ewq->ctx = ctx;
init_waitqueue_entry(&ewq->wq, current);
+ release_new_ctx = NULL;
spin_lock(&ctx->event_wqh.lock);
/*
@@ -601,8 +604,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
new = (struct userfaultfd_ctx *)
(unsigned long)
ewq->msg.arg.reserved.reserved1;
-
- userfaultfd_ctx_put(new);
+ release_new_ctx = new;
}
break;
}
@@ -617,6 +619,20 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
__set_current_state(TASK_RUNNING);
spin_unlock(&ctx->event_wqh.lock);
+ if (release_new_ctx) {
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = release_new_ctx->mm;
+
+ /* the various vma->vm_userfaultfd_ctx still point to it */
+ down_write(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx)
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ up_write(&mm->mmap_sem);
+
+ userfaultfd_ctx_put(release_new_ctx);
+ }
+
/*
* ctx may go away after this if the userfault pseudo fd is
* already released.
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 0da80019a917..83ed7715f856 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -702,7 +702,7 @@ xfs_alloc_ag_vextent(
ASSERT(args->agbno % args->alignment == 0);
/* if not file data, insert new block into the reverse map btree */
- if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) {
+ if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
args->agbno, args->len, &args->oinfo);
if (error)
@@ -1682,7 +1682,7 @@ xfs_free_ag_extent(
bno_cur = cnt_cur = NULL;
mp = tp->t_mountp;
- if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) {
+ if (!xfs_rmap_should_skip_owner_update(oinfo)) {
error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
if (error)
goto error0;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 6249c92671de..a76914db72ef 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -212,6 +212,7 @@ xfs_attr_set(
int flags)
{
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *leaf_bp = NULL;
struct xfs_da_args args;
struct xfs_defer_ops dfops;
struct xfs_trans_res tres;
@@ -327,9 +328,16 @@ xfs_attr_set(
* GROT: another possible req'mt for a double-split btree op.
*/
xfs_defer_init(args.dfops, args.firstblock);
- error = xfs_attr_shortform_to_leaf(&args);
+ error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
if (error)
goto out_defer_cancel;
+ /*
+ * Prevent the leaf buffer from being unlocked so that a
+ * concurrent AIL push cannot grab the half-baked leaf
+ * buffer and run into problems with the write verifier.
+ */
+ xfs_trans_bhold(args.trans, leaf_bp);
+ xfs_defer_bjoin(args.dfops, leaf_bp);
xfs_defer_ijoin(args.dfops, dp);
error = xfs_defer_finish(&args.trans, args.dfops);
if (error)
@@ -337,13 +345,14 @@ xfs_attr_set(
/*
* Commit the leaf transformation. We'll need another (linked)
- * transaction to add the new attribute to the leaf.
+ * transaction to add the new attribute to the leaf, which
+ * means that we have to hold & join the leaf buffer here too.
*/
-
error = xfs_trans_roll_inode(&args.trans, dp);
if (error)
goto out;
-
+ xfs_trans_bjoin(args.trans, leaf_bp);
+ leaf_bp = NULL;
}
if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
@@ -374,8 +383,9 @@ xfs_attr_set(
out_defer_cancel:
xfs_defer_cancel(&dfops);
- args.trans = NULL;
out:
+ if (leaf_bp)
+ xfs_trans_brelse(args.trans, leaf_bp);
if (args.trans)
xfs_trans_cancel(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 53cc8b986eac..601eaa36f1ad 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -735,10 +735,13 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
}
/*
- * Convert from using the shortform to the leaf.
+ * Convert from using the shortform to the leaf. On success, return the
+ * buffer so that we can keep it locked until we're totally done with it.
*/
int
-xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
+xfs_attr_shortform_to_leaf(
+ struct xfs_da_args *args,
+ struct xfs_buf **leaf_bp)
{
xfs_inode_t *dp;
xfs_attr_shortform_t *sf;
@@ -818,7 +821,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
}
error = 0;
-
+ *leaf_bp = bp;
out:
kmem_free(tmpbuffer);
return error;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index f7dda0c237b0..894124efb421 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -48,7 +48,8 @@ void xfs_attr_shortform_create(struct xfs_da_args *args);
void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
int xfs_attr_shortform_lookup(struct xfs_da_args *args);
int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
-int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
+ struct xfs_buf **leaf_bp);
int xfs_attr_shortform_remove(struct xfs_da_args *args);
int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 08df809e2315..1bddbba6b80c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5136,7 +5136,7 @@ __xfs_bunmapi(
* blowing out the transaction with a mix of EFIs and reflink
* adjustments.
*/
- if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
+ if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
else
max_len = len;
@@ -5662,7 +5662,8 @@ xfs_bmap_collapse_extents(
*done = true;
goto del_cursor;
}
- XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
+ XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock),
+ del_cursor);
new_startoff = got.br_startoff - offset_shift_fsb;
if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) {
@@ -5767,7 +5768,8 @@ xfs_bmap_insert_extents(
goto del_cursor;
}
}
- XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
+ XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock),
+ del_cursor);
if (stop_fsb >= got.br_startoff + got.br_blockcount) {
error = -EIO;
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 072ebfe1d6ae..087fea02c389 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -249,6 +249,10 @@ xfs_defer_trans_roll(
for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++)
xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE);
+ /* Hold the (previously bjoin'd) buffer locked across the roll. */
+ for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++)
+ xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]);
+
trace_xfs_defer_trans_roll((*tp)->t_mountp, dop);
/* Roll the transaction. */
@@ -264,6 +268,12 @@ xfs_defer_trans_roll(
for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++)
xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0);
+ /* Rejoin the buffers and dirty them so the log moves forward. */
+ for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) {
+ xfs_trans_bjoin(*tp, dop->dop_bufs[i]);
+ xfs_trans_bhold(*tp, dop->dop_bufs[i]);
+ }
+
return error;
}
@@ -295,6 +305,31 @@ xfs_defer_ijoin(
}
}
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Add this buffer to the deferred op. Each joined buffer is relogged
+ * each time we roll the transaction.
+ */
+int
+xfs_defer_bjoin(
+ struct xfs_defer_ops *dop,
+ struct xfs_buf *bp)
+{
+ int i;
+
+ for (i = 0; i < XFS_DEFER_OPS_NR_BUFS; i++) {
+ if (dop->dop_bufs[i] == bp)
+ return 0;
+ else if (dop->dop_bufs[i] == NULL) {
+ dop->dop_bufs[i] = bp;
+ return 0;
+ }
+ }
+
+ ASSERT(0);
return -EFSCORRUPTED;
}
@@ -493,9 +528,7 @@ xfs_defer_init(
struct xfs_defer_ops *dop,
xfs_fsblock_t *fbp)
{
- dop->dop_committed = false;
- dop->dop_low = false;
- memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes));
+ memset(dop, 0, sizeof(struct xfs_defer_ops));
*fbp = NULLFSBLOCK;
INIT_LIST_HEAD(&dop->dop_intake);
INIT_LIST_HEAD(&dop->dop_pending);
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index d4f046dd44bd..045beacdd37d 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -59,6 +59,7 @@ enum xfs_defer_ops_type {
};
#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
+#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
struct xfs_defer_ops {
bool dop_committed; /* did any trans commit? */
@@ -66,8 +67,9 @@ struct xfs_defer_ops {
struct list_head dop_intake; /* unlogged pending work */
struct list_head dop_pending; /* logged pending work */
- /* relog these inodes with each roll */
+ /* relog these with each roll */
struct xfs_inode *dop_inodes[XFS_DEFER_OPS_NR_INODES];
+ struct xfs_buf *dop_bufs[XFS_DEFER_OPS_NR_BUFS];
};
void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type,
@@ -77,6 +79,7 @@ void xfs_defer_cancel(struct xfs_defer_ops *dop);
void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp);
bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop);
int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip);
+int xfs_defer_bjoin(struct xfs_defer_ops *dop, struct xfs_buf *bp);
/* Description of a deferred type. */
struct xfs_defer_op_type {
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index de3f04a98656..3b57ef0f2f76 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -920,8 +920,7 @@ STATIC xfs_agnumber_t
xfs_ialloc_ag_select(
xfs_trans_t *tp, /* transaction pointer */
xfs_ino_t parent, /* parent directory inode number */
- umode_t mode, /* bits set to indicate file type */
- int okalloc) /* ok to allocate more space */
+ umode_t mode) /* bits set to indicate file type */
{
xfs_agnumber_t agcount; /* number of ag's in the filesystem */
xfs_agnumber_t agno; /* current ag number */
@@ -978,9 +977,6 @@ xfs_ialloc_ag_select(
return agno;
}
- if (!okalloc)
- goto nextag;
-
if (!pag->pagf_init) {
error = xfs_alloc_pagf_init(mp, tp, agno, flags);
if (error)
@@ -1680,7 +1676,6 @@ xfs_dialloc(
struct xfs_trans *tp,
xfs_ino_t parent,
umode_t mode,
- int okalloc,
struct xfs_buf **IO_agbp,
xfs_ino_t *inop)
{
@@ -1692,6 +1687,7 @@ xfs_dialloc(
int noroom = 0;
xfs_agnumber_t start_agno;
struct xfs_perag *pag;
+ int okalloc = 1;
if (*IO_agbp) {
/*
@@ -1707,7 +1703,7 @@ xfs_dialloc(
* We do not have an agbp, so select an initial allocation
* group for inode allocation.
*/
- start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
+ start_agno = xfs_ialloc_ag_select(tp, parent, mode);
if (start_agno == NULLAGNUMBER) {
*inop = NULLFSINO;
return 0;
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index d2bdcd5e7312..66a8de0b1caa 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -81,7 +81,6 @@ xfs_dialloc(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t parent, /* parent inode (directory) */
umode_t mode, /* mode bits for new inode */
- int okalloc, /* ok to allocate more space */
struct xfs_buf **agbp, /* buf for a.g. inode header */
xfs_ino_t *inop); /* inode number allocated */
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 89bf16b4d937..b0f31791c7e6 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -632,8 +632,6 @@ xfs_iext_insert(
struct xfs_iext_leaf *new = NULL;
int nr_entries, i;
- trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
-
if (ifp->if_height == 0)
xfs_iext_alloc_root(ifp, cur);
else if (ifp->if_height == 1)
@@ -661,6 +659,8 @@ xfs_iext_insert(
xfs_iext_set(cur_rec(cur), irec);
ifp->if_bytes += sizeof(struct xfs_iext_rec);
+ trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
+
if (new)
xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2);
}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 6b7989038d75..b9c0bf80669c 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -32,6 +32,8 @@
#include "xfs_ialloc.h"
#include "xfs_dir2.h"
+#include <linux/iversion.h>
+
/*
* Check that none of the inode's in the buffer have a next
* unlinked field of 0.
@@ -264,7 +266,8 @@ xfs_inode_from_disk(
to->di_flags = be16_to_cpu(from->di_flags);
if (to->di_version == 3) {
- inode->i_version = be64_to_cpu(from->di_changecount);
+ inode_set_iversion_queried(inode,
+ be64_to_cpu(from->di_changecount));
to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
to->di_flags2 = be64_to_cpu(from->di_flags2);
@@ -314,7 +317,7 @@ xfs_inode_to_disk(
to->di_flags = cpu_to_be16(from->di_flags);
if (from->di_version == 3) {
- to->di_changecount = cpu_to_be64(inode->i_version);
+ to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
to->di_flags2 = cpu_to_be64(from->di_flags2);
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 585b35d34142..c40d26763075 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1488,27 +1488,12 @@ __xfs_refcount_cow_alloc(
xfs_extlen_t aglen,
struct xfs_defer_ops *dfops)
{
- int error;
-
trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno,
agbno, aglen);
/* Add refcount btree reservation */
- error = xfs_refcount_adjust_cow(rcur, agbno, aglen,
+ return xfs_refcount_adjust_cow(rcur, agbno, aglen,
XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops);
- if (error)
- return error;
-
- /* Add rmap entry */
- if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) {
- error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops,
- rcur->bc_private.a.agno,
- agbno, aglen, XFS_RMAP_OWN_COW);
- if (error)
- return error;
- }
-
- return error;
}
/*
@@ -1521,27 +1506,12 @@ __xfs_refcount_cow_free(
xfs_extlen_t aglen,
struct xfs_defer_ops *dfops)
{
- int error;
-
trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno,
agbno, aglen);
/* Remove refcount btree reservation */
- error = xfs_refcount_adjust_cow(rcur, agbno, aglen,
+ return xfs_refcount_adjust_cow(rcur, agbno, aglen,
XFS_REFCOUNT_ADJUST_COW_FREE, dfops);
- if (error)
- return error;
-
- /* Remove rmap entry */
- if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) {
- error = xfs_rmap_free_extent(rcur->bc_mp, dfops,
- rcur->bc_private.a.agno,
- agbno, aglen, XFS_RMAP_OWN_COW);
- if (error)
- return error;
- }
-
- return error;
}
/* Record a CoW staging extent in the refcount btree. */
@@ -1552,11 +1522,19 @@ xfs_refcount_alloc_cow_extent(
xfs_fsblock_t fsb,
xfs_extlen_t len)
{
+ int error;
+
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return 0;
- return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW,
+ error = __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW,
fsb, len);
+ if (error)
+ return error;
+
+ /* Add rmap entry */
+ return xfs_rmap_alloc_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb),
+ XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
}
/* Forget a CoW staging event in the refcount btree. */
@@ -1567,9 +1545,17 @@ xfs_refcount_free_cow_extent(
xfs_fsblock_t fsb,
xfs_extlen_t len)
{
+ int error;
+
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return 0;
+ /* Remove rmap entry */
+ error = xfs_rmap_free_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb),
+ XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
+ if (error)
+ return error;
+
return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW,
fsb, len);
}
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index dd019cee1b3b..50db920ceeeb 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -368,6 +368,51 @@ xfs_rmap_lookup_le_range(
}
/*
+ * Perform all the relevant owner checks for a removal op. If we're doing an
+ * unknown-owner removal then we have no owner information to check.
+ */
+static int
+xfs_rmap_free_check_owner(
+ struct xfs_mount *mp,
+ uint64_t ltoff,
+ struct xfs_rmap_irec *rec,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ int error = 0;
+
+ if (owner == XFS_RMAP_OWN_UNKNOWN)
+ return 0;
+
+ /* Make sure the unwritten flag matches. */
+ XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
+ (rec->rm_flags & XFS_RMAP_UNWRITTEN), out);
+
+ /* Make sure the owner matches what we expect to find in the tree. */
+ XFS_WANT_CORRUPTED_GOTO(mp, owner == rec->rm_owner, out);
+
+ /* Check the offset, if necessary. */
+ if (XFS_RMAP_NON_INODE_OWNER(owner))
+ goto out;
+
+ if (flags & XFS_RMAP_BMBT_BLOCK) {
+ XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_flags & XFS_RMAP_BMBT_BLOCK,
+ out);
+ } else {
+ XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_offset <= offset, out);
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ ltoff + rec->rm_blockcount >= offset + len,
+ out);
+ }
+
+out:
+ return error;
+}
+
+/*
* Find the extent in the rmap btree and remove it.
*
* The record we find should always be an exact match for the extent that we're
@@ -444,33 +489,40 @@ xfs_rmap_unmap(
goto out_done;
}
- /* Make sure the unwritten flag matches. */
- XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
- (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error);
+ /*
+ * If we're doing an unknown-owner removal for EFI recovery, we expect
+ * to find the full range in the rmapbt or nothing at all. If we
+ * don't find any rmaps overlapping either end of the range, we're
+ * done. Hopefully this means that the EFI creator already queued
+ * (and finished) a RUI to remove the rmap.
+ */
+ if (owner == XFS_RMAP_OWN_UNKNOWN &&
+ ltrec.rm_startblock + ltrec.rm_blockcount <= bno) {
+ struct xfs_rmap_irec rtrec;
+
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto out_error;
+ if (i == 0)
+ goto out_done;
+ error = xfs_rmap_get_rec(cur, &rtrec, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (rtrec.rm_startblock >= bno + len)
+ goto out_done;
+ }
/* Make sure the extent we found covers the entire freeing range. */
XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno &&
- ltrec.rm_startblock + ltrec.rm_blockcount >=
- bno + len, out_error);
+ ltrec.rm_startblock + ltrec.rm_blockcount >=
+ bno + len, out_error);
- /* Make sure the owner matches what we expect to find in the tree. */
- XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner ||
- XFS_RMAP_NON_INODE_OWNER(owner), out_error);
-
- /* Check the offset, if necessary. */
- if (!XFS_RMAP_NON_INODE_OWNER(owner)) {
- if (flags & XFS_RMAP_BMBT_BLOCK) {
- XFS_WANT_CORRUPTED_GOTO(mp,
- ltrec.rm_flags & XFS_RMAP_BMBT_BLOCK,
- out_error);
- } else {
- XFS_WANT_CORRUPTED_GOTO(mp,
- ltrec.rm_offset <= offset, out_error);
- XFS_WANT_CORRUPTED_GOTO(mp,
- ltoff + ltrec.rm_blockcount >= offset + len,
- out_error);
- }
- }
+ /* Check owner information. */
+ error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, bno, len, owner,
+ offset, flags);
+ if (error)
+ goto out_error;
if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
/* exact match, simply remove the record from rmap tree */
@@ -664,6 +716,7 @@ xfs_rmap_map(
flags |= XFS_RMAP_UNWRITTEN;
trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
unwritten, oinfo);
+ ASSERT(!xfs_rmap_should_skip_owner_update(oinfo));
/*
* For the initial lookup, look for an exact match or the left-adjacent
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 466ede637080..0fcd5b1ba729 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -61,7 +61,21 @@ static inline void
xfs_rmap_skip_owner_update(
struct xfs_owner_info *oi)
{
- oi->oi_owner = XFS_RMAP_OWN_UNKNOWN;
+ xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_NULL);
+}
+
+static inline bool
+xfs_rmap_should_skip_owner_update(
+ struct xfs_owner_info *oi)
+{
+ return oi->oi_owner == XFS_RMAP_OWN_NULL;
+}
+
+static inline void
+xfs_rmap_any_owner_update(
+ struct xfs_owner_info *oi)
+{
+ xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_UNKNOWN);
}
/* Reverse mapping functions. */
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 637b7a892313..f120fb20452f 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -318,8 +318,20 @@ xfs_scrub_dinode(
/* di_mode */
mode = be16_to_cpu(dip->di_mode);
- if (mode & ~(S_IALLUGO | S_IFMT))
+ switch (mode & S_IFMT) {
+ case S_IFLNK:
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ /* mode is recognized */
+ break;
+ default:
xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ }
/* v1/v2 fields */
switch (dip->di_version) {
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 8e58ba842946..3d9037eceaf1 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -107,7 +107,7 @@ xfs_scrub_quota_item(
unsigned long long rcount;
xfs_ino_t fs_icount;
- offset = id * qi->qi_dqperchunk;
+ offset = id / qi->qi_dqperchunk;
/*
* We fed $id and DQNEXT into the xfs_qm_dqget call, which means
@@ -207,7 +207,7 @@ xfs_scrub_quota(
xfs_dqid_t id = 0;
uint dqtype;
int nimaps;
- int error;
+ int error = 0;
if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
return -ENOENT;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 9c42c4efd01e..ab3aef2ae823 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -46,7 +46,6 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
-#include "scrub/scrub.h"
#include "scrub/btree.h"
/*
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 472080e75788..86daed0e3a45 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -26,7 +26,6 @@
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_da_format.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a3eeaba156c5..4fc526a27a94 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -399,7 +399,7 @@ xfs_map_blocks(
(ip->i_df.if_flags & XFS_IFEXTENTS));
ASSERT(offset <= mp->m_super->s_maxbytes);
- if (offset + count > mp->m_super->s_maxbytes)
+ if (offset > mp->m_super->s_maxbytes - count)
count = mp->m_super->s_maxbytes - offset;
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -896,13 +896,13 @@ xfs_writepage_map(
struct writeback_control *wbc,
struct inode *inode,
struct page *page,
- loff_t offset,
- uint64_t end_offset)
+ uint64_t end_offset)
{
LIST_HEAD(submit_list);
struct xfs_ioend *ioend, *next;
struct buffer_head *bh, *head;
ssize_t len = i_blocksize(inode);
+ uint64_t offset;
int error = 0;
int count = 0;
int uptodate = 1;
@@ -1146,7 +1146,7 @@ xfs_do_writepage(
end_offset = offset;
}
- return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
+ return xfs_writepage_map(wpc, wbc, inode, page, end_offset);
redirty:
redirty_page_for_writepage(wbc, page);
@@ -1265,7 +1265,7 @@ xfs_map_trim_size(
if (mapping_size > size)
mapping_size = size;
if (offset < i_size_read(inode) &&
- offset + mapping_size >= i_size_read(inode)) {
+ (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) {
/* limit mapping to block that spans EOF */
mapping_size = roundup_64(i_size_read(inode) - offset,
i_blocksize(inode));
@@ -1312,7 +1312,7 @@ xfs_get_blocks(
lockmode = xfs_ilock_data_map_shared(ip);
ASSERT(offset <= mp->m_super->s_maxbytes);
- if (offset + size > mp->m_super->s_maxbytes)
+ if (offset > mp->m_super->s_maxbytes - size)
size = mp->m_super->s_maxbytes - offset;
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index dd136f7275e4..e5fb008d75e8 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -389,7 +389,8 @@ xfs_bud_init(
int
xfs_bui_recover(
struct xfs_mount *mp,
- struct xfs_bui_log_item *buip)
+ struct xfs_bui_log_item *buip,
+ struct xfs_defer_ops *dfops)
{
int error = 0;
unsigned int bui_type;
@@ -404,9 +405,7 @@ xfs_bui_recover(
xfs_exntst_t state;
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
- struct xfs_defer_ops dfops;
struct xfs_bmbt_irec irec;
- xfs_fsblock_t firstfsb;
ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
@@ -464,7 +463,6 @@ xfs_bui_recover(
if (VFS_I(ip)->i_nlink == 0)
xfs_iflags_set(ip, XFS_IRECOVERY);
- xfs_defer_init(&dfops, &firstfsb);
/* Process deferred bmap item. */
state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
@@ -479,16 +477,16 @@ xfs_bui_recover(
break;
default:
error = -EFSCORRUPTED;
- goto err_dfops;
+ goto err_inode;
}
xfs_trans_ijoin(tp, ip, 0);
count = bmap->me_len;
- error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
+ error = xfs_trans_log_finish_bmap_update(tp, budp, dfops, type,
ip, whichfork, bmap->me_startoff,
bmap->me_startblock, &count, state);
if (error)
- goto err_dfops;
+ goto err_inode;
if (count > 0) {
ASSERT(type == XFS_BMAP_UNMAP);
@@ -496,16 +494,11 @@ xfs_bui_recover(
irec.br_blockcount = count;
irec.br_startoff = bmap->me_startoff;
irec.br_state = state;
- error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
+ error = xfs_bmap_unmap_extent(tp->t_mountp, dfops, ip, &irec);
if (error)
- goto err_dfops;
+ goto err_inode;
}
- /* Finish transaction, free inodes. */
- error = xfs_defer_finish(&tp, &dfops);
- if (error)
- goto err_dfops;
-
set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -513,8 +506,6 @@ xfs_bui_recover(
return error;
-err_dfops:
- xfs_defer_cancel(&dfops);
err_inode:
xfs_trans_cancel(tp);
if (ip) {
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index c867daae4a3c..24b354a2c836 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -93,6 +93,7 @@ struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *,
struct xfs_bui_log_item *);
void xfs_bui_item_free(struct xfs_bui_log_item *);
void xfs_bui_release(struct xfs_bui_log_item *);
-int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip);
+int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip,
+ struct xfs_defer_ops *dfops);
#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4db6e8d780f6..4c6e86d861fd 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1815,22 +1815,27 @@ xfs_alloc_buftarg(
btp->bt_daxdev = dax_dev;
if (xfs_setsize_buftarg_early(btp, bdev))
- goto error;
+ goto error_free;
if (list_lru_init(&btp->bt_lru))
- goto error;
+ goto error_free;
if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
- goto error;
+ goto error_lru;
btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
- register_shrinker(&btp->bt_shrinker);
+ if (register_shrinker(&btp->bt_shrinker))
+ goto error_pcpu;
return btp;
-error:
+error_pcpu:
+ percpu_counter_destroy(&btp->bt_io_count);
+error_lru:
+ list_lru_destroy(&btp->bt_lru);
+error_free:
kmem_free(btp);
return NULL;
}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index d57c2db64e59..f248708c10ff 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -970,14 +970,22 @@ xfs_qm_dqflush_done(
* holding the lock before removing the dquot from the AIL.
*/
if ((lip->li_flags & XFS_LI_IN_AIL) &&
- lip->li_lsn == qip->qli_flush_lsn) {
+ ((lip->li_lsn == qip->qli_flush_lsn) ||
+ (lip->li_flags & XFS_LI_FAILED))) {
/* xfs_trans_ail_delete() drops the AIL lock. */
spin_lock(&ailp->xa_lock);
- if (lip->li_lsn == qip->qli_flush_lsn)
+ if (lip->li_lsn == qip->qli_flush_lsn) {
xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
- else
+ } else {
+ /*
+ * Clear the failed state since we are about to drop the
+ * flush lock
+ */
+ if (lip->li_flags & XFS_LI_FAILED)
+ xfs_clear_li_failed(lip);
spin_unlock(&ailp->xa_lock);
+ }
}
/*
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 2c7a1629e064..664dea105e76 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -137,6 +137,26 @@ xfs_qm_dqunpin_wait(
wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
}
+/*
+ * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
+ * have been failed during writeback
+ *
+ * this informs the AIL that the dquot is already flush locked on the next push,
+ * and acquires a hold on the buffer to ensure that it isn't reclaimed before
+ * dirty data makes it to disk.
+ */
+STATIC void
+xfs_dquot_item_error(
+ struct xfs_log_item *lip,
+ struct xfs_buf *bp)
+{
+ struct xfs_dquot *dqp;
+
+ dqp = DQUOT_ITEM(lip)->qli_dquot;
+ ASSERT(!completion_done(&dqp->q_flush));
+ xfs_set_li_failed(lip, bp);
+}
+
STATIC uint
xfs_qm_dquot_logitem_push(
struct xfs_log_item *lip,
@@ -144,13 +164,28 @@ xfs_qm_dquot_logitem_push(
__acquires(&lip->li_ailp->xa_lock)
{
struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
- struct xfs_buf *bp = NULL;
+ struct xfs_buf *bp = lip->li_buf;
uint rval = XFS_ITEM_SUCCESS;
int error;
if (atomic_read(&dqp->q_pincount) > 0)
return XFS_ITEM_PINNED;
+ /*
+ * The buffer containing this item failed to be written back
+ * previously. Resubmit the buffer for IO
+ */
+ if (lip->li_flags & XFS_LI_FAILED) {
+ if (!xfs_buf_trylock(bp))
+ return XFS_ITEM_LOCKED;
+
+ if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+
+ xfs_buf_unlock(bp);
+ return rval;
+ }
+
if (!xfs_dqlock_nowait(dqp))
return XFS_ITEM_LOCKED;
@@ -242,7 +277,8 @@ static const struct xfs_item_ops xfs_dquot_item_ops = {
.iop_unlock = xfs_qm_dquot_logitem_unlock,
.iop_committed = xfs_qm_dquot_logitem_committed,
.iop_push = xfs_qm_dquot_logitem_push,
- .iop_committing = xfs_qm_dquot_logitem_committing
+ .iop_committing = xfs_qm_dquot_logitem_committing,
+ .iop_error = xfs_dquot_item_error
};
/*
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 44f8c5451210..64da90655e95 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -538,7 +538,7 @@ xfs_efi_recover(
return error;
efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
- xfs_rmap_skip_owner_update(&oinfo);
+ xfs_rmap_any_owner_update(&oinfo);
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
extp = &efip->efi_format.efi_extents[i];
error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 8f22fc579dbb..60a2e128cb6a 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -571,6 +571,11 @@ xfs_growfs_data_private(
* this doesn't actually exist in the rmap btree.
*/
xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
+ error = xfs_rmap_free(tp, bp, agno,
+ be32_to_cpu(agf->agf_length) - new,
+ new, &oinfo);
+ if (error)
+ goto error0;
error = xfs_free_extent(tp,
XFS_AGB_TO_FSB(mp, agno,
be32_to_cpu(agf->agf_length) - new),
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 43005fbe8b1e..3bcb8fd2a826 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -37,6 +37,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/iversion.h>
/*
* Allocate and initialise an xfs_inode.
@@ -293,14 +294,14 @@ xfs_reinit_inode(
int error;
uint32_t nlink = inode->i_nlink;
uint32_t generation = inode->i_generation;
- uint64_t version = inode->i_version;
+ uint64_t version = inode_peek_iversion(inode);
umode_t mode = inode->i_mode;
error = inode_init_always(mp->m_super, inode);
set_nlink(inode, nlink);
inode->i_generation = generation;
- inode->i_version = version;
+ inode_set_iversion_queried(inode, version);
inode->i_mode = mode;
return error;
}
@@ -870,7 +871,7 @@ xfs_eofblocks_worker(
* based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
* (We'll just piggyback on the post-EOF prealloc space workqueue.)
*/
-STATIC void
+void
xfs_queue_cowblocks(
struct xfs_mount *mp)
{
@@ -1536,8 +1537,23 @@ xfs_inode_free_quota_eofblocks(
return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
}
+static inline unsigned long
+xfs_iflag_for_tag(
+ int tag)
+{
+ switch (tag) {
+ case XFS_ICI_EOFBLOCKS_TAG:
+ return XFS_IEOFBLOCKS;
+ case XFS_ICI_COWBLOCKS_TAG:
+ return XFS_ICOWBLOCKS;
+ default:
+ ASSERT(0);
+ return 0;
+ }
+}
+
static void
-__xfs_inode_set_eofblocks_tag(
+__xfs_inode_set_blocks_tag(
xfs_inode_t *ip,
void (*execute)(struct xfs_mount *mp),
void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
@@ -1552,10 +1568,10 @@ __xfs_inode_set_eofblocks_tag(
* Don't bother locking the AG and looking up in the radix trees
* if we already know that we have the tag set.
*/
- if (ip->i_flags & XFS_IEOFBLOCKS)
+ if (ip->i_flags & xfs_iflag_for_tag(tag))
return;
spin_lock(&ip->i_flags_lock);
- ip->i_flags |= XFS_IEOFBLOCKS;
+ ip->i_flags |= xfs_iflag_for_tag(tag);
spin_unlock(&ip->i_flags_lock);
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
@@ -1587,13 +1603,13 @@ xfs_inode_set_eofblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_set_eofblocks_tag(ip);
- return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks,
+ return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks,
trace_xfs_perag_set_eofblocks,
XFS_ICI_EOFBLOCKS_TAG);
}
static void
-__xfs_inode_clear_eofblocks_tag(
+__xfs_inode_clear_blocks_tag(
xfs_inode_t *ip,
void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
int error, unsigned long caller_ip),
@@ -1603,7 +1619,7 @@ __xfs_inode_clear_eofblocks_tag(
struct xfs_perag *pag;
spin_lock(&ip->i_flags_lock);
- ip->i_flags &= ~XFS_IEOFBLOCKS;
+ ip->i_flags &= ~xfs_iflag_for_tag(tag);
spin_unlock(&ip->i_flags_lock);
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
@@ -1630,7 +1646,7 @@ xfs_inode_clear_eofblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_clear_eofblocks_tag(ip);
- return __xfs_inode_clear_eofblocks_tag(ip,
+ return __xfs_inode_clear_blocks_tag(ip,
trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
}
@@ -1724,7 +1740,7 @@ xfs_inode_set_cowblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_set_cowblocks_tag(ip);
- return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks,
+ return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks,
trace_xfs_perag_set_cowblocks,
XFS_ICI_COWBLOCKS_TAG);
}
@@ -1734,6 +1750,6 @@ xfs_inode_clear_cowblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_clear_cowblocks_tag(ip);
- return __xfs_inode_clear_eofblocks_tag(ip,
+ return __xfs_inode_clear_blocks_tag(ip,
trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index bff4d85e5498..d4a77588eca1 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -81,6 +81,7 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *);
int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip);
void xfs_cowblocks_worker(struct work_struct *);
+void xfs_queue_cowblocks(struct xfs_mount *);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags, void *args),
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 61d1cb7dc10d..9f424e0aef1f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -16,6 +16,7 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <linux/log2.h>
+#include <linux/iversion.h>
#include "xfs.h"
#include "xfs_fs.h"
@@ -749,7 +750,6 @@ xfs_ialloc(
xfs_nlink_t nlink,
dev_t rdev,
prid_t prid,
- int okalloc,
xfs_buf_t **ialloc_context,
xfs_inode_t **ipp)
{
@@ -765,7 +765,7 @@ xfs_ialloc(
* Call the space management code to pick
* the on-disk inode to be allocated.
*/
- error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
+ error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode,
ialloc_context, &ino);
if (error)
return error;
@@ -833,7 +833,7 @@ xfs_ialloc(
ip->i_d.di_flags = 0;
if (ip->i_d.di_version == 3) {
- inode->i_version = 1;
+ inode_set_iversion(inode, 1);
ip->i_d.di_flags2 = 0;
ip->i_d.di_cowextsize = 0;
ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
@@ -957,7 +957,6 @@ xfs_dir_ialloc(
xfs_nlink_t nlink,
dev_t rdev,
prid_t prid, /* project id */
- int okalloc, /* ok to allocate new space */
xfs_inode_t **ipp, /* pointer to inode; it will be
locked. */
int *committed)
@@ -988,8 +987,8 @@ xfs_dir_ialloc(
* transaction commit so that no other process can steal
* the inode(s) that we've just allocated.
*/
- code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
- &ialloc_context, &ip);
+ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context,
+ &ip);
/*
* Return an error if we were unable to allocate a new inode.
@@ -1061,7 +1060,7 @@ xfs_dir_ialloc(
* this call should always succeed.
*/
code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
- okalloc, &ialloc_context, &ip);
+ &ialloc_context, &ip);
/*
* If we get an error at this point, return to the caller
@@ -1182,11 +1181,6 @@ xfs_create(
xfs_flush_inodes(mp);
error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
}
- if (error == -ENOSPC) {
- /* No space at all so try a "no-allocation" reservation */
- resblks = 0;
- error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
- }
if (error)
goto out_release_inode;
@@ -1203,19 +1197,13 @@ xfs_create(
if (error)
goto out_trans_cancel;
- if (!resblks) {
- error = xfs_dir_canenter(tp, dp, name);
- if (error)
- goto out_trans_cancel;
- }
-
/*
* A newly created regular or special file just has one directory
* entry pointing to them, but a directory also the "." entry
* pointing to itself.
*/
- error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
- prid, resblks > 0, &ip, NULL);
+ error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip,
+ NULL);
if (error)
goto out_trans_cancel;
@@ -1340,11 +1328,6 @@ xfs_create_tmpfile(
tres = &M_RES(mp)->tr_create_tmpfile;
error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
- if (error == -ENOSPC) {
- /* No space at all so try a "no-allocation" reservation */
- resblks = 0;
- error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
- }
if (error)
goto out_release_inode;
@@ -1353,8 +1336,7 @@ xfs_create_tmpfile(
if (error)
goto out_trans_cancel;
- error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
- prid, resblks > 0, &ip, NULL);
+ error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip, NULL);
if (error)
goto out_trans_cancel;
@@ -1506,6 +1488,24 @@ xfs_link(
return error;
}
+/*
+ * After a truncate, drop reflink state that is no longer needed: the
+ * cowblocks tag goes once the CoW fork is empty, and the on-disk reflink
+ * flag goes once the data fork is empty as well.
+ */
+static void
+xfs_itruncate_clear_reflink_flags(
+	struct xfs_inode	*ip)
+{
+	struct xfs_ifork	*data_ifp;
+	struct xfs_ifork	*cow_ifp;
+
+	if (!xfs_is_reflink_inode(ip))
+		return;
+
+	data_ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+
+	/* Anything still in the CoW fork means neither can be cleared. */
+	if (cow_ifp->if_bytes != 0)
+		return;
+
+	if (data_ifp->if_bytes == 0)
+		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+	xfs_inode_clear_cowblocks_tag(ip);
+}
+
/*
* Free up the underlying blocks past new_size. The new size must be smaller
* than the current size. This routine can be used both for the attribute and
@@ -1602,15 +1602,7 @@ xfs_itruncate_extents(
if (error)
goto out;
- /*
- * Clear the reflink flag if there are no data fork blocks and
- * there are no extents staged in the cow fork.
- */
- if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) {
- if (ip->i_d.di_nblocks == 0)
- ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
- xfs_inode_clear_cowblocks_tag(ip);
- }
+ xfs_itruncate_clear_reflink_flags(ip);
/*
* Always re-log the inode so that our permanent transaction can keep
@@ -2401,6 +2393,24 @@ retry:
}
/*
+ * Release the in-core buffer of a local-format fork before the fork is
+ * reset to extents format.  Forks in any other format are left alone.
+ */
+static inline void
+xfs_ifree_local_data(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+
+		/* Resize by minus the fork's current byte count. */
+		xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+	}
+}
+
+/*
* This is called to return an inode to the inode free list.
* The inode should already be truncated to 0 length and have
* no pages associated with it. This routine also assumes that
@@ -2437,6 +2447,9 @@ xfs_ifree(
if (error)
return error;
+ xfs_ifree_local_data(ip, XFS_DATA_FORK);
+ xfs_ifree_local_data(ip, XFS_ATTR_FORK);
+
VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
ip->i_d.di_flags = 0;
ip->i_d.di_dmevmask = 0;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index cc13c3763721..d383e392ec9d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -232,6 +232,7 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
* log recovery to replay a bmap operation on the inode.
*/
#define XFS_IRECOVERY (1 << 11)
+#define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */
/*
* Per-lifetime flags need to be reset when re-using a reclaimable inode during
@@ -428,7 +429,7 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip);
int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
- xfs_nlink_t, dev_t, prid_t, int,
+ xfs_nlink_t, dev_t, prid_t,
struct xfs_inode **, int *);
/* from xfs_file.c */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 6ee5c3bf19ad..7571abf5dfb3 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -30,6 +30,7 @@
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include <linux/iversion.h>
kmem_zone_t *xfs_ili_zone; /* inode log item zone */
@@ -354,7 +355,7 @@ xfs_inode_to_log_dinode(
to->di_next_unlinked = NULLAGINO;
if (from->di_version == 3) {
- to->di_changecount = inode->i_version;
+ to->di_changecount = inode_peek_iversion(inode);
to->di_crtime.t_sec = from->di_crtime.t_sec;
to->di_crtime.t_nsec = from->di_crtime.t_nsec;
to->di_flags2 = from->di_flags2;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 33eb4fb2e3fd..66e1edbfb2b2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1006,7 +1006,7 @@ xfs_file_iomap_begin(
}
ASSERT(offset <= mp->m_super->s_maxbytes);
- if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
+ if (offset > mp->m_super->s_maxbytes - length)
length = mp->m_super->s_maxbytes - offset;
offset_fsb = XFS_B_TO_FSBT(mp, offset);
end_fsb = XFS_B_TO_FSB(mp, offset + length);
@@ -1213,7 +1213,7 @@ xfs_xattr_iomap_begin(
ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
- &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK);
+ &nimaps, XFS_BMAPI_ATTRFORK);
out_unlock:
xfs_iunlock(ip, lockmode);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 38d4227895ae..a503af96d780 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -781,17 +781,17 @@ xfs_log_mount_finish(
* something to an unlinked inode, the irele won't cause
* premature truncation and freeing of the inode, which results
* in log recovery failure. We have to evict the unreferenced
- * lru inodes after clearing MS_ACTIVE because we don't
+ * lru inodes after clearing SB_ACTIVE because we don't
* otherwise clean up the lru if there's a subsequent failure in
* xfs_mountfs, which leads to us leaking the inodes if nothing
* else (e.g. quotacheck) references the inodes before the
* mount failure occurs.
*/
- mp->m_super->s_flags |= MS_ACTIVE;
+ mp->m_super->s_flags |= SB_ACTIVE;
error = xlog_recover_finish(mp->m_log);
if (!error)
xfs_log_work_queue(mp);
- mp->m_super->s_flags &= ~MS_ACTIVE;
+ mp->m_super->s_flags &= ~SB_ACTIVE;
evict_inodes(mp->m_super);
/*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 87b1c331f9eb..28d1abfe835e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -24,6 +24,7 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
@@ -4716,7 +4717,8 @@ STATIC int
xlog_recover_process_cui(
struct xfs_mount *mp,
struct xfs_ail *ailp,
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip,
+ struct xfs_defer_ops *dfops)
{
struct xfs_cui_log_item *cuip;
int error;
@@ -4729,7 +4731,7 @@ xlog_recover_process_cui(
return 0;
spin_unlock(&ailp->xa_lock);
- error = xfs_cui_recover(mp, cuip);
+ error = xfs_cui_recover(mp, cuip, dfops);
spin_lock(&ailp->xa_lock);
return error;
@@ -4756,7 +4758,8 @@ STATIC int
xlog_recover_process_bui(
struct xfs_mount *mp,
struct xfs_ail *ailp,
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip,
+ struct xfs_defer_ops *dfops)
{
struct xfs_bui_log_item *buip;
int error;
@@ -4769,7 +4772,7 @@ xlog_recover_process_bui(
return 0;
spin_unlock(&ailp->xa_lock);
- error = xfs_bui_recover(mp, buip);
+ error = xfs_bui_recover(mp, buip, dfops);
spin_lock(&ailp->xa_lock);
return error;
@@ -4805,6 +4808,46 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
}
}
+/*
+ * Take all the collected deferred ops and finish them in order.
+ *
+ * Allocates one itruncate transaction, runs xfs_defer_finish() on @dfops
+ * inside it and commits.  Returns 0 on success, -ENOSPC if the free block
+ * counter is not positive, or the error from the transaction allocation,
+ * from xfs_defer_finish() or from the commit.  The transaction is
+ * cancelled here if finishing the deferred ops fails.
+ */
+static int
+xlog_finish_defer_ops(
+	struct xfs_mount	*mp,
+	struct xfs_defer_ops	*dfops)
+{
+	struct xfs_trans	*tp;
+	int64_t			freeblks;
+	uint			resblks;
+	int			error;
+
+	/*
+	 * We're finishing the defer_ops that accumulated as a result of
+	 * recovering unfinished intent items during log recovery.  We
+	 * reserve an itruncate transaction because it is the largest
+	 * permanent transaction type.  Since we're the only user of the fs
+	 * right now, take 15/16 (about 93%) of the available free blocks,
+	 * clamped to UINT_MAX first.  Scale in 64 bits: an unsigned int
+	 * "* 15" wraps once the clamped count exceeds UINT_MAX / 15 and can
+	 * leave a reservation of zero.  The shift avoids a 64-bit division.
+	 */
+	freeblks = percpu_counter_sum(&mp->m_fdblocks);
+	if (freeblks <= 0)
+		return -ENOSPC;
+	resblks = min_t(int64_t, UINT_MAX, freeblks);
+	resblks = ((uint64_t)resblks * 15) >> 4;
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
+			0, XFS_TRANS_RESERVE, &tp);
+	if (error)
+		return error;
+
+	error = xfs_defer_finish(&tp, dfops);
+	if (error)
+		goto out_cancel;
+
+	return xfs_trans_commit(tp);
+
+out_cancel:
+	xfs_trans_cancel(tp);
+	return error;
+}
+
/*
* When this is called, all of the log intent items which did not have
* corresponding log done items should be in the AIL. What we do now
@@ -4825,10 +4868,12 @@ STATIC int
xlog_recover_process_intents(
struct xlog *log)
{
- struct xfs_log_item *lip;
- int error = 0;
+ struct xfs_defer_ops dfops;
struct xfs_ail_cursor cur;
+ struct xfs_log_item *lip;
struct xfs_ail *ailp;
+ xfs_fsblock_t firstfsb;
+ int error = 0;
#if defined(DEBUG) || defined(XFS_WARN)
xfs_lsn_t last_lsn;
#endif
@@ -4839,6 +4884,7 @@ xlog_recover_process_intents(
#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
#endif
+ xfs_defer_init(&dfops, &firstfsb);
while (lip != NULL) {
/*
* We're done when we see something other than an intent.
@@ -4859,6 +4905,12 @@ xlog_recover_process_intents(
*/
ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
+ /*
+ * NOTE: If your intent processing routine can create more
+ * deferred ops, you /must/ attach them to the dfops in this
+ * routine or else those subsequent intents will get
+ * replayed in the wrong order!
+ */
switch (lip->li_type) {
case XFS_LI_EFI:
error = xlog_recover_process_efi(log->l_mp, ailp, lip);
@@ -4867,10 +4919,12 @@ xlog_recover_process_intents(
error = xlog_recover_process_rui(log->l_mp, ailp, lip);
break;
case XFS_LI_CUI:
- error = xlog_recover_process_cui(log->l_mp, ailp, lip);
+ error = xlog_recover_process_cui(log->l_mp, ailp, lip,
+ &dfops);
break;
case XFS_LI_BUI:
- error = xlog_recover_process_bui(log->l_mp, ailp, lip);
+ error = xlog_recover_process_bui(log->l_mp, ailp, lip,
+ &dfops);
break;
}
if (error)
@@ -4880,6 +4934,11 @@ xlog_recover_process_intents(
out:
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
+ if (error)
+ xfs_defer_cancel(&dfops);
+ else
+ error = xlog_finish_defer_ops(log->l_mp, &dfops);
+
return error;
}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 010a13a201aa..b897b11afb2c 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -48,7 +48,7 @@
STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
-
+STATIC void xfs_qm_destroy_quotainos(xfs_quotainfo_t *qi);
STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp);
/*
* We use the batch lookup interface to iterate over the dquots as it
@@ -695,9 +695,17 @@ xfs_qm_init_quotainfo(
qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
- register_shrinker(&qinf->qi_shrinker);
+
+ error = register_shrinker(&qinf->qi_shrinker);
+ if (error)
+ goto out_free_inos;
+
return 0;
+out_free_inos:
+ mutex_destroy(&qinf->qi_quotaofflock);
+ mutex_destroy(&qinf->qi_tree_lock);
+ xfs_qm_destroy_quotainos(qinf);
out_free_lru:
list_lru_destroy(&qinf->qi_lru);
out_free_qinf:
@@ -706,7 +714,6 @@ out_free_qinf:
return error;
}
-
/*
* Gets called when unmounting a filesystem or when all quotas get
* turned off.
@@ -723,19 +730,8 @@ xfs_qm_destroy_quotainfo(
unregister_shrinker(&qi->qi_shrinker);
list_lru_destroy(&qi->qi_lru);
-
- if (qi->qi_uquotaip) {
- IRELE(qi->qi_uquotaip);
- qi->qi_uquotaip = NULL; /* paranoia */
- }
- if (qi->qi_gquotaip) {
- IRELE(qi->qi_gquotaip);
- qi->qi_gquotaip = NULL;
- }
- if (qi->qi_pquotaip) {
- IRELE(qi->qi_pquotaip);
- qi->qi_pquotaip = NULL;
- }
+ xfs_qm_destroy_quotainos(qi);
+ mutex_destroy(&qi->qi_tree_lock);
mutex_destroy(&qi->qi_quotaofflock);
kmem_free(qi);
mp->m_quotainfo = NULL;
@@ -793,8 +789,8 @@ xfs_qm_qino_alloc(
return error;
if (need_alloc) {
- error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
- &committed);
+ error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip,
+ &committed);
if (error) {
xfs_trans_cancel(tp);
return error;
@@ -1600,6 +1596,24 @@ error_rele:
}
+/*
+ * Release each quota inode (qi_uquotaip, qi_gquotaip, qi_pquotaip) that
+ * @qi still holds and NULL the pointer afterwards.
+ */
STATIC void
+xfs_qm_destroy_quotainos(
+	xfs_quotainfo_t	*qi)
+{
+	if (qi->qi_uquotaip != NULL) {
+		IRELE(qi->qi_uquotaip);
+		qi->qi_uquotaip = NULL;	/* don't leave a stale pointer */
+	}
+	if (qi->qi_gquotaip != NULL) {
+		IRELE(qi->qi_gquotaip);
+		qi->qi_gquotaip = NULL;
+	}
+	if (qi->qi_pquotaip != NULL) {
+		IRELE(qi->qi_pquotaip);
+		qi->qi_pquotaip = NULL;
+	}
+}
+
+STATIC void
xfs_qm_dqfree_one(
struct xfs_dquot *dqp)
{
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 8f2e2fac4255..3a55d6fc271b 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -393,7 +393,8 @@ xfs_cud_init(
int
xfs_cui_recover(
struct xfs_mount *mp,
- struct xfs_cui_log_item *cuip)
+ struct xfs_cui_log_item *cuip,
+ struct xfs_defer_ops *dfops)
{
int i;
int error = 0;
@@ -405,11 +406,9 @@ xfs_cui_recover(
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
enum xfs_refcount_intent_type type;
- xfs_fsblock_t firstfsb;
xfs_fsblock_t new_fsb;
xfs_extlen_t new_len;
struct xfs_bmbt_irec irec;
- struct xfs_defer_ops dfops;
bool requeue_only = false;
ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags));
@@ -465,7 +464,6 @@ xfs_cui_recover(
return error;
cudp = xfs_trans_get_cud(tp, cuip);
- xfs_defer_init(&dfops, &firstfsb);
for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
refc = &cuip->cui_format.cui_extents[i];
refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
@@ -485,7 +483,7 @@ xfs_cui_recover(
new_len = refc->pe_len;
} else
error = xfs_trans_log_finish_refcount_update(tp, cudp,
- &dfops, type, refc->pe_startblock, refc->pe_len,
+ dfops, type, refc->pe_startblock, refc->pe_len,
&new_fsb, &new_len, &rcur);
if (error)
goto abort_error;
@@ -497,21 +495,21 @@ xfs_cui_recover(
switch (type) {
case XFS_REFCOUNT_INCREASE:
error = xfs_refcount_increase_extent(
- tp->t_mountp, &dfops, &irec);
+ tp->t_mountp, dfops, &irec);
break;
case XFS_REFCOUNT_DECREASE:
error = xfs_refcount_decrease_extent(
- tp->t_mountp, &dfops, &irec);
+ tp->t_mountp, dfops, &irec);
break;
case XFS_REFCOUNT_ALLOC_COW:
error = xfs_refcount_alloc_cow_extent(
- tp->t_mountp, &dfops,
+ tp->t_mountp, dfops,
irec.br_startblock,
irec.br_blockcount);
break;
case XFS_REFCOUNT_FREE_COW:
error = xfs_refcount_free_cow_extent(
- tp->t_mountp, &dfops,
+ tp->t_mountp, dfops,
irec.br_startblock,
irec.br_blockcount);
break;
@@ -525,17 +523,12 @@ xfs_cui_recover(
}
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- error = xfs_defer_finish(&tp, &dfops);
- if (error)
- goto abort_defer;
set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
error = xfs_trans_commit(tp);
return error;
abort_error:
xfs_refcount_finish_one_cleanup(tp, rcur, error);
-abort_defer:
- xfs_defer_cancel(&dfops);
xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index 5b74dddfa64b..0e5327349a13 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -96,6 +96,7 @@ struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *,
struct xfs_cui_log_item *);
void xfs_cui_item_free(struct xfs_cui_log_item *);
void xfs_cui_release(struct xfs_cui_log_item *);
-int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip);
+int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip,
+ struct xfs_defer_ops *dfops);
#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index cc041a29eb70..47aea2e82c26 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -49,8 +49,6 @@
#include "xfs_alloc.h"
#include "xfs_quota_defs.h"
#include "xfs_quota.h"
-#include "xfs_btree.h"
-#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
#include "xfs_rmap_btree.h"
@@ -456,6 +454,8 @@ retry:
if (error)
goto out_bmap_cancel;
+ xfs_inode_set_cowblocks_tag(ip);
+
/* Finish up. */
error = xfs_defer_finish(&tp, &dfops);
if (error)
@@ -492,8 +492,9 @@ xfs_reflink_find_cow_mapping(
struct xfs_iext_cursor icur;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
- ASSERT(xfs_is_reflink_inode(ip));
+ if (!xfs_is_reflink_inode(ip))
+ return false;
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got))
return false;
@@ -612,6 +613,9 @@ xfs_reflink_cancel_cow_blocks(
/* Remove the mapping from the CoW fork. */
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
+ } else {
+ /* Didn't do anything, push cursor back. */
+ xfs_iext_prev(ifp, &icur);
}
next_extent:
if (!xfs_iext_get_extent(ifp, &icur, &got))
@@ -727,7 +731,7 @@ xfs_reflink_end_cow(
(unsigned int)(end_fsb - offset_fsb),
XFS_DATA_FORK);
error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
- resblks, 0, 0, &tp);
+ resblks, 0, XFS_TRANS_RESERVE, &tp);
if (error)
goto out;
@@ -1293,6 +1297,17 @@ xfs_reflink_remap_range(
trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+ /*
+ * Clear out post-eof preallocations because we don't have page cache
+ * backing the delayed allocations and they'll never get freed on
+ * their own.
+ */
+ if (xfs_can_free_eofblocks(dest, true)) {
+ ret = xfs_free_eofblocks(dest);
+ if (ret)
+ goto out_unlock;
+ }
+
/* Set flags and remap blocks. */
ret = xfs_reflink_set_inode_flag(src, dest);
if (ret)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f663022353c0..1dacccc367f8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -212,9 +212,9 @@ xfs_parseargs(
*/
if (sb_rdonly(sb))
mp->m_flags |= XFS_MOUNT_RDONLY;
- if (sb->s_flags & MS_DIRSYNC)
+ if (sb->s_flags & SB_DIRSYNC)
mp->m_flags |= XFS_MOUNT_DIRSYNC;
- if (sb->s_flags & MS_SYNCHRONOUS)
+ if (sb->s_flags & SB_SYNCHRONOUS)
mp->m_flags |= XFS_MOUNT_WSYNC;
/*
@@ -1312,7 +1312,7 @@ xfs_fs_remount(
}
/* ro -> rw */
- if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
+ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) {
if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
xfs_warn(mp,
"ro->rw transition prohibited on norecovery mount");
@@ -1360,6 +1360,7 @@ xfs_fs_remount(
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
+ xfs_queue_cowblocks(mp);
/* Create the per-AG metadata reservation pool .*/
error = xfs_fs_reserve_ag_blocks(mp);
@@ -1368,7 +1369,15 @@ xfs_fs_remount(
}
/* rw -> ro */
- if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) {
+ /* Get rid of any leftover CoW reservations... */
+ cancel_delayed_work_sync(&mp->m_cowblocks_work);
+ error = xfs_icache_free_cowblocks(mp, NULL);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
/* Free the per-AG metadata reservation pool. */
error = xfs_fs_unreserve_ag_blocks(mp);
if (error) {
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 5f2f32408011..fcc5dfc70aa0 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -30,7 +30,7 @@ extern void xfs_qm_exit(void);
#ifdef CONFIG_XFS_POSIX_ACL
# define XFS_ACL_STRING "ACLs, "
-# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL)
+# define set_posix_acl_flag(sb) ((sb)->s_flags |= SB_POSIXACL)
#else
# define XFS_ACL_STRING
# define set_posix_acl_flag(sb) do { } while (0)
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 68d3ca2c4968..2e9e793a8f9d 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -232,11 +232,6 @@ xfs_symlink(
resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp);
- if (error == -ENOSPC && fs_blocks == 0) {
- resblks = 0;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0,
- &tp);
- }
if (error)
goto out_release_inode;
@@ -260,14 +255,6 @@ xfs_symlink(
goto out_trans_cancel;
/*
- * Check for ability to enter directory entry, if no space reserved.
- */
- if (!resblks) {
- error = xfs_dir_canenter(tp, dp, link_name);
- if (error)
- goto out_trans_cancel;
- }
- /*
* Initialize the bmap freelist prior to calling either
* bmapi or the directory create code.
*/
@@ -277,7 +264,7 @@ xfs_symlink(
* Allocate an inode for the symlink.
*/
error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
- prid, resblks > 0, &ip, NULL);
+ prid, &ip, NULL);
if (error)
goto out_trans_cancel;
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 5d95fe348294..35f3546b6af5 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -24,7 +24,6 @@
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_da_format.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_da_btree.h"
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index daa7615497f9..4a89da4b6fe7 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -28,6 +28,8 @@
#include "xfs_inode_item.h"
#include "xfs_trace.h"
+#include <linux/iversion.h>
+
/*
* Add a locked inode to the transaction.
*
@@ -110,15 +112,17 @@ xfs_trans_log_inode(
/*
* First time we log the inode in a transaction, bump the inode change
- * counter if it is configured for this to occur. We don't use
- * inode_inc_version() because there is no need for extra locking around
- * i_version as we already hold the inode locked exclusively for
- * metadata modification.
+ * counter if it is configured for this to occur. While we have the
+ * inode locked exclusively for metadata modification, we can usually
+ * avoid setting XFS_ILOG_CORE if no one has queried the value since
+ * the last time it was incremented. If we have XFS_ILOG_CORE already
+ * set however, then go ahead and bump the i_version counter
+ * unconditionally.
*/
if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
IS_I_VERSION(VFS_I(ip))) {
- VFS_I(ip)->i_version++;
- flags |= XFS_ILOG_CORE;
+ if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE))
+ flags |= XFS_ILOG_CORE;
}
tp->t_flags |= XFS_TRANS_DIRTY;