From 94e92a6e772e009e27083a7f42d49a179e5f668c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 1 Oct 2013 16:18:06 -0400
Subject: move taking vfsmount_lock down into prepend_path()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 41000305d716..d888223a5486 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2889,6 +2889,7 @@ static int prepend_path(const struct path *path,
 	char *bptr;
 	int blen;
 
+	br_read_lock(&vfsmount_lock);
 	rcu_read_lock();
 restart:
 	bptr = *buffer;
@@ -2935,6 +2936,7 @@ restart:
 		goto restart;
 	}
 	done_seqretry(&rename_lock, seq);
+	br_read_unlock(&vfsmount_lock);
 
 	if (error >= 0 && bptr == *buffer) {
 		if (--blen < 0)
@@ -2971,9 +2973,7 @@ char *__d_path(const struct path *path,
 	int error;
 
 	prepend(&res, &buflen, "\0", 1);
-	br_read_lock(&vfsmount_lock);
 	error = prepend_path(path, root, &res, &buflen);
-	br_read_unlock(&vfsmount_lock);
 
 	if (error < 0)
 		return ERR_PTR(error);
@@ -2990,9 +2990,7 @@ char *d_absolute_path(const struct path *path,
 	int error;
 
 	prepend(&res, &buflen, "\0", 1);
-	br_read_lock(&vfsmount_lock);
 	error = prepend_path(path, &root, &res, &buflen);
-	br_read_unlock(&vfsmount_lock);
 
 	if (error > 1)
 		error = -EINVAL;
@@ -3067,9 +3065,7 @@ char *d_path(const struct path *path, char *buf, int buflen)
 
 	rcu_read_lock();
 	get_fs_root_rcu(current->fs, &root);
-	br_read_lock(&vfsmount_lock);
 	error = path_with_deleted(path, &root, &res, &buflen);
-	br_read_unlock(&vfsmount_lock);
 	rcu_read_unlock();
 
 	if (error < 0)
@@ -3224,7 +3220,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 	get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
 
 	error = -ENOENT;
-	br_read_lock(&vfsmount_lock);
 	if (!d_unlinked(pwd.dentry)) {
 		unsigned long len;
 		char *cwd = page + PATH_MAX;
@@ -3232,7 +3227,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 
 		prepend(&cwd, &buflen, "\0", 1);
 		error = prepend_path(&pwd, &root, &cwd, &buflen);
-		br_read_unlock(&vfsmount_lock);
 		rcu_read_unlock();
 
 		if (error < 0)
@@ -3253,7 +3247,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 				error = -EFAULT;
 		}
 	} else {
-		br_read_unlock(&vfsmount_lock);
 		rcu_read_unlock();
 	}
 
-- 
cgit v1.2.3


From b70a80e7a133a0c86f2fa078e7c144597c516415 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 1 Oct 2013 16:44:54 +0200
Subject: vfs: introduce d_instantiate_no_diralias()

...which just returns -EBUSY if a directory alias would be created.

This is to be used by fuse mkdir to make sure that a buggy or malicious
userspace filesystem doesn't do anything nasty.  Previously fuse used a
private mutex for this purpose, which can now go away.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/dcache.c            | 27 +++++++++++++++++++++++++++
 fs/fuse/dir.c          | 40 +++++-----------------------------------
 fs/fuse/fuse_i.h       |  3 ---
 fs/fuse/inode.c        |  2 --
 include/linux/dcache.h |  1 +
 5 files changed, 33 insertions(+), 40 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index d888223a5486..c8e83d0d61ac 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1801,6 +1801,33 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
 
 EXPORT_SYMBOL(d_instantiate_unique);
 
+/**
+ * d_instantiate_no_diralias - instantiate a non-aliased dentry
+ * @entry: dentry to complete
+ * @inode: inode to attach to this dentry
+ *
+ * Fill in inode information in the entry.  If a directory alias is found, then
+ * return an error (and drop inode).  Together with d_materialise_unique() this
+ * guarantees that a directory inode may never have more than one alias.
+ */
+int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode)
+{
+	BUG_ON(!hlist_unhashed(&entry->d_alias));
+
+	spin_lock(&inode->i_lock);
+	if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) {
+		spin_unlock(&inode->i_lock);
+		iput(inode);
+		return -EBUSY;
+	}
+	__d_instantiate(entry, inode);
+	spin_unlock(&inode->i_lock);
+	security_d_instantiate(entry, inode);
+
+	return 0;
+}
+EXPORT_SYMBOL(d_instantiate_no_diralias);
+
 struct dentry *d_make_root(struct inode *root_inode)
 {
 	struct dentry *res = NULL;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b7989f2ab4c4..c3eb2c46c8f1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -342,24 +342,6 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 	return err;
 }
 
-static struct dentry *fuse_materialise_dentry(struct dentry *dentry,
-					      struct inode *inode)
-{
-	struct dentry *newent;
-
-	if (inode && S_ISDIR(inode->i_mode)) {
-		struct fuse_conn *fc = get_fuse_conn(inode);
-
-		mutex_lock(&fc->inst_mutex);
-		newent = d_materialise_unique(dentry, inode);
-		mutex_unlock(&fc->inst_mutex);
-	} else {
-		newent = d_materialise_unique(dentry, inode);
-	}
-
-	return newent;
-}
-
 static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 				  unsigned int flags)
 {
@@ -382,7 +364,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	if (inode && get_node_id(inode) == FUSE_ROOT_ID)
 		goto out_iput;
 
-	newent = fuse_materialise_dentry(entry, inode);
+	newent = d_materialise_unique(entry, inode);
 	err = PTR_ERR(newent);
 	if (IS_ERR(newent))
 		goto out_err;
@@ -601,21 +583,9 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 	}
 	kfree(forget);
 
-	if (S_ISDIR(inode->i_mode)) {
-		struct dentry *alias;
-		mutex_lock(&fc->inst_mutex);
-		alias = d_find_alias(inode);
-		if (alias) {
-			/* New directory must have moved since mkdir */
-			mutex_unlock(&fc->inst_mutex);
-			dput(alias);
-			iput(inode);
-			return -EBUSY;
-		}
-		d_instantiate(entry, inode);
-		mutex_unlock(&fc->inst_mutex);
-	} else
-		d_instantiate(entry, inode);
+	err = d_instantiate_no_diralias(entry, inode);
+	if (err)
+		return err;
 
 	fuse_change_entry_timeout(entry, &outarg);
 	fuse_invalidate_attr(dir);
@@ -1284,7 +1254,7 @@ static int fuse_direntplus_link(struct file *file,
 	if (!inode)
 		goto out;
 
-	alias = fuse_materialise_dentry(dentry, inode);
+	alias = d_materialise_unique(dentry, inode);
 	err = PTR_ERR(alias);
 	if (IS_ERR(alias))
 		goto out;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 5b9e6f3b6aef..fe446c222e3f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -374,9 +374,6 @@ struct fuse_conn {
 	/** Lock protecting accessess to  members of this structure */
 	spinlock_t lock;
 
-	/** Mutex protecting against directory alias creation */
-	struct mutex inst_mutex;
-
 	/** Refcount */
 	atomic_t count;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a8ce6dab60a0..1c15613c64f8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -565,7 +565,6 @@ void fuse_conn_init(struct fuse_conn *fc)
 {
 	memset(fc, 0, sizeof(*fc));
 	spin_lock_init(&fc->lock);
-	mutex_init(&fc->inst_mutex);
 	init_rwsem(&fc->killsb);
 	atomic_set(&fc->count, 1);
 	init_waitqueue_head(&fc->waitq);
@@ -596,7 +595,6 @@ void fuse_conn_put(struct fuse_conn *fc)
 	if (atomic_dec_and_test(&fc->count)) {
 		if (fc->destroy_req)
 			fuse_request_free(fc->destroy_req);
-		mutex_destroy(&fc->inst_mutex);
 		fc->release(fc);
 	}
 }
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 59066e0b4ff1..716c3760ee39 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -224,6 +224,7 @@ static inline int dname_external(const struct dentry *dentry)
 extern void d_instantiate(struct dentry *, struct inode *);
 extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
 extern struct dentry * d_materialise_unique(struct dentry *, struct inode *);
+extern int d_instantiate_no_diralias(struct dentry *, struct inode *);
 extern void __d_drop(struct dentry *dentry);
 extern void d_drop(struct dentry *dentry);
 extern void d_delete(struct dentry *);
-- 
cgit v1.2.3


From 42c326082d8a2c91506f951ace638deae1faf083 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 Nov 2013 12:31:16 -0500
Subject: switch shrink_dcache_for_umount() to use of d_walk()

we have too many iterators in fs/dcache.c...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 195 ++++++++++++++++++++++++++----------------------------------
 1 file changed, 85 insertions(+), 110 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index c8e83d0d61ac..eb0978da1bd4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1074,116 +1074,6 @@ void shrink_dcache_sb(struct super_block *sb)
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
-/*
- * destroy a single subtree of dentries for unmount
- * - see the comments on shrink_dcache_for_umount() for a description of the
- *   locking
- */
-static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
-{
-	struct dentry *parent;
-
-	BUG_ON(!IS_ROOT(dentry));
-
-	for (;;) {
-		/* descend to the first leaf in the current subtree */
-		while (!list_empty(&dentry->d_subdirs))
-			dentry = list_entry(dentry->d_subdirs.next,
-					    struct dentry, d_u.d_child);
-
-		/* consume the dentries from this leaf up through its parents
-		 * until we find one with children or run out altogether */
-		do {
-			struct inode *inode;
-
-			/*
-			 * inform the fs that this dentry is about to be
-			 * unhashed and destroyed.
-			 */
-			if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
-			    !d_unhashed(dentry))
-				dentry->d_op->d_prune(dentry);
-
-			dentry_lru_del(dentry);
-			__d_shrink(dentry);
-
-			if (dentry->d_lockref.count != 0) {
-				printk(KERN_ERR
-				       "BUG: Dentry %p{i=%lx,n=%s}"
-				       " still in use (%d)"
-				       " [unmount of %s %s]\n",
-				       dentry,
-				       dentry->d_inode ?
-				       dentry->d_inode->i_ino : 0UL,
-				       dentry->d_name.name,
-				       dentry->d_lockref.count,
-				       dentry->d_sb->s_type->name,
-				       dentry->d_sb->s_id);
-				BUG();
-			}
-
-			if (IS_ROOT(dentry)) {
-				parent = NULL;
-				list_del(&dentry->d_u.d_child);
-			} else {
-				parent = dentry->d_parent;
-				parent->d_lockref.count--;
-				list_del(&dentry->d_u.d_child);
-			}
-
-			inode = dentry->d_inode;
-			if (inode) {
-				dentry->d_inode = NULL;
-				hlist_del_init(&dentry->d_alias);
-				if (dentry->d_op && dentry->d_op->d_iput)
-					dentry->d_op->d_iput(dentry, inode);
-				else
-					iput(inode);
-			}
-
-			d_free(dentry);
-
-			/* finished when we fall off the top of the tree,
-			 * otherwise we ascend to the parent and move to the
-			 * next sibling if there is one */
-			if (!parent)
-				return;
-			dentry = parent;
-		} while (list_empty(&dentry->d_subdirs));
-
-		dentry = list_entry(dentry->d_subdirs.next,
-				    struct dentry, d_u.d_child);
-	}
-}
-
-/*
- * destroy the dentries attached to a superblock on unmounting
- * - we don't need to use dentry->d_lock because:
- *   - the superblock is detached from all mountings and open files, so the
- *     dentry trees will not be rearranged by the VFS
- *   - s_umount is write-locked, so the memory pressure shrinker will ignore
- *     any dentries belonging to this superblock that it comes across
- *   - the filesystem itself is no longer permitted to rearrange the dentries
- *     in this superblock
- */
-void shrink_dcache_for_umount(struct super_block *sb)
-{
-	struct dentry *dentry;
-
-	if (down_read_trylock(&sb->s_umount))
-		BUG();
-
-	dentry = sb->s_root;
-	sb->s_root = NULL;
-	dentry->d_lockref.count--;
-	shrink_dcache_for_umount_subtree(dentry);
-
-	while (!hlist_bl_empty(&sb->s_anon)) {
-		dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
-		shrink_dcache_for_umount_subtree(dentry);
-	}
-}
-
 /*
  * This tries to ascend one level of parenthood, but
  * we can race with renaming, so we need to re-check
@@ -1478,6 +1368,91 @@ void shrink_dcache_parent(struct dentry *parent)
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
 
+static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry)
+{
+	struct select_data *data = _data;
+	enum d_walk_ret ret = D_WALK_CONTINUE;
+
+	if (dentry->d_lockref.count) {
+		dentry_lru_del(dentry);
+		if (likely(!list_empty(&dentry->d_subdirs)))
+			goto out;
+		if (dentry == data->start && dentry->d_lockref.count == 1)
+			goto out;
+		printk(KERN_ERR
+		       "BUG: Dentry %p{i=%lx,n=%s}"
+		       " still in use (%d)"
+		       " [unmount of %s %s]\n",
+		       dentry,
+		       dentry->d_inode ?
+		       dentry->d_inode->i_ino : 0UL,
+		       dentry->d_name.name,
+		       dentry->d_lockref.count,
+		       dentry->d_sb->s_type->name,
+		       dentry->d_sb->s_id);
+		BUG();
+	} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
+		/*
+		 * We can't use d_lru_shrink_move() because we
+		 * need to get the global LRU lock and do the
+		 * LRU accounting.
+		 */
+		if (dentry->d_flags & DCACHE_LRU_LIST)
+			d_lru_del(dentry);
+		d_shrink_add(dentry, &data->dispose);
+		data->found++;
+		ret = D_WALK_NORETRY;
+	}
+out:
+	if (data->found && need_resched())
+		ret = D_WALK_QUIT;
+	return ret;
+}
+
+/*
+ * destroy the dentries attached to a superblock on unmounting
+ */
+void shrink_dcache_for_umount(struct super_block *sb)
+{
+	struct dentry *dentry;
+
+	if (down_read_trylock(&sb->s_umount))
+		BUG();
+
+	dentry = sb->s_root;
+	sb->s_root = NULL;
+	for (;;) {
+		struct select_data data;
+
+		INIT_LIST_HEAD(&data.dispose);
+		data.start = dentry;
+		data.found = 0;
+
+		d_walk(dentry, &data, umount_collect, NULL);
+		if (!data.found)
+			break;
+
+		shrink_dentry_list(&data.dispose);
+		cond_resched();
+	}
+	d_drop(dentry);
+	dput(dentry);
+
+	while (!hlist_bl_empty(&sb->s_anon)) {
+		struct select_data data;
+		dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
+
+		INIT_LIST_HEAD(&data.dispose);
+		data.start = NULL;
+		data.found = 0;
+
+		d_walk(dentry, &data, umount_collect, NULL);
+		if (data.found)
+			shrink_dentry_list(&data.dispose);
+		cond_resched();
+	}
+}
+
 static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry)
 {
 	struct select_data *data = _data;
-- 
cgit v1.2.3


From 48a066e72d970a3e225a9c18690d570c736fc455 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 29 Sep 2013 22:06:07 -0400
Subject: RCU'd vfsmounts

* RCU-delayed freeing of vfsmounts
* vfsmount_lock replaced with a seqlock (mount_lock)
* sequence number from mount_lock is stored in nameidata->m_seq and
used when we exit RCU mode
* new vfsmount flag - MNT_SYNC_UMOUNT.  Set by umount_tree() when its
caller knows that vfsmount will have no surviving references.
* synchronize_rcu() done between unlocking namespace_sem in namespace_unlock()
and doing pending mntput().
* new helper: legitimize_mnt(mnt, seq).  Checks the mount_lock sequence
number against seq, then grabs reference to mnt.  Then it rechecks mount_lock
again to close the race and either returns success or drops the reference it
has acquired.  The subtle point is that in case of MNT_SYNC_UMOUNT we can
simply decrement the refcount and sod off - aforementioned synchronize_rcu()
makes sure that final mntput() won't come until we leave RCU mode.  We need
that, since we don't want to end up with some lazy pathwalk racing with
umount() and stealing the final mntput() from it - caller of umount() may
expect it to return only once the fs is shut down and we don't want to break
that.  In other cases (i.e. with MNT_SYNC_UMOUNT absent) we have to do
full-blown mntput() in case of mount_lock sequence number mismatch happening
just as we'd grabbed the reference, but in those cases we won't be stealing
the final mntput() from anything that would care.
* mntput_no_expire() doesn't lock anything on the fast path now.  Incidentally,
SMP and UP cases are handled the same way - no ifdefs there.
* normal pathname resolution does *not* do any writes to mount_lock.  It does,
of course, bump the refcounts of vfsmount and dentry in the very end, but that's
it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c           |  20 +++++---
 fs/mount.h            |  10 ++--
 fs/namei.c            |  50 ++++++++++---------
 fs/namespace.c        | 135 ++++++++++++++++++++++++++++++++------------------
 include/linux/mount.h |   2 +
 include/linux/namei.h |   2 +-
 6 files changed, 136 insertions(+), 83 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index eb0978da1bd4..aafa2a146434 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2887,24 +2887,28 @@ static int prepend_path(const struct path *path,
 	struct vfsmount *vfsmnt = path->mnt;
 	struct mount *mnt = real_mount(vfsmnt);
 	int error = 0;
-	unsigned seq = 0;
+	unsigned seq, m_seq = 0;
 	char *bptr;
 	int blen;
 
-	br_read_lock(&vfsmount_lock);
 	rcu_read_lock();
+restart_mnt:
+	read_seqbegin_or_lock(&mount_lock, &m_seq);
+	seq = 0;
 restart:
 	bptr = *buffer;
 	blen = *buflen;
+	error = 0;
 	read_seqbegin_or_lock(&rename_lock, &seq);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
 		if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
+			struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
 			/* Global root? */
-			if (mnt_has_parent(mnt)) {
-				dentry = mnt->mnt_mountpoint;
-				mnt = mnt->mnt_parent;
+			if (mnt != parent) {
+				dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
+				mnt = parent;
 				vfsmnt = &mnt->mnt;
 				continue;
 			}
@@ -2938,7 +2942,11 @@ restart:
 		goto restart;
 	}
 	done_seqretry(&rename_lock, seq);
-	br_read_unlock(&vfsmount_lock);
+	if (need_seqretry(&mount_lock, m_seq)) {
+		m_seq = 1;
+		goto restart_mnt;
+	}
+	done_seqretry(&mount_lock, m_seq);
 
 	if (error >= 0 && bptr == *buffer) {
 		if (--blen < 0)
diff --git a/fs/mount.h b/fs/mount.h
index f0866076de6e..d64c594be6c4 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -1,7 +1,6 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/poll.h>
-#include <linux/lglock.h>
 
 struct mnt_namespace {
 	atomic_t		count;
@@ -30,6 +29,7 @@ struct mount {
 	struct mount *mnt_parent;
 	struct dentry *mnt_mountpoint;
 	struct vfsmount mnt;
+	struct rcu_head mnt_rcu;
 #ifdef CONFIG_SMP
 	struct mnt_pcp __percpu *mnt_pcp;
 #else
@@ -80,21 +80,23 @@ static inline int is_mounted(struct vfsmount *mnt)
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
 extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
 
+extern bool legitimize_mnt(struct vfsmount *, unsigned);
+
 static inline void get_mnt_ns(struct mnt_namespace *ns)
 {
 	atomic_inc(&ns->count);
 }
 
-extern struct lglock vfsmount_lock;
+extern seqlock_t mount_lock;
 
 static inline void lock_mount_hash(void)
 {
-	br_write_lock(&vfsmount_lock);
+	write_seqlock(&mount_lock);
 }
 
 static inline void unlock_mount_hash(void)
 {
-	br_write_unlock(&vfsmount_lock);
+	write_sequnlock(&mount_lock);
 }
 
 struct proc_mounts {
diff --git a/fs/namei.c b/fs/namei.c
index 1f844fbfce72..cb0ebae07e52 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -484,14 +484,12 @@ EXPORT_SYMBOL(path_put);
 
 static inline void lock_rcu_walk(void)
 {
-	br_read_lock(&vfsmount_lock);
 	rcu_read_lock();
 }
 
 static inline void unlock_rcu_walk(void)
 {
 	rcu_read_unlock();
-	br_read_unlock(&vfsmount_lock);
 }
 
 /**
@@ -512,26 +510,23 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 
 	/*
-	 * Get a reference to the parent first: we're
-	 * going to make "path_put(nd->path)" valid in
-	 * non-RCU context for "terminate_walk()".
-	 *
-	 * If this doesn't work, return immediately with
-	 * RCU walking still active (and then we will do
-	 * the RCU walk cleanup in terminate_walk()).
+	 * After legitimizing the bastards, terminate_walk()
+	 * will do the right thing for non-RCU mode, and all our
+	 * subsequent exit cases should rcu_read_unlock()
+	 * before returning.  Do vfsmount first; if dentry
+	 * can't be legitimized, just set nd->path.dentry to NULL
+	 * and rely on dput(NULL) being a no-op.
 	 */
-	if (!lockref_get_not_dead(&parent->d_lockref))
+	if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
 		return -ECHILD;
-
-	/*
-	 * After the mntget(), we terminate_walk() will do
-	 * the right thing for non-RCU mode, and all our
-	 * subsequent exit cases should unlock_rcu_walk()
-	 * before returning.
-	 */
-	mntget(nd->path.mnt);
 	nd->flags &= ~LOOKUP_RCU;
 
+	if (!lockref_get_not_dead(&parent->d_lockref)) {
+		nd->path.dentry = NULL;	
+		unlock_rcu_walk();
+		return -ECHILD;
+	}
+
 	/*
 	 * For a negative lookup, the lookup sequence point is the parents
 	 * sequence point, and it only needs to revalidate the parent dentry.
@@ -608,16 +603,21 @@ static int complete_walk(struct nameidata *nd)
 		if (!(nd->flags & LOOKUP_ROOT))
 			nd->root.mnt = NULL;
 
+		if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
+			unlock_rcu_walk();
+			return -ECHILD;
+		}
 		if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
 			unlock_rcu_walk();
+			mntput(nd->path.mnt);
 			return -ECHILD;
 		}
 		if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
 			unlock_rcu_walk();
 			dput(dentry);
+			mntput(nd->path.mnt);
 			return -ECHILD;
 		}
-		mntget(nd->path.mnt);
 		unlock_rcu_walk();
 	}
 
@@ -909,15 +909,15 @@ int follow_up(struct path *path)
 	struct mount *parent;
 	struct dentry *mountpoint;
 
-	br_read_lock(&vfsmount_lock);
+	read_seqlock_excl(&mount_lock);
 	parent = mnt->mnt_parent;
 	if (parent == mnt) {
-		br_read_unlock(&vfsmount_lock);
+		read_sequnlock_excl(&mount_lock);
 		return 0;
 	}
 	mntget(&parent->mnt);
 	mountpoint = dget(mnt->mnt_mountpoint);
-	br_read_unlock(&vfsmount_lock);
+	read_sequnlock_excl(&mount_lock);
 	dput(path->dentry);
 	path->dentry = mountpoint;
 	mntput(path->mnt);
@@ -1048,8 +1048,8 @@ static int follow_managed(struct path *path, unsigned flags)
 
 			/* Something is mounted on this dentry in another
 			 * namespace and/or whatever was mounted there in this
-			 * namespace got unmounted before we managed to get the
-			 * vfsmount_lock */
+			 * namespace got unmounted before lookup_mnt() could
+			 * get it */
 		}
 
 		/* Handle an automount point */
@@ -1864,6 +1864,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 		if (flags & LOOKUP_RCU) {
 			lock_rcu_walk();
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			nd->m_seq = read_seqbegin(&mount_lock);
 		} else {
 			path_get(&nd->path);
 		}
@@ -1872,6 +1873,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 
 	nd->root.mnt = NULL;
 
+	nd->m_seq = read_seqbegin(&mount_lock);
 	if (*name=='/') {
 		if (flags & LOOKUP_RCU) {
 			lock_rcu_walk();
diff --git a/fs/namespace.c b/fs/namespace.c
index 500202ce10db..ac2ce8a766e1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -53,7 +53,7 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  * It should be taken for write in all cases where the vfsmount
  * tree or hash is modified or when a vfsmount structure is modified.
  */
-DEFINE_BRLOCK(vfsmount_lock);
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -547,16 +547,38 @@ static void free_vfsmnt(struct mount *mnt)
 	kmem_cache_free(mnt_cache, mnt);
 }
 
+/* call under rcu_read_lock */
+bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+{
+	struct mount *mnt;
+	if (read_seqretry(&mount_lock, seq))
+		return false;
+	if (bastard == NULL)
+		return true;
+	mnt = real_mount(bastard);
+	mnt_add_count(mnt, 1);
+	if (likely(!read_seqretry(&mount_lock, seq)))
+		return true;
+	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
+		mnt_add_count(mnt, -1);
+		return false;
+	}
+	rcu_read_unlock();
+	mntput(bastard);
+	rcu_read_lock();
+	return false;
+}
+
 /*
  * find the first mount at @dentry on vfsmount @mnt.
- * vfsmount_lock must be held for read or write.
+ * call under rcu_read_lock()
  */
 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct list_head *head = mount_hashtable + hash(mnt, dentry);
 	struct mount *p;
 
-	list_for_each_entry(p, head, mnt_hash)
+	list_for_each_entry_rcu(p, head, mnt_hash)
 		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
 			return p;
 	return NULL;
@@ -564,7 +586,7 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 
 /*
  * find the last mount at @dentry on vfsmount @mnt.
- * vfsmount_lock must be held for read or write.
+ * mount_lock must be held.
  */
 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -596,17 +618,17 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 struct vfsmount *lookup_mnt(struct path *path)
 {
 	struct mount *child_mnt;
+	struct vfsmount *m;
+	unsigned seq;
 
-	br_read_lock(&vfsmount_lock);
-	child_mnt = __lookup_mnt(path->mnt, path->dentry);
-	if (child_mnt) {
-		mnt_add_count(child_mnt, 1);
-		br_read_unlock(&vfsmount_lock);
-		return &child_mnt->mnt;
-	} else {
-		br_read_unlock(&vfsmount_lock);
-		return NULL;
-	}
+	rcu_read_lock();
+	do {
+		seq = read_seqbegin(&mount_lock);
+		child_mnt = __lookup_mnt(path->mnt, path->dentry);
+		m = child_mnt ? &child_mnt->mnt : NULL;
+	} while (!legitimize_mnt(m, seq));
+	rcu_read_unlock();
+	return m;
 }
 
 static struct mountpoint *new_mountpoint(struct dentry *dentry)
@@ -874,38 +896,46 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	return ERR_PTR(err);
 }
 
+static void delayed_free(struct rcu_head *head)
+{
+	struct mount *mnt = container_of(head, struct mount, mnt_rcu);
+	kfree(mnt->mnt_devname);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_pcp);
+#endif
+	kmem_cache_free(mnt_cache, mnt);
+}
+
 static void mntput_no_expire(struct mount *mnt)
 {
 put_again:
-#ifdef CONFIG_SMP
-	br_read_lock(&vfsmount_lock);
-	if (likely(mnt->mnt_ns)) {
-		/* shouldn't be the last one */
-		mnt_add_count(mnt, -1);
-		br_read_unlock(&vfsmount_lock);
+	rcu_read_lock();
+	mnt_add_count(mnt, -1);
+	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
+		rcu_read_unlock();
 		return;
 	}
-	br_read_unlock(&vfsmount_lock);
-
 	lock_mount_hash();
-	mnt_add_count(mnt, -1);
 	if (mnt_get_count(mnt)) {
+		rcu_read_unlock();
 		unlock_mount_hash();
 		return;
 	}
-#else
-	mnt_add_count(mnt, -1);
-	if (likely(mnt_get_count(mnt)))
-		return;
-	lock_mount_hash();
-#endif
 	if (unlikely(mnt->mnt_pinned)) {
 		mnt_add_count(mnt, mnt->mnt_pinned + 1);
 		mnt->mnt_pinned = 0;
+		rcu_read_unlock();
 		unlock_mount_hash();
 		acct_auto_close_mnt(&mnt->mnt);
 		goto put_again;
 	}
+	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
+		rcu_read_unlock();
+		unlock_mount_hash();
+		return;
+	}
+	mnt->mnt.mnt_flags |= MNT_DOOMED;
+	rcu_read_unlock();
 
 	list_del(&mnt->mnt_instance);
 	unlock_mount_hash();
@@ -924,7 +954,8 @@ put_again:
 	fsnotify_vfsmount_delete(&mnt->mnt);
 	dput(mnt->mnt.mnt_root);
 	deactivate_super(mnt->mnt.mnt_sb);
-	free_vfsmnt(mnt);
+	mnt_free_id(mnt);
+	call_rcu(&mnt->mnt_rcu, delayed_free);
 }
 
 void mntput(struct vfsmount *mnt)
@@ -1137,6 +1168,8 @@ static void namespace_unlock(void)
 	list_splice_init(&unmounted, &head);
 	up_write(&namespace_sem);
 
+	synchronize_rcu();
+
 	while (!list_empty(&head)) {
 		mnt = list_first_entry(&head, struct mount, mnt_hash);
 		list_del_init(&mnt->mnt_hash);
@@ -1152,10 +1185,13 @@ static inline void namespace_lock(void)
 }
 
 /*
- * vfsmount lock must be held for write
+ * mount_lock must be held
  * namespace_sem must be held for write
+ * how = 0 => just this tree, don't propagate
+ * how = 1 => propagate; we know that nobody else has reference to any victims
+ * how = 2 => lazy umount
  */
-void umount_tree(struct mount *mnt, int propagate)
+void umount_tree(struct mount *mnt, int how)
 {
 	LIST_HEAD(tmp_list);
 	struct mount *p;
@@ -1163,7 +1199,7 @@ void umount_tree(struct mount *mnt, int propagate)
 	for (p = mnt; p; p = next_mnt(p, mnt))
 		list_move(&p->mnt_hash, &tmp_list);
 
-	if (propagate)
+	if (how)
 		propagate_umount(&tmp_list);
 
 	list_for_each_entry(p, &tmp_list, mnt_hash) {
@@ -1171,6 +1207,8 @@ void umount_tree(struct mount *mnt, int propagate)
 		list_del_init(&p->mnt_list);
 		__touch_mnt_namespace(p->mnt_ns);
 		p->mnt_ns = NULL;
+		if (how < 2)
+			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
 		list_del_init(&p->mnt_child);
 		if (mnt_has_parent(p)) {
 			put_mountpoint(p->mnt_mp);
@@ -1262,14 +1300,18 @@ static int do_umount(struct mount *mnt, int flags)
 	lock_mount_hash();
 	event++;
 
-	if (!(flags & MNT_DETACH))
-		shrink_submounts(mnt);
-
-	retval = -EBUSY;
-	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
+	if (flags & MNT_DETACH) {
 		if (!list_empty(&mnt->mnt_list))
-			umount_tree(mnt, 1);
+			umount_tree(mnt, 2);
 		retval = 0;
+	} else {
+		shrink_submounts(mnt);
+		retval = -EBUSY;
+		if (!propagate_mount_busy(mnt, 2)) {
+			if (!list_empty(&mnt->mnt_list))
+				umount_tree(mnt, 1);
+			retval = 0;
+		}
 	}
 	unlock_mount_hash();
 	namespace_unlock();
@@ -1955,7 +1997,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 	struct mount *parent;
 	int err;
 
-	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
+	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT);
 
 	mp = lock_mount(path);
 	if (IS_ERR(mp))
@@ -2172,7 +2214,7 @@ resume:
  * process a list of expirable mountpoints with the intent of discarding any
  * submounts of a specific parent mountpoint
  *
- * vfsmount_lock must be held for write
+ * mount_lock must be held for write
  */
 static void shrink_submounts(struct mount *mnt)
 {
@@ -2558,7 +2600,7 @@ out_type:
 /*
  * Return true if path is reachable from root
  *
- * namespace_sem or vfsmount_lock is held
+ * namespace_sem or mount_lock is held
  */
 bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
 			 const struct path *root)
@@ -2573,9 +2615,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
 int path_is_under(struct path *path1, struct path *path2)
 {
 	int res;
-	br_read_lock(&vfsmount_lock);
+	read_seqlock_excl(&mount_lock);
 	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
-	br_read_unlock(&vfsmount_lock);
+	read_sequnlock_excl(&mount_lock);
 	return res;
 }
 EXPORT_SYMBOL(path_is_under);
@@ -2748,8 +2790,6 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mountpoint_hashtable[u]);
 
-	br_lock_init(&vfsmount_lock);
-
 	err = sysfs_init();
 	if (err)
 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2788,9 +2828,8 @@ void kern_unmount(struct vfsmount *mnt)
 {
 	/* release long term mount so mount point can be released */
 	if (!IS_ERR_OR_NULL(mnt)) {
-		lock_mount_hash();
 		real_mount(mnt)->mnt_ns = NULL;
-		unlock_mount_hash();
+		synchronize_rcu();	/* yecchhh... */
 		mntput(mnt);
 	}
 }
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 38cd98f112a0..371d346fa270 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -49,6 +49,8 @@ struct mnt_namespace;
 
 #define MNT_LOCK_READONLY	0x400000
 #define MNT_LOCKED		0x800000
+#define MNT_DOOMED		0x1000000
+#define MNT_SYNC_UMOUNT		0x2000000
 
 struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 8e47bc7a1665..492de72560fa 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -16,7 +16,7 @@ struct nameidata {
 	struct path	root;
 	struct inode	*inode; /* path.dentry.d_inode */
 	unsigned int	flags;
-	unsigned	seq;
+	unsigned	seq, m_seq;
 	int		last_type;
 	unsigned	depth;
 	char *saved_names[MAX_NESTED_LINKS + 1];
-- 
cgit v1.2.3


From b61625d24596ea44555943867d5a5c1efd81074c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Oct 2013 11:09:01 -0400
Subject: fold __d_shrink() into its only remaining caller

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index aafa2a146434..fb7bcf3ba5d6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -483,27 +483,6 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 	return parent;
 }
 
-/*
- * Unhash a dentry without inserting an RCU walk barrier or checking that
- * dentry->d_lock is locked.  The caller must take care of that, if
- * appropriate.
- */
-static void __d_shrink(struct dentry *dentry)
-{
-	if (!d_unhashed(dentry)) {
-		struct hlist_bl_head *b;
-		if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
-			b = &dentry->d_sb->s_anon;
-		else
-			b = d_hash(dentry->d_parent, dentry->d_name.hash);
-
-		hlist_bl_lock(b);
-		__hlist_bl_del(&dentry->d_hash);
-		dentry->d_hash.pprev = NULL;
-		hlist_bl_unlock(b);
-	}
-}
-
 /**
  * d_drop - drop a dentry
  * @dentry: dentry to drop
@@ -522,7 +501,16 @@ static void __d_shrink(struct dentry *dentry)
 void __d_drop(struct dentry *dentry)
 {
 	if (!d_unhashed(dentry)) {
-		__d_shrink(dentry);
+		struct hlist_bl_head *b;
+		if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
+			b = &dentry->d_sb->s_anon;
+		else
+			b = d_hash(dentry->d_parent, dentry->d_name.hash);
+
+		hlist_bl_lock(b);
+		__hlist_bl_del(&dentry->d_hash);
+		dentry->d_hash.pprev = NULL;
+		hlist_bl_unlock(b);
 		dentry_rcuwalk_barrier(dentry);
 	}
 }
-- 
cgit v1.2.3


From b18825a7c8e37a7cf6abb97a12a6ad71af160de7 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 12 Sep 2013 19:22:53 +0100
Subject: VFS: Put a small type field into struct dentry::d_flags

Put a type field into struct dentry::d_flags to indicate if the dentry is one
of the following types that relate particularly to pathwalk:

	Miss (negative dentry)
	Directory
	"Automount" directory (defective - no i_op->lookup())
	Symlink
	Other (regular, socket, fifo, device)

The type field is set to one of the first five types on a dentry by calls to
__d_instantiate() and d_obtain_alias() from information in the inode (if one is
given).

The type is cleared by dentry_unlink_inode() when it reconstitutes an existing
dentry as a negative dentry.

Accessors provided are:

	d_set_type(dentry, type)
	d_is_directory(dentry)
	d_is_autodir(dentry)
	d_is_symlink(dentry)
	d_is_file(dentry)
	d_is_negative(dentry)
	d_is_positive(dentry)

A bunch of checks in pathname resolution switched to those.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            |  42 +++++++++++++++++---
 fs/namei.c             |  95 ++++++++++++++++++---------------------------
 include/linux/dcache.h | 103 +++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 158 insertions(+), 82 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index fb7bcf3ba5d6..525770e576db 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -343,6 +343,7 @@ static void dentry_unlink_inode(struct dentry * dentry)
 	__releases(dentry->d_inode->i_lock)
 {
 	struct inode *inode = dentry->d_inode;
+	__d_clear_type(dentry);
 	dentry->d_inode = NULL;
 	hlist_del_init(&dentry->d_alias);
 	dentry_rcuwalk_barrier(dentry);
@@ -1648,14 +1649,42 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 }
 EXPORT_SYMBOL(d_set_d_op);
 
+static unsigned d_flags_for_inode(struct inode *inode)
+{
+	unsigned add_flags = DCACHE_FILE_TYPE;
+
+	if (!inode)
+		return DCACHE_MISS_TYPE;
+
+	if (S_ISDIR(inode->i_mode)) {
+		add_flags = DCACHE_DIRECTORY_TYPE;
+		if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
+			if (unlikely(!inode->i_op->lookup))
+				add_flags = DCACHE_AUTODIR_TYPE;
+			else
+				inode->i_opflags |= IOP_LOOKUP;
+		}
+	} else if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
+		if (unlikely(inode->i_op->follow_link))
+			add_flags = DCACHE_SYMLINK_TYPE;
+		else
+			inode->i_opflags |= IOP_NOFOLLOW;
+	}
+
+	if (unlikely(IS_AUTOMOUNT(inode)))
+		add_flags |= DCACHE_NEED_AUTOMOUNT;
+	return add_flags;
+}
+
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 {
+	unsigned add_flags = d_flags_for_inode(inode);
+
 	spin_lock(&dentry->d_lock);
-	if (inode) {
-		if (unlikely(IS_AUTOMOUNT(inode)))
-			dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+	dentry->d_flags &= ~DCACHE_ENTRY_TYPE;
+	dentry->d_flags |= add_flags;
+	if (inode)
 		hlist_add_head(&dentry->d_alias, &inode->i_dentry);
-	}
 	dentry->d_inode = inode;
 	dentry_rcuwalk_barrier(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -1860,6 +1889,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	static const struct qstr anonstring = QSTR_INIT("/", 1);
 	struct dentry *tmp;
 	struct dentry *res;
+	unsigned add_flags;
 
 	if (!inode)
 		return ERR_PTR(-ESTALE);
@@ -1885,9 +1915,11 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	}
 
 	/* attach a disconnected dentry */
+	add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED;
+
 	spin_lock(&tmp->d_lock);
 	tmp->d_inode = inode;
-	tmp->d_flags |= DCACHE_DISCONNECTED;
+	tmp->d_flags |= add_flags;
 	hlist_add_head(&tmp->d_alias, &inode->i_dentry);
 	hlist_bl_lock(&tmp->d_sb->s_anon);
 	hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
diff --git a/fs/namei.c b/fs/namei.c
index e5c0118ba9f8..e1fa43346c61 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1501,18 +1501,9 @@ static void terminate_walk(struct nameidata *nd)
  * so we keep a cache of "no, this doesn't need follow_link"
  * for the common case.
  */
-static inline int should_follow_link(struct inode *inode, int follow)
+static inline int should_follow_link(struct dentry *dentry, int follow)
 {
-	if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
-		if (likely(inode->i_op->follow_link))
-			return follow;
-
-		/* This gets set once for the inode lifetime */
-		spin_lock(&inode->i_lock);
-		inode->i_opflags |= IOP_NOFOLLOW;
-		spin_unlock(&inode->i_lock);
-	}
-	return 0;
+	return unlikely(d_is_symlink(dentry)) ? follow : 0;
 }
 
 static inline int walk_component(struct nameidata *nd, struct path *path,
@@ -1542,7 +1533,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
 	if (!inode)
 		goto out_path_put;
 
-	if (should_follow_link(inode, follow)) {
+	if (should_follow_link(path->dentry, follow)) {
 		if (nd->flags & LOOKUP_RCU) {
 			if (unlikely(unlazy_walk(nd, path->dentry))) {
 				err = -ECHILD;
@@ -1600,26 +1591,6 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
 	return res;
 }
 
-/*
- * We really don't want to look at inode->i_op->lookup
- * when we don't have to. So we keep a cache bit in
- * the inode ->i_opflags field that says "yes, we can
- * do lookup on this inode".
- */
-static inline int can_lookup(struct inode *inode)
-{
-	if (likely(inode->i_opflags & IOP_LOOKUP))
-		return 1;
-	if (likely(!inode->i_op->lookup))
-		return 0;
-
-	/* We do this once for the lifetime of the inode */
-	spin_lock(&inode->i_lock);
-	inode->i_opflags |= IOP_LOOKUP;
-	spin_unlock(&inode->i_lock);
-	return 1;
-}
-
 /*
  * We can do the critical dentry name comparison and hashing
  * operations one word at a time, but we are limited to:
@@ -1823,7 +1794,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			if (err)
 				return err;
 		}
-		if (!can_lookup(nd->inode)) {
+		if (!d_is_directory(nd->path.dentry)) {
 			err = -ENOTDIR; 
 			break;
 		}
@@ -1841,9 +1812,10 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
 	if (flags & LOOKUP_ROOT) {
-		struct inode *inode = nd->root.dentry->d_inode;
+		struct dentry *root = nd->root.dentry;
+		struct inode *inode = root->d_inode;
 		if (*name) {
-			if (!can_lookup(inode))
+			if (!d_is_directory(root))
 				return -ENOTDIR;
 			retval = inode_permission(inode, MAY_EXEC);
 			if (retval)
@@ -1899,7 +1871,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 		dentry = f.file->f_path.dentry;
 
 		if (*name) {
-			if (!can_lookup(dentry->d_inode)) {
+			if (!d_is_directory(dentry)) {
 				fdput(f);
 				return -ENOTDIR;
 			}
@@ -1981,7 +1953,7 @@ static int path_lookupat(int dfd, const char *name,
 		err = complete_walk(nd);
 
 	if (!err && nd->flags & LOOKUP_DIRECTORY) {
-		if (!can_lookup(nd->inode)) {
+		if (!d_is_directory(nd->path.dentry)) {
 			path_put(&nd->path);
 			err = -ENOTDIR;
 		}
@@ -2273,7 +2245,7 @@ done:
 	}
 	path->dentry = dentry;
 	path->mnt = mntget(nd->path.mnt);
-	if (should_follow_link(dentry->d_inode, nd->flags & LOOKUP_FOLLOW))
+	if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW))
 		return 1;
 	follow_mount(path);
 	error = 0;
@@ -2417,12 +2389,14 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
  *     nfs_async_unlink().
  */
-static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
+static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 {
+	struct inode *inode = victim->d_inode;
 	int error;
 
-	if (!victim->d_inode)
+	if (d_is_negative(victim))
 		return -ENOENT;
+	BUG_ON(!inode);
 
 	BUG_ON(victim->d_parent->d_inode != dir);
 	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
@@ -2432,15 +2406,16 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 		return error;
 	if (IS_APPEND(dir))
 		return -EPERM;
-	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
-	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+
+	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
+	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
 		return -EPERM;
 	if (isdir) {
-		if (!S_ISDIR(victim->d_inode->i_mode))
+		if (!d_is_directory(victim) && !d_is_autodir(victim))
 			return -ENOTDIR;
 		if (IS_ROOT(victim))
 			return -EBUSY;
-	} else if (S_ISDIR(victim->d_inode->i_mode))
+	} else if (d_is_directory(victim) || d_is_autodir(victim))
 		return -EISDIR;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
@@ -2974,7 +2949,7 @@ retry_lookup:
 	/*
 	 * create/update audit record if it already exists.
 	 */
-	if (path->dentry->d_inode)
+	if (d_is_positive(path->dentry))
 		audit_inode(name, path->dentry, 0);
 
 	/*
@@ -3003,12 +2978,12 @@ retry_lookup:
 finish_lookup:
 	/* we _can_ be in RCU mode here */
 	error = -ENOENT;
-	if (!inode) {
+	if (d_is_negative(path->dentry)) {
 		path_to_nameidata(path, nd);
 		goto out;
 	}
 
-	if (should_follow_link(inode, !symlink_ok)) {
+	if (should_follow_link(path->dentry, !symlink_ok)) {
 		if (nd->flags & LOOKUP_RCU) {
 			if (unlikely(unlazy_walk(nd, path->dentry))) {
 				error = -ECHILD;
@@ -3037,10 +3012,11 @@ finish_open:
 	}
 	audit_inode(name, nd->path.dentry, 0);
 	error = -EISDIR;
-	if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
+	if ((open_flag & O_CREAT) &&
+	    (d_is_directory(nd->path.dentry) || d_is_autodir(nd->path.dentry)))
 		goto out;
 	error = -ENOTDIR;
-	if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode))
+	if ((nd->flags & LOOKUP_DIRECTORY) && !d_is_directory(nd->path.dentry))
 		goto out;
 	if (!S_ISREG(nd->inode->i_mode))
 		will_truncate = false;
@@ -3266,7 +3242,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 	nd.root.mnt = mnt;
 	nd.root.dentry = dentry;
 
-	if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
+	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
 		return ERR_PTR(-ELOOP);
 
 	file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU);
@@ -3316,8 +3292,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname,
 		goto unlock;
 
 	error = -EEXIST;
-	if (dentry->d_inode)
+	if (d_is_positive(dentry))
 		goto fail;
+
 	/*
 	 * Special case - lookup gave negative, but... we had foo/bar/
 	 * From the vfs_mknod() POV we just have a negative dentry -
@@ -3706,7 +3683,7 @@ retry:
 		if (nd.last.name[nd.last.len])
 			goto slashes;
 		inode = dentry->d_inode;
-		if (!inode)
+		if (d_is_negative(dentry))
 			goto slashes;
 		ihold(inode);
 		error = security_path_unlink(&nd.path, dentry);
@@ -3731,8 +3708,12 @@ exit1:
 	return error;
 
 slashes:
-	error = !dentry->d_inode ? -ENOENT :
-		S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
+	if (d_is_negative(dentry))
+		error = -ENOENT;
+	else if (d_is_directory(dentry) || d_is_autodir(dentry))
+		error = -EISDIR;
+	else
+		error = -ENOTDIR;
 	goto exit2;
 }
 
@@ -4046,7 +4027,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	       struct inode *new_dir, struct dentry *new_dentry)
 {
 	int error;
-	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
+	int is_dir = d_is_directory(old_dentry) || d_is_autodir(old_dentry);
 	const unsigned char *old_name;
 
 	if (old_dentry->d_inode == new_dentry->d_inode)
@@ -4134,10 +4115,10 @@ retry:
 		goto exit3;
 	/* source must exist */
 	error = -ENOENT;
-	if (!old_dentry->d_inode)
+	if (d_is_negative(old_dentry))
 		goto exit4;
 	/* unless the source is a directory trailing slashes give -ENOTDIR */
-	if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
+	if (!d_is_directory(old_dentry) && !d_is_autodir(old_dentry)) {
 		error = -ENOTDIR;
 		if (oldnd.last.name[oldnd.last.len])
 			goto exit4;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 716c3760ee39..57e87e749a48 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -169,13 +169,13 @@ struct dentry_operations {
  */
 
 /* d_flags entries */
-#define DCACHE_OP_HASH		0x0001
-#define DCACHE_OP_COMPARE	0x0002
-#define DCACHE_OP_REVALIDATE	0x0004
-#define DCACHE_OP_DELETE	0x0008
-#define DCACHE_OP_PRUNE         0x0010
+#define DCACHE_OP_HASH			0x00000001
+#define DCACHE_OP_COMPARE		0x00000002
+#define DCACHE_OP_REVALIDATE		0x00000004
+#define DCACHE_OP_DELETE		0x00000008
+#define DCACHE_OP_PRUNE			0x00000010
 
-#define	DCACHE_DISCONNECTED	0x0020
+#define	DCACHE_DISCONNECTED		0x00000020
      /* This dentry is possibly not currently connected to the dcache tree, in
       * which case its parent will either be itself, or will have this flag as
       * well.  nfsd will not use a dentry with this bit set, but will first
@@ -186,30 +186,38 @@ struct dentry_operations {
       * dentry into place and return that dentry rather than the passed one,
       * typically using d_splice_alias. */
 
-#define DCACHE_REFERENCED	0x0040  /* Recently used, don't discard. */
-#define DCACHE_RCUACCESS	0x0080	/* Entry has ever been RCU-visible */
+#define DCACHE_REFERENCED		0x00000040 /* Recently used, don't discard. */
+#define DCACHE_RCUACCESS		0x00000080 /* Entry has ever been RCU-visible */
 
-#define DCACHE_CANT_MOUNT	0x0100
-#define DCACHE_GENOCIDE		0x0200
-#define DCACHE_SHRINK_LIST	0x0400
+#define DCACHE_CANT_MOUNT		0x00000100
+#define DCACHE_GENOCIDE			0x00000200
+#define DCACHE_SHRINK_LIST		0x00000400
 
-#define DCACHE_OP_WEAK_REVALIDATE	0x0800
+#define DCACHE_OP_WEAK_REVALIDATE	0x00000800
 
-#define DCACHE_NFSFS_RENAMED	0x1000
+#define DCACHE_NFSFS_RENAMED		0x00001000
      /* this dentry has been "silly renamed" and has to be deleted on the last
       * dput() */
-#define DCACHE_COOKIE		0x2000	/* For use by dcookie subsystem */
-#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x4000
+#define DCACHE_COOKIE			0x00002000 /* For use by dcookie subsystem */
+#define DCACHE_FSNOTIFY_PARENT_WATCHED	0x00004000
      /* Parent inode is watched by some fsnotify listener */
 
-#define DCACHE_MOUNTED		0x10000	/* is a mountpoint */
-#define DCACHE_NEED_AUTOMOUNT	0x20000	/* handle automount on this dir */
-#define DCACHE_MANAGE_TRANSIT	0x40000	/* manage transit from this dirent */
+#define DCACHE_DENTRY_KILLED		0x00008000
+
+#define DCACHE_MOUNTED			0x00010000 /* is a mountpoint */
+#define DCACHE_NEED_AUTOMOUNT		0x00020000 /* handle automount on this dir */
+#define DCACHE_MANAGE_TRANSIT		0x00040000 /* manage transit from this dirent */
 #define DCACHE_MANAGED_DENTRY \
 	(DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)
 
-#define DCACHE_LRU_LIST		0x80000
-#define DCACHE_DENTRY_KILLED	0x100000
+#define DCACHE_LRU_LIST			0x00080000
+
+#define DCACHE_ENTRY_TYPE		0x00700000
+#define DCACHE_MISS_TYPE		0x00000000 /* Negative dentry */
+#define DCACHE_DIRECTORY_TYPE		0x00100000 /* Normal directory */
+#define DCACHE_AUTODIR_TYPE		0x00200000 /* Lookupless directory (presumed automount) */
+#define DCACHE_SYMLINK_TYPE		0x00300000 /* Symlink */
+#define DCACHE_FILE_TYPE		0x00400000 /* Other file type */
 
 extern seqlock_t rename_lock;
 
@@ -394,6 +402,61 @@ static inline bool d_mountpoint(const struct dentry *dentry)
 	return dentry->d_flags & DCACHE_MOUNTED;
 }
 
+/*
+ * Directory cache entry type accessor functions.
+ */
+static inline void __d_set_type(struct dentry *dentry, unsigned type)
+{
+	dentry->d_flags = (dentry->d_flags & ~DCACHE_ENTRY_TYPE) | type;
+}
+
+static inline void __d_clear_type(struct dentry *dentry)
+{
+	__d_set_type(dentry, DCACHE_MISS_TYPE);
+}
+
+static inline void d_set_type(struct dentry *dentry, unsigned type)
+{
+	spin_lock(&dentry->d_lock);
+	__d_set_type(dentry, type);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline unsigned __d_entry_type(const struct dentry *dentry)
+{
+	return dentry->d_flags & DCACHE_ENTRY_TYPE;
+}
+
+static inline bool d_is_directory(const struct dentry *dentry)
+{
+	return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE;
+}
+
+static inline bool d_is_autodir(const struct dentry *dentry)
+{
+	return __d_entry_type(dentry) == DCACHE_AUTODIR_TYPE;
+}
+
+static inline bool d_is_symlink(const struct dentry *dentry)
+{
+	return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE;
+}
+
+static inline bool d_is_file(const struct dentry *dentry)
+{
+	return __d_entry_type(dentry) == DCACHE_FILE_TYPE;
+}
+
+static inline bool d_is_negative(const struct dentry *dentry)
+{
+	return __d_entry_type(dentry) == DCACHE_MISS_TYPE;
+}
+
+static inline bool d_is_positive(const struct dentry *dentry)
+{
+	return !d_is_negative(dentry);
+}
+
 extern int sysctl_vfs_cache_pressure;
 
 static inline unsigned long vfs_pressure_ratio(unsigned long val)
-- 
cgit v1.2.3


From 7632e465feb182cadc3c9aa1282a057201818a8c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 28 Jun 2012 12:10:55 -0400
Subject: dcache: use IS_ROOT to decide where dentry is hashed

Every hashed dentry is either hashed in the dentry_hashtable, or a
superblock's s_anon list.

__d_drop() assumes it can determine which is the case by checking
DCACHE_DISCONNECTED; this is not true.

It is true that when DCACHE_DISCONNECTED is cleared, the dentry is not
only hashed on dentry_hashtable, but is fully connected to its parents
back to the root.

But the converse is *not* true: fs/exportfs/expfs.c:reconnect_path()
attempts to connect a directory (found by filehandle lookup) back to
root by ascending to parents and performing lookups one at a time.  It
does not clear DCACHE_DISCONNECTED until it's done, and that is not at
all an atomic process.

In particular, it is possible for DCACHE_DISCONNECTED to be set on a
dentry which is hashed on the dentry_hashtable.

Instead, use IS_ROOT() to check which hash chain a dentry is on.  This
*does* work:

Dentries are hashed only by:

	- d_obtain_alias, which adds an IS_ROOT() dentry to sb_anon.

	- __d_rehash, called by _d_rehash: hashes to the dentry's
	  parent, and all callers of _d_rehash appear to have d_parent
	  set to a "real" parent.
	- __d_rehash, called by __d_move: rehashes the moved dentry to
	  hash chain determined by target, and assigns target's d_parent
	  to its d_parent, before dropping the dentry's d_lock.

Therefore I believe it's safe for a holder of a dentry's d_lock to
assume that it is hashed on sb_anon if and only if IS_ROOT(dentry) is
true.

I believe the incorrect assumption about DCACHE_DISCONNECTED was
originally introduced by ceb5bdc2d246 "fs: dcache per-bucket dcache hash
locking".

Also add a comment while we're here.

Cc: Nick Piggin <npiggin@kernel.dk>
Acked-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 525770e576db..c2add37082b6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -503,7 +503,12 @@ void __d_drop(struct dentry *dentry)
 {
 	if (!d_unhashed(dentry)) {
 		struct hlist_bl_head *b;
-		if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
+		/*
+		 * Hashed dentries are normally on the dentry hashtable,
+		 * with the exception of those newly allocated by
+		 * d_obtain_alias, which are always IS_ROOT:
+		 */
+		if (unlikely(IS_ROOT(dentry)))
 			b = &dentry->d_sb->s_anon;
 		else
 			b = d_hash(dentry->d_parent, dentry->d_name.hash);
-- 
cgit v1.2.3


From e1a24bb0aa6abaa86b5a95638722ea2036dbaadd Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 29 Jun 2012 16:20:47 -0400
Subject: dcache: Don't set DISCONNECTED on "pseudo filesystem" dentries

I can't for the life of me see any reason why anyone should care whether
a dentry that is never hooked into the dentry cache would need
DCACHE_DISCONNECTED set.

This originates from 4b936885ab04dc6e0bb0ef35e0e23c1a7364d9e5 "fs:
improve scalability of pseudo filesystems", which probably just made the
false assumption the DCACHE_DISCONNECTED was meant to be set on anything
not connected to a parent somehow.

So this is just confusing.  Ideally the only uses of DCACHE_DISCONNECTED
would be in the filehandle-lookup code, which needs it to ensure
dentries are connected into the dentry tree before use.

I left d_alloc_pseudo there even though it's now equivalent to
__d_alloc(), just on the theory the name is better documentation of its
intended use outside dcache.c.

Cc: Nick Piggin <npiggin@kernel.dk>
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index c2add37082b6..67662beebc72 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1607,12 +1607,17 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 }
 EXPORT_SYMBOL(d_alloc);
 
+/**
+ * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
+ * @sb: the superblock
+ * @name: qstr of the name
+ *
+ * For a filesystem that just pins its dentries in memory and never
+ * performs lookups at all, return an unhashed IS_ROOT dentry.
+ */
 struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
 {
-	struct dentry *dentry = __d_alloc(sb, name);
-	if (dentry)
-		dentry->d_flags |= DCACHE_DISCONNECTED;
-	return dentry;
+	return __d_alloc(sb, name);
 }
 EXPORT_SYMBOL(d_alloc_pseudo);
 
-- 
cgit v1.2.3


From f80de2cde10350b8d146e375ff8b634e72e6a827 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 18 Jul 2012 16:27:37 -0600
Subject: dcache: don't clear DCACHE_DISCONNECTED too early

DCACHE_DISCONNECTED should not be cleared until we're sure the dentry is
connected all the way up to the root of the filesystem.  It *shouldn't*
be cleared as soon as the dentry is connected to a parent.  That will
cause bugs at least on exportable filesystems.

Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/dcache.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 67662beebc72..72c99b88d264 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2757,7 +2757,6 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 	spin_unlock(&dentry->d_lock);
 
 	/* anon->d_lock still locked, returns locked */
-	anon->d_flags &= ~DCACHE_DISCONNECTED;
 }
 
 /**
-- 
cgit v1.2.3