From fe15ce446beb3a33583af81ffe6c9d01a75314ed Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:23 +1100 Subject: fs: change d_delete semantics Change d_delete from a dentry deletion notification to a dentry caching advise, more like ->drop_inode. Require it to be constant and idempotent, and not take d_lock. This is how all existing filesystems use the callback anyway. This makes fine grained dentry locking of dput and dentry lru scanning much simpler. Signed-off-by: Nick Piggin --- Documentation/filesystems/porting | 8 ++++++++ Documentation/filesystems/vfs.txt | 27 +++++++++++++-------------- 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index b12c89538680..9e71c9ad3108 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -318,3 +318,11 @@ if it's zero is not *and* *never* *had* *been* enough. Final unlink() and iput( may happen while the inode is in the middle of ->write_inode(); e.g. if you blindly free the on-disk inode, you may end up doing that while ->write_inode() is writing to it. + +--- +[mandatory] + + .d_delete() now only advises the dcache as to whether or not to cache +unreferenced dentries, and is now only called when the dentry refcount goes to +0. Even on 0 refcount transition, it must be able to tolerate being called 0, +1, or more times (eg. constant, idempotent). diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 20899e095e7e..95c0a93f056c 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -847,9 +847,9 @@ defined: struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); - int (*d_hash) (struct dentry *, struct qstr *); - int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); - int (*d_delete)(struct dentry *); + int (*d_hash)(struct dentry *, struct qstr *); + int (*d_compare)(struct dentry *, struct qstr *, struct qstr *); + int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); @@ -864,9 +864,11 @@ struct dentry_operations { d_compare: called when a dentry should be compared with another - d_delete: called when the last reference to a dentry is - deleted. This means no-one is using the dentry, however it is - still valid and in the dcache + d_delete: called when the last reference to a dentry is dropped and the + dcache is deciding whether or not to cache it. Return 1 to delete + immediately, or 0 to cache the dentry. Default is NULL which means to + always cache a reachable dentry. d_delete must be constant and + idempotent. d_release: called when a dentry is really deallocated @@ -910,14 +912,11 @@ manipulate dentries: the usage count) dput: close a handle for a dentry (decrements the usage count). If - the usage count drops to 0, the "d_delete" method is called - and the dentry is placed on the unused list if the dentry is - still in its parents hash list. Putting the dentry on the - unused list just means that if the system needs some RAM, it - goes through the unused list of dentries and deallocates them. - If the dentry has already been unhashed and the usage count - drops to 0, in this case the dentry is deallocated after the - "d_delete" method is called + the usage count drops to 0, and the dentry is still in its + parent's hash, the "d_delete" method is called to check whether + it should be cached. If it should not be cached, or if the dentry + is not hashed, it is deleted. Otherwise cached dentries are put + into an LRU list to be reclaimed on memory shortage. d_drop: this unhashes a dentry from its parents hash list. A subsequent call to dput() will deallocate the dentry if its -- cgit v1.2.3 From 621e155a3591962420eacdd39f6f0aa29ceb221e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:27 +1100 Subject: fs: change d_compare for rcu-walk Change d_compare so it may be called from lock-free RCU lookups. This does put significant restrictions on what may be done from the callback, however there don't seem to have been any problems with in-tree fses. If some strange use case pops up that _really_ cannot cope with the rcu-walk rules, we can just add new rcu-unaware callbacks, which would cause name lookup to drop out of rcu-walk mode. For in-tree filesystems, this is just a mechanical change. Signed-off-by: Nick Piggin --- Documentation/filesystems/Locking | 4 +- Documentation/filesystems/porting | 7 +++ Documentation/filesystems/vfs.txt | 26 +++++++++-- drivers/staging/smbfs/dir.c | 16 ++++--- fs/adfs/dir.c | 8 ++-- fs/affs/namei.c | 46 ++++++++++++-------- fs/cifs/dir.c | 12 ++--- fs/dcache.c | 4 +- fs/fat/namei_msdos.c | 14 +++--- fs/fat/namei_vfat.c | 33 ++++++++------ fs/hfs/hfs_fs.h | 5 ++- fs/hfs/string.c | 14 +++--- fs/hfsplus/hfsplus_fs.h | 5 ++- fs/hfsplus/unicode.c | 15 ++++--- fs/hpfs/dentry.c | 22 ++++++---- fs/isofs/inode.c | 92 ++++++++++++++++++++------------------- fs/isofs/namei.c | 3 +- fs/jfs/namei.c | 11 +++-- fs/ncpfs/dir.c | 25 ++++++----- fs/ncpfs/ncplib_kernel.h | 8 ++-- fs/proc/proc_sysctl.c | 13 +++--- include/linux/dcache.h | 12 +++-- include/linux/ncp_fs.h | 4 +- 23 files changed, 242 insertions(+), 157 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 33fa3e5d38fd..9a76f8d8bf95 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -11,7 +11,9 @@ be able to use diff(1). prototypes: int (*d_revalidate)(struct dentry *, int); int (*d_hash) (struct dentry *, struct qstr *); - int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); + int (*d_compare)(const struct dentry *, const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); int (*d_delete)(struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 9e71c9ad3108..d44511e20828 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -326,3 +326,10 @@ to it. unreferenced dentries, and is now only called when the dentry refcount goes to 0. Even on 0 refcount transition, it must be able to tolerate being called 0, 1, or more times (eg. constant, idempotent). + +--- +[mandatory] + + .d_compare() calling convention and locking rules are significantly +changed. Read updated documentation in Documentation/filesystems/vfs.txt (and +look at examples of other filesystems) for guidance. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 95c0a93f056c..250681b8c7cc 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -848,7 +848,9 @@ defined: struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); int (*d_hash)(struct dentry *, struct qstr *); - int (*d_compare)(struct dentry *, struct qstr *, struct qstr *); + int (*d_compare)(const struct dentry *, const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); @@ -860,9 +862,27 @@ struct dentry_operations { dcache. Most filesystems leave this as NULL, because all their dentries in the dcache are valid - d_hash: called when the VFS adds a dentry to the hash table + d_hash: called when the VFS adds a dentry to the hash table. The first + dentry passed to d_hash is the parent directory that the name is + to be hashed into. - d_compare: called when a dentry should be compared with another + d_compare: called to compare a dentry name with a given name. The first + dentry is the parent of the dentry to be compared, the second is + the parent's inode, then the dentry and inode (may be NULL) of the + child dentry. len and name string are properties of the dentry to be + compared. qstr is the name to compare it with. + + Must be constant and idempotent, and should not take locks if + possible, and should not or store into the dentry or inodes. + Should not dereference pointers outside the dentry or inodes without + lots of care (eg. d_parent, d_inode, d_name should not be used). + + However, our vfsmount is pinned, and RCU held, so the dentries and + inodes won't disappear, neither will our sb or filesystem module. + ->i_sb and ->d_sb may be used. + + It is a tricky calling convention because it needs to be called under + "rcu-walk", ie. without any locks or references on things. d_delete: called when the last reference to a dentry is dropped and the dcache is deciding whether or not to cache it. Return 1 to delete diff --git a/drivers/staging/smbfs/dir.c b/drivers/staging/smbfs/dir.c index 2270d4822c2f..bd63229f43f2 100644 --- a/drivers/staging/smbfs/dir.c +++ b/drivers/staging/smbfs/dir.c @@ -275,7 +275,10 @@ smb_dir_open(struct inode *dir, struct file *file) */ static int smb_lookup_validate(struct dentry *, struct nameidata *); static int smb_hash_dentry(struct dentry *, struct qstr *); -static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *); +static int smb_compare_dentry(const struct dentry *, + const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); static int smb_delete_dentry(const struct dentry *); static const struct dentry_operations smbfs_dentry_operations = @@ -347,14 +350,17 @@ smb_hash_dentry(struct dentry *dir, struct qstr *this) } static int -smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b) +smb_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { int i, result = 1; - if (a->len != b->len) + if (len != name->len) goto out; - for (i=0; i < a->len; i++) { - if (tolower(a->name[i]) != tolower(b->name[i])) + for (i=0; i < len; i++) { + if (tolower(str[i]) != tolower(name->name[i])) goto out; } result = 0; diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index f4287e4de744..c8ed66162bd4 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -237,17 +237,19 @@ adfs_hash(struct dentry *parent, struct qstr *qstr) * requirements of the underlying filesystem. */ static int -adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name) +adfs_compare(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { int i; - if (entry->len != name->len) + if (len != name->len) return 1; for (i = 0; i < name->len; i++) { char a, b; - a = entry->name[i]; + a = str[i]; b = name->name[i]; if (a >= 'A' && a <= 'Z') diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 914d1c0bc07a..547d5deb0d42 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -14,10 +14,16 @@ typedef int (*toupper_t)(int); static int affs_toupper(int ch); static int affs_hash_dentry(struct dentry *, struct qstr *); -static int affs_compare_dentry(struct dentry *, struct qstr *, struct qstr *); +static int affs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); static int affs_intl_toupper(int ch); static int affs_intl_hash_dentry(struct dentry *, struct qstr *); -static int affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *); +static int affs_intl_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); const struct dentry_operations affs_dentry_operations = { .d_hash = affs_hash_dentry, @@ -88,29 +94,29 @@ affs_intl_hash_dentry(struct dentry *dentry, struct qstr *qstr) return __affs_hash_dentry(dentry, qstr, affs_intl_toupper); } -static inline int -__affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, toupper_t toupper) +static inline int __affs_compare_dentry(unsigned int len, + const char *str, const struct qstr *name, toupper_t toupper) { - const u8 *aname = a->name; - const u8 *bname = b->name; - int len; + const u8 *aname = str; + const u8 *bname = name->name; - /* 'a' is the qstr of an already existing dentry, so the name - * must be valid. 'b' must be validated first. + /* + * 'str' is the name of an already existing dentry, so the name + * must be valid. 'name' must be validated first. */ - if (affs_check_name(b->name,b->len)) + if (affs_check_name(name->name, name->len)) return 1; - /* If the names are longer than the allowed 30 chars, + /* + * If the names are longer than the allowed 30 chars, * the excess is ignored, so their length may differ. */ - len = a->len; if (len >= 30) { - if (b->len < 30) + if (name->len < 30) return 1; len = 30; - } else if (len != b->len) + } else if (len != name->len) return 1; for (; len > 0; len--) @@ -121,14 +127,18 @@ __affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, tou } static int -affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) +affs_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(dentry, a, b, affs_toupper); + return __affs_compare_dentry(len, str, name, affs_toupper); } static int -affs_intl_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) +affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(dentry, a, b, affs_intl_toupper); + return __affs_compare_dentry(len, str, name, affs_intl_toupper); } /* diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 521d841b1fd1..c60133f0d8e4 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -715,13 +715,15 @@ static int cifs_ci_hash(struct dentry *dentry, struct qstr *q) return 0; } -static int cifs_ci_compare(struct dentry *dentry, struct qstr *a, - struct qstr *b) +static int cifs_ci_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; + struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls; - if ((a->len == b->len) && - (nls_strnicmp(codepage, a->name, b->name, a->len) == 0)) + if ((name->len == len) && + (nls_strnicmp(codepage, name->name, str, len) == 0)) return 0; return 1; } diff --git a/fs/dcache.c b/fs/dcache.c index 814e5f491e9c..7075555fbb04 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1437,7 +1437,9 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) */ qstr = &dentry->d_name; if (parent->d_op && parent->d_op->d_compare) { - if (parent->d_op->d_compare(parent, qstr, name)) + if (parent->d_op->d_compare(parent, parent->d_inode, + dentry, dentry->d_inode, + qstr->len, qstr->name, name)) goto next; } else { if (qstr->len != len) diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 3345aabd1dd7..99d3c7ac973c 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -164,16 +164,18 @@ static int msdos_hash(struct dentry *dentry, struct qstr *qstr) * Compare two msdos names. If either of the names are invalid, * we fall back to doing the standard name comparison. */ -static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) +static int msdos_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; + struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options; unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME]; int error; - error = msdos_format_name(a->name, a->len, a_msdos_name, options); + error = msdos_format_name(name->name, name->len, a_msdos_name, options); if (error) goto old_compare; - error = msdos_format_name(b->name, b->len, b_msdos_name, options); + error = msdos_format_name(str, len, b_msdos_name, options); if (error) goto old_compare; error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME); @@ -182,8 +184,8 @@ out: old_compare: error = 1; - if (a->len == b->len) - error = memcmp(a->name, b->name, a->len); + if (name->len == len) + error = memcmp(name->name, str, len); goto out; } diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index b936703b8924..95e00ab84c3f 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -85,15 +85,18 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) } /* returns the length of a struct qstr, ignoring trailing dots */ -static unsigned int vfat_striptail_len(struct qstr *qstr) +static unsigned int __vfat_striptail_len(unsigned int len, const char *name) { - unsigned int len = qstr->len; - - while (len && qstr->name[len - 1] == '.') + while (len && name[len - 1] == '.') len--; return len; } +static unsigned int vfat_striptail_len(const struct qstr *qstr) +{ + return __vfat_striptail_len(qstr->len, qstr->name); +} + /* * Compute the hash for the vfat name corresponding to the dentry. * Note: if the name is invalid, we leave the hash code unchanged so @@ -133,16 +136,18 @@ static int vfat_hashi(struct dentry *dentry, struct qstr *qstr) /* * Case insensitive compare of two vfat names. */ -static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b) +static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; + struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; unsigned int alen, blen; /* A filename cannot end in '.' or we treat it like it has none */ - alen = vfat_striptail_len(a); - blen = vfat_striptail_len(b); + alen = vfat_striptail_len(name); + blen = __vfat_striptail_len(len, str); if (alen == blen) { - if (nls_strnicmp(t, a->name, b->name, alen) == 0) + if (nls_strnicmp(t, name->name, str, alen) == 0) return 0; } return 1; @@ -151,15 +156,17 @@ static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b) /* * Case sensitive compare of two vfat names. */ -static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) +static int vfat_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { unsigned int alen, blen; /* A filename cannot end in '.' or we treat it like it has none */ - alen = vfat_striptail_len(a); - blen = vfat_striptail_len(b); + alen = vfat_striptail_len(name); + blen = __vfat_striptail_len(len, str); if (alen == blen) { - if (strncmp(a->name, b->name, alen) == 0) + if (strncmp(name->name, str, alen) == 0) return 0; } return 1; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index c8cffb81e849..8cd876f0e961 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -216,7 +216,10 @@ extern const struct dentry_operations hfs_dentry_operations; extern int hfs_hash_dentry(struct dentry *, struct qstr *); extern int hfs_strcmp(const unsigned char *, unsigned int, const unsigned char *, unsigned int); -extern int hfs_compare_dentry(struct dentry *, struct qstr *, struct qstr *); +extern int hfs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); /* trans.c */ extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *); diff --git a/fs/hfs/string.c b/fs/hfs/string.c index 927a5af79428..aaf90d0d6940 100644 --- a/fs/hfs/string.c +++ b/fs/hfs/string.c @@ -92,21 +92,21 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1, * Test for equality of two strings in the HFS filename character ordering. * return 1 on failure and 0 on success */ -int hfs_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2) +int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { const unsigned char *n1, *n2; - int len; - len = s1->len; if (len >= HFS_NAMELEN) { - if (s2->len < HFS_NAMELEN) + if (name->len < HFS_NAMELEN) return 1; len = HFS_NAMELEN; - } else if (len != s2->len) + } else if (len != name->len) return 1; - n1 = s1->name; - n2 = s2->name; + n1 = str; + n2 = name->name; while (len--) { if (caseorder[*n1++] != caseorder[*n2++]) return 1; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index cb3653efb57a..7aa96eefe483 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -380,7 +380,10 @@ int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *) int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int); int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str); -int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2); +int hfsplus_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); /* wrapper.c */ int hfsplus_read_wrapper(struct super_block *); diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index b66d67de882c..b178c997efa8 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -363,9 +363,12 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) * Composed unicode characters are decomposed and case-folding is performed * if the appropriate bits are (un)set on the superblock. */ -int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2) +int hfsplus_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct super_block *sb = dentry->d_sb; + struct super_block *sb = parent->d_sb; int casefold, decompose, size; int dsize1, dsize2, len1, len2; const u16 *dstr1, *dstr2; @@ -375,10 +378,10 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr * casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); - astr1 = s1->name; - len1 = s1->len; - astr2 = s2->name; - len2 = s2->len; + astr1 = str; + len1 = len; + astr2 = name->name; + len2 = name->len; dsize1 = dsize2 = 0; dstr1 = dstr2 = NULL; diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index 67d9d36b3d5f..dd9b1e74a734 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -34,19 +34,25 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr) return 0; } -static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) +static int hpfs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - unsigned al=a->len; - unsigned bl=b->len; - hpfs_adjust_length(a->name, &al); + unsigned al = len; + unsigned bl = name->len; + + hpfs_adjust_length(str, &al); /*hpfs_adjust_length(b->name, &bl);*/ - /* 'a' is the qstr of an already existing dentry, so the name - * must be valid. 'b' must be validated first. + + /* + * 'str' is the nane of an already existing dentry, so the name + * must be valid. 'name' must be validated first. */ - if (hpfs_chk_name(b->name, &bl)) + if (hpfs_chk_name(name->name, &bl)) return 1; - if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0)) + if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0)) return 1; return 0; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index bfdeb82a53be..7b0fbc61af81 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -28,14 +28,26 @@ static int isofs_hashi(struct dentry *parent, struct qstr *qstr); static int isofs_hash(struct dentry *parent, struct qstr *qstr); -static int isofs_dentry_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b); -static int isofs_dentry_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b); +static int isofs_dentry_cmpi(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +static int isofs_dentry_cmp(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); #ifdef CONFIG_JOLIET static int isofs_hashi_ms(struct dentry *parent, struct qstr *qstr); static int isofs_hash_ms(struct dentry *parent, struct qstr *qstr); -static int isofs_dentry_cmpi_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); -static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); +static int isofs_dentry_cmpi_ms(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +static int isofs_dentry_cmp_ms(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); #endif static void isofs_put_super(struct super_block *sb) @@ -206,49 +218,31 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms) } /* - * Case insensitive compare of two isofs names. + * Compare of two isofs names. */ -static int isofs_dentry_cmpi_common(struct dentry *dentry, struct qstr *a, - struct qstr *b, int ms) +static int isofs_dentry_cmp_common( + unsigned int len, const char *str, + const struct qstr *name, int ms, int ci) { int alen, blen; /* A filename cannot end in '.' or we treat it like it has none */ - alen = a->len; - blen = b->len; + alen = name->len; + blen = len; if (ms) { - while (alen && a->name[alen-1] == '.') + while (alen && name->name[alen-1] == '.') alen--; - while (blen && b->name[blen-1] == '.') + while (blen && str[blen-1] == '.') blen--; } if (alen == blen) { - if (strnicmp(a->name, b->name, alen) == 0) - return 0; - } - return 1; -} - -/* - * Case sensitive compare of two isofs names. - */ -static int isofs_dentry_cmp_common(struct dentry *dentry, struct qstr *a, - struct qstr *b, int ms) -{ - int alen, blen; - - /* A filename cannot end in '.' or we treat it like it has none */ - alen = a->len; - blen = b->len; - if (ms) { - while (alen && a->name[alen-1] == '.') - alen--; - while (blen && b->name[blen-1] == '.') - blen--; - } - if (alen == blen) { - if (strncmp(a->name, b->name, alen) == 0) - return 0; + if (ci) { + if (strnicmp(name->name, str, alen) == 0) + return 0; + } else { + if (strncmp(name->name, str, alen) == 0) + return 0; + } } return 1; } @@ -266,15 +260,19 @@ isofs_hashi(struct dentry *dentry, struct qstr *qstr) } static int -isofs_dentry_cmp(struct dentry *dentry,struct qstr *a,struct qstr *b) +isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return isofs_dentry_cmp_common(dentry, a, b, 0); + return isofs_dentry_cmp_common(len, str, name, 0, 0); } static int -isofs_dentry_cmpi(struct dentry *dentry,struct qstr *a,struct qstr *b) +isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return isofs_dentry_cmpi_common(dentry, a, b, 0); + return isofs_dentry_cmp_common(len, str, name, 0, 1); } #ifdef CONFIG_JOLIET @@ -291,15 +289,19 @@ isofs_hashi_ms(struct dentry *dentry, struct qstr *qstr) } static int -isofs_dentry_cmp_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) +isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return isofs_dentry_cmp_common(dentry, a, b, 1); + return isofs_dentry_cmp_common(len, str, name, 1, 0); } static int -isofs_dentry_cmpi_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) +isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return isofs_dentry_cmpi_common(dentry, a, b, 1); + return isofs_dentry_cmp_common(len, str, name, 1, 1); } #endif diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 0d23abfd4280..715f7d318046 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -37,7 +37,8 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen) qstr.name = compare; qstr.len = dlen; - return dentry->d_op->d_compare(dentry, &dentry->d_name, &qstr); + return dentry->d_op->d_compare(NULL, NULL, NULL, NULL, + dentry->d_name.len, dentry->d_name.name, &qstr); } /* diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 2da1546161fa..92129016cd79 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1587,14 +1587,17 @@ static int jfs_ci_hash(struct dentry *dir, struct qstr *this) return 0; } -static int jfs_ci_compare(struct dentry *dir, struct qstr *a, struct qstr *b) +static int jfs_ci_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { int i, result = 1; - if (a->len != b->len) + if (len != name->len) goto out; - for (i=0; i < a->len; i++) { - if (tolower(a->name[i]) != tolower(b->name[i])) + for (i=0; i < len; i++) { + if (tolower(str[i]) != tolower(name->name[i])) goto out; } result = 0; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index e80ea4e37c48..3bcc68aed416 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -75,7 +75,9 @@ const struct inode_operations ncp_dir_inode_operations = */ static int ncp_lookup_validate(struct dentry *, struct nameidata *); static int ncp_hash_dentry(struct dentry *, struct qstr *); -static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *); +static int ncp_compare_dentry(const struct dentry *, const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); static int ncp_delete_dentry(const struct dentry *); static const struct dentry_operations ncp_dentry_operations = @@ -113,10 +115,10 @@ static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator) #define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS) -static inline int ncp_case_sensitive(struct dentry *dentry) +static inline int ncp_case_sensitive(const struct inode *i) { #ifdef CONFIG_NCPFS_NFS_NS - return ncp_namespace(dentry->d_inode) == NW_NS_NFS; + return ncp_namespace(i) == NW_NS_NFS; #else return 0; #endif /* CONFIG_NCPFS_NFS_NS */ @@ -129,12 +131,13 @@ static inline int ncp_case_sensitive(struct dentry *dentry) static int ncp_hash_dentry(struct dentry *dentry, struct qstr *this) { - if (!ncp_case_sensitive(dentry)) { + if (!ncp_case_sensitive(dentry->d_inode)) { + struct super_block *sb = dentry->d_sb; struct nls_table *t; unsigned long hash; int i; - t = NCP_IO_TABLE(dentry); + t = NCP_IO_TABLE(sb); hash = init_name_hash(); for (i=0; ilen ; i++) hash = partial_name_hash(ncp_tolower(t, this->name[i]), @@ -145,15 +148,17 @@ ncp_hash_dentry(struct dentry *dentry, struct qstr *this) } static int -ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) +ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - if (a->len != b->len) + if (len != name->len) return 1; - if (ncp_case_sensitive(dentry)) - return strncmp(a->name, b->name, a->len); + if (ncp_case_sensitive(pinode)) + return strncmp(str, name->name, len); - return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len); + return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len); } /* diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 3c57eca634ce..244d1b73fda7 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -135,7 +135,7 @@ int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *, const unsigned char *, unsigned int, int); #define NCP_ESC ':' -#define NCP_IO_TABLE(dentry) (NCP_SERVER((dentry)->d_inode)->nls_io) +#define NCP_IO_TABLE(sb) (NCP_SBP(sb)->nls_io) #define ncp_tolower(t, c) nls_tolower(t, c) #define ncp_toupper(t, c) nls_toupper(t, c) #define ncp_strnicmp(t, s1, s2, len) \ @@ -150,15 +150,15 @@ int ncp__io2vol(unsigned char *, unsigned int *, int ncp__vol2io(unsigned char *, unsigned int *, const unsigned char *, unsigned int, int); -#define NCP_IO_TABLE(dentry) NULL +#define NCP_IO_TABLE(sb) NULL #define ncp_tolower(t, c) tolower(c) #define ncp_toupper(t, c) toupper(c) #define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U) #define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U) -static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1, - const unsigned char *s2, int len) +static inline int ncp_strnicmp(const struct nls_table *t, + const unsigned char *s1, const unsigned char *s2, int len) { while (len--) { if (tolower(*s1++) != tolower(*s2++)) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a256d770ea18..ae4b0fd9033f 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -397,15 +397,16 @@ static int proc_sys_delete(const struct dentry *dentry) return !!PROC_I(dentry->d_inode)->sysctl->unregistering; } -static int proc_sys_compare(struct dentry *dir, struct qstr *qstr, - struct qstr *name) +static int proc_sys_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct dentry *dentry = container_of(qstr, struct dentry, d_name); - if (qstr->len != name->len) + if (name->len != len) return 1; - if (memcmp(qstr->name, name->name, name->len)) + if (memcmp(name->name, str, len)) return 1; - return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl); + return !sysctl_is_seen(PROC_I(inode)->sysctl); } static const struct dentry_operations proc_sys_dentry_operations = { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6cdf4995c90a..75a072bf2a34 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -134,7 +134,9 @@ enum dentry_d_lock_class struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); int (*d_hash)(struct dentry *, struct qstr *); - int (*d_compare)(struct dentry *, struct qstr *, struct qstr *); + int (*d_compare)(const struct dentry *, const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); @@ -145,12 +147,8 @@ struct dentry_operations { * Locking rules for dentry_operations callbacks are to be found in * Documentation/filesystems/Locking. Keep it updated! * - * the dentry parameter passed to d_hash and d_compare is the parent - * directory of the entries to be compared. It is used in case these - * functions need any directory specific information for determining - * equivalency classes. Using the dentry itself might not work, as it - * might be a negative dentry which has no information associated with - * it. + * FUrther descriptions are found in Documentation/filesystems/vfs.txt. + * Keep it updated too! */ /* d_flags entries */ diff --git a/include/linux/ncp_fs.h b/include/linux/ncp_fs.h index ef663061d5ac..1c27f201c856 100644 --- a/include/linux/ncp_fs.h +++ b/include/linux/ncp_fs.h @@ -184,13 +184,13 @@ struct ncp_entry_info { __u8 file_handle[6]; }; -static inline struct ncp_server *NCP_SBP(struct super_block *sb) +static inline struct ncp_server *NCP_SBP(const struct super_block *sb) { return sb->s_fs_info; } #define NCP_SERVER(inode) NCP_SBP((inode)->i_sb) -static inline struct ncp_inode_info *NCP_FINFO(struct inode *inode) +static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode) { return container_of(inode, struct ncp_inode_info, vfs_inode); } -- cgit v1.2.3 From b1e6a015a580ad145689ad1d6b4aa0e03e6c868b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:28 +1100 Subject: fs: change d_hash for rcu-walk Change d_hash so it may be called from lock-free RCU lookups. See similar patch for d_compare for details. For in-tree filesystems, this is just a mechanical change. Signed-off-by: Nick Piggin --- Documentation/filesystems/Locking | 5 +++-- Documentation/filesystems/porting | 7 +++++++ Documentation/filesystems/vfs.txt | 8 ++++++-- drivers/staging/smbfs/cache.c | 2 +- drivers/staging/smbfs/dir.c | 6 ++++-- fs/adfs/dir.c | 3 ++- fs/affs/namei.c | 20 ++++++++++++-------- fs/cifs/dir.c | 5 +++-- fs/cifs/readdir.c | 2 +- fs/dcache.c | 2 +- fs/ecryptfs/inode.c | 4 ++-- fs/fat/namei_msdos.c | 3 ++- fs/fat/namei_vfat.c | 8 +++++--- fs/gfs2/dentry.c | 3 ++- fs/hfs/hfs_fs.h | 3 ++- fs/hfs/string.c | 3 ++- fs/hfsplus/hfsplus_fs.h | 3 ++- fs/hfsplus/unicode.c | 3 ++- fs/hpfs/dentry.c | 3 ++- fs/isofs/inode.c | 28 ++++++++++++++++++---------- fs/jfs/namei.c | 3 ++- fs/namei.c | 5 +++-- fs/ncpfs/dir.c | 10 ++++++---- fs/sysv/namei.c | 3 ++- include/linux/dcache.h | 3 ++- 25 files changed, 94 insertions(+), 51 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 9a76f8d8bf95..a15ee207b449 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -10,7 +10,8 @@ be able to use diff(1). --------------------------- dentry_operations -------------------------- prototypes: int (*d_revalidate)(struct dentry *, int); - int (*d_hash) (struct dentry *, struct qstr *); + int (*d_hash)(const struct dentry *, const struct inode *, + struct qstr *); int (*d_compare)(const struct dentry *, const struct inode *, const struct dentry *, const struct inode *, unsigned int, const char *, const struct qstr *); @@ -22,7 +23,7 @@ prototypes: locking rules: dcache_lock rename_lock ->d_lock may block d_revalidate: no no no yes -d_hash no no no yes +d_hash no no no no d_compare: no yes no no d_delete: yes no yes no d_release: no no no yes diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index d44511e20828..9fd31940a8ef 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -333,3 +333,10 @@ unreferenced dentries, and is now only called when the dentry refcount goes to .d_compare() calling convention and locking rules are significantly changed. Read updated documentation in Documentation/filesystems/vfs.txt (and look at examples of other filesystems) for guidance. + +--- +[mandatory] + + .d_hash() calling convention and locking rules are significantly +changed. Read updated documentation in Documentation/filesystems/vfs.txt (and +look at examples of other filesystems) for guidance. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 250681b8c7cc..69b10ff5ec81 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -847,7 +847,8 @@ defined: struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); - int (*d_hash)(struct dentry *, struct qstr *); + int (*d_hash)(const struct dentry *, const struct inode *, + struct qstr *); int (*d_compare)(const struct dentry *, const struct inode *, const struct dentry *, const struct inode *, unsigned int, const char *, const struct qstr *); @@ -864,7 +865,10 @@ struct dentry_operations { d_hash: called when the VFS adds a dentry to the hash table. The first dentry passed to d_hash is the parent directory that the name is - to be hashed into. + to be hashed into. The inode is the dentry's inode. + + Same locking and synchronisation rules as d_compare regarding + what is safe to dereference etc. d_compare: called to compare a dentry name with a given name. The first dentry is the parent of the dentry to be compared, the second is diff --git a/drivers/staging/smbfs/cache.c b/drivers/staging/smbfs/cache.c index dbd2e1df3ba9..0beded260b00 100644 --- a/drivers/staging/smbfs/cache.c +++ b/drivers/staging/smbfs/cache.c @@ -134,7 +134,7 @@ smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, qname->hash = full_name_hash(qname->name, qname->len); if (dentry->d_op && dentry->d_op->d_hash) - if (dentry->d_op->d_hash(dentry, qname) != 0) + if (dentry->d_op->d_hash(dentry, inode, qname) != 0) goto end_advance; newdent = d_lookup(dentry, qname); diff --git a/drivers/staging/smbfs/dir.c b/drivers/staging/smbfs/dir.c index bd63229f43f2..5f79799d5d4a 100644 --- a/drivers/staging/smbfs/dir.c +++ b/drivers/staging/smbfs/dir.c @@ -274,7 +274,8 @@ smb_dir_open(struct inode *dir, struct file *file) * Dentry operations routines */ static int smb_lookup_validate(struct dentry *, struct nameidata *); -static int smb_hash_dentry(struct dentry *, struct qstr *); +static int smb_hash_dentry(const struct dentry *, const struct inode *, + struct qstr *); static int smb_compare_dentry(const struct dentry *, const struct inode *, const struct dentry *, const struct inode *, @@ -336,7 +337,8 @@ smb_lookup_validate(struct dentry * dentry, struct nameidata *nd) } static int -smb_hash_dentry(struct dentry *dir, struct qstr *this) +smb_hash_dentry(const struct dentry *dir, const struct inode *inode, + struct qstr *this) { unsigned long hash; int i; diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index c8ed66162bd4..a11e5e102716 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -201,7 +201,8 @@ const struct file_operations adfs_dir_operations = { }; static int -adfs_hash(struct dentry *parent, struct qstr *qstr) +adfs_hash(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr) { const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen; const unsigned char *name; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 547d5deb0d42..5aca08c21100 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -13,13 +13,15 @@ typedef int (*toupper_t)(int); static int affs_toupper(int ch); -static int affs_hash_dentry(struct dentry *, struct qstr *); +static int affs_hash_dentry(const struct dentry *, + const struct inode *, struct qstr *); static int affs_compare_dentry(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, unsigned int len, const char *str, const struct qstr *name); static int affs_intl_toupper(int ch); -static int affs_intl_hash_dentry(struct dentry *, struct qstr *); +static int affs_intl_hash_dentry(const struct dentry *, + const struct inode *, struct qstr *); static int affs_intl_compare_dentry(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, @@ -64,13 +66,13 @@ affs_get_toupper(struct super_block *sb) * Note: the dentry argument is the parent dentry. */ static inline int -__affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper) +__affs_hash_dentry(struct qstr *qstr, toupper_t toupper) { const u8 *name = qstr->name; unsigned long hash; int i; - i = affs_check_name(qstr->name,qstr->len); + i = affs_check_name(qstr->name, qstr->len); if (i) return i; @@ -84,14 +86,16 @@ __affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper) } static int -affs_hash_dentry(struct dentry *dentry, struct qstr *qstr) +affs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { - return __affs_hash_dentry(dentry, qstr, affs_toupper); + return __affs_hash_dentry(qstr, affs_toupper); } static int -affs_intl_hash_dentry(struct dentry *dentry, struct qstr *qstr) +affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { - return __affs_hash_dentry(dentry, qstr, affs_intl_toupper); + return __affs_hash_dentry(qstr, affs_intl_toupper); } static inline int __affs_compare_dentry(unsigned int len, diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index c60133f0d8e4..88bfe686ac00 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -700,9 +700,10 @@ const struct dentry_operations cifs_dentry_ops = { /* d_delete: cifs_d_delete, */ /* not needed except for debugging */ }; -static int cifs_ci_hash(struct dentry *dentry, struct qstr *q) +static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *q) { - struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; + struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls; unsigned long hash; int i; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index a73eb9f4bdaf..ee463aeca0b0 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -79,7 +79,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, cFYI(1, "For %s", name->name); if (parent->d_op && parent->d_op->d_hash) - parent->d_op->d_hash(parent, name); + parent->d_op->d_hash(parent, parent->d_inode, name); else name->hash = full_name_hash(name->name, name->len); diff --git a/fs/dcache.c b/fs/dcache.c index 7075555fbb04..6f59f375ed9d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1478,7 +1478,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) */ name->hash = full_name_hash(name->name, name->len); if (dir->d_op && dir->d_op->d_hash) { - if (dir->d_op->d_hash(dir, name) < 0) + if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0) goto out; } dentry = d_lookup(dir, name); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 9d1a22d62765..a1ed7a7cb173 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -454,7 +454,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, lower_name.hash = ecryptfs_dentry->d_name.hash; if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, - &lower_name); + lower_dir_dentry->d_inode, &lower_name); if (rc < 0) goto out_d_drop; } @@ -489,7 +489,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, lower_name.hash = full_name_hash(lower_name.name, lower_name.len); if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, - &lower_name); + lower_dir_dentry->d_inode, &lower_name); if (rc < 0) goto out_d_drop; } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 99d3c7ac973c..3b3e072d8982 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -148,7 +148,8 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len, * that the existing dentry can be used. The msdos fs routines will * return ENOENT or EINVAL as appropriate. */ -static int msdos_hash(struct dentry *dentry, struct qstr *qstr) +static int msdos_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; unsigned char msdos_name[MSDOS_NAME]; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 95e00ab84c3f..4fc06278db48 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -103,7 +103,8 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr) * that the existing dentry can be used. The vfat fs routines will * return ENOENT or EINVAL as appropriate. */ -static int vfat_hash(struct dentry *dentry, struct qstr *qstr) +static int vfat_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); return 0; @@ -115,9 +116,10 @@ static int vfat_hash(struct dentry *dentry, struct qstr *qstr) * that the existing dentry can be used. The vfat fs routines will * return ENOENT or EINVAL as appropriate. */ -static int vfat_hashi(struct dentry *dentry, struct qstr *qstr) +static int vfat_hashi(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { - struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; + struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; const unsigned char *name; unsigned int len; unsigned long hash; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index e80fea2f65ff..50497f65763b 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -100,7 +100,8 @@ fail: return 0; } -static int gfs2_dhash(struct dentry *dentry, struct qstr *str) +static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode, + struct qstr *str) { str->hash = gfs2_disk_hash(str->name, str->len); return 0; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 8cd876f0e961..ad97c2d58287 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -213,7 +213,8 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *); /* string.c */ extern const struct dentry_operations hfs_dentry_operations; -extern int hfs_hash_dentry(struct dentry *, struct qstr *); +extern int hfs_hash_dentry(const struct dentry *, const struct inode *, + struct qstr *); extern int hfs_strcmp(const unsigned char *, unsigned int, const unsigned char *, unsigned int); extern int hfs_compare_dentry(const struct dentry *parent, diff --git a/fs/hfs/string.c b/fs/hfs/string.c index aaf90d0d6940..495a976a3cc9 100644 --- a/fs/hfs/string.c +++ b/fs/hfs/string.c @@ -51,7 +51,8 @@ static unsigned char caseorder[256] = { /* * Hash a string to an integer in a case-independent way */ -int hfs_hash_dentry(struct dentry *dentry, struct qstr *this) +int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *this) { const unsigned char *name = this->name; unsigned int hash, len = this->len; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 7aa96eefe483..a5308f491e3e 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -379,7 +379,8 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unist int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int); -int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str); +int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *str); int hfsplus_compare_dentry(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index b178c997efa8..d800aa0f2c80 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -320,7 +320,8 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, * Composed unicode characters are decomposed and case-folding is performed * if the appropriate bits are (un)set on the superblock. */ -int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) +int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *str) { struct super_block *sb = dentry->d_sb; const char *astr; diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index dd9b1e74a734..35526df1fd32 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -12,7 +12,8 @@ * Note: the dentry argument is the parent dentry. */ -static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr) +static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { unsigned long hash; int i; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 7b0fbc61af81..d204ee4235fd 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -26,8 +26,10 @@ #define BEQUIET -static int isofs_hashi(struct dentry *parent, struct qstr *qstr); -static int isofs_hash(struct dentry *parent, struct qstr *qstr); +static int isofs_hashi(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_hash(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); static int isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, @@ -38,8 +40,10 @@ static int isofs_dentry_cmp(const struct dentry *parent, unsigned int len, const char *str, const struct qstr *name); #ifdef CONFIG_JOLIET -static int isofs_hashi_ms(struct dentry *parent, struct qstr *qstr); -static int isofs_hash_ms(struct dentry *parent, struct qstr *qstr); +static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); static int isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, @@ -172,7 +176,7 @@ struct iso9660_options{ * Compute the hash for the isofs name corresponding to the dentry. */ static int -isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms) +isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; @@ -193,7 +197,7 @@ isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms) * Compute the hash for the isofs name corresponding to the dentry. */ static int -isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms) +isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; @@ -248,13 +252,15 @@ static int isofs_dentry_cmp_common( } static int -isofs_hash(struct dentry *dentry, struct qstr *qstr) +isofs_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { return isofs_hash_common(dentry, qstr, 0); } static int -isofs_hashi(struct dentry *dentry, struct qstr *qstr) +isofs_hashi(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 0); } @@ -277,13 +283,15 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, #ifdef CONFIG_JOLIET static int -isofs_hash_ms(struct dentry *dentry, struct qstr *qstr) +isofs_hash_ms(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { return isofs_hash_common(dentry, qstr, 1); } static int -isofs_hashi_ms(struct dentry *dentry, struct qstr *qstr) +isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 1); } diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 92129016cd79..57f90dad8919 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1574,7 +1574,8 @@ const struct file_operations jfs_dir_operations = { .llseek = generic_file_llseek, }; -static int jfs_ci_hash(struct dentry *dir, struct qstr *this) +static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode, + struct qstr *this) { unsigned long hash; int i; diff --git a/fs/namei.c b/fs/namei.c index 4ff7ca530533..f3b5ca404659 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -731,7 +731,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, * to use its own hash.. */ if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { - int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name); + int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, + nd->path.dentry->d_inode, name); if (err < 0) return err; } @@ -1134,7 +1135,7 @@ static struct dentry *__lookup_hash(struct qstr *name, * to use its own hash.. */ if (base->d_op && base->d_op->d_hash) { - err = base->d_op->d_hash(base, name); + err = base->d_op->d_hash(base, inode, name); dentry = ERR_PTR(err); if (err < 0) goto out; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 3bcc68aed416..bbbf7922f422 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -74,7 +74,8 @@ const struct inode_operations ncp_dir_inode_operations = * Dentry operations routines */ static int ncp_lookup_validate(struct dentry *, struct nameidata *); -static int ncp_hash_dentry(struct dentry *, struct qstr *); +static int ncp_hash_dentry(const struct dentry *, const struct inode *, + struct qstr *); static int ncp_compare_dentry(const struct dentry *, const struct inode *, const struct dentry *, const struct inode *, unsigned int, const char *, const struct qstr *); @@ -129,9 +130,10 @@ static inline int ncp_case_sensitive(const struct inode *i) * is case-sensitive. */ static int -ncp_hash_dentry(struct dentry *dentry, struct qstr *this) +ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *this) { - if (!ncp_case_sensitive(dentry->d_inode)) { + if (!ncp_case_sensitive(inode)) { struct super_block *sb = dentry->d_sb; struct nls_table *t; unsigned long hash; @@ -597,7 +599,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, qname.hash = full_name_hash(qname.name, qname.len); if (dentry->d_op && dentry->d_op->d_hash) - if (dentry->d_op->d_hash(dentry, &qname) != 0) + if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0) goto end_advance; newdent = d_lookup(dentry, &qname); diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 11e7f7d11cd0..7507aeb4c90e 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -27,7 +27,8 @@ static int add_nondir(struct dentry *dentry, struct inode *inode) return err; } -static int sysv_hash(struct dentry *dentry, struct qstr *qstr) +static int sysv_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { /* Truncate the name in place, avoids having to define a compare function. */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 75a072bf2a34..1149e706f04d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -133,7 +133,8 @@ enum dentry_d_lock_class struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); - int (*d_hash)(struct dentry *, struct qstr *); + int (*d_hash)(const struct dentry *, const struct inode *, + struct qstr *); int (*d_compare)(const struct dentry *, const struct inode *, const struct dentry *, const struct inode *, unsigned int, const char *, const struct qstr *); -- cgit v1.2.3 From b5c84bf6f6fa3a7dfdcb556023a62953574b60ee Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:38 +1100 Subject: fs: dcache remove dcache_lock dcache_lock no longer protects anything. remove it. Signed-off-by: Nick Piggin --- Documentation/filesystems/Locking | 16 +-- Documentation/filesystems/dentry-locking.txt | 40 ++++--- Documentation/filesystems/porting | 8 +- arch/powerpc/platforms/cell/spufs/inode.c | 5 +- drivers/infiniband/hw/ipath/ipath_fs.c | 6 +- drivers/infiniband/hw/qib/qib_fs.c | 3 - drivers/staging/pohmelfs/path_entry.c | 2 - drivers/staging/smbfs/cache.c | 4 - drivers/usb/core/inode.c | 3 - fs/9p/vfs_inode.c | 2 - fs/affs/amigaffs.c | 2 - fs/autofs4/autofs_i.h | 3 + fs/autofs4/expire.c | 10 +- fs/autofs4/root.c | 44 ++++---- fs/autofs4/waitq.c | 7 +- fs/ceph/dir.c | 6 +- fs/ceph/inode.c | 4 - fs/cifs/inode.c | 3 - fs/coda/cache.c | 2 - fs/configfs/configfs_internal.h | 2 - fs/configfs/inode.c | 6 +- fs/dcache.c | 160 ++++----------------------- fs/exportfs/expfs.c | 4 - fs/libfs.c | 8 -- fs/namei.c | 9 +- fs/ncpfs/dir.c | 3 - fs/ncpfs/ncplib_kernel.h | 4 - fs/nfs/dir.c | 3 - fs/nfs/getroot.c | 2 - fs/nfs/namespace.c | 3 - fs/notify/fsnotify.c | 2 - fs/ocfs2/dcache.c | 2 - include/linux/dcache.h | 5 +- include/linux/fs.h | 6 +- include/linux/fsnotify.h | 2 - include/linux/fsnotify_backend.h | 11 +- include/linux/namei.h | 1 - kernel/cgroup.c | 6 - mm/filemap.c | 3 - security/selinux/selinuxfs.c | 4 - 40 files changed, 109 insertions(+), 307 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index a15ee207b449..bdad6414dfa0 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -21,14 +21,14 @@ prototypes: char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); locking rules: - dcache_lock rename_lock ->d_lock may block -d_revalidate: no no no yes -d_hash no no no no -d_compare: no yes no no -d_delete: yes no yes no -d_release: no no no yes -d_iput: no no no yes -d_dname: no no no no + rename_lock ->d_lock may block +d_revalidate: no no yes +d_hash no no no +d_compare: yes no no +d_delete: no yes no +d_release: no no yes +d_iput: no no yes +d_dname: no no no --------------------------- inode_operations --------------------------- prototypes: diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt index 79334ed5daa7..30b6a40f5650 100644 --- a/Documentation/filesystems/dentry-locking.txt +++ b/Documentation/filesystems/dentry-locking.txt @@ -31,6 +31,7 @@ significant change is the way d_lookup traverses the hash chain, it doesn't acquire the dcache_lock for this and rely on RCU to ensure that the dentry has not been *freed*. +dcache_lock no longer exists, dentry locking is explained in fs/dcache.c Dcache locking details ====================== @@ -50,14 +51,12 @@ Safe lock-free look-up of dcache hash table Dcache is a complex data structure with the hash table entries also linked together in other lists. In 2.4 kernel, dcache_lock protected -all the lists. We applied RCU only on hash chain walking. The rest of -the lists are still protected by dcache_lock. Some of the important -changes are : +all the lists. RCU dentry hash walking works like this: 1. The deletion from hash chain is done using hlist_del_rcu() macro which doesn't initialize next pointer of the deleted dentry and this allows us to walk safely lock-free while a deletion is - happening. + happening. This is a standard hlist_rcu iteration. 2. Insertion of a dentry into the hash table is done using hlist_add_head_rcu() which take care of ordering the writes - the @@ -66,19 +65,18 @@ changes are : which has since been replaced by hlist_for_each_entry_rcu(), while walking the hash chain. The only requirement is that all initialization to the dentry must be done before - hlist_add_head_rcu() since we don't have dcache_lock protection - while traversing the hash chain. This isn't different from the - existing code. - -3. The dentry looked up without holding dcache_lock by cannot be - returned for walking if it is unhashed. It then may have a NULL - d_inode or other bogosity since RCU doesn't protect the other - fields in the dentry. We therefore use a flag DCACHE_UNHASHED to - indicate unhashed dentries and use this in conjunction with a - per-dentry lock (d_lock). Once looked up without the dcache_lock, - we acquire the per-dentry lock (d_lock) and check if the dentry is - unhashed. If so, the look-up is failed. If not, the reference count - of the dentry is increased and the dentry is returned. + hlist_add_head_rcu() since we don't have lock protection + while traversing the hash chain. + +3. The dentry looked up without holding locks cannot be returned for + walking if it is unhashed. It then may have a NULL d_inode or other + bogosity since RCU doesn't protect the other fields in the dentry. We + therefore use a flag DCACHE_UNHASHED to indicate unhashed dentries + and use this in conjunction with a per-dentry lock (d_lock). Once + looked up without locks, we acquire the per-dentry lock (d_lock) and + check if the dentry is unhashed. If so, the look-up is failed. If not, + the reference count of the dentry is increased and the dentry is + returned. 4. Once a dentry is looked up, it must be ensured during the path walk for that component it doesn't go away. In pre-2.5.10 code, this was @@ -86,10 +84,10 @@ changes are : In some sense, dcache_rcu path walking looks like the pre-2.5.10 version. -5. All dentry hash chain updates must take the dcache_lock as well as - the per-dentry lock in that order. dput() does this to ensure that - a dentry that has just been looked up in another CPU doesn't get - deleted before dget() can be done on it. +5. All dentry hash chain updates must take the per-dentry lock (see + fs/dcache.c). This excludes dput() to ensure that a dentry that has + been looked up concurrently does not get deleted before dget() can + take a ref. 6. There are several ways to do reference counting of RCU protected objects. One such example is in ipv4 route cache where deferred diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 9fd31940a8ef..1eb76959d096 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -216,7 +216,6 @@ had ->revalidate()) add calls in ->follow_link()/->readlink(). ->d_parent changes are not protected by BKL anymore. Read access is safe if at least one of the following is true: * filesystem has no cross-directory rename() - * dcache_lock is held * we know that parent had been locked (e.g. we are looking at ->d_parent of ->lookup() argument). * we are called from ->rename(). @@ -340,3 +339,10 @@ look at examples of other filesystems) for guidance. .d_hash() calling convention and locking rules are significantly changed. Read updated documentation in Documentation/filesystems/vfs.txt (and look at examples of other filesystems) for guidance. + +--- +[mandatory] + dcache_lock is gone, replaced by fine grained locks. See fs/dcache.c +for details of what locks to replace dcache_lock with in order to protect +particular things. Most of the time, a filesystem only needs ->d_lock, which +protects *all* the dcache state of a given dentry. diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 5aef1a7f5e4b..2662b50ea8d4 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -159,21 +159,18 @@ static void spufs_prune_dir(struct dentry *dir) mutex_lock(&dir->d_inode->i_mutex); list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_u.d_child) { - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry)) && dentry->d_inode) { dget_locked_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); simple_unlink(dir->d_inode, dentry); - /* XXX: what is dcache_lock protecting here? Other + /* XXX: what was dcache_lock protecting here? Other * filesystems (IB, configfs) release dcache_lock * before unlink */ - spin_unlock(&dcache_lock); dput(dentry); } else { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } } shrink_dcache_parent(dir); diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index 18aee04a8411..925e88227ded 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -277,18 +277,14 @@ static int remove_file(struct dentry *parent, char *name) goto bail; } - spin_lock(&dcache_lock); spin_lock(&tmp->d_lock); if (!(d_unhashed(tmp) && tmp->d_inode)) { dget_locked_dlock(tmp); __d_drop(tmp); spin_unlock(&tmp->d_lock); - spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, tmp); - } else { + } else spin_unlock(&tmp->d_lock); - spin_unlock(&dcache_lock); - } ret = 0; bail: diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index fe4b242f0094..49af4a6538ba 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -453,17 +453,14 @@ static int remove_file(struct dentry *parent, char *name) goto bail; } - spin_lock(&dcache_lock); spin_lock(&tmp->d_lock); if (!(d_unhashed(tmp) && tmp->d_inode)) { dget_locked_dlock(tmp); __d_drop(tmp); spin_unlock(&tmp->d_lock); - spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, tmp); } else { spin_unlock(&tmp->d_lock); - spin_unlock(&dcache_lock); } ret = 0; diff --git a/drivers/staging/pohmelfs/path_entry.c b/drivers/staging/pohmelfs/path_entry.c index bbe42f42ca8f..400a9fc386ad 100644 --- a/drivers/staging/pohmelfs/path_entry.c +++ b/drivers/staging/pohmelfs/path_entry.c @@ -101,7 +101,6 @@ rename_retry: d = first; seq = read_seqbegin(&rename_lock); rcu_read_lock(); - spin_lock(&dcache_lock); if (!IS_ROOT(d) && d_unhashed(d)) len += UNHASHED_OBSCURE_STRING_SIZE; /* Obscure " (deleted)" string */ @@ -110,7 +109,6 @@ rename_retry: len += d->d_name.len + 1; /* Plus slash */ d = d->d_parent; } - spin_unlock(&dcache_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; diff --git a/drivers/staging/smbfs/cache.c b/drivers/staging/smbfs/cache.c index 920434b6c071..75dfd403fb90 100644 --- a/drivers/staging/smbfs/cache.c +++ b/drivers/staging/smbfs/cache.c @@ -62,7 +62,6 @@ smb_invalidate_dircache_entries(struct dentry *parent) struct list_head *next; struct dentry *dentry; - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { @@ -72,7 +71,6 @@ smb_invalidate_dircache_entries(struct dentry *parent) next = next->next; } spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); } /* @@ -98,7 +96,6 @@ smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) } /* If a pointer is invalid, we search the dentry. */ - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { @@ -115,7 +112,6 @@ smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) dent = NULL; out_unlock: spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); return dent; } diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index 89a0e8366585..1b125c224dcf 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -343,7 +343,6 @@ static int usbfs_empty (struct dentry *dentry) { struct list_head *list; - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); list_for_each(list, &dentry->d_subdirs) { struct dentry *de = list_entry(list, struct dentry, d_u.d_child); @@ -352,13 +351,11 @@ static int usbfs_empty (struct dentry *dentry) if (usbfs_positive(de)) { spin_unlock(&de->d_lock); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return 0; } spin_unlock(&de->d_lock); } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return 1; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 47dfd5d29a6b..1073bca8488c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -270,13 +270,11 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) { struct dentry *dentry; - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); /* Directory should have only one entry. */ BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry)); dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return dentry; } diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 2321cc92d44f..600101a21ba2 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -128,7 +128,6 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino) void *data = dentry->d_fsdata; struct list_head *head, *next; - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); head = &inode->i_dentry; next = head->next; @@ -141,7 +140,6 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino) next = next->next; } spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 9d2ae9b30d9f..0fffe1c24cec 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -16,6 +16,7 @@ #include #include #include +#include #include /* This is the range of ioctl() numbers we claim as ours */ @@ -60,6 +61,8 @@ do { \ current->pid, __func__, ##args); \ } while (0) +extern spinlock_t autofs4_lock; + /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this structure. It holds a reference to the dentry, so dentries are never diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 968c1434af62..2f7951d67d16 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -102,7 +102,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev, if (prev == NULL) return dget(prev); - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); relock: p = prev; spin_lock(&p->d_lock); @@ -114,7 +114,7 @@ again: if (p == root) { spin_unlock(&p->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); dput(prev); return NULL; } @@ -144,7 +144,7 @@ again: dget_dlock(ret); spin_unlock(&ret->d_lock); spin_unlock(&p->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); dput(prev); @@ -408,13 +408,13 @@ found: ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&expired->d_parent->d_lock); spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&expired->d_lock); spin_unlock(&expired->d_parent->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return expired; } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 7a9ed6b88291..10ca68a96dc7 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -23,6 +23,8 @@ #include "autofs_i.h" +DEFINE_SPINLOCK(autofs4_lock); + static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); static int autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); @@ -142,15 +144,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) * autofs file system so just let the libfs routines handle * it. */ - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&dentry->d_lock); if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return -ENOENT; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); out: return dcache_dir_open(inode, file); @@ -255,11 +257,11 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) /* We trigger a mount for almost all flags */ lookup_type = autofs4_need_mount(nd->flags); spin_lock(&sbi->fs_lock); - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&dentry->d_lock); if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); spin_unlock(&sbi->fs_lock); goto follow; } @@ -272,7 +274,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) if (ino->flags & AUTOFS_INF_PENDING || (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); spin_unlock(&sbi->fs_lock); status = try_to_fill_dentry(dentry, nd->flags); @@ -282,7 +284,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) goto follow; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); spin_unlock(&sbi->fs_lock); follow: /* @@ -353,14 +355,14 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) return 0; /* Check for a non-mountpoint directory with no contents */ - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&dentry->d_lock); if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { DPRINTK("dentry=%p %.*s, emptydir", dentry, dentry->d_name.len, dentry->d_name.name); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); /* The daemon never causes a mount to trigger */ if (oz_mode) @@ -377,7 +379,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) return status; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return 1; } @@ -432,7 +434,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); head = &sbi->active_list; list_for_each(p, head) { @@ -465,14 +467,14 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) dget_dlock(active); spin_unlock(&active->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return active; } next: spin_unlock(&active->d_lock); } spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return NULL; } @@ -487,7 +489,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); head = &sbi->expiring_list; list_for_each(p, head) { @@ -520,14 +522,14 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) dget_dlock(expiring); spin_unlock(&expiring->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return expiring; } next: spin_unlock(&expiring->d_lock); } spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return NULL; } @@ -763,12 +765,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = CURRENT_TIME; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); autofs4_add_expiring(dentry); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return 0; } @@ -785,20 +787,20 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) if (!autofs4_oz_mode(sbi)) return -EACCES; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&dentry->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return -ENOTEMPTY; } __autofs4_add_expiring(dentry); spin_unlock(&sbi->lookup_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 4be8f778a418..c5f8459c905e 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -194,14 +194,15 @@ static int autofs4_getpath(struct autofs_sb_info *sbi, rename_retry: buf = *name; len = 0; + seq = read_seqbegin(&rename_lock); rcu_read_lock(); - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) len += tmp->d_name.len + 1; if (!len || --len > NAME_MAX) { - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; @@ -217,7 +218,7 @@ rename_retry: p -= tmp->d_name.len; strncpy(p, tmp->d_name.name, tmp->d_name.len); } - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 2c924e8d85fe..58abc3da6111 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -112,7 +112,6 @@ static int __dcache_readdir(struct file *filp, dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, last); - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); /* start at beginning? */ @@ -156,7 +155,6 @@ more: dget_dlock(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); @@ -182,21 +180,19 @@ more: filp->f_pos++; - /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ + /* make sure a dentry wasn't dropped while we didn't have parent lock */ if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); err = -EAGAIN; goto out; } - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); p = p->prev; /* advance to next dentry */ goto more; out_unlock: spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); out: if (last) dput(last); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2c6944473366..2a48cafc174b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -841,7 +841,6 @@ static void ceph_set_dentry_offset(struct dentry *dn) di->offset = ceph_inode(inode)->i_max_offset++; spin_unlock(&inode->i_lock); - spin_lock(&dcache_lock); spin_lock(&dir->d_lock); spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); list_move(&dn->d_u.d_child, &dir->d_subdirs); @@ -849,7 +848,6 @@ static void ceph_set_dentry_offset(struct dentry *dn) dn->d_u.d_child.prev, dn->d_u.d_child.next); spin_unlock(&dn->d_lock); spin_unlock(&dir->d_lock); - spin_unlock(&dcache_lock); } /* @@ -1233,13 +1231,11 @@ retry_lookup: goto retry_lookup; } else { /* reorder parent's d_subdirs */ - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); list_move(&dn->d_u.d_child, &parent->d_subdirs); spin_unlock(&dn->d_lock); spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); } di = dn->d_fsdata; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 003698365ece..99b9a2cc14b7 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -809,17 +809,14 @@ inode_has_hashed_dentries(struct inode *inode) { struct dentry *dentry; - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); list_for_each_entry(dentry, &inode->i_dentry, d_alias) { if (!d_unhashed(dentry) || IS_ROOT(dentry)) { spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return true; } } spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return false; } diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 859393fca2b7..5525e1c660fd 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -93,7 +93,6 @@ static void coda_flag_children(struct dentry *parent, int flag) struct list_head *child; struct dentry *de; - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); list_for_each(child, &parent->d_subdirs) { @@ -104,7 +103,6 @@ static void coda_flag_children(struct dentry *parent, int flag) coda_flag_inode(de->d_inode, flag); } spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); return; } diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index e58b4c30e216..026cf68553a4 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -120,7 +120,6 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry { struct config_item * item = NULL; - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (!d_unhashed(dentry)) { struct configfs_dirent * sd = dentry->d_fsdata; @@ -131,7 +130,6 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry item = config_item_get(sd->s_element); } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return item; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 79b37765d8ff..fb3a55fff82a 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -250,18 +250,14 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) struct dentry * dentry = sd->s_dentry; if (dentry) { - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry) && dentry->d_inode)) { dget_locked_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, dentry); - } else { + } else spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); - } } } diff --git a/fs/dcache.c b/fs/dcache.c index a9bc4ecc21e1..0dbae053b664 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -54,11 +54,10 @@ * - d_alias, d_inode * * Ordering: - * dcache_lock - * dcache_inode_lock - * dentry->d_lock - * dcache_lru_lock - * dcache_hash_lock + * dcache_inode_lock + * dentry->d_lock + * dcache_lru_lock + * dcache_hash_lock * * If there is an ancestor relationship: * dentry->d_parent->...->d_parent->d_lock @@ -77,12 +76,10 @@ EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_inode_lock); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_hash_lock); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); EXPORT_SYMBOL(dcache_inode_lock); -EXPORT_SYMBOL(dcache_lock); static struct kmem_cache *dentry_cache __read_mostly; @@ -139,7 +136,7 @@ static void __d_free(struct rcu_head *head) } /* - * no dcache_lock, please. + * no locks, please. */ static void d_free(struct dentry *dentry) { @@ -162,7 +159,6 @@ static void d_free(struct dentry *dentry) static void dentry_iput(struct dentry * dentry) __releases(dentry->d_lock) __releases(dcache_inode_lock) - __releases(dcache_lock) { struct inode *inode = dentry->d_inode; if (inode) { @@ -170,7 +166,6 @@ static void dentry_iput(struct dentry * dentry) list_del_init(&dentry->d_alias); spin_unlock(&dentry->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); if (!inode->i_nlink) fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) @@ -180,7 +175,6 @@ static void dentry_iput(struct dentry * dentry) } else { spin_unlock(&dentry->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } } @@ -235,14 +229,13 @@ static void dentry_lru_move_tail(struct dentry *dentry) * * If this is the root of the dentry tree, return NULL. * - * dcache_lock and d_lock and d_parent->d_lock must be held by caller, and - * are dropped by d_kill. + * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by + * d_kill. */ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) __releases(dentry->d_lock) __releases(parent->d_lock) __releases(dcache_inode_lock) - __releases(dcache_lock) { dentry->d_parent = NULL; list_del(&dentry->d_u.d_child); @@ -285,11 +278,9 @@ EXPORT_SYMBOL(__d_drop); void d_drop(struct dentry *dentry) { - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } EXPORT_SYMBOL(d_drop); @@ -337,21 +328,10 @@ repeat: else parent = dentry->d_parent; if (dentry->d_count == 1) { - if (!spin_trylock(&dcache_lock)) { - /* - * Something of a livelock possibility we could avoid - * by taking dcache_lock and trying again, but we - * want to reduce dcache_lock anyway so this will - * get improved. - */ -drop1: - spin_unlock(&dentry->d_lock); - goto repeat; - } if (!spin_trylock(&dcache_inode_lock)) { drop2: - spin_unlock(&dcache_lock); - goto drop1; + spin_unlock(&dentry->d_lock); + goto repeat; } if (parent && !spin_trylock(&parent->d_lock)) { spin_unlock(&dcache_inode_lock); @@ -363,7 +343,6 @@ drop2: spin_unlock(&dentry->d_lock); if (parent) spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); return; } @@ -387,7 +366,6 @@ drop2: if (parent) spin_unlock(&parent->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return; unhash_it: @@ -418,11 +396,9 @@ int d_invalidate(struct dentry * dentry) /* * If it's already been dropped, return OK. */ - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return 0; } /* @@ -431,9 +407,7 @@ int d_invalidate(struct dentry * dentry) */ if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); shrink_dcache_parent(dentry); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); } @@ -450,19 +424,17 @@ int d_invalidate(struct dentry * dentry) if (dentry->d_count > 1) { if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return -EBUSY; } } __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return 0; } EXPORT_SYMBOL(d_invalidate); -/* This must be called with dcache_lock and d_lock held */ +/* This must be called with d_lock held */ static inline struct dentry * __dget_locked_dlock(struct dentry *dentry) { dentry->d_count++; @@ -470,7 +442,7 @@ static inline struct dentry * __dget_locked_dlock(struct dentry *dentry) return dentry; } -/* This should be called _only_ with dcache_lock held */ +/* This must be called with d_lock held */ static inline struct dentry * __dget_locked(struct dentry *dentry) { spin_lock(&dentry->d_lock); @@ -575,11 +547,9 @@ struct dentry *d_find_alias(struct inode *inode) struct dentry *de = NULL; if (!list_empty(&inode->i_dentry)) { - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); de = __d_find_alias(inode, 0); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } return de; } @@ -593,7 +563,6 @@ void d_prune_aliases(struct inode *inode) { struct dentry *dentry; restart: - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); list_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); @@ -602,14 +571,12 @@ restart: __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); dput(dentry); goto restart; } spin_unlock(&dentry->d_lock); } spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } EXPORT_SYMBOL(d_prune_aliases); @@ -625,17 +592,14 @@ static void prune_one_dentry(struct dentry *dentry, struct dentry *parent) __releases(dentry->d_lock) __releases(parent->d_lock) __releases(dcache_inode_lock) - __releases(dcache_lock) { __d_drop(dentry); dentry = d_kill(dentry, parent); /* - * Prune ancestors. Locking is simpler than in dput(), - * because dcache_lock needs to be taken anyway. + * Prune ancestors. */ while (dentry) { - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); again: spin_lock(&dentry->d_lock); @@ -653,7 +617,6 @@ again: spin_unlock(&parent->d_lock); spin_unlock(&dentry->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return; } @@ -702,8 +665,7 @@ relock: spin_unlock(&dcache_lru_lock); prune_one_dentry(dentry, parent); - /* dcache_lock, dcache_inode_lock and dentry->d_lock dropped */ - spin_lock(&dcache_lock); + /* dcache_inode_lock and dentry->d_lock dropped */ spin_lock(&dcache_inode_lock); spin_lock(&dcache_lru_lock); } @@ -725,7 +687,6 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) LIST_HEAD(tmp); int cnt = *count; - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); relock: spin_lock(&dcache_lru_lock); @@ -766,7 +727,6 @@ relock: list_splice(&referenced, &sb->s_dentry_lru); spin_unlock(&dcache_lru_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } /** @@ -788,7 +748,6 @@ static void prune_dcache(int count) if (unused == 0 || count == 0) return; - spin_lock(&dcache_lock); if (count >= unused) prune_ratio = 1; else @@ -825,11 +784,9 @@ static void prune_dcache(int count) if (down_read_trylock(&sb->s_umount)) { if ((sb->s_root != NULL) && (!list_empty(&sb->s_dentry_lru))) { - spin_unlock(&dcache_lock); __shrink_dcache_sb(sb, &w_count, DCACHE_REFERENCED); pruned -= w_count; - spin_lock(&dcache_lock); } up_read(&sb->s_umount); } @@ -845,7 +802,6 @@ static void prune_dcache(int count) if (p) __put_super(p); spin_unlock(&sb_lock); - spin_unlock(&dcache_lock); } /** @@ -859,7 +815,6 @@ void shrink_dcache_sb(struct super_block *sb) { LIST_HEAD(tmp); - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); spin_lock(&dcache_lru_lock); while (!list_empty(&sb->s_dentry_lru)) { @@ -868,7 +823,6 @@ void shrink_dcache_sb(struct super_block *sb) } spin_unlock(&dcache_lru_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -885,12 +839,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) BUG_ON(!IS_ROOT(dentry)); /* detach this root from the system */ - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); dentry_lru_del(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); for (;;) { /* descend to the first leaf in the current subtree */ @@ -899,7 +851,6 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) /* this is a branch with children - detach all of them * from the system in one go */ - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); list_for_each_entry(loop, &dentry->d_subdirs, d_u.d_child) { @@ -910,7 +861,6 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) spin_unlock(&loop->d_lock); } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); /* move to the first child */ dentry = list_entry(dentry->d_subdirs.next, @@ -977,8 +927,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) /* * destroy the dentries attached to a superblock on unmounting - * - we don't need to use dentry->d_lock, and only need dcache_lock when - * removing the dentry from the system lists and hashes because: + * - we don't need to use dentry->d_lock because: * - the superblock is detached from all mountings and open files, so the * dentry trees will not be rearranged by the VFS * - s_umount is write-locked, so the memory pressure shrinker will ignore @@ -1029,7 +978,6 @@ rename_retry: this_parent = parent; seq = read_seqbegin(&rename_lock); - spin_lock(&dcache_lock); if (d_mountpoint(parent)) goto positive; spin_lock(&this_parent->d_lock); @@ -1075,7 +1023,6 @@ resume: if (this_parent != child->d_parent || read_seqretry(&rename_lock, seq)) { spin_unlock(&this_parent->d_lock); - spin_unlock(&dcache_lock); rcu_read_unlock(); goto rename_retry; } @@ -1084,12 +1031,10 @@ resume: goto resume; } spin_unlock(&this_parent->d_lock); - spin_unlock(&dcache_lock); if (read_seqretry(&rename_lock, seq)) goto rename_retry; return 0; /* No mount points found in tree */ positive: - spin_unlock(&dcache_lock); if (read_seqretry(&rename_lock, seq)) goto rename_retry; return 1; @@ -1121,7 +1066,6 @@ rename_retry: this_parent = parent; seq = read_seqbegin(&rename_lock); - spin_lock(&dcache_lock); spin_lock(&this_parent->d_lock); repeat: next = this_parent->d_subdirs.next; @@ -1185,7 +1129,6 @@ resume: if (this_parent != child->d_parent || read_seqretry(&rename_lock, seq)) { spin_unlock(&this_parent->d_lock); - spin_unlock(&dcache_lock); rcu_read_unlock(); goto rename_retry; } @@ -1195,7 +1138,6 @@ resume: } out: spin_unlock(&this_parent->d_lock); - spin_unlock(&dcache_lock); if (read_seqretry(&rename_lock, seq)) goto rename_retry; return found; @@ -1297,7 +1239,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) INIT_LIST_HEAD(&dentry->d_u.d_child); if (parent) { - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry->d_parent = dget_dlock(parent); @@ -1305,7 +1246,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) list_add(&dentry->d_u.d_child, &parent->d_subdirs); spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); } this_cpu_inc(nr_dentry); @@ -1325,7 +1265,6 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name) } EXPORT_SYMBOL(d_alloc_name); -/* the caller must hold dcache_lock */ static void __d_instantiate(struct dentry *dentry, struct inode *inode) { spin_lock(&dentry->d_lock); @@ -1354,11 +1293,9 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!list_empty(&entry->d_alias)); - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); __d_instantiate(entry, inode); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); security_d_instantiate(entry, inode); } EXPORT_SYMBOL(d_instantiate); @@ -1422,11 +1359,9 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) BUG_ON(!list_empty(&entry->d_alias)); - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); result = __d_instantiate_unique(entry, inode); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); if (!result) { security_d_instantiate(entry, inode); @@ -1515,12 +1450,11 @@ struct dentry *d_obtain_alias(struct inode *inode) } tmp->d_parent = tmp; /* make sure dput doesn't croak */ - spin_lock(&dcache_lock); + spin_lock(&dcache_inode_lock); res = __d_find_alias(inode, 0); if (res) { spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); dput(tmp); goto out_iput; } @@ -1538,7 +1472,6 @@ struct dentry *d_obtain_alias(struct inode *inode) spin_unlock(&tmp->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return tmp; out_iput: @@ -1568,21 +1501,18 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) struct dentry *new = NULL; if (inode && S_ISDIR(inode->i_mode)) { - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); new = __d_find_alias(inode, 1); if (new) { BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); security_d_instantiate(new, inode); d_move(new, dentry); iput(inode); } else { - /* already taking dcache_lock, so d_add() by hand */ + /* already taking dcache_inode_lock, so d_add() by hand */ __d_instantiate(dentry, inode); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); security_d_instantiate(dentry, inode); d_rehash(dentry); } @@ -1655,12 +1585,10 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, * Negative dentry: instantiate it unless the inode is a directory and * already has a dentry. */ - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) { __d_instantiate(found, inode); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); security_d_instantiate(found, inode); return found; } @@ -1672,7 +1600,6 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, new = list_entry(inode->i_dentry.next, struct dentry, d_alias); dget_locked(new); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); security_d_instantiate(found, inode); d_move(new, found); iput(inode); @@ -1843,7 +1770,6 @@ int d_validate(struct dentry *dentry, struct dentry *dparent) { struct dentry *child; - spin_lock(&dcache_lock); spin_lock(&dparent->d_lock); list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) { if (dentry == child) { @@ -1851,12 +1777,10 @@ int d_validate(struct dentry *dentry, struct dentry *dparent) __dget_locked_dlock(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&dparent->d_lock); - spin_unlock(&dcache_lock); return 1; } } spin_unlock(&dparent->d_lock); - spin_unlock(&dcache_lock); return 0; } @@ -1889,7 +1813,6 @@ void d_delete(struct dentry * dentry) /* * Are we the only user? */ - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); spin_lock(&dentry->d_lock); isdir = S_ISDIR(dentry->d_inode->i_mode); @@ -1905,7 +1828,6 @@ void d_delete(struct dentry * dentry) spin_unlock(&dentry->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); fsnotify_nameremove(dentry, isdir); } @@ -1932,13 +1854,11 @@ static void _d_rehash(struct dentry * entry) void d_rehash(struct dentry * entry) { - spin_lock(&dcache_lock); spin_lock(&entry->d_lock); spin_lock(&dcache_hash_lock); _d_rehash(entry); spin_unlock(&dcache_hash_lock); spin_unlock(&entry->d_lock); - spin_unlock(&dcache_lock); } EXPORT_SYMBOL(d_rehash); @@ -1961,11 +1881,9 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } EXPORT_SYMBOL(dentry_update_name_case); @@ -2058,14 +1976,14 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry, * The hash value has to match the hash queue that the dentry is on.. */ /* - * d_move_locked - move a dentry + * d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. */ -static void d_move_locked(struct dentry * dentry, struct dentry * target) +void d_move(struct dentry * dentry, struct dentry * target) { if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); @@ -2114,22 +2032,6 @@ static void d_move_locked(struct dentry * dentry, struct dentry * target) spin_unlock(&dentry->d_lock); write_sequnlock(&rename_lock); } - -/** - * d_move - move a dentry - * @dentry: entry to move - * @target: new dentry - * - * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. - */ - -void d_move(struct dentry * dentry, struct dentry * target) -{ - spin_lock(&dcache_lock); - d_move_locked(dentry, target); - spin_unlock(&dcache_lock); -} EXPORT_SYMBOL(d_move); /** @@ -2155,13 +2057,12 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex and the dcache_lock + * dentry->d_parent->d_inode->i_mutex and the dcache_inode_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) - __releases(dcache_lock) __releases(dcache_inode_lock) { struct mutex *m1 = NULL, *m2 = NULL; @@ -2185,11 +2086,10 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) goto out_err; m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: - d_move_locked(alias, dentry); + d_move(alias, dentry); ret = alias; out_err: spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); if (m2) mutex_unlock(m2); if (m1) @@ -2249,7 +2149,6 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) BUG_ON(!d_unhashed(dentry)); - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); if (!inode) { @@ -2295,7 +2194,6 @@ found: spin_unlock(&dcache_hash_lock); spin_unlock(&actual->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); out_nolock: if (actual == dentry) { security_d_instantiate(dentry, inode); @@ -2307,7 +2205,6 @@ out_nolock: shouldnt_be_hashed: spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); BUG(); } EXPORT_SYMBOL_GPL(d_materialise_unique); @@ -2421,11 +2318,9 @@ char *__d_path(const struct path *path, struct path *root, int error; prepend(&res, &buflen, "\0", 1); - spin_lock(&dcache_lock); write_seqlock(&rename_lock); error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); if (error) return ERR_PTR(error); @@ -2487,14 +2382,12 @@ char *d_path(const struct path *path, char *buf, int buflen) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); get_fs_root(current->fs, &root); - spin_lock(&dcache_lock); write_seqlock(&rename_lock); tmp = root; error = path_with_deleted(path, &tmp, &res, &buflen); if (error) res = ERR_PTR(error); write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); path_put(&root); return res; } @@ -2520,14 +2413,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); get_fs_root(current->fs, &root); - spin_lock(&dcache_lock); write_seqlock(&rename_lock); tmp = root; error = path_with_deleted(path, &tmp, &res, &buflen); if (!error && !path_equal(&tmp, &root)) error = prepend_unreachable(&res, &buflen); write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); path_put(&root); if (error) res = ERR_PTR(error); @@ -2594,11 +2485,9 @@ char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) { char *retval; - spin_lock(&dcache_lock); write_seqlock(&rename_lock); retval = __dentry_path(dentry, buf, buflen); write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); return retval; } @@ -2609,7 +2498,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) char *p = NULL; char *retval; - spin_lock(&dcache_lock); write_seqlock(&rename_lock); if (d_unlinked(dentry)) { p = buf + buflen; @@ -2619,12 +2507,10 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) } retval = __dentry_path(dentry, buf, buflen); write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); if (!IS_ERR(retval) && p) *p = '/'; /* restore '/' overriden with '\0' */ return retval; Elong: - spin_unlock(&dcache_lock); return ERR_PTR(-ENAMETOOLONG); } @@ -2658,7 +2544,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) get_fs_root_and_pwd(current->fs, &root, &pwd); error = -ENOENT; - spin_lock(&dcache_lock); write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; @@ -2669,7 +2554,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &tmp, &cwd, &buflen); write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); if (error) goto out; @@ -2690,7 +2574,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } } else { write_sequnlock(&rename_lock); - spin_unlock(&dcache_lock); } out: @@ -2776,7 +2659,6 @@ void d_genocide(struct dentry *root) rename_retry: this_parent = root; seq = read_seqbegin(&rename_lock); - spin_lock(&dcache_lock); spin_lock(&this_parent->d_lock); repeat: next = this_parent->d_subdirs.next; @@ -2823,7 +2705,6 @@ resume: if (this_parent != child->d_parent || read_seqretry(&rename_lock, seq)) { spin_unlock(&this_parent->d_lock); - spin_unlock(&dcache_lock); rcu_read_unlock(); goto rename_retry; } @@ -2832,7 +2713,6 @@ resume: goto resume; } spin_unlock(&this_parent->d_lock); - spin_unlock(&dcache_lock); if (read_seqretry(&rename_lock, seq)) goto rename_retry; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 84b8c460a781..53a5c08fb63c 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -47,24 +47,20 @@ find_acceptable_alias(struct dentry *result, if (acceptable(context, result)) return result; - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) { dget_locked(dentry); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); if (toput) dput(toput); if (dentry != result && acceptable(context, dentry)) { dput(result); return dentry; } - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); toput = dentry; } spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); if (toput) dput(toput); diff --git a/fs/libfs.c b/fs/libfs.c index cc4794914b52..28b36663c44e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -100,7 +100,6 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) struct dentry *cursor = file->private_data; loff_t n = file->f_pos - 2; - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); /* d_lock not required for cursor */ list_del(&cursor->d_u.d_child); @@ -116,7 +115,6 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) } list_add_tail(&cursor->d_u.d_child, p); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } } mutex_unlock(&dentry->d_inode->i_mutex); @@ -159,7 +157,6 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) i++; /* fallthrough */ default: - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (filp->f_pos == 2) list_move(q, &dentry->d_subdirs); @@ -175,13 +172,11 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) spin_unlock(&next->d_lock); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); if (filldir(dirent, next->d_name.name, next->d_name.len, filp->f_pos, next->d_inode->i_ino, dt_type(next->d_inode)) < 0) return 0; - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); /* next is still alive */ @@ -191,7 +186,6 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) filp->f_pos++; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } return 0; } @@ -285,7 +279,6 @@ int simple_empty(struct dentry *dentry) struct dentry *child; int ret = 0; - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); @@ -298,7 +291,6 @@ int simple_empty(struct dentry *dentry) ret = 1; out: spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return ret; } diff --git a/fs/namei.c b/fs/namei.c index cbfa5fb31072..5642bc2be418 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -612,8 +612,8 @@ int follow_up(struct path *path) return 1; } -/* no need for dcache_lock, as serialization is taken care in - * namespace.c +/* + * serialization is taken care of in namespace.c */ static int __follow_mount(struct path *path) { @@ -645,9 +645,6 @@ static void follow_mount(struct path *path) } } -/* no need for dcache_lock, as serialization is taken care in - * namespace.c - */ int follow_down(struct path *path) { struct vfsmount *mounted; @@ -2131,12 +2128,10 @@ void dentry_unhash(struct dentry *dentry) { dget(dentry); shrink_dcache_parent(dentry); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (dentry->d_count == 2) __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } int vfs_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 102278ed38bd..de15c533311c 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -391,7 +391,6 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) } /* If a pointer is invalid, we search the dentry. */ - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { @@ -402,13 +401,11 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) else dent = NULL; spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); goto out; } next = next->next; } spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); return NULL; out: diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index c4b718ff9a6b..1220df75ff22 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -193,7 +193,6 @@ ncp_renew_dentries(struct dentry *parent) struct list_head *next; struct dentry *dentry; - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { @@ -207,7 +206,6 @@ ncp_renew_dentries(struct dentry *parent) next = next->next; } spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); } static inline void @@ -217,7 +215,6 @@ ncp_invalidate_dircache_entries(struct dentry *parent) struct list_head *next; struct dentry *dentry; - spin_lock(&dcache_lock); spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { @@ -227,7 +224,6 @@ ncp_invalidate_dircache_entries(struct dentry *parent) next = next->next; } spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); } struct ncp_cache_head { diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 12de824edb5c..eb77471b8823 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1718,11 +1718,9 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (dentry->d_count > 1) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); /* Start asynchronous writeout of the inode */ write_inode_now(dentry->d_inode, 0); error = nfs_sillyrename(dir, dentry); @@ -1733,7 +1731,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) need_rehash = 1; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); error = nfs_safe_remove(dentry); if (!error || error == -ENOENT) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 850f67d5f0ac..b3e36c3430de 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -63,13 +63,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i * This again causes shrink_dcache_for_umount_subtree() to * Oops, since the test for IS_ROOT() will fail. */ - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); spin_lock(&sb->s_root->d_lock); list_del_init(&sb->s_root->d_alias); spin_unlock(&sb->s_root->d_lock); spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } return 0; } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 78c0ebb0b07c..74aaf3963c10 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -60,7 +60,6 @@ rename_retry: seq = read_seqbegin(&rename_lock); rcu_read_lock(); - spin_lock(&dcache_lock); while (!IS_ROOT(dentry) && dentry != droot) { namelen = dentry->d_name.len; buflen -= namelen + 1; @@ -71,7 +70,6 @@ rename_retry: *--end = '/'; dentry = dentry->d_parent; } - spin_unlock(&dcache_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; @@ -91,7 +89,6 @@ rename_retry: memcpy(end, base, namelen); return end; Elong_unlock: - spin_unlock(&dcache_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ae769fc9b66c..9be6ec1f36d8 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -59,7 +59,6 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) /* determine if the children should tell inode about their events */ watched = fsnotify_inode_watches_children(inode); - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ @@ -84,7 +83,6 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) spin_unlock(&alias->d_lock); } spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); } /* Notify this dentry's parent about a child's events. */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index c31b5c647ac7..b7de749bdd12 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -169,7 +169,6 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, struct list_head *p; struct dentry *dentry = NULL; - spin_lock(&dcache_lock); spin_lock(&dcache_inode_lock); list_for_each(p, &inode->i_dentry) { dentry = list_entry(p, struct dentry, d_alias); @@ -189,7 +188,6 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, } spin_unlock(&dcache_inode_lock); - spin_unlock(&dcache_lock); return dentry; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c963ebada922..a2ceb94b0e38 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -183,7 +183,6 @@ struct dentry_operations { #define DCACHE_GENOCIDE 0x0200 extern spinlock_t dcache_inode_lock; -extern spinlock_t dcache_lock; extern seqlock_t rename_lock; static inline int dname_external(struct dentry *dentry) @@ -296,8 +295,8 @@ extern char *dentry_path(struct dentry *, char *, int); * destroyed when it has references. dget() should never be * called for dentries with zero reference counter. For these cases * (preferably none, functions in dcache.c are sufficient for normal - * needs and they take necessary precautions) you should hold dcache_lock - * and call dget_locked() instead of dget(). + * needs and they take necessary precautions) you should hold d_lock + * and call dget_dlock() instead of dget(). */ static inline struct dentry *dget_dlock(struct dentry *dentry) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 090f0eacde29..296cf2fde945 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1378,7 +1378,7 @@ struct super_block { #else struct list_head s_files; #endif - /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ + /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */ struct list_head s_dentry_lru; /* unused dentry lru */ int s_nr_dentry_unused; /* # of dentry on lru */ @@ -2446,6 +2446,10 @@ static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; + /* + * Don't strictly need d_lock here? If the parent ino could change + * then surely we'd have a deeper race in the caller? + */ spin_lock(&dentry->d_lock); res = dentry->d_parent->d_inode->i_ino; spin_unlock(&dentry->d_lock); diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index b10bcdeaef76..2a53f10712b3 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -17,7 +17,6 @@ /* * fsnotify_d_instantiate - instantiate a dentry for inode - * Called with dcache_lock held. */ static inline void fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) @@ -62,7 +61,6 @@ static inline int fsnotify_perm(struct file *file, int mask) /* * fsnotify_d_move - dentry has been moved - * Called with dcache_lock and dentry->d_lock held. */ static inline void fsnotify_d_move(struct dentry *dentry) { diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7380763595d3..69ad89b50489 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -329,9 +329,15 @@ static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) { struct dentry *parent; - assert_spin_locked(&dcache_lock); assert_spin_locked(&dentry->d_lock); + /* + * Serialisation of setting PARENT_WATCHED on the dentries is provided + * by d_lock. If inotify_inode_watched changes after we have taken + * d_lock, the following __fsnotify_update_child_dentry_flags call will + * find our entry, so it will spin until we complete here, and update + * us with the new state. + */ parent = dentry->d_parent; if (parent->d_inode && fsnotify_inode_watches_children(parent->d_inode)) dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; @@ -341,15 +347,12 @@ static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) /* * fsnotify_d_instantiate - instantiate a dentry for inode - * Called with dcache_lock held. */ static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) { if (!inode) return; - assert_spin_locked(&dcache_lock); - spin_lock(&dentry->d_lock); __fsnotify_update_dcache_flags(dentry); spin_unlock(&dentry->d_lock); diff --git a/include/linux/namei.h b/include/linux/namei.h index 05b441d93642..aec730b53935 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -41,7 +41,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; * - require a directory * - ending slashes ok even for nonexistent files * - internal "there are more path components" flag - * - locked when lookup done with dcache_lock held * - dentry cache is untrusted; force a real lookup */ #define LOOKUP_FOLLOW 1 diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b4705b51d4a..1864cb6a6a59 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -876,7 +876,6 @@ static void cgroup_clear_directory(struct dentry *dentry) struct list_head *node; BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); node = dentry->d_subdirs.next; while (node != &dentry->d_subdirs) { @@ -891,18 +890,15 @@ static void cgroup_clear_directory(struct dentry *dentry) dget_locked_dlock(d); spin_unlock(&d->d_lock); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); d_delete(d); simple_unlink(dentry->d_inode, d); dput(d); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); } else spin_unlock(&d->d_lock); node = dentry->d_subdirs.next; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } /* @@ -914,14 +910,12 @@ static void cgroup_d_remove_dir(struct dentry *dentry) cgroup_clear_directory(dentry); - spin_lock(&dcache_lock); parent = dentry->d_parent; spin_lock(&parent->d_lock); spin_lock(&dentry->d_lock); list_del_init(&dentry->d_u.d_child); spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); remove_dir(dentry); } diff --git a/mm/filemap.c b/mm/filemap.c index 6b9aee20f242..ca389394fa2a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -102,9 +102,6 @@ * ->inode_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * - * ->task->proc_lock - * ->dcache_lock (proc_pid_lookup) - * * (code doesn't rely on that order, so you could switch it around) * ->tasklist_lock (memory_failure, collect_procs_ao) * ->i_mmap_lock diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 017ec096446e..2285d693f296 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -1145,7 +1145,6 @@ static void sel_remove_entries(struct dentry *de) { struct list_head *node; - spin_lock(&dcache_lock); spin_lock(&de->d_lock); node = de->d_subdirs.next; while (node != &de->d_subdirs) { @@ -1158,11 +1157,9 @@ static void sel_remove_entries(struct dentry *de) dget_locked_dlock(d); spin_unlock(&de->d_lock); spin_unlock(&d->d_lock); - spin_unlock(&dcache_lock); d_delete(d); simple_unlink(de->d_inode, d); dput(d); - spin_lock(&dcache_lock); spin_lock(&de->d_lock); } else spin_unlock(&d->d_lock); @@ -1170,7 +1167,6 @@ static void sel_remove_entries(struct dentry *de) } spin_unlock(&de->d_lock); - spin_unlock(&dcache_lock); } #define BOOL_DIR_NAME "booleans" -- cgit v1.2.3 From fa0d7e3de6d6fc5004ad9dea0dd6b286af8f03e9 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:49 +1100 Subject: fs: icache RCU free inodes RCU free the struct inode. This will allow: - Subsequent store-free path walking patch. The inode must be consulted for permissions when walking, so an RCU inode reference is a must. - sb_inode_list_lock to be moved inside i_lock because sb list walkers who want to take i_lock no longer need to take sb_inode_list_lock to walk the list in the first place. This will simplify and optimize locking. - Could remove some nested trylock loops in dcache code - Could potentially simplify things a bit in VM land. Do not need to take the page lock to follow page->mapping. The downsides of this is the performance cost of using RCU. In a simple creat/unlink microbenchmark, performance drops by about 10% due to inability to reuse cache-hot slab objects. As iterations increase and RCU freeing starts kicking over, this increases to about 20%. In cases where inode lifetimes are longer (ie. many inodes may be allocated during the average life span of a single inode), a lot of this cache reuse is not applicable, so the regression caused by this patch is smaller. The cache-hot regression could largely be avoided by using SLAB_DESTROY_BY_RCU, however this adds some complexity to list walking and store-free path walking, so I prefer to implement this at a later date, if it is shown to be a win in real situations. I haven't found a regression in any non-micro benchmark so I doubt it will be a problem. Signed-off-by: Nick Piggin --- Documentation/filesystems/porting | 14 ++++++++++++++ arch/powerpc/platforms/cell/spufs/inode.c | 10 ++++++++-- drivers/staging/pohmelfs/inode.c | 9 ++++++++- drivers/staging/smbfs/inode.c | 9 ++++++++- fs/9p/vfs_inode.c | 9 ++++++++- fs/adfs/super.c | 9 ++++++++- fs/affs/super.c | 9 ++++++++- fs/afs/super.c | 10 +++++++++- fs/befs/linuxvfs.c | 10 ++++++++-- fs/bfs/inode.c | 9 ++++++++- fs/block_dev.c | 9 ++++++++- fs/btrfs/inode.c | 9 ++++++++- fs/ceph/inode.c | 11 ++++++++++- fs/cifs/cifsfs.c | 9 ++++++++- fs/coda/inode.c | 9 ++++++++- fs/ecryptfs/super.c | 12 +++++++++++- fs/efs/super.c | 9 ++++++++- fs/exofs/super.c | 9 ++++++++- fs/ext2/super.c | 9 ++++++++- fs/ext3/super.c | 9 ++++++++- fs/ext4/super.c | 9 ++++++++- fs/fat/inode.c | 9 ++++++++- fs/freevxfs/vxfs_inode.c | 9 ++++++++- fs/fuse/inode.c | 9 ++++++++- fs/gfs2/super.c | 9 ++++++++- fs/hfs/super.c | 9 ++++++++- fs/hfsplus/super.c | 10 +++++++++- fs/hostfs/hostfs_kern.c | 9 ++++++++- fs/hpfs/super.c | 9 ++++++++- fs/hppfs/hppfs.c | 9 ++++++++- fs/hugetlbfs/inode.c | 9 ++++++++- fs/inode.c | 10 +++++++++- fs/isofs/inode.c | 9 ++++++++- fs/jffs2/super.c | 9 ++++++++- fs/jfs/super.c | 10 +++++++++- fs/logfs/inode.c | 9 ++++++++- fs/minix/inode.c | 9 ++++++++- fs/ncpfs/inode.c | 9 ++++++++- fs/nfs/inode.c | 9 ++++++++- fs/nilfs2/super.c | 10 +++++++++- fs/ntfs/inode.c | 9 ++++++++- fs/ocfs2/dlmfs/dlmfs.c | 9 ++++++++- fs/ocfs2/super.c | 9 ++++++++- fs/openpromfs/inode.c | 9 ++++++++- fs/proc/inode.c | 9 ++++++++- fs/qnx4/inode.c | 9 ++++++++- fs/reiserfs/super.c | 9 ++++++++- fs/romfs/super.c | 9 ++++++++- fs/squashfs/super.c | 9 ++++++++- fs/sysv/inode.c | 9 ++++++++- fs/ubifs/super.c | 10 +++++++++- fs/udf/super.c | 9 ++++++++- fs/ufs/super.c | 9 ++++++++- fs/xfs/xfs_iget.c | 13 ++++++++++++- include/linux/fs.h | 5 ++++- include/linux/net.h | 1 - ipc/mqueue.c | 9 ++++++++- mm/shmem.c | 9 ++++++++- net/socket.c | 16 ++++++++-------- net/sunrpc/rpc_pipe.c | 10 +++++++++- 60 files changed, 490 insertions(+), 68 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 1eb76959d096..ccf0ce7866b9 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -346,3 +346,17 @@ look at examples of other filesystems) for guidance. for details of what locks to replace dcache_lock with in order to protect particular things. Most of the time, a filesystem only needs ->d_lock, which protects *all* the dcache state of a given dentry. + +-- +[mandatory] + + Filesystems must RCU-free their inodes, if they can have been accessed +via rcu-walk path walk (basically, if the file can have had a path name in the +vfs namespace). + + i_dentry and i_rcu share storage in a union, and the vfs expects +i_dentry to be reinitialized before it is freed, so an: + + INIT_LIST_HEAD(&inode->i_dentry); + +must be done in the RCU callback. diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 03185de7cd4a..856e9c398068 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -71,12 +71,18 @@ spufs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void -spufs_destroy_inode(struct inode *inode) +static void spufs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(spufs_inode_cache, SPUFS_I(inode)); } +static void spufs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, spufs_i_callback); +} + static void spufs_init_once(void *p) { diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index 61685ccceda8..cc8d2840f9b6 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -826,6 +826,13 @@ const struct address_space_operations pohmelfs_aops = { .set_page_dirty = __set_page_dirty_nobuffers, }; +static void pohmelfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(pohmelfs_inode_cache, POHMELFS_I(inode)); +} + /* * ->detroy_inode() callback. Deletes inode from the caches * and frees private data. @@ -842,8 +849,8 @@ static void pohmelfs_destroy_inode(struct inode *inode) dprintk("%s: pi: %p, inode: %p, ino: %llu.\n", __func__, pi, &pi->vfs_inode, pi->ino); - kmem_cache_free(pohmelfs_inode_cache, pi); atomic_long_dec(&psb->total_inodes); + call_rcu(&inode->i_rcu, pohmelfs_i_callback); } /* diff --git a/drivers/staging/smbfs/inode.c b/drivers/staging/smbfs/inode.c index 540a984bb516..244319dc9702 100644 --- a/drivers/staging/smbfs/inode.c +++ b/drivers/staging/smbfs/inode.c @@ -62,11 +62,18 @@ static struct inode *smb_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void smb_destroy_inode(struct inode *inode) +static void smb_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(smb_inode_cachep, SMB_I(inode)); } +static void smb_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, smb_i_callback); +} + static void init_once(void *foo) { struct smb_inode_info *ei = (struct smb_inode_info *) foo; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 1073bca8488c..f6f9081e6d2c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -237,10 +237,17 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) * */ -void v9fs_destroy_inode(struct inode *inode) +static void v9fs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); } + +void v9fs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, v9fs_i_callback); +} #endif /** diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 959dbff2d42d..47dffc513a26 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -240,11 +240,18 @@ static struct inode *adfs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void adfs_destroy_inode(struct inode *inode) +static void adfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); } +static void adfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, adfs_i_callback); +} + static void init_once(void *foo) { struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; diff --git a/fs/affs/super.c b/fs/affs/super.c index 0cf7f4384cbd..4c18fcfb0a19 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -95,11 +95,18 @@ static struct inode *affs_alloc_inode(struct super_block *sb) return &i->vfs_inode; } -static void affs_destroy_inode(struct inode *inode) +static void affs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); } +static void affs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, affs_i_callback); +} + static void init_once(void *foo) { struct affs_inode_info *ei = (struct affs_inode_info *) foo; diff --git a/fs/afs/super.c b/fs/afs/super.c index 27201cffece4..f901a9d7c111 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -498,6 +498,14 @@ static struct inode *afs_alloc_inode(struct super_block *sb) return &vnode->vfs_inode; } +static void afs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct afs_vnode *vnode = AFS_FS_I(inode); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(afs_inode_cachep, vnode); +} + /* * destroy an AFS inode struct */ @@ -511,7 +519,7 @@ static void afs_destroy_inode(struct inode *inode) ASSERTCMP(vnode->server, ==, NULL); - kmem_cache_free(afs_inode_cachep, vnode); + call_rcu(&inode->i_rcu, afs_i_callback); atomic_dec(&afs_count_active_inodes); } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index aa4e7c7ae3c6..de93581b79a2 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -284,12 +284,18 @@ befs_alloc_inode(struct super_block *sb) return &bi->vfs_inode; } -static void -befs_destroy_inode(struct inode *inode) +static void befs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); } +static void befs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, befs_i_callback); +} + static void init_once(void *foo) { struct befs_inode_info *bi = (struct befs_inode_info *) foo; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 76db6d7d49bb..a8e37f81d097 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -248,11 +248,18 @@ static struct inode *bfs_alloc_inode(struct super_block *sb) return &bi->vfs_inode; } -static void bfs_destroy_inode(struct inode *inode) +static void bfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); } +static void bfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bfs_i_callback); +} + static void init_once(void *foo) { struct bfs_inode_info *bi = foo; diff --git a/fs/block_dev.c b/fs/block_dev.c index 4230252fd689..771f23527010 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -409,13 +409,20 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void bdev_destroy_inode(struct inode *inode) +static void bdev_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); struct bdev_inode *bdi = BDEV_I(inode); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bdev_cachep, bdi); } +static void bdev_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bdev_i_callback); +} + static void init_once(void *foo) { struct bdev_inode *ei = (struct bdev_inode *) foo; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7ce9f8932789..f9d2994a42a2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6495,6 +6495,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) return inode; } +static void btrfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} + void btrfs_destroy_inode(struct inode *inode) { struct btrfs_ordered_extent *ordered; @@ -6564,7 +6571,7 @@ void btrfs_destroy_inode(struct inode *inode) inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: - kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + call_rcu(&inode->i_rcu, btrfs_i_callback); } int btrfs_drop_inode(struct inode *inode) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2a48cafc174b..47f8c8baf3b5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -368,6 +368,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb) return &ci->vfs_inode; } +static void ceph_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ceph_inode_info *ci = ceph_inode(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ceph_inode_cachep, ci); +} + void ceph_destroy_inode(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -407,7 +416,7 @@ void ceph_destroy_inode(struct inode *inode) if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); - kmem_cache_free(ceph_inode_cachep, ci); + call_rcu(&inode->i_rcu, ceph_i_callback); } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 3936aa7f2c22..223717dcc401 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -334,10 +334,17 @@ cifs_alloc_inode(struct super_block *sb) return &cifs_inode->vfs_inode; } +static void cifs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); +} + static void cifs_destroy_inode(struct inode *inode) { - kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); + call_rcu(&inode->i_rcu, cifs_i_callback); } static void diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 5ea57c8c7f97..50dc7d189f56 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -56,11 +56,18 @@ static struct inode *coda_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void coda_destroy_inode(struct inode *inode) +static void coda_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(coda_inode_cachep, ITOC(inode)); } +static void coda_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, coda_i_callback); +} + static void init_once(void *foo) { struct coda_inode_info *ei = (struct coda_inode_info *) foo; diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 2720178b7718..3042fe123a34 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -62,6 +62,16 @@ out: return inode; } +static void ecryptfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ecryptfs_inode_info *inode_info; + inode_info = ecryptfs_inode_to_private(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ecryptfs_inode_info_cache, inode_info); +} + /** * ecryptfs_destroy_inode * @inode: The ecryptfs inode @@ -88,7 +98,7 @@ static void ecryptfs_destroy_inode(struct inode *inode) } } ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); - kmem_cache_free(ecryptfs_inode_info_cache, inode_info); + call_rcu(&inode->i_rcu, ecryptfs_i_callback); } /** diff --git a/fs/efs/super.c b/fs/efs/super.c index 5073a07652cc..0f31acb0131c 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -65,11 +65,18 @@ static struct inode *efs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void efs_destroy_inode(struct inode *inode) +static void efs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); } +static void efs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, efs_i_callback); +} + static void init_once(void *foo) { struct efs_inode_info *ei = (struct efs_inode_info *) foo; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 79c3ae6e0456..8c6c4669b381 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -150,12 +150,19 @@ static struct inode *exofs_alloc_inode(struct super_block *sb) return &oi->vfs_inode; } +static void exofs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); +} + /* * Remove an inode from the cache */ static void exofs_destroy_inode(struct inode *inode) { - kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); + call_rcu(&inode->i_rcu, exofs_i_callback); } /* diff --git a/fs/ext2/super.c b/fs/ext2/super.c index d89e0b6a2d78..e0c6380ff992 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -161,11 +161,18 @@ static struct inode *ext2_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void ext2_destroy_inode(struct inode *inode) +static void ext2_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); } +static void ext2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ext2_i_callback); +} + static void init_once(void *foo) { struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index acf8695fa8f0..77ce1616f725 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -479,6 +479,13 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } +static void ext3_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); +} + static void ext3_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT3_I(inode)->i_orphan))) { @@ -489,7 +496,7 @@ static void ext3_destroy_inode(struct inode *inode) false); dump_stack(); } - kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); + call_rcu(&inode->i_rcu, ext3_i_callback); } static void init_once(void *foo) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fb15c9c0be74..cd37f9d5e447 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -841,6 +841,13 @@ static int ext4_drop_inode(struct inode *inode) return drop; } +static void ext4_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); +} + static void ext4_destroy_inode(struct inode *inode) { ext4_ioend_wait(inode); @@ -853,7 +860,7 @@ static void ext4_destroy_inode(struct inode *inode) true); dump_stack(); } - kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); + call_rcu(&inode->i_rcu, ext4_i_callback); } static void init_once(void *foo) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index ad6998a92c30..8cccfebee180 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -514,11 +514,18 @@ static struct inode *fat_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void fat_destroy_inode(struct inode *inode) +static void fat_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); } +static void fat_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, fat_i_callback); +} + static void init_once(void *foo) { struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 8c04eac5079d..2ba6719ac612 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -337,6 +337,13 @@ vxfs_iget(struct super_block *sbp, ino_t ino) return ip; } +static void vxfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(vxfs_inode_cachep, inode->i_private); +} + /** * vxfs_evict_inode - remove inode from main memory * @ip: inode to discard. @@ -350,5 +357,5 @@ vxfs_evict_inode(struct inode *ip) { truncate_inode_pages(&ip->i_data, 0); end_writeback(ip); - kmem_cache_free(vxfs_inode_cachep, ip->i_private); + call_rcu(&ip->i_rcu, vxfs_i_callback); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cfce3ad86a92..44e0a6c57e87 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -99,6 +99,13 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) return inode; } +static void fuse_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(fuse_inode_cachep, inode); +} + static void fuse_destroy_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -106,7 +113,7 @@ static void fuse_destroy_inode(struct inode *inode) BUG_ON(!list_empty(&fi->queued_writes)); if (fi->forget_req) fuse_request_free(fi->forget_req); - kmem_cache_free(fuse_inode_cachep, inode); + call_rcu(&inode->i_rcu, fuse_i_callback); } void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 2b2c4997430b..16c2ecac7eb7 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1405,11 +1405,18 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) return &ip->i_inode; } -static void gfs2_destroy_inode(struct inode *inode) +static void gfs2_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(gfs2_inode_cachep, inode); } +static void gfs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, gfs2_i_callback); +} + const struct super_operations gfs2_super_ops = { .alloc_inode = gfs2_alloc_inode, .destroy_inode = gfs2_destroy_inode, diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 4824c27cebb8..ef4ee5716abe 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -167,11 +167,18 @@ static struct inode *hfs_alloc_inode(struct super_block *sb) return i ? &i->vfs_inode : NULL; } -static void hfs_destroy_inode(struct inode *inode) +static void hfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); } +static void hfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hfs_i_callback); +} + static const struct super_operations hfs_super_operations = { .alloc_inode = hfs_alloc_inode, .destroy_inode = hfs_destroy_inode, diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 52cc746d3ba3..182e83a9079e 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -488,11 +488,19 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb) return i ? &i->vfs_inode : NULL; } -static void hfsplus_destroy_inode(struct inode *inode) +static void hfsplus_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); } +static void hfsplus_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hfsplus_i_callback); +} + #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) static struct dentry *hfsplus_mount(struct file_system_type *fs_type, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 39dc505ed273..861113fcfc88 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -247,11 +247,18 @@ static void hostfs_evict_inode(struct inode *inode) } } -static void hostfs_destroy_inode(struct inode *inode) +static void hostfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kfree(HOSTFS_I(inode)); } +static void hostfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hostfs_i_callback); +} + static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) { const char *root_path = vfs->mnt_sb->s_fs_info; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 6c5f01597c3a..49935ba78db8 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -177,11 +177,18 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void hpfs_destroy_inode(struct inode *inode) +static void hpfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); } +static void hpfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hpfs_i_callback); +} + static void init_once(void *foo) { struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index f702b5f713fc..87ed48e0343d 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -632,11 +632,18 @@ void hppfs_evict_inode(struct inode *ino) mntput(ino->i_sb->s_fs_info); } -static void hppfs_destroy_inode(struct inode *inode) +static void hppfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kfree(HPPFS_I(inode)); } +static void hppfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hppfs_i_callback); +} + static const struct super_operations hppfs_sbops = { .alloc_inode = hppfs_alloc_inode, .destroy_inode = hppfs_destroy_inode, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a5fe68189eed..9885082b470f 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -663,11 +663,18 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) return &p->vfs_inode; } +static void hugetlbfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); +} + static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); - kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); + call_rcu(&inode->i_rcu, hugetlbfs_i_callback); } static const struct address_space_operations hugetlbfs_aops = { diff --git a/fs/inode.c b/fs/inode.c index 5a0a898f55d1..6751dfe8cc06 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -272,6 +272,13 @@ void __destroy_inode(struct inode *inode) } EXPORT_SYMBOL(__destroy_inode); +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(inode_cachep, inode); +} + static void destroy_inode(struct inode *inode) { BUG_ON(!list_empty(&inode->i_lru)); @@ -279,7 +286,7 @@ static void destroy_inode(struct inode *inode) if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); else - kmem_cache_free(inode_cachep, (inode)); + call_rcu(&inode->i_rcu, i_callback); } /* @@ -432,6 +439,7 @@ void end_writeback(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); inode_sync_wait(inode); + /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } EXPORT_SYMBOL(end_writeback); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index d204ee4235fd..d8f3a652243d 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -81,11 +81,18 @@ static struct inode *isofs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void isofs_destroy_inode(struct inode *inode) +static void isofs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } +static void isofs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, isofs_i_callback); +} + static void init_once(void *foo) { struct iso_inode_info *ei = foo; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index c86041b866a4..853b8e300084 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -40,11 +40,18 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb) return &f->vfs_inode; } -static void jffs2_destroy_inode(struct inode *inode) +static void jffs2_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); } +static void jffs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, jffs2_i_callback); +} + static void jffs2_i_init_once(void *foo) { struct jffs2_inode_info *f = foo; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 0669fc1cc3bf..b715b0f7bdfd 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -115,6 +115,14 @@ static struct inode *jfs_alloc_inode(struct super_block *sb) return &jfs_inode->vfs_inode; } +static void jfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct jfs_inode_info *ji = JFS_IP(inode); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(jfs_inode_cachep, ji); +} + static void jfs_destroy_inode(struct inode *inode) { struct jfs_inode_info *ji = JFS_IP(inode); @@ -128,7 +136,7 @@ static void jfs_destroy_inode(struct inode *inode) ji->active_ag = -1; } spin_unlock_irq(&ji->ag_lock); - kmem_cache_free(jfs_inode_cachep, ji); + call_rcu(&inode->i_rcu, jfs_i_callback); } static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index d8c71ece098f..03b8c240aeda 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -141,13 +141,20 @@ struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached) return __logfs_iget(sb, ino); } +static void logfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); +} + static void __logfs_destroy_inode(struct inode *inode) { struct logfs_inode *li = logfs_inode(inode); BUG_ON(li->li_block); list_del(&li->li_freeing_list); - kmem_cache_free(logfs_inode_cache, li); + call_rcu(&inode->i_rcu, logfs_i_callback); } static void logfs_destroy_inode(struct inode *inode) diff --git a/fs/minix/inode.c b/fs/minix/inode.c index fb2020858a34..ae0b83f476a6 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -68,11 +68,18 @@ static struct inode *minix_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void minix_destroy_inode(struct inode *inode) +static void minix_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(minix_inode_cachep, minix_i(inode)); } +static void minix_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, minix_i_callback); +} + static void init_once(void *foo) { struct minix_inode_info *ei = (struct minix_inode_info *) foo; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 8fb93b604e73..60047dbeb38d 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -58,11 +58,18 @@ static struct inode *ncp_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void ncp_destroy_inode(struct inode *inode) +static void ncp_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); } +static void ncp_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ncp_i_callback); +} + static void init_once(void *foo) { struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e67e31c73416..017daa3bed38 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1438,11 +1438,18 @@ struct inode *nfs_alloc_inode(struct super_block *sb) return &nfsi->vfs_inode; } -void nfs_destroy_inode(struct inode *inode) +static void nfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } +void nfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nfs_i_callback); +} + static inline void nfs4_init_once(struct nfs_inode *nfsi) { #ifdef CONFIG_NFS_V4 diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index d36fc7ee615f..e2dcc9c733f7 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -162,10 +162,13 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) return &ii->vfs_inode; } -void nilfs_destroy_inode(struct inode *inode) +static void nilfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + INIT_LIST_HEAD(&inode->i_dentry); + if (mdi) { kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ kfree(mdi); @@ -173,6 +176,11 @@ void nilfs_destroy_inode(struct inode *inode) kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); } +void nilfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nilfs_i_callback); +} + static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) { struct the_nilfs *nilfs = sbi->s_nilfs; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 93622b175fc7..a627ed82c0a3 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -332,6 +332,13 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb) return NULL; } +static void ntfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); +} + void ntfs_destroy_big_inode(struct inode *inode) { ntfs_inode *ni = NTFS_I(inode); @@ -340,7 +347,7 @@ void ntfs_destroy_big_inode(struct inode *inode) BUG_ON(ni->page); if (!atomic_dec_and_test(&ni->count)) BUG(); - kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); + call_rcu(&inode->i_rcu, ntfs_i_callback); } static inline ntfs_inode *ntfs_alloc_extent_inode(void) diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b2df490a19ed..8c5c0eddc365 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -351,11 +351,18 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb) return &ip->ip_vfs_inode; } -static void dlmfs_destroy_inode(struct inode *inode) +static void dlmfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); } +static void dlmfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, dlmfs_i_callback); +} + static void dlmfs_evict_inode(struct inode *inode) { int status; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index cfeab7ce3697..17ff46fa8a10 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -569,11 +569,18 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) return &oi->vfs_inode; } -static void ocfs2_destroy_inode(struct inode *inode) +static void ocfs2_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); } +static void ocfs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ocfs2_i_callback); +} + static unsigned long long ocfs2_max_file_offset(unsigned int bbits, unsigned int cbits) { diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 911e61f348fc..a2a5bff774e3 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -343,11 +343,18 @@ static struct inode *openprom_alloc_inode(struct super_block *sb) return &oi->vfs_inode; } -static void openprom_destroy_inode(struct inode *inode) +static void openprom_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(op_inode_cachep, OP_I(inode)); } +static void openprom_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, openprom_i_callback); +} + static struct inode *openprom_iget(struct super_block *sb, ino_t ino) { struct inode *inode; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 3ddb6068177c..6bcb926b101b 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -65,11 +65,18 @@ static struct inode *proc_alloc_inode(struct super_block *sb) return inode; } -static void proc_destroy_inode(struct inode *inode) +static void proc_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } +static void proc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, proc_i_callback); +} + static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index fcada42f1aa3..e63b4171d583 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -425,11 +425,18 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void qnx4_destroy_inode(struct inode *inode) +static void qnx4_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); } +static void qnx4_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, qnx4_i_callback); +} + static void init_once(void *foo) { struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b243117b8752..2575682a9ead 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -529,11 +529,18 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void reiserfs_destroy_inode(struct inode *inode) +static void reiserfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); } +static void reiserfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, reiserfs_i_callback); +} + static void init_once(void *foo) { struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 6647f90e55cd..2305e3121cb1 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -400,11 +400,18 @@ static struct inode *romfs_alloc_inode(struct super_block *sb) /* * return a spent inode to the slab cache */ -static void romfs_destroy_inode(struct inode *inode) +static void romfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } +static void romfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, romfs_i_callback); +} + /* * get filesystem statistics */ diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 24de30ba34c1..20700b9f2b4c 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -440,11 +440,18 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb) } -static void squashfs_destroy_inode(struct inode *inode) +static void squashfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); } +static void squashfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, squashfs_i_callback); +} + static struct file_system_type squashfs_fs_type = { .owner = THIS_MODULE, diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index de44d067b9e6..0630eb969a28 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -333,11 +333,18 @@ static struct inode *sysv_alloc_inode(struct super_block *sb) return &si->vfs_inode; } -static void sysv_destroy_inode(struct inode *inode) +static void sysv_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); } +static void sysv_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, sysv_i_callback); +} + static void init_once(void *p) { struct sysv_inode_info *si = (struct sysv_inode_info *)p; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 91fac54c70e3..6e11c2975dcf 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -272,12 +272,20 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb) return &ui->vfs_inode; }; +static void ubifs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ubifs_inode *ui = ubifs_inode(inode); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ubifs_inode_slab, ui); +} + static void ubifs_destroy_inode(struct inode *inode) { struct ubifs_inode *ui = ubifs_inode(inode); kfree(ui->data); - kmem_cache_free(ubifs_inode_slab, inode); + call_rcu(&inode->i_rcu, ubifs_i_callback); } /* diff --git a/fs/udf/super.c b/fs/udf/super.c index 4a5c7c61836a..b539d53320fb 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -139,11 +139,18 @@ static struct inode *udf_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void udf_destroy_inode(struct inode *inode) +static void udf_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(udf_inode_cachep, UDF_I(inode)); } +static void udf_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, udf_i_callback); +} + static void init_once(void *foo) { struct udf_inode_info *ei = (struct udf_inode_info *)foo; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 2c47daed56da..2c61ac5d4e48 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1412,11 +1412,18 @@ static struct inode *ufs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void ufs_destroy_inode(struct inode *inode) +static void ufs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); } +static void ufs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ufs_i_callback); +} + static void init_once(void *foo) { struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 0cdd26932d8e..d7de5a3f7867 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -91,6 +91,17 @@ xfs_inode_alloc( return ip; } +STATIC void +xfs_inode_free_callback( + struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct xfs_inode *ip = XFS_I(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_zone_free(xfs_inode_zone, ip); +} + void xfs_inode_free( struct xfs_inode *ip) @@ -134,7 +145,7 @@ xfs_inode_free( ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); - kmem_zone_free(xfs_inode_zone, ip); + call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback); } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 296cf2fde945..1ff4d0a33b25 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -737,7 +737,10 @@ struct inode { struct list_head i_wb_list; /* backing dev IO list */ struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; - struct list_head i_dentry; + union { + struct list_head i_dentry; + struct rcu_head i_rcu; + }; unsigned long i_ino; atomic_t i_count; unsigned int i_nlink; diff --git a/include/linux/net.h b/include/linux/net.h index 16faa130088c..06bde4908473 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -120,7 +120,6 @@ enum sock_shutdown_cmd { struct socket_wq { wait_queue_head_t wait; struct fasync_struct *fasync_list; - struct rcu_head rcu; } ____cacheline_aligned_in_smp; /** diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 035f4399edbc..14fb6d67e6a3 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -237,11 +237,18 @@ static struct inode *mqueue_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void mqueue_destroy_inode(struct inode *inode) +static void mqueue_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode)); } +static void mqueue_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, mqueue_i_callback); +} + static void mqueue_evict_inode(struct inode *inode) { struct mqueue_inode_info *info; diff --git a/mm/shmem.c b/mm/shmem.c index 47fdeeb9d636..5ee67c990602 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2415,13 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) return &p->vfs_inode; } +static void shmem_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); +} + static void shmem_destroy_inode(struct inode *inode) { if ((inode->i_mode & S_IFMT) == S_IFREG) { /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } - kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); + call_rcu(&inode->i_rcu, shmem_i_callback); } static void init_once(void *foo) diff --git a/net/socket.c b/net/socket.c index 088fb3fd45e0..97fff3a4e72f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -262,20 +262,20 @@ static struct inode *sock_alloc_inode(struct super_block *sb) } -static void wq_free_rcu(struct rcu_head *head) +static void sock_free_rcu(struct rcu_head *head) { - struct socket_wq *wq = container_of(head, struct socket_wq, rcu); + struct inode *inode = container_of(head, struct inode, i_rcu); + struct socket_alloc *ei = container_of(inode, struct socket_alloc, + vfs_inode); - kfree(wq); + kfree(ei->socket.wq); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(sock_inode_cachep, ei); } static void sock_destroy_inode(struct inode *inode) { - struct socket_alloc *ei; - - ei = container_of(inode, struct socket_alloc, vfs_inode); - call_rcu(&ei->socket.wq->rcu, wq_free_rcu); - kmem_cache_free(sock_inode_cachep, ei); + call_rcu(&inode->i_rcu, sock_free_rcu); } static void init_once(void *foo) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index a0dc1a86fcea..2899fe27f880 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -162,11 +162,19 @@ rpc_alloc_inode(struct super_block *sb) } static void -rpc_destroy_inode(struct inode *inode) +rpc_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(rpc_inode_cachep, RPC_I(inode)); } +static void +rpc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, rpc_i_callback); +} + static int rpc_pipe_open(struct inode *inode, struct file *filp) { -- cgit v1.2.3 From 31e6b01f4183ff419a6d1f86177cbf4662347cec Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:52 +1100 Subject: fs: rcu-walk for path lookup Perform common cases of path lookups without any stores or locking in the ancestor dentry elements. This is called rcu-walk, as opposed to the current algorithm which is a refcount based walk, or ref-walk. This results in far fewer atomic operations on every path element, significantly improving path lookup performance. It also avoids cacheline bouncing on common dentries, significantly improving scalability. The overall design is like this: * LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk. * Take the RCU lock for the entire path walk, starting with the acquiring of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are not required for dentry persistence. * synchronize_rcu is called when unregistering a filesystem, so we can access d_ops and i_ops during rcu-walk. * Similarly take the vfsmount lock for the entire path walk. So now mnt refcounts are not required for persistence. Also we are free to perform mount lookups, and to assume dentry mount points and mount roots are stable up and down the path. * Have a per-dentry seqlock to protect the dentry name, parent, and inode, so we can load this tuple atomically, and also check whether any of its members have changed. * Dentry lookups (based on parent, candidate string tuple) recheck the parent sequence after the child is found in case anything changed in the parent during the path walk. * inode is also RCU protected so we can load d_inode and use the inode for limited things. * i_mode, i_uid, i_gid can be tested for exec permissions during path walk. * i_op can be loaded. When we reach the destination dentry, we lock it, recheck lookup sequence, and increment its refcount and mountpoint refcount. RCU and vfsmount locks are dropped. This is termed "dropping rcu-walk". If the dentry refcount does not match, we can not drop rcu-walk gracefully at the current point in the lokup, so instead return -ECHILD (for want of a better errno). This signals the path walking code to re-do the entire lookup with a ref-walk. Aside from the final dentry, there are other situations that may be encounted where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take a reference on the last good dentry) and continue with a ref-walk. Again, if we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup using ref-walk. But it is very important that we can continue with ref-walk for most cases, particularly to avoid the overhead of double lookups, and to gain the scalability advantages on common path elements (like cwd and root). The cases where rcu-walk cannot continue are: * NULL dentry (ie. any uncached path element) * parent with d_inode->i_op->permission or ACLs * dentries with d_revalidate * Following links In future patches, permission checks and d_revalidate become rcu-walk aware. It may be possible eventually to make following links rcu-walk aware. Uncached path elements will always require dropping to ref-walk mode, at the very least because i_mutex needs to be grabbed, and objects allocated. Signed-off-by: Nick Piggin --- Documentation/filesystems/dentry-locking.txt | 172 ------- Documentation/filesystems/path-lookup.txt | 345 +++++++++++++ fs/dcache.c | 203 +++++++- fs/filesystems.c | 3 + fs/namei.c | 743 ++++++++++++++++++++++----- fs/proc/proc_sysctl.c | 4 + include/linux/dcache.h | 32 +- include/linux/namei.h | 15 +- include/linux/security.h | 8 +- security/security.c | 9 + 10 files changed, 1194 insertions(+), 340 deletions(-) delete mode 100644 Documentation/filesystems/dentry-locking.txt create mode 100644 Documentation/filesystems/path-lookup.txt (limited to 'Documentation') diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt deleted file mode 100644 index 30b6a40f5650..000000000000 --- a/Documentation/filesystems/dentry-locking.txt +++ /dev/null @@ -1,172 +0,0 @@ -RCU-based dcache locking model -============================== - -On many workloads, the most common operation on dcache is to look up a -dentry, given a parent dentry and the name of the child. Typically, -for every open(), stat() etc., the dentry corresponding to the -pathname will be looked up by walking the tree starting with the first -component of the pathname and using that dentry along with the next -component to look up the next level and so on. Since it is a frequent -operation for workloads like multiuser environments and web servers, -it is important to optimize this path. - -Prior to 2.5.10, dcache_lock was acquired in d_lookup and thus in -every component during path look-up. Since 2.5.10 onwards, fast-walk -algorithm changed this by holding the dcache_lock at the beginning and -walking as many cached path component dentries as possible. This -significantly decreases the number of acquisition of -dcache_lock. However it also increases the lock hold time -significantly and affects performance in large SMP machines. Since -2.5.62 kernel, dcache has been using a new locking model that uses RCU -to make dcache look-up lock-free. - -The current dcache locking model is not very different from the -existing dcache locking model. Prior to 2.5.62 kernel, dcache_lock -protected the hash chain, d_child, d_alias, d_lru lists as well as -d_inode and several other things like mount look-up. RCU-based changes -affect only the way the hash chain is protected. For everything else -the dcache_lock must be taken for both traversing as well as -updating. The hash chain updates too take the dcache_lock. The -significant change is the way d_lookup traverses the hash chain, it -doesn't acquire the dcache_lock for this and rely on RCU to ensure -that the dentry has not been *freed*. - -dcache_lock no longer exists, dentry locking is explained in fs/dcache.c - -Dcache locking details -====================== - -For many multi-user workloads, open() and stat() on files are very -frequently occurring operations. Both involve walking of path names to -find the dentry corresponding to the concerned file. In 2.4 kernel, -dcache_lock was held during look-up of each path component. Contention -and cache-line bouncing of this global lock caused significant -scalability problems. With the introduction of RCU in Linux kernel, -this was worked around by making the look-up of path components during -path walking lock-free. - - -Safe lock-free look-up of dcache hash table -=========================================== - -Dcache is a complex data structure with the hash table entries also -linked together in other lists. In 2.4 kernel, dcache_lock protected -all the lists. RCU dentry hash walking works like this: - -1. The deletion from hash chain is done using hlist_del_rcu() macro - which doesn't initialize next pointer of the deleted dentry and - this allows us to walk safely lock-free while a deletion is - happening. This is a standard hlist_rcu iteration. - -2. Insertion of a dentry into the hash table is done using - hlist_add_head_rcu() which take care of ordering the writes - the - writes to the dentry must be visible before the dentry is - inserted. This works in conjunction with hlist_for_each_rcu(), - which has since been replaced by hlist_for_each_entry_rcu(), while - walking the hash chain. The only requirement is that all - initialization to the dentry must be done before - hlist_add_head_rcu() since we don't have lock protection - while traversing the hash chain. - -3. The dentry looked up without holding locks cannot be returned for - walking if it is unhashed. It then may have a NULL d_inode or other - bogosity since RCU doesn't protect the other fields in the dentry. We - therefore use a flag DCACHE_UNHASHED to indicate unhashed dentries - and use this in conjunction with a per-dentry lock (d_lock). Once - looked up without locks, we acquire the per-dentry lock (d_lock) and - check if the dentry is unhashed. If so, the look-up is failed. If not, - the reference count of the dentry is increased and the dentry is - returned. - -4. Once a dentry is looked up, it must be ensured during the path walk - for that component it doesn't go away. In pre-2.5.10 code, this was - done holding a reference to the dentry. dcache_rcu does the same. - In some sense, dcache_rcu path walking looks like the pre-2.5.10 - version. - -5. All dentry hash chain updates must take the per-dentry lock (see - fs/dcache.c). This excludes dput() to ensure that a dentry that has - been looked up concurrently does not get deleted before dget() can - take a ref. - -6. There are several ways to do reference counting of RCU protected - objects. One such example is in ipv4 route cache where deferred - freeing (using call_rcu()) is done as soon as the reference count - goes to zero. This cannot be done in the case of dentries because - tearing down of dentries require blocking (dentry_iput()) which - isn't supported from RCU callbacks. Instead, tearing down of - dentries happen synchronously in dput(), but actual freeing happens - later when RCU grace period is over. This allows safe lock-free - walking of the hash chains, but a matched dentry may have been - partially torn down. The checking of DCACHE_UNHASHED flag with - d_lock held detects such dentries and prevents them from being - returned from look-up. - - -Maintaining POSIX rename semantics -================================== - -Since look-up of dentries is lock-free, it can race against a -concurrent rename operation. For example, during rename of file A to -B, look-up of either A or B must succeed. So, if look-up of B happens -after A has been removed from the hash chain but not added to the new -hash chain, it may fail. Also, a comparison while the name is being -written concurrently by a rename may result in false positive matches -violating rename semantics. Issues related to race with rename are -handled as described below : - -1. Look-up can be done in two ways - d_lookup() which is safe from - simultaneous renames and __d_lookup() which is not. If - __d_lookup() fails, it must be followed up by a d_lookup() to - correctly determine whether a dentry is in the hash table or - not. d_lookup() protects look-ups using a sequence lock - (rename_lock). - -2. The name associated with a dentry (d_name) may be changed if a - rename is allowed to happen simultaneously. To avoid memcmp() in - __d_lookup() go out of bounds due to a rename and false positive - comparison, the name comparison is done while holding the - per-dentry lock. This prevents concurrent renames during this - operation. - -3. Hash table walking during look-up may move to a different bucket as - the current dentry is moved to a different bucket due to rename. - But we use hlists in dcache hash table and they are - null-terminated. So, even if a dentry moves to a different bucket, - hash chain walk will terminate. [with a list_head list, it may not - since termination is when the list_head in the original bucket is - reached]. Since we redo the d_parent check and compare name while - holding d_lock, lock-free look-up will not race against d_move(). - -4. There can be a theoretical race when a dentry keeps coming back to - original bucket due to double moves. Due to this look-up may - consider that it has never moved and can end up in a infinite loop. - But this is not any worse that theoretical livelocks we already - have in the kernel. - - -Important guidelines for filesystem developers related to dcache_rcu -==================================================================== - -1. Existing dcache interfaces (pre-2.5.62) exported to filesystem - don't change. Only dcache internal implementation changes. However - filesystems *must not* delete from the dentry hash chains directly - using the list macros like allowed earlier. They must use dcache - APIs like d_drop() or __d_drop() depending on the situation. - -2. d_flags is now protected by a per-dentry lock (d_lock). All access - to d_flags must be protected by it. - -3. For a hashed dentry, checking of d_count needs to be protected by - d_lock. - - -Papers and other documentation on dcache locking -================================================ - -1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). - -2. http://lse.sourceforge.net/locking/dcache/dcache.html - - - diff --git a/Documentation/filesystems/path-lookup.txt b/Documentation/filesystems/path-lookup.txt new file mode 100644 index 000000000000..09b2878724a1 --- /dev/null +++ b/Documentation/filesystems/path-lookup.txt @@ -0,0 +1,345 @@ +Path walking and name lookup locking +==================================== + +Path resolution is the finding a dentry corresponding to a path name string, by +performing a path walk. Typically, for every open(), stat() etc., the path name +will be resolved. Paths are resolved by walking the namespace tree, starting +with the first component of the pathname (eg. root or cwd) with a known dentry, +then finding the child of that dentry, which is named the next component in the +path string. Then repeating the lookup from the child dentry and finding its +child with the next element, and so on. + +Since it is a frequent operation for workloads like multiuser environments and +web servers, it is important to optimize this code. + +Path walking synchronisation history: +Prior to 2.5.10, dcache_lock was acquired in d_lookup (dcache hash lookup) and +thus in every component during path look-up. Since 2.5.10 onwards, fast-walk +algorithm changed this by holding the dcache_lock at the beginning and walking +as many cached path component dentries as possible. This significantly +decreases the number of acquisition of dcache_lock. However it also increases +the lock hold time significantly and affects performance in large SMP machines. +Since 2.5.62 kernel, dcache has been using a new locking model that uses RCU to +make dcache look-up lock-free. + +All the above algorithms required taking a lock and reference count on the +dentry that was looked up, so that may be used as the basis for walking the +next path element. This is inefficient and unscalable. It is inefficient +because of the locks and atomic operations required for every dentry element +slows things down. It is not scalable because many parallel applications that +are path-walk intensive tend to do path lookups starting from a common dentry +(usually, the root "/" or current working directory). So contention on these +common path elements causes lock and cacheline queueing. + +Since 2.6.38, RCU is used to make a significant part of the entire path walk +(including dcache look-up) completely "store-free" (so, no locks, atomics, or +even stores into cachelines of common dentries). This is known as "rcu-walk" +path walking. + +Path walking overview +===================== + +A name string specifies a start (root directory, cwd, fd-relative) and a +sequence of elements (directory entry names), which together refer to a path in +the namespace. A path is represented as a (dentry, vfsmount) tuple. The name +elements are sub-strings, seperated by '/'. + +Name lookups will want to find a particular path that a name string refers to +(usually the final element, or parent of final element). This is done by taking +the path given by the name's starting point (which we know in advance -- eg. +current->fs->cwd or current->fs->root) as the first parent of the lookup. Then +iteratively for each subsequent name element, look up the child of the current +parent with the given name and if it is not the desired entry, make it the +parent for the next lookup. + +A parent, of course, must be a directory, and we must have appropriate +permissions on the parent inode to be able to walk into it. + +Turning the child into a parent for the next lookup requires more checks and +procedures. Symlinks essentially substitute the symlink name for the target +name in the name string, and require some recursive path walking. Mount points +must be followed into (thus changing the vfsmount that subsequent path elements +refer to), switching from the mount point path to the root of the particular +mounted vfsmount. These behaviours are variously modified depending on the +exact path walking flags. + +Path walking then must, broadly, do several particular things: +- find the start point of the walk; +- perform permissions and validity checks on inodes; +- perform dcache hash name lookups on (parent, name element) tuples; +- traverse mount points; +- traverse symlinks; +- lookup and create missing parts of the path on demand. + +Safe store-free look-up of dcache hash table +============================================ + +Dcache name lookup +------------------ +In order to lookup a dcache (parent, name) tuple, we take a hash on the tuple +and use that to select a bucket in the dcache-hash table. The list of entries +in that bucket is then walked, and we do a full comparison of each entry +against our (parent, name) tuple. + +The hash lists are RCU protected, so list walking is not serialised with +concurrent updates (insertion, deletion from the hash). This is a standard RCU +list application with the exception of renames, which will be covered below. + +Parent and name members of a dentry, as well as its membership in the dcache +hash, and its inode are protected by the per-dentry d_lock spinlock. A +reference is taken on the dentry (while the fields are verified under d_lock), +and this stabilises its d_inode pointer and actual inode. This gives a stable +point to perform the next step of our path walk against. + +These members are also protected by d_seq seqlock, although this offers +read-only protection and no durability of results, so care must be taken when +using d_seq for synchronisation (see seqcount based lookups, below). + +Renames +------- +Back to the rename case. In usual RCU protected lists, the only operations that +will happen to an object is insertion, and then eventually removal from the +list. The object will not be reused until an RCU grace period is complete. +This ensures the RCU list traversal primitives can run over the object without +problems (see RCU documentation for how this works). + +However when a dentry is renamed, its hash value can change, requiring it to be +moved to a new hash list. Allocating and inserting a new alias would be +expensive and also problematic for directory dentries. Latency would be far to +high to wait for a grace period after removing the dentry and before inserting +it in the new hash bucket. So what is done is to insert the dentry into the +new list immediately. + +However, when the dentry's list pointers are updated to point to objects in the +new list before waiting for a grace period, this can result in a concurrent RCU +lookup of the old list veering off into the new (incorrect) list and missing +the remaining dentries on the list. + +There is no fundamental problem with walking down the wrong list, because the +dentry comparisons will never match. However it is fatal to miss a matching +dentry. So a seqlock is used to detect when a rename has occurred, and so the +lookup can be retried. + + 1 2 3 + +---+ +---+ +---+ +hlist-->| N-+->| N-+->| N-+-> +head <--+-P |<-+-P |<-+-P | + +---+ +---+ +---+ + +Rename of dentry 2 may require it deleted from the above list, and inserted +into a new list. Deleting 2 gives the following list. + + 1 3 + +---+ +---+ (don't worry, the longer pointers do not +hlist-->| N-+-------->| N-+-> impose a measurable performance overhead +head <--+-P |<--------+-P | on modern CPUs) + +---+ +---+ + ^ 2 ^ + | +---+ | + | | N-+----+ + +----+-P | + +---+ + +This is a standard RCU-list deletion, which leaves the deleted object's +pointers intact, so a concurrent list walker that is currently looking at +object 2 will correctly continue to object 3 when it is time to traverse the +next object. + +However, when inserting object 2 onto a new list, we end up with this: + + 1 3 + +---+ +---+ +hlist-->| N-+-------->| N-+-> +head <--+-P |<--------+-P | + +---+ +---+ + 2 + +---+ + | N-+----> + <----+-P | + +---+ + +Because we didn't wait for a grace period, there may be a concurrent lookup +still at 2. Now when it follows 2's 'next' pointer, it will walk off into +another list without ever having checked object 3. + +A related, but distinctly different, issue is that of rename atomicity versus +lookup operations. If a file is renamed from 'A' to 'B', a lookup must only +find either 'A' or 'B'. So if a lookup of 'A' returns NULL, a subsequent lookup +of 'B' must succeed (note the reverse is not true). + +Between deleting the dentry from the old hash list, and inserting it on the new +hash list, a lookup may find neither 'A' nor 'B' matching the dentry. The same +rename seqlock is also used to cover this race in much the same way, by +retrying a negative lookup result if a rename was in progress. + +Seqcount based lookups +---------------------- +In refcount based dcache lookups, d_lock is used to serialise access to +the dentry, stabilising it while comparing its name and parent and then +taking a reference count (the reference count then gives a stable place to +start the next part of the path walk from). + +As explained above, we would like to do path walking without taking locks or +reference counts on intermediate dentries along the path. To do this, a per +dentry seqlock (d_seq) is used to take a "coherent snapshot" of what the dentry +looks like (its name, parent, and inode). That snapshot is then used to start +the next part of the path walk. When loading the coherent snapshot under d_seq, +care must be taken to load the members up-front, and use those pointers rather +than reloading from the dentry later on (otherwise we'd have interesting things +like d_inode going NULL underneath us, if the name was unlinked). + +Also important is to avoid performing any destructive operations (pretty much: +no non-atomic stores to shared data), and to recheck the seqcount when we are +"done" with the operation. Retry or abort if the seqcount does not match. +Avoiding destructive or changing operations means we can easily unwind from +failure. + +What this means is that a caller, provided they are holding RCU lock to +protect the dentry object from disappearing, can perform a seqcount based +lookup which does not increment the refcount on the dentry or write to +it in any way. This returned dentry can be used for subsequent operations, +provided that d_seq is rechecked after that operation is complete. + +Inodes are also rcu freed, so the seqcount lookup dentry's inode may also be +queried for permissions. + +With this two parts of the puzzle, we can do path lookups without taking +locks or refcounts on dentry elements. + +RCU-walk path walking design +============================ + +Path walking code now has two distinct modes, ref-walk and rcu-walk. ref-walk +is the traditional[*] way of performing dcache lookups using d_lock to +serialise concurrent modifications to the dentry and take a reference count on +it. ref-walk is simple and obvious, and may sleep, take locks, etc while path +walking is operating on each dentry. rcu-walk uses seqcount based dentry +lookups, and can perform lookup of intermediate elements without any stores to +shared data in the dentry or inode. rcu-walk can not be applied to all cases, +eg. if the filesystem must sleep or perform non trivial operations, rcu-walk +must be switched to ref-walk mode. + +[*] RCU is still used for the dentry hash lookup in ref-walk, but not the full + path walk. + +Where ref-walk uses a stable, refcounted ``parent'' to walk the remaining +path string, rcu-walk uses a d_seq protected snapshot. When looking up a +child of this parent snapshot, we open d_seq critical section on the child +before closing d_seq critical section on the parent. This gives an interlocking +ladder of snapshots to walk down. + + + proc 101 + /----------------\ + / comm: "vi" \ + / fs.root: dentry0 \ + \ fs.cwd: dentry2 / + \ / + \----------------/ + +So when vi wants to open("/home/npiggin/test.c", O_RDWR), then it will +start from current->fs->root, which is a pinned dentry. Alternatively, +"./test.c" would start from cwd; both names refer to the same path in +the context of proc101. + + dentry 0 + +---------------------+ rcu-walk begins here, we note d_seq, check the + | name: "/" | inode's permission, and then look up the next + | inode: 10 | path element which is "home"... + | children:"home", ...| + +---------------------+ + | + dentry 1 V + +---------------------+ ... which brings us here. We find dentry1 via + | name: "home" | hash lookup, then note d_seq and compare name + | inode: 678 | string and parent pointer. When we have a match, + | children:"npiggin" | we now recheck the d_seq of dentry0. Then we + +---------------------+ check inode and look up the next element. + | + dentry2 V + +---------------------+ Note: if dentry0 is now modified, lookup is + | name: "npiggin" | not necessarily invalid, so we need only keep a + | inode: 543 | parent for d_seq verification, and grandparents + | children:"a.c", ... | can be forgotten. + +---------------------+ + | + dentry3 V + +---------------------+ At this point we have our destination dentry. + | name: "a.c" | We now take its d_lock, verify d_seq of this + | inode: 14221 | dentry. If that checks out, we can increment + | children:NULL | its refcount because we're holding d_lock. + +---------------------+ + +Taking a refcount on a dentry from rcu-walk mode, by taking its d_lock, +re-checking its d_seq, and then incrementing its refcount is called +"dropping rcu" or dropping from rcu-walk into ref-walk mode. + +It is, in some sense, a bit of a house of cards. If the seqcount check of the +parent snapshot fails, the house comes down, because we had closed the d_seq +section on the grandparent, so we have nothing left to stand on. In that case, +the path walk must be fully restarted (which we do in ref-walk mode, to avoid +live locks). It is costly to have a full restart, but fortunately they are +quite rare. + +When we reach a point where sleeping is required, or a filesystem callout +requires ref-walk, then instead of restarting the walk, we attempt to drop rcu +at the last known good dentry we have. Avoiding a full restart in ref-walk in +these cases is fundamental for performance and scalability because blocking +operations such as creates and unlinks are not uncommon. + +The detailed design for rcu-walk is like this: +* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk. +* Take the RCU lock for the entire path walk, starting with the acquiring + of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are + not required for dentry persistence. +* synchronize_rcu is called when unregistering a filesystem, so we can + access d_ops and i_ops during rcu-walk. +* Similarly take the vfsmount lock for the entire path walk. So now mnt + refcounts are not required for persistence. Also we are free to perform mount + lookups, and to assume dentry mount points and mount roots are stable up and + down the path. +* Have a per-dentry seqlock to protect the dentry name, parent, and inode, + so we can load this tuple atomically, and also check whether any of its + members have changed. +* Dentry lookups (based on parent, candidate string tuple) recheck the parent + sequence after the child is found in case anything changed in the parent + during the path walk. +* inode is also RCU protected so we can load d_inode and use the inode for + limited things. +* i_mode, i_uid, i_gid can be tested for exec permissions during path walk. +* i_op can be loaded. +* When the destination dentry is reached, drop rcu there (ie. take d_lock, + verify d_seq, increment refcount). +* If seqlock verification fails anywhere along the path, do a full restart + of the path lookup in ref-walk mode. -ECHILD tends to be used (for want of + a better errno) to signal an rcu-walk failure. + +The cases where rcu-walk cannot continue are: +* NULL dentry (ie. any uncached path element) +* parent with d_inode->i_op->permission or ACLs +* dentries with d_revalidate +* Following links + +In future patches, permission checks and d_revalidate become rcu-walk aware. It +may be possible eventually to make following links rcu-walk aware. + +Uncached path elements will always require dropping to ref-walk mode, at the +very least because i_mutex needs to be grabbed, and objects allocated. + +Final note: +"store-free" path walking is not strictly store free. We take vfsmount lock +and refcounts (both of which can be made per-cpu), and we also store to the +stack (which is essentially CPU-local), and we also have to take locks and +refcount on final dentry. + +The point is that shared data, where practically possible, is not locked +or stored into. The result is massive improvements in performance and +scalability of path resolution. + + +Papers and other documentation on dcache locking +================================================ + +1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). + +2. http://lse.sourceforge.net/locking/dcache/dcache.html diff --git a/fs/dcache.c b/fs/dcache.c index dc0551c9755d..187fea040108 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -152,9 +152,23 @@ static void d_free(struct dentry *dentry) call_rcu(&dentry->d_u.d_rcu, __d_free); } +/** + * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups + * After this call, in-progress rcu-walk path lookup will fail. This + * should be called after unhashing, and after changing d_inode (if + * the dentry has not already been unhashed). + */ +static inline void dentry_rcuwalk_barrier(struct dentry *dentry) +{ + assert_spin_locked(&dentry->d_lock); + /* Go through a barrier */ + write_seqcount_barrier(&dentry->d_seq); +} + /* * Release the dentry's inode, using the filesystem - * d_iput() operation if defined. + * d_iput() operation if defined. Dentry has no refcount + * and is unhashed. */ static void dentry_iput(struct dentry * dentry) __releases(dentry->d_lock) @@ -178,6 +192,28 @@ static void dentry_iput(struct dentry * dentry) } } +/* + * Release the dentry's inode, using the filesystem + * d_iput() operation if defined. dentry remains in-use. + */ +static void dentry_unlink_inode(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dcache_inode_lock) +{ + struct inode *inode = dentry->d_inode; + dentry->d_inode = NULL; + list_del_init(&dentry->d_alias); + dentry_rcuwalk_barrier(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_inode_lock); + if (!inode->i_nlink) + fsnotify_inoderemove(inode); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); +} + /* * dentry_lru_(add|del|move_tail) must be called with d_lock held. */ @@ -272,6 +308,7 @@ void __d_drop(struct dentry *dentry) spin_lock(&dcache_hash_lock); hlist_del_rcu(&dentry->d_hash); spin_unlock(&dcache_hash_lock); + dentry_rcuwalk_barrier(dentry); } } EXPORT_SYMBOL(__d_drop); @@ -309,6 +346,7 @@ relock: spin_unlock(&dcache_inode_lock); goto relock; } + if (ref) dentry->d_count--; /* if dentry was on the d_lru list delete it from there */ @@ -1221,6 +1259,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_count = 1; dentry->d_flags = DCACHE_UNHASHED; spin_lock_init(&dentry->d_lock); + seqcount_init(&dentry->d_seq); dentry->d_inode = NULL; dentry->d_parent = NULL; dentry->d_sb = NULL; @@ -1269,6 +1308,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) if (inode) list_add(&dentry->d_alias, &inode->i_dentry); dentry->d_inode = inode; + dentry_rcuwalk_barrier(dentry); spin_unlock(&dentry->d_lock); fsnotify_d_instantiate(dentry, inode); } @@ -1610,6 +1650,111 @@ err_out: } EXPORT_SYMBOL(d_add_ci); +/** + * __d_lookup_rcu - search for a dentry (racy, store-free) + * @parent: parent dentry + * @name: qstr of name we wish to find + * @seq: returns d_seq value at the point where the dentry was found + * @inode: returns dentry->d_inode when the inode was found valid. + * Returns: dentry, or NULL + * + * __d_lookup_rcu is the dcache lookup function for rcu-walk name + * resolution (store-free path walking) design described in + * Documentation/filesystems/path-lookup.txt. + * + * This is not to be used outside core vfs. + * + * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock + * held, and rcu_read_lock held. The returned dentry must not be stored into + * without taking d_lock and checking d_seq sequence count against @seq + * returned here. + * + * A refcount may be taken on the found dentry with the __d_rcu_to_refcount + * function. + * + * Alternatively, __d_lookup_rcu may be called again to look up the child of + * the returned dentry, so long as its parent's seqlock is checked after the + * child is looked up. Thus, an interlocking stepping of sequence lock checks + * is formed, giving integrity down the path walk. + */ +struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, + unsigned *seq, struct inode **inode) +{ + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct hlist_head *head = d_hash(parent, hash); + struct hlist_node *node; + struct dentry *dentry; + + /* + * Note: There is significant duplication with __d_lookup_rcu which is + * required to prevent single threaded performance regressions + * especially on architectures where smp_rmb (in seqcounts) are costly. + * Keep the two functions in sync. + */ + + /* + * The hash list is protected using RCU. + * + * Carefully use d_seq when comparing a candidate dentry, to avoid + * races with d_move(). + * + * It is possible that concurrent renames can mess up our list + * walk here and result in missing our dentry, resulting in the + * false-negative result. d_lookup() protects against concurrent + * renames using rename_lock seqlock. + * + * See Documentation/vfs/dcache-locking.txt for more details. + */ + hlist_for_each_entry_rcu(dentry, node, head, d_hash) { + struct inode *i; + const char *tname; + int tlen; + + if (dentry->d_name.hash != hash) + continue; + +seqretry: + *seq = read_seqcount_begin(&dentry->d_seq); + if (dentry->d_parent != parent) + continue; + if (d_unhashed(dentry)) + continue; + tlen = dentry->d_name.len; + tname = dentry->d_name.name; + i = dentry->d_inode; + /* + * This seqcount check is required to ensure name and + * len are loaded atomically, so as not to walk off the + * edge of memory when walking. If we could load this + * atomically some other way, we could drop this check. + */ + if (read_seqcount_retry(&dentry->d_seq, *seq)) + goto seqretry; + if (parent->d_op && parent->d_op->d_compare) { + if (parent->d_op->d_compare(parent, *inode, + dentry, i, + tlen, tname, name)) + continue; + } else { + if (tlen != len) + continue; + if (memcmp(tname, str, tlen)) + continue; + } + /* + * No extra seqcount check is required after the name + * compare. The caller must perform a seqcount check in + * order to do anything useful with the returned dentry + * anyway. + */ + *inode = i; + return dentry; + } + return NULL; +} + /** * d_lookup - search for a dentry * @parent: parent dentry @@ -1621,9 +1766,9 @@ EXPORT_SYMBOL(d_add_ci); * dentry is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned if the dentry does not exist. */ -struct dentry * d_lookup(struct dentry * parent, struct qstr * name) +struct dentry *d_lookup(struct dentry *parent, struct qstr *name) { - struct dentry * dentry = NULL; + struct dentry *dentry; unsigned seq; do { @@ -1636,7 +1781,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name) } EXPORT_SYMBOL(d_lookup); -/* +/** * __d_lookup - search for a dentry (racy) * @parent: parent dentry * @name: qstr of name we wish to find @@ -1651,16 +1796,23 @@ EXPORT_SYMBOL(d_lookup); * * __d_lookup callers must be commented. */ -struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) +struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) { unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; struct hlist_head *head = d_hash(parent,hash); - struct dentry *found = NULL; struct hlist_node *node; + struct dentry *found = NULL; struct dentry *dentry; + /* + * Note: There is significant duplication with __d_lookup_rcu which is + * required to prevent single threaded performance regressions + * especially on architectures where smp_rmb (in seqcounts) are costly. + * Keep the two functions in sync. + */ + /* * The hash list is protected using RCU. * @@ -1677,24 +1829,15 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) rcu_read_lock(); hlist_for_each_entry_rcu(dentry, node, head, d_hash) { - struct qstr *qstr; + const char *tname; + int tlen; if (dentry->d_name.hash != hash) continue; - if (dentry->d_parent != parent) - continue; spin_lock(&dentry->d_lock); - - /* - * Recheck the dentry after taking the lock - d_move may have - * changed things. Don't bother checking the hash because - * we're about to compare the whole name anyway. - */ if (dentry->d_parent != parent) goto next; - - /* non-existing due to RCU? */ if (d_unhashed(dentry)) goto next; @@ -1702,16 +1845,17 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) * It is safe to compare names since d_move() cannot * change the qstr (protected by d_lock). */ - qstr = &dentry->d_name; + tlen = dentry->d_name.len; + tname = dentry->d_name.name; if (parent->d_op && parent->d_op->d_compare) { if (parent->d_op->d_compare(parent, parent->d_inode, dentry, dentry->d_inode, - qstr->len, qstr->name, name)) + tlen, tname, name)) goto next; } else { - if (qstr->len != len) + if (tlen != len) goto next; - if (memcmp(qstr->name, str, len)) + if (memcmp(tname, str, tlen)) goto next; } @@ -1821,7 +1965,7 @@ again: goto again; } dentry->d_flags &= ~DCACHE_CANT_MOUNT; - dentry_iput(dentry); + dentry_unlink_inode(dentry); fsnotify_nameremove(dentry, isdir); return; } @@ -1884,7 +2028,9 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ spin_lock(&dentry->d_lock); + write_seqcount_begin(&dentry->d_seq); memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); + write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(dentry_update_name_case); @@ -1997,6 +2143,9 @@ void d_move(struct dentry * dentry, struct dentry * target) dentry_lock_for_move(dentry, target); + write_seqcount_begin(&dentry->d_seq); + write_seqcount_begin(&target->d_seq); + /* Move the dentry to the target hash queue, if on different bucket */ spin_lock(&dcache_hash_lock); if (!d_unhashed(dentry)) @@ -2005,6 +2154,7 @@ void d_move(struct dentry * dentry, struct dentry * target) spin_unlock(&dcache_hash_lock); /* Unhash the target: dput() will then get rid of it */ + /* __d_drop does write_seqcount_barrier, but they're OK to nest. */ __d_drop(target); list_del(&dentry->d_u.d_child); @@ -2028,6 +2178,9 @@ void d_move(struct dentry * dentry, struct dentry * target) list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + write_seqcount_end(&target->d_seq); + write_seqcount_end(&dentry->d_seq); + dentry_unlock_parents_for_move(dentry, target); spin_unlock(&target->d_lock); fsnotify_d_move(dentry); @@ -2110,6 +2263,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) dentry_lock_for_move(anon, dentry); + write_seqcount_begin(&dentry->d_seq); + write_seqcount_begin(&anon->d_seq); + dparent = dentry->d_parent; aparent = anon->d_parent; @@ -2130,6 +2286,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) else INIT_LIST_HEAD(&anon->d_u.d_child); + write_seqcount_end(&dentry->d_seq); + write_seqcount_end(&anon->d_seq); + dentry_unlock_parents_for_move(anon, dentry); spin_unlock(&dentry->d_lock); diff --git a/fs/filesystems.c b/fs/filesystems.c index 68ba492d8eef..751d6b255a12 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs) tmp = &(*tmp)->next; } write_unlock(&file_systems_lock); + + synchronize_rcu(); + return -EINVAL; } diff --git a/fs/namei.c b/fs/namei.c index 5642bc2be418..8d3f15b3a541 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname); /* * This does basic POSIX ACL permission checking */ -static int acl_permission_check(struct inode *inode, int mask, - int (*check_acl)(struct inode *inode, int mask)) +static inline int __acl_permission_check(struct inode *inode, int mask, + int (*check_acl)(struct inode *inode, int mask), int rcu) { umode_t mode = inode->i_mode; @@ -180,9 +180,13 @@ static int acl_permission_check(struct inode *inode, int mask, mode >>= 6; else { if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { - int error = check_acl(inode, mask); - if (error != -EAGAIN) - return error; + if (rcu) { + return -ECHILD; + } else { + int error = check_acl(inode, mask); + if (error != -EAGAIN) + return error; + } } if (in_group_p(inode->i_gid)) @@ -197,6 +201,12 @@ static int acl_permission_check(struct inode *inode, int mask, return -EACCES; } +static inline int acl_permission_check(struct inode *inode, int mask, + int (*check_acl)(struct inode *inode, int mask)) +{ + return __acl_permission_check(inode, mask, check_acl, 0); +} + /** * generic_permission - check for access rights on a Posix-like filesystem * @inode: inode to check access rights for @@ -374,6 +384,173 @@ void path_put(struct path *path) } EXPORT_SYMBOL(path_put); +/** + * nameidata_drop_rcu - drop this nameidata out of rcu-walk + * @nd: nameidata pathwalk data to drop + * @Returns: 0 on success, -ECHLID on failure + * + * Path walking has 2 modes, rcu-walk and ref-walk (see + * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt + * to drop out of rcu-walk mode and take normal reference counts on dentries + * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take + * refcounts at the last known good point before rcu-walk got stuck, so + * ref-walk may continue from there. If this is not successful (eg. a seqcount + * has changed), then failure is returned and path walk restarts from the + * beginning in ref-walk mode. + * + * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into + * ref-walk. Must be called from rcu-walk context. + */ +static int nameidata_drop_rcu(struct nameidata *nd) +{ + struct fs_struct *fs = current->fs; + struct dentry *dentry = nd->path.dentry; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + if (nd->root.mnt) { + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || + nd->root.dentry != fs->root.dentry) + goto err_root; + } + spin_lock(&dentry->d_lock); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err; + BUG_ON(nd->inode != dentry->d_inode); + spin_unlock(&dentry->d_lock); + if (nd->root.mnt) { + path_get(&nd->root); + spin_unlock(&fs->lock); + } + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + nd->flags &= ~LOOKUP_RCU; + return 0; +err: + spin_unlock(&dentry->d_lock); +err_root: + if (nd->root.mnt) + spin_unlock(&fs->lock); + return -ECHILD; +} + +/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ +static inline int nameidata_drop_rcu_maybe(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) + return nameidata_drop_rcu(nd); + return 0; +} + +/** + * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk + * @nd: nameidata pathwalk data to drop + * @dentry: dentry to drop + * @Returns: 0 on success, -ECHLID on failure + * + * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, + * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on + * @nd. Must be called from rcu-walk context. + */ +static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry) +{ + struct fs_struct *fs = current->fs; + struct dentry *parent = nd->path.dentry; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + if (nd->root.mnt) { + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || + nd->root.dentry != fs->root.dentry) + goto err_root; + } + spin_lock(&parent->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err; + /* + * If the sequence check on the child dentry passed, then the child has + * not been removed from its parent. This means the parent dentry must + * be valid and able to take a reference at this point. + */ + BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); + BUG_ON(!parent->d_count); + parent->d_count++; + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); + if (nd->root.mnt) { + path_get(&nd->root); + spin_unlock(&fs->lock); + } + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + nd->flags &= ~LOOKUP_RCU; + return 0; +err: + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); +err_root: + if (nd->root.mnt) + spin_unlock(&fs->lock); + return -ECHILD; +} + +/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ +static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) +{ + if (nd->flags & LOOKUP_RCU) + return nameidata_dentry_drop_rcu(nd, dentry); + return 0; +} + +/** + * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk + * @nd: nameidata pathwalk data to drop + * @Returns: 0 on success, -ECHLID on failure + * + * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk. + * nd->path should be the final element of the lookup, so nd->root is discarded. + * Must be called from rcu-walk context. + */ +static int nameidata_drop_rcu_last(struct nameidata *nd) +{ + struct dentry *dentry = nd->path.dentry; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + nd->flags &= ~LOOKUP_RCU; + nd->root.mnt = NULL; + spin_lock(&dentry->d_lock); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err_unlock; + BUG_ON(nd->inode != dentry->d_inode); + spin_unlock(&dentry->d_lock); + + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + + return 0; + +err_unlock: + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; +} + +/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ +static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd) +{ + if (likely(nd->flags & LOOKUP_RCU)) + return nameidata_drop_rcu_last(nd); + return 0; +} + /** * release_open_intent - free up open intent resources * @nd: pointer to nameidata @@ -459,26 +636,40 @@ force_reval_path(struct path *path, struct nameidata *nd) * short-cut DAC fails, then call ->permission() to do more * complete permission check. */ -static int exec_permission(struct inode *inode) +static inline int __exec_permission(struct inode *inode, int rcu) { int ret; if (inode->i_op->permission) { + if (rcu) + return -ECHILD; ret = inode->i_op->permission(inode, MAY_EXEC); if (!ret) goto ok; return ret; } - ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); + ret = __acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl, rcu); if (!ret) goto ok; + if (rcu && ret == -ECHILD) + return ret; if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) goto ok; return ret; ok: - return security_inode_permission(inode, MAY_EXEC); + return security_inode_exec_permission(inode, rcu); +} + +static int exec_permission(struct inode *inode) +{ + return __exec_permission(inode, 0); +} + +static int exec_permission_rcu(struct inode *inode) +{ + return __exec_permission(inode, 1); } static __always_inline void set_root(struct nameidata *nd) @@ -489,8 +680,20 @@ static __always_inline void set_root(struct nameidata *nd) static int link_path_walk(const char *, struct nameidata *); +static __always_inline void set_root_rcu(struct nameidata *nd) +{ + if (!nd->root.mnt) { + struct fs_struct *fs = current->fs; + spin_lock(&fs->lock); + nd->root = fs->root; + spin_unlock(&fs->lock); + } +} + static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) { + int ret; + if (IS_ERR(link)) goto fail; @@ -500,8 +703,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l nd->path = nd->root; path_get(&nd->root); } + nd->inode = nd->path.dentry->d_inode; - return link_path_walk(link, nd); + ret = link_path_walk(link, nd); + return ret; fail: path_put(&nd->path); return PTR_ERR(link); @@ -516,11 +721,12 @@ static void path_put_conditional(struct path *path, struct nameidata *nd) static inline void path_to_nameidata(struct path *path, struct nameidata *nd) { - dput(nd->path.dentry); - if (nd->path.mnt != path->mnt) { - mntput(nd->path.mnt); - nd->path.mnt = path->mnt; + if (!(nd->flags & LOOKUP_RCU)) { + dput(nd->path.dentry); + if (nd->path.mnt != path->mnt) + mntput(nd->path.mnt); } + nd->path.mnt = path->mnt; nd->path.dentry = path->dentry; } @@ -535,9 +741,11 @@ __do_follow_link(struct path *path, struct nameidata *nd, void **p) if (path->mnt != nd->path.mnt) { path_to_nameidata(path, nd); + nd->inode = nd->path.dentry->d_inode; dget(dentry); } mntget(path->mnt); + nd->last_type = LAST_BIND; *p = dentry->d_inode->i_op->follow_link(dentry, nd); error = PTR_ERR(*p); @@ -591,6 +799,20 @@ loop: return err; } +static int follow_up_rcu(struct path *path) +{ + struct vfsmount *parent; + struct dentry *mountpoint; + + parent = path->mnt->mnt_parent; + if (parent == path->mnt) + return 0; + mountpoint = path->mnt->mnt_mountpoint; + path->dentry = mountpoint; + path->mnt = parent; + return 1; +} + int follow_up(struct path *path) { struct vfsmount *parent; @@ -615,6 +837,21 @@ int follow_up(struct path *path) /* * serialization is taken care of in namespace.c */ +static void __follow_mount_rcu(struct nameidata *nd, struct path *path, + struct inode **inode) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted; + mounted = __lookup_mnt(path->mnt, path->dentry, 1); + if (!mounted) + return; + path->mnt = mounted; + path->dentry = mounted->mnt_root; + nd->seq = read_seqcount_begin(&path->dentry->d_seq); + *inode = path->dentry->d_inode; + } +} + static int __follow_mount(struct path *path) { int res = 0; @@ -660,7 +897,42 @@ int follow_down(struct path *path) return 0; } -static __always_inline void follow_dotdot(struct nameidata *nd) +static int follow_dotdot_rcu(struct nameidata *nd) +{ + struct inode *inode = nd->inode; + + set_root_rcu(nd); + + while(1) { + if (nd->path.dentry == nd->root.dentry && + nd->path.mnt == nd->root.mnt) { + break; + } + if (nd->path.dentry != nd->path.mnt->mnt_root) { + struct dentry *old = nd->path.dentry; + struct dentry *parent = old->d_parent; + unsigned seq; + + seq = read_seqcount_begin(&parent->d_seq); + if (read_seqcount_retry(&old->d_seq, nd->seq)) + return -ECHILD; + inode = parent->d_inode; + nd->path.dentry = parent; + nd->seq = seq; + break; + } + if (!follow_up_rcu(&nd->path)) + break; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + inode = nd->path.dentry->d_inode; + } + __follow_mount_rcu(nd, &nd->path, &inode); + nd->inode = inode; + + return 0; +} + +static void follow_dotdot(struct nameidata *nd) { set_root(nd); @@ -681,6 +953,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd) break; } follow_mount(&nd->path); + nd->inode = nd->path.dentry->d_inode; } /* @@ -718,18 +991,17 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent, * It _is_ time-critical. */ static int do_lookup(struct nameidata *nd, struct qstr *name, - struct path *path) + struct path *path, struct inode **inode) { struct vfsmount *mnt = nd->path.mnt; - struct dentry *dentry, *parent; + struct dentry *dentry, *parent = nd->path.dentry; struct inode *dir; /* * See if the low-level filesystem might want * to use its own hash.. */ - if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { - int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, - nd->path.dentry->d_inode, name); + if (parent->d_op && parent->d_op->d_hash) { + int err = parent->d_op->d_hash(parent, nd->inode, name); if (err < 0) return err; } @@ -739,21 +1011,48 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, * of a false negative due to a concurrent rename, we're going to * do the non-racy lookup, below. */ - dentry = __d_lookup(nd->path.dentry, name); - if (!dentry) - goto need_lookup; + if (nd->flags & LOOKUP_RCU) { + unsigned seq; + + *inode = nd->inode; + dentry = __d_lookup_rcu(parent, name, &seq, inode); + if (!dentry) { + if (nameidata_drop_rcu(nd)) + return -ECHILD; + goto need_lookup; + } + /* Memory barrier in read_seqcount_begin of child is enough */ + if (__read_seqcount_retry(&parent->d_seq, nd->seq)) + return -ECHILD; + + nd->seq = seq; + if (dentry->d_op && dentry->d_op->d_revalidate) { + /* We commonly drop rcu-walk here */ + if (nameidata_dentry_drop_rcu(nd, dentry)) + return -ECHILD; + goto need_revalidate; + } + path->mnt = mnt; + path->dentry = dentry; + __follow_mount_rcu(nd, path, inode); + } else { + dentry = __d_lookup(parent, name); + if (!dentry) + goto need_lookup; found: - if (dentry->d_op && dentry->d_op->d_revalidate) - goto need_revalidate; + if (dentry->d_op && dentry->d_op->d_revalidate) + goto need_revalidate; done: - path->mnt = mnt; - path->dentry = dentry; - __follow_mount(path); + path->mnt = mnt; + path->dentry = dentry; + __follow_mount(path); + *inode = path->dentry->d_inode; + } return 0; need_lookup: - parent = nd->path.dentry; dir = parent->d_inode; + BUG_ON(nd->inode != dir); mutex_lock(&dir->i_mutex); /* @@ -815,7 +1114,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags) static int link_path_walk(const char *name, struct nameidata *nd) { struct path next; - struct inode *inode; int err; unsigned int lookup_flags = nd->flags; @@ -824,18 +1122,28 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (!*name) goto return_reval; - inode = nd->path.dentry->d_inode; if (nd->depth) lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); /* At this point we know we have a real path component. */ for(;;) { + struct inode *inode; unsigned long hash; struct qstr this; unsigned int c; nd->flags |= LOOKUP_CONTINUE; - err = exec_permission(inode); + if (nd->flags & LOOKUP_RCU) { + err = exec_permission_rcu(nd->inode); + if (err == -ECHILD) { + if (nameidata_drop_rcu(nd)) + return -ECHILD; + goto exec_again; + } + } else { +exec_again: + err = exec_permission(nd->inode); + } if (err) break; @@ -866,37 +1174,44 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (this.name[0] == '.') switch (this.len) { default: break; - case 2: + case 2: if (this.name[1] != '.') break; - follow_dotdot(nd); - inode = nd->path.dentry->d_inode; + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return -ECHILD; + } else + follow_dotdot(nd); /* fallthrough */ case 1: continue; } /* This does the actual lookups.. */ - err = do_lookup(nd, &this, &next); + err = do_lookup(nd, &this, &next, &inode); if (err) break; - err = -ENOENT; - inode = next.dentry->d_inode; if (!inode) goto out_dput; if (inode->i_op->follow_link) { + /* We commonly drop rcu-walk here */ + if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) + return -ECHILD; + BUG_ON(inode != next.dentry->d_inode); err = do_follow_link(&next, nd); if (err) goto return_err; + nd->inode = nd->path.dentry->d_inode; err = -ENOENT; - inode = nd->path.dentry->d_inode; - if (!inode) + if (!nd->inode) break; - } else + } else { path_to_nameidata(&next, nd); + nd->inode = inode; + } err = -ENOTDIR; - if (!inode->i_op->lookup) + if (!nd->inode->i_op->lookup) break; continue; /* here ends the main loop */ @@ -911,32 +1226,39 @@ last_component: if (this.name[0] == '.') switch (this.len) { default: break; - case 2: + case 2: if (this.name[1] != '.') break; - follow_dotdot(nd); - inode = nd->path.dentry->d_inode; + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return -ECHILD; + } else + follow_dotdot(nd); /* fallthrough */ case 1: goto return_reval; } - err = do_lookup(nd, &this, &next); + err = do_lookup(nd, &this, &next, &inode); if (err) break; - inode = next.dentry->d_inode; if (follow_on_final(inode, lookup_flags)) { + if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) + return -ECHILD; + BUG_ON(inode != next.dentry->d_inode); err = do_follow_link(&next, nd); if (err) goto return_err; - inode = nd->path.dentry->d_inode; - } else + nd->inode = nd->path.dentry->d_inode; + } else { path_to_nameidata(&next, nd); + nd->inode = inode; + } err = -ENOENT; - if (!inode) + if (!nd->inode) break; if (lookup_flags & LOOKUP_DIRECTORY) { err = -ENOTDIR; - if (!inode->i_op->lookup) + if (!nd->inode->i_op->lookup) break; } goto return_base; @@ -958,6 +1280,8 @@ return_reval: */ if (nd->path.dentry && nd->path.dentry->d_sb && (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { + if (nameidata_drop_rcu_maybe(nd)) + return -ECHILD; err = -ESTALE; /* Note: we do not d_invalidate() */ if (!nd->path.dentry->d_op->d_revalidate( @@ -965,16 +1289,34 @@ return_reval: break; } return_base: + if (nameidata_drop_rcu_last_maybe(nd)) + return -ECHILD; return 0; out_dput: - path_put_conditional(&next, nd); + if (!(nd->flags & LOOKUP_RCU)) + path_put_conditional(&next, nd); break; } - path_put(&nd->path); + if (!(nd->flags & LOOKUP_RCU)) + path_put(&nd->path); return_err: return err; } +static inline int path_walk_rcu(const char *name, struct nameidata *nd) +{ + current->total_link_count = 0; + + return link_path_walk(name, nd); +} + +static inline int path_walk_simple(const char *name, struct nameidata *nd) +{ + current->total_link_count = 0; + + return link_path_walk(name, nd); +} + static int path_walk(const char *name, struct nameidata *nd) { struct path save = nd->path; @@ -1000,6 +1342,88 @@ static int path_walk(const char *name, struct nameidata *nd) return result; } +static void path_finish_rcu(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) { + /* RCU dangling. Cancel it. */ + nd->flags &= ~LOOKUP_RCU; + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + } + if (nd->file) + fput(nd->file); +} + +static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd) +{ + int retval = 0; + int fput_needed; + struct file *file; + + nd->last_type = LAST_ROOT; /* if there are only slashes... */ + nd->flags = flags | LOOKUP_RCU; + nd->depth = 0; + nd->root.mnt = NULL; + nd->file = NULL; + + if (*name=='/') { + struct fs_struct *fs = current->fs; + + br_read_lock(vfsmount_lock); + rcu_read_lock(); + + spin_lock(&fs->lock); + nd->root = fs->root; + nd->path = nd->root; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + spin_unlock(&fs->lock); + + } else if (dfd == AT_FDCWD) { + struct fs_struct *fs = current->fs; + + br_read_lock(vfsmount_lock); + rcu_read_lock(); + + spin_lock(&fs->lock); + nd->path = fs->pwd; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + spin_unlock(&fs->lock); + } else { + struct dentry *dentry; + + file = fget_light(dfd, &fput_needed); + retval = -EBADF; + if (!file) + goto out_fail; + + dentry = file->f_path.dentry; + + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) + goto fput_fail; + + retval = file_permission(file, MAY_EXEC); + if (retval) + goto fput_fail; + + nd->path = file->f_path; + if (fput_needed) + nd->file = file; + + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + } + nd->inode = nd->path.dentry->d_inode; + return 0; + +fput_fail: + fput_light(file, fput_needed); +out_fail: + return retval; +} + static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { int retval = 0; @@ -1040,6 +1464,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei fput_light(file, fput_needed); } + nd->inode = nd->path.dentry->d_inode; return 0; fput_fail: @@ -1052,16 +1477,53 @@ out_fail: static int do_path_lookup(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { - int retval = path_init(dfd, name, flags, nd); - if (!retval) - retval = path_walk(name, nd); - if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && - nd->path.dentry->d_inode)) - audit_inode(name, nd->path.dentry); + int retval; + + /* + * Path walking is largely split up into 2 different synchronisation + * schemes, rcu-walk and ref-walk (explained in + * Documentation/filesystems/path-lookup.txt). These share much of the + * path walk code, but some things particularly setup, cleanup, and + * following mounts are sufficiently divergent that functions are + * duplicated. Typically there is a function foo(), and its RCU + * analogue, foo_rcu(). + * + * -ECHILD is the error number of choice (just to avoid clashes) that + * is returned if some aspect of an rcu-walk fails. Such an error must + * be handled by restarting a traditional ref-walk (which will always + * be able to complete). + */ + retval = path_init_rcu(dfd, name, flags, nd); + if (unlikely(retval)) + return retval; + retval = path_walk_rcu(name, nd); + path_finish_rcu(nd); if (nd->root.mnt) { path_put(&nd->root); nd->root.mnt = NULL; } + + if (unlikely(retval == -ECHILD || retval == -ESTALE)) { + /* slower, locked walk */ + if (retval == -ESTALE) + flags |= LOOKUP_REVAL; + retval = path_init(dfd, name, flags, nd); + if (unlikely(retval)) + return retval; + retval = path_walk(name, nd); + if (nd->root.mnt) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + } + + if (likely(!retval)) { + if (unlikely(!audit_dummy_context())) { + if (nd->path.dentry && nd->inode) + audit_inode(name, nd->path.dentry); + } + } + return retval; } @@ -1104,10 +1566,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, path_get(&nd->path); nd->root = nd->path; path_get(&nd->root); + nd->inode = nd->path.dentry->d_inode; retval = path_walk(name, nd); if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && - nd->path.dentry->d_inode)) + nd->inode)) audit_inode(name, nd->path.dentry); path_put(&nd->root); @@ -1488,6 +1951,7 @@ out_unlock: mutex_unlock(&dir->d_inode->i_mutex); dput(nd->path.dentry); nd->path.dentry = path->dentry; + if (error) return error; /* Don't check for write permission, don't truncate */ @@ -1582,6 +2046,9 @@ exit: return ERR_PTR(error); } +/* + * Handle O_CREAT case for do_filp_open + */ static struct file *do_last(struct nameidata *nd, struct path *path, int open_flag, int acc_mode, int mode, const char *pathname) @@ -1603,42 +2070,16 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } /* fallthrough */ case LAST_ROOT: - if (open_flag & O_CREAT) - goto exit; - /* fallthrough */ + goto exit; case LAST_BIND: audit_inode(pathname, dir); goto ok; } /* trailing slashes? */ - if (nd->last.name[nd->last.len]) { - if (open_flag & O_CREAT) - goto exit; - nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW; - } - - /* just plain open? */ - if (!(open_flag & O_CREAT)) { - error = do_lookup(nd, &nd->last, path); - if (error) - goto exit; - error = -ENOENT; - if (!path->dentry->d_inode) - goto exit_dput; - if (path->dentry->d_inode->i_op->follow_link) - return NULL; - error = -ENOTDIR; - if (nd->flags & LOOKUP_DIRECTORY) { - if (!path->dentry->d_inode->i_op->lookup) - goto exit_dput; - } - path_to_nameidata(path, nd); - audit_inode(pathname, nd->path.dentry); - goto ok; - } + if (nd->last.name[nd->last.len]) + goto exit; - /* OK, it's O_CREAT */ mutex_lock(&dir->d_inode->i_mutex); path->dentry = lookup_hash(nd); @@ -1709,8 +2150,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, return NULL; path_to_nameidata(path, nd); + nd->inode = path->dentry->d_inode; error = -EISDIR; - if (S_ISDIR(path->dentry->d_inode->i_mode)) + if (S_ISDIR(nd->inode->i_mode)) goto exit; ok: filp = finish_open(nd, open_flag, acc_mode); @@ -1741,7 +2183,7 @@ struct file *do_filp_open(int dfd, const char *pathname, struct path path; int count = 0; int flag = open_to_namei_flags(open_flag); - int force_reval = 0; + int flags; if (!(open_flag & O_CREAT)) mode = 0; @@ -1770,54 +2212,84 @@ struct file *do_filp_open(int dfd, const char *pathname, if (open_flag & O_APPEND) acc_mode |= MAY_APPEND; - /* find the parent */ -reval: - error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); + flags = LOOKUP_OPEN; + if (open_flag & O_CREAT) { + flags |= LOOKUP_CREATE; + if (open_flag & O_EXCL) + flags |= LOOKUP_EXCL; + } + if (open_flag & O_DIRECTORY) + flags |= LOOKUP_DIRECTORY; + if (!(open_flag & O_NOFOLLOW)) + flags |= LOOKUP_FOLLOW; + + filp = get_empty_filp(); + if (!filp) + return ERR_PTR(-ENFILE); + + filp->f_flags = open_flag; + nd.intent.open.file = filp; + nd.intent.open.flags = flag; + nd.intent.open.create_mode = mode; + + if (open_flag & O_CREAT) + goto creat; + + /* !O_CREAT, simple open */ + error = do_path_lookup(dfd, pathname, flags, &nd); + if (unlikely(error)) + goto out_filp; + error = -ELOOP; + if (!(nd.flags & LOOKUP_FOLLOW)) { + if (nd.inode->i_op->follow_link) + goto out_path; + } + error = -ENOTDIR; + if (nd.flags & LOOKUP_DIRECTORY) { + if (!nd.inode->i_op->lookup) + goto out_path; + } + audit_inode(pathname, nd.path.dentry); + filp = finish_open(&nd, open_flag, acc_mode); + return filp; + +creat: + /* OK, have to create the file. Find the parent. */ + error = path_init_rcu(dfd, pathname, + LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); if (error) - return ERR_PTR(error); - if (force_reval) - nd.flags |= LOOKUP_REVAL; + goto out_filp; + error = path_walk_rcu(pathname, &nd); + path_finish_rcu(&nd); + if (unlikely(error == -ECHILD || error == -ESTALE)) { + /* slower, locked walk */ + if (error == -ESTALE) { +reval: + flags |= LOOKUP_REVAL; + } + error = path_init(dfd, pathname, + LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); + if (error) + goto out_filp; - current->total_link_count = 0; - error = link_path_walk(pathname, &nd); - if (error) { - filp = ERR_PTR(error); - goto out; + error = path_walk_simple(pathname, &nd); } - if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) + if (unlikely(error)) + goto out_filp; + if (unlikely(!audit_dummy_context())) audit_inode(pathname, nd.path.dentry); /* * We have the parent and last component. */ - - error = -ENFILE; - filp = get_empty_filp(); - if (filp == NULL) - goto exit_parent; - nd.intent.open.file = filp; - filp->f_flags = open_flag; - nd.intent.open.flags = flag; - nd.intent.open.create_mode = mode; - nd.flags &= ~LOOKUP_PARENT; - nd.flags |= LOOKUP_OPEN; - if (open_flag & O_CREAT) { - nd.flags |= LOOKUP_CREATE; - if (open_flag & O_EXCL) - nd.flags |= LOOKUP_EXCL; - } - if (open_flag & O_DIRECTORY) - nd.flags |= LOOKUP_DIRECTORY; - if (!(open_flag & O_NOFOLLOW)) - nd.flags |= LOOKUP_FOLLOW; + nd.flags = flags; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); while (unlikely(!filp)) { /* trailing symlink */ struct path holder; - struct inode *inode = path.dentry->d_inode; void *cookie; error = -ELOOP; /* S_ISDIR part is a temporary automount kludge */ - if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode)) + if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode)) goto exit_dput; if (count++ == 32) goto exit_dput; @@ -1838,36 +2310,33 @@ reval: goto exit_dput; error = __do_follow_link(&path, &nd, &cookie); if (unlikely(error)) { + if (!IS_ERR(cookie) && nd.inode->i_op->put_link) + nd.inode->i_op->put_link(path.dentry, &nd, cookie); /* nd.path had been dropped */ - if (!IS_ERR(cookie) && inode->i_op->put_link) - inode->i_op->put_link(path.dentry, &nd, cookie); - path_put(&path); - release_open_intent(&nd); - filp = ERR_PTR(error); - goto out; + nd.path = path; + goto out_path; } holder = path; nd.flags &= ~LOOKUP_PARENT; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - if (inode->i_op->put_link) - inode->i_op->put_link(holder.dentry, &nd, cookie); + if (nd.inode->i_op->put_link) + nd.inode->i_op->put_link(holder.dentry, &nd, cookie); path_put(&holder); } out: if (nd.root.mnt) path_put(&nd.root); - if (filp == ERR_PTR(-ESTALE) && !force_reval) { - force_reval = 1; + if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) goto reval; - } return filp; exit_dput: path_put_conditional(&path, &nd); +out_path: + path_put(&nd.path); +out_filp: if (!IS_ERR(nd.intent.open.file)) release_open_intent(&nd); -exit_parent: - path_put(&nd.path); filp = ERR_PTR(error); goto out; } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ae4b0fd9033f..998e3a715bcc 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -402,6 +402,10 @@ static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry, const struct inode *inode, unsigned int len, const char *str, const struct qstr *name) { + /* Although proc doesn't have negative dentries, rcu-walk means + * that inode here can be NULL */ + if (!inode) + return 0; if (name->len != len) return 1; if (memcmp(name->name, str, len)) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ca648685f0cc..c2e7390289cc 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -90,6 +91,7 @@ struct dentry { unsigned int d_count; /* protected by d_lock */ unsigned int d_flags; /* protected by d_lock */ spinlock_t d_lock; /* per dentry lock */ + seqcount_t d_seq; /* per dentry seqlock */ int d_mounted; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ @@ -266,9 +268,33 @@ extern void d_move(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); /* appendix may either be NULL or be used for transname suffixes */ -extern struct dentry * d_lookup(struct dentry *, struct qstr *); -extern struct dentry * __d_lookup(struct dentry *, struct qstr *); -extern struct dentry * d_hash_and_lookup(struct dentry *, struct qstr *); +extern struct dentry *d_lookup(struct dentry *, struct qstr *); +extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); +extern struct dentry *__d_lookup(struct dentry *, struct qstr *); +extern struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, + unsigned *seq, struct inode **inode); + +/** + * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok + * @dentry: dentry to take a ref on + * @seq: seqcount to verify against + * @Returns: 0 on failure, else 1. + * + * __d_rcu_to_refcount operates on a dentry,seq pair that was returned + * by __d_lookup_rcu, to get a reference on an rcu-walk dentry. + */ +static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq) +{ + int ret = 0; + + assert_spin_locked(&dentry->d_lock); + if (!read_seqcount_retry(&dentry->d_seq, seq)) { + ret = 1; + dentry->d_count++; + } + + return ret; +} /* validate "insecure" dentry pointer */ extern int d_validate(struct dentry *, struct dentry *); diff --git a/include/linux/namei.h b/include/linux/namei.h index aec730b53935..18d06add0a40 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -19,7 +19,10 @@ struct nameidata { struct path path; struct qstr last; struct path root; + struct file *file; + struct inode *inode; /* path.dentry.d_inode */ unsigned int flags; + unsigned seq; int last_type; unsigned depth; char *saved_names[MAX_NESTED_LINKS + 1]; @@ -43,11 +46,13 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; * - internal "there are more path components" flag * - dentry cache is untrusted; force a real lookup */ -#define LOOKUP_FOLLOW 1 -#define LOOKUP_DIRECTORY 2 -#define LOOKUP_CONTINUE 4 -#define LOOKUP_PARENT 16 -#define LOOKUP_REVAL 64 +#define LOOKUP_FOLLOW 0x0001 +#define LOOKUP_DIRECTORY 0x0002 +#define LOOKUP_CONTINUE 0x0004 + +#define LOOKUP_PARENT 0x0010 +#define LOOKUP_REVAL 0x0020 +#define LOOKUP_RCU 0x0040 /* * Intent data */ diff --git a/include/linux/security.h b/include/linux/security.h index fd4d55fb8845..ed95401970c7 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -457,7 +457,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * called when the actual read/write operations are performed. * @inode contains the inode structure to check. * @mask contains the permission mask. - * @nd contains the nameidata (may be NULL). * Return 0 if permission is granted. * @inode_setattr: * Check permission before setting file attributes. Note that the kernel @@ -1713,6 +1712,7 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, int security_inode_readlink(struct dentry *dentry); int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd); int security_inode_permission(struct inode *inode, int mask); +int security_inode_exec_permission(struct inode *inode, unsigned int flags); int security_inode_setattr(struct dentry *dentry, struct iattr *attr); int security_inode_getattr(struct vfsmount *mnt, struct dentry *dentry); int security_inode_setxattr(struct dentry *dentry, const char *name, @@ -2102,6 +2102,12 @@ static inline int security_inode_permission(struct inode *inode, int mask) return 0; } +static inline int security_inode_exec_permission(struct inode *inode, + unsigned int flags) +{ + return 0; +} + static inline int security_inode_setattr(struct dentry *dentry, struct iattr *attr) { diff --git a/security/security.c b/security/security.c index 1b798d3df710..c645e263ca8d 100644 --- a/security/security.c +++ b/security/security.c @@ -513,6 +513,15 @@ int security_inode_permission(struct inode *inode, int mask) return security_ops->inode_permission(inode, mask); } +int security_inode_exec_permission(struct inode *inode, unsigned int flags) +{ + if (unlikely(IS_PRIVATE(inode))) + return 0; + if (flags) + return -ECHILD; + return security_ops->inode_permission(inode, MAY_EXEC); +} + int security_inode_setattr(struct dentry *dentry, struct iattr *attr) { if (unlikely(IS_PRIVATE(dentry->d_inode))) -- cgit v1.2.3 From 34286d6662308d82aed891852d04c7c3a2649b16 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:57 +1100 Subject: fs: rcu-walk aware d_revalidate method Require filesystems be aware of .d_revalidate being called in rcu-walk mode (nd->flags & LOOKUP_RCU). For now do a simple push down, returning -ECHILD from all implementations. Signed-off-by: Nick Piggin --- Documentation/filesystems/Locking | 18 +++++------ Documentation/filesystems/path-lookup.txt | 5 ++- Documentation/filesystems/porting | 20 ++++++++++++ Documentation/filesystems/vfs.txt | 9 ++++++ drivers/staging/autofs/root.c | 5 ++- drivers/staging/smbfs/dir.c | 16 ++++++--- fs/afs/dir.c | 4 +++ fs/autofs4/root.c | 13 ++++++-- fs/ceph/dir.c | 7 +++- fs/cifs/dir.c | 3 ++ fs/coda/dir.c | 7 +++- fs/ecryptfs/dentry.c | 9 ++++-- fs/fat/namei_vfat.c | 6 ++++ fs/fuse/dir.c | 6 +++- fs/gfs2/dentry.c | 17 +++++++--- fs/hfs/sysdep.c | 7 +++- fs/jfs/namei.c | 2 ++ fs/namei.c | 54 ++++++++++++++++++++----------- fs/ncpfs/dir.c | 4 +++ fs/ncpfs/inode.c | 1 + fs/nfs/dir.c | 8 +++-- fs/ocfs2/dcache.c | 10 ++++-- fs/proc/base.c | 33 +++++++++++++++---- fs/proc/proc_sysctl.c | 3 ++ fs/reiserfs/xattr.c | 2 ++ fs/sysfs/dir.c | 6 +++- include/linux/dcache.h | 1 - 27 files changed, 215 insertions(+), 61 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index bdad6414dfa0..e90ffe61eb65 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -9,7 +9,7 @@ be able to use diff(1). --------------------------- dentry_operations -------------------------- prototypes: - int (*d_revalidate)(struct dentry *, int); + int (*d_revalidate)(struct dentry *, struct nameidata *); int (*d_hash)(const struct dentry *, const struct inode *, struct qstr *); int (*d_compare)(const struct dentry *, const struct inode *, @@ -21,14 +21,14 @@ prototypes: char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); locking rules: - rename_lock ->d_lock may block -d_revalidate: no no yes -d_hash no no no -d_compare: yes no no -d_delete: no yes no -d_release: no no yes -d_iput: no no yes -d_dname: no no no + rename_lock ->d_lock may block rcu-walk +d_revalidate: no no yes (ref-walk) maybe +d_hash no no no maybe +d_compare: yes no no maybe +d_delete: no yes no no +d_release: no no yes no +d_iput: no no yes no +d_dname: no no no no --------------------------- inode_operations --------------------------- prototypes: diff --git a/Documentation/filesystems/path-lookup.txt b/Documentation/filesystems/path-lookup.txt index 09b2878724a1..8789d1810bed 100644 --- a/Documentation/filesystems/path-lookup.txt +++ b/Documentation/filesystems/path-lookup.txt @@ -317,11 +317,10 @@ The detailed design for rcu-walk is like this: The cases where rcu-walk cannot continue are: * NULL dentry (ie. any uncached path element) * parent with d_inode->i_op->permission or ACLs -* dentries with d_revalidate * Following links -In future patches, permission checks and d_revalidate become rcu-walk aware. It -may be possible eventually to make following links rcu-walk aware. +In future patches, permission checks become rcu-walk aware. It may be possible +eventually to make following links rcu-walk aware. Uncached path elements will always require dropping to ref-walk mode, at the very least because i_mutex needs to be grabbed, and objects allocated. diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index ccf0ce7866b9..cd9756a2709d 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -360,3 +360,23 @@ i_dentry to be reinitialized before it is freed, so an: INIT_LIST_HEAD(&inode->i_dentry); must be done in the RCU callback. + +-- +[recommended] + vfs now tries to do path walking in "rcu-walk mode", which avoids +atomic operations and scalability hazards on dentries and inodes (see +Documentation/filesystems/path-walk.txt). d_hash and d_compare changes (above) +are examples of the changes required to support this. For more complex +filesystem callbacks, the vfs drops out of rcu-walk mode before the fs call, so +no changes are required to the filesystem. However, this is costly and loses +the benefits of rcu-walk mode. We will begin to add filesystem callbacks that +are rcu-walk aware, shown below. Filesystems should take advantage of this +where possible. + +-- +[mandatory] + d_revalidate is a callback that is made on every path element (if +the filesystem provides it), which requires dropping out of rcu-walk mode. This +may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be +returned if the filesystem cannot handle rcu-walk. See +Documentation/filesystems/vfs.txt for more details. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 69b10ff5ec81..c936b4912383 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -863,6 +863,15 @@ struct dentry_operations { dcache. Most filesystems leave this as NULL, because all their dentries in the dcache are valid + d_revalidate may be called in rcu-walk mode (nd->flags & LOOKUP_RCU). + If in rcu-walk mode, the filesystem must revalidate the dentry without + blocking or storing to the dentry, d_parent and d_inode should not be + used without care (because they can go NULL), instead nd->inode should + be used. + + If a situation is encountered that rcu-walk cannot handle, return + -ECHILD and it will be called again in ref-walk mode. + d_hash: called when the VFS adds a dentry to the hash table. The first dentry passed to d_hash is the parent directory that the name is to be hashed into. The inode is the dentry's inode. diff --git a/drivers/staging/autofs/root.c b/drivers/staging/autofs/root.c index b09adb57971f..bf0e9755da67 100644 --- a/drivers/staging/autofs/root.c +++ b/drivers/staging/autofs/root.c @@ -154,13 +154,16 @@ static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, str * yet completely filled in, and revalidate has to delay such * lookups.. */ -static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd) +static int autofs_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode * dir; struct autofs_sb_info *sbi; struct autofs_dir_ent *ent; int res; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + lock_kernel(); dir = dentry->d_parent->d_inode; sbi = autofs_sbi(dir->i_sb); diff --git a/drivers/staging/smbfs/dir.c b/drivers/staging/smbfs/dir.c index 78f09412740c..dd612f50749f 100644 --- a/drivers/staging/smbfs/dir.c +++ b/drivers/staging/smbfs/dir.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "smb_fs.h" #include "smb_mount.h" @@ -301,13 +302,20 @@ static const struct dentry_operations smbfs_dentry_operations_case = * This is the callback when the dcache has a lookup hit. */ static int -smb_lookup_validate(struct dentry * dentry, struct nameidata *nd) +smb_lookup_validate(struct dentry *dentry, struct nameidata *nd) { - struct smb_sb_info *server = server_from_dentry(dentry); - struct inode * inode = dentry->d_inode; - unsigned long age = jiffies - dentry->d_time; + struct smb_sb_info *server; + struct inode *inode; + unsigned long age; int valid; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + server = server_from_dentry(dentry); + inode = dentry->d_inode; + age = jiffies - dentry->d_time; + /* * The default validation is based on dentry age: * we believe in dentries for a few seconds. (But each diff --git a/fs/afs/dir.c b/fs/afs/dir.c index b8bb7e7148d0..34a3263d60a4 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -607,6 +608,9 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) void *dir_version; int ret; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + vnode = AFS_FS_I(dentry->d_inode); if (dentry->d_inode) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index bfe3f2eb684d..651e4ef563b1 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -315,12 +315,19 @@ out_error: */ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *dir = dentry->d_parent->d_inode; - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - int oz_mode = autofs4_oz_mode(sbi); + struct inode *dir; + struct autofs_sb_info *sbi; + int oz_mode; int flags = nd ? nd->flags : 0; int status = 1; + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dentry->d_parent->d_inode; + sbi = autofs4_sbi(dir->i_sb); + oz_mode = autofs4_oz_mode(sbi); + /* Pending dentry */ spin_lock(&sbi->fs_lock); if (autofs4_ispending(dentry)) { diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index cc01cf826769..fa7ca04ee816 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -990,7 +990,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) */ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + dir = dentry->d_parent->d_inode; dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode, diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index e3b10ca6d453..db2a58c00f7b 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -656,6 +656,9 @@ lookup_out: static int cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + if (direntry->d_inode) { if (cifs_revalidate_dentry(direntry)) return 0; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index aa40c811f8d2..619a8303766e 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -541,9 +542,13 @@ out: /* called when a cache lookup succeeds */ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) { - struct inode *inode = de->d_inode; + struct inode *inode; struct coda_inode_info *cii; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = de->d_inode; if (!inode || coda_isroot(inode)) goto out; if (is_bad_inode(inode)) diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 906e803f7f79..6fc4f319b550 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -44,12 +44,17 @@ */ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + struct dentry *lower_dentry; + struct vfsmount *lower_mnt; struct dentry *dentry_save; struct vfsmount *vfsmount_save; int rc = 1; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) goto out; dentry_save = nd->path.dentry; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 3be5ed7d859f..e3ffc5e12332 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -43,6 +43,9 @@ static int vfat_revalidate_shortname(struct dentry *dentry) static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + /* This is not negative dentry. Always valid. */ if (dentry->d_inode) return 1; @@ -51,6 +54,9 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + /* * This is not negative dentry. Always valid. * diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c9a8a426a395..07f4b5e675fc 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -156,8 +156,12 @@ u64 fuse_get_attr_version(struct fuse_conn *fc) */ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) { - struct inode *inode = entry->d_inode; + struct inode *inode; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + inode = entry->d_inode; if (inode && is_bad_inode(inode)) return 0; else if (fuse_dentry_time(entry) < get_jiffies_64()) { diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 50497f65763b..4a456338b873 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "gfs2.h" @@ -34,15 +35,23 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) { - struct dentry *parent = dget_parent(dentry); - struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); - struct gfs2_inode *dip = GFS2_I(parent->d_inode); - struct inode *inode = dentry->d_inode; + struct dentry *parent; + struct gfs2_sbd *sdp; + struct gfs2_inode *dip; + struct inode *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; int error; int had_lock = 0; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + sdp = GFS2_SB(parent->d_inode); + dip = GFS2_I(parent->d_inode); + inode = dentry->d_inode; + if (inode) { if (is_bad_inode(inode)) goto invalid; diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 7478f5c219aa..19cf291eb91f 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -8,15 +8,20 @@ * This file contains the code to do various system dependent things. */ +#include #include "hfs_fs.h" /* dentry case-handling: just lowercase everything */ static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; + struct inode *inode; int diff; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; if(!inode) return 1; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index a151cbdec626..4414e3a42264 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1608,6 +1608,8 @@ out: static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; /* * This is not negative dentry. Always valid. * diff --git a/fs/namei.c b/fs/namei.c index 90bd2873e117..6e275363e89d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -563,10 +563,26 @@ void release_open_intent(struct nameidata *nd) fput(nd->intent.open.file); } +static int d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + int status; + + status = dentry->d_op->d_revalidate(dentry, nd); + if (status == -ECHILD) { + if (nameidata_dentry_drop_rcu(nd, dentry)) + return status; + status = dentry->d_op->d_revalidate(dentry, nd); + } + + return status; +} + static inline struct dentry * do_revalidate(struct dentry *dentry, struct nameidata *nd) { - int status = dentry->d_op->d_revalidate(dentry, nd); + int status; + + status = d_revalidate(dentry, nd); if (unlikely(status <= 0)) { /* * The dentry failed validation. @@ -574,14 +590,20 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) * the dentry otherwise d_revalidate is asking us * to return a fail status. */ - if (!status) { + if (status < 0) { + /* If we're in rcu-walk, we don't have a ref */ + if (!(nd->flags & LOOKUP_RCU)) + dput(dentry); + dentry = ERR_PTR(status); + + } else { + /* Don't d_invalidate in rcu-walk mode */ + if (nameidata_dentry_drop_rcu_maybe(nd, dentry)) + return ERR_PTR(-ECHILD); if (!d_invalidate(dentry)) { dput(dentry); dentry = NULL; } - } else { - dput(dentry); - dentry = ERR_PTR(status); } } return dentry; @@ -626,7 +648,7 @@ force_reval_path(struct path *path, struct nameidata *nd) if (!need_reval_dot(dentry)) return 0; - status = dentry->d_op->d_revalidate(dentry, nd); + status = d_revalidate(dentry, nd); if (status > 0) return 0; @@ -1039,12 +1061,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, return -ECHILD; nd->seq = seq; - if (dentry->d_flags & DCACHE_OP_REVALIDATE) { - /* We commonly drop rcu-walk here */ - if (nameidata_dentry_drop_rcu(nd, dentry)) - return -ECHILD; + if (dentry->d_flags & DCACHE_OP_REVALIDATE) goto need_revalidate; - } path->mnt = mnt; path->dentry = dentry; __follow_mount_rcu(nd, path, inode); @@ -1292,12 +1310,11 @@ return_reval: * We may need to check the cached dentry for staleness. */ if (need_reval_dot(nd->path.dentry)) { - if (nameidata_drop_rcu_maybe(nd)) - return -ECHILD; - err = -ESTALE; /* Note: we do not d_invalidate() */ - if (!nd->path.dentry->d_op->d_revalidate( - nd->path.dentry, nd)) + err = d_revalidate(nd->path.dentry, nd); + if (!err) + err = -ESTALE; + if (err < 0) break; } return_base: @@ -2080,10 +2097,11 @@ static struct file *do_last(struct nameidata *nd, struct path *path, dir = nd->path.dentry; case LAST_DOT: if (need_reval_dot(dir)) { - if (!dir->d_op->d_revalidate(dir, nd)) { + error = d_revalidate(nd->path.dentry, nd); + if (!error) error = -ESTALE; + if (error < 0) goto exit; - } } /* fallthrough */ case LAST_ROOT: diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 4b9cbb28d7fa..28f136d4aaec 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -308,6 +309,9 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd) int res, val = 0, len; __u8 __name[NCP_MAXPATHLEN + 1]; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + parent = dget_parent(dentry); dir = parent->d_inode; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 0c75a5f3cafd..9531c052d7a4 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -29,6 +29,7 @@ #include #include #include +#include #include diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 37e0a8bb077e..58beace14b19 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -938,7 +938,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) * component of the path. * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT. */ -static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigned int mask) +static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, + unsigned int mask) { if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT)) return 0; @@ -1018,7 +1019,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, * If the parent directory is seen to have changed, we throw out the * cached dentry and do a new lookup. */ -static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) +static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir; struct inode *inode; @@ -1027,6 +1028,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) struct nfs_fattr *fattr = NULL; int error; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + parent = dget_parent(dentry); dir = parent->d_inode; nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 4d54c60ceee4..0310b16a7238 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -52,9 +52,15 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry) static int ocfs2_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; + struct inode *inode; int ret = 0; /* if all else fails, just return false */ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + struct ocfs2_super *osb; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + osb = OCFS2_SB(dentry->d_sb); mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); diff --git a/fs/proc/base.c b/fs/proc/base.c index 85f0a80912aa..dc5b2fcadc3b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1719,10 +1719,16 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat */ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); + struct inode *inode; + struct task_struct *task; const struct cred *cred; + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + if (task) { if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { @@ -1888,12 +1894,19 @@ static int proc_fd_link(struct inode *inode, struct path *path) static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - int fd = proc_fd(inode); + struct inode *inode; + struct task_struct *task; + int fd; struct files_struct *files; const struct cred *cred; + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + fd = proc_fd(inode); + if (task) { files = get_files_struct(task); if (files) { @@ -2563,8 +2576,14 @@ static const struct pid_entry proc_base_stuff[] = { */ static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); + struct inode *inode; + struct task_struct *task; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); if (task) { put_task_struct(task); return 1; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 35efd85a4d32..c9097f43b425 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -389,6 +390,8 @@ static const struct inode_operations proc_sys_dir_operations = { static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; return !PROC_I(dentry->d_inode)->sysctl->unregistering; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e0f0d7ea10a1..9ea22a56cdf1 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -972,6 +972,8 @@ int reiserfs_permission(struct inode *inode, int mask) static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; return -EPERM; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 3e076caa8daf..ea9120a830d8 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -239,9 +239,13 @@ static int sysfs_dentry_delete(const struct dentry *dentry) static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_dirent *sd; int is_dir; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + sd = dentry->d_fsdata; mutex_lock(&sysfs_mutex); /* The sysfs dirent has been deleted */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h index b1aeda077258..8b2064d02928 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -190,7 +190,6 @@ struct dentry_operations { #define DCACHE_OP_REVALIDATE 0x4000 #define DCACHE_OP_DELETE 0x8000 - extern spinlock_t dcache_inode_lock; extern seqlock_t rename_lock; -- cgit v1.2.3 From b74c79e99389cd79b31fcc08f82c24e492e63c7e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:58 +1100 Subject: fs: provide rcu-walk aware permission i_ops Signed-off-by: Nick Piggin --- Documentation/filesystems/Locking | 6 +-- Documentation/filesystems/path-lookup.txt | 44 ++++++++++++++++-- Documentation/filesystems/porting | 5 +++ Documentation/filesystems/vfs.txt | 10 ++++- drivers/staging/smbfs/file.c | 5 ++- fs/9p/acl.c | 5 ++- fs/9p/acl.h | 2 +- fs/afs/internal.h | 2 +- fs/afs/security.c | 7 ++- fs/bad_inode.c | 5 ++- fs/btrfs/acl.c | 6 ++- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 7 ++- fs/ceph/inode.c | 11 +++-- fs/ceph/super.h | 2 +- fs/cifs/cifsfs.c | 7 ++- fs/coda/dir.c | 5 ++- fs/coda/pioctl.c | 6 ++- fs/ecryptfs/inode.c | 4 +- fs/ext2/acl.c | 8 +++- fs/ext2/acl.h | 2 +- fs/ext3/acl.c | 8 +++- fs/ext3/acl.h | 2 +- fs/ext4/acl.c | 8 +++- fs/ext4/acl.h | 2 +- fs/fuse/dir.c | 10 +++-- fs/generic_acl.c | 8 +++- fs/gfs2/acl.c | 5 ++- fs/gfs2/acl.h | 2 +- fs/gfs2/file.c | 2 +- fs/gfs2/inode.c | 4 +- fs/gfs2/inode.h | 2 +- fs/gfs2/ops_inode.c | 18 +++++--- fs/hostfs/hostfs_kern.c | 7 ++- fs/hpfs/namei.c | 2 +- fs/jffs2/acl.c | 5 ++- fs/jffs2/acl.h | 2 +- fs/jfs/acl.c | 8 +++- fs/jfs/jfs_acl.h | 2 +- fs/logfs/dir.c | 6 ++- fs/namei.c | 75 ++++++++++++------------------- fs/nfs/dir.c | 7 ++- fs/nilfs2/inode.c | 10 +++-- fs/nilfs2/nilfs.h | 2 +- fs/ocfs2/acl.c | 8 +++- fs/ocfs2/acl.h | 2 +- fs/ocfs2/file.c | 7 ++- fs/ocfs2/file.h | 2 +- fs/proc/base.c | 6 ++- fs/proc/proc_sysctl.c | 5 ++- fs/reiserfs/xattr.c | 14 ++++-- fs/sysfs/inode.c | 11 +++-- fs/sysfs/sysfs.h | 2 +- fs/xfs/linux-2.6/xfs_acl.c | 8 +++- fs/xfs/xfs_acl.h | 2 +- include/linux/coda_linux.h | 2 +- include/linux/fs.h | 10 +++-- include/linux/generic_acl.h | 2 +- include/linux/nfs_fs.h | 2 +- include/linux/reiserfs_xattr.h | 2 +- 60 files changed, 287 insertions(+), 146 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index e90ffe61eb65..977d8919cc69 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -47,8 +47,8 @@ ata *); void * (*follow_link) (struct dentry *, struct nameidata *); void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); - int (*permission) (struct inode *, int, struct nameidata *); - int (*check_acl)(struct inode *, int); + int (*permission) (struct inode *, int, unsigned int); + int (*check_acl)(struct inode *, int, unsigned int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); @@ -76,7 +76,7 @@ follow_link: no put_link: no truncate: yes (see below) setattr: yes -permission: no +permission: no (may not block if called in rcu-walk mode) check_acl: no getattr: no setxattr: yes diff --git a/Documentation/filesystems/path-lookup.txt b/Documentation/filesystems/path-lookup.txt index 8789d1810bed..eb59c8b44be9 100644 --- a/Documentation/filesystems/path-lookup.txt +++ b/Documentation/filesystems/path-lookup.txt @@ -316,11 +316,9 @@ The detailed design for rcu-walk is like this: The cases where rcu-walk cannot continue are: * NULL dentry (ie. any uncached path element) -* parent with d_inode->i_op->permission or ACLs * Following links -In future patches, permission checks become rcu-walk aware. It may be possible -eventually to make following links rcu-walk aware. +It may be possible eventually to make following links rcu-walk aware. Uncached path elements will always require dropping to ref-walk mode, at the very least because i_mutex needs to be grabbed, and objects allocated. @@ -336,9 +334,49 @@ or stored into. The result is massive improvements in performance and scalability of path resolution. +Interesting statistics +====================== + +The following table gives rcu lookup statistics for a few simple workloads +(2s12c24t Westmere, debian non-graphical system). Ungraceful are attempts to +drop rcu that fail due to d_seq failure and requiring the entire path lookup +again. Other cases are successful rcu-drops that are required before the final +element, nodentry for missing dentry, revalidate for filesystem revalidate +routine requiring rcu drop, permission for permission check requiring drop, +and link for symlink traversal requiring drop. + + rcu-lookups restart nodentry link revalidate permission +bootup 47121 0 4624 1010 10283 7852 +dbench 25386793 0 6778659(26.7%) 55 549 1156 +kbuild 2696672 10 64442(2.3%) 108764(4.0%) 1 1590 +git diff 39605 0 28 2 0 106 +vfstest 24185492 4945 708725(2.9%) 1076136(4.4%) 0 2651 + +What this shows is that failed rcu-walk lookups, ie. ones that are restarted +entirely with ref-walk, are quite rare. Even the "vfstest" case which +specifically has concurrent renames/mkdir/rmdir/ creat/unlink/etc to excercise +such races is not showing a huge amount of restarts. + +Dropping from rcu-walk to ref-walk mean that we have encountered a dentry where +the reference count needs to be taken for some reason. This is either because +we have reached the target of the path walk, or because we have encountered a +condition that can't be resolved in rcu-walk mode. Ideally, we drop rcu-walk +only when we have reached the target dentry, so the other statistics show where +this does not happen. + +Note that a graceful drop from rcu-walk mode due to something such as the +dentry not existing (which can be common) is not necessarily a failure of +rcu-walk scheme, because some elements of the path may have been walked in +rcu-walk mode. The further we get from common path elements (such as cwd or +root), the less contended the dentry is likely to be. The closer we are to +common path elements, the more likely they will exist in dentry cache. + + Papers and other documentation on dcache locking ================================================ 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). 2. http://lse.sourceforge.net/locking/dcache/dcache.html + + diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index cd9756a2709d..07a32b42cf9c 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -379,4 +379,9 @@ where possible. the filesystem provides it), which requires dropping out of rcu-walk mode. This may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be returned if the filesystem cannot handle rcu-walk. See +Documentation/filesystems/vfs.txt for more details. + + permission and check_acl are inode permission checks that are called +on many or all directory inodes on the way down a path walk (to check for +exec permission). These must now be rcu-walk aware (flags & IPERM_RCU). See Documentation/filesystems/vfs.txt for more details. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index c936b4912383..fbb324e2bd43 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -325,7 +325,8 @@ struct inode_operations { void * (*follow_link) (struct dentry *, struct nameidata *); void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); - int (*permission) (struct inode *, int, struct nameidata *); + int (*permission) (struct inode *, int, unsigned int); + int (*check_acl)(struct inode *, int, unsigned int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); @@ -414,6 +415,13 @@ otherwise noted. permission: called by the VFS to check for access rights on a POSIX-like filesystem. + May be called in rcu-walk mode (flags & IPERM_RCU). If in rcu-walk + mode, the filesystem must check the permission without blocking or + storing to the inode. + + If a situation is encountered that rcu-walk cannot handle, return + -ECHILD and it will be called again in ref-walk mode. + setattr: called by the VFS to set attributes for a file. This method is called by chmod(2) and related system calls. diff --git a/drivers/staging/smbfs/file.c b/drivers/staging/smbfs/file.c index 5dcd19c60eb9..31372e7b12de 100644 --- a/drivers/staging/smbfs/file.c +++ b/drivers/staging/smbfs/file.c @@ -407,11 +407,14 @@ smb_file_release(struct inode *inode, struct file * file) * privileges, so we need our own check for this. */ static int -smb_file_permission(struct inode *inode, int mask) +smb_file_permission(struct inode *inode, int mask, unsigned int flags) { int mode = inode->i_mode; int error = 0; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + VERBOSE("mode=%x, mask=%x\n", mode, mask); /* Look at user permissions */ diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 12d602351dbe..6e58c4ca1e6e 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -91,11 +91,14 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) return acl; } -int v9fs_check_acl(struct inode *inode, int mask) +int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags) { struct posix_acl *acl; struct v9fs_session_info *v9ses; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + v9ses = v9fs_inode2v9ses(inode); if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { /* diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 59e18c2e8c7e..7ef3ac9f6d95 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -16,7 +16,7 @@ #ifdef CONFIG_9P_FS_POSIX_ACL extern int v9fs_get_acl(struct inode *, struct p9_fid *); -extern int v9fs_check_acl(struct inode *inode, int mask); +extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags); extern int v9fs_acl_chmod(struct dentry *); extern int v9fs_set_create_acl(struct dentry *, struct posix_acl *, struct posix_acl *); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index cca8eef736fc..6d4bc1c8ff60 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -624,7 +624,7 @@ extern void afs_clear_permits(struct afs_vnode *); extern void afs_cache_permit(struct afs_vnode *, struct key *, long); extern void afs_zap_permits(struct rcu_head *); extern struct key *afs_request_key(struct afs_cell *); -extern int afs_permission(struct inode *, int); +extern int afs_permission(struct inode *, int, unsigned int); /* * server.c diff --git a/fs/afs/security.c b/fs/afs/security.c index bb4ed144d0e4..f44b9d355377 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -285,13 +285,16 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key, * - AFS ACLs are attached to directories only, and a file is controlled by its * parent directory's ACL */ -int afs_permission(struct inode *inode, int mask) +int afs_permission(struct inode *inode, int mask, unsigned int flags) { struct afs_vnode *vnode = AFS_FS_I(inode); afs_access_t uninitialized_var(access); struct key *key; int ret; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + _enter("{{%x:%u},%lx},%x,", vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask); @@ -347,7 +350,7 @@ int afs_permission(struct inode *inode, int mask) } key_put(key); - ret = generic_permission(inode, mask, NULL); + ret = generic_permission(inode, mask, flags, NULL); _leave(" = %d", ret); return ret; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index f024d8aaddef..9ad2369d9e35 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -229,8 +229,11 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, return -EIO; } -static int bad_inode_permission(struct inode *inode, int mask) +static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + return -EIO; } diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 2222d161c7b6..cb518a4b917c 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -185,13 +185,15 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, return ret; } -int btrfs_check_acl(struct inode *inode, int mask) +int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags) { struct posix_acl *acl; int error = -EAGAIN; - acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index af52f6d7a4d8..a142d204b526 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2544,7 +2544,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL -int btrfs_check_acl(struct inode *inode, int mask); +int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags); #else #define btrfs_check_acl NULL #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 63e4546b478a..5cf0db0081f9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7211,11 +7211,14 @@ static int btrfs_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } -static int btrfs_permission(struct inode *inode, int mask) +static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) return -EACCES; - return generic_permission(inode, mask, btrfs_check_acl); + return generic_permission(inode, mask, flags, btrfs_check_acl); } static const struct inode_operations btrfs_dir_inode_operations = { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 47f8c8baf3b5..e61de4f7b99d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1781,12 +1781,17 @@ int ceph_do_getattr(struct inode *inode, int mask) * Check inode permissions. We verify we have a valid value for * the AUTH cap, then call the generic handler. */ -int ceph_permission(struct inode *inode, int mask) +int ceph_permission(struct inode *inode, int mask, unsigned int flags) { - int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); + int err; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); if (!err) - err = generic_permission(inode, mask, NULL); + err = generic_permission(inode, mask, flags, NULL); return err; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7f01728a4657..4553d8829edb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -665,7 +665,7 @@ extern void ceph_queue_invalidate(struct inode *inode); extern void ceph_queue_writeback(struct inode *inode); extern int ceph_do_getattr(struct inode *inode, int mask); -extern int ceph_permission(struct inode *inode, int mask); +extern int ceph_permission(struct inode *inode, int mask, unsigned int flags); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 223717dcc401..8e21e0fe65d5 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -283,10 +283,13 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int cifs_permission(struct inode *inode, int mask) +static int cifs_permission(struct inode *inode, int mask, unsigned int flags) { struct cifs_sb_info *cifs_sb; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + cifs_sb = CIFS_SB(inode->i_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { @@ -298,7 +301,7 @@ static int cifs_permission(struct inode *inode, int mask) on the client (above and beyond ACL on servers) for servers which do not support setting and viewing mode bits, so allowing client to check permissions is useful */ - return generic_permission(inode, mask, NULL); + return generic_permission(inode, mask, flags, NULL); } static struct kmem_cache *cifs_inode_cachep; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 619a8303766e..29badd91360f 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -135,10 +135,13 @@ exit: } -int coda_permission(struct inode *inode, int mask) +int coda_permission(struct inode *inode, int mask, unsigned int flags) { int error; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (!mask) diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 2fd89b5c5c7b..741f0bd03918 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -24,7 +24,7 @@ #include /* pioctl ops */ -static int coda_ioctl_permission(struct inode *inode, int mask); +static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags); static long coda_pioctl(struct file *filp, unsigned int cmd, unsigned long user_data); @@ -41,8 +41,10 @@ const struct file_operations coda_ioctl_operations = { }; /* the coda pioctl inode ops */ -static int coda_ioctl_permission(struct inode *inode, int mask) +static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; return (mask & MAY_EXEC) ? -EACCES : 0; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index f91b35db4c6e..337352a94751 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -980,8 +980,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) } static int -ecryptfs_permission(struct inode *inode, int mask) +ecryptfs_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; return inode_permission(ecryptfs_inode_to_lower(inode), mask); } diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 2bcc0431bada..dd9bb3f0c8d7 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -232,10 +232,14 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) } int -ext2_check_acl(struct inode *inode, int mask) +ext2_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 3ff6cbb9ac44..c939b7b12099 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -54,7 +54,7 @@ static inline int ext2_acl_count(size_t size) #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern int ext2_check_acl (struct inode *, int); +extern int ext2_check_acl (struct inode *, int, unsigned int); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 8a11fe212183..9e49da8302d3 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -240,10 +240,14 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext3_check_acl(struct inode *inode, int mask) +ext3_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 597334626de9..5faf8048e906 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -54,7 +54,7 @@ static inline int ext3_acl_count(size_t size) #ifdef CONFIG_EXT3_FS_POSIX_ACL /* acl.c */ -extern int ext3_check_acl (struct inode *, int); +extern int ext3_check_acl (struct inode *, int, unsigned int); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 5e2ed4504ead..373dcaeedba9 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -238,10 +238,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext4_check_acl(struct inode *inode, int mask) +ext4_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 9d843d5deac4..dec821168fd4 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_check_acl(struct inode *, int); +extern int ext4_check_acl(struct inode *, int, unsigned int); extern int ext4_acl_chmod(struct inode *); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 07f4b5e675fc..f738599fd8cd 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -985,12 +985,15 @@ static int fuse_access(struct inode *inode, int mask) * access request is sent. Execute permission is still checked * locally based on file mode. */ -static int fuse_permission(struct inode *inode, int mask) +static int fuse_permission(struct inode *inode, int mask, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; int err = 0; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + if (!fuse_allow_task(fc, current)) return -EACCES; @@ -1005,7 +1008,7 @@ static int fuse_permission(struct inode *inode, int mask) } if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { - err = generic_permission(inode, mask, NULL); + err = generic_permission(inode, mask, flags, NULL); /* If permission is denied, try to refresh file attributes. This is also needed, because the root @@ -1013,7 +1016,8 @@ static int fuse_permission(struct inode *inode, int mask) if (err == -EACCES && !refreshed) { err = fuse_do_getattr(inode, NULL, NULL); if (!err) - err = generic_permission(inode, mask, NULL); + err = generic_permission(inode, mask, + flags, NULL); } /* Note: the opposite of the above test does not diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 6bc9e3a5a693..628004282130 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -190,10 +190,14 @@ generic_acl_chmod(struct inode *inode) } int -generic_check_acl(struct inode *inode, int mask) +generic_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { int error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 48171f4c943d..7118f1a780a9 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -75,11 +75,14 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) * Returns: errno */ -int gfs2_check_acl(struct inode *inode, int mask) +int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags) { struct posix_acl *acl; int error; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index b522b0cb39ea..a93907c8159b 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -16,7 +16,7 @@ #define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" #define GFS2_ACL_MAX_ENTRIES 25 -extern int gfs2_check_acl(struct inode *inode, int mask); +extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int); extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); extern const struct xattr_handler gfs2_xattr_system_handler; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index aa996471ec5c..fca6689e12e6 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -241,7 +241,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) !capable(CAP_LINUX_IMMUTABLE)) goto out; if (!IS_IMMUTABLE(inode)) { - error = gfs2_permission(inode, MAY_WRITE); + error = gfs2_permission(inode, MAY_WRITE, 0); if (error) goto out; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e1213f7f9217..6163be9686be 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -509,7 +509,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, } if (!is_root) { - error = gfs2_permission(dir, MAY_EXEC); + error = gfs2_permission(dir, MAY_EXEC, 0); if (error) goto out; } @@ -539,7 +539,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, { int error; - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); if (error) return error; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index d8499fadcc53..732a183efdb3 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -113,7 +113,7 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, extern struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, unsigned int mode, dev_t dev); -extern int gfs2_permission(struct inode *inode, int mask); +extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags); extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index f28f89796f4d..5c63a06fcd7c 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -166,7 +166,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_child; - error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0); if (error) goto out_gunlock; @@ -289,7 +289,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, if (IS_APPEND(&dip->i_inode)) return -EPERM; - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); if (error) return error; @@ -822,7 +822,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } } } else { - error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0); if (error) goto out_gunlock; @@ -857,7 +857,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, /* Check out the dir to be renamed */ if (dir_rename) { - error = gfs2_permission(odentry->d_inode, MAY_WRITE); + error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0); if (error) goto out_gunlock; } @@ -1041,13 +1041,17 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p) * Returns: errno */ -int gfs2_permission(struct inode *inode, int mask) +int gfs2_permission(struct inode *inode, int mask, unsigned int flags) { - struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_inode *ip; struct gfs2_holder i_gh; int error; int unlock = 0; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + ip = GFS2_I(inode); if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) @@ -1058,7 +1062,7 @@ int gfs2_permission(struct inode *inode, int mask) if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) error = -EACCES; else - error = generic_permission(inode, mask, gfs2_check_acl); + error = generic_permission(inode, mask, flags, gfs2_check_acl); if (unlock) gfs2_glock_dq_uninit(&i_gh); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 0bc81cf256b8..d3244d949a4e 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -749,11 +749,14 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, return err; } -int hostfs_permission(struct inode *ino, int desired) +int hostfs_permission(struct inode *ino, int desired, unsigned int flags) { char *name; int r = 0, w = 0, x = 0, err; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + if (desired & MAY_READ) r = 1; if (desired & MAY_WRITE) w = 1; if (desired & MAY_EXEC) x = 1; @@ -768,7 +771,7 @@ int hostfs_permission(struct inode *ino, int desired) err = access_file(name, r, w, x); __putname(name); if (!err) - err = generic_permission(ino, desired, NULL); + err = generic_permission(ino, desired, flags, NULL); return err; } diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 11c2b4080f65..f4ad9e31ddc4 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -419,7 +419,7 @@ again: unlock_kernel(); return -ENOSPC; } - if (generic_permission(inode, MAY_WRITE, NULL) || + if (generic_permission(inode, MAY_WRITE, 0, NULL) || !S_ISREG(inode->i_mode) || get_write_access(inode)) { d_rehash(dentry); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 54a92fd02bbd..95b79672150a 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -259,11 +259,14 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return rc; } -int jffs2_check_acl(struct inode *inode, int mask) +int jffs2_check_acl(struct inode *inode, int mask, unsigned int flags) { struct posix_acl *acl; int rc; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 5e42de8d9541..3119f59253d3 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -26,7 +26,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL -extern int jffs2_check_acl(struct inode *, int); +extern int jffs2_check_acl(struct inode *, int, unsigned int); extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 1057a4998e4e..e5de9422fa32 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -114,10 +114,14 @@ out: return rc; } -int jfs_check_acl(struct inode *inode, int mask) +int jfs_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 54e07559878d..f9285c4900fa 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_check_acl(struct inode *, int); +int jfs_check_acl(struct inode *, int, unsigned int flags); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_acl_chmod(struct inode *inode); diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 409dfd65e9a1..f9ddf0c388c8 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -555,9 +555,11 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, return __logfs_create(dir, dentry, inode, target, destlen); } -static int logfs_permission(struct inode *inode, int mask) +static int logfs_permission(struct inode *inode, int mask, unsigned int flags) { - return generic_permission(inode, mask, NULL); + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + return generic_permission(inode, mask, flags, NULL); } static int logfs_link(struct dentry *old_dentry, struct inode *dir, diff --git a/fs/namei.c b/fs/namei.c index 6e275363e89d..4e957bf744ae 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname); /* * This does basic POSIX ACL permission checking */ -static inline int __acl_permission_check(struct inode *inode, int mask, - int (*check_acl)(struct inode *inode, int mask), int rcu) +static int acl_permission_check(struct inode *inode, int mask, unsigned int flags, + int (*check_acl)(struct inode *inode, int mask, unsigned int flags)) { umode_t mode = inode->i_mode; @@ -180,13 +180,9 @@ static inline int __acl_permission_check(struct inode *inode, int mask, mode >>= 6; else { if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { - if (rcu) { - return -ECHILD; - } else { - int error = check_acl(inode, mask); - if (error != -EAGAIN) - return error; - } + int error = check_acl(inode, mask, flags); + if (error != -EAGAIN) + return error; } if (in_group_p(inode->i_gid)) @@ -201,32 +197,31 @@ static inline int __acl_permission_check(struct inode *inode, int mask, return -EACCES; } -static inline int acl_permission_check(struct inode *inode, int mask, - int (*check_acl)(struct inode *inode, int mask)) -{ - return __acl_permission_check(inode, mask, check_acl, 0); -} - /** - * generic_permission - check for access rights on a Posix-like filesystem + * generic_permission - check for access rights on a Posix-like filesystem * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * @check_acl: optional callback to check for Posix ACLs + * @flags IPERM_FLAG_ flags. * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which - * are used for other things.. + * are used for other things. + * + * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk + * request cannot be satisfied (eg. requires blocking or too much complexity). + * It would then be called again in ref-walk mode. */ -int generic_permission(struct inode *inode, int mask, - int (*check_acl)(struct inode *inode, int mask)) +int generic_permission(struct inode *inode, int mask, unsigned int flags, + int (*check_acl)(struct inode *inode, int mask, unsigned int flags)) { int ret; /* * Do the basic POSIX ACL permission checks. */ - ret = acl_permission_check(inode, mask, check_acl); + ret = acl_permission_check(inode, mask, flags, check_acl); if (ret != -EACCES) return ret; @@ -281,9 +276,10 @@ int inode_permission(struct inode *inode, int mask) } if (inode->i_op->permission) - retval = inode->i_op->permission(inode, mask); + retval = inode->i_op->permission(inode, mask, 0); else - retval = generic_permission(inode, mask, inode->i_op->check_acl); + retval = generic_permission(inode, mask, 0, + inode->i_op->check_acl); if (retval) return retval; @@ -668,22 +664,19 @@ force_reval_path(struct path *path, struct nameidata *nd) * short-cut DAC fails, then call ->permission() to do more * complete permission check. */ -static inline int __exec_permission(struct inode *inode, int rcu) +static inline int exec_permission(struct inode *inode, unsigned int flags) { int ret; if (inode->i_op->permission) { - if (rcu) - return -ECHILD; - ret = inode->i_op->permission(inode, MAY_EXEC); - if (!ret) - goto ok; - return ret; + ret = inode->i_op->permission(inode, MAY_EXEC, flags); + } else { + ret = acl_permission_check(inode, MAY_EXEC, flags, + inode->i_op->check_acl); } - ret = __acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl, rcu); - if (!ret) + if (likely(!ret)) goto ok; - if (rcu && ret == -ECHILD) + if (ret == -ECHILD) return ret; if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) @@ -691,17 +684,7 @@ static inline int __exec_permission(struct inode *inode, int rcu) return ret; ok: - return security_inode_exec_permission(inode, rcu); -} - -static int exec_permission(struct inode *inode) -{ - return __exec_permission(inode, 0); -} - -static int exec_permission_rcu(struct inode *inode) -{ - return __exec_permission(inode, 1); + return security_inode_exec_permission(inode, flags); } static __always_inline void set_root(struct nameidata *nd) @@ -1165,7 +1148,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) nd->flags |= LOOKUP_CONTINUE; if (nd->flags & LOOKUP_RCU) { - err = exec_permission_rcu(nd->inode); + err = exec_permission(nd->inode, IPERM_FLAG_RCU); if (err == -ECHILD) { if (nameidata_drop_rcu(nd)) return -ECHILD; @@ -1173,7 +1156,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) } } else { exec_again: - err = exec_permission(nd->inode); + err = exec_permission(nd->inode, 0); } if (err) break; @@ -1620,7 +1603,7 @@ static struct dentry *__lookup_hash(struct qstr *name, struct dentry *dentry; int err; - err = exec_permission(inode); + err = exec_permission(inode, 0); if (err) return ERR_PTR(err); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 58beace14b19..d33da530097a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2189,11 +2189,14 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); } -int nfs_permission(struct inode *inode, int mask) +int nfs_permission(struct inode *inode, int mask, unsigned int flags) { struct rpc_cred *cred; int res = 0; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + nfs_inc_stats(inode, NFSIOS_VFSACCESS); if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) @@ -2241,7 +2244,7 @@ out: out_notsup: res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res == 0) - res = generic_permission(inode, mask, NULL); + res = generic_permission(inode, mask, flags, NULL); goto out; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 71d4bc8464e0..77b48c8fab17 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -785,15 +785,19 @@ out_err: return err; } -int nilfs_permission(struct inode *inode, int mask) +int nilfs_permission(struct inode *inode, int mask, unsigned int flags) { - struct nilfs_root *root = NILFS_I(inode)->i_root; + struct nilfs_root *root; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + root = NILFS_I(inode)->i_root; if ((mask & MAY_WRITE) && root && root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ - return generic_permission(inode, mask, NULL); + return generic_permission(inode, mask, flags, NULL); } int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index f7560da5a567..0ca98823db59 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -256,7 +256,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); extern int nilfs_setattr(struct dentry *, struct iattr *); -int nilfs_permission(struct inode *inode, int mask); +int nilfs_permission(struct inode *inode, int mask, unsigned int flags); extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, struct buffer_head **); extern int nilfs_inode_dirty(struct inode *); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 391915093fe1..704f6b1742f3 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -291,13 +291,17 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_check_acl(struct inode *inode, int mask) +int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; int ret = -EAGAIN; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return ret; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 5c5d31f05853..4fe7c9cf4bfb 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -26,7 +26,7 @@ struct ocfs2_acl_entry { __le32 e_id; }; -extern int ocfs2_check_acl(struct inode *, int); +extern int ocfs2_check_acl(struct inode *, int, unsigned int); extern int ocfs2_acl_chmod(struct inode *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index f6cba566429d..bdadbae09094 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1307,10 +1307,13 @@ bail: return err; } -int ocfs2_permission(struct inode *inode, int mask) +int ocfs2_permission(struct inode *inode, int mask, unsigned int flags) { int ret; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + mlog_entry_void(); ret = ocfs2_inode_lock(inode, NULL, 0); @@ -1320,7 +1323,7 @@ int ocfs2_permission(struct inode *inode, int mask) goto out; } - ret = generic_permission(inode, mask, ocfs2_check_acl); + ret = generic_permission(inode, mask, flags, ocfs2_check_acl); ocfs2_inode_unlock(inode, 0); out: diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 97bf761c9e7c..f5afbbef6703 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -61,7 +61,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int ocfs2_permission(struct inode *inode, int mask); +int ocfs2_permission(struct inode *inode, int mask, unsigned int flags); int ocfs2_should_update_atime(struct inode *inode, struct vfsmount *vfsmnt); diff --git a/fs/proc/base.c b/fs/proc/base.c index dc5b2fcadc3b..b953d41d9abf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2114,11 +2114,13 @@ static const struct file_operations proc_fd_operations = { * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). */ -static int proc_fd_permission(struct inode *inode, int mask) +static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags) { int rv; - rv = generic_permission(inode, mask, NULL); + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + rv = generic_permission(inode, mask, flags, NULL); if (rv == 0) return 0; if (task_pid(current) == proc_pid(inode)) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c9097f43b425..09a1f92a34ef 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -295,7 +295,7 @@ out: return ret; } -static int proc_sys_permission(struct inode *inode, int mask) +static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags) { /* * sysctl entries that are not writeable, @@ -305,6 +305,9 @@ static int proc_sys_permission(struct inode *inode, int mask) struct ctl_table *table; int error; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 9ea22a56cdf1..3cfb2e933644 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -870,11 +870,14 @@ out: return err; } -static int reiserfs_check_acl(struct inode *inode, int mask) +static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags) { struct posix_acl *acl; int error = -EAGAIN; /* do regular unix permission checks by default */ + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); if (acl) { @@ -951,8 +954,10 @@ static int xattr_mount_check(struct super_block *s) return 0; } -int reiserfs_permission(struct inode *inode, int mask) +int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; /* * We don't do permission checks on the internal objects. * Permissions are determined by the "owning" object. @@ -965,9 +970,10 @@ int reiserfs_permission(struct inode *inode, int mask) * Stat data v1 doesn't support ACLs. */ if (get_inode_sd_version(inode) != STAT_DATA_V1) - return generic_permission(inode, mask, reiserfs_check_acl); + return generic_permission(inode, mask, flags, + reiserfs_check_acl); #endif - return generic_permission(inode, mask, NULL); + return generic_permission(inode, mask, flags, NULL); } static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index cffb1fd8ba33..30ac27345586 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -348,13 +348,18 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha return -ENOENT; } -int sysfs_permission(struct inode *inode, int mask) +int sysfs_permission(struct inode *inode, int mask, unsigned int flags) { - struct sysfs_dirent *sd = inode->i_private; + struct sysfs_dirent *sd; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + sd = inode->i_private; mutex_lock(&sysfs_mutex); sysfs_refresh_inode(sd, inode); mutex_unlock(&sysfs_mutex); - return generic_permission(inode, mask, NULL); + return generic_permission(inode, mask, flags, NULL); } diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d9be60a2e956..ffaaa816bfba 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -200,7 +200,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); void sysfs_evict_inode(struct inode *inode); int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); -int sysfs_permission(struct inode *inode, int mask); +int sysfs_permission(struct inode *inode, int mask, unsigned int flags); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index b2771862fd3d..4b11eaf6a580 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -219,12 +219,16 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) } int -xfs_check_acl(struct inode *inode, int mask) +xfs_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip; struct posix_acl *acl; int error = -EAGAIN; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + ip = XFS_I(inode); trace_xfs_check_acl(ip); /* diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 0135e2a669d7..11dd72070cbb 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -42,7 +42,7 @@ struct xfs_acl { #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) #ifdef CONFIG_XFS_POSIX_ACL -extern int xfs_check_acl(struct inode *inode, int mask); +extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags); extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); extern int xfs_acl_chmod(struct inode *inode); diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h index 2e914d0771b9..4ccc59c1ea82 100644 --- a/include/linux/coda_linux.h +++ b/include/linux/coda_linux.h @@ -37,7 +37,7 @@ extern const struct file_operations coda_ioctl_operations; /* operations shared over more than one file */ int coda_open(struct inode *i, struct file *f); int coda_release(struct inode *i, struct file *f); -int coda_permission(struct inode *inode, int mask); +int coda_permission(struct inode *inode, int mask, unsigned int flags); int coda_revalidate_inode(struct dentry *); int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); int coda_setattr(struct dentry *, struct iattr *); diff --git a/include/linux/fs.h b/include/linux/fs.h index a04aa96acb71..d5a4d42f655b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1550,11 +1550,13 @@ struct file_operations { int (*setlease)(struct file *, long, struct file_lock **); }; +#define IPERM_FLAG_RCU 0x0001 + struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); void * (*follow_link) (struct dentry *, struct nameidata *); - int (*permission) (struct inode *, int); - int (*check_acl)(struct inode *, int); + int (*permission) (struct inode *, int, unsigned int); + int (*check_acl)(struct inode *, int, unsigned int); int (*readlink) (struct dentry *, char __user *,int); void (*put_link) (struct dentry *, struct nameidata *, void *); @@ -2165,8 +2167,8 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *); extern int inode_permission(struct inode *, int); -extern int generic_permission(struct inode *, int, - int (*check_acl)(struct inode *, int)); +extern int generic_permission(struct inode *, int, unsigned int, + int (*check_acl)(struct inode *, int, unsigned int)); static inline bool execute_ok(struct inode *inode) { diff --git a/include/linux/generic_acl.h b/include/linux/generic_acl.h index 574bea4013b6..0437e377b555 100644 --- a/include/linux/generic_acl.h +++ b/include/linux/generic_acl.h @@ -10,6 +10,6 @@ extern const struct xattr_handler generic_acl_default_handler; int generic_acl_init(struct inode *, struct inode *); int generic_acl_chmod(struct inode *); -int generic_check_acl(struct inode *inode, int mask); +int generic_check_acl(struct inode *inode, int mask, unsigned int flags); #endif /* LINUX_GENERIC_ACL_H */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 29d504d5d1c3..0779bb8f95be 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -351,7 +351,7 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -extern int nfs_permission(struct inode *, int); +extern int nfs_permission(struct inode *, int, unsigned int); extern int nfs_open(struct inode *, struct file *); extern int nfs_release(struct inode *, struct file *); extern int nfs_attribute_timeout(struct inode *inode); diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index b2cf2089769b..3b94c91f20a6 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -41,7 +41,7 @@ int reiserfs_xattr_init(struct super_block *sb, int mount_flags); int reiserfs_lookup_privroot(struct super_block *sb); int reiserfs_delete_xattrs(struct inode *inode); int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); -int reiserfs_permission(struct inode *inode, int mask); +int reiserfs_permission(struct inode *inode, int mask, unsigned int flags); #ifdef CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) -- cgit v1.2.3