From 8786fde8421ce755a842051f9528674a1b1f0b9a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 22 Jan 2022 20:54:52 +0000 Subject: Convert NFS from readpages to readahead NFS is one of the last two users of the deprecated ->readpages aop. This conversion looks straightforward, but I have only compile-tested it. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 68f81d8d36de..333ea05e2531 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -601,8 +601,7 @@ nfs_have_writebacks(struct inode *inode) * linux/fs/nfs/read.c */ extern int nfs_readpage(struct file *, struct page *); -extern int nfs_readpages(struct file *, struct address_space *, - struct list_head *, unsigned); +void nfs_readahead(struct readahead_control *); /* * inline functions -- cgit v1.2.3 From 43245eca6e670ebf65908b549641c1460a9cc944 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 2 Feb 2022 17:55:02 -0500 Subject: NFSv4.1 support for NFS4_RESULT_PRESERVER_UNLINKED In 4.1+, the server is allowed to set a flag NFS4_RESULT_PRESERVE_UNLINKED in reply to the OPEN, that tells the client that it does not need to do a silly rename of an opened file when it's being removed. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 10 ++++++++-- fs/nfs/nfs4proc.c | 2 ++ include/linux/nfs_fs.h | 1 + include/uapi/linux/nfs4.h | 1 + 4 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index fbb4a522d716..8b190c8e4a45 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1419,7 +1419,12 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) if (flags & LOOKUP_REVAL) goto out_force; out: - return (inode->i_nlink == 0) ? -ESTALE : 0; + if (inode->i_nlink > 0 || + (inode->i_nlink == 0 && + test_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(inode)->flags))) + return 0; + else + return -ESTALE; out_force: if (flags & LOOKUP_RCU) return -ECHILD; @@ -2330,7 +2335,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) trace_nfs_unlink_enter(dir, dentry); spin_lock(&dentry->d_lock); - if (d_count(dentry) > 1) { + if (d_count(dentry) > 1 && !test_bit(NFS_INO_PRESERVE_UNLINKED, + &NFS_I(d_inode(dentry))->flags)) { spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(d_inode(dentry), 0); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b3793b82a5e7..73a9b6de666c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3050,6 +3050,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK) set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags); + if (opendata->o_res.rflags & NFS4_OPEN_RESULT_PRESERVE_UNLINKED) + set_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(state->inode)->flags); dentry = opendata->dentry; if (d_really_is_negative(dentry)) { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 333ea05e2531..0e79dbbc759a 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -277,6 +277,7 @@ struct nfs4_copy_state { #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ +#define NFS_INO_PRESERVE_UNLINKED (4) /* preserve file if removed while open */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FORCE_READDIR (7) /* force readdirplus */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index 800bb0ffa6e6..1d2043708bf1 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -45,6 +45,7 @@ #define NFS4_OPEN_RESULT_CONFIRM 0x0002 #define NFS4_OPEN_RESULT_LOCKTYPE_POSIX 0x0004 +#define NFS4_OPEN_RESULT_PRESERVE_UNLINKED 0x0008 #define NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK 0x0020 #define NFS4_SHARE_ACCESS_MASK 0x000F -- cgit v1.2.3 From 88a6099fc3274a27814d26dd688fdc5cd7a480ee Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 9 Feb 2022 13:22:48 -0500 Subject: NFS: Replace last uses of NFS_INO_REVAL_PAGECACHE Now that we have more fine grained attribute revalidation, let's just get rid of NFS_INO_REVAL_PAGECACHE. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 24 +++++++++++------------- fs/nfs/write.c | 2 +- include/linux/nfs_fs.h | 8 +++----- 3 files changed, 15 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8cf29c6cd9f9..3adf8b4a0079 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -236,19 +236,17 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_XATTR - | NFS_INO_REVAL_PAGECACHE); - } else - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_XATTR - | NFS_INO_REVAL_PAGECACHE); + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | + NFS_INO_INVALID_DATA | + NFS_INO_INVALID_ACCESS | + NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | + NFS_INO_INVALID_ACCESS | + NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR); nfs_zap_label_cache_locked(nfsi); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 987a187bd39a..f88b0eb9b18e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -306,7 +306,7 @@ static void nfs_set_pageerror(struct address_space *mapping) /* Force file size revalidation */ spin_lock(&inode->i_lock); nfs_set_cache_invalid(inode, NFS_INO_REVAL_FORCED | - NFS_INO_REVAL_PAGECACHE | + NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); spin_unlock(&inode->i_lock); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 0e79dbbc759a..ce3128e4bffa 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -356,11 +356,9 @@ static inline void nfs_mark_for_revalidate(struct inode *inode) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_CTIME; + nfsi->cache_validity |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | + NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_SIZE; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); -- cgit v1.2.3 From 41e97b7f8a15d15da03ca15e6ff7b9b7ab7f588c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 9 Feb 2022 13:26:19 -0500 Subject: NFS: Remove unused flag NFS_INO_REVAL_PAGECACHE Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 5 ++--- fs/nfs/nfstrace.h | 1 - include/linux/nfs_fs.h | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3adf8b4a0079..f9fc506ebb29 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -203,14 +203,13 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) NFS_INO_INVALID_OTHER | NFS_INO_INVALID_XATTR); flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); - } else if (flags & NFS_INO_REVAL_PAGECACHE) - flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE; + } if (!nfs_has_xattr_cache(nfsi)) flags &= ~NFS_INO_INVALID_XATTR; if (flags & NFS_INO_INVALID_DATA) nfs_fscache_invalidate(inode, 0); - flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); + flags &= ~NFS_INO_REVAL_FORCED; nfsi->cache_validity |= flags; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 4611aa3a21a4..45a310b586ce 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -21,7 +21,6 @@ { NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \ { NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \ { NFS_INO_INVALID_ACL, "INVALID_ACL" }, \ - { NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \ { NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \ { NFS_INO_INVALID_LABEL, "INVALID_LABEL" }, \ { NFS_INO_INVALID_CHANGE, "INVALID_CHANGE" }, \ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index ce3128e4bffa..72a732a5103c 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -247,7 +247,6 @@ struct nfs4_copy_state { #define NFS_INO_INVALID_ATIME BIT(2) /* cached atime is invalid */ #define NFS_INO_INVALID_ACCESS BIT(3) /* cached access cred invalid */ #define NFS_INO_INVALID_ACL BIT(4) /* cached acls are invalid */ -#define NFS_INO_REVAL_PAGECACHE BIT(5) /* must revalidate pagecache */ #define NFS_INO_REVAL_FORCED BIT(6) /* force revalidation ignoring a delegation */ #define NFS_INO_INVALID_LABEL BIT(7) /* cached label is invalid */ #define NFS_INO_INVALID_CHANGE BIT(8) /* cached change is invalid */ -- cgit v1.2.3 From 1a93b82c59ab45f2d9f14c47f09d2e341ff02381 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 18 Feb 2022 07:07:08 -0500 Subject: NFS: constify nfs_server_capable() and nfs_have_writebacks() Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 72a732a5103c..6e10725887d1 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -363,7 +363,7 @@ static inline void nfs_mark_for_revalidate(struct inode *inode) spin_unlock(&inode->i_lock); } -static inline int nfs_server_capable(struct inode *inode, int cap) +static inline int nfs_server_capable(const struct inode *inode, int cap) { return NFS_SERVER(inode)->caps & cap; } @@ -587,12 +587,11 @@ extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); extern void nfs_commit_free(struct nfs_commit_data *data); bool nfs_commit_end(struct nfs_mds_commit_info *cinfo); -static inline int -nfs_have_writebacks(struct inode *inode) +static inline bool nfs_have_writebacks(const struct inode *inode) { if (S_ISREG(inode->i_mode)) return atomic_long_read(&NFS_I(inode)->nrequests) != 0; - return 0; + return false; } /* -- cgit v1.2.3 From 728dd0ab37421396927749fc8cec9c2009c526c8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Feb 2022 08:59:33 -0500 Subject: NFS: Don't re-read the entire page cache to find the next cookie If the page cache entry that was last read gets invalidated for some reason, then make sure we can re-create it on the next call to readdir. This, combined with the cache page validation, allows us to reuse the cached value of page-index on successive calls to nfs_readdir. Credit is due to Benjamin Coddington for showing that the concept works, and that it allows for improved cache sharing between processes even in the case where pages are lost due to LRU or active invalidation. Suggested-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 10 +++++++--- include/linux/nfs_fs.h | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a1767f755460..93f70698e401 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1120,6 +1120,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->dup_cookie = dir_ctx->dup_cookie; desc->duped = dir_ctx->duped; page_index = dir_ctx->page_index; + desc->page_index = page_index; + desc->last_cookie = dir_ctx->last_cookie; desc->attr_gencount = dir_ctx->attr_gencount; desc->eof = dir_ctx->eof; memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); @@ -1168,6 +1170,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) spin_lock(&file->f_lock); dir_ctx->dir_cookie = desc->dir_cookie; dir_ctx->dup_cookie = desc->dup_cookie; + dir_ctx->last_cookie = desc->last_cookie; dir_ctx->duped = desc->duped; dir_ctx->attr_gencount = desc->attr_gencount; dir_ctx->page_index = desc->page_index; @@ -1209,10 +1212,11 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) } if (offset != filp->f_pos) { filp->f_pos = offset; - if (nfs_readdir_use_cookie(filp)) - dir_ctx->dir_cookie = offset; - else + if (!nfs_readdir_use_cookie(filp)) { dir_ctx->dir_cookie = 0; + dir_ctx->page_index = 0; + } else + dir_ctx->dir_cookie = offset; if (offset == 0) memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); dir_ctx->duped = 0; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 6e10725887d1..1c533f2c1f36 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -105,6 +105,7 @@ struct nfs_open_dir_context { __be32 verf[NFS_DIR_VERIFIER_SIZE]; __u64 dir_cookie; __u64 dup_cookie; + __u64 last_cookie; pgoff_t page_index; signed char duped; bool eof; -- cgit v1.2.3 From 580f236737d13ee25d5b0b1d124f50014fe6833b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 7 Feb 2022 13:37:00 -0500 Subject: NFS: Adjust the amount of readahead performed by NFS readdir The current NFS readdir code will always try to maximise the amount of readahead it performs on the assumption that we can cache anything that isn't immediately read by the process. There are several cases where this assumption breaks down, including when the 'ls -l' heuristic kicks in to try to force use of readdirplus as a batch replacement for lookup/getattr. This patch therefore tries to tone down the amount of readahead we perform, and adjust it to try to match the amount of data being requested by user space. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/nfs_fs.h | 1 + 2 files changed, 53 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 60f7feee0a16..520dc3ec4aef 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -69,6 +69,8 @@ const struct address_space_operations nfs_dir_aops = { .freepage = nfs_readdir_clear_array, }; +#define NFS_INIT_DTSIZE PAGE_SIZE + static struct nfs_open_dir_context * alloc_nfs_open_dir_context(struct inode *dir) { @@ -78,6 +80,7 @@ alloc_nfs_open_dir_context(struct inode *dir) ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); if (ctx != NULL) { ctx->attr_gencount = nfsi->attr_gencount; + ctx->dtsize = NFS_INIT_DTSIZE; spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) @@ -154,6 +157,7 @@ struct nfs_readdir_descriptor { struct page *page; struct dir_context *ctx; pgoff_t page_index; + pgoff_t page_index_max; u64 dir_cookie; u64 last_cookie; u64 dup_cookie; @@ -166,12 +170,36 @@ struct nfs_readdir_descriptor { unsigned long gencount; unsigned long attr_gencount; unsigned int cache_entry_index; + unsigned int buffer_fills; + unsigned int dtsize; signed char duped; bool plus; bool eob; bool eof; }; +static void nfs_set_dtsize(struct nfs_readdir_descriptor *desc, unsigned int sz) +{ + struct nfs_server *server = NFS_SERVER(file_inode(desc->file)); + unsigned int maxsize = server->dtsize; + + if (sz > maxsize) + sz = maxsize; + if (sz < NFS_MIN_FILE_IO_SIZE) + sz = NFS_MIN_FILE_IO_SIZE; + desc->dtsize = sz; +} + +static void nfs_shrink_dtsize(struct nfs_readdir_descriptor *desc) +{ + nfs_set_dtsize(desc, desc->dtsize >> 1); +} + +static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc) +{ + nfs_set_dtsize(desc, desc->dtsize << 1); +} + static void nfs_readdir_array_init(struct nfs_cache_array *array) { memset(array, 0, sizeof(struct nfs_cache_array)); @@ -784,6 +812,7 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, break; arrays++; *arrays = page = new; + desc->page_index_max++; } else { new = nfs_readdir_page_get_next(mapping, page->index + 1, @@ -793,6 +822,7 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, if (page != *arrays) nfs_readdir_page_unlock_and_put(page); page = new; + desc->page_index_max = new->index; } status = nfs_readdir_add_to_array(entry, page); } while (!status && !entry->eof); @@ -858,7 +888,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry; size_t array_size; struct inode *inode = file_inode(desc->file); - size_t dtsize = NFS_SERVER(inode)->dtsize; + unsigned int dtsize = desc->dtsize; int status = -ENOMEM; entry = kzalloc(sizeof(*entry), GFP_KERNEL); @@ -894,6 +924,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, status = nfs_readdir_page_filler(desc, entry, pages, pglen, arrays, narrays); + desc->buffer_fills++; } while (!status && nfs_readdir_page_needs_filling(page) && page_mapping(page)); @@ -941,6 +972,10 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) if (!desc->page) return -ENOMEM; if (nfs_readdir_page_needs_filling(desc->page)) { + /* Grow the dtsize if we had to go back for more pages */ + if (desc->page_index == desc->page_index_max) + nfs_grow_dtsize(desc); + desc->page_index_max = desc->page_index; res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf, &desc->page, 1); if (res < 0) { @@ -1075,6 +1110,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) desc->cache_entry_index = 0; desc->last_cookie = desc->dir_cookie; desc->duped = 0; + desc->page_index_max = 0; status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); @@ -1084,10 +1120,22 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) } desc->page = NULL; + /* + * Grow the dtsize if we have to go back for more pages, + * or shrink it if we're reading too many. + */ + if (!desc->eof) { + if (!desc->eob) + nfs_grow_dtsize(desc); + else if (desc->buffer_fills == 1 && + i < (desc->page_index_max >> 1)) + nfs_shrink_dtsize(desc); + } for (i = 0; i < sz && arrays[i]; i++) nfs_readdir_page_array_free(arrays[i]); out: + desc->page_index_max = -1; kfree(arrays); dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; @@ -1126,6 +1174,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->file = file; desc->ctx = ctx; desc->plus = nfs_use_readdirplus(inode, ctx); + desc->page_index_max = -1; spin_lock(&file->f_lock); desc->dir_cookie = dir_ctx->dir_cookie; @@ -1136,6 +1185,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->last_cookie = dir_ctx->last_cookie; desc->attr_gencount = dir_ctx->attr_gencount; desc->eof = dir_ctx->eof; + nfs_set_dtsize(desc, dir_ctx->dtsize); memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); spin_unlock(&file->f_lock); @@ -1187,6 +1237,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->attr_gencount = desc->attr_gencount; dir_ctx->page_index = desc->page_index; dir_ctx->eof = desc->eof; + dir_ctx->dtsize = desc->dtsize; memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); spin_unlock(&file->f_lock); out_free: diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 1c533f2c1f36..691a27936849 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -107,6 +107,7 @@ struct nfs_open_dir_context { __u64 dup_cookie; __u64 last_cookie; pgoff_t page_index; + unsigned int dtsize; signed char duped; bool eof; }; -- cgit v1.2.3 From 230bc98f7a2a49eb472d184bdec91fd3096384b3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 17 Feb 2022 11:08:24 -0500 Subject: NFS: Improve heuristic for readdirplus The heuristic for readdirplus is designed to try to detect 'ls -l' and similar patterns. It does so by looking for cache hit/miss patterns in both the attribute cache and in the dcache of the files in a given directory, and then sets a flag for the readdirplus code to interpret. The problem with this approach is that a single attribute or dcache miss can cause the NFS code to force a refresh of the attributes for the entire set of files contained in the directory. To be able to make a more nuanced decision, let's sample the number of hits and misses in the set of open directory descriptors. That allows us to set thresholds at which we start preferring READDIRPLUS over regular READDIR, or at which we start to force a re-read of the remaining readdir cache using READDIRPLUS. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 80 ++++++++++++++++++++++++++++++++------------------ fs/nfs/inode.c | 4 +-- fs/nfs/internal.h | 4 +-- fs/nfs/nfstrace.h | 1 - include/linux/nfs_fs.h | 5 ++-- 5 files changed, 58 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index dc6acfd14fc7..098fc1bdaac8 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -87,8 +87,7 @@ alloc_nfs_open_dir_context(struct inode *dir) nfs_set_cache_invalid(dir, NFS_INO_INVALID_DATA | NFS_INO_REVAL_FORCED); - list_add(&ctx->list, &nfsi->open_files); - clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags); + list_add_tail_rcu(&ctx->list, &nfsi->open_files); memcpy(ctx->verf, nfsi->cookieverf, sizeof(ctx->verf)); spin_unlock(&dir->i_lock); return ctx; @@ -99,9 +98,9 @@ alloc_nfs_open_dir_context(struct inode *dir) static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx) { spin_lock(&dir->i_lock); - list_del(&ctx->list); + list_del_rcu(&ctx->list); spin_unlock(&dir->i_lock); - kfree(ctx); + kfree_rcu(ctx, rcu_head); } /* @@ -594,7 +593,6 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; - clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); desc->plus = arg.plus = false; goto again; } @@ -644,51 +642,63 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) return 1; } -static -bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx) +#define NFS_READDIR_CACHE_USAGE_THRESHOLD (8UL) + +static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx, + unsigned int cache_hits, + unsigned int cache_misses) { if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) return false; - if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) - return true; - if (ctx->pos == 0) + if (ctx->pos == 0 || + cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD) return true; return false; } /* - * This function is called by the lookup and getattr code to request the + * This function is called by the getattr code to request the * use of readdirplus to accelerate any future lookups in the same * directory. */ -void nfs_advise_use_readdirplus(struct inode *dir) +void nfs_readdir_record_entry_cache_hit(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && - !list_empty(&nfsi->open_files)) - set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); + S_ISDIR(dir->i_mode)) { + rcu_read_lock(); + list_for_each_entry_rcu (ctx, &nfsi->open_files, list) + atomic_inc(&ctx->cache_hits); + rcu_read_unlock(); + } } /* * This function is mainly for use by nfs_getattr(). * * If this is an 'ls -l', we want to force use of readdirplus. - * Do this by checking if there is an active file descriptor - * and calling nfs_advise_use_readdirplus, then forcing a - * cache flush. */ -void nfs_force_use_readdirplus(struct inode *dir) +void nfs_readdir_record_entry_cache_miss(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && - !list_empty(&nfsi->open_files)) { - set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); - set_bit(NFS_INO_FORCE_READDIR, &nfsi->flags); + S_ISDIR(dir->i_mode)) { + rcu_read_lock(); + list_for_each_entry_rcu (ctx, &nfsi->open_files, list) + atomic_inc(&ctx->cache_misses); + rcu_read_unlock(); } } +static void nfs_lookup_advise_force_readdirplus(struct inode *dir) +{ + nfs_readdir_record_entry_cache_miss(dir); +} + static void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, unsigned long dir_verifier) @@ -1122,6 +1132,19 @@ out: return status; } +#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL) + +static void nfs_readdir_handle_cache_misses(struct inode *inode, + struct nfs_readdir_descriptor *desc, + pgoff_t page_index, + unsigned int cache_misses) +{ + if (desc->ctx->pos == 0 || + cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD) + return; + invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1); +} + /* The file offset position represents the dirent entry number. A last cookie cache takes care of the common case of reading the whole directory. @@ -1133,6 +1156,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_dir_context *dir_ctx = file->private_data; struct nfs_readdir_descriptor *desc; + unsigned int cache_hits, cache_misses; pgoff_t page_index; int res; @@ -1154,7 +1178,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) goto out; desc->file = file; desc->ctx = ctx; - desc->plus = nfs_use_readdirplus(inode, ctx); desc->page_index_max = -1; spin_lock(&file->f_lock); @@ -1168,6 +1191,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->eof = dir_ctx->eof; nfs_set_dtsize(desc, dir_ctx->dtsize); memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); + cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0); + cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0); spin_unlock(&file->f_lock); if (desc->eof) { @@ -1175,9 +1200,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) && - list_is_singular(&nfsi->open_files)) - invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1); + desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses); + nfs_readdir_handle_cache_misses(inode, desc, page_index, cache_misses); do { res = readdir_search_pagecache(desc); @@ -1196,7 +1220,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; } if (res == -ETOOSMALL && desc->plus) { - clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); nfs_zap_caches(inode); desc->page_index = 0; desc->plus = false; @@ -1610,7 +1633,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, nfs_set_verifier(dentry, dir_verifier); /* set a readdirplus hint that we had a cache miss */ - nfs_force_use_readdirplus(dir); + nfs_lookup_advise_force_readdirplus(dir); ret = 1; out: nfs_free_fattr(fattr); @@ -1667,7 +1690,6 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, nfs_mark_dir_for_revalidate(dir); goto out_bad; } - nfs_advise_use_readdirplus(dir); goto out_valid; } @@ -1872,7 +1894,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in goto out; /* Notify readdir to use READDIRPLUS */ - nfs_force_use_readdirplus(dir); + nfs_lookup_advise_force_readdirplus(dir); no_entry: res = d_splice_alias(inode, dentry); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7cecabf57b95..bbf4357ff727 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -787,7 +787,7 @@ static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry) if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) return; parent = dget_parent(dentry); - nfs_force_use_readdirplus(d_inode(parent)); + nfs_readdir_record_entry_cache_miss(d_inode(parent)); dput(parent); } @@ -798,7 +798,7 @@ static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) return; parent = dget_parent(dentry); - nfs_advise_use_readdirplus(d_inode(parent)); + nfs_readdir_record_entry_cache_hit(d_inode(parent)); dput(parent); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index b5398af53c7f..194840a97e3a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -366,8 +366,8 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); /* dir.c */ -extern void nfs_advise_use_readdirplus(struct inode *dir); -extern void nfs_force_use_readdirplus(struct inode *dir); +extern void nfs_readdir_record_entry_cache_hit(struct inode *dir); +extern void nfs_readdir_record_entry_cache_miss(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 45a310b586ce..3672f6703ee7 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -36,7 +36,6 @@ #define nfs_show_nfsi_flags(v) \ __print_flags(v, "|", \ - { BIT(NFS_INO_ADVISE_RDPLUS), "ADVISE_RDPLUS" }, \ { BIT(NFS_INO_STALE), "STALE" }, \ { BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \ { BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 691a27936849..20a4cf0acad2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -101,6 +101,8 @@ struct nfs_open_context { struct nfs_open_dir_context { struct list_head list; + atomic_t cache_hits; + atomic_t cache_misses; unsigned long attr_gencount; __be32 verf[NFS_DIR_VERIFIER_SIZE]; __u64 dir_cookie; @@ -110,6 +112,7 @@ struct nfs_open_dir_context { unsigned int dtsize; signed char duped; bool eof; + struct rcu_head rcu_head; }; /* @@ -274,13 +277,11 @@ struct nfs4_copy_state { /* * Bit offsets in flags field */ -#define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */ #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ #define NFS_INO_PRESERVE_UNLINKED (4) /* preserve file if removed while open */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ -#define NFS_INO_FORCE_READDIR (7) /* force readdirplus */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ -- cgit v1.2.3 From f648022faa68ef76058aa121d1aa3a967d59cae8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 23 Feb 2022 11:31:51 -0500 Subject: NFS: Convert readdir page cache to use a cookie based index Instead of using a linear index to address the pages, use the cookie of the first entry, since that is what we use to match the page anyway. This allows us to avoid re-reading the entire cache on a seekdir() type of operation. The latter is very common when re-exporting NFS, and is a major performance drain. The change does affect our duplicate cookie detection, since we can no longer rely on the page index as a linear offset for detecting whether we looped backwards. However since we no longer do a linear search through all the pages on each call to nfs_readdir(), this is less of a concern than it was previously. The other downside is that invalidate_mapping_pages() no longer can use the page index to avoid clearing pages that have been read. A subsequent patch will restore the functionality this provides to the 'ls -l' heuristic. Signed-off-by: Trond Myklebust --- fs/nfs/Kconfig | 4 ++ fs/nfs/dir.c | 149 +++++++++++++++++++++---------------------------- include/linux/nfs_fs.h | 2 - 3 files changed, 69 insertions(+), 86 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 14a72224b657..47a53b3362b6 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -4,6 +4,10 @@ config NFS_FS depends on INET && FILE_LOCKING && MULTIUSER select LOCKD select SUNRPC + select CRYPTO + select CRYPTO_HASH + select XXHASH + select CRYPTO_XXHASH select NFS_ACL_SUPPORT if NFS_V3_ACL help Choose Y here if you want to access files residing on other diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 4983950de2ad..8c2552d89310 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "delegation.h" #include "iostat.h" @@ -159,9 +160,7 @@ struct nfs_readdir_descriptor { pgoff_t page_index_max; u64 dir_cookie; u64 last_cookie; - u64 dup_cookie; loff_t current_index; - loff_t prev_index; __be32 verf[NFS_DIR_VERIFIER_SIZE]; unsigned long dir_verifier; @@ -171,7 +170,6 @@ struct nfs_readdir_descriptor { unsigned int cache_entry_index; unsigned int buffer_fills; unsigned int dtsize; - signed char duped; bool plus; bool eob; bool eof; @@ -331,6 +329,28 @@ out: return ret; } +#define NFS_READDIR_COOKIE_MASK (U32_MAX >> 14) +/* + * Hash algorithm allowing content addressible access to sequences + * of directory cookies. Content is addressed by the value of the + * cookie index of the first readdir entry in a page. + * + * The xxhash algorithm is chosen because it is fast, and is supposed + * to result in a decent flat distribution of hashes. + * + * We then select only the first 18 bits to avoid issues with excessive + * memory use for the page cache XArray. 18 bits should allow the caching + * of 262144 pages of sequences of readdir entries. Since each page holds + * 127 readdir entries for a typical 64-bit system, that works out to a + * cache of ~ 33 million entries per directory. + */ +static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie) +{ + if (cookie == 0) + return 0; + return xxhash(&cookie, sizeof(cookie), 0) & NFS_READDIR_COOKIE_MASK; +} + static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie, u64 change_attr) { @@ -352,15 +372,15 @@ static void nfs_readdir_page_unlock_and_put(struct page *page) } static struct page *nfs_readdir_page_get_locked(struct address_space *mapping, - pgoff_t index, u64 last_cookie) + u64 last_cookie, + u64 change_attr) { + pgoff_t index = nfs_readdir_page_cookie_hash(last_cookie); struct page *page; - u64 change_attr; page = grab_cache_page(mapping, index); if (!page) return NULL; - change_attr = inode_peek_iversion_raw(mapping->host); if (PageUptodate(page)) { if (nfs_readdir_page_validate(page, last_cookie, change_attr)) return page; @@ -371,11 +391,6 @@ static struct page *nfs_readdir_page_get_locked(struct address_space *mapping, return page; } -static loff_t nfs_readdir_page_offset(struct page *page) -{ - return (loff_t)page->index * (loff_t)nfs_readdir_array_maxentries(); -} - static u64 nfs_readdir_page_last_cookie(struct page *page) { struct nfs_cache_array *array; @@ -408,11 +423,11 @@ static void nfs_readdir_page_set_eof(struct page *page) } static struct page *nfs_readdir_page_get_next(struct address_space *mapping, - pgoff_t index, u64 cookie) + u64 cookie, u64 change_attr) { struct page *page; - page = nfs_readdir_page_get_locked(mapping, index, cookie); + page = nfs_readdir_page_get_locked(mapping, cookie, change_attr); if (page) { if (nfs_readdir_page_last_cookie(page) == cookie) return page; @@ -452,6 +467,13 @@ static void nfs_readdir_seek_next_array(struct nfs_cache_array *array, desc->last_cookie = array->array[0].cookie; } +static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc) +{ + desc->current_index = 0; + desc->last_cookie = 0; + desc->page_index = 0; +} + static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { @@ -492,8 +514,7 @@ static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { - int i; - loff_t new_pos; + unsigned int i; int status = -EAGAIN; if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie)) @@ -501,32 +522,10 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, for (i = 0; i < array->size; i++) { if (array->array[i].cookie == desc->dir_cookie) { - struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); - - new_pos = nfs_readdir_page_offset(desc->page) + i; - if (desc->attr_gencount != nfsi->attr_gencount) { - desc->duped = 0; - desc->attr_gencount = nfsi->attr_gencount; - } else if (new_pos < desc->prev_index) { - if (desc->duped > 0 - && desc->dup_cookie == desc->dir_cookie) { - if (printk_ratelimit()) { - pr_notice("NFS: directory %pD2 contains a readdir loop." - "Please contact your server vendor. " - "The file: %s has duplicate cookie %llu\n", - desc->file, array->array[i].name, desc->dir_cookie); - } - status = -ELOOP; - goto out; - } - desc->dup_cookie = desc->dir_cookie; - desc->duped = -1; - } if (nfs_readdir_use_cookie(desc->file)) desc->ctx->pos = desc->dir_cookie; else - desc->ctx->pos = new_pos; - desc->prev_index = new_pos; + desc->ctx->pos = desc->current_index + i; desc->cache_entry_index = i; return 0; } @@ -538,7 +537,6 @@ check_eof: desc->eof = true; } else nfs_readdir_seek_next_array(array, desc); -out: return status; } @@ -785,10 +783,9 @@ out: /* Perform conversion from xdr to cache array */ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, - struct page **xdr_pages, - unsigned int buflen, - struct page **arrays, - size_t narrays) + struct page **xdr_pages, unsigned int buflen, + struct page **arrays, size_t narrays, + u64 change_attr) { struct address_space *mapping = desc->file->f_mapping; struct xdr_stream stream; @@ -828,18 +825,16 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, break; arrays++; *arrays = page = new; - desc->page_index_max++; } else { - new = nfs_readdir_page_get_next(mapping, - page->index + 1, - entry->prev_cookie); + new = nfs_readdir_page_get_next( + mapping, entry->prev_cookie, change_attr); if (!new) break; if (page != *arrays) nfs_readdir_page_unlock_and_put(page); page = new; - desc->page_index_max = new->index; } + desc->page_index_max++; status = nfs_readdir_add_to_array(entry, page); } while (!status && !entry->eof); @@ -899,6 +894,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, __be32 *verf_arg, __be32 *verf_res, struct page **arrays, size_t narrays) { + u64 change_attr; struct page **pages; struct page *page = *arrays; struct nfs_entry *entry; @@ -923,6 +919,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, if (!pages) goto out; + change_attr = inode_peek_iversion_raw(inode); status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, pages, dtsize, verf_res); if (status < 0) @@ -931,7 +928,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, pglen = status; if (pglen != 0) status = nfs_readdir_page_filler(desc, entry, pages, pglen, - arrays, narrays); + arrays, narrays, change_attr); else nfs_readdir_page_set_eof(page); desc->buffer_fills++; @@ -961,9 +958,11 @@ nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) static struct page * nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc) { - return nfs_readdir_page_get_locked(desc->file->f_mapping, - desc->page_index, - desc->last_cookie); + struct address_space *mapping = desc->file->f_mapping; + u64 change_attr = inode_peek_iversion_raw(mapping->host); + + return nfs_readdir_page_get_locked(mapping, desc->last_cookie, + change_attr); } /* @@ -995,7 +994,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) trace_nfs_readdir_cache_fill_done(inode, res); if (res == -EBADCOOKIE || res == -ENOTSYNC) { invalidate_inode_pages2(desc->file->f_mapping); - desc->page_index = 0; + nfs_readdir_rewind_search(desc); trace_nfs_readdir_invalidate_cache_range( inode, 0, MAX_LFS_FILESIZE); return -EAGAIN; @@ -1009,12 +1008,10 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) { memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf)); - invalidate_inode_pages2_range(desc->file->f_mapping, - desc->page_index_max + 1, + invalidate_inode_pages2_range(desc->file->f_mapping, 1, -1); trace_nfs_readdir_invalidate_cache_range( - inode, desc->page_index_max + 1, - MAX_LFS_FILESIZE); + inode, 1, MAX_LFS_FILESIZE); } } res = nfs_readdir_search_array(desc); @@ -1030,11 +1027,6 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) int res; do { - if (desc->page_index == 0) { - desc->current_index = 0; - desc->prev_index = 0; - desc->last_cookie = 0; - } res = find_and_lock_cache_page(desc); } while (res == -EAGAIN); return res; @@ -1072,8 +1064,6 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; - if (desc->duped != 0) - desc->duped = 1; } if (array->page_is_eof) desc->eof = !desc->eob; @@ -1115,7 +1105,6 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) desc->page_index = 0; desc->cache_entry_index = 0; desc->last_cookie = desc->dir_cookie; - desc->duped = 0; desc->page_index_max = 0; trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie, @@ -1148,6 +1137,8 @@ out_free: for (i = 0; i < sz && arrays[i]; i++) nfs_readdir_page_array_free(arrays[i]); out: + if (!nfs_readdir_use_cookie(desc->file)) + nfs_readdir_rewind_search(desc); desc->page_index_max = -1; kfree(arrays); dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); @@ -1158,17 +1149,14 @@ out: static void nfs_readdir_handle_cache_misses(struct inode *inode, struct nfs_readdir_descriptor *desc, - pgoff_t page_index, unsigned int cache_misses) { if (desc->ctx->pos == 0 || cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD) return; - if (invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1) == 0) + if (invalidate_mapping_pages(inode->i_mapping, 0, -1) == 0) return; - trace_nfs_readdir_invalidate_cache_range( - inode, (loff_t)(page_index + 1) << PAGE_SHIFT, - MAX_LFS_FILESIZE); + trace_nfs_readdir_invalidate_cache_range(inode, 0, MAX_LFS_FILESIZE); } /* The file offset position represents the dirent entry number. A @@ -1183,7 +1171,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) struct nfs_open_dir_context *dir_ctx = file->private_data; struct nfs_readdir_descriptor *desc; unsigned int cache_hits, cache_misses; - pgoff_t page_index; int res; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", @@ -1208,10 +1195,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) spin_lock(&file->f_lock); desc->dir_cookie = dir_ctx->dir_cookie; - desc->dup_cookie = dir_ctx->dup_cookie; - desc->duped = dir_ctx->duped; - page_index = dir_ctx->page_index; - desc->page_index = page_index; + desc->page_index = dir_ctx->page_index; desc->last_cookie = dir_ctx->last_cookie; desc->attr_gencount = dir_ctx->attr_gencount; desc->eof = dir_ctx->eof; @@ -1227,7 +1211,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) } desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses); - nfs_readdir_handle_cache_misses(inode, desc, page_index, cache_misses); + nfs_readdir_handle_cache_misses(inode, desc, cache_misses); do { res = readdir_search_pagecache(desc); @@ -1247,7 +1231,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) } if (res == -ETOOSMALL && desc->plus) { nfs_zap_caches(inode); - desc->page_index = 0; desc->plus = false; desc->eof = false; continue; @@ -1261,9 +1244,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) spin_lock(&file->f_lock); dir_ctx->dir_cookie = desc->dir_cookie; - dir_ctx->dup_cookie = desc->dup_cookie; dir_ctx->last_cookie = desc->last_cookie; - dir_ctx->duped = desc->duped; dir_ctx->attr_gencount = desc->attr_gencount; dir_ctx->page_index = desc->page_index; dir_ctx->eof = desc->eof; @@ -1306,13 +1287,13 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) if (offset != filp->f_pos) { filp->f_pos = offset; dir_ctx->page_index = 0; - if (!nfs_readdir_use_cookie(filp)) + if (!nfs_readdir_use_cookie(filp)) { dir_ctx->dir_cookie = 0; - else + dir_ctx->last_cookie = 0; + } else { dir_ctx->dir_cookie = offset; - if (offset == 0) - memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); - dir_ctx->duped = 0; + dir_ctx->last_cookie = offset; + } dir_ctx->eof = false; } spin_unlock(&filp->f_lock); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 20a4cf0acad2..42aad886d3c0 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -106,11 +106,9 @@ struct nfs_open_dir_context { unsigned long attr_gencount; __be32 verf[NFS_DIR_VERIFIER_SIZE]; __u64 dir_cookie; - __u64 dup_cookie; __u64 last_cookie; pgoff_t page_index; unsigned int dtsize; - signed char duped; bool eof; struct rcu_head rcu_head; }; -- cgit v1.2.3 From b0365ccb0712efacf99936e94e92eb7ae63de4d5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 23 Feb 2022 13:29:59 -0500 Subject: NFS: Fix up forced readdirplus Avoid clearing the entire readdir page cache if we're just doing forced readdirplus for the 'ls -l' heuristic. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 56 +++++++++++++++++++++++++++++++++++--------------- fs/nfs/nfstrace.h | 1 + include/linux/nfs_fs.h | 1 + 3 files changed, 41 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8c2552d89310..f6aac1e8a8b9 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -170,6 +170,7 @@ struct nfs_readdir_descriptor { unsigned int cache_entry_index; unsigned int buffer_fills; unsigned int dtsize; + bool clear_cache; bool plus; bool eob; bool eof; @@ -227,6 +228,13 @@ static void nfs_readdir_clear_array(struct page *page) kunmap_atomic(array); } +static void nfs_readdir_page_reinit_array(struct page *page, u64 last_cookie, + u64 change_attr) +{ + nfs_readdir_clear_array(page); + nfs_readdir_page_init_array(page, last_cookie, change_attr); +} + static struct page * nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags) { @@ -428,12 +436,11 @@ static struct page *nfs_readdir_page_get_next(struct address_space *mapping, struct page *page; page = nfs_readdir_page_get_locked(mapping, cookie, change_attr); - if (page) { - if (nfs_readdir_page_last_cookie(page) == cookie) - return page; - nfs_readdir_page_unlock_and_put(page); - } - return NULL; + if (!page) + return NULL; + if (nfs_readdir_page_last_cookie(page) != cookie) + nfs_readdir_page_reinit_array(page, cookie, change_attr); + return page; } static inline @@ -960,9 +967,15 @@ nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc) { struct address_space *mapping = desc->file->f_mapping; u64 change_attr = inode_peek_iversion_raw(mapping->host); + u64 cookie = desc->last_cookie; + struct page *page; - return nfs_readdir_page_get_locked(mapping, desc->last_cookie, - change_attr); + page = nfs_readdir_page_get_locked(mapping, cookie, change_attr); + if (!page) + return NULL; + if (desc->clear_cache && !nfs_readdir_page_needs_filling(page)) + nfs_readdir_page_reinit_array(page, cookie, change_attr); + return page; } /* @@ -1013,6 +1026,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) trace_nfs_readdir_invalidate_cache_range( inode, 1, MAX_LFS_FILESIZE); } + desc->clear_cache = false; } res = nfs_readdir_search_array(desc); if (res == 0) @@ -1147,16 +1161,17 @@ out: #define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL) -static void nfs_readdir_handle_cache_misses(struct inode *inode, +static bool nfs_readdir_handle_cache_misses(struct inode *inode, struct nfs_readdir_descriptor *desc, - unsigned int cache_misses) + unsigned int cache_misses, + bool force_clear) { - if (desc->ctx->pos == 0 || - cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD) - return; - if (invalidate_mapping_pages(inode->i_mapping, 0, -1) == 0) - return; - trace_nfs_readdir_invalidate_cache_range(inode, 0, MAX_LFS_FILESIZE); + if (desc->ctx->pos == 0 || !desc->plus) + return false; + if (cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD && !force_clear) + return false; + trace_nfs_readdir_force_readdirplus(inode); + return true; } /* The file offset position represents the dirent entry number. A @@ -1171,6 +1186,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) struct nfs_open_dir_context *dir_ctx = file->private_data; struct nfs_readdir_descriptor *desc; unsigned int cache_hits, cache_misses; + bool force_clear; int res; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", @@ -1203,6 +1219,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0); cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0); + force_clear = dir_ctx->force_clear; spin_unlock(&file->f_lock); if (desc->eof) { @@ -1211,7 +1228,9 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) } desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses); - nfs_readdir_handle_cache_misses(inode, desc, cache_misses); + force_clear = nfs_readdir_handle_cache_misses(inode, desc, cache_misses, + force_clear); + desc->clear_cache = force_clear; do { res = readdir_search_pagecache(desc); @@ -1240,6 +1259,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) nfs_do_filldir(desc, nfsi->cookieverf); nfs_readdir_page_unlock_and_put_cached(desc); + if (desc->page_index == desc->page_index_max) + desc->clear_cache = force_clear; } while (!desc->eob && !desc->eof); spin_lock(&file->f_lock); @@ -1247,6 +1268,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->last_cookie = desc->last_cookie; dir_ctx->attr_gencount = desc->attr_gencount; dir_ctx->page_index = desc->page_index; + dir_ctx->force_clear = force_clear; dir_ctx->eof = desc->eof; dir_ctx->dtsize = desc->dtsize; memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index ec2645d20abf..59f4ca803fd0 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -160,6 +160,7 @@ DEFINE_NFS_INODE_EVENT(nfs_fsync_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit); DEFINE_NFS_INODE_EVENT(nfs_access_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_set_cache_invalid); +DEFINE_NFS_INODE_EVENT(nfs_readdir_force_readdirplus); DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_cache_fill_done); DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_uncached_done); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 42aad886d3c0..3893386ceaed 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -109,6 +109,7 @@ struct nfs_open_dir_context { __u64 last_cookie; pgoff_t page_index; unsigned int dtsize; + bool force_clear; bool eof; struct rcu_head rcu_head; }; -- cgit v1.2.3 From 0adf85b445c7fbc5d2df1f8c1bc54d62c4340237 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 27 Feb 2022 12:46:24 -0500 Subject: NFS: Optimise away the previous cookie field Replace the 'previous cookie' field in struct nfs_entry with the array->last_cookie. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 26 ++++++++++++++------------ fs/nfs/nfs2xdr.c | 1 - fs/nfs/nfs3xdr.c | 1 - fs/nfs/nfs4xdr.c | 1 - include/linux/nfs_xdr.h | 3 +-- 5 files changed, 15 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f6aac1e8a8b9..033249a72e92 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -301,19 +301,20 @@ static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) return 0; } -static -int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) +static int nfs_readdir_page_array_append(struct page *page, + const struct nfs_entry *entry, + u64 *cookie) { struct nfs_cache_array *array; struct nfs_cache_array_entry *cache_entry; const char *name; - int ret; + int ret = -ENOMEM; name = nfs_readdir_copy_name(entry->name, entry->len); - if (!name) - return -ENOMEM; array = kmap_atomic(page); + if (!name) + goto out; ret = nfs_readdir_array_can_expand(array); if (ret) { kfree(name); @@ -321,7 +322,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) } cache_entry = &array->array[array->size]; - cache_entry->cookie = entry->prev_cookie; + cache_entry->cookie = array->last_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; cache_entry->name_len = entry->len; @@ -333,6 +334,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) if (entry->eof != 0) nfs_readdir_array_set_eof(array); out: + *cookie = array->last_cookie; kunmap_atomic(array); return ret; } @@ -798,6 +800,7 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, struct xdr_stream stream; struct xdr_buf buf; struct page *scratch, *new, *page = *arrays; + u64 cookie; int status; scratch = alloc_page(GFP_KERNEL); @@ -819,22 +822,21 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier); - status = nfs_readdir_add_to_array(entry, page); + status = nfs_readdir_page_array_append(page, entry, &cookie); if (status != -ENOSPC) continue; if (page->mapping != mapping) { if (!--narrays) break; - new = nfs_readdir_page_array_alloc(entry->prev_cookie, - GFP_KERNEL); + new = nfs_readdir_page_array_alloc(cookie, GFP_KERNEL); if (!new) break; arrays++; *arrays = page = new; } else { - new = nfs_readdir_page_get_next( - mapping, entry->prev_cookie, change_attr); + new = nfs_readdir_page_get_next(mapping, cookie, + change_attr); if (!new) break; if (page != *arrays) @@ -842,7 +844,7 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, page = new; } desc->page_index_max++; - status = nfs_readdir_add_to_array(entry, page); + status = nfs_readdir_page_array_append(page, entry, &cookie); } while (!status && !entry->eof); switch (status) { diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 3d5ba43f44bb..05c3b4b2b3dd 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -955,7 +955,6 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, * The type (size and byte order) of nfscookie isn't defined in * RFC 1094. This implementation assumes that it's an XDR uint32. */ - entry->prev_cookie = entry->cookie; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) return -EAGAIN; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index d6779ceeb39e..3b0b650c9c5a 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -2024,7 +2024,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, zero_nfs_fh3(entry->fh); } - entry->prev_cookie = entry->cookie; entry->cookie = new_cookie; return 0; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index b7780b97dc4d..86a5f6516928 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7508,7 +7508,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); - entry->prev_cookie = entry->cookie; entry->cookie = new_cookie; return 0; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 728cb0c1f0b6..82f7c2730b9a 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -745,8 +745,7 @@ struct nfs_auth_info { */ struct nfs_entry { __u64 ino; - __u64 cookie, - prev_cookie; + __u64 cookie; const char * name; unsigned int len; int eof; -- cgit v1.2.3 From a41b05edfedb939440e83666f23de3ef9af33acf Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 7 Mar 2022 10:41:44 +1100 Subject: SUNRPC/auth: async tasks mustn't block waiting for memory When memory is short, new worker threads cannot be created and we depend on the minimum one rpciod thread to be able to handle everything. So it must not block waiting for memory. mempools are particularly a problem as memory can only be released back to the mempool by an async rpc task running. If all available workqueue threads are waiting on the mempool, no thread is available to return anything. lookup_cred() can block on a mempool or kmalloc - and this can cause deadlocks. So add a new RPCAUTH_LOOKUP flag for async lookups and don't block on memory. If the -ENOMEM gets back to call_refreshresult(), wait a short while and try again. HZ>>4 is chosen as it is used elsewhere for -ENOMEM retries. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- include/linux/sunrpc/auth.h | 1 + net/sunrpc/auth.c | 6 +++++- net/sunrpc/auth_gss/auth_gss.c | 6 +++++- net/sunrpc/auth_unix.c | 10 ++++++++-- net/sunrpc/clnt.c | 3 +++ 5 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 98da816b5fc2..3e6ce288a7fc 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -99,6 +99,7 @@ struct rpc_auth_create_args { /* Flags for rpcauth_lookupcred() */ #define RPCAUTH_LOOKUP_NEW 0x01 /* Accept an uninitialised cred */ +#define RPCAUTH_LOOKUP_ASYNC 0x02 /* Don't block waiting for memory */ /* * Client authentication ops diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index a9f0d17fdb0d..6bfa19f9fa6a 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -615,6 +615,8 @@ rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) }; struct rpc_cred *ret; + if (RPC_IS_ASYNC(task)) + lookupflags |= RPCAUTH_LOOKUP_ASYNC; ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags); put_cred(acred.cred); return ret; @@ -631,6 +633,8 @@ rpcauth_bind_machine_cred(struct rpc_task *task, int lookupflags) if (!acred.principal) return NULL; + if (RPC_IS_ASYNC(task)) + lookupflags |= RPCAUTH_LOOKUP_ASYNC; return auth->au_ops->lookup_cred(auth, &acred, lookupflags); } @@ -654,7 +658,7 @@ rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags) }; if (flags & RPC_TASK_ASYNC) - lookupflags |= RPCAUTH_LOOKUP_NEW; + lookupflags |= RPCAUTH_LOOKUP_NEW | RPCAUTH_LOOKUP_ASYNC; if (task->tk_op_cred) /* Task must use exactly this rpc_cred */ new = get_rpccred(task->tk_op_cred); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index affd64a54f02..ac0828108204 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1341,7 +1341,11 @@ gss_hash_cred(struct auth_cred *acred, unsigned int hashbits) static struct rpc_cred * gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - return rpcauth_lookup_credcache(auth, acred, flags, GFP_KERNEL); + gfp_t gfp = GFP_KERNEL; + + if (flags & RPCAUTH_LOOKUP_ASYNC) + gfp = GFP_NOWAIT | __GFP_NOWARN; + return rpcauth_lookup_credcache(auth, acred, flags, gfp); } static struct rpc_cred * diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 3600d8641644..c629d366030e 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -43,8 +43,14 @@ unx_destroy(struct rpc_auth *auth) static struct rpc_cred * unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - struct rpc_cred *ret = mempool_alloc(unix_pool, GFP_KERNEL); - + gfp_t gfp = GFP_KERNEL; + struct rpc_cred *ret; + + if (flags & RPCAUTH_LOOKUP_ASYNC) + gfp = GFP_NOWAIT | __GFP_NOWARN; + ret = mempool_alloc(unix_pool, gfp); + if (!ret) + return ERR_PTR(-ENOMEM); rpcauth_init_cred(ret, acred, auth, &unix_credops); ret->cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; return ret; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 97165a545cb3..9556eb7b065b 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1745,6 +1745,9 @@ call_refreshresult(struct rpc_task *task) task->tk_cred_retry--; trace_rpc_retry_refresh_status(task); return; + case -ENOMEM: + rpc_delay(task, HZ >> 4); + return; } trace_rpc_refresh_status(task); rpc_call_rpcerror(task, status); -- cgit v1.2.3 From 89c2be8a951654758dffeaaa6272328d9c8f29be Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 7 Mar 2022 10:41:44 +1100 Subject: NFS: discard NFS_RPC_SWAPFLAGS and RPC_TASK_ROOTCREDS NFS_RPC_SWAPFLAGS is only used for READ requests. It sets RPC_TASK_SWAPPER which gives some memory-allocation priority to requests. This is not needed for swap READ - though it is for writes where it is set via a different mechanism. RPC_TASK_ROOTCREDS causes the 'machine' credential to be used. This is not needed as the root credential is saved when the swap file is opened, and this is used for all IO. So NFS_RPC_SWAPFLAGS isn't needed, and as it is the only user of RPC_TASK_ROOTCREDS, that isn't needed either. Remove both. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 4 ---- include/linux/nfs_fs.h | 5 ----- include/linux/sunrpc/sched.h | 1 - include/trace/events/sunrpc.h | 1 - net/sunrpc/auth.c | 2 +- 5 files changed, 1 insertion(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index e4c1a49b0126..5e7657374bc3 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -194,10 +194,6 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr, const struct nfs_rpc_ops *rpc_ops, struct rpc_task_setup *task_setup_data, int how) { - struct inode *inode = hdr->inode; - int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; - - task_setup_data->flags |= swap_flags; rpc_ops->read_setup(hdr, msg); trace_nfs_initiate_read(hdr); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 3893386ceaed..9074ed0b65aa 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -45,11 +45,6 @@ */ #define NFS_MAX_TRANSPORTS 16 -/* - * These are the default flags for swap requests - */ -#define NFS_RPC_SWAPFLAGS (RPC_TASK_SWAPPER|RPC_TASK_ROOTCREDS) - /* * Size of the NFS directory verifier */ diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index db964bb63912..56710f8056d3 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -124,7 +124,6 @@ struct rpc_task_setup { #define RPC_TASK_MOVEABLE 0x0004 /* nfs4.1+ rpc tasks */ #define RPC_TASK_NULLCREDS 0x0010 /* Use AUTH_NULL credential */ #define RPC_CALL_MAJORSEEN 0x0020 /* major timeout seen */ -#define RPC_TASK_ROOTCREDS 0x0040 /* force root creds */ #define RPC_TASK_DYNAMIC 0x0080 /* task was kmalloc'ed */ #define RPC_TASK_NO_ROUND_ROBIN 0x0100 /* send requests on "main" xprt */ #define RPC_TASK_SOFT 0x0200 /* Use soft timeouts */ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 29982d60b68a..ac33892da411 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -311,7 +311,6 @@ TRACE_EVENT(rpc_request, { RPC_TASK_MOVEABLE, "MOVEABLE" }, \ { RPC_TASK_NULLCREDS, "NULLCREDS" }, \ { RPC_CALL_MAJORSEEN, "MAJORSEEN" }, \ - { RPC_TASK_ROOTCREDS, "ROOTCREDS" }, \ { RPC_TASK_DYNAMIC, "DYNAMIC" }, \ { RPC_TASK_NO_ROUND_ROBIN, "NO_ROUND_ROBIN" }, \ { RPC_TASK_SOFT, "SOFT" }, \ diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 6bfa19f9fa6a..682fcd24bf43 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -670,7 +670,7 @@ rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags) /* If machine cred couldn't be bound, try a root cred */ if (new) ; - else if (cred == &machine_cred || (flags & RPC_TASK_ROOTCREDS)) + else if (cred == &machine_cred) new = rpcauth_bind_root_cred(task, lookupflags); else if (flags & RPC_TASK_NULLCREDS) new = authnull_ops.lookup_cred(NULL, NULL, 0); -- cgit v1.2.3 From 4dc73c679114a2f408567e2e44770ed934190db2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 7 Mar 2022 10:41:44 +1100 Subject: NFSv4: keep state manager thread active if swap is enabled If we are swapping over NFSv4, we may not be able to allocate memory to start the state-manager thread at the time when we need it. So keep it always running when swap is enabled, and just signal it to start. This requires updating and testing the cl_swapper count on the root rpc_clnt after following all ->cl_parent links. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 15 ++++++++++++--- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 20 ++++++++++++++++++++ fs/nfs/nfs4state.c | 40 +++++++++++++++++++++++++++++++++------- include/linux/nfs_xdr.h | 2 ++ net/sunrpc/clnt.c | 2 ++ 6 files changed, 70 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 93c01aaa0a8d..d31bc430dce3 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -483,8 +483,9 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, { unsigned long blocks; long long isize; - struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); - struct inode *inode = file->f_mapping->host; + struct inode *inode = file_inode(file); + struct rpc_clnt *clnt = NFS_CLIENT(inode); + struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; spin_lock(&inode->i_lock); blocks = inode->i_blocks; @@ -497,14 +498,22 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, *span = sis->pages; + + if (cl->rpc_ops->enable_swap) + cl->rpc_ops->enable_swap(inode); + return rpc_clnt_swap_activate(clnt); } static void nfs_swap_deactivate(struct file *file) { - struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); + struct inode *inode = file_inode(file); + struct rpc_clnt *clnt = NFS_CLIENT(inode); + struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; rpc_clnt_swap_deactivate(clnt); + if (cl->rpc_ops->disable_swap) + cl->rpc_ops->disable_swap(file_inode(file)); } const struct address_space_operations nfs_file_aops = { diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 84f39b6f1b1e..79df6e83881b 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -42,6 +42,7 @@ enum nfs4_client_state { NFS4CLNT_LEASE_MOVED, NFS4CLNT_DELEGATION_EXPIRED, NFS4CLNT_RUN_MANAGER, + NFS4CLNT_MANAGER_AVAILABLE, NFS4CLNT_RECALL_RUNNING, NFS4CLNT_RECALL_ANY_LAYOUT_READ, NFS4CLNT_RECALL_ANY_LAYOUT_RW, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index fd8eece12e94..dd7a4c2a3f05 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -10468,6 +10468,24 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) return error + error2 + error3; } +static void nfs4_enable_swap(struct inode *inode) +{ + /* The state manager thread must always be running. + * It will notice the client is a swapper, and stay put. + */ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + + nfs4_schedule_state_manager(clp); +} + +static void nfs4_disable_swap(struct inode *inode) +{ + /* The state manager thread will now exit once it is + * woken. + */ + wake_up_var(&NFS_SERVER(inode)->nfs_client->cl_state); +} + static const struct inode_operations nfs4_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, @@ -10545,6 +10563,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .create_server = nfs4_create_server, .clone_server = nfs_clone_server, .discover_trunking = nfs4_discover_trunking, + .enable_swap = nfs4_enable_swap, + .disable_swap = nfs4_disable_swap, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 58054dfdf2b0..4b2ea239a537 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1207,10 +1207,17 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + struct rpc_clnt *cl = clp->cl_rpcclient; + + while (cl != cl->cl_parent) + cl = cl->cl_parent; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); - if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { + wake_up_var(&clp->cl_state); return; + } + set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); __module_get(THIS_MODULE); refcount_inc(&clp->cl_count); @@ -1226,6 +1233,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) printk(KERN_ERR "%s: kthread_run: %ld\n", __func__, PTR_ERR(task)); nfs4_clear_state_manager_bit(clp); + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs_put_client(clp); module_put(THIS_MODULE); } @@ -2680,12 +2688,8 @@ static void nfs4_state_manager(struct nfs_client *clp) clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state); } - /* Did we race with an attempt to give us more work? */ - if (!test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) - return; - if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) - return; - memflags = memalloc_nofs_save(); + return; + } while (refcount_read(&clp->cl_count) > 1 && !signalled()); goto out_drain; @@ -2706,9 +2710,31 @@ out_drain: static int nfs4_run_state_manager(void *ptr) { struct nfs_client *clp = ptr; + struct rpc_clnt *cl = clp->cl_rpcclient; + + while (cl != cl->cl_parent) + cl = cl->cl_parent; allow_signal(SIGKILL); +again: + set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); nfs4_state_manager(clp); + if (atomic_read(&cl->cl_swapper)) { + wait_var_event_interruptible(&clp->cl_state, + test_bit(NFS4CLNT_RUN_MANAGER, + &clp->cl_state)); + if (atomic_read(&cl->cl_swapper) && + test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) + goto again; + /* Either no longer a swapper, or were signalled */ + } + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + + if (refcount_read(&clp->cl_count) > 1 && !signalled() && + test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && + !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state)) + goto again; + nfs_put_client(clp); module_put_and_kthread_exit(0); return 0; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 82f7c2730b9a..49ba486aea5f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1797,6 +1797,8 @@ struct nfs_rpc_ops { struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); int (*discover_trunking)(struct nfs_server *, struct nfs_fh *); + void (*enable_swap)(struct inode *inode); + void (*disable_swap)(struct inode *inode); }; /* diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 4117ea4caa2e..0f54a56d19d2 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -3069,6 +3069,8 @@ rpc_clnt_swap_activate_callback(struct rpc_clnt *clnt, int rpc_clnt_swap_activate(struct rpc_clnt *clnt) { + while (clnt != clnt->cl_parent) + clnt = clnt->cl_parent; if (atomic_inc_return(&clnt->cl_swapper) == 1) return rpc_clnt_iterate_for_each_xprt(clnt, rpc_clnt_swap_activate_callback, NULL); -- cgit v1.2.3 From 64158668ac8b31626a8ce48db4cad08496eb8340 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 7 Mar 2022 10:41:44 +1100 Subject: NFS: swap IO handling is slightly different for O_DIRECT IO 1/ Taking the i_rwsem for swap IO triggers lockdep warnings regarding possible deadlocks with "fs_reclaim". These deadlocks could, I believe, eventuate if a buffered read on the swapfile was attempted. We don't need coherence with the page cache for a swap file, and buffered writes are forbidden anyway. There is no other need for i_rwsem during direct IO. So never take it for swap_rw() 2/ generic_write_checks() explicitly forbids writes to swap, and performs checks that are not needed for swap. So bypass it for swap_rw(). Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 42 ++++++++++++++++++++++++++++-------------- fs/nfs/file.c | 4 ++-- include/linux/nfs_fs.h | 8 ++++---- 3 files changed, 34 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index eabfdab543c8..04aaf39a05cb 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -173,8 +173,8 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter); - return nfs_file_direct_write(iocb, iter); + return nfs_file_direct_read(iocb, iter, true); + return nfs_file_direct_write(iocb, iter, true); } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -425,6 +425,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers into which to read data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct reads instead of calling * generic_file_aio_read() in order to avoid gfar's check to see if @@ -440,7 +441,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * client must read the updated atime from the server back into its * cache. */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -482,12 +484,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (iter_is_iovec(iter)) dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; - nfs_start_io_direct(inode); + if (!swap) + nfs_start_io_direct(inode); NFS_I(inode)->read_io += count; requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - nfs_end_io_direct(inode); + if (!swap) + nfs_end_io_direct(inode); if (requested > 0) { result = nfs_direct_wait(dreq); @@ -876,6 +880,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers from which to write data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct writes instead of calling * generic_file_aio_write() in order to avoid taking the inode @@ -892,7 +897,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol. */ -ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { ssize_t result, requested; size_t count; @@ -906,7 +912,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); - result = generic_write_checks(iocb, iter); + if (swap) + /* bypass generic checks */ + result = iov_iter_count(iter); + else + result = generic_write_checks(iocb, iter); if (result <= 0) return result; count = result; @@ -937,16 +947,20 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dreq->iocb = iocb; pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode); - nfs_start_io_direct(inode); + if (swap) { + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + } else { + nfs_start_io_direct(inode); - requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - } + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); + } - nfs_end_io_direct(inode); + nfs_end_io_direct(inode); + } if (requested > 0) { result = nfs_direct_wait(dreq); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d31bc430dce3..81c80548a5c6 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -157,7 +157,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) ssize_t result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_read(iocb, to); + return nfs_file_direct_read(iocb, to, false); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, @@ -623,7 +623,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) return result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_write(iocb, from); + return nfs_file_direct_write(iocb, from, false); dprintk("NFS: write(%pD2, %zu@%Ld)\n", file, iov_iter_count(from), (long long) iocb->ki_pos); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 9074ed0b65aa..c47c448befc8 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -508,10 +508,10 @@ static inline const struct cred *nfs_file_cred(struct file *file) * linux/fs/nfs/direct.c */ extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *); -extern ssize_t nfs_file_direct_read(struct kiocb *iocb, - struct iov_iter *iter); -extern ssize_t nfs_file_direct_write(struct kiocb *iocb, - struct iov_iter *iter); +ssize_t nfs_file_direct_read(struct kiocb *iocb, + struct iov_iter *iter, bool swap); +ssize_t nfs_file_direct_write(struct kiocb *iocb, + struct iov_iter *iter, bool swap); /* * linux/fs/nfs/dir.c -- cgit v1.2.3 From a43bf604446414103b7535f38e739b65601c4fb2 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 16 Mar 2022 18:24:26 -0400 Subject: NFSv4.1 provide mount option to toggle trunking discovery Introduce a new mount option -- trunkdiscovery,notrunkdiscovery -- to toggle whether or not the client will engage in actively discovery of trunking locations. v2 make notrunkdiscovery default Signed-off-by: Olga Kornievskaia Fixes: 1976b2b31462 ("NFSv4.1 query for fs_location attr on a new file system") Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 3 ++- fs/nfs/fs_context.c | 8 ++++++++ include/linux/nfs_fs_sb.h | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d1f34229e11a..e828504cc396 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -857,7 +857,8 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str } if (clp->rpc_ops->discover_trunking != NULL && - (server->caps & NFS_CAP_FS_LOCATIONS)) { + (server->caps & NFS_CAP_FS_LOCATIONS && + (server->flags & NFS_MOUNT_TRUNK_DISCOVERY))) { error = clp->rpc_ops->discover_trunking(server, mntfh); if (error < 0) return error; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index ea17fa1f31ec..e2d59bb5e6bb 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -80,6 +80,7 @@ enum nfs_param { Opt_source, Opt_tcp, Opt_timeo, + Opt_trunkdiscovery, Opt_udp, Opt_v, Opt_vers, @@ -180,6 +181,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_string("source", Opt_source), fsparam_flag ("tcp", Opt_tcp), fsparam_u32 ("timeo", Opt_timeo), + fsparam_flag_no("trunkdiscovery", Opt_trunkdiscovery), fsparam_flag ("udp", Opt_udp), fsparam_flag ("v2", Opt_v), fsparam_flag ("v3", Opt_v), @@ -529,6 +531,12 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, else ctx->flags &= ~NFS_MOUNT_NOCTO; break; + case Opt_trunkdiscovery: + if (result.negated) + ctx->flags &= ~NFS_MOUNT_TRUNK_DISCOVERY; + else + ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY; + break; case Opt_ac: if (result.negated) ctx->flags |= NFS_MOUNT_NOAC; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index ca0959e51e81..b0e3fd550122 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -151,6 +151,7 @@ struct nfs_server { #define NFS_MOUNT_SOFTREVAL 0x800000 #define NFS_MOUNT_WRITE_EAGER 0x01000000 #define NFS_MOUNT_WRITE_WAIT 0x02000000 +#define NFS_MOUNT_TRUNK_DISCOVERY 0x04000000 unsigned int fattr_valid; /* Valid attributes */ unsigned int caps; /* server capabilities */ -- cgit v1.2.3 From 89f42494f92f448747bd8a7ab1ae8b5d5520577d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Mar 2022 19:10:43 -0400 Subject: SUNRPC: Don't call connect() more than once on a TCP socket Avoid socket state races due to repeated calls to ->connect() using the same socket. If connect() returns 0 due to the connection having completed, but we are in fact in a closing state, then we may leave the XPRT_CONNECTING flag set on the transport. Reported-by: Enrico Scholz Fixes: 3be232f11a3c ("SUNRPC: Prevent immediate close+reconnect") Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtsock.h | 1 + net/sunrpc/xprtsock.c | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 8c2a712cb242..689062afdd61 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -89,5 +89,6 @@ struct sock_xprt { #define XPRT_SOCK_WAKE_WRITE (5) #define XPRT_SOCK_WAKE_PENDING (6) #define XPRT_SOCK_WAKE_DISCONNECT (7) +#define XPRT_SOCK_CONNECT_SENT (8) #endif /* _LINUX_SUNRPC_XPRTSOCK_H */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7e39f87cde2d..8f8a03c3315a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2235,10 +2235,15 @@ static void xs_tcp_setup_socket(struct work_struct *work) if (atomic_read(&xprt->swapper)) current->flags |= PF_MEMALLOC; - if (!sock) { - sock = xs_create_sock(xprt, transport, - xs_addr(xprt)->sa_family, SOCK_STREAM, - IPPROTO_TCP, true); + + if (xprt_connected(xprt)) + goto out; + if (test_and_clear_bit(XPRT_SOCK_CONNECT_SENT, + &transport->sock_state) || + !sock) { + xs_reset_transport(transport); + sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family, + SOCK_STREAM, IPPROTO_TCP, true); if (IS_ERR(sock)) { xprt_wake_pending_tasks(xprt, PTR_ERR(sock)); goto out; @@ -2262,6 +2267,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) fallthrough; case -EINPROGRESS: /* SYN_SENT! */ + set_bit(XPRT_SOCK_CONNECT_SENT, &transport->sock_state); if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; fallthrough; @@ -2323,13 +2329,9 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport)); - if (transport->sock != NULL && !xprt_connecting(xprt)) { + if (transport->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p for %lu " - "seconds\n", - xprt, xprt->reestablish_timeout / HZ); - - /* Start by resetting any existing state */ - xs_reset_transport(transport); + "seconds\n", xprt, xprt->reestablish_timeout / HZ); delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, XS_TCP_INIT_REEST_TO); -- cgit v1.2.3 From 2790a624d43084de590884934969e19c7a82316a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 15 Mar 2022 08:12:40 -0400 Subject: SUNRPC: Replace internal use of SOCKWQ_ASYNC_NOSPACE The socket's SOCKWQ_ASYNC_NOSPACE can be cleared by various actors in the socket layer, so replace it with our own flag in the transport sock_state field. Reported-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtsock.h | 1 + net/sunrpc/xprtsock.c | 22 ++++------------------ 2 files changed, 5 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 689062afdd61..3eb0079669c5 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -90,5 +90,6 @@ struct sock_xprt { #define XPRT_SOCK_WAKE_PENDING (6) #define XPRT_SOCK_WAKE_DISCONNECT (7) #define XPRT_SOCK_CONNECT_SENT (8) +#define XPRT_SOCK_NOSPACE (9) #endif /* _LINUX_SUNRPC_XPRTSOCK_H */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 68eee352d69a..2450b31b807a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -780,14 +780,8 @@ static int xs_nospace(struct rpc_rqst *req, struct sock_xprt *transport) /* Don't race with disconnect */ if (xprt_connected(xprt)) { - struct socket_wq *wq; - - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags); - rcu_read_unlock(); - /* wait for more buffer space */ + set_bit(XPRT_SOCK_NOSPACE, &transport->sock_state); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; xprt_wait_for_buffer_space(xprt); @@ -1151,6 +1145,7 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state); + clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state); } static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr) @@ -1497,7 +1492,6 @@ static void xs_tcp_state_change(struct sock *sk) static void xs_write_space(struct sock *sk) { - struct socket_wq *wq; struct sock_xprt *transport; struct rpc_xprt *xprt; @@ -1508,15 +1502,10 @@ static void xs_write_space(struct sock *sk) if (unlikely(!(xprt = xprt_from_sock(sk)))) return; transport = container_of(xprt, struct sock_xprt, xprt); - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) - goto out; - + if (!test_and_clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state)) + return; xs_run_error_worker(transport, XPRT_SOCK_WAKE_WRITE); sk->sk_write_pending--; -out: - rcu_read_unlock(); } /** @@ -1857,7 +1846,6 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; - sock_set_flag(sk, SOCK_FASYNC); sk->sk_error_report = xs_error_report; xprt_clear_connected(xprt); @@ -2051,7 +2039,6 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; - sock_set_flag(sk, SOCK_FASYNC); xprt_set_connected(xprt); @@ -2218,7 +2205,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_data_ready = xs_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; - sock_set_flag(sk, SOCK_FASYNC); sk->sk_error_report = xs_error_report; /* socket options */ -- cgit v1.2.3 From 33e5c765bc1ea5e06ea7603637f14d727e6fcdf3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 14 Mar 2022 22:02:22 -0400 Subject: NFS: Fix memory allocation in rpc_malloc() When in a low memory situation, we do want rpciod to kick off direct reclaim in the case where that helps, however we don't want it looping forever in mempool_alloc(). So first try allocating from the slab using GFP_KERNEL | __GFP_NORETRY, and then fall back to a GFP_NOWAIT allocation from the mempool. Ditto for rpc_alloc_task() Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 1 + net/sunrpc/sched.c | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 56710f8056d3..1d7a3e51b795 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -262,6 +262,7 @@ void rpc_destroy_mempool(void); extern struct workqueue_struct *rpciod_workqueue; extern struct workqueue_struct *xprtiod_workqueue; void rpc_prepare_task(struct rpc_task *task); +gfp_t rpc_task_gfp_mask(void); static inline int rpc_wait_for_completion_task(struct rpc_task *task) { diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 7c8f87ebdbc0..d59a033820be 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -57,6 +57,13 @@ struct workqueue_struct *rpciod_workqueue __read_mostly; struct workqueue_struct *xprtiod_workqueue __read_mostly; EXPORT_SYMBOL_GPL(xprtiod_workqueue); +gfp_t rpc_task_gfp_mask(void) +{ + if (current->flags & PF_WQ_WORKER) + return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; + return GFP_KERNEL; +} + unsigned long rpc_task_timeout(const struct rpc_task *task) { @@ -1030,15 +1037,15 @@ int rpc_malloc(struct rpc_task *task) struct rpc_rqst *rqst = task->tk_rqstp; size_t size = rqst->rq_callsize + rqst->rq_rcvsize; struct rpc_buffer *buf; - gfp_t gfp = GFP_KERNEL; - - if (RPC_IS_ASYNC(task)) - gfp = GFP_NOWAIT | __GFP_NOWARN; + gfp_t gfp = rpc_task_gfp_mask(); size += sizeof(struct rpc_buffer); - if (size <= RPC_BUFFER_MAXSIZE) - buf = mempool_alloc(rpc_buffer_mempool, gfp); - else + if (size <= RPC_BUFFER_MAXSIZE) { + buf = kmem_cache_alloc(rpc_buffer_slabp, gfp); + /* Reach for the mempool if dynamic allocation fails */ + if (!buf && RPC_IS_ASYNC(task)) + buf = mempool_alloc(rpc_buffer_mempool, GFP_NOWAIT); + } else buf = kmalloc(size, gfp); if (!buf) -- cgit v1.2.3 From 515dcdcd48736576c6f5c197814da6f81c60a21e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 21 Mar 2022 12:34:19 -0400 Subject: NFS: nfsiod should not block forever in mempool_alloc() The concern is that since nfsiod is sometimes required to kick off a commit, it can get locked up waiting forever in mempool_alloc() instead of failing gracefully and leaving the commit until later. Try to allocate from the slab first, with GFP_KERNEL | __GFP_NORETRY, then fall back to a non-blocking attempt to allocate from the memory pool. Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 7 +++++++ fs/nfs/pnfs_nfs.c | 8 ++++++-- fs/nfs/write.c | 24 +++++++++--------------- include/linux/nfs_fs.h | 2 +- 4 files changed, 23 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 194840a97e3a..57b0497105c8 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -587,6 +587,13 @@ nfs_write_match_verf(const struct nfs_writeverf *verf, !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier); } +static inline gfp_t nfs_io_gfp_mask(void) +{ + if (current->flags & PF_WQ_WORKER) + return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; + return GFP_KERNEL; +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 316f68f96e57..657c242a18ff 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -419,7 +419,7 @@ static struct nfs_commit_data * pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo) { - struct nfs_commit_data *data = nfs_commitdata_alloc(false); + struct nfs_commit_data *data = nfs_commitdata_alloc(); if (!data) return NULL; @@ -515,7 +515,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, unsigned int nreq = 0; if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(mds_pages, NULL, cinfo, -1); + return -ENOMEM; + } data->ds_commit_index = -1; list_splice_init(mds_pages, &data->pages); list_add_tail(&data->list, &list); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 599a82406d38..ef47e3700e4b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -70,27 +70,17 @@ static mempool_t *nfs_wdata_mempool; static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail) +struct nfs_commit_data *nfs_commitdata_alloc(void) { struct nfs_commit_data *p; - if (never_fail) - p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); - else { - /* It is OK to do some reclaim, not no safe to wait - * for anything to be returned to the pool. - * mempool_alloc() cannot handle that particular combination, - * so we need two separate attempts. - */ + p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask()); + if (!p) { p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT); - if (!p) - p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO | - __GFP_NOWARN | __GFP_NORETRY); if (!p) return NULL; + memset(p, 0, sizeof(*p)); } - - memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); return p; } @@ -1826,7 +1816,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, if (list_empty(head)) return 0; - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(head, NULL, cinfo, -1); + return -ENOMEM; + } /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index c47c448befc8..db305abafc9e 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -580,7 +580,7 @@ extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page *page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); -extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); +extern struct nfs_commit_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_commit_data *data); bool nfs_commit_end(struct nfs_mds_commit_info *cinfo); -- cgit v1.2.3 From 421ab1be43bd015ffe744f4ea25df4f19d1ce6fe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 25 Mar 2022 10:37:31 -0400 Subject: SUNRPC: Do not dereference non-socket transports in sysfs Do not cast the struct xprt to a sock_xprt unless we know it is a UDP or TCP transport. Otherwise the call to lock the mutex will scribble over whatever structure is actually there. This has been seen to cause hard system lockups when the underlying transport was RDMA. Fixes: b49ea673e119 ("SUNRPC: lock against ->sock changing during sysfs read") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 +++ include/linux/sunrpc/xprtsock.h | 1 - net/sunrpc/sysfs.c | 55 ++++++++++++++++++++--------------------- net/sunrpc/xprtsock.c | 26 +++++++++++++++++-- 4 files changed, 54 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 955ea4d7af0b..eef5e87c03b4 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -139,6 +139,9 @@ struct rpc_xprt_ops { void (*rpcbind)(struct rpc_task *task); void (*set_port)(struct rpc_xprt *xprt, unsigned short port); void (*connect)(struct rpc_xprt *xprt, struct rpc_task *task); + int (*get_srcaddr)(struct rpc_xprt *xprt, char *buf, + size_t buflen); + unsigned short (*get_srcport)(struct rpc_xprt *xprt); int (*buf_alloc)(struct rpc_task *task); void (*buf_free)(struct rpc_task *task); void (*prepare_request)(struct rpc_rqst *req); diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 3eb0079669c5..38284f25eddf 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -10,7 +10,6 @@ int init_socket_xprt(void); void cleanup_socket_xprt(void); -unsigned short get_srcport(struct rpc_xprt *); #define RPC_MIN_RESVPORT (1U) #define RPC_MAX_RESVPORT (65535U) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 05c758da6a92..9d8a7d9f3e41 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -97,7 +97,7 @@ static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, return 0; ret = sprintf(buf, "%s\n", xprt->address_strings[RPC_DISPLAY_ADDR]); xprt_put(xprt); - return ret + 1; + return ret; } static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, @@ -105,33 +105,31 @@ static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); - struct sockaddr_storage saddr; - struct sock_xprt *sock; - ssize_t ret = -1; + size_t buflen = PAGE_SIZE; + ssize_t ret = -ENOTSOCK; if (!xprt || !xprt_connected(xprt)) { - xprt_put(xprt); - return -ENOTCONN; + ret = -ENOTCONN; + } else if (xprt->ops->get_srcaddr) { + ret = xprt->ops->get_srcaddr(xprt, buf, buflen); + if (ret > 0) { + if (ret < buflen - 1) { + buf[ret] = '\n'; + ret++; + buf[ret] = '\0'; + } + } } - - sock = container_of(xprt, struct sock_xprt, xprt); - mutex_lock(&sock->recv_mutex); - if (sock->sock == NULL || - kernel_getsockname(sock->sock, (struct sockaddr *)&saddr) < 0) - goto out; - - ret = sprintf(buf, "%pISc\n", &saddr); -out: - mutex_unlock(&sock->recv_mutex); xprt_put(xprt); - return ret + 1; + return ret; } static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) + struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + unsigned short srcport = 0; + size_t buflen = PAGE_SIZE; ssize_t ret; if (!xprt || !xprt_connected(xprt)) { @@ -139,7 +137,11 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, return -ENOTCONN; } - ret = sprintf(buf, "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n" + if (xprt->ops->get_srcport) + srcport = xprt->ops->get_srcport(xprt); + + ret = snprintf(buf, buflen, + "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n" "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n" "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n" "backlog_q_len=%u\nmain_xprt=%d\nsrc_port=%u\n" @@ -147,14 +149,11 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, xprt->last_used, xprt->cong, xprt->cwnd, xprt->max_reqs, xprt->min_reqs, xprt->num_reqs, xprt->binding.qlen, xprt->sending.qlen, xprt->pending.qlen, - xprt->backlog.qlen, xprt->main, - (xprt->xprt_class->ident == XPRT_TRANSPORT_TCP) ? - get_srcport(xprt) : 0, + xprt->backlog.qlen, xprt->main, srcport, atomic_long_read(&xprt->queuelen), - (xprt->xprt_class->ident == XPRT_TRANSPORT_TCP) ? - xprt->address_strings[RPC_DISPLAY_PORT] : "0"); + xprt->address_strings[RPC_DISPLAY_PORT]); xprt_put(xprt); - return ret + 1; + return ret; } static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, @@ -201,7 +200,7 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, } xprt_put(xprt); - return ret + 1; + return ret; } static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, @@ -220,7 +219,7 @@ static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, xprt_switch->xps_nunique_destaddr_xprts, atomic_long_read(&xprt_switch->xps_queuelen)); xprt_switch_put(xprt_switch); - return ret + 1; + return ret; } static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index b52eaa8a0cda..78af7518f263 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1638,7 +1638,7 @@ static int xs_get_srcport(struct sock_xprt *transport) return port; } -unsigned short get_srcport(struct rpc_xprt *xprt) +static unsigned short xs_sock_srcport(struct rpc_xprt *xprt) { struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); unsigned short ret = 0; @@ -1648,7 +1648,25 @@ unsigned short get_srcport(struct rpc_xprt *xprt) mutex_unlock(&sock->recv_mutex); return ret; } -EXPORT_SYMBOL(get_srcport); + +static int xs_sock_srcaddr(struct rpc_xprt *xprt, char *buf, size_t buflen) +{ + struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); + union { + struct sockaddr sa; + struct sockaddr_storage st; + } saddr; + int ret = -ENOTCONN; + + mutex_lock(&sock->recv_mutex); + if (sock->sock) { + ret = kernel_getsockname(sock->sock, &saddr.sa); + if (ret >= 0) + ret = snprintf(buf, buflen, "%pISc", &saddr.sa); + } + mutex_unlock(&sock->recv_mutex); + return ret; +} static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port) { @@ -2622,6 +2640,8 @@ static const struct rpc_xprt_ops xs_udp_ops = { .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, + .get_srcaddr = xs_sock_srcaddr, + .get_srcport = xs_sock_srcport, .buf_alloc = rpc_malloc, .buf_free = rpc_free, .send_request = xs_udp_send_request, @@ -2644,6 +2664,8 @@ static const struct rpc_xprt_ops xs_tcp_ops = { .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, + .get_srcaddr = xs_sock_srcaddr, + .get_srcport = xs_sock_srcport, .buf_alloc = rpc_malloc, .buf_free = rpc_free, .prepare_request = xs_stream_prepare_request, -- cgit v1.2.3