From b8bea9f6cdd7236c7c2238d022145e9b2f8aac22 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 5 Jan 2023 07:15:09 -0500 Subject: nfsd: don't open-code clear_and_wake_up_bit Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 6e8712bd7c99..5b5d39ec7b01 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1174,9 +1174,7 @@ open_file: status = nfserr_jukebox; if (status != nfs_ok) nfsd_file_unhash(nf); - clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags); - smp_mb__after_atomic(); - wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); + clear_and_wake_up_bit(NFSD_FILE_PENDING, &nf->nf_flags); goto out; } -- cgit v1.2.3 From 6c31e4c98853a4ba47355ea151b36a77c42b7734 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 6 Jan 2023 10:39:00 -0500 Subject: nfsd: NFSD_FILE_KEY_INODE only needs to find GC'ed entries Since v4 files are expected to be long-lived, there's little value in closing them out of the cache when there is conflicting access. Change the comparator to also match the gc value in the key. Change both of the current users of that key to set the gc value in the key to "true". Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 5b5d39ec7b01..c36e3032d438 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -175,6 +175,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg, switch (key->type) { case NFSD_FILE_KEY_INODE: + if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc) + return 1; if (nf->nf_inode != key->inode) return 1; break; @@ -695,6 +697,7 @@ nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose) struct nfsd_file_lookup_key key = { .type = NFSD_FILE_KEY_INODE, .inode = inode, + .gc = true, }; struct nfsd_file *nf; @@ -1049,6 +1052,7 @@ nfsd_file_is_cached(struct inode *inode) struct nfsd_file_lookup_key key = { .type = NFSD_FILE_KEY_INODE, .inode = inode, + .gc = true, }; bool ret = false; -- cgit v1.2.3 From d69b8dbfd0866abc5ec84652cc1c10fc3d4d91ef Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 6 Jan 2023 10:39:01 -0500 Subject: nfsd: simplify test_bit return in NFSD_FILE_KEY_FULL comparator test_bit returns bool, so we can just compare the result of that to the key->gc value without the "!!". Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index c36e3032d438..568963b8a477 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -189,7 +189,7 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg, return 1; if (!nfsd_match_cred(nf->nf_cred, key->cred)) return 1; - if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc) + if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc) return 1; if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) return 1; -- cgit v1.2.3 From c6593366c0bf222be9c7561354dfb921c611745e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 5 Jan 2023 07:15:11 -0500 Subject: nfsd: don't kill nfsd_files because of lease break error An error from break_lease is non-fatal, so we needn't destroy the nfsd_file in that case. Just put the reference like we normally would and return the error. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 568963b8a477..ab37b85b7207 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1102,7 +1102,7 @@ retry: nf = nfsd_file_alloc(&key, may_flags); if (!nf) { status = nfserr_jukebox; - goto out_status; + goto out; } ret = rhashtable_lookup_insert_key(&nfsd_file_rhash_tbl, @@ -1111,13 +1111,11 @@ retry: if (likely(ret == 0)) goto open_file; - nfsd_file_slab_free(&nf->nf_rcu); - nf = NULL; if (ret == -EEXIST) goto retry; trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, ret); status = nfserr_jukebox; - goto out_status; + goto construction_err; wait_for_construction: wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE); @@ -1127,29 +1125,25 @@ wait_for_construction: trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf); if (!open_retry) { status = nfserr_jukebox; - goto out; + goto construction_err; } open_retry = false; - if (refcount_dec_and_test(&nf->nf_ref)) - nfsd_file_free(nf); goto retry; } - this_cpu_inc(nfsd_file_cache_hits); status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags)); + if (status != nfs_ok) { + nfsd_file_put(nf); + nf = NULL; + } + out: if (status == nfs_ok) { this_cpu_inc(nfsd_file_acquisitions); nfsd_file_check_write_error(nf); *pnf = nf; - } else { - if (refcount_dec_and_test(&nf->nf_ref)) - nfsd_file_free(nf); - nf = NULL; } - -out_status: put_cred(key.cred); trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status); return status; @@ -1179,6 +1173,13 @@ open_file: if (status != nfs_ok) nfsd_file_unhash(nf); clear_and_wake_up_bit(NFSD_FILE_PENDING, &nf->nf_flags); + if (status == nfs_ok) + goto out; + +construction_err: + if (refcount_dec_and_test(&nf->nf_ref)) + nfsd_file_free(nf); + nf = NULL; goto out; } -- cgit v1.2.3 From b680cb9b737331aad271feebbedafb865504e234 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 5 Jan 2023 07:15:12 -0500 Subject: nfsd: add some comments to nfsd_file_do_acquire David Howells mentioned that he found this bit of code confusing, so sprinkle in some comments to clarify. Reported-by: David Howells Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ab37b85b7207..50349449a4e5 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1094,6 +1094,11 @@ retry: rcu_read_unlock(); if (nf) { + /* + * If the nf is on the LRU then it holds an extra reference + * that must be put if it's removed. It had better not be + * the last one however, since we should hold another. + */ if (nfsd_file_lru_remove(nf)) WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref)); goto wait_for_construction; -- cgit v1.2.3 From b2ff1bd71db2a1b193a6dde0845adcd69cbcf75e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 18 Jan 2023 12:31:37 -0500 Subject: nfsd: don't take/put an extra reference when putting a file The last thing that filp_close does is an fput, so don't bother taking and putting the extra reference. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 50349449a4e5..51e2947c21a7 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -382,10 +382,8 @@ nfsd_file_free(struct nfsd_file *nf) if (nf->nf_mark) nfsd_file_mark_put(nf->nf_mark); if (nf->nf_file) { - get_file(nf->nf_file); - filp_close(nf->nf_file, NULL); nfsd_file_check_write_error(nf); - fput(nf->nf_file); + filp_close(nf->nf_file, NULL); } /* -- cgit v1.2.3 From 972cc0e0924598cb293b919d39c848dc038b2c28 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 26 Jan 2023 12:21:16 -0500 Subject: nfsd: update comment over __nfsd_file_cache_purge Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 51e2947c21a7..9b7082fdd211 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -907,7 +907,8 @@ out_err: * @net: net-namespace to shut down the cache (may be NULL) * * Walk the nfsd_file cache and close out any that match @net. If @net is NULL, - * then close out everything. Called when an nfsd instance is being shut down. + * then close out everything. Called when an nfsd instance is being shut down, + * and when the exports table is flushed. */ static void __nfsd_file_cache_purge(struct net *net) -- cgit v1.2.3 From dcb779fcd4ed5984ad15991d574943d12a8693d1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 15 Feb 2023 06:53:54 -0500 Subject: nfsd: allow reaping files still under writeback On most filesystems, there is no reason to delay reaping an nfsd_file just because its underlying inode is still under writeback. nfsd just relies on client activity or the local flusher threads to do writeback. The main exception is NFS, which flushes all of its dirty data on last close. Add a new EXPORT_OP_FLUSH_ON_CLOSE flag to allow filesystems to signal that they do this, and only skip closing files under writeback on such filesystems. Also, remove a redundant NULL file pointer check in nfsd_file_check_writeback, and clean up nfs's export op flag definitions. Signed-off-by: Jeff Layton Acked-by: Anna Schumaker Signed-off-by: Chuck Lever --- fs/nfs/export.c | 9 ++++++--- fs/nfsd/filecache.c | 12 +++++++++++- include/linux/exportfs.h | 1 + 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/nfs/export.c b/fs/nfs/export.c index d6a6d1ebb8fd..be686b8e0c54 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -149,7 +149,10 @@ const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, - .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| - EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| - EXPORT_OP_NOATOMIC_ATTR, + .flags = EXPORT_OP_NOWCC | + EXPORT_OP_NOSUBTREECHK | + EXPORT_OP_CLOSE_BEFORE_UNLINK | + EXPORT_OP_REMOTE_FS | + EXPORT_OP_NOATOMIC_ATTR | + EXPORT_OP_FLUSH_ON_CLOSE, }; diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 9b7082fdd211..a6fa6e980277 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -402,13 +402,23 @@ nfsd_file_check_writeback(struct nfsd_file *nf) struct file *file = nf->nf_file; struct address_space *mapping; - if (!file || !(file->f_mode & FMODE_WRITE)) + /* File not open for write? */ + if (!(file->f_mode & FMODE_WRITE)) return false; + + /* + * Some filesystems (e.g. NFS) flush all dirty data on close. + * On others, there is no need to wait for writeback. + */ + if (!(file_inode(file)->i_sb->s_export_op->flags & EXPORT_OP_FLUSH_ON_CLOSE)) + return false; + mapping = file->f_mapping; return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); } + static bool nfsd_file_lru_add(struct nfsd_file *nf) { set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 601700fedc91..9edb29101ec8 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -220,6 +220,7 @@ struct export_operations { #define EXPORT_OP_NOATOMIC_ATTR (0x10) /* Filesystem cannot supply atomic attribute updates */ +#define EXPORT_OP_FLUSH_ON_CLOSE (0x20) /* fs flushes file data on close */ unsigned long flags; }; -- cgit v1.2.3 From c4c649ab413ba6a785b25f0edbb12f617c87db2a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 24 Nov 2022 15:09:04 -0500 Subject: NFSD: Convert filecache to rhltable While we were converting the nfs4_file hashtable to use the kernel's resizable hashtable data structure, Neil Brown observed that the list variant (rhltable) would be better for managing nfsd_file items as well. The nfsd_file hash table will contain multiple entries for the same inode -- these should be kept together on a list. And, it could be possible for exotic or malicious client behavior to cause the hash table to resize itself on every insertion. A nice simplification is that rhltable_lookup() can return a list that contains only nfsd_file items that match a given inode, which enables us to eliminate specialized hash table helper functions and use the default functions provided by the rhashtable implementation). Since we are now storing nfsd_file items for the same inode on a single list, that effectively reduces the number of hash entries that have to be tracked in the hash table. The mininum bucket count is therefore lowered. Light testing with fstests generic/531 show no regressions. Suggested-by: Neil Brown Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 311 +++++++++++++++++++++------------------------------- fs/nfsd/filecache.h | 9 +- 2 files changed, 133 insertions(+), 187 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index a6fa6e980277..2f0b2d964cbb 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -74,70 +74,9 @@ static struct list_lru nfsd_file_lru; static unsigned long nfsd_file_flags; static struct fsnotify_group *nfsd_file_fsnotify_group; static struct delayed_work nfsd_filecache_laundrette; -static struct rhashtable nfsd_file_rhash_tbl +static struct rhltable nfsd_file_rhltable ____cacheline_aligned_in_smp; -enum nfsd_file_lookup_type { - NFSD_FILE_KEY_INODE, - NFSD_FILE_KEY_FULL, -}; - -struct nfsd_file_lookup_key { - struct inode *inode; - struct net *net; - const struct cred *cred; - unsigned char need; - bool gc; - enum nfsd_file_lookup_type type; -}; - -/* - * The returned hash value is based solely on the address of an in-code - * inode, a pointer to a slab-allocated object. The entropy in such a - * pointer is concentrated in its middle bits. - */ -static u32 nfsd_file_inode_hash(const struct inode *inode, u32 seed) -{ - unsigned long ptr = (unsigned long)inode; - u32 k; - - k = ptr >> L1_CACHE_SHIFT; - k &= 0x00ffffff; - return jhash2(&k, 1, seed); -} - -/** - * nfsd_file_key_hashfn - Compute the hash value of a lookup key - * @data: key on which to compute the hash value - * @len: rhash table's key_len parameter (unused) - * @seed: rhash table's random seed of the day - * - * Return value: - * Computed 32-bit hash value - */ -static u32 nfsd_file_key_hashfn(const void *data, u32 len, u32 seed) -{ - const struct nfsd_file_lookup_key *key = data; - - return nfsd_file_inode_hash(key->inode, seed); -} - -/** - * nfsd_file_obj_hashfn - Compute the hash value of an nfsd_file - * @data: object on which to compute the hash value - * @len: rhash table's key_len parameter (unused) - * @seed: rhash table's random seed of the day - * - * Return value: - * Computed 32-bit hash value - */ -static u32 nfsd_file_obj_hashfn(const void *data, u32 len, u32 seed) -{ - const struct nfsd_file *nf = data; - - return nfsd_file_inode_hash(nf->nf_inode, seed); -} - static bool nfsd_match_cred(const struct cred *c1, const struct cred *c2) { @@ -158,55 +97,16 @@ nfsd_match_cred(const struct cred *c1, const struct cred *c2) return true; } -/** - * nfsd_file_obj_cmpfn - Match a cache item against search criteria - * @arg: search criteria - * @ptr: cache item to check - * - * Return values: - * %0 - Item matches search criteria - * %1 - Item does not match search criteria - */ -static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg, - const void *ptr) -{ - const struct nfsd_file_lookup_key *key = arg->key; - const struct nfsd_file *nf = ptr; - - switch (key->type) { - case NFSD_FILE_KEY_INODE: - if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc) - return 1; - if (nf->nf_inode != key->inode) - return 1; - break; - case NFSD_FILE_KEY_FULL: - if (nf->nf_inode != key->inode) - return 1; - if (nf->nf_may != key->need) - return 1; - if (nf->nf_net != key->net) - return 1; - if (!nfsd_match_cred(nf->nf_cred, key->cred)) - return 1; - if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc) - return 1; - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) - return 1; - break; - } - return 0; -} - static const struct rhashtable_params nfsd_file_rhash_params = { .key_len = sizeof_field(struct nfsd_file, nf_inode), .key_offset = offsetof(struct nfsd_file, nf_inode), - .head_offset = offsetof(struct nfsd_file, nf_rhash), - .hashfn = nfsd_file_key_hashfn, - .obj_hashfn = nfsd_file_obj_hashfn, - .obj_cmpfn = nfsd_file_obj_cmpfn, - /* Reduce resizing churn on light workloads */ - .min_size = 512, /* buckets */ + .head_offset = offsetof(struct nfsd_file, nf_rlist), + + /* + * Start with a single page hash table to reduce resizing churn + * on light workloads. + */ + .min_size = 256, .automatic_shrinking = true, }; @@ -309,27 +209,27 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode) } static struct nfsd_file * -nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) +nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, + bool want_gc) { struct nfsd_file *nf; nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL); - if (nf) { - INIT_LIST_HEAD(&nf->nf_lru); - nf->nf_birthtime = ktime_get(); - nf->nf_file = NULL; - nf->nf_cred = get_current_cred(); - nf->nf_net = key->net; - nf->nf_flags = 0; - __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); - __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); - if (key->gc) - __set_bit(NFSD_FILE_GC, &nf->nf_flags); - nf->nf_inode = key->inode; - refcount_set(&nf->nf_ref, 1); - nf->nf_may = key->need; - nf->nf_mark = NULL; - } + if (unlikely(!nf)) + return NULL; + + INIT_LIST_HEAD(&nf->nf_lru); + nf->nf_birthtime = ktime_get(); + nf->nf_file = NULL; + nf->nf_cred = get_current_cred(); + nf->nf_net = net; + nf->nf_flags = want_gc ? + BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING) | BIT(NFSD_FILE_GC) : + BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING); + nf->nf_inode = inode; + refcount_set(&nf->nf_ref, 1); + nf->nf_may = need; + nf->nf_mark = NULL; return nf; } @@ -354,8 +254,8 @@ static void nfsd_file_hash_remove(struct nfsd_file *nf) { trace_nfsd_file_unhash(nf); - rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, - nfsd_file_rhash_params); + rhltable_remove(&nfsd_file_rhltable, &nf->nf_rlist, + nfsd_file_rhash_params); } static bool @@ -688,8 +588,8 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose) * @inode: inode on which to close out nfsd_files * @dispose: list on which to gather nfsd_files to close out * - * An nfsd_file represents a struct file being held open on behalf of nfsd. An - * open file however can block other activity (such as leases), or cause + * An nfsd_file represents a struct file being held open on behalf of nfsd. + * An open file however can block other activity (such as leases), or cause * undesirable behavior (e.g. spurious silly-renames when reexporting NFS). * * This function is intended to find open nfsd_files when this sort of @@ -702,21 +602,17 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose) static void nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose) { - struct nfsd_file_lookup_key key = { - .type = NFSD_FILE_KEY_INODE, - .inode = inode, - .gc = true, - }; + struct rhlist_head *tmp, *list; struct nfsd_file *nf; rcu_read_lock(); - do { - nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, - nfsd_file_rhash_params); - if (!nf) - break; + list = rhltable_lookup(&nfsd_file_rhltable, &inode, + nfsd_file_rhash_params); + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) { + if (!test_bit(NFSD_FILE_GC, &nf->nf_flags)) + continue; nfsd_file_cond_queue(nf, dispose); - } while (1); + } rcu_read_unlock(); } @@ -840,7 +736,7 @@ nfsd_file_cache_init(void) if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) return 0; - ret = rhashtable_init(&nfsd_file_rhash_tbl, &nfsd_file_rhash_params); + ret = rhltable_init(&nfsd_file_rhltable, &nfsd_file_rhash_params); if (ret) return ret; @@ -908,7 +804,7 @@ out_err: nfsd_file_mark_slab = NULL; destroy_workqueue(nfsd_filecache_wq); nfsd_filecache_wq = NULL; - rhashtable_destroy(&nfsd_file_rhash_tbl); + rhltable_destroy(&nfsd_file_rhltable); goto out; } @@ -927,7 +823,7 @@ __nfsd_file_cache_purge(struct net *net) struct nfsd_file *nf; LIST_HEAD(dispose); - rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter); + rhltable_walk_enter(&nfsd_file_rhltable, &iter); do { rhashtable_walk_start(&iter); @@ -1033,7 +929,7 @@ nfsd_file_cache_shutdown(void) nfsd_file_mark_slab = NULL; destroy_workqueue(nfsd_filecache_wq); nfsd_filecache_wq = NULL; - rhashtable_destroy(&nfsd_file_rhash_tbl); + rhltable_destroy(&nfsd_file_rhltable); for_each_possible_cpu(i) { per_cpu(nfsd_file_cache_hits, i) = 0; @@ -1044,6 +940,35 @@ nfsd_file_cache_shutdown(void) } } +static struct nfsd_file * +nfsd_file_lookup_locked(const struct net *net, const struct cred *cred, + struct inode *inode, unsigned char need, + bool want_gc) +{ + struct rhlist_head *tmp, *list; + struct nfsd_file *nf; + + list = rhltable_lookup(&nfsd_file_rhltable, &inode, + nfsd_file_rhash_params); + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) { + if (nf->nf_may != need) + continue; + if (nf->nf_net != net) + continue; + if (!nfsd_match_cred(nf->nf_cred, cred)) + continue; + if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != want_gc) + continue; + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) + continue; + + if (!nfsd_file_get(nf)) + continue; + return nf; + } + return NULL; +} + /** * nfsd_file_is_cached - are there any cached open files for this inode? * @inode: inode to check @@ -1058,16 +983,20 @@ nfsd_file_cache_shutdown(void) bool nfsd_file_is_cached(struct inode *inode) { - struct nfsd_file_lookup_key key = { - .type = NFSD_FILE_KEY_INODE, - .inode = inode, - .gc = true, - }; + struct rhlist_head *tmp, *list; + struct nfsd_file *nf; bool ret = false; - if (rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key, - nfsd_file_rhash_params) != NULL) - ret = true; + rcu_read_lock(); + list = rhltable_lookup(&nfsd_file_rhltable, &inode, + nfsd_file_rhash_params); + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) { + ret = true; + break; + } + rcu_read_unlock(); + trace_nfsd_file_is_cached(inode, (int)ret); return ret; } @@ -1077,14 +1006,12 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct file *file, struct nfsd_file **pnf, bool want_gc) { - struct nfsd_file_lookup_key key = { - .type = NFSD_FILE_KEY_FULL, - .need = may_flags & NFSD_FILE_MAY_MASK, - .net = SVC_NET(rqstp), - .gc = want_gc, - }; + unsigned char need = may_flags & NFSD_FILE_MAY_MASK; + struct net *net = SVC_NET(rqstp); + struct nfsd_file *new, *nf; + const struct cred *cred; bool open_retry = true; - struct nfsd_file *nf; + struct inode *inode; __be32 status; int ret; @@ -1092,14 +1019,12 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, may_flags|NFSD_MAY_OWNER_OVERRIDE); if (status != nfs_ok) return status; - key.inode = d_inode(fhp->fh_dentry); - key.cred = get_current_cred(); + inode = d_inode(fhp->fh_dentry); + cred = get_current_cred(); retry: rcu_read_lock(); - nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, - nfsd_file_rhash_params); - nf = nfsd_file_get(nf); + nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); rcu_read_unlock(); if (nf) { @@ -1113,21 +1038,32 @@ retry: goto wait_for_construction; } - nf = nfsd_file_alloc(&key, may_flags); - if (!nf) { + new = nfsd_file_alloc(net, inode, need, want_gc); + if (!new) { status = nfserr_jukebox; goto out; } - ret = rhashtable_lookup_insert_key(&nfsd_file_rhash_tbl, - &key, &nf->nf_rhash, - nfsd_file_rhash_params); + rcu_read_lock(); + spin_lock(&inode->i_lock); + nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); + if (unlikely(nf)) { + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + nfsd_file_slab_free(&new->nf_rcu); + goto wait_for_construction; + } + nf = new; + ret = rhltable_insert(&nfsd_file_rhltable, &nf->nf_rlist, + nfsd_file_rhash_params); + spin_unlock(&inode->i_lock); + rcu_read_unlock(); if (likely(ret == 0)) goto open_file; if (ret == -EEXIST) goto retry; - trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, ret); + trace_nfsd_file_insert_err(rqstp, inode, may_flags, ret); status = nfserr_jukebox; goto construction_err; @@ -1136,7 +1072,7 @@ wait_for_construction: /* Did construction of this file fail? */ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { - trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf); + trace_nfsd_file_cons_err(rqstp, inode, may_flags, nf); if (!open_retry) { status = nfserr_jukebox; goto construction_err; @@ -1158,13 +1094,13 @@ out: nfsd_file_check_write_error(nf); *pnf = nf; } - put_cred(key.cred); - trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status); + put_cred(cred); + trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status); return status; open_file: trace_nfsd_file_alloc(nf); - nf->nf_mark = nfsd_file_mark_find_or_create(nf, key.inode); + nf->nf_mark = nfsd_file_mark_find_or_create(nf, inode); if (nf->nf_mark) { if (file) { get_file(file); @@ -1182,7 +1118,7 @@ open_file: * If construction failed, or we raced with a call to unlink() * then unhash. */ - if (status == nfs_ok && key.inode->i_nlink == 0) + if (status != nfs_ok || inode->i_nlink == 0) status = nfserr_jukebox; if (status != nfs_ok) nfsd_file_unhash(nf); @@ -1209,8 +1145,11 @@ construction_err: * seconds after the final nfsd_file_put() in case the caller * wants to re-use it. * - * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in - * network byte order is returned. + * Return values: + * %nfs_ok - @pnf points to an nfsd_file with its reference + * count boosted. + * + * On error, an nfsstat value in network byte order is returned. */ __be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -1230,8 +1169,11 @@ nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, * but not garbage-collected. The object is unhashed after the * final nfsd_file_put(). * - * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in - * network byte order is returned. + * Return values: + * %nfs_ok - @pnf points to an nfsd_file with its reference + * count boosted. + * + * On error, an nfsstat value in network byte order is returned. */ __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -1252,8 +1194,11 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, * and @file is non-NULL, use it to instantiate a new nfsd_file instead of * opening a new one. * - * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in - * network byte order is returned. + * Return values: + * %nfs_ok - @pnf points to an nfsd_file with its reference + * count boosted. + * + * On error, an nfsstat value in network byte order is returned. */ __be32 nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -1284,7 +1229,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) lru = list_lru_count(&nfsd_file_lru); rcu_read_lock(); - ht = &nfsd_file_rhash_tbl; + ht = &nfsd_file_rhltable.ht; count = atomic_read(&ht->nelems); tbl = rht_dereference_rcu(ht->tbl, ht); buckets = tbl->size; @@ -1300,7 +1245,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) evictions += per_cpu(nfsd_file_evictions, i); } - seq_printf(m, "total entries: %u\n", count); + seq_printf(m, "total inodes: %u\n", count); seq_printf(m, "hash buckets: %u\n", buckets); seq_printf(m, "lru entries: %lu\n", lru); seq_printf(m, "cache hits: %lu\n", hits); diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 41516a4263ea..e54165a3224f 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -29,9 +29,8 @@ struct nfsd_file_mark { * never be dereferenced, only used for comparison. */ struct nfsd_file { - struct rhash_head nf_rhash; - struct list_head nf_lru; - struct rcu_head nf_rcu; + struct rhlist_head nf_rlist; + void *nf_inode; struct file *nf_file; const struct cred *nf_cred; struct net *nf_net; @@ -40,10 +39,12 @@ struct nfsd_file { #define NFSD_FILE_REFERENCED (2) #define NFSD_FILE_GC (3) unsigned long nf_flags; - struct inode *nf_inode; /* don't deref */ refcount_t nf_ref; unsigned char nf_may; + struct nfsd_file_mark *nf_mark; + struct list_head nf_lru; + struct rcu_head nf_rcu; ktime_t nf_birthtime; }; -- cgit v1.2.3 From bfca7a6f0c75f5d97f5efc2c80cca55bdbf5f79a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Mar 2023 07:15:57 -0500 Subject: lockd: purge resources held on behalf of nlm clients when shutting down It's easily possible for the server to have an outstanding lock when we go to shut down. When that happens, we often get a warning like this in the kernel log: lockd: couldn't shutdown host module for net f0000000! This is because the shutdown procedures skip removing any hosts that still have outstanding resources (locks). Eventually, things seem to get cleaned up anyway, but the log message is unsettling, and server shutdown doesn't seem to be working the way it was intended. Ensure that we tear down any resources held on behalf of a client when tearing one down for server shutdown. Reported-by: Yongcheng Yang Link: https://bugzilla.redhat.com/show_bug.cgi?id=2063818 Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/host.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index cdc8e12cdac4..127a728fcbc8 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -629,6 +629,7 @@ nlm_shutdown_hosts_net(struct net *net) rpc_shutdown_client(host->h_rpcclnt); host->h_rpcclnt = NULL; } + nlmsvc_free_host_resources(host); } /* Then, perform a garbage collection pass */ -- cgit v1.2.3 From c88c680c6de5077292667fae4a147fd5a6523479 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Mar 2023 07:15:58 -0500 Subject: lockd: remove 2 unused helper functions Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/lockd/lockd.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 0168ac9fdda8..26c2aed31a0c 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -99,21 +99,11 @@ struct nsm_handle { /* * Rigorous type checking on sockaddr type conversions */ -static inline struct sockaddr_in *nlm_addr_in(const struct nlm_host *host) -{ - return (struct sockaddr_in *)&host->h_addr; -} - static inline struct sockaddr *nlm_addr(const struct nlm_host *host) { return (struct sockaddr *)&host->h_addr; } -static inline struct sockaddr_in *nlm_srcaddr_in(const struct nlm_host *host) -{ - return (struct sockaddr_in *)&host->h_srcaddr; -} - static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host) { return (struct sockaddr *)&host->h_srcaddr; -- cgit v1.2.3 From f0aa4852e63f9c1cfd4322c770e69d7e6817e906 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Mar 2023 07:15:59 -0500 Subject: lockd: move struct nlm_wait to lockd.h The next patch needs struct nlm_wait in fs/lockd/clntproc.c, so move the definition to a shared header file. As an added clean-up, drop the unused b_reclaim field. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/clntlock.c | 12 ------------ include/linux/lockd/lockd.h | 11 ++++++++++- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 82b19a30e0f0..464cb15c1a06 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -29,18 +29,6 @@ static int reclaimer(void *ptr); * client perspective. */ -/* - * This is the representation of a blocked client lock. - */ -struct nlm_wait { - struct list_head b_list; /* linked list */ - wait_queue_head_t b_wait; /* where to wait on */ - struct nlm_host * b_host; - struct file_lock * b_lock; /* local file lock */ - unsigned short b_reclaim; /* got to reclaim lock */ - __be32 b_status; /* grant callback status */ -}; - static LIST_HEAD(nlm_blocked); static DEFINE_SPINLOCK(nlm_blocked_lock); diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 26c2aed31a0c..de5ad2c2179a 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -121,7 +121,16 @@ struct nlm_lockowner { uint32_t pid; }; -struct nlm_wait; +/* + * This is the representation of a blocked client lock. + */ +struct nlm_wait { + struct list_head b_list; /* linked list */ + wait_queue_head_t b_wait; /* where to wait on */ + struct nlm_host *b_host; + struct file_lock *b_lock; /* local file lock */ + __be32 b_status; /* grant callback status */ +}; /* * Memory chunk for NLM client RPC request. -- cgit v1.2.3 From 2005f5b9c35bd736c81e9f24f5c5051967c022ee Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Mar 2023 07:16:00 -0500 Subject: lockd: fix races in client GRANTED_MSG wait logic After the wait for a grant is done (for whatever reason), nlmclnt_block updates the status of the nlm_rqst with the status of the block. At the point it does this, however, the block is still queued its status could change at any time. This is particularly a problem when the waiting task is signaled during the wait. We can end up giving up on the lock just before the GRANTED_MSG callback comes in, and accept it even though the lock request gets back an error, leaving a dangling lock on the server. Since the nlm_wait never lives beyond the end of nlmclnt_lock, put it on the stack and add functions to allow us to enqueue and dequeue the block. Enqueue it just before the lock/wait loop, and dequeue it just after we exit the loop instead of waiting until the end of the function. Also, scrape the status at the time that we dequeue it to ensure that it's final. Reported-by: Yongcheng Yang Link: https://bugzilla.redhat.com/show_bug.cgi?id=2063818 Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/clntlock.c | 42 +++++++++++++++++++++--------------------- fs/lockd/clntproc.c | 28 ++++++++++++++++++---------- include/linux/lockd/lockd.h | 8 +++++--- 3 files changed, 44 insertions(+), 34 deletions(-) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 464cb15c1a06..c374ee072db3 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -82,41 +82,42 @@ void nlmclnt_done(struct nlm_host *host) } EXPORT_SYMBOL_GPL(nlmclnt_done); +void nlmclnt_prepare_block(struct nlm_wait *block, struct nlm_host *host, struct file_lock *fl) +{ + block->b_host = host; + block->b_lock = fl; + init_waitqueue_head(&block->b_wait); + block->b_status = nlm_lck_blocked; +} + /* * Queue up a lock for blocking so that the GRANTED request can see it */ -struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl) +void nlmclnt_queue_block(struct nlm_wait *block) { - struct nlm_wait *block; - - block = kmalloc(sizeof(*block), GFP_KERNEL); - if (block != NULL) { - block->b_host = host; - block->b_lock = fl; - init_waitqueue_head(&block->b_wait); - block->b_status = nlm_lck_blocked; - - spin_lock(&nlm_blocked_lock); - list_add(&block->b_list, &nlm_blocked); - spin_unlock(&nlm_blocked_lock); - } - return block; + spin_lock(&nlm_blocked_lock); + list_add(&block->b_list, &nlm_blocked); + spin_unlock(&nlm_blocked_lock); } -void nlmclnt_finish_block(struct nlm_wait *block) +/* + * Dequeue the block and return its final status + */ +__be32 nlmclnt_dequeue_block(struct nlm_wait *block) { - if (block == NULL) - return; + __be32 status; + spin_lock(&nlm_blocked_lock); list_del(&block->b_list); + status = block->b_status; spin_unlock(&nlm_blocked_lock); - kfree(block); + return status; } /* * Block on a lock */ -int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) +int nlmclnt_wait(struct nlm_wait *block, struct nlm_rqst *req, long timeout) { long ret; @@ -142,7 +143,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) /* Reset the lock status after a server reboot so we resend */ if (block->b_status == nlm_lck_denied_grace_period) block->b_status = nlm_lck_blocked; - req->a_res.status = block->b_status; return 0; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 16b4de868cd2..a14c9110719c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -516,9 +516,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) const struct cred *cred = nfs_file_cred(fl->fl_file); struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; - struct nlm_wait *block = NULL; + struct nlm_wait block; unsigned char fl_flags = fl->fl_flags; unsigned char fl_type; + __be32 b_status; int status = -ENOLCK; if (nsm_monitor(host) < 0) @@ -531,31 +532,41 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) if (status < 0) goto out; - block = nlmclnt_prepare_block(host, fl); + nlmclnt_prepare_block(&block, host, fl); again: /* * Initialise resp->status to a valid non-zero value, * since 0 == nlm_lck_granted */ resp->status = nlm_lck_blocked; - for(;;) { + + /* + * A GRANTED callback can come at any time -- even before the reply + * to the LOCK request arrives, so we queue the wait before + * requesting the lock. + */ + nlmclnt_queue_block(&block); + for (;;) { /* Reboot protection */ fl->fl_u.nfs_fl.state = host->h_state; status = nlmclnt_call(cred, req, NLMPROC_LOCK); if (status < 0) break; /* Did a reclaimer thread notify us of a server reboot? */ - if (resp->status == nlm_lck_denied_grace_period) + if (resp->status == nlm_lck_denied_grace_period) continue; if (resp->status != nlm_lck_blocked) break; /* Wait on an NLM blocking lock */ - status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); + status = nlmclnt_wait(&block, req, NLMCLNT_POLL_TIMEOUT); if (status < 0) break; - if (resp->status != nlm_lck_blocked) + if (block.b_status != nlm_lck_blocked) break; } + b_status = nlmclnt_dequeue_block(&block); + if (resp->status == nlm_lck_blocked) + resp->status = b_status; /* if we were interrupted while blocking, then cancel the lock request * and exit @@ -564,7 +575,7 @@ again: if (!req->a_args.block) goto out_unlock; if (nlmclnt_cancel(host, req->a_args.block, fl) == 0) - goto out_unblock; + goto out; } if (resp->status == nlm_granted) { @@ -593,8 +604,6 @@ again: status = -ENOLCK; else status = nlm_stat_to_errno(resp->status); -out_unblock: - nlmclnt_finish_block(block); out: nlmclnt_release_call(req); return status; @@ -602,7 +611,6 @@ out_unlock: /* Fatal error: ensure that we remove the lock altogether */ dprintk("lockd: lock attempt ended in fatal error.\n" " Attempting to unlock.\n"); - nlmclnt_finish_block(block); fl_type = fl->fl_type; fl->fl_type = F_UNLCK; down_read(&host->h_rwsem); diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index de5ad2c2179a..f42594a9efe0 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -211,9 +211,11 @@ struct nlm_rqst * nlm_alloc_call(struct nlm_host *host); int nlm_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *); int nlm_async_reply(struct nlm_rqst *, u32, const struct rpc_call_ops *); void nlmclnt_release_call(struct nlm_rqst *); -struct nlm_wait * nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl); -void nlmclnt_finish_block(struct nlm_wait *block); -int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout); +void nlmclnt_prepare_block(struct nlm_wait *block, struct nlm_host *host, + struct file_lock *fl); +void nlmclnt_queue_block(struct nlm_wait *block); +__be32 nlmclnt_dequeue_block(struct nlm_wait *block); +int nlmclnt_wait(struct nlm_wait *block, struct nlm_rqst *req, long timeout); __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock); void nlmclnt_recovery(struct nlm_host *); -- cgit v1.2.3 From 244cc19196d2f6691d34e8cd9bc34a55e4f778e5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Mar 2023 07:16:01 -0500 Subject: lockd: server should unlock lock if client rejects the grant Currently lockd just dequeues the block and ignores it if the client sends a GRANT_RES with a status of nlm_lck_denied. That status is an indicator that the client has rejected the lock, so the right thing to do is to unlock the lock we were trying to grant. Reported-by: Yongcheng Yang Link: https://bugzilla.redhat.com/show_bug.cgi?id=2063818 Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svclock.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 4e30f3c50970..c43ccdf28ed9 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -954,19 +954,32 @@ void nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status) { struct nlm_block *block; + struct file_lock *fl; + int error; dprintk("grant_reply: looking for cookie %x, s=%d \n", *(unsigned int *)(cookie->data), status); if (!(block = nlmsvc_find_block(cookie))) return; - if (status == nlm_lck_denied_grace_period) { + switch (status) { + case nlm_lck_denied_grace_period: /* Try again in a couple of seconds */ nlmsvc_insert_block(block, 10 * HZ); - } else { + break; + case nlm_lck_denied: + /* Client doesn't want it, just unlock it */ + nlmsvc_unlink_block(block); + fl = &block->b_call->a_args.lock.fl; + fl->fl_type = F_UNLCK; + error = vfs_lock_file(fl->fl_file, F_SETLK, fl, NULL); + if (error) + pr_warn("lockd: unable to unlock lock rejected by client!\n"); + break; + default: /* - * Lock is now held by client, or has been rejected. - * In both cases, the block should be removed. + * Either it was accepted or the status makes no sense + * just unlink it either way. */ nlmsvc_unlink_block(block); } -- cgit v1.2.3 From e59fb6749ed833deee5b3cfd7e89925296d41f49 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Mar 2023 07:16:02 -0500 Subject: nfs: move nfs_fhandle_hash to common include file lockd needs to be able to hash filehandles for tracepoints. Move the nfs_fhandle_hash() helper to a common nfs include file. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfs/internal.h | 15 --------------- include/linux/nfs.h | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 2a65fe2a63ab..10fb5e7573eb 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -846,27 +846,12 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts) } #ifdef CONFIG_CRC32 -/** - * nfs_fhandle_hash - calculate the crc32 hash for the filehandle - * @fh - pointer to filehandle - * - * returns a crc32 hash for the filehandle that is compatible with - * the one displayed by "wireshark". - */ -static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) -{ - return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); -} static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) { return ~crc32_le(0xFFFFFFFF, &stateid->other[0], NFS4_STATEID_OTHER_SIZE); } #else -static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) -{ - return 0; -} static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) { return 0; diff --git a/include/linux/nfs.h b/include/linux/nfs.h index b06375e88e58..ceb70a926b95 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h @@ -10,6 +10,7 @@ #include #include +#include #include /* @@ -44,4 +45,23 @@ enum nfs3_stable_how { /* used by direct.c to mark verf as invalid */ NFS_INVALID_STABLE_HOW = -1 }; + +#ifdef CONFIG_CRC32 +/** + * nfs_fhandle_hash - calculate the crc32 hash for the filehandle + * @fh - pointer to filehandle + * + * returns a crc32 hash for the filehandle that is compatible with + * the one displayed by "wireshark". + */ +static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) +{ + return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); +} +#else /* CONFIG_CRC32 */ +static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) +{ + return 0; +} +#endif /* CONFIG_CRC32 */ #endif /* _LINUX_NFS_H */ -- cgit v1.2.3 From 2f90e18ffec47c265c14e313047189b79784bc0e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Mar 2023 07:16:03 -0500 Subject: lockd: add some client-side tracepoints Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/Makefile | 6 ++- fs/lockd/clntlock.c | 4 ++ fs/lockd/clntproc.c | 14 +++++++ fs/lockd/trace.c | 3 ++ fs/lockd/trace.h | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 fs/lockd/trace.c create mode 100644 fs/lockd/trace.h diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 6d5e83ed4476..ac9f9d84510e 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -3,10 +3,12 @@ # Makefile for the linux lock manager stuff # +ccflags-y += -I$(src) # needed for trace events + obj-$(CONFIG_LOCKD) += lockd.o -lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ - svcshare.o svcproc.o svcsubs.o mon.o xdr.o +lockd-objs-y += clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ + svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o lockd-objs-$(CONFIG_PROC_FS) += procfs.o lockd-objs := $(lockd-objs-y) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index c374ee072db3..e3972aa3045a 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -14,9 +14,12 @@ #include #include #include +#include #include #include +#include "trace.h" + #define NLMDBG_FACILITY NLMDBG_CLIENT /* @@ -186,6 +189,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) res = nlm_granted; } spin_unlock(&nlm_blocked_lock); + trace_nlmclnt_grant(lock, addr, svc_addr_len(addr), res); return res; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index a14c9110719c..fba6c7fa7474 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -20,6 +20,8 @@ #include #include +#include "trace.h" + #define NLMDBG_FACILITY NLMDBG_CLIENT #define NLMCLNT_GRACE_WAIT (5*HZ) #define NLMCLNT_POLL_TIMEOUT (30*HZ) @@ -451,6 +453,9 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) status = nlm_stat_to_errno(req->a_res.status); } out: + trace_nlmclnt_test(&req->a_args.lock, + (const struct sockaddr *)&req->a_host->h_addr, + req->a_host->h_addrlen, req->a_res.status); nlmclnt_release_call(req); return status; } @@ -605,10 +610,16 @@ again: else status = nlm_stat_to_errno(resp->status); out: + trace_nlmclnt_lock(&req->a_args.lock, + (const struct sockaddr *)&req->a_host->h_addr, + req->a_host->h_addrlen, req->a_res.status); nlmclnt_release_call(req); return status; out_unlock: /* Fatal error: ensure that we remove the lock altogether */ + trace_nlmclnt_lock(&req->a_args.lock, + (const struct sockaddr *)&req->a_host->h_addr, + req->a_host->h_addrlen, req->a_res.status); dprintk("lockd: lock attempt ended in fatal error.\n" " Attempting to unlock.\n"); fl_type = fl->fl_type; @@ -704,6 +715,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) /* What to do now? I'm out of my depth... */ status = -ENOLCK; out: + trace_nlmclnt_unlock(&req->a_args.lock, + (const struct sockaddr *)&req->a_host->h_addr, + req->a_host->h_addrlen, req->a_res.status); nlmclnt_release_call(req); return status; } diff --git a/fs/lockd/trace.c b/fs/lockd/trace.c new file mode 100644 index 000000000000..d9a6ff6e673c --- /dev/null +++ b/fs/lockd/trace.c @@ -0,0 +1,3 @@ +// SPDX-License-Identifier: GPL-2.0 +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/lockd/trace.h b/fs/lockd/trace.h new file mode 100644 index 000000000000..7461b13b6e74 --- /dev/null +++ b/fs/lockd/trace.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lockd + +#if !defined(_TRACE_LOCKD_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_LOCKD_H + +#include +#include +#include +#include + +#ifdef CONFIG_LOCKD_V4 +#define NLM_STATUS_LIST \ + nlm_status_code(LCK_GRANTED) \ + nlm_status_code(LCK_DENIED) \ + nlm_status_code(LCK_DENIED_NOLOCKS) \ + nlm_status_code(LCK_BLOCKED) \ + nlm_status_code(LCK_DENIED_GRACE_PERIOD) \ + nlm_status_code(DEADLCK) \ + nlm_status_code(ROFS) \ + nlm_status_code(STALE_FH) \ + nlm_status_code(FBIG) \ + nlm_status_code_end(FAILED) +#else +#define NLM_STATUS_LIST \ + nlm_status_code(LCK_GRANTED) \ + nlm_status_code(LCK_DENIED) \ + nlm_status_code(LCK_DENIED_NOLOCKS) \ + nlm_status_code(LCK_BLOCKED) \ + nlm_status_code_end(LCK_DENIED_GRACE_PERIOD) +#endif + +#undef nlm_status_code +#undef nlm_status_code_end +#define nlm_status_code(x) TRACE_DEFINE_ENUM(NLM_##x); +#define nlm_status_code_end(x) TRACE_DEFINE_ENUM(NLM_##x); + +NLM_STATUS_LIST + +#undef nlm_status_code +#undef nlm_status_code_end +#define nlm_status_code(x) { NLM_##x, #x }, +#define nlm_status_code_end(x) { NLM_##x, #x } + +#define show_nlm_status(x) __print_symbolic(x, NLM_STATUS_LIST) + +DECLARE_EVENT_CLASS(nlmclnt_lock_event, + TP_PROTO( + const struct nlm_lock *lock, + const struct sockaddr *addr, + unsigned int addrlen, + __be32 status + ), + + TP_ARGS(lock, addr, addrlen, status), + + TP_STRUCT__entry( + __field(u32, oh) + __field(u32, svid) + __field(u32, fh) + __field(unsigned long, status) + __field(u64, start) + __field(u64, len) + __sockaddr(addr, addrlen) + ), + + TP_fast_assign( + __entry->oh = ~crc32_le(0xffffffff, lock->oh.data, lock->oh.len); + __entry->svid = lock->svid; + __entry->fh = nfs_fhandle_hash(&lock->fh); + __entry->start = lock->lock_start; + __entry->len = lock->lock_len; + __entry->status = be32_to_cpu(status); + __assign_sockaddr(addr, addr, addrlen); + ), + + TP_printk( + "addr=%pISpc oh=0x%08x svid=0x%08x fh=0x%08x start=%llu len=%llu status=%s", + __get_sockaddr(addr), __entry->oh, __entry->svid, + __entry->fh, __entry->start, __entry->len, + show_nlm_status(__entry->status) + ) +); + +#define DEFINE_NLMCLNT_EVENT(name) \ + DEFINE_EVENT(nlmclnt_lock_event, name, \ + TP_PROTO( \ + const struct nlm_lock *lock, \ + const struct sockaddr *addr, \ + unsigned int addrlen, \ + __be32 status \ + ), \ + TP_ARGS(lock, addr, addrlen, status)) + +DEFINE_NLMCLNT_EVENT(nlmclnt_test); +DEFINE_NLMCLNT_EVENT(nlmclnt_lock); +DEFINE_NLMCLNT_EVENT(nlmclnt_unlock); +DEFINE_NLMCLNT_EVENT(nlmclnt_grant); + +#endif /* _TRACE_LOCKD_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace +#include -- cgit v1.2.3 From cf64b9bce95095b80f4589e4f54572cc5d8c1538 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 8 Mar 2023 17:51:00 +1100 Subject: SUNRPC: return proper error from get_expiry() The get_expiry() function currently returns a timestamp, and uses the special return value of 0 to indicate an error. Unfortunately this causes a problem when 0 is the correct return value. On a system with no RTC it is possible that the boot time will be seen to be "3". When exportfs probes to see if a particular filesystem supports NFS export it tries to cache information with an expiry time of "3". The intention is for this to be "long in the past". Even with no RTC it will not be far in the future (at most a second or two) so this is harmless. But if the boot time happens to have been calculated to be "3", then get_expiry will fail incorrectly as it converts the number to "seconds since bootime" - 0. To avoid this problem we change get_expiry() to report the error quite separately from the expiry time. The error is now the return value. The expiry time is reported through a by-reference parameter. Reported-by: Jerry Zhang Tested-by: Jerry Zhang Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 13 ++++++------- fs/nfsd/nfs4idmap.c | 8 ++++---- include/linux/sunrpc/cache.h | 15 ++++++++------- net/sunrpc/auth_gss/svcauth_gss.c | 12 ++++++------ net/sunrpc/svcauth_unix.c | 12 ++++++------ 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 668c7527b17e..6da74aebe1fb 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -123,11 +123,11 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) /* OK, we seem to have a valid key */ key.h.flags = 0; - key.h.expiry_time = get_expiry(&mesg); - if (key.h.expiry_time == 0) + err = get_expiry(&mesg, &key.h.expiry_time); + if (err) goto out; - key.ek_client = dom; + key.ek_client = dom; key.ek_fsidtype = fsidtype; memcpy(key.ek_fsid, buf, len); @@ -610,9 +610,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_devid_map = NULL; /* expiry */ - err = -EINVAL; - exp.h.expiry_time = get_expiry(&mesg); - if (exp.h.expiry_time == 0) + err = get_expiry(&mesg, &exp.h.expiry_time); + if (err) goto out3; /* flags */ @@ -624,7 +623,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (err || an_int < 0) goto out3; exp.ex_flags= an_int; - + /* anon uid */ err = get_int(&mesg, &an_int); if (err) diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 5e9809aff37e..7a806ac13e31 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -240,8 +240,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) goto out; /* expiry */ - ent.h.expiry_time = get_expiry(&buf); - if (ent.h.expiry_time == 0) + error = get_expiry(&buf, &ent.h.expiry_time); + if (error) goto out; error = -ENOMEM; @@ -408,8 +408,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) memcpy(ent.name, buf1, sizeof(ent.name)); /* expiry */ - ent.h.expiry_time = get_expiry(&buf); - if (ent.h.expiry_time == 0) + error = get_expiry(&buf, &ent.h.expiry_time); + if (error) goto out; /* ID */ diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index ec5a555df96f..518bd28f5ab8 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -300,17 +300,18 @@ static inline int get_time(char **bpp, time64_t *time) return 0; } -static inline time64_t get_expiry(char **bpp) +static inline int get_expiry(char **bpp, time64_t *rvp) { - time64_t rv; + int error; struct timespec64 boot; - if (get_time(bpp, &rv)) - return 0; - if (rv < 0) - return 0; + error = get_time(bpp, rvp); + if (error) + return error; + getboottime64(&boot); - return rv - boot.tv_sec; + (*rvp) -= boot.tv_sec; + return 0; } #endif /* _LINUX_SUNRPC_CACHE_H_ */ diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 9c843974bb48..c4a566737085 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -257,11 +257,11 @@ static int rsi_parse(struct cache_detail *cd, rsii.h.flags = 0; /* expiry */ - expiry = get_expiry(&mesg); - status = -EINVAL; - if (expiry == 0) + status = get_expiry(&mesg, &expiry); + if (status) goto out; + status = -EINVAL; /* major/minor */ len = qword_get(&mesg, buf, mlen); if (len <= 0) @@ -483,11 +483,11 @@ static int rsc_parse(struct cache_detail *cd, rsci.h.flags = 0; /* expiry */ - expiry = get_expiry(&mesg); - status = -EINVAL; - if (expiry == 0) + status = get_expiry(&mesg, &expiry); + if (status) goto out; + status = -EINVAL; rscp = rsc_lookup(cd, &rsci); if (!rscp) goto out; diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 4246363cb095..4485088ce27b 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -225,9 +225,9 @@ static int ip_map_parse(struct cache_detail *cd, return -EINVAL; } - expiry = get_expiry(&mesg); - if (expiry ==0) - return -EINVAL; + err = get_expiry(&mesg, &expiry); + if (err) + return err; /* domainname, or empty for NEGATIVE */ len = qword_get(&mesg, buf, mlen); @@ -506,9 +506,9 @@ static int unix_gid_parse(struct cache_detail *cd, uid = make_kuid(current_user_ns(), id); ug.uid = uid; - expiry = get_expiry(&mesg); - if (expiry == 0) - return -EINVAL; + err = get_expiry(&mesg, &expiry); + if (err) + return err; rv = get_int(&mesg, &gids); if (rv || gids < 0 || gids > 8192) -- cgit v1.2.3 From 376bcd9b37632cf191711a68aa25ab42f0048c2e Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Sat, 11 Mar 2023 15:39:40 -0800 Subject: sunrpc: simplify two-level sysctl registration for svcrdma_parm_table There is no need to declare two tables to just create directories, this can be easily be done with a prefix path with register_sysctl(). Simplify this registration. Signed-off-by: Luis Chamberlain Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 5bc20e9d09cd..f0d5eeed4c88 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -212,24 +212,6 @@ static struct ctl_table svcrdma_parm_table[] = { { }, }; -static struct ctl_table svcrdma_table[] = { - { - .procname = "svc_rdma", - .mode = 0555, - .child = svcrdma_parm_table - }, - { }, -}; - -static struct ctl_table svcrdma_root_table[] = { - { - .procname = "sunrpc", - .mode = 0555, - .child = svcrdma_table - }, - { }, -}; - static void svc_rdma_proc_cleanup(void) { if (!svcrdma_table_header) @@ -263,7 +245,8 @@ static int svc_rdma_proc_init(void) if (rc) goto out_err; - svcrdma_table_header = register_sysctl_table(svcrdma_root_table); + svcrdma_table_header = register_sysctl("sunrpc/svc_rdma", + svcrdma_parm_table); return 0; out_err: -- cgit v1.2.3 From 0f5162480bd25bd97b91c9153db7afbd89698804 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Mar 2023 17:09:20 -0400 Subject: NFSD: Watch for rq_pages bounds checking errors in nfsd_splice_actor() There have been several bugs over the years where the NFSD splice actor has attempted to write outside the rq_pages array. This is a "should never happen" condition, but if for some reason the pipe splice actor should attempt to walk past the end of rq_pages, it needs to terminate the READ operation to prevent corruption of the pointer addresses in the fields just beyond the array. A server crash is thus prevented. Since the code is not behaving, the READ operation returns -EIO to the client. None of the READ payload data can be trusted if the splice actor isn't operating as expected. Suggested-by: Jeff Layton Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- fs/nfsd/vfs.c | 6 +++++- include/linux/sunrpc/svc.h | 2 +- include/trace/events/sunrpc.h | 25 +++++++++++++++++++++++++ net/sunrpc/svc.c | 15 ++++++++++++++- 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 5783209f17fc..10aa68ca82ef 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -930,6 +930,9 @@ nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags, * Grab and keep cached pages associated with a file in the svc_rqst * so that they can be passed to the network sendmsg/sendpage routines * directly. They will be released after the sending has completed. + * + * Return values: Number of bytes consumed, or -EIO if there are no + * remaining pages in rqstp->rq_pages. */ static int nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, @@ -948,7 +951,8 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, */ if (page == *(rqstp->rq_next_page - 1)) continue; - svc_rqst_replace_page(rqstp, page); + if (unlikely(!svc_rqst_replace_page(rqstp, page))) + return -EIO; } if (rqstp->rq_res.page_len == 0) // first call rqstp->rq_res.page_base = offset % PAGE_SIZE; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 877891536c2f..f5af055280ff 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -422,7 +422,7 @@ struct svc_serv *svc_create(struct svc_program *, unsigned int, int (*threadfn)(void *data)); struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node); -void svc_rqst_replace_page(struct svc_rqst *rqstp, +bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page); void svc_rqst_free(struct svc_rqst *); void svc_exit_thread(struct svc_rqst *); diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 3ca54536f8f7..5a3bb42e1f50 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1790,6 +1790,31 @@ DEFINE_EVENT(svc_rqst_status, svc_send, TP_PROTO(const struct svc_rqst *rqst, int status), TP_ARGS(rqst, status)); +TRACE_EVENT(svc_replace_page_err, + TP_PROTO(const struct svc_rqst *rqst), + + TP_ARGS(rqst), + TP_STRUCT__entry( + SVC_RQST_ENDPOINT_FIELDS(rqst) + + __field(const void *, begin) + __field(const void *, respages) + __field(const void *, nextpage) + ), + + TP_fast_assign( + SVC_RQST_ENDPOINT_ASSIGNMENTS(rqst); + + __entry->begin = rqst->rq_pages; + __entry->respages = rqst->rq_respages; + __entry->nextpage = rqst->rq_next_page; + ), + + TP_printk(SVC_RQST_ENDPOINT_FORMAT " begin=%p respages=%p nextpage=%p", + SVC_RQST_ENDPOINT_VARARGS, + __entry->begin, __entry->respages, __entry->nextpage) +); + TRACE_EVENT(svc_stats_latency, TP_PROTO( const struct svc_rqst *rqst diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index fea7ce8fba14..633aa1eb476b 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -842,9 +842,21 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads); * * When replacing a page in rq_pages, batch the release of the * replaced pages to avoid hammering the page allocator. + * + * Return values: + * %true: page replaced + * %false: array bounds checking failed */ -void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) +bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) { + struct page **begin = rqstp->rq_pages; + struct page **end = &rqstp->rq_pages[RPCSVC_MAXPAGES]; + + if (unlikely(rqstp->rq_next_page < begin || rqstp->rq_next_page > end)) { + trace_svc_replace_page_err(rqstp); + return false; + } + if (*rqstp->rq_next_page) { if (!pagevec_space(&rqstp->rq_pvec)) __pagevec_release(&rqstp->rq_pvec); @@ -853,6 +865,7 @@ void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) get_page(page); *(rqstp->rq_next_page++) = page; + return true; } EXPORT_SYMBOL_GPL(svc_rqst_replace_page); -- cgit v1.2.3 From ae0d77708aae219a9264a74188d5c1b1a5754da6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 7 Mar 2023 09:14:46 -0500 Subject: SUNRPC: Ensure server-side sockets have a sock->file The TLS handshake upcall mechanism requires a non-NULL sock->file on the socket it hands to user space. svc_sock_free() already releases sock->file properly if one exists. Signed-off-by: Chuck Lever --- net/sunrpc/svcsock.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 03a4f5615086..302a14dd7882 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1293,26 +1293,37 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct socket *sock, int flags) { + struct file *filp = NULL; struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); - int err = 0; svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); if (!svsk) return ERR_PTR(-ENOMEM); + if (!sock->file) { + filp = sock_alloc_file(sock, O_NONBLOCK, NULL); + if (IS_ERR(filp)) { + kfree(svsk); + return ERR_CAST(filp); + } + } + inet = sock->sk; - /* Register socket with portmapper */ - if (pmap_register) + if (pmap_register) { + int err; + err = svc_register(serv, sock_net(sock->sk), inet->sk_family, inet->sk_protocol, ntohs(inet_sk(inet)->inet_sport)); - - if (err < 0) { - kfree(svsk); - return ERR_PTR(err); + if (err < 0) { + if (filp) + fput(filp); + kfree(svsk); + return ERR_PTR(err); + } } svsk->sk_sock = sock; -- cgit v1.2.3 From 55fcc7d9159de886296626e47db2c81f8578c7e1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 3 Apr 2023 13:53:07 -0400 Subject: SUNRPC: Ignore return value of ->xpo_sendto Clean up: All callers of svc_process() ignore its return value, so svc_process() can safely be converted to return void. Ditto for svc_send(). The return value of ->xpo_sendto() is now used only as part of a trace event. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 +- include/linux/sunrpc/svcsock.h | 2 +- net/sunrpc/svc.c | 13 +++++++------ net/sunrpc/svc_xprt.c | 21 +++++++++------------ 4 files changed, 18 insertions(+), 20 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index f5af055280ff..2d31121fc2e6 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -430,7 +430,7 @@ struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, int (*threadfn)(void *data)); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_serv *serv, struct file *file); -int svc_process(struct svc_rqst *); +void svc_process(struct svc_rqst *rqstp); int bc_svc_process(struct svc_serv *, struct rpc_rqst *, struct svc_rqst *); int svc_register(const struct svc_serv *, struct net *, const int, diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index bcc555c7ae9c..dd73fa174af5 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -56,7 +56,7 @@ static inline u32 svc_sock_final_rec(struct svc_sock *svsk) */ void svc_close_net(struct svc_serv *, struct net *); int svc_recv(struct svc_rqst *, long); -int svc_send(struct svc_rqst *); +void svc_send(struct svc_rqst *rqstp); void svc_drop(struct svc_rqst *); void svc_sock_update_bufs(struct svc_serv *serv); bool svc_alien_sock(struct net *net, int fd); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 633aa1eb476b..0aa8892fad63 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1444,11 +1444,12 @@ err_system_err: goto sendit; } -/* - * Process the RPC request. +/** + * svc_process - Execute one RPC transaction + * @rqstp: RPC transaction context + * */ -int -svc_process(struct svc_rqst *rqstp) +void svc_process(struct svc_rqst *rqstp) { struct kvec *resv = &rqstp->rq_res.head[0]; __be32 *p; @@ -1484,7 +1485,8 @@ svc_process(struct svc_rqst *rqstp) if (!svc_process_common(rqstp)) goto out_drop; - return svc_send(rqstp); + svc_send(rqstp); + return; out_baddir: svc_printk(rqstp, "bad direction 0x%08x, dropping request\n", @@ -1492,7 +1494,6 @@ out_baddir: rqstp->rq_server->sv_stats->rpcbadfmt++; out_drop: svc_drop(rqstp); - return 0; } EXPORT_SYMBOL_GPL(svc_process); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index ba629297da4e..36c79b718323 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -909,18 +909,20 @@ void svc_drop(struct svc_rqst *rqstp) } EXPORT_SYMBOL_GPL(svc_drop); -/* - * Return reply to client. +/** + * svc_send - Return reply to client + * @rqstp: RPC transaction context + * */ -int svc_send(struct svc_rqst *rqstp) +void svc_send(struct svc_rqst *rqstp) { struct svc_xprt *xprt; - int len = -EFAULT; struct xdr_buf *xb; + int status; xprt = rqstp->rq_xprt; if (!xprt) - goto out; + return; /* calculate over-all length */ xb = &rqstp->rq_res; @@ -930,15 +932,10 @@ int svc_send(struct svc_rqst *rqstp) trace_svc_xdr_sendto(rqstp->rq_xid, xb); trace_svc_stats_latency(rqstp); - len = xprt->xpt_ops->xpo_sendto(rqstp); + status = xprt->xpt_ops->xpo_sendto(rqstp); - trace_svc_send(rqstp, len); + trace_svc_send(rqstp, status); svc_xprt_release(rqstp); - - if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) - len = 0; -out: - return len; } /* -- cgit v1.2.3 From 92e4a6733f922f0fef1d0995f7b2d0eaff86c7ea Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 14 Apr 2023 17:31:44 -0400 Subject: nfsd: simplify the delayed disposal list code When queueing a dispose list to the appropriate "freeme" lists, it pointlessly queues the objects one at a time to an intermediate list. Remove a few helpers and just open code a list_move to make it more clear and efficient. Better document the resulting functions with kerneldoc comments. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 64 ++++++++++++++++++----------------------------------- 1 file changed, 22 insertions(+), 42 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 2f0b2d964cbb..f40d8f3b35a4 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -402,49 +402,26 @@ nfsd_file_dispose_list(struct list_head *dispose) } } -static void -nfsd_file_list_remove_disposal(struct list_head *dst, - struct nfsd_fcache_disposal *l) -{ - spin_lock(&l->lock); - list_splice_init(&l->freeme, dst); - spin_unlock(&l->lock); -} - -static void -nfsd_file_list_add_disposal(struct list_head *files, struct net *net) -{ - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - struct nfsd_fcache_disposal *l = nn->fcache_disposal; - - spin_lock(&l->lock); - list_splice_tail_init(files, &l->freeme); - spin_unlock(&l->lock); - queue_work(nfsd_filecache_wq, &l->work); -} - -static void -nfsd_file_list_add_pernet(struct list_head *dst, struct list_head *src, - struct net *net) -{ - struct nfsd_file *nf, *tmp; - - list_for_each_entry_safe(nf, tmp, src, nf_lru) { - if (nf->nf_net == net) - list_move_tail(&nf->nf_lru, dst); - } -} - +/** + * nfsd_file_dispose_list_delayed - move list of dead files to net's freeme list + * @dispose: list of nfsd_files to be disposed + * + * Transfers each file to the "freeme" list for its nfsd_net, to eventually + * be disposed of by the per-net garbage collector. + */ static void nfsd_file_dispose_list_delayed(struct list_head *dispose) { - LIST_HEAD(list); - struct nfsd_file *nf; - while(!list_empty(dispose)) { - nf = list_first_entry(dispose, struct nfsd_file, nf_lru); - nfsd_file_list_add_pernet(&list, dispose, nf->nf_net); - nfsd_file_list_add_disposal(&list, nf->nf_net); + struct nfsd_file *nf = list_first_entry(dispose, + struct nfsd_file, nf_lru); + struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id); + struct nfsd_fcache_disposal *l = nn->fcache_disposal; + + spin_lock(&l->lock); + list_move_tail(&nf->nf_lru, &l->freeme); + spin_unlock(&l->lock); + queue_work(nfsd_filecache_wq, &l->work); } } @@ -665,8 +642,8 @@ nfsd_file_close_inode_sync(struct inode *inode) * nfsd_file_delayed_close - close unused nfsd_files * @work: dummy * - * Walk the LRU list and destroy any entries that have not been used since - * the last scan. + * Scrape the freeme list for this nfsd_net, and then dispose of them + * all. */ static void nfsd_file_delayed_close(struct work_struct *work) @@ -675,7 +652,10 @@ nfsd_file_delayed_close(struct work_struct *work) struct nfsd_fcache_disposal *l = container_of(work, struct nfsd_fcache_disposal, work); - nfsd_file_list_remove_disposal(&head, l); + spin_lock(&l->lock); + list_splice_init(&l->freeme, &head); + spin_unlock(&l->lock); + nfsd_file_dispose_list(&head); } -- cgit v1.2.3 From b20cb39def085723868972182fb58fa906839a4f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Apr 2023 20:17:56 -0400 Subject: SUNRPC: Relocate svc_free_res_pages() Clean-up: There doesn't seem to be a reason why this function is stuck in a header. One thing it prevents is the convenient addition of tracing. Moving it to a source file also makes the rq_respages clean-up logic easier to find. Reviewed-by: Calum Mackay Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 12 +----------- net/sunrpc/svc.c | 19 +++++++++++++++++++ net/sunrpc/svc_xprt.c | 2 +- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 2d31121fc2e6..762d7231e574 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -309,17 +309,6 @@ static inline struct sockaddr *svc_daddr(const struct svc_rqst *rqst) return (struct sockaddr *) &rqst->rq_daddr; } -static inline void svc_free_res_pages(struct svc_rqst *rqstp) -{ - while (rqstp->rq_next_page != rqstp->rq_respages) { - struct page **pp = --rqstp->rq_next_page; - if (*pp) { - put_page(*pp); - *pp = NULL; - } - } -} - struct svc_deferred_req { u32 prot; /* protocol (UDP or TCP) */ struct svc_xprt *xprt; @@ -424,6 +413,7 @@ struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node); bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page); +void svc_rqst_release_pages(struct svc_rqst *rqstp); void svc_rqst_free(struct svc_rqst *); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 0aa8892fad63..0fc70cc405b2 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -869,6 +869,25 @@ bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) } EXPORT_SYMBOL_GPL(svc_rqst_replace_page); +/** + * svc_rqst_release_pages - Release Reply buffer pages + * @rqstp: RPC transaction context + * + * Release response pages that might still be in flight after + * svc_send, and any spliced filesystem-owned pages. + */ +void svc_rqst_release_pages(struct svc_rqst *rqstp) +{ + while (rqstp->rq_next_page != rqstp->rq_respages) { + struct page **pp = --rqstp->rq_next_page; + + if (*pp) { + put_page(*pp); + *pp = NULL; + } + } +} + /* * Called from a server thread as it's exiting. Caller must hold the "service * mutex" for the service. diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 36c79b718323..533e08c4f319 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -542,7 +542,7 @@ static void svc_xprt_release(struct svc_rqst *rqstp) rqstp->rq_deferred = NULL; pagevec_release(&rqstp->rq_pvec); - svc_free_res_pages(rqstp); + svc_rqst_release_pages(rqstp); rqstp->rq_res.page_len = 0; rqstp->rq_res.page_base = 0; -- cgit v1.2.3 From 647a2a6428f2cd01e53079ac16e17fdeff229e68 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Apr 2023 20:18:02 -0400 Subject: SUNRPC: Convert svc_xprt_release() to the release_pages() API Instead of invoking put_page() one-at-a-time, pass the "response" portion of rq_pages directly to release_pages() to reduce the number of times each nfsd thread invokes a page allocator API. Since svc_xprt_release() is not invoked while a client is waiting for an RPC Reply, this is not expected to directly impact mean request latencies on a lightly or moderately loaded server. However as workload intensity increases, I expect somewhat better scalability: the same number of server threads should be able to handle more work. Reviewed-by: Calum Mackay Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 0fc70cc405b2..b982f802f2a0 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -878,13 +878,12 @@ EXPORT_SYMBOL_GPL(svc_rqst_replace_page); */ void svc_rqst_release_pages(struct svc_rqst *rqstp) { - while (rqstp->rq_next_page != rqstp->rq_respages) { - struct page **pp = --rqstp->rq_next_page; + int i, count = rqstp->rq_next_page - rqstp->rq_respages; - if (*pp) { - put_page(*pp); - *pp = NULL; - } + if (count) { + release_pages(rqstp->rq_respages, count); + for (i = 0; i < count; i++) + rqstp->rq_respages[i] = NULL; } } -- cgit v1.2.3 From 6a0cdf56bfc9a1791ddd6955d60f3678523e599c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Apr 2023 20:18:15 -0400 Subject: SUNRPC: Be even lazier about releasing pages A single RPC transaction that touches only a couple of pages means rq_pvec will not be even close to full in svc_xpt_release(). This is a common case. Instead, just leave the pages in rq_pvec until it is completely full. This improves the efficiency of the batch release mechanism on workloads that involve small RPC messages. The rq_pvec is also fully emptied just before thread exit. Reviewed-by: Calum Mackay Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 3 +++ net/sunrpc/svc_xprt.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b982f802f2a0..26367cf4c17a 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -649,6 +649,8 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) if (!rqstp) return rqstp; + pagevec_init(&rqstp->rq_pvec); + __set_bit(RQ_BUSY, &rqstp->rq_flags); rqstp->rq_server = serv; rqstp->rq_pool = pool; @@ -894,6 +896,7 @@ void svc_rqst_release_pages(struct svc_rqst *rqstp) void svc_rqst_free(struct svc_rqst *rqstp) { + pagevec_release(&rqstp->rq_pvec); svc_release_buffer(rqstp); if (rqstp->rq_scratch_page) put_page(rqstp->rq_scratch_page); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 533e08c4f319..e3952b690f54 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -541,7 +541,6 @@ static void svc_xprt_release(struct svc_rqst *rqstp) kfree(rqstp->rq_deferred); rqstp->rq_deferred = NULL; - pagevec_release(&rqstp->rq_pvec); svc_rqst_release_pages(rqstp); rqstp->rq_res.page_len = 0; rqstp->rq_res.page_base = 0; @@ -667,8 +666,6 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) struct xdr_buf *arg = &rqstp->rq_arg; unsigned long pages, filled, ret; - pagevec_init(&rqstp->rq_pvec); - pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT; if (pages > RPCSVC_MAXPAGES) { pr_warn_once("svc: warning: pages=%lu > RPCSVC_MAXPAGES=%lu\n", -- cgit v1.2.3 From 5e052dda121e2870dd87181783da4a95d7d2927b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 17 Apr 2023 09:42:14 -0400 Subject: SUNRPC: Recognize control messages in server-side TCP socket code To support kTLS, the server-side TCP socket receive path needs to watch for CMSGs. Acked-by: Jakub Kicinski Signed-off-by: Chuck Lever --- include/net/tls.h | 2 ++ net/sunrpc/svcsock.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 154949c7b0c8..6056ce5a2aa5 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -69,6 +69,8 @@ extern const struct tls_cipher_size_desc tls_cipher_size_desc[]; #define TLS_CRYPTO_INFO_READY(info) ((info)->cipher_type) +#define TLS_RECORD_TYPE_ALERT 0x15 +#define TLS_RECORD_TYPE_HANDSHAKE 0x16 #define TLS_RECORD_TYPE_DATA 0x17 #define TLS_AAD_SPACE_SIZE 13 diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 302a14dd7882..c5b74f523fc4 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -216,6 +217,49 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) return len; } +static int +svc_tcp_sock_process_cmsg(struct svc_sock *svsk, struct msghdr *msg, + struct cmsghdr *cmsg, int ret) +{ + if (cmsg->cmsg_level == SOL_TLS && + cmsg->cmsg_type == TLS_GET_RECORD_TYPE) { + u8 content_type = *((u8 *)CMSG_DATA(cmsg)); + + switch (content_type) { + case TLS_RECORD_TYPE_DATA: + /* TLS sets EOR at the end of each application data + * record, even though there might be more frames + * waiting to be decrypted. + */ + msg->msg_flags &= ~MSG_EOR; + break; + case TLS_RECORD_TYPE_ALERT: + ret = -ENOTCONN; + break; + default: + ret = -EAGAIN; + } + } + return ret; +} + +static int +svc_tcp_sock_recv_cmsg(struct svc_sock *svsk, struct msghdr *msg) +{ + union { + struct cmsghdr cmsg; + u8 buf[CMSG_SPACE(sizeof(u8))]; + } u; + int ret; + + msg->msg_control = &u; + msg->msg_controllen = sizeof(u); + ret = sock_recvmsg(svsk->sk_sock, msg, MSG_DONTWAIT); + if (unlikely(msg->msg_controllen != sizeof(u))) + ret = svc_tcp_sock_process_cmsg(svsk, msg, &u.cmsg, ret); + return ret; +} + #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE static void svc_flush_bvec(const struct bio_vec *bvec, size_t size, size_t seek) { @@ -263,7 +307,7 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, iov_iter_advance(&msg.msg_iter, seek); buflen -= seek; } - len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); + len = svc_tcp_sock_recv_cmsg(svsk, &msg); if (len > 0) svc_flush_bvec(bvec, len, seek); @@ -877,7 +921,7 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen; iov.iov_len = want; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, want); - len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); + len = svc_tcp_sock_recv_cmsg(svsk, &msg); if (len < 0) return len; svsk->sk_tcplen += len; -- cgit v1.2.3 From 695bc1f32c6bc806218b322096b23dfd601e59ca Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 17 Apr 2023 22:15:28 -0400 Subject: SUNRPC: Clear rq_xid when receiving a new RPC Call This is an eye-catcher for tracepoints that record the XID: it means svc_rqst() has not received a full RPC Call with an XID yet. Signed-off-by: Chuck Lever --- net/sunrpc/svc_xprt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e3952b690f54..3b9708b39e35 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -701,6 +701,8 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) arg->page_len = (pages-2)*PAGE_SIZE; arg->len = (pages-1)*PAGE_SIZE; arg->tail[0].iov_len = 0; + + rqstp->rq_xid = xdr_zero; return 0; } -- cgit v1.2.3 From 147abcacee33781e75588869e944ddb07528a897 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 19 Apr 2023 10:53:18 -0700 Subject: NFSD: Fix problem of COMMIT and NFS4ERR_DELAY in infinite loop The following request sequence to the same file causes the NFS client and server getting into an infinite loop with COMMIT and NFS4ERR_DELAY: OPEN REMOVE WRITE COMMIT Problem reported by recall11, recall12, recall14, recall20, recall22, recall40, recall42, recall48, recall50 of nfstest suite. This patch restores the handling of race condition in nfsd_file_do_acquire with unlink to that prior of the regression. Fixes: ac3a2585f018 ("nfsd: rework refcounting in filecache") Signed-off-by: Dai Ngo Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index f40d8f3b35a4..ee9c923192e0 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1099,8 +1099,6 @@ open_file: * then unhash. */ if (status != nfs_ok || inode->i_nlink == 0) - status = nfserr_jukebox; - if (status != nfs_ok) nfsd_file_unhash(nf); clear_and_wake_up_bit(NFSD_FILE_PENDING, &nf->nf_flags); if (status == nfs_ok) -- cgit v1.2.3 From 22b620ec0bf454cfd1c464f57cfce9afb3fb1e70 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 20 Apr 2023 11:02:33 -0400 Subject: NFSD: Clean up xattr memory allocation flags Tetsuo Handa points out: > Since GFP_KERNEL is "GFP_NOFS | __GFP_FS", usage like > "GFP_KERNEL | GFP_NOFS" does not make sense. The original intent was to hold the inode lock while estimating the buffer requirements for the requested information. Frank van der Linden, the author of NFSD's xattr code, says: > ... you need inode_lock to get an atomic view of an xattr. Since > both nfsd_getxattr and nfsd_listxattr to the standard trick of > querying the xattr length with a NULL buf argument (just getting > the length back), allocating the right buffer size, and then > querying again, they need to hold the inode lock to avoid having > the xattr changed from under them while doing that. > > From that then flows the requirement that GFP_FS could cause > problems while holding i_rwsem, so I added GFP_NOFS. However, Dave Chinner states: > You can do GFP_KERNEL allocations holding the i_rwsem just fine. > All that it requires is the caller holds a reference to the > inode ... Since these code paths acquire a dentry, they do indeed hold a reference. It is therefore safe to use GFP_KERNEL for these memory allocations. In particular, that's what this code is already doing; but now the C source code looks sane too. At a later time we can revisit in order to remove the inode lock in favor of simply retrying if the estimated buffer size is too small. Reported-by: Tetsuo Handa Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 10aa68ca82ef..bb9d47172162 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -2168,7 +2168,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, goto out; } - buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS); + buf = kvmalloc(len, GFP_KERNEL); if (buf == NULL) { err = nfserr_jukebox; goto out; @@ -2231,10 +2231,7 @@ nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char **bufp, goto out; } - /* - * We're holding i_rwsem - use GFP_NOFS. - */ - buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS); + buf = kvmalloc(len, GFP_KERNEL); if (buf == NULL) { err = nfserr_jukebox; goto out; -- cgit v1.2.3 From b3cbf98e2fdf3cb147a95161560cd25987284330 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 20 Apr 2023 13:56:24 -0400 Subject: SUNRPC: Support TLS handshake in the server-side TCP socket code This patch adds opportunitistic RPC-with-TLS to the Linux in-kernel NFS server. If the client requests RPC-with-TLS and the user space handshake agent is running, the server will set up a TLS session. There are no policy settings yet. For example, the server cannot yet require the use of RPC-with-TLS to access its data. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_xprt.h | 5 +- include/linux/sunrpc/svcsock.h | 2 + include/trace/events/sunrpc.h | 16 ++++++- net/sunrpc/svc_xprt.c | 5 +- net/sunrpc/svcauth_unix.c | 11 ++++- net/sunrpc/svcsock.c | 101 ++++++++++++++++++++++++++++++++++++++-- 6 files changed, 132 insertions(+), 8 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 775368802762..867479204840 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -27,7 +27,7 @@ struct svc_xprt_ops { void (*xpo_detach)(struct svc_xprt *); void (*xpo_free)(struct svc_xprt *); void (*xpo_kill_temp_xprt)(struct svc_xprt *); - void (*xpo_start_tls)(struct svc_xprt *); + void (*xpo_handshake)(struct svc_xprt *xprt); }; struct svc_xprt_class { @@ -70,6 +70,9 @@ struct svc_xprt { #define XPT_LOCAL 12 /* connection from loopback interface */ #define XPT_KILL_TEMP 13 /* call xpo_kill_temp_xprt before closing */ #define XPT_CONG_CTRL 14 /* has congestion control */ +#define XPT_HANDSHAKE 15 /* xprt requests a handshake */ +#define XPT_TLS_SESSION 16 /* transport-layer security established */ +#define XPT_PEER_AUTH 17 /* peer has been authenticated */ struct svc_serv *xpt_server; /* service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index dd73fa174af5..d16ae621782c 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -38,6 +38,8 @@ struct svc_sock { /* Number of queued send requests */ atomic_t sk_sendqlen; + struct completion sk_handshake_done; + struct page * sk_pages[RPCSVC_MAXPAGES]; /* received data */ }; diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5a3bb42e1f50..31bc7025cb44 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1857,7 +1857,10 @@ TRACE_EVENT(svc_stats_latency, { BIT(XPT_CACHE_AUTH), "CACHE_AUTH" }, \ { BIT(XPT_LOCAL), "LOCAL" }, \ { BIT(XPT_KILL_TEMP), "KILL_TEMP" }, \ - { BIT(XPT_CONG_CTRL), "CONG_CTRL" }) + { BIT(XPT_CONG_CTRL), "CONG_CTRL" }, \ + { BIT(XPT_HANDSHAKE), "HANDSHAKE" }, \ + { BIT(XPT_TLS_SESSION), "TLS_SESSION" }, \ + { BIT(XPT_PEER_AUTH), "PEER_AUTH" }) TRACE_EVENT(svc_xprt_create_err, TP_PROTO( @@ -1990,6 +1993,17 @@ DEFINE_SVC_XPRT_EVENT(close); DEFINE_SVC_XPRT_EVENT(detach); DEFINE_SVC_XPRT_EVENT(free); +#define DEFINE_SVC_TLS_EVENT(name) \ + DEFINE_EVENT(svc_xprt_event, svc_tls_##name, \ + TP_PROTO(const struct svc_xprt *xprt), \ + TP_ARGS(xprt)) + +DEFINE_SVC_TLS_EVENT(start); +DEFINE_SVC_TLS_EVENT(upcall); +DEFINE_SVC_TLS_EVENT(unavailable); +DEFINE_SVC_TLS_EVENT(not_started); +DEFINE_SVC_TLS_EVENT(timed_out); + TRACE_EVENT(svc_xprt_accept, TP_PROTO( const struct svc_xprt *xprt, diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 3b9708b39e35..84e5d7d31481 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -427,7 +427,7 @@ static bool svc_xprt_ready(struct svc_xprt *xprt) if (xpt_flags & BIT(XPT_BUSY)) return false; - if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE))) + if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE) | BIT(XPT_HANDSHAKE))) return true; if (xpt_flags & (BIT(XPT_DATA) | BIT(XPT_DEFERRED))) { if (xprt->xpt_ops->xpo_has_wspace(xprt) && @@ -828,6 +828,9 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) module_put(xprt->xpt_class->xcl_owner); } svc_xprt_received(xprt); + } else if (test_bit(XPT_HANDSHAKE, &xprt->xpt_flags)) { + xprt->xpt_ops->xpo_handshake(xprt); + svc_xprt_received(xprt); } else if (svc_xprt_reserve_slot(rqstp, xprt)) { /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 4485088ce27b..174783f804fa 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -17,8 +17,9 @@ #include #include #include -#define RPCDBG_FACILITY RPCDBG_AUTH +#include +#define RPCDBG_FACILITY RPCDBG_AUTH #include "netns.h" @@ -832,6 +833,7 @@ svcauth_tls_accept(struct svc_rqst *rqstp) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct svc_cred *cred = &rqstp->rq_cred; + struct svc_xprt *xprt = rqstp->rq_xprt; u32 flavor, len; void *body; __be32 *p; @@ -865,14 +867,19 @@ svcauth_tls_accept(struct svc_rqst *rqstp) if (cred->cr_group_info == NULL) return SVC_CLOSE; - if (rqstp->rq_xprt->xpt_ops->xpo_start_tls) { + if (xprt->xpt_ops->xpo_handshake) { p = xdr_reserve_space(&rqstp->rq_res_stream, XDR_UNIT * 2 + 8); if (!p) return SVC_CLOSE; + trace_svc_tls_start(xprt); *p++ = rpc_auth_null; *p++ = cpu_to_be32(8); memcpy(p, "STARTTLS", 8); + + set_bit(XPT_HANDSHAKE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); } else { + trace_svc_tls_unavailable(xprt); if (xdr_stream_encode_opaque_auth(&rqstp->rq_res_stream, RPC_AUTH_NULL, NULL, 0) < 0) return SVC_CLOSE; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c5b74f523fc4..a51c9b989d58 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -44,9 +44,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -64,6 +66,12 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +/* To-do: to avoid tying up an nfsd thread while waiting for a + * handshake request, the request could instead be deferred. + */ +enum { + SVC_HANDSHAKE_TO = 5U * HZ +}; static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int flags); @@ -359,6 +367,8 @@ static void svc_data_ready(struct sock *sk) rmb(); svsk->sk_odata(sk); trace_svcsock_data_ready(&svsk->sk_xprt, 0); + if (test_bit(XPT_HANDSHAKE, &svsk->sk_xprt.xpt_flags)) + return; if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags)) svc_xprt_enqueue(&svsk->sk_xprt); } @@ -396,6 +406,88 @@ static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt) sock_no_linger(svsk->sk_sock->sk); } +/** + * svc_tcp_handshake_done - Handshake completion handler + * @data: address of xprt to wake + * @status: status of handshake + * @peerid: serial number of key containing the remote peer's identity + * + * If a security policy is specified as an export option, we don't + * have a specific export here to check. So we set a "TLS session + * is present" flag on the xprt and let an upper layer enforce local + * security policy. + */ +static void svc_tcp_handshake_done(void *data, int status, key_serial_t peerid) +{ + struct svc_xprt *xprt = data; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + + if (!status) { + if (peerid != TLS_NO_PEERID) + set_bit(XPT_PEER_AUTH, &xprt->xpt_flags); + set_bit(XPT_TLS_SESSION, &xprt->xpt_flags); + } + clear_bit(XPT_HANDSHAKE, &xprt->xpt_flags); + complete_all(&svsk->sk_handshake_done); +} + +/** + * svc_tcp_handshake - Perform a transport-layer security handshake + * @xprt: connected transport endpoint + * + */ +static void svc_tcp_handshake(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct sock *sk = svsk->sk_sock->sk; + struct tls_handshake_args args = { + .ta_sock = svsk->sk_sock, + .ta_done = svc_tcp_handshake_done, + .ta_data = xprt, + }; + int ret; + + trace_svc_tls_upcall(xprt); + + clear_bit(XPT_TLS_SESSION, &xprt->xpt_flags); + init_completion(&svsk->sk_handshake_done); + + ret = tls_server_hello_x509(&args, GFP_KERNEL); + if (ret) { + trace_svc_tls_not_started(xprt); + goto out_failed; + } + + ret = wait_for_completion_interruptible_timeout(&svsk->sk_handshake_done, + SVC_HANDSHAKE_TO); + if (ret <= 0) { + if (tls_handshake_cancel(sk)) { + trace_svc_tls_timed_out(xprt); + goto out_close; + } + } + + if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) { + trace_svc_tls_unavailable(xprt); + goto out_close; + } + + /* Mark the transport ready in case the remote sent RPC + * traffic before the kernel received the handshake + * completion downcall. + */ + set_bit(XPT_DATA, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + return; + +out_close: + set_bit(XPT_CLOSE, &xprt->xpt_flags); +out_failed: + clear_bit(XPT_HANDSHAKE, &xprt->xpt_flags); + set_bit(XPT_DATA, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); +} + /* * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo */ @@ -1257,6 +1349,7 @@ static const struct svc_xprt_ops svc_tcp_ops = { .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, .xpo_kill_temp_xprt = svc_tcp_kill_temp_xprt, + .xpo_handshake = svc_tcp_handshake, }; static struct svc_xprt_class svc_tcp_class = { @@ -1580,10 +1673,12 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) static void svc_sock_free(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct socket *sock = svsk->sk_sock; - if (svsk->sk_sock->file) - sockfd_put(svsk->sk_sock); + tls_handshake_cancel(sock->sk); + if (sock->file) + sockfd_put(sock); else - sock_release(svsk->sk_sock); + sock_release(sock); kfree(svsk); } -- cgit v1.2.3 From 9280c577431401544e63dfb489a830a42bee25eb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 20 Apr 2023 13:56:31 -0400 Subject: NFSD: Handle new xprtsec= export option Enable administrators to require clients to use transport layer security when accessing particular exports. Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 51 +++++++++++++++++++++++++++++++++++++--- fs/nfsd/export.h | 1 + include/uapi/linux/nfsd/export.h | 13 ++++++++++ 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 6da74aebe1fb..ae85257b4238 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -439,7 +439,6 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid) return -EINVAL; } return 0; - } #ifdef CONFIG_NFSD_V4 @@ -546,6 +545,29 @@ static inline int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif +static int xprtsec_parse(char **mesg, char *buf, struct svc_export *exp) +{ + unsigned int i, mode, listsize; + int err; + + err = get_uint(mesg, &listsize); + if (err) + return err; + if (listsize > NFSEXP_XPRTSEC_NUM) + return -EINVAL; + + exp->ex_xprtsec_modes = 0; + for (i = 0; i < listsize; i++) { + err = get_uint(mesg, &mode); + if (err) + return err; + if (mode > NFSEXP_XPRTSEC_MTLS) + return -EINVAL; + exp->ex_xprtsec_modes |= mode; + } + return 0; +} + static inline int nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid) { @@ -608,6 +630,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; exp.cd = cd; exp.ex_devid_map = NULL; + exp.ex_xprtsec_modes = NFSEXP_XPRTSEC_ALL; /* expiry */ err = get_expiry(&mesg, &exp.h.expiry_time); @@ -649,6 +672,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid); else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); + else if (strcmp(buf, "xprtsec") == 0) + err = xprtsec_parse(&mesg, buf, &exp); else /* quietly ignore unknown words and anything * following. Newer user-space can try to set @@ -662,6 +687,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid); if (err) goto out4; + /* * No point caching this if it would immediately expire. * Also, this protects exportfs's dummy export from the @@ -823,6 +849,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; } + new->ex_xprtsec_modes = item->ex_xprtsec_modes; } static struct cache_head *svc_export_alloc(void) @@ -1034,9 +1061,26 @@ static struct svc_export *exp_find(struct cache_detail *cd, __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) { - struct exp_flavor_info *f; - struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; + struct svc_xprt *xprt = rqstp->rq_xprt; + + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) { + if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) + goto ok; + } + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) { + if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && + !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) + goto ok; + } + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) { + if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && + test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) + goto ok; + } + goto denied; +ok: /* legacy gss-only clients are always OK: */ if (exp->ex_client == rqstp->rq_gssclient) return 0; @@ -1061,6 +1105,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) if (nfsd4_spo_must_allow(rqstp)) return 0; +denied: return rqstp->rq_vers < 4 ? nfserr_acces : nfserr_wrongsec; } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index d03f7f6a8642..2df8ae25aad3 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -77,6 +77,7 @@ struct svc_export { struct cache_detail *cd; struct rcu_head ex_rcu; struct export_stats ex_stats; + unsigned long ex_xprtsec_modes; }; /* an "export key" (expkey) maps a filehandlefragement to an diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index 2124ba904779..a73ca3703abb 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -62,5 +62,18 @@ | NFSEXP_ALLSQUASH \ | NFSEXP_INSECURE_PORT) +/* + * Transport layer security policies that are permitted to access + * an export + */ +#define NFSEXP_XPRTSEC_NONE 0x0001 +#define NFSEXP_XPRTSEC_TLS 0x0002 +#define NFSEXP_XPRTSEC_MTLS 0x0004 + +#define NFSEXP_XPRTSEC_NUM (3) + +#define NFSEXP_XPRTSEC_ALL (NFSEXP_XPRTSEC_NONE | \ + NFSEXP_XPRTSEC_TLS | \ + NFSEXP_XPRTSEC_MTLS) #endif /* _UAPINFSD_EXPORT_H */ -- cgit v1.2.3