From fd19ca36fd782b84f71b86525b91a905cda913a4 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Thu, 29 Jun 2023 18:52:39 -0700 Subject: NFSD: handle GETATTR conflict with write delegation If the GETATTR request on a file that has write delegation in effect and the request attributes include the change info and size attribute then the write delegation is recalled. If the delegation is returned within 30ms then the GETATTR is serviced as normal otherwise the NFS4ERR_DELAY error is returned for the GETATTR. Add counter for write delegation recall due to conflict GETATTR. This is used to evaluate the need to implement CB_GETATTR to adoid recalling the delegation with conflit GETATTR. Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/nfs4xdr.c | 5 +++++ fs/nfsd/state.h | 3 +++ fs/nfsd/stats.c | 2 ++ fs/nfsd/stats.h | 7 ++++++ 5 files changed, 82 insertions(+) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index daf305daa751..b56ea72d4350 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8341,3 +8341,68 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, { get_stateid(cstate, &u->write.wr_stateid); } + +/** + * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict + * @rqstp: RPC transaction context + * @inode: file to be checked for a conflict + * + * This function is called when there is a conflict between a write + * delegation and a change/size GETATTR from another client. The server + * must either use the CB_GETATTR to get the current values of the + * attributes from the client that holds the delegation or recall the + * delegation before replying to the GETATTR. See RFC 8881 section + * 18.7.4. + * + * The current implementation does not support CB_GETATTR yet. However + * this can avoid recalling the delegation could be added in follow up + * work. + * + * Returns 0 if there is no conflict; otherwise an nfs_stat + * code is returned. + */ +__be32 +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) +{ + __be32 status; + struct file_lock_context *ctx; + struct file_lock *fl; + struct nfs4_delegation *dp; + + ctx = locks_inode_context(inode); + if (!ctx) + return 0; + spin_lock(&ctx->flc_lock); + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_flags == FL_LAYOUT) + continue; + if (fl->fl_lmops != &nfsd_lease_mng_ops) { + /* + * non-nfs lease, if it's a lease with F_RDLCK then + * we are done; there isn't any write delegation + * on this inode + */ + if (fl->fl_type == F_RDLCK) + break; + goto break_lease; + } + if (fl->fl_type == F_WRLCK) { + dp = fl->fl_owner; + if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { + spin_unlock(&ctx->flc_lock); + return 0; + } +break_lease: + spin_unlock(&ctx->flc_lock); + nfsd_stats_wdeleg_getattr_inc(); + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + return 0; + } + break; + } + spin_unlock(&ctx->flc_lock); + return 0; +} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b30dca7de8cc..95aafe52fa0c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2984,6 +2984,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, if (status) goto out; } + if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { + status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry)); + if (status) + goto out; + } err = vfs_getattr(&path, &stat, STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d49d3060ed4f..cbddcf484dba 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -732,4 +732,7 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE); return clp->cl_state == NFSD4_EXPIRABLE; } + +extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, + struct inode *inode); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 777e24e5da33..63797635e1c3 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -65,6 +65,8 @@ static int nfsd_show(struct seq_file *seq, void *v) seq_printf(seq, " %lld", percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)])); } + seq_printf(seq, "\nwdeleg_getattr %lld", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR])); seq_putc(seq, '\n'); #endif diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 9b43dc3d9991..cf5524e7ca06 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -22,6 +22,7 @@ enum { NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, #define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) + NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ #endif NFSD_STATS_COUNTERS_NUM }; @@ -93,4 +94,10 @@ static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); } +#ifdef CONFIG_NFSD_V4 +static inline void nfsd_stats_wdeleg_getattr_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]); +} +#endif #endif /* _NFSD_STATS_H */ -- cgit v1.2.3 From 50bce06f0e7993aeaa03d39a8f8979b31e5862bd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 19 Jul 2023 11:33:09 -0400 Subject: NFSD: Report zero space limit for write delegations Replace the -1 (no limit) with a zero (no reserved space). This prevents certain non-determinant client behavior, such as silly-renaming a file when the only open reference is a write delegation. Such a rename can leave unexpected .nfs files in a directory that is otherwise supposed to be empty. Note that other server implementations that support write delegation also set this field to zero. Suggested-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 95aafe52fa0c..d4de39404cde 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3978,17 +3978,20 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); if (nfserr) return nfserr; - p = xdr_reserve_space(xdr, 32); + + p = xdr_reserve_space(xdr, XDR_UNIT * 8); if (!p) return nfserr_resource; *p++ = cpu_to_be32(open->op_recall); /* + * Always flush on close + * * TODO: space_limit's in delegations */ *p++ = cpu_to_be32(NFS4_LIMIT_SIZE); - *p++ = cpu_to_be32(~(u32)0); - *p++ = cpu_to_be32(~(u32)0); + *p++ = xdr_zero; + *p++ = xdr_zero; /* * TODO: ACE's in delegations -- cgit v1.2.3 From 1d3dd1d56ce8322fb5b2a143ec9ff38c703bfeda Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Thu, 29 Jun 2023 18:52:40 -0700 Subject: NFSD: Enable write delegation support This patch grants write delegations for OPEN with NFS4_SHARE_ACCESS_WRITE if there is no conflict with other OPENs. Write delegation conflicts with another OPEN, REMOVE, RENAME and SETATTR are handled the same as read delegation using notify_change, try_break_deleg. The NFSv4.0 protocol does not enable a server to determine that a conflicting GETATTR originated from the client holding the delegation versus coming from some other client. With NFSv4.1 and later, the SEQUENCE operation that begins each COMPOUND contains a client ID, so delegation recall can be safely squelched in this case. With NFSv4.0, however, the server must recall or send a CB_GETATTR (per RFC 7530 Section 16.7.5) even when the GETATTR originates from the client holding that delegation. An NFSv4.0 client can trigger a pathological situation if it always sends a DELEGRETURN preceded by a conflicting GETATTR in the same COMPOUND. COMPOUND execution will always stop at the GETATTR and the DELEGRETURN will never get executed. The server eventually revokes the delegation, which can result in loss of open or lock state. Tracepoint added to track whether read or write delegation is granted. Signed-off-by: Dai Ngo Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 97 ++++++++++++++++++++++++++++++++++++++++++----------- fs/nfsd/trace.h | 1 + 2 files changed, 78 insertions(+), 20 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b56ea72d4350..8534693eb6a4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -649,6 +649,18 @@ find_readable_file(struct nfs4_file *f) return ret; } +static struct nfsd_file * +find_rw_file(struct nfs4_file *f) +{ + struct nfsd_file *ret; + + spin_lock(&f->fi_lock); + ret = nfsd_file_get(f->fi_fds[O_RDWR]); + spin_unlock(&f->fi_lock); + + return ret; +} + struct nfsd_file * find_any_file(struct nfs4_file *f) { @@ -1144,7 +1156,7 @@ static void block_delegations(struct knfsd_fh *fh) static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, - struct nfs4_clnt_odstate *odstate) + struct nfs4_clnt_odstate *odstate, u32 dl_type) { struct nfs4_delegation *dp; long n; @@ -1170,7 +1182,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, INIT_LIST_HEAD(&dp->dl_recall_lru); dp->dl_clnt_odstate = odstate; get_clnt_odstate(odstate); - dp->dl_type = NFS4_OPEN_DELEGATE_READ; + dp->dl_type = dl_type; dp->dl_retries = 1; dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, @@ -5449,8 +5461,9 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct nfs4_file *fp = stp->st_stid.sc_file; struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; struct nfs4_delegation *dp; - struct nfsd_file *nf; + struct nfsd_file *nf = NULL; struct file_lock *fl; + u32 dl_type; /* * The fi_had_conflict and nfs_get_existing_delegation checks @@ -5460,15 +5473,35 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); - nf = find_readable_file(fp); - if (!nf) { - /* - * We probably could attempt another open and get a read - * delegation, but for now, don't bother until the - * client actually sends us one. - */ - return ERR_PTR(-EAGAIN); + /* + * Try for a write delegation first. RFC8881 section 10.4 says: + * + * "An OPEN_DELEGATE_WRITE delegation allows the client to handle, + * on its own, all opens." + * + * Furthermore the client can use a write delegation for most READ + * operations as well, so we require a O_RDWR file here. + * + * Offer a write delegation in the case of a BOTH open, and ensure + * we get the O_RDWR descriptor. + */ + if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == NFS4_SHARE_ACCESS_BOTH) { + nf = find_rw_file(fp); + dl_type = NFS4_OPEN_DELEGATE_WRITE; } + + /* + * If the file is being opened O_RDONLY or we couldn't get a O_RDWR + * file for some reason, then try for a read delegation instead. + */ + if (!nf && (open->op_share_access & NFS4_SHARE_ACCESS_READ)) { + nf = find_readable_file(fp); + dl_type = NFS4_OPEN_DELEGATE_READ; + } + + if (!nf) + return ERR_PTR(-EAGAIN); + spin_lock(&state_lock); spin_lock(&fp->fi_lock); if (nfs4_delegation_exists(clp, fp)) @@ -5491,11 +5524,11 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, return ERR_PTR(status); status = -ENOMEM; - dp = alloc_init_deleg(clp, fp, odstate); + dp = alloc_init_deleg(clp, fp, odstate, dl_type); if (!dp) goto out_delegees; - fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); + fl = nfs4_alloc_init_lease(dp, dl_type); if (!fl) goto out_clnt_odstate; @@ -5568,10 +5601,28 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) } /* - * Attempt to hand out a delegation. + * The Linux NFS server does not offer write delegations to NFSv4.0 + * clients in order to avoid conflicts between write delegations and + * GETATTRs requesting CHANGE or SIZE attributes. + * + * With NFSv4.1 and later minorversions, the SEQUENCE operation that + * begins each COMPOUND contains a client ID. Delegation recall can + * be avoided when the server recognizes the client sending a + * GETATTR also holds write delegation it conflicts with. + * + * However, the NFSv4.0 protocol does not enable a server to + * determine that a GETATTR originated from the client holding the + * conflicting delegation versus coming from some other client. Per + * RFC 7530 Section 16.7.5, the server must recall or send a + * CB_GETATTR even when the GETATTR originates from the client that + * holds the conflicting delegation. * - * Note we don't support write delegations, and won't until the vfs has - * proper support for them. + * An NFSv4.0 client can trigger a pathological situation if it + * always sends a DELEGRETURN preceded by a conflicting GETATTR in + * the same COMPOUND. COMPOUND execution will always stop at the + * GETATTR and the DELEGRETURN will never get executed. The server + * eventually revokes the delegation, which can result in loss of + * open or lock state. */ static void nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, @@ -5590,8 +5641,6 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, case NFS4_OPEN_CLAIM_PREVIOUS: if (!cb_up) open->op_recall = 1; - if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ) - goto out_no_deleg; break; case NFS4_OPEN_CLAIM_NULL: parent = currentfh; @@ -5606,6 +5655,9 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out_no_deleg; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE && + !clp->cl_minorversion) + goto out_no_deleg; break; default: goto out_no_deleg; @@ -5616,8 +5668,13 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); - trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); - open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { + open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE; + trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); + } else { + open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); + } nfs4_put_stid(&dp->dl_stid); return; out_no_deleg: diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 2af74983f146..693fe6d465aa 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -607,6 +607,7 @@ DEFINE_STATEID_EVENT(layout_recall_release); DEFINE_STATEID_EVENT(open); DEFINE_STATEID_EVENT(deleg_read); +DEFINE_STATEID_EVENT(deleg_write); DEFINE_STATEID_EVENT(deleg_return); DEFINE_STATEID_EVENT(deleg_recall); -- cgit v1.2.3 From 35308e7f0fc3942edc87d9c6dc78c4a096428957 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Jul 2023 11:45:16 -0400 Subject: NFSD: Refactor nfsd_reply_cache_free_locked() To reduce contention on the bucket locks, we must avoid calling kfree() while each bucket lock is held. Start by refactoring nfsd_reply_cache_free_locked() into a helper that removes an entry from the bucket (and must therefore run under the lock) and a second helper that frees the entry (which does not need to hold the lock). For readability, rename the helpers nfsd_cacherep_. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfscache.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index a8eda1c85829..3fde7f6e4ef8 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -110,21 +110,33 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum, return rp; } +static void nfsd_cacherep_free(struct svc_cacherep *rp) +{ + if (rp->c_type == RC_REPLBUFF) + kfree(rp->c_replvec.iov_base); + kmem_cache_free(drc_slab, rp); +} + static void -nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, - struct nfsd_net *nn) +nfsd_cacherep_unlink_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, + struct svc_cacherep *rp) { - if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { + if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len); - kfree(rp->c_replvec.iov_base); - } if (rp->c_state != RC_UNUSED) { rb_erase(&rp->c_node, &b->rb_head); list_del(&rp->c_lru); atomic_dec(&nn->num_drc_entries); nfsd_stats_drc_mem_usage_sub(nn, sizeof(*rp)); } - kmem_cache_free(drc_slab, rp); +} + +static void +nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, + struct nfsd_net *nn) +{ + nfsd_cacherep_unlink_locked(nn, b, rp); + nfsd_cacherep_free(rp); } static void @@ -132,8 +144,9 @@ nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, struct nfsd_net *nn) { spin_lock(&b->cache_lock); - nfsd_reply_cache_free_locked(b, rp, nn); + nfsd_cacherep_unlink_locked(nn, b, rp); spin_unlock(&b->cache_lock); + nfsd_cacherep_free(rp); } int nfsd_drc_slab_create(void) -- cgit v1.2.3 From ff0d169329768c1102b7b07eebe5a9839aa1c143 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Jul 2023 11:45:22 -0400 Subject: NFSD: Rename nfsd_reply_cache_alloc() For readability, rename to match the other helpers. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfscache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 3fde7f6e4ef8..02259d280f51 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -85,8 +85,8 @@ nfsd_hashsize(unsigned int limit) } static struct svc_cacherep * -nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum, - struct nfsd_net *nn) +nfsd_cacherep_alloc(struct svc_rqst *rqstp, __wsum csum, + struct nfsd_net *nn) { struct svc_cacherep *rp; @@ -458,7 +458,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) * preallocate an entry. */ nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - rp = nfsd_reply_cache_alloc(rqstp, csum, nn); + rp = nfsd_cacherep_alloc(rqstp, csum, nn); if (!rp) goto out; -- cgit v1.2.3 From a9507f6af1450ed26a4a36d979af518f5bb21e5d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Jul 2023 11:45:29 -0400 Subject: NFSD: Replace nfsd_prune_bucket() Enable nfsd_prune_bucket() to drop the bucket lock while calling kfree(). Use the same pattern that Jeff recently introduced in the NFSD filecache. A few percpu operations are moved outside the lock since they temporarily disable local IRQs which is expensive and does not need to be done while the lock is held. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfscache.c | 78 +++++++++++++++++++++++++++++++++++++++++++----------- fs/nfsd/trace.h | 22 +++++++++++++++ 2 files changed, 85 insertions(+), 15 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 02259d280f51..787d15b62336 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -117,6 +117,21 @@ static void nfsd_cacherep_free(struct svc_cacherep *rp) kmem_cache_free(drc_slab, rp); } +static unsigned long +nfsd_cacherep_dispose(struct list_head *dispose) +{ + struct svc_cacherep *rp; + unsigned long freed = 0; + + while (!list_empty(dispose)) { + rp = list_first_entry(dispose, struct svc_cacherep, c_lru); + list_del(&rp->c_lru); + nfsd_cacherep_free(rp); + freed++; + } + return freed; +} + static void nfsd_cacherep_unlink_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, struct svc_cacherep *rp) @@ -260,6 +275,41 @@ nfsd_cache_bucket_find(__be32 xid, struct nfsd_net *nn) return &nn->drc_hashtbl[hash]; } +/* + * Remove and return no more than @max expired entries in bucket @b. + * If @max is zero, do not limit the number of removed entries. + */ +static void +nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, + unsigned int max, struct list_head *dispose) +{ + unsigned long expiry = jiffies - RC_EXPIRE; + struct svc_cacherep *rp, *tmp; + unsigned int freed = 0; + + lockdep_assert_held(&b->cache_lock); + + /* The bucket LRU is ordered oldest-first. */ + list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) { + /* + * Don't free entries attached to calls that are still + * in-progress, but do keep scanning the list. + */ + if (rp->c_state == RC_INPROG) + continue; + + if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries && + time_before(expiry, rp->c_timestamp)) + break; + + nfsd_cacherep_unlink_locked(nn, b, rp); + list_add(&rp->c_lru, dispose); + + if (max && ++freed > max) + break; + } +} + static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn, unsigned int max) { @@ -283,11 +333,6 @@ static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn, return freed; } -static long nfsd_prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn) -{ - return prune_bucket(b, nn, 3); -} - /* * Walk the LRU list and prune off entries that are older than RC_EXPIRE. * Also prune the oldest ones when the total exceeds the max number of entries. @@ -443,6 +488,8 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) __wsum csum; struct nfsd_drc_bucket *b; int type = rqstp->rq_cachetype; + unsigned long freed; + LIST_HEAD(dispose); int rtn = RC_DOIT; rqstp->rq_cacherep = NULL; @@ -467,20 +514,18 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) found = nfsd_cache_insert(b, rp, nn); if (found != rp) goto found_entry; - - nfsd_stats_rc_misses_inc(); rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; + nfsd_prune_bucket_locked(nn, b, 3, &dispose); + spin_unlock(&b->cache_lock); + freed = nfsd_cacherep_dispose(&dispose); + trace_nfsd_drc_gc(nn, freed); + + nfsd_stats_rc_misses_inc(); atomic_inc(&nn->num_drc_entries); nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp)); - - nfsd_prune_bucket(b, nn); - -out_unlock: - spin_unlock(&b->cache_lock); -out: - return rtn; + goto out; found_entry: /* We found a matching entry which is either in progress or done. */ @@ -518,7 +563,10 @@ found_entry: out_trace: trace_nfsd_drc_found(nn, rqstp, rtn); - goto out_unlock; +out_unlock: + spin_unlock(&b->cache_lock); +out: + return rtn; } /** diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 693fe6d465aa..c48419c0a58a 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1262,6 +1262,28 @@ TRACE_EVENT(nfsd_drc_mismatch, __entry->ingress) ); +TRACE_EVENT_CONDITION(nfsd_drc_gc, + TP_PROTO( + const struct nfsd_net *nn, + unsigned long freed + ), + TP_ARGS(nn, freed), + TP_CONDITION(freed > 0), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + __field(unsigned long, freed) + __field(int, total) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + __entry->freed = freed; + __entry->total = atomic_read(&nn->num_drc_entries); + ), + TP_printk("boot_time=%16llx total=%d freed=%lu", + __entry->boot_time, __entry->total, __entry->freed + ) +); + TRACE_EVENT(nfsd_cb_args, TP_PROTO( const struct nfs4_client *clp, -- cgit v1.2.3 From c135e1269f34dfdea4bd94c11060c83a3c0b3c12 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Jul 2023 11:45:35 -0400 Subject: NFSD: Refactor the duplicate reply cache shrinker Avoid holding the bucket lock while freeing cache entries. This change also caps the number of entries that are freed when the shrinker calls to reduce the shrinker's impact on the cache's effectiveness. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfscache.c | 82 ++++++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 43 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 787d15b62336..6599a12830db 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -310,68 +310,64 @@ nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, } } -static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn, - unsigned int max) +/** + * nfsd_reply_cache_count - count_objects method for the DRC shrinker + * @shrink: our registered shrinker context + * @sc: garbage collection parameters + * + * Returns the total number of entries in the duplicate reply cache. To + * keep things simple and quick, this is not the number of expired entries + * in the cache (ie, the number that would be removed by a call to + * nfsd_reply_cache_scan). + */ +static unsigned long +nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct svc_cacherep *rp, *tmp; - long freed = 0; + struct nfsd_net *nn = container_of(shrink, + struct nfsd_net, nfsd_reply_cache_shrinker); - list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) { - /* - * Don't free entries attached to calls that are still - * in-progress, but do keep scanning the list. - */ - if (rp->c_state == RC_INPROG) - continue; - if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries && - time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) - break; - nfsd_reply_cache_free_locked(b, rp, nn); - if (max && freed++ > max) - break; - } - return freed; + return atomic_read(&nn->num_drc_entries); } -/* - * Walk the LRU list and prune off entries that are older than RC_EXPIRE. - * Also prune the oldest ones when the total exceeds the max number of entries. +/** + * nfsd_reply_cache_scan - scan_objects method for the DRC shrinker + * @shrink: our registered shrinker context + * @sc: garbage collection parameters + * + * Free expired entries on each bucket's LRU list until we've released + * nr_to_scan freed objects. Nothing will be released if the cache + * has not exceeded it's max_drc_entries limit. + * + * Returns the number of entries released by this call. */ -static long -prune_cache_entries(struct nfsd_net *nn) +static unsigned long +nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { + struct nfsd_net *nn = container_of(shrink, + struct nfsd_net, nfsd_reply_cache_shrinker); + unsigned long freed = 0; + LIST_HEAD(dispose); unsigned int i; - long freed = 0; for (i = 0; i < nn->drc_hashsize; i++) { struct nfsd_drc_bucket *b = &nn->drc_hashtbl[i]; if (list_empty(&b->lru_head)) continue; + spin_lock(&b->cache_lock); - freed += prune_bucket(b, nn, 0); + nfsd_prune_bucket_locked(nn, b, 0, &dispose); spin_unlock(&b->cache_lock); - } - return freed; -} -static unsigned long -nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) -{ - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); + freed += nfsd_cacherep_dispose(&dispose); + if (freed > sc->nr_to_scan) + break; + } - return atomic_read(&nn->num_drc_entries); + trace_nfsd_drc_gc(nn, freed); + return freed; } -static unsigned long -nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) -{ - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); - - return prune_cache_entries(nn); -} /* * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes */ -- cgit v1.2.3 From cb18eca4b86768ec79e847795d1043356c9ee3b0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Jul 2023 11:45:41 -0400 Subject: NFSD: Remove svc_rqst::rq_cacherep Over time I'd like to see NFS-specific fields moved out of struct svc_rqst, which is an RPC layer object. These fields are layering violations. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/cache.h | 6 ++++-- fs/nfsd/nfscache.c | 11 ++++++----- fs/nfsd/nfssvc.c | 10 ++++++---- include/linux/sunrpc/svc.h | 1 - 4 files changed, 16 insertions(+), 12 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 4c9b87850ab1..27610b071880 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -84,8 +84,10 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn); void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); -int nfsd_cache_lookup(struct svc_rqst *); -void nfsd_cache_update(struct svc_rqst *, int, __be32 *); +int nfsd_cache_lookup(struct svc_rqst *rqstp, + struct svc_cacherep **cacherep); +void nfsd_cache_update(struct svc_rqst *rqstp, struct svc_cacherep *rp, + int cachetype, __be32 *statp); int nfsd_reply_cache_stats_show(struct seq_file *m, void *v); #endif /* NFSCACHE_H */ diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 6599a12830db..b259fc373ae7 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -465,6 +465,7 @@ out: /** * nfsd_cache_lookup - Find an entry in the duplicate reply cache * @rqstp: Incoming Call to find + * @cacherep: OUT: DRC entry for this request * * Try to find an entry matching the current call in the cache. When none * is found, we try to grab the oldest expired entry off the LRU list. If @@ -477,7 +478,7 @@ out: * %RC_REPLY: Reply from cache * %RC_DROPIT: Do not process the request further */ -int nfsd_cache_lookup(struct svc_rqst *rqstp) +int nfsd_cache_lookup(struct svc_rqst *rqstp, struct svc_cacherep **cacherep) { struct nfsd_net *nn; struct svc_cacherep *rp, *found; @@ -488,7 +489,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) LIST_HEAD(dispose); int rtn = RC_DOIT; - rqstp->rq_cacherep = NULL; if (type == RC_NOCACHE) { nfsd_stats_rc_nocache_inc(); goto out; @@ -510,7 +510,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) found = nfsd_cache_insert(b, rp, nn); if (found != rp) goto found_entry; - rqstp->rq_cacherep = rp; + *cacherep = rp; rp->c_state = RC_INPROG; nfsd_prune_bucket_locked(nn, b, 3, &dispose); spin_unlock(&b->cache_lock); @@ -568,6 +568,7 @@ out: /** * nfsd_cache_update - Update an entry in the duplicate reply cache. * @rqstp: svc_rqst with a finished Reply + * @rp: IN: DRC entry for this request * @cachetype: which cache to update * @statp: pointer to Reply's NFS status code, or NULL * @@ -585,10 +586,10 @@ out: * nfsd failed to encode a reply that otherwise would have been cached. * In this case, nfsd_cache_update is called with statp == NULL. */ -void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) +void nfsd_cache_update(struct svc_rqst *rqstp, struct svc_cacherep *rp, + int cachetype, __be32 *statp) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - struct svc_cacherep *rp = rqstp->rq_cacherep; struct kvec *resv = &rqstp->rq_res.head[0], *cachv; struct nfsd_drc_bucket *b; int len; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 2154fa63c5f2..f91fb343313d 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -1046,6 +1046,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; + struct svc_cacherep *rp; /* * Give the xdr decoder a chance to change this if it wants @@ -1056,7 +1057,8 @@ int nfsd_dispatch(struct svc_rqst *rqstp) if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; - switch (nfsd_cache_lookup(rqstp)) { + rp = NULL; + switch (nfsd_cache_lookup(rqstp, &rp)) { case RC_DOIT: break; case RC_REPLY: @@ -1072,7 +1074,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; - nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1); + nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1); out_cached_reply: return 1; @@ -1082,13 +1084,13 @@ out_decode_err: return 1; out_update_drop: - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); out_dropit: return 0; out_encode_err: trace_nfsd_cant_encode_err(rqstp); - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); *statp = rpc_system_err; return 1; } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index f8751118c122..fe1394cc1371 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -265,7 +265,6 @@ struct svc_rqst { /* Catering to nfsd */ struct auth_domain * rq_client; /* RPC peer info */ struct auth_domain * rq_gssclient; /* "gss/"-style peer info */ - struct svc_cacherep * rq_cacherep; /* cache info */ struct task_struct *rq_task; /* service thread */ struct net *rq_bc_net; /* pointer to backchannel's * net namespace -- cgit v1.2.3 From e7421ce71437ec8e4d69cc6bdf35b6853adc5050 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Jul 2023 11:45:48 -0400 Subject: NFSD: Rename struct svc_cacherep The svc_ prefix is identified with the SunRPC layer. Although the duplicate reply cache caches RPC replies, it is only for the NFS protocol. Rename the struct to better reflect its purpose. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/cache.h | 6 +++--- fs/nfsd/nfscache.c | 44 ++++++++++++++++++++++---------------------- fs/nfsd/nfssvc.c | 2 +- fs/nfsd/trace.h | 4 ++-- 4 files changed, 28 insertions(+), 28 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 27610b071880..929248c6ca84 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -19,7 +19,7 @@ * typical sockaddr_storage. This is for space reasons, since sockaddr_storage * is much larger than a sockaddr_in6. */ -struct svc_cacherep { +struct nfsd_cacherep { struct { /* Keep often-read xid, csum in the same cache line: */ __be32 k_xid; @@ -85,8 +85,8 @@ void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); int nfsd_cache_lookup(struct svc_rqst *rqstp, - struct svc_cacherep **cacherep); -void nfsd_cache_update(struct svc_rqst *rqstp, struct svc_cacherep *rp, + struct nfsd_cacherep **cacherep); +void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, int cachetype, __be32 *statp); int nfsd_reply_cache_stats_show(struct seq_file *m, void *v); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index b259fc373ae7..80621a709510 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -84,11 +84,11 @@ nfsd_hashsize(unsigned int limit) return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); } -static struct svc_cacherep * +static struct nfsd_cacherep * nfsd_cacherep_alloc(struct svc_rqst *rqstp, __wsum csum, struct nfsd_net *nn) { - struct svc_cacherep *rp; + struct nfsd_cacherep *rp; rp = kmem_cache_alloc(drc_slab, GFP_KERNEL); if (rp) { @@ -110,7 +110,7 @@ nfsd_cacherep_alloc(struct svc_rqst *rqstp, __wsum csum, return rp; } -static void nfsd_cacherep_free(struct svc_cacherep *rp) +static void nfsd_cacherep_free(struct nfsd_cacherep *rp) { if (rp->c_type == RC_REPLBUFF) kfree(rp->c_replvec.iov_base); @@ -120,11 +120,11 @@ static void nfsd_cacherep_free(struct svc_cacherep *rp) static unsigned long nfsd_cacherep_dispose(struct list_head *dispose) { - struct svc_cacherep *rp; + struct nfsd_cacherep *rp; unsigned long freed = 0; while (!list_empty(dispose)) { - rp = list_first_entry(dispose, struct svc_cacherep, c_lru); + rp = list_first_entry(dispose, struct nfsd_cacherep, c_lru); list_del(&rp->c_lru); nfsd_cacherep_free(rp); freed++; @@ -134,7 +134,7 @@ nfsd_cacherep_dispose(struct list_head *dispose) static void nfsd_cacherep_unlink_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, - struct svc_cacherep *rp) + struct nfsd_cacherep *rp) { if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len); @@ -147,7 +147,7 @@ nfsd_cacherep_unlink_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, } static void -nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, +nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp, struct nfsd_net *nn) { nfsd_cacherep_unlink_locked(nn, b, rp); @@ -155,7 +155,7 @@ nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, } static void -nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, +nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp, struct nfsd_net *nn) { spin_lock(&b->cache_lock); @@ -167,7 +167,7 @@ nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, int nfsd_drc_slab_create(void) { drc_slab = kmem_cache_create("nfsd_drc", - sizeof(struct svc_cacherep), 0, 0, NULL); + sizeof(struct nfsd_cacherep), 0, 0, NULL); return drc_slab ? 0: -ENOMEM; } @@ -236,7 +236,7 @@ out_shrinker: void nfsd_reply_cache_shutdown(struct nfsd_net *nn) { - struct svc_cacherep *rp; + struct nfsd_cacherep *rp; unsigned int i; unregister_shrinker(&nn->nfsd_reply_cache_shrinker); @@ -244,7 +244,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) for (i = 0; i < nn->drc_hashsize; i++) { struct list_head *head = &nn->drc_hashtbl[i].lru_head; while (!list_empty(head)) { - rp = list_first_entry(head, struct svc_cacherep, c_lru); + rp = list_first_entry(head, struct nfsd_cacherep, c_lru); nfsd_reply_cache_free_locked(&nn->drc_hashtbl[i], rp, nn); } @@ -261,7 +261,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) * not already scheduled. */ static void -lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) +lru_put_end(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp) { rp->c_timestamp = jiffies; list_move_tail(&rp->c_lru, &b->lru_head); @@ -284,7 +284,7 @@ nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, unsigned int max, struct list_head *dispose) { unsigned long expiry = jiffies - RC_EXPIRE; - struct svc_cacherep *rp, *tmp; + struct nfsd_cacherep *rp, *tmp; unsigned int freed = 0; lockdep_assert_held(&b->cache_lock); @@ -402,8 +402,8 @@ nfsd_cache_csum(struct svc_rqst *rqstp) } static int -nfsd_cache_key_cmp(const struct svc_cacherep *key, - const struct svc_cacherep *rp, struct nfsd_net *nn) +nfsd_cache_key_cmp(const struct nfsd_cacherep *key, + const struct nfsd_cacherep *rp, struct nfsd_net *nn) { if (key->c_key.k_xid == rp->c_key.k_xid && key->c_key.k_csum != rp->c_key.k_csum) { @@ -419,11 +419,11 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key, * Must be called with cache_lock held. Returns the found entry or * inserts an empty key on failure. */ -static struct svc_cacherep * -nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key, +static struct nfsd_cacherep * +nfsd_cache_insert(struct nfsd_drc_bucket *b, struct nfsd_cacherep *key, struct nfsd_net *nn) { - struct svc_cacherep *rp, *ret = key; + struct nfsd_cacherep *rp, *ret = key; struct rb_node **p = &b->rb_head.rb_node, *parent = NULL; unsigned int entries = 0; @@ -432,7 +432,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key, while (*p != NULL) { ++entries; parent = *p; - rp = rb_entry(parent, struct svc_cacherep, c_node); + rp = rb_entry(parent, struct nfsd_cacherep, c_node); cmp = nfsd_cache_key_cmp(key, rp, nn); if (cmp < 0) @@ -478,10 +478,10 @@ out: * %RC_REPLY: Reply from cache * %RC_DROPIT: Do not process the request further */ -int nfsd_cache_lookup(struct svc_rqst *rqstp, struct svc_cacherep **cacherep) +int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep) { struct nfsd_net *nn; - struct svc_cacherep *rp, *found; + struct nfsd_cacherep *rp, *found; __wsum csum; struct nfsd_drc_bucket *b; int type = rqstp->rq_cachetype; @@ -586,7 +586,7 @@ out: * nfsd failed to encode a reply that otherwise would have been cached. * In this case, nfsd_cache_update is called with statp == NULL. */ -void nfsd_cache_update(struct svc_rqst *rqstp, struct svc_cacherep *rp, +void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, int cachetype, __be32 *statp) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index f91fb343313d..97830e28c140 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -1046,7 +1046,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; - struct svc_cacherep *rp; + struct nfsd_cacherep *rp; /* * Give the xdr decoder a chance to change this if it wants diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index c48419c0a58a..803904348871 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1241,8 +1241,8 @@ TRACE_EVENT(nfsd_drc_found, TRACE_EVENT(nfsd_drc_mismatch, TP_PROTO( const struct nfsd_net *nn, - const struct svc_cacherep *key, - const struct svc_cacherep *rp + const struct nfsd_cacherep *key, + const struct nfsd_cacherep *rp ), TP_ARGS(nn, key, rp), TP_STRUCT__entry( -- cgit v1.2.3 From 5865bafa197a90ecadd086c2874948fa0c474943 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 20 Jul 2023 09:34:53 -0400 Subject: nfsd: add a MODULE_DESCRIPTION I got this today from modpost: WARNING: modpost: missing MODULE_DESCRIPTION() in fs/nfsd/nfsd.o Add a module description. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 4302ca0ff6ed..33f80d289d63 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1627,6 +1627,7 @@ static void __exit exit_nfsd(void) } MODULE_AUTHOR("Olaf Kirch "); +MODULE_DESCRIPTION("In-kernel NFS server"); MODULE_LICENSE("GPL"); module_init(init_nfsd) module_exit(exit_nfsd) -- cgit v1.2.3 From a332018a91c419a9a475c41d827544c771986876 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 21 Jul 2023 10:29:10 -0400 Subject: nfsd: handle failure to collect pre/post-op attrs more sanely Collecting pre_op_attrs can fail, in which case it's probably best to fail the whole operation. Change fh_fill_pre_attrs and fh_fill_both_attrs to return __be32, and have the callers check the return code and abort the operation if it's not nfs_ok. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs3proc.c | 4 +++- fs/nfsd/nfs4proc.c | 14 ++++++++------ fs/nfsd/nfsfh.c | 26 ++++++++++++++++---------- fs/nfsd/nfsfh.h | 6 +++--- fs/nfsd/vfs.c | 52 +++++++++++++++++++++++++++++++++++----------------- 5 files changed, 65 insertions(+), 37 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index fc8d5b7db9f8..268ef57751c4 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -307,7 +307,9 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!IS_POSIXACL(inode)) iap->ia_mode &= ~current_umask(); - fh_fill_pre_attrs(fhp); + status = fh_fill_pre_attrs(fhp); + if (status != nfs_ok) + goto out; host_err = vfs_create(&nop_mnt_idmap, inode, child, iap->ia_mode, true); if (host_err < 0) { status = nfserrno(host_err); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5ae670807449..cdf58d181c9e 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -297,12 +297,12 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, } if (d_really_is_positive(child)) { - status = nfs_ok; - /* NFSv4 protocol requires change attributes even though * no change happened. */ - fh_fill_both_attrs(fhp); + status = fh_fill_both_attrs(fhp); + if (status != nfs_ok) + goto out; switch (open->op_createmode) { case NFS4_CREATE_UNCHECKED: @@ -345,7 +345,9 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!IS_POSIXACL(inode)) iap->ia_mode &= ~current_umask(); - fh_fill_pre_attrs(fhp); + status = fh_fill_pre_attrs(fhp); + if (status != nfs_ok) + goto out; status = nfsd4_vfs_create(fhp, child, open); if (status != nfs_ok) goto out; @@ -424,11 +426,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru } else { status = nfsd_lookup(rqstp, current_fh, open->op_fname, open->op_fnamelen, *resfh); - if (!status) + if (status == nfs_ok) /* NFSv4 protocol requires change attributes even though * no change happened. */ - fh_fill_both_attrs(current_fh); + status = fh_fill_both_attrs(current_fh); } if (status) goto out; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index c291389a1d71..355bf0db3235 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -614,7 +614,7 @@ out_negative: * @fhp: file handle to be updated * */ -void fh_fill_pre_attrs(struct svc_fh *fhp) +__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); struct inode *inode; @@ -622,12 +622,12 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) __be32 err; if (fhp->fh_no_wcc || fhp->fh_pre_saved) - return; + return nfs_ok; inode = d_inode(fhp->fh_dentry); err = fh_getattr(fhp, &stat); if (err) - return; + return err; if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); @@ -636,6 +636,7 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) fhp->fh_pre_ctime = stat.ctime; fhp->fh_pre_size = stat.size; fhp->fh_pre_saved = true; + return nfs_ok; } /** @@ -643,26 +644,27 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) * @fhp: file handle to be updated * */ -void fh_fill_post_attrs(struct svc_fh *fhp) +__be32 fh_fill_post_attrs(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); struct inode *inode = d_inode(fhp->fh_dentry); __be32 err; if (fhp->fh_no_wcc) - return; + return nfs_ok; if (fhp->fh_post_saved) printk("nfsd: inode locked twice during operation.\n"); err = fh_getattr(fhp, &fhp->fh_post_attr); if (err) - return; + return err; fhp->fh_post_saved = true; if (v4) fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, inode); + return nfs_ok; } /** @@ -672,16 +674,20 @@ void fh_fill_post_attrs(struct svc_fh *fhp) * This is used when the directory wasn't changed, but wcc attributes * are needed anyway. */ -void fh_fill_both_attrs(struct svc_fh *fhp) +__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp) { - fh_fill_post_attrs(fhp); - if (!fhp->fh_post_saved) - return; + __be32 err; + + err = fh_fill_post_attrs(fhp); + if (err) + return err; + fhp->fh_pre_change = fhp->fh_post_change; fhp->fh_pre_mtime = fhp->fh_post_attr.mtime; fhp->fh_pre_ctime = fhp->fh_post_attr.ctime; fhp->fh_pre_size = fhp->fh_post_attr.size; fhp->fh_pre_saved = true; + return nfs_ok; } /* diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 4e0ecf0ae2cf..40426f899e76 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -294,7 +294,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) } u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode); -extern void fh_fill_pre_attrs(struct svc_fh *fhp); -extern void fh_fill_post_attrs(struct svc_fh *fhp); -extern void fh_fill_both_attrs(struct svc_fh *fhp); +__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp); +__be32 fh_fill_post_attrs(struct svc_fh *fhp); +__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp); #endif /* _LINUX_NFSD_NFSFH_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2c9074ab2315..c463ef5e0821 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1540,7 +1540,9 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, dput(dchild); if (err) goto out_unlock; - fh_fill_pre_attrs(fhp); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp); fh_fill_post_attrs(fhp); out_unlock: @@ -1635,13 +1637,16 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, inode_unlock(dentry->d_inode); goto out_drop_write; } - fh_fill_pre_attrs(fhp); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path); err = nfserrno(host_err); cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); if (!err) nfsd_create_setattr(rqstp, fhp, resfhp, attrs); fh_fill_post_attrs(fhp); +out_unlock: inode_unlock(dentry->d_inode); if (!err) err = nfserrno(commit_metadata(fhp)); @@ -1703,7 +1708,9 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserr_noent; if (d_really_is_negative(dold)) goto out_dput; - fh_fill_pre_attrs(ffhp); + err = fh_fill_pre_attrs(ffhp); + if (err != nfs_ok) + goto out_dput; host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL); fh_fill_post_attrs(ffhp); inode_unlock(dirp); @@ -1789,8 +1796,12 @@ retry: } trap = lock_rename(tdentry, fdentry); - fh_fill_pre_attrs(ffhp); - fh_fill_pre_attrs(tfhp); + err = fh_fill_pre_attrs(ffhp); + if (err != nfs_ok) + goto out_unlock; + err = fh_fill_pre_attrs(tfhp); + if (err != nfs_ok) + goto out_unlock; odentry = lookup_one_len(fname, fdentry, flen); host_err = PTR_ERR(odentry); @@ -1857,6 +1868,7 @@ retry: fh_fill_post_attrs(ffhp); fh_fill_post_attrs(tfhp); } +out_unlock: unlock_rename(tdentry, fdentry); fh_drop_write(ffhp); @@ -1916,12 +1928,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, goto out_unlock; } rinode = d_inode(rdentry); - ihold(rinode); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; + ihold(rinode); if (!type) type = d_inode(rdentry)->i_mode & S_IFMT; - fh_fill_pre_attrs(fhp); if (type != S_IFDIR) { int retries; @@ -2341,16 +2355,18 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name) return nfserrno(ret); inode_lock(fhp->fh_dentry->d_inode); - fh_fill_pre_attrs(fhp); - + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; ret = __vfs_removexattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, NULL); - + err = nfsd_xattr_errno(ret); fh_fill_post_attrs(fhp); +out_unlock: inode_unlock(fhp->fh_dentry->d_inode); fh_drop_write(fhp); - return nfsd_xattr_errno(ret); + return err; } __be32 @@ -2368,15 +2384,17 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, if (ret) return nfserrno(ret); inode_lock(fhp->fh_dentry->d_inode); - fh_fill_pre_attrs(fhp); - - ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, buf, - len, flags, NULL); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; + ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry, + name, buf, len, flags, NULL); fh_fill_post_attrs(fhp); + err = nfsd_xattr_errno(ret); +out_unlock: inode_unlock(fhp->fh_dentry->d_inode); fh_drop_write(fhp); - - return nfsd_xattr_errno(ret); + return err; } #endif -- cgit v1.2.3 From 976626073a7502fc91416155bd037a29deee729b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 21 Jul 2023 10:29:11 -0400 Subject: nfsd: remove unsafe BUG_ON from set_change_info At one time, nfsd would scrape inode information directly out of struct inode in order to populate the change_info4. At that time, the BUG_ON in set_change_info made some sense, since having it unset meant a coding error. More recently, it calls vfs_getattr to get this information, which can fail. If that fails, fh_pre_saved can end up not being set. While this situation is unfortunate, we don't need to crash the box. Move set_change_info to nfs4proc.c since all of the callers are there. Revise the condition for setting "atomic" to also check for fh_pre_saved. Drop the BUG_ON and just have it zero out both change_attr4s when this occurs. Reported-by: Boyang Xue Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2223560 Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 32 ++++++++++++++++++++++++++++++++ fs/nfsd/xdr4.h | 11 ----------- 2 files changed, 32 insertions(+), 11 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index cdf58d181c9e..a9c84339ff65 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -382,6 +382,38 @@ out: return status; } +/** + * set_change_info - set up the change_info4 for a reply + * @cinfo: pointer to nfsd4_change_info to be populated + * @fhp: pointer to svc_fh to use as source + * + * Many operations in NFSv4 require change_info4 in the reply. This function + * populates that from the info that we (should!) have already collected. In + * the event that we didn't get any pre-attrs, just zero out both. + */ +static void +set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) +{ + cinfo->atomic = (u32)(fhp->fh_pre_saved && fhp->fh_post_saved && !fhp->fh_no_atomic_attr); + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + + /* + * If fetching the pre-change attributes failed, then we should + * have already failed the whole operation. We could have still + * failed to fetch post-change attributes however. + * + * If we didn't get post-op attrs, just zero-out the after + * field since we don't know what it should be. If the pre_saved + * field isn't set for some reason, throw warning and just copy + * whatever is in the after field. + */ + if (WARN_ON_ONCE(!fhp->fh_pre_saved)) + cinfo->before_change = 0; + if (!fhp->fh_post_saved) + cinfo->after_change = 0; +} + static __be32 do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) { diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 510978e602da..9d918a79dc16 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -774,17 +774,6 @@ void warn_on_nonidempotent_op(struct nfsd4_op *op); #define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) -static inline void -set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) -{ - BUG_ON(!fhp->fh_pre_saved); - cinfo->atomic = (u32)(fhp->fh_post_saved && !fhp->fh_no_atomic_attr); - - cinfo->before_change = fhp->fh_pre_change; - cinfo->after_change = fhp->fh_post_change; -} - - bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp); bool nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -- cgit v1.2.3 From f2b7019d2e3c4f1f55be658659804b337dcfac60 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 24 Jul 2023 10:53:39 -0400 Subject: nfsd: set missing after_change as before_change + 1 In the event that we can't fetch post_op_attr attributes, we still need to set a value for the after_change. The operation has already happened, so we're not able to return an error at that point, but we do want to ensure that the client knows that its cache should be invalidated. If we weren't able to fetch post-op attrs, then just set the after_change to before_change + 1. The atomic flag should already be clear in this case. Suggested-by: Neil Brown Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a9c84339ff65..7588fd1859a4 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -411,7 +411,7 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) if (WARN_ON_ONCE(!fhp->fh_pre_saved)) cinfo->before_change = 0; if (!fhp->fh_post_saved) - cinfo->after_change = 0; + cinfo->after_change = cinfo->before_change + 1; } static __be32 -- cgit v1.2.3 From d424797032c6e24b44037e6c7a2d32fd958300f0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 24 Jul 2023 08:13:05 -0400 Subject: nfsd: inherit required unset default acls from effective set A well-formed NFSv4 ACL will always contain OWNER@/GROUP@/EVERYONE@ ACEs, but there is no requirement for inheritable entries for those entities. POSIX ACLs must always have owner/group/other entries, even for a default ACL. nfsd builds the default ACL from inheritable ACEs, but the current code just leaves any unspecified ACEs zeroed out. The result is that adding a default user or group ACE to an inode can leave it with unwanted deny entries. For instance, a newly created directory with no acl will look something like this: # NFSv4 translation by server A::OWNER@:rwaDxtTcCy A::GROUP@:rxtcy A::EVERYONE@:rxtcy # POSIX ACL of underlying file user::rwx group::r-x other::r-x ...if I then add new v4 ACE: nfs4_setfacl -a A:fd:1000:rwx /mnt/local/test ...I end up with a result like this today: user::rwx user:1000:rwx group::r-x mask::rwx other::r-x default:user::--- default:user:1000:rwx default:group::--- default:mask::rwx default:other::--- A::OWNER@:rwaDxtTcCy A::1000:rwaDxtcy A::GROUP@:rxtcy A::EVERYONE@:rxtcy D:fdi:OWNER@:rwaDx A:fdi:OWNER@:tTcCy A:fdi:1000:rwaDxtcy A:fdi:GROUP@:tcy A:fdi:EVERYONE@:tcy ...which is not at all expected. Adding a single inheritable allow ACE should not result in everyone else losing access. The setfacl command solves a silimar issue by copying owner/group/other entries from the effective ACL when none of them are set: "If a Default ACL entry is created, and the Default ACL contains no owner, owning group, or others entry, a copy of the ACL owner, owning group, or others entry is added to the Default ACL. Having nfsd do the same provides a more sane result (with no deny ACEs in the resulting set): user::rwx user:1000:rwx group::r-x mask::rwx other::r-x default:user::rwx default:user:1000:rwx default:group::r-x default:mask::rwx default:other::r-x A::OWNER@:rwaDxtTcCy A::1000:rwaDxtcy A::GROUP@:rxtcy A::EVERYONE@:rxtcy A:fdi:OWNER@:rwaDxtTcCy A:fdi:1000:rwaDxtcy A:fdi:GROUP@:rxtcy A:fdi:EVERYONE@:rxtcy Reported-by: Ondrej Valousek Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2136452 Suggested-by: Andreas Gruenbacher Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4acl.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 518203821790..96e786b5e544 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -441,7 +441,7 @@ struct posix_ace_state_array { * calculated so far: */ struct posix_acl_state { - int empty; + unsigned char valid; struct posix_ace_state owner; struct posix_ace_state group; struct posix_ace_state other; @@ -457,7 +457,6 @@ init_state(struct posix_acl_state *state, int cnt) int alloc; memset(state, 0, sizeof(struct posix_acl_state)); - state->empty = 1; /* * In the worst case, each individual acl could be for a distinct * named user or group, but we don't know which, so we allocate @@ -500,7 +499,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) * and effective cases: when there are no inheritable ACEs, * calls ->set_acl with a NULL ACL structure. */ - if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) + if (!state->valid && (flags & NFS4_ACL_TYPE_DEFAULT)) return NULL; /* @@ -622,11 +621,12 @@ static void process_one_v4_ace(struct posix_acl_state *state, struct nfs4_ace *ace) { u32 mask = ace->access_mask; + short type = ace2type(ace); int i; - state->empty = 0; + state->valid |= type; - switch (ace2type(ace)) { + switch (type) { case ACL_USER_OBJ: if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { allow_bits(&state->owner, mask); @@ -726,6 +726,30 @@ static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE)) process_one_v4_ace(&effective_acl_state, ace); } + + /* + * At this point, the default ACL may have zeroed-out entries for owner, + * group and other. That usually results in a non-sensical resulting ACL + * that denies all access except to any ACE that was explicitly added. + * + * The setfacl command solves a similar problem with this logic: + * + * "If a Default ACL entry is created, and the Default ACL contains + * no owner, owning group, or others entry, a copy of the ACL + * owner, owning group, or others entry is added to the Default ACL." + * + * Copy any missing ACEs from the effective set, if any ACEs were + * explicitly set. + */ + if (default_acl_state.valid) { + if (!(default_acl_state.valid & ACL_USER_OBJ)) + default_acl_state.owner = effective_acl_state.owner; + if (!(default_acl_state.valid & ACL_GROUP_OBJ)) + default_acl_state.group = effective_acl_state.group; + if (!(default_acl_state.valid & ACL_OTHER)) + default_acl_state.other = effective_acl_state.other; + } + *pacl = posix_state_to_acl(&effective_acl_state, flags); if (IS_ERR(*pacl)) { ret = PTR_ERR(*pacl); -- cgit v1.2.3 From 3903902401451b1cd9d797a8c79769eb26ac7fe5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 18 Jul 2023 16:38:08 +1000 Subject: nfsd: don't allow nfsd threads to be signalled. The original implementation of nfsd used signals to stop threads during shutdown. In Linux 2.3.46pre5 nfsd gained the ability to shutdown threads internally it if was asked to run "0" threads. After this user-space transitioned to using "rpc.nfsd 0" to stop nfsd and sending signals to threads was no longer an important part of the API. In commit 3ebdbe5203a8 ("SUNRPC: discard svo_setup and rename svc_set_num_threads_sync()") (v5.17-rc1~75^2~41) we finally removed the use of signals for stopping threads, using kthread_stop() instead. This patch makes the "obvious" next step and removes the ability to signal nfsd threads - or any svc threads. nfsd stops allowing signals and we don't check for their delivery any more. This will allow for some simplification in later patches. A change worth noting is in nfsd4_ssc_setup_dul(). There was previously a signal_pending() check which would only succeed when the thread was being shut down. It should really have tested kthread_should_stop() as well. Now it just does the latter, not the former. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfs/callback.c | 9 +-------- fs/nfsd/nfs4proc.c | 5 ++--- fs/nfsd/nfssvc.c | 12 ------------ net/sunrpc/svc_xprt.c | 16 ++++++---------- 4 files changed, 9 insertions(+), 33 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 456af7d230cf..46a0a2d6962e 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -80,9 +80,6 @@ nfs4_callback_svc(void *vrqstp) set_freezable(); while (!kthread_freezable_should_stop(NULL)) { - - if (signal_pending(current)) - flush_signals(current); /* * Listen for a request on the socket */ @@ -112,11 +109,7 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); while (!kthread_freezable_should_stop(NULL)) { - - if (signal_pending(current)) - flush_signals(current); - - prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_IDLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { req = list_first_entry(&serv->sv_cb_list, diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 7588fd1859a4..5ca748309c26 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1347,12 +1347,11 @@ try_again: /* found a match */ if (ni->nsui_busy) { /* wait - and try again */ - prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, - TASK_INTERRUPTIBLE); + prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, TASK_IDLE); spin_unlock(&nn->nfsd_ssc_lock); /* allow 20secs for mount/unmount for now - revisit */ - if (signal_pending(current) || + if (kthread_should_stop() || (schedule_timeout(20*HZ) == 0)) { finish_wait(&nn->nfsd_ssc_waitq, &wait); kfree(work); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 97830e28c140..439fca195925 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -965,15 +965,6 @@ nfsd(void *vrqstp) current->fs->umask = 0; - /* - * thread is spawned with all signals set to SIG_IGN, re-enable - * the ones that will bring down the thread - */ - allow_signal(SIGKILL); - allow_signal(SIGHUP); - allow_signal(SIGINT); - allow_signal(SIGQUIT); - atomic_inc(&nfsdstats.th_cnt); set_freezable(); @@ -998,9 +989,6 @@ nfsd(void *vrqstp) validate_process_creds(); } - /* Clear signals before calling svc_exit_thread() */ - flush_signals(current); - atomic_dec(&nfsdstats.th_cnt); out: diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 62c7919ea610..d4a7fd5aabd1 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -701,8 +701,8 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) /* Made progress, don't sleep yet */ continue; - set_current_state(TASK_INTERRUPTIBLE); - if (signalled() || kthread_should_stop()) { + set_current_state(TASK_IDLE); + if (kthread_should_stop()) { set_current_state(TASK_RUNNING); return -EINTR; } @@ -740,7 +740,7 @@ rqst_should_sleep(struct svc_rqst *rqstp) return false; /* are we shutting down? */ - if (signalled() || kthread_should_stop()) + if (kthread_should_stop()) return false; /* are we freezing? */ @@ -762,11 +762,7 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) if (rqstp->rq_xprt) goto out_found; - /* - * We have to be able to interrupt this wait - * to bring down the daemons ... - */ - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_IDLE); smp_mb__before_atomic(); clear_bit(SP_CONGESTED, &pool->sp_flags); clear_bit(RQ_BUSY, &rqstp->rq_flags); @@ -788,7 +784,7 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) if (!time_left) percpu_counter_inc(&pool->sp_threads_timedout); - if (signalled() || kthread_should_stop()) + if (kthread_should_stop()) return ERR_PTR(-EINTR); return ERR_PTR(-EAGAIN); out_found: @@ -885,7 +881,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) try_to_freeze(); cond_resched(); err = -EINTR; - if (signalled() || kthread_should_stop()) + if (kthread_should_stop()) goto out; xprt = svc_get_next_xprt(rqstp, timeout); -- cgit v1.2.3 From 18e4cf915543257eae2925671934937163f5639b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 31 Jul 2023 16:48:31 +1000 Subject: nfsd: Simplify code around svc_exit_thread() call in nfsd() Previously a thread could exit asynchronously (due to a signal) so some care was needed to hold nfsd_mutex over the last svc_put() call. Now a thread can only exit when svc_set_num_threads() is called, and this is always called under nfsd_mutex. So no care is needed. Not only is the mutex held when a thread exits now, but the svc refcount is elevated, so the svc_put() in svc_exit_thread() will never be a final put, so the mutex isn't even needed at this point in the code. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 23 ----------------------- include/linux/sunrpc/svc.h | 13 ------------- 2 files changed, 36 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 439fca195925..56a776cd4e2d 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -992,31 +992,8 @@ nfsd(void *vrqstp) atomic_dec(&nfsdstats.th_cnt); out: - /* Take an extra ref so that the svc_put in svc_exit_thread() - * doesn't call svc_destroy() - */ - svc_get(nn->nfsd_serv); - /* Release the thread */ svc_exit_thread(rqstp); - - /* We need to drop a ref, but may not drop the last reference - * without holding nfsd_mutex, and we cannot wait for nfsd_mutex as that - * could deadlock with nfsd_shutdown_threads() waiting for us. - * So three options are: - * - drop a non-final reference, - * - get the mutex without waiting - * - sleep briefly andd try the above again - */ - while (!svc_put_not_last(nn->nfsd_serv)) { - if (mutex_trylock(&nfsd_mutex)) { - nfsd_put(net); - mutex_unlock(&nfsd_mutex); - break; - } - msleep(20); - } - return 0; } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index fe1394cc1371..2230148d9d68 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -120,19 +120,6 @@ static inline void svc_put(struct svc_serv *serv) kref_put(&serv->sv_refcnt, svc_destroy); } -/** - * svc_put_not_last - decrement non-final reference count on SUNRPC serv - * @serv: the svc_serv to have count decremented - * - * Returns: %true is refcount was decremented. - * - * If the refcount is 1, it is not decremented and instead failure is reported. - */ -static inline bool svc_put_not_last(struct svc_serv *serv) -{ - return refcount_dec_not_one(&serv->sv_refcnt.refcount); -} - /* * Maximum payload size supported by a kernel RPC server. * This is use to determine the max number of pages nfsd is -- cgit v1.2.3 From 9f28a971ee9fdf1bf8ce8c88b103f483be610277 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 31 Jul 2023 16:48:32 +1000 Subject: nfsd: separate nfsd_last_thread() from nfsd_put() Now that the last nfsd thread is stopped by an explicit act of calling svc_set_num_threads() with a count of zero, we only have a limited number of places that can happen, and don't need to call nfsd_last_thread() in nfsd_put() So separate that out and call it at the two places where the number of threads is set to zero. Move the clearing of ->nfsd_serv and the call to svc_xprt_destroy_all() into nfsd_last_thread(), as they are really part of the same action. nfsd_put() is now a thin wrapper around svc_put(), so make it a static inline. nfsd_put() cannot be called after nfsd_last_thread(), so in a couple of places we have to use svc_put() instead. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfsd.h | 7 ++++++- fs/nfsd/nfssvc.c | 52 +++++++++++++++++++--------------------------------- 2 files changed, 25 insertions(+), 34 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index d88498f8b275..11c14faa6c67 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -96,7 +96,12 @@ int nfsd_pool_stats_open(struct inode *, struct file *); int nfsd_pool_stats_release(struct inode *, struct file *); void nfsd_shutdown_threads(struct net *net); -void nfsd_put(struct net *net); +static inline void nfsd_put(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + svc_put(nn->nfsd_serv); +} bool i_am_nfsd(void); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 56a776cd4e2d..a0b16e3dd91a 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -542,9 +542,14 @@ static struct notifier_block nfsd_inet6addr_notifier = { /* Only used under nfsd_mutex, so this atomic may be overkill: */ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); -static void nfsd_last_thread(struct svc_serv *serv, struct net *net) +static void nfsd_last_thread(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv = nn->nfsd_serv; + + spin_lock(&nfsd_notifier_lock); + nn->nfsd_serv = NULL; + spin_unlock(&nfsd_notifier_lock); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { @@ -554,6 +559,8 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) #endif } + svc_xprt_destroy_all(serv, net); + /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are @@ -644,7 +651,8 @@ void nfsd_shutdown_threads(struct net *net) svc_get(serv); /* Kill outstanding nfsd threads */ svc_set_num_threads(serv, NULL, 0); - nfsd_put(net); + nfsd_last_thread(net); + svc_put(serv); mutex_unlock(&nfsd_mutex); } @@ -674,9 +682,6 @@ int nfsd_create_serv(struct net *net) serv->sv_maxconn = nn->max_connections; error = svc_bind(serv, net); if (error < 0) { - /* NOT nfsd_put() as notifiers (see below) haven't - * been set up yet. - */ svc_put(serv); return error; } @@ -719,29 +724,6 @@ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) return 0; } -/* This is the callback for kref_put() below. - * There is no code here as the first thing to be done is - * call svc_shutdown_net(), but we cannot get the 'net' from - * the kref. So do all the work when kref_put returns true. - */ -static void nfsd_noop(struct kref *ref) -{ -} - -void nfsd_put(struct net *net) -{ - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) { - svc_xprt_destroy_all(nn->nfsd_serv, net); - nfsd_last_thread(nn->nfsd_serv, net); - svc_destroy(&nn->nfsd_serv->sv_refcnt); - spin_lock(&nfsd_notifier_lock); - nn->nfsd_serv = NULL; - spin_unlock(&nfsd_notifier_lock); - } -} - int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) { int i = 0; @@ -792,7 +774,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) if (err) break; } - nfsd_put(net); + svc_put(nn->nfsd_serv); return err; } @@ -807,6 +789,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) int error; bool nfsd_up_before; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); @@ -826,22 +809,25 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) goto out; nfsd_up_before = nn->nfsd_net_up; + serv = nn->nfsd_serv; error = nfsd_startup_net(net, cred); if (error) goto out_put; - error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs); + error = svc_set_num_threads(serv, NULL, nrservs); if (error) goto out_shutdown; - error = nn->nfsd_serv->sv_nrthreads; + error = serv->sv_nrthreads; + if (error == 0) + nfsd_last_thread(net); out_shutdown: if (error < 0 && !nfsd_up_before) nfsd_shutdown_net(net); out_put: /* Threads now hold service active */ if (xchg(&nn->keep_active, 0)) - nfsd_put(net); - nfsd_put(net); + svc_put(serv); + svc_put(serv); out: mutex_unlock(&nfsd_mutex); return error; -- cgit v1.2.3 From f78116d3bf4fd7a84451e1a2adc35df7a63fbbf4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 18 Jul 2023 16:38:08 +1000 Subject: SUNRPC: call svc_process() from svc_recv(). All callers of svc_recv() go on to call svc_process() on success. Simplify callers by having svc_recv() do that for them. This loses one call to validate_process_creds() in nfsd. That was debugging code added 14 years ago. I don't think we need to keep it. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 5 ----- fs/nfs/callback.c | 1 - fs/nfsd/nfssvc.c | 3 +-- net/sunrpc/svc.c | 1 - net/sunrpc/svc_xprt.c | 3 ++- 5 files changed, 3 insertions(+), 10 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 614faa5f69cd..91ef139a7757 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -132,7 +132,6 @@ lockd(void *vrqstp) */ while (!kthread_should_stop()) { long timeout = MAX_SCHEDULE_TIMEOUT; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); /* update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nlm_max_connections; @@ -146,10 +145,6 @@ lockd(void *vrqstp) err = svc_recv(rqstp, timeout); if (err == -EAGAIN || err == -EINTR) continue; - dprintk("lockd: request from %s\n", - svc_print_addr(rqstp, buf, sizeof(buf))); - - svc_process(rqstp); } if (nlmsvc_ops) nlmsvc_invalidate_all(); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 46a0a2d6962e..2d94384bd6a9 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -86,7 +86,6 @@ nfs4_callback_svc(void *vrqstp) err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); if (err == -EAGAIN || err == -EINTR) continue; - svc_process(rqstp); } svc_exit_thread(rqstp); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a0b16e3dd91a..547feb8ad0af 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -970,8 +970,7 @@ nfsd(void *vrqstp) ; if (err == -EINTR) break; - validate_process_creds(); - svc_process(rqstp); + validate_process_creds(); } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 587811a002c9..c69896c124a4 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1516,7 +1516,6 @@ out_baddir: out_drop: svc_drop(rqstp); } -EXPORT_SYMBOL_GPL(svc_process); #if defined(CONFIG_SUNRPC_BACKCHANNEL) /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index d4a7fd5aabd1..8430b151bd71 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -906,7 +906,8 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (serv->sv_stats) serv->sv_stats->netcnt++; rqstp->rq_stime = ktime_get(); - return len; + svc_process(rqstp); + return 0; out_release: rqstp->rq_res.len = 0; svc_xprt_release(rqstp); -- cgit v1.2.3 From 7b719e2bf342a59e88b2b6215b98ca4cf824bc58 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 18 Jul 2023 16:38:08 +1000 Subject: SUNRPC: change svc_recv() to return void. svc_recv() currently returns a 0 on success or one of two errors: - -EAGAIN means no message was successfully received - -EINTR means the thread has been told to stop Previously nfsd would stop as the result of a signal as well as following kthread_stop(). In that case the difference was useful: EINTR means stop unconditionally. EAGAIN means stop if kthread_should_stop(), continue otherwise. Now threads only exit when kthread_should_stop() so we don't need the distinction. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 9 +-------- fs/nfs/callback.c | 11 ++--------- fs/nfsd/nfssvc.c | 13 ++----------- include/linux/sunrpc/svcsock.h | 2 +- net/sunrpc/svc_xprt.c | 28 +++++++++++----------------- 5 files changed, 17 insertions(+), 46 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 91ef139a7757..cf4ff7d3564c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -116,7 +116,6 @@ static void set_grace_period(struct net *net) static int lockd(void *vrqstp) { - int err = 0; struct svc_rqst *rqstp = vrqstp; struct net *net = &init_net; struct lockd_net *ln = net_generic(net, lockd_net_id); @@ -138,13 +137,7 @@ lockd(void *vrqstp) timeout = nlmsvc_retry_blocked(); - /* - * Find a socket with data available and call its - * recvfrom routine. - */ - err = svc_recv(rqstp, timeout); - if (err == -EAGAIN || err == -EINTR) - continue; + svc_recv(rqstp, timeout); } if (nlmsvc_ops) nlmsvc_invalidate_all(); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 2d94384bd6a9..54155b484f7b 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -74,19 +74,12 @@ out_err: static int nfs4_callback_svc(void *vrqstp) { - int err; struct svc_rqst *rqstp = vrqstp; set_freezable(); - while (!kthread_freezable_should_stop(NULL)) { - /* - * Listen for a request on the socket - */ - err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); - if (err == -EAGAIN || err == -EINTR) - continue; - } + while (!kthread_freezable_should_stop(NULL)) + svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); svc_exit_thread(rqstp); return 0; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 547feb8ad0af..4c0ab101d90b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -939,7 +939,6 @@ nfsd(void *vrqstp) struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); struct net *net = perm_sock->xpt_net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - int err; /* At this point, the thread shares current->fs * with the init process. We need to create files with the @@ -958,19 +957,11 @@ nfsd(void *vrqstp) /* * The main request loop */ - for (;;) { + while (!kthread_should_stop()) { /* Update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nn->max_connections; - /* - * Find a socket with data available and call its - * recvfrom routine. - */ - while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) - ; - if (err == -EINTR) - break; - + svc_recv(rqstp, 60*60*HZ); validate_process_creds(); } diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index a7ea54460b1a..c0ddd331d82f 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -57,7 +57,7 @@ static inline u32 svc_sock_final_rec(struct svc_sock *svsk) * Function prototypes. */ void svc_close_net(struct svc_serv *, struct net *); -int svc_recv(struct svc_rqst *, long); +void svc_recv(struct svc_rqst *, long); void svc_send(struct svc_rqst *rqstp); void svc_drop(struct svc_rqst *); void svc_sock_update_bufs(struct svc_serv *serv); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 8430b151bd71..d7d69143011c 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -679,7 +679,7 @@ static void svc_check_conn_limits(struct svc_serv *serv) } } -static int svc_alloc_arg(struct svc_rqst *rqstp) +static bool svc_alloc_arg(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; struct xdr_buf *arg = &rqstp->rq_arg; @@ -704,7 +704,7 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) set_current_state(TASK_IDLE); if (kthread_should_stop()) { set_current_state(TASK_RUNNING); - return -EINTR; + return false; } trace_svc_alloc_arg_err(pages, ret); memalloc_retry_wait(GFP_KERNEL); @@ -723,7 +723,7 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) arg->tail[0].iov_len = 0; rqstp->rq_xid = xdr_zero; - return 0; + return true; } static bool @@ -785,8 +785,8 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) percpu_counter_inc(&pool->sp_threads_timedout); if (kthread_should_stop()) - return ERR_PTR(-EINTR); - return ERR_PTR(-EAGAIN); + return NULL; + return NULL; out_found: /* Normally we will wait up to 5 seconds for any required * cache information to be provided. @@ -868,32 +868,27 @@ out: * organised not to touch any cachelines in the shared svc_serv * structure, only cachelines in the local svc_pool. */ -int svc_recv(struct svc_rqst *rqstp, long timeout) +void svc_recv(struct svc_rqst *rqstp, long timeout) { struct svc_xprt *xprt = NULL; struct svc_serv *serv = rqstp->rq_server; - int len, err; + int len; - err = svc_alloc_arg(rqstp); - if (err) + if (!svc_alloc_arg(rqstp)) goto out; try_to_freeze(); cond_resched(); - err = -EINTR; if (kthread_should_stop()) goto out; xprt = svc_get_next_xprt(rqstp, timeout); - if (IS_ERR(xprt)) { - err = PTR_ERR(xprt); + if (!xprt) goto out; - } len = svc_handle_xprt(rqstp, xprt); /* No data, incomplete (TCP) read, or accept() */ - err = -EAGAIN; if (len <= 0) goto out_release; @@ -907,12 +902,11 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) serv->sv_stats->netcnt++; rqstp->rq_stime = ktime_get(); svc_process(rqstp); - return 0; +out: + return; out_release: rqstp->rq_res.len = 0; svc_xprt_release(rqstp); -out: - return err; } EXPORT_SYMBOL_GPL(svc_recv); -- cgit v1.2.3 From c743b4259c3af2c0637c307f08a062d25fa3c99f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 18 Jul 2023 16:38:08 +1000 Subject: SUNRPC: remove timeout arg from svc_recv() Most svc threads have no interest in a timeout. nfsd sets it to 1 hour, but this is a wart of no significance. lockd uses the timeout so that it can call nlmsvc_retry_blocked(). It also sometimes calls svc_wake_up() to ensure this is called. So change lockd to be consistent and always use svc_wake_up() to trigger nlmsvc_retry_blocked() - using a timer instead of a timeout to svc_recv(). And change svc_recv() to not take a timeout arg. This makes the sp_threads_timedout counter always zero. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 14 +++++++++----- fs/lockd/svclock.c | 5 +++-- fs/nfs/callback.c | 2 +- fs/nfsd/nfssvc.c | 2 +- include/linux/lockd/lockd.h | 4 +++- include/linux/sunrpc/svc.h | 1 - include/linux/sunrpc/svcsock.h | 2 +- net/sunrpc/svc.c | 2 -- net/sunrpc/svc_xprt.c | 34 ++++++++++++++++------------------ 9 files changed, 34 insertions(+), 32 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index cf4ff7d3564c..ef3f77a59556 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -56,6 +56,12 @@ static unsigned int nlmsvc_users; static struct svc_serv *nlmsvc_serv; unsigned long nlmsvc_timeout; +static void nlmsvc_request_retry(struct timer_list *tl) +{ + svc_wake_up(nlmsvc_serv); +} +DEFINE_TIMER(nlmsvc_retry, nlmsvc_request_retry); + unsigned int lockd_net_id; /* @@ -130,14 +136,11 @@ lockd(void *vrqstp) * NFS mount or NFS daemon has gone away. */ while (!kthread_should_stop()) { - long timeout = MAX_SCHEDULE_TIMEOUT; - /* update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nlm_max_connections; - timeout = nlmsvc_retry_blocked(); - - svc_recv(rqstp, timeout); + nlmsvc_retry_blocked(); + svc_recv(rqstp); } if (nlmsvc_ops) nlmsvc_invalidate_all(); @@ -371,6 +374,7 @@ static void lockd_put(void) #endif svc_set_num_threads(nlmsvc_serv, NULL, 0); + timer_delete_sync(&nlmsvc_retry); nlmsvc_serv = NULL; dprintk("lockd_down: service destroyed\n"); } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 28abec5c451d..43aeba9de55c 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -1019,7 +1019,7 @@ retry_deferred_block(struct nlm_block *block) * picks up locks that can be granted, or grant notifications that must * be retransmitted. */ -unsigned long +void nlmsvc_retry_blocked(void) { unsigned long timeout = MAX_SCHEDULE_TIMEOUT; @@ -1049,5 +1049,6 @@ nlmsvc_retry_blocked(void) } spin_unlock(&nlm_blocked_lock); - return timeout; + if (timeout < MAX_SCHEDULE_TIMEOUT) + mod_timer(&nlmsvc_retry, jiffies + timeout); } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 54155b484f7b..39a0ba746267 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -79,7 +79,7 @@ nfs4_callback_svc(void *vrqstp) set_freezable(); while (!kthread_freezable_should_stop(NULL)) - svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); + svc_recv(rqstp); svc_exit_thread(rqstp); return 0; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 4c0ab101d90b..1582af33e204 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -961,7 +961,7 @@ nfsd(void *vrqstp) /* Update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nn->max_connections; - svc_recv(rqstp, 60*60*HZ); + svc_recv(rqstp); validate_process_creds(); } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index f42594a9efe0..0f016d69c996 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -204,6 +204,8 @@ extern unsigned long nlmsvc_timeout; extern bool nsm_use_hostnames; extern u32 nsm_local_state; +extern struct timer_list nlmsvc_retry; + /* * Lockd client functions */ @@ -280,7 +282,7 @@ __be32 nlmsvc_testlock(struct svc_rqst *, struct nlm_file *, struct nlm_host *, struct nlm_lock *, struct nlm_lock *, struct nlm_cookie *); __be32 nlmsvc_cancel_blocked(struct net *net, struct nlm_file *, struct nlm_lock *); -unsigned long nlmsvc_retry_blocked(void); +void nlmsvc_retry_blocked(void); void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, nlm_host_match_fn_t match); void nlmsvc_grant_reply(struct nlm_cookie *, __be32); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 2230148d9d68..b206fdde8e97 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -41,7 +41,6 @@ struct svc_pool { /* statistics on pool operation */ struct percpu_counter sp_sockets_queued; struct percpu_counter sp_threads_woken; - struct percpu_counter sp_threads_timedout; #define SP_TASK_PENDING (0) /* still work to do even if no * xprt is queued. */ diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index c0ddd331d82f..d4a173c5b3be 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -57,7 +57,7 @@ static inline u32 svc_sock_final_rec(struct svc_sock *svsk) * Function prototypes. */ void svc_close_net(struct svc_serv *, struct net *); -void svc_recv(struct svc_rqst *, long); +void svc_recv(struct svc_rqst *rqstp); void svc_send(struct svc_rqst *rqstp); void svc_drop(struct svc_rqst *); void svc_sock_update_bufs(struct svc_serv *serv); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c69896c124a4..030f8c759ee6 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -515,7 +515,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, percpu_counter_init(&pool->sp_sockets_queued, 0, GFP_KERNEL); percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL); - percpu_counter_init(&pool->sp_threads_timedout, 0, GFP_KERNEL); } return serv; @@ -590,7 +589,6 @@ svc_destroy(struct kref *ref) percpu_counter_destroy(&pool->sp_sockets_queued); percpu_counter_destroy(&pool->sp_threads_woken); - percpu_counter_destroy(&pool->sp_threads_timedout); } kfree(serv->sv_pools); kfree(serv); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index d7d69143011c..9bdcdd8401b8 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -750,10 +750,9 @@ rqst_should_sleep(struct svc_rqst *rqstp) return true; } -static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) +static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp) { struct svc_pool *pool = rqstp->rq_pool; - long time_left = 0; /* rq_xprt should be clear on entry */ WARN_ON_ONCE(rqstp->rq_xprt); @@ -769,7 +768,7 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) smp_mb__after_atomic(); if (likely(rqst_should_sleep(rqstp))) - time_left = schedule_timeout(timeout); + schedule(); else __set_current_state(TASK_RUNNING); @@ -781,9 +780,6 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) if (rqstp->rq_xprt) goto out_found; - if (!time_left) - percpu_counter_inc(&pool->sp_threads_timedout); - if (kthread_should_stop()) return NULL; return NULL; @@ -863,12 +859,15 @@ out: return len; } -/* - * Receive the next request on any transport. This code is carefully - * organised not to touch any cachelines in the shared svc_serv - * structure, only cachelines in the local svc_pool. +/** + * svc_recv - Receive and process the next request on any transport + * @rqstp: an idle RPC service thread + * + * This code is carefully organised not to touch any cachelines in + * the shared svc_serv structure, only cachelines in the local + * svc_pool. */ -void svc_recv(struct svc_rqst *rqstp, long timeout) +void svc_recv(struct svc_rqst *rqstp) { struct svc_xprt *xprt = NULL; struct svc_serv *serv = rqstp->rq_server; @@ -882,7 +881,7 @@ void svc_recv(struct svc_rqst *rqstp, long timeout) if (kthread_should_stop()) goto out; - xprt = svc_get_next_xprt(rqstp, timeout); + xprt = svc_get_next_xprt(rqstp); if (!xprt) goto out; @@ -1447,12 +1446,11 @@ static int svc_pool_stats_show(struct seq_file *m, void *p) return 0; } - seq_printf(m, "%u %llu %llu %llu %llu\n", - pool->sp_id, - percpu_counter_sum_positive(&pool->sp_sockets_queued), - percpu_counter_sum_positive(&pool->sp_sockets_queued), - percpu_counter_sum_positive(&pool->sp_threads_woken), - percpu_counter_sum_positive(&pool->sp_threads_timedout)); + seq_printf(m, "%u %llu %llu %llu 0\n", + pool->sp_id, + percpu_counter_sum_positive(&pool->sp_sockets_queued), + percpu_counter_sum_positive(&pool->sp_sockets_queued), + percpu_counter_sum_positive(&pool->sp_threads_woken)); return 0; } -- cgit v1.2.3 From 6372e2ee629894433fe6107d7048536a3280a284 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 16 Aug 2023 10:20:52 -0400 Subject: NFSD: da_addr_body field missing in some GETDEVICEINFO replies The XDR specification in RFC 8881 looks like this: struct device_addr4 { layouttype4 da_layout_type; opaque da_addr_body<>; }; struct GETDEVICEINFO4resok { device_addr4 gdir_device_addr; bitmap4 gdir_notification; }; union GETDEVICEINFO4res switch (nfsstat4 gdir_status) { case NFS4_OK: GETDEVICEINFO4resok gdir_resok4; case NFS4ERR_TOOSMALL: count4 gdir_mincount; default: void; }; Looking at nfsd4_encode_getdeviceinfo() .... When the client provides a zero gd_maxcount, then the Linux NFS server implementation encodes the da_layout_type field and then skips the da_addr_body field completely, proceeding directly to encode gdir_notification field. There does not appear to be an option in the specification to skip encoding da_addr_body. Moreover, Section 18.40.3 says: > If the client wants to just update or turn off notifications, it > MAY send a GETDEVICEINFO operation with gdia_maxcount set to zero. > In that event, if the device ID is valid, the reply's da_addr_body > field of the gdir_device_addr field will be of zero length. Since the layout drivers are responsible for encoding the da_addr_body field, put this fix inside the ->encode_getdeviceinfo methods. Fixes: 9cf514ccfacb ("nfsd: implement pNFS operations") Reviewed-by: Christoph Hellwig Cc: Tom Haynes Signed-off-by: Chuck Lever --- fs/nfsd/blocklayoutxdr.c | 9 +++++++++ fs/nfsd/flexfilelayoutxdr.c | 9 +++++++++ fs/nfsd/nfs4xdr.c | 25 +++++++++++-------------- 3 files changed, 29 insertions(+), 14 deletions(-) (limited to 'fs/nfsd') diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 8e9c1a0f8d38..1ed2f691ebb9 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -83,6 +83,15 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, int len = sizeof(__be32), ret, i; __be32 *p; + /* + * See paragraph 5 of RFC 8881 S18.40.3. + */ + if (!gdp->gd_maxcount) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + p = xdr_reserve_space(xdr, len + sizeof(__be32)); if (!p) return nfserr_resource; diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c index e81d2a5cf381..bb205328e043 100644 --- a/fs/nfsd/flexfilelayoutxdr.c +++ b/fs/nfsd/flexfilelayoutxdr.c @@ -85,6 +85,15 @@ nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, int addr_len; __be32 *p; + /* + * See paragraph 5 of RFC 8881 S18.40.3. + */ + if (!gdp->gd_maxcount) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + /* len + padding for two strings */ addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len; ver_len = 20; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index d4de39404cde..2e40c74d2f72 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4686,20 +4686,17 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, *p++ = cpu_to_be32(gdev->gd_layout_type); - /* If maxcount is 0 then just update notifications */ - if (gdev->gd_maxcount != 0) { - ops = nfsd4_layout_ops[gdev->gd_layout_type]; - nfserr = ops->encode_getdeviceinfo(xdr, gdev); - if (nfserr) { - /* - * We don't bother to burden the layout drivers with - * enforcing gd_maxcount, just tell the client to - * come back with a bigger buffer if it's not enough. - */ - if (xdr->buf->len + 4 > gdev->gd_maxcount) - goto toosmall; - return nfserr; - } + ops = nfsd4_layout_ops[gdev->gd_layout_type]; + nfserr = ops->encode_getdeviceinfo(xdr, gdev); + if (nfserr) { + /* + * We don't bother to burden the layout drivers with + * enforcing gd_maxcount, just tell the client to + * come back with a bigger buffer if it's not enough. + */ + if (xdr->buf->len + 4 > gdev->gd_maxcount) + goto toosmall; + return nfserr; } if (gdev->gd_notify_types) { -- cgit v1.2.3