From 74fd48739d0488e39ae18b0168720f449a06690c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 13 Oct 2023 09:03:53 -0400 Subject: nfsd: new Kconfig option for legacy client tracking We've had a number of attempts at different NFSv4 client tracking methods over the years, but now nfsdcld has emerged as the clear winner since the others (recoverydir and the usermodehelper upcall) are problematic. As a case in point, the recoverydir backend uses MD5 hashes to encode long form clientid strings, which means that nfsd repeatedly gets dinged on FIPS audits, since MD5 isn't considered secure. Its use of MD5 is not cryptographically significant, so there is no danger there, but being able to compile that code out lets us sidestep the issue entirely. As a prelude to eventually removing support for these client tracking methods, add a new Kconfig option that enables them. Mark it deprecated and make it default to N. Acked-by: NeilBrown Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/Kconfig | 16 +++++++++ fs/nfsd/nfs4recover.c | 97 +++++++++++++++++++++++++++++++++------------------ fs/nfsd/nfsctl.c | 6 ++++ 3 files changed, 85 insertions(+), 34 deletions(-) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 43b88eaf0673..272ab8d5c4d7 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -158,3 +158,19 @@ config NFSD_V4_SECURITY_LABEL If you do not wish to enable fine-grained security labels SELinux or Smack policies on NFSv4 files, say N. + +config NFSD_LEGACY_CLIENT_TRACKING + bool "Support legacy NFSv4 client tracking methods (DEPRECATED)" + depends on NFSD_V4 + default n + help + The NFSv4 server needs to store a small amount of information on + stable storage in order to handle state recovery after reboot. Most + modern deployments upcall to a userland daemon for this (nfsdcld), + but older NFS servers may store information directly in a + recoverydir, or spawn a process directly using a usermodehelper + upcall. + + These legacy client tracking methods have proven to be problematic + and will be removed in the future. Say Y here if you need support + for them in the interim.
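[ A usage sketch for distributors: a kernel that must keep serving installations that still rely on nfsdcltrack or a recovery directory would carry a config fragment like the one below until those deployments migrate to nfsdcld. This is illustrative only; the options other than the new one are shown for context.

    CONFIG_NFSD=m
    CONFIG_NFSD_V4=y
    # Deprecated; drop once all installations use nfsdcld
    CONFIG_NFSD_LEGACY_CLIENT_TRACKING=y
]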
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 3509e73abe1f..2c060e0b1604 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -66,6 +66,7 @@ struct nfsd4_client_tracking_ops { static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops; static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2; +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING /* Globals */ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; @@ -720,6 +721,7 @@ static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { .version = 1, .msglen = 0, }; +#endif /* CONFIG_NFSD_LEGACY_CLIENT_TRACKING */ /* Globals */ #define NFSD_PIPE_DIR "nfsd" @@ -731,8 +733,10 @@ struct cld_net { spinlock_t cn_lock; struct list_head cn_list; unsigned int cn_xid; - bool cn_has_legacy; struct crypto_shash *cn_tfm; +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING + bool cn_has_legacy; +#endif }; struct cld_upcall { @@ -793,7 +797,6 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, uint8_t cmd, princhashlen; struct xdr_netobj name, princhash = { .len = 0, .data = NULL }; uint16_t namelen; - struct cld_net *cn = nn->cld_net; if (get_user(cmd, &cmsg->cm_cmd)) { dprintk("%s: error when copying cmd from userspace", __func__); @@ -833,11 +836,15 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, return PTR_ERR(name.data); name.len = namelen; } +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) { + struct cld_net *cn = nn->cld_net; + name.len = name.len - 5; memmove(name.data, name.data + 5, name.len); cn->cn_has_legacy = true; } +#endif if (!nfs4_client_to_reclaim(name, princhash, nn)) { kfree(name.data); kfree(princhash.data); @@ -1010,7 +1017,9 @@ __nfsd4_init_cld_pipe(struct net *net) } cn->cn_pipe->dentry = dentry; +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING cn->cn_has_legacy = false; +#endif nn->cld_net = cn; return 0; @@ -1282,10 +1291,6 @@ nfsd4_cld_check(struct nfs4_client *clp) { struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - struct cld_net *cn = nn->cld_net; - int status; - char dname[HEXDIR_LEN]; - struct xdr_netobj name; /* did we already find that this client is stable? 
*/ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) @@ -1296,7 +1301,12 @@ nfsd4_cld_check(struct nfs4_client *clp) if (crp) goto found; - if (cn->cn_has_legacy) { +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING + if (nn->cld_net->cn_has_legacy) { + int status; + char dname[HEXDIR_LEN]; + struct xdr_netobj name; + status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) return -ENOENT; @@ -1314,6 +1324,7 @@ nfsd4_cld_check(struct nfs4_client *clp) goto found; } +#endif return -ENOENT; found: crp->cr_clp = clp; @@ -1327,8 +1338,6 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; int status; - char dname[HEXDIR_LEN]; - struct xdr_netobj name; struct crypto_shash *tfm = cn->cn_tfm; struct xdr_netobj cksum; char *principal = NULL; @@ -1342,7 +1351,11 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) if (crp) goto found; +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING if (cn->cn_has_legacy) { + struct xdr_netobj name; + char dname[HEXDIR_LEN]; + status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) return -ENOENT; @@ -1360,6 +1373,7 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) goto found; } +#endif return -ENOENT; found: if (crp->cr_princhash.len) { @@ -1663,6 +1677,7 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = { .msglen = sizeof(struct cld_msg_v2), }; +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING /* upcall via usermodehelper */ static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack"; module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog), @@ -2007,28 +2022,10 @@ static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { .msglen = 0, }; -int -nfsd4_client_tracking_init(struct net *net) +static inline int check_for_legacy_methods(int status, struct net *net) { - int status; - struct path path; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - /* just run the init if it the method is already decided */ - if (nn->client_tracking_ops) - goto do_init; - - /* First, try to use nfsdcld */ - nn->client_tracking_ops = &nfsd4_cld_tracking_ops; - status = nn->client_tracking_ops->init(net); - if (!status) - return status; - if (status != -ETIMEDOUT) { - nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0; - status = nn->client_tracking_ops->init(net); - if (!status) - return status; - } + struct path path; /* * Next, try the UMH upcall. 
@@ -2045,14 +2042,46 @@ nfsd4_client_tracking_init(struct net *net) nn->client_tracking_ops = &nfsd4_legacy_tracking_ops; status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); if (!status) { - status = d_is_dir(path.dentry); + status = !d_is_dir(path.dentry); path_put(&path); - if (!status) { - status = -EINVAL; - goto out; - } + if (status) + return -ENOTDIR; + status = nn->client_tracking_ops->init(net); + } + return status; +} +#else +static inline int check_for_legacy_methods(int status, struct net *net) +{ + return status; +} +#endif /* CONFIG_NFSD_LEGACY_CLIENT_TRACKING */ + +int +nfsd4_client_tracking_init(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int status; + + /* just run the init if the method is already decided */ + if (nn->client_tracking_ops) + goto do_init; + + /* First, try to use nfsdcld */ + nn->client_tracking_ops = &nfsd4_cld_tracking_ops; + status = nn->client_tracking_ops->init(net); + if (!status) + return status; + if (status != -ETIMEDOUT) { + nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0; + status = nn->client_tracking_ops->init(net); + if (!status) + return status; } + status = check_for_legacy_methods(status, net); + if (status) + goto out; do_init: status = nn->client_tracking_ops->init(net); out: diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 87fed75808ff..6a3b385703cc 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -76,7 +76,9 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size); #ifdef CONFIG_NFSD_V4 static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_gracetime(struct file *file, char *buf, size_t size); +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); +#endif static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size); #endif @@ -93,7 +95,9 @@ static ssize_t (*const write_op[])(struct file *, char *, size_t) = { #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = write_leasetime, [NFSD_Gracetime] = write_gracetime, +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING [NFSD_RecoveryDir] = write_recoverydir, +#endif [NFSD_V4EndGrace] = write_v4_end_grace, #endif }; @@ -1021,6 +1025,7 @@ static ssize_t write_gracetime(struct file *file, char *buf, size_t size) return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn); } +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, struct nfsd_net *nn) { @@ -1081,6 +1086,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) mutex_unlock(&nfsd_mutex); return rv; } +#endif /* * write_v4_end_grace - release grace period for nfsd's v4.x lock manager -- cgit v1.2.3 From f3734cc4073f68ac3566293acc6d62971c47ad5a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 26 Oct 2023 16:50:18 +0200 Subject: NFSD: use read_seqbegin() rather than read_seqbegin_or_lock() The usage of read_seqbegin_or_lock() in nfsd_copy_write_verifier() is wrong. "seq" is always even and thus "or_lock" has no effect; this code can never take ->writeverf_lock for writing. I guess this is fine, nfsd_copy_write_verifier() just copies 8 bytes and nfsd_reset_write_verifier() is supposed to be a very rare operation, so we do not need the adaptive locking in this case. Yet the code looks wrong and sub-optimal; it can use read_seqbegin() without changing the behaviour.
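[ For readers unfamiliar with seqlocks, a minimal sketch of the pairing at work here. The writer side below is illustrative rather than the nfsd code verbatim, but nn->writeverf_lock is in fact a seqlock_t:

    /* Writer: taking the seqlock makes the sequence count odd,
     * forcing concurrent readers to retry until the update completes.
     */
    write_seqlock(&nn->writeverf_lock);
    memcpy(nn->writeverf, &verf, sizeof(nn->writeverf));
    write_sequnlock(&nn->writeverf_lock);

    /* Reader: lockless; loops only if a writer ran mid-copy. */
    do {
        seq = read_seqbegin(&nn->writeverf_lock);
        memcpy(verf, nn->writeverf, sizeof(nn->writeverf));
    } while (read_seqretry(&nn->writeverf_lock, seq));

Because the reader only copies 8 bytes and never writes, the adaptive read_seqbegin_or_lock()/need_seqretry() form buys nothing here. ]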
[ cel: Note also that it eliminates this Sparse warning: fs/nfsd/nfssvc.c:360:6: warning: context imbalance in 'nfsd_copy_write_verifier' - different lock contexts for basic block ] Signed-off-by: Oleg Nesterov Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 7a2bc8e82a63..ee835bf9ee42 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -359,13 +359,12 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn) */ void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn) { - int seq = 0; + unsigned int seq; do { - read_seqbegin_or_lock(&nn->writeverf_lock, &seq); + seq = read_seqbegin(&nn->writeverf_lock); memcpy(verf, nn->writeverf, sizeof(nn->writeverf)); - } while (need_seqretry(&nn->writeverf_lock, seq)); - done_seqretry(&nn->writeverf_lock, seq); + } while (read_seqretry(&nn->writeverf_lock, seq)); } static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn) -- cgit v1.2.3 From ce7df05508c30de140c1dfd9a32a8c03c5671ecc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 22 Oct 2023 18:50:10 -0400 Subject: NFSD: Make the file_delayed_close workqueue UNBOUND workqueue: nfsd_file_delayed_close [nfsd] hogged CPU for >13333us 8 times, consider switching to WQ_UNBOUND There's no harm in closing a cached file descriptor on another core. Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ef063f93fde9..66dd5059f1bb 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -717,7 +717,7 @@ nfsd_file_cache_init(void) return ret; ret = -ENOMEM; - nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0); + nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", WQ_UNBOUND, 0); if (!nfsd_filecache_wq) goto out; -- cgit v1.2.3 From d0ab8b649ba7636d181605e31bf3e42b0784bc67 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 13 Nov 2023 08:45:07 -0500 Subject: NFSD: Remove nfsd_drc_gc() tracepoint This trace point was for debugging the DRC's garbage collection. In the field it's just noise. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfscache.c | 6 +----- fs/nfsd/trace.h | 22 ---------------------- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index d3273a396659..5c1a4a0aa605 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -364,8 +364,6 @@ nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) if (freed > sc->nr_to_scan) break; } - - trace_nfsd_drc_gc(nn, freed); return freed; } @@ -508,7 +506,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, __wsum csum; struct nfsd_drc_bucket *b; int type = rqstp->rq_cachetype; - unsigned long freed; LIST_HEAD(dispose); int rtn = RC_DOIT; @@ -538,8 +535,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, nfsd_prune_bucket_locked(nn, b, 3, &dispose); spin_unlock(&b->cache_lock); - freed = nfsd_cacherep_dispose(&dispose); - trace_nfsd_drc_gc(nn, freed); + nfsd_cacherep_dispose(&dispose); nfsd_stats_rc_misses_inc(); atomic_inc(&nn->num_drc_entries); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index fbc0ccb40424..d1e8cf079b0f 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1262,28 +1262,6 @@ TRACE_EVENT(nfsd_drc_mismatch, __entry->ingress) ); -TRACE_EVENT_CONDITION(nfsd_drc_gc, - TP_PROTO( - const struct nfsd_net *nn, - unsigned long freed - ), - TP_ARGS(nn, freed), - TP_CONDITION(freed > 0), - TP_STRUCT__entry( - __field(unsigned long long, boot_time) - __field(unsigned long, freed) - __field(int, total) - ), - TP_fast_assign( - __entry->boot_time = nn->boot_time; - __entry->freed = freed; - __entry->total = atomic_read(&nn->num_drc_entries); - ), - TP_printk("boot_time=%16llx total=%d freed=%lu", - __entry->boot_time, __entry->total, __entry->freed - ) -); - TRACE_EVENT(nfsd_cb_args, TP_PROTO( const struct nfs4_client *clp, -- cgit v1.2.3 From a853ed552545e116cf9b197b8c3c5cec80077f1e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 19 Nov 2023 20:17:11 -0500 Subject: NFSD: Document lack of f_pos_lock in nfsd_readdir() Al Viro notes that normal system calls hold f_pos_lock when calling ->iterate_shared and ->llseek; however nfsd_readdir() does not take that mutex when calling these methods. It should be safe, however, because the struct file acquired by nfsd_readdir() is not visible to other threads. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index e01e4e2acbd9..c260cbfa8176 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -2102,9 +2102,23 @@ static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp, return cdp->err; } -/* - * Read entries from a directory. - * The NFSv3/4 verifier we ignore for now. +/** + * nfsd_readdir - Read entries from a directory + * @rqstp: RPC transaction context + * @fhp: NFS file handle of directory to be read + * @offsetp: OUT: seek offset of final entry that was read + * @cdp: OUT: an eof error value + * @func: entry filler actor + * + * This implementation ignores the NFSv3/4 verifier cookie. + * + * NB: normal system calls hold file->f_pos_lock when calling + * ->iterate_shared and ->llseek, but nfsd_readdir() does not. + * Because the struct file acquired here is not visible to other + * threads, its internal state does not need mutex protection. + * + * Returns nfs_ok on success, otherwise an nfsstat code is + * returned.
*/ __be32 nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, -- cgit v1.2.3 From deb704281f076097b0347116a82edeba96697db1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Nov 2023 17:14:27 -0500 Subject: SUNRPC: Add a server-side API for retrieving an RPC's pseudoflavor NFSD will use this new API to determine whether nfsd_splice_read is safe to use. This avoids the need to add a dependency to NFSD for CONFIG_SUNRPC_GSS. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svcauth.h | 7 ++++++- net/sunrpc/auth_gss/svcauth_gss.c | 6 ++++++ net/sunrpc/svcauth.c | 16 ++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index 6f90203edbf8..61c455f1e1f5 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -131,8 +131,11 @@ enum svc_auth_status { * This call releases a domain. * * set_client() - * Givens a pending request (struct svc_rqst), finds and assigns + * Given a pending request (struct svc_rqst), finds and assigns * an appropriate 'auth_domain' as the client. + * + * pseudoflavor() + * Returns RPC_AUTH pseudoflavor in use by @rqstp. */ struct auth_ops { char * name; @@ -143,11 +146,13 @@ struct auth_ops { int (*release)(struct svc_rqst *rqstp); void (*domain_release)(struct auth_domain *dom); enum svc_auth_status (*set_client)(struct svc_rqst *rqstp); + rpc_authflavor_t (*pseudoflavor)(struct svc_rqst *rqstp); }; struct svc_xprt; extern enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp); +extern rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp); extern int svc_authorise(struct svc_rqst *rqstp); extern enum svc_auth_status svc_set_client(struct svc_rqst *rqstp); extern int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 18734e70c5dd..104d9a320142 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -2014,6 +2014,11 @@ svcauth_gss_domain_release(struct auth_domain *dom) call_rcu(&dom->rcu_head, svcauth_gss_domain_release_rcu); } +static rpc_authflavor_t svcauth_gss_pseudoflavor(struct svc_rqst *rqstp) +{ + return svcauth_gss_flavor(rqstp->rq_gssclient); +} + static struct auth_ops svcauthops_gss = { .name = "rpcsec_gss", .owner = THIS_MODULE, @@ -2022,6 +2027,7 @@ static struct auth_ops svcauthops_gss = { .release = svcauth_gss_release, .domain_release = svcauth_gss_domain_release, .set_client = svcauth_gss_set_client, + .pseudoflavor = svcauth_gss_pseudoflavor, }; static int rsi_cache_create_net(struct net *net) diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index aa4429d0b810..1619211f0960 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -160,6 +160,22 @@ svc_auth_unregister(rpc_authflavor_t flavor) } EXPORT_SYMBOL_GPL(svc_auth_unregister); +/** + * svc_auth_flavor - return RPC transaction's RPC_AUTH flavor + * @rqstp: RPC transaction context + * + * Returns an RPC flavor or GSS pseudoflavor. + */ +rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp) +{ + struct auth_ops *aops = rqstp->rq_authop; + + if (!aops->pseudoflavor) + return aops->flavour; + return aops->pseudoflavor(rqstp); +} +EXPORT_SYMBOL_GPL(svc_auth_flavor); + /************************************************** * 'auth_domains' are stored in a hash table indexed by name. 
* When the last reference to an 'auth_domain' is dropped, -- cgit v1.2.3 From c21fd7a8e86c0e069b512462ffd69bcf179387c8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Nov 2023 17:14:33 -0500 Subject: NFSD: Replace RQ_SPLICE_OK in nfsd_read() RQ_SPLICE_OK is a bit of a layering violation. Also, a subsequent patch is going to provide a mechanism for always disabling splice reads. Splicing is an issue only for NFS READs, so refactor nfsd_read() to check the auth type directly instead of relying on an rq_flag setting. The new helper will be added into the NFSv4 read path in a subsequent patch. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 26 +++++++++++++++++++++++++- fs/nfsd/vfs.h | 1 + 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c260cbfa8176..cc63fd52a493 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1209,6 +1209,30 @@ out_nfserr: return nfserr; } +/** + * nfsd_read_splice_ok - check if spliced reading is supported + * @rqstp: RPC transaction context + * + * Return values: + * %true: nfsd_splice_read() may be used + * %false: nfsd_splice_read() must not be used + * + * NFS READ normally uses splice to send data in-place. However the + * data in cache can change after the reply's MIC is computed but + * before the RPC reply is sent. To prevent the client from + * rejecting the server-computed MIC in this somewhat rare case, do + * not use splice with the GSS integrity and privacy services. + */ +bool nfsd_read_splice_ok(struct svc_rqst *rqstp) +{ + switch (svc_auth_flavor(rqstp)) { + case RPC_AUTH_GSS_KRB5I: + case RPC_AUTH_GSS_KRB5P: + return false; + } + return true; +} + /** * nfsd_read - Read data from a file * @rqstp: RPC transaction context @@ -1238,7 +1262,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, return err; file = nf->nf_file; - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) + if (file->f_op->splice_read && nfsd_read_splice_ok(rqstp)) err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof); else err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index e3c29596f4df..702fbc4483bf 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -114,6 +114,7 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count, unsigned int base, u32 *eof); +bool nfsd_read_splice_ok(struct svc_rqst *rqstp); __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, unsigned long *count, u32 *eof); -- cgit v1.2.3 From a2c91753a4f3771a9b46eb42e0c46654819149a4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Nov 2023 17:14:40 -0500 Subject: NFSD: Modify NFSv4 to use nfsd_read_splice_ok() Avoid the use of an atomic bitop, and prepare for adding a run-time switch for using splice reads. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 7 +++++-- fs/nfsd/nfs4xdr.c | 13 ++++++++----- fs/nfsd/xdr4.h | 1 + 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6f2d4aa4970d..14712fa08f76 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -970,8 +970,11 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * To ensure proper ordering, we therefore turn off zero copy if * the client wants us to do more in this compound: */ - if (!nfsd4_last_compound_op(rqstp)) - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + if (!nfsd4_last_compound_op(rqstp)) { + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + + argp->splice_ok = false; + } /* check stateid */ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b499fe9caa32..c719c475a068 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2524,8 +2524,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) svc_reserve(argp->rqstp, max_reply + readbytes); argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; + argp->splice_ok = nfsd_read_splice_ok(argp->rqstp); if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) - clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); + argp->splice_ok = false; return true; } @@ -4375,12 +4376,13 @@ static __be32 nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { + struct nfsd4_compoundargs *argp = resp->rqstp->rq_argp; struct nfsd4_read *read = &u->read; - bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); - unsigned long maxcount; struct xdr_stream *xdr = resp->xdr; - struct file *file; int starting_len = xdr->buf->len; + bool splice_ok = argp->splice_ok; + unsigned long maxcount; + struct file *file; __be32 *p; if (nfserr) @@ -5201,9 +5203,10 @@ static __be32 nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *read) { - bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); + struct nfsd4_compoundargs *argp = resp->rqstp->rq_argp; struct file *file = read->rd_nf->nf_file; struct xdr_stream *xdr = resp->xdr; + bool splice_ok = argp->splice_ok; unsigned long maxcount; __be32 nfserr, *p; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 80e859dc84d8..415516c1b27e 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -840,6 +840,7 @@ struct nfsd4_compoundargs { u32 minorversion; u32 client_opcnt; u32 opcnt; + bool splice_ok; struct nfsd4_op *ops; struct nfsd4_op iops[8]; }; -- cgit v1.2.3 From 3587b5c75376fd0b6ca8c4a8de54954e410f4e0e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Nov 2023 17:14:46 -0500 Subject: SUNRPC: Remove RQ_SPLICE_OK This flag is no longer used. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 -- include/trace/events/sunrpc.h | 1 - net/sunrpc/auth_gss/svcauth_gss.c | 10 ---------- net/sunrpc/svc.c | 2 -- 4 files changed, 15 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index b10f987509cc..544fcfe07479 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -260,8 +260,6 @@ enum { RQ_LOCAL, /* local request */ RQ_USEDEFERRAL, /* use deferral */ RQ_DROPME, /* drop current reply */ - RQ_SPLICE_OK, /* turned off in gss privacy to prevent - * encrypting page cache pages */ RQ_VICTIM, /* Have agreed to shut down */ RQ_DATA, /* request has data */ }; diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 337c90787fb1..cdd3a45e6003 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1675,7 +1675,6 @@ DEFINE_SVCXDRBUF_EVENT(sendto); svc_rqst_flag(LOCAL) \ svc_rqst_flag(USEDEFERRAL) \ svc_rqst_flag(DROPME) \ - svc_rqst_flag(SPLICE_OK) \ svc_rqst_flag(VICTIM) \ svc_rqst_flag_end(DATA) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 104d9a320142..24de94184700 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -866,14 +866,6 @@ svcauth_gss_unwrap_integ(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) struct xdr_buf databody_integ; struct xdr_netobj checksum; - /* NFS READ normally uses splice to send data in-place. However - * the data in cache can change after the reply's MIC is computed - * but before the RPC reply is sent. To prevent the client from - * rejecting the server-computed MIC in this somewhat rare case, - * do not use splice with the GSS integrity service. - */ - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); - /* Did we already verify the signature on the original pass through? */ if (rqstp->rq_deferred) return 0; @@ -948,8 +940,6 @@ svcauth_gss_unwrap_priv(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) struct xdr_buf *buf = xdr->buf; unsigned int saved_len; - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); - if (xdr_stream_decode_u32(xdr, &len) < 0) goto unwrap_failed; if (rqstp->rq_deferred) { diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 3f2ea7a0496f..fa4e23fa0e09 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1305,8 +1305,6 @@ svc_process_common(struct svc_rqst *rqstp) int rc; __be32 *p; - /* Will be turned off by GSS integrity and privacy services */ - set_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* Will be turned off only when NFSv4 Sessions are used */ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); clear_bit(RQ_DROPME, &rqstp->rq_flags); -- cgit v1.2.3 From 3c86e615d17d1f6194ff222247d291fc9df16ff4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 4 Dec 2023 15:30:06 +0300 Subject: nfsd: remove unnecessary NULL check We check "state" for NULL on the previous line so it can't be NULL here. No need to check again. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202312031425.LffZTarR-lkp@intel.com/ Signed-off-by: Dan Carpenter Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3edbfa0233e6..2fa54cfd4882 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6575,7 +6575,7 @@ unlock: spin_unlock(&nn->s2s_cp_lock); if (!state) return nfserr_bad_stateid; - if (!clp && state) + if (!clp) *cps = state; return 0; } -- cgit v1.2.3 From 52e89100754b2e888cb63bf2d19e65d809497cd6 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Sat, 2 Dec 2023 21:07:25 +0000 Subject: NFSv4, NFSD: move enum nfs_cb_opnum4 to include/linux/nfs4.h The callback operations enum is defined in both the client and the server; move it to a common header file. Signed-off-by: ChenXiaoSong Acked-by: Anna Schumaker Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfs/callback.h | 19 ------------------- fs/nfsd/nfs4callback.c | 26 +------------------------- include/linux/nfs4.h | 22 ++++++++++++++++++++++ 3 files changed, 23 insertions(+), 44 deletions(-) diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index ccd4f245cae2..0279b78b5fc9 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -19,25 +19,6 @@ enum nfs4_callback_procnum { CB_COMPOUND = 1, }; -enum nfs4_callback_opnum { - OP_CB_GETATTR = 3, - OP_CB_RECALL = 4, -/* Callback operations new to NFSv4.1 */ - OP_CB_LAYOUTRECALL = 5, - OP_CB_NOTIFY = 6, - OP_CB_PUSH_DELEG = 7, - OP_CB_RECALL_ANY = 8, - OP_CB_RECALLABLE_OBJ_AVAIL = 9, - OP_CB_RECALL_SLOT = 10, - OP_CB_SEQUENCE = 11, - OP_CB_WANTS_CANCELLED = 12, - OP_CB_NOTIFY_LOCK = 13, - OP_CB_NOTIFY_DEVICEID = 14, -/* Callback operations new to NFSv4.2 */ - OP_CB_OFFLOAD = 15, - OP_CB_ILLEGAL = 10044, -}; - struct nfs4_slot; struct cb_process_state { __be32 drc_status; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 4039ffcf90ba..926c29879c6a 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -31,6 +31,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <linux/nfs4.h> #include #include #include @@ -87,31 +88,6 @@ static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap, WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0); } -/* - * nfs_cb_opnum4 - * - * enum nfs_cb_opnum4 { - * OP_CB_GETATTR = 3, - * ...
- * }; - */ -enum nfs_cb_opnum4 { - OP_CB_GETATTR = 3, - OP_CB_RECALL = 4, - OP_CB_LAYOUTRECALL = 5, - OP_CB_NOTIFY = 6, - OP_CB_PUSH_DELEG = 7, - OP_CB_RECALL_ANY = 8, - OP_CB_RECALLABLE_OBJ_AVAIL = 9, - OP_CB_RECALL_SLOT = 10, - OP_CB_SEQUENCE = 11, - OP_CB_WANTS_CANCELLED = 12, - OP_CB_NOTIFY_LOCK = 13, - OP_CB_NOTIFY_DEVICEID = 14, - OP_CB_OFFLOAD = 15, - OP_CB_ILLEGAL = 10044 -}; - static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op) { __be32 *p; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index c11c4db34639..ef8d2d618d5b 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -869,4 +869,26 @@ enum { RCA4_TYPE_MASK_OTHER_LAYOUT_MAX = 15, }; +enum nfs_cb_opnum4 { + OP_CB_GETATTR = 3, + OP_CB_RECALL = 4, + + /* Callback operations new to NFSv4.1 */ + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, + + /* Callback operations new to NFSv4.2 */ + OP_CB_OFFLOAD = 15, + + OP_CB_ILLEGAL = 10044, +}; + #endif -- cgit v1.2.3 From b541dd554bc0442f7ff8c6cab6c5460c044913c8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Nov 2023 11:40:13 -0500 Subject: svcrdma: Eliminate allocation of recv_ctxt objects in backchannel The svc_rdma_recv_ctxt free list uses a lockless list to avoid the need for a spin lock in the fast path. llist_del_first(), which is used by svc_rdma_recv_ctxt_get(), requires serialization, however, when there are multiple list producers that are unserialized. I mistakenly thought there was only one caller of svc_rdma_recv_ctxt_get() (svc_rdma_refresh_recvs()), thus explicit serialization would not be necessary. But there is another caller: svc_rdma_bc_sendto(), and these two are not serialized against each other. I haven't seen ill effects that I could directly ascribe to a lack of serialization. It's just an observation based on code audit. When DMA-mapping before sending a Reply, the passed-in struct svc_rdma_recv_ctxt is used only for its write and reply PCLs. These are currently always empty in the backchannel case. So, instead of passing a full svc_rdma_recv_ctxt object to svc_rdma_map_reply_msg(), let's pass in just the Write and Reply PCLs. This change makes it unnecessary for the backchannel to acquire a dummy svc_rdma_recv_ctxt object when sending an RPC Call. The need for svc_rdma_recv_ctxt free list serialization is now completely avoided. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 3 ++- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 11 ++++------- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 31 ++++++++++++++++-------------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index a5ee0af2a310..4ac32895a058 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -200,7 +200,8 @@ extern int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, - const struct svc_rdma_recv_ctxt *rctxt, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, const struct xdr_buf *xdr); extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 7420a2c990c7..c9be6778643b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -76,15 +76,12 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst, struct svc_rdma_send_ctxt *sctxt) { - struct svc_rdma_recv_ctxt *rctxt; + struct svc_rdma_pcl empty_pcl; int ret; - rctxt = svc_rdma_recv_ctxt_get(rdma); - if (!rctxt) - return -EIO; - - ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqst->rq_snd_buf); - svc_rdma_recv_ctxt_put(rdma, rctxt); + pcl_init(&empty_pcl); + ret = svc_rdma_map_reply_msg(rdma, sctxt, &empty_pcl, &empty_pcl, + &rqst->rq_snd_buf); if (ret < 0) return -EIO; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index c6644cca52c5..45735f74eb86 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -653,7 +653,7 @@ static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr, * svc_rdma_pull_up_needed - Determine whether to use pull-up * @rdma: controlling transport * @sctxt: send_ctxt for the Send WR - * @rctxt: Write and Reply chunks provided by client + * @write_pcl: Write chunk list provided by client * @xdr: xdr_buf containing RPC message to transmit * * Returns: @@ -662,7 +662,7 @@ static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr, */ static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma, const struct svc_rdma_send_ctxt *sctxt, - const struct svc_rdma_recv_ctxt *rctxt, + const struct svc_rdma_pcl *write_pcl, const struct xdr_buf *xdr) { /* Resources needed for the transport header */ @@ -672,7 +672,7 @@ static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma, }; int ret; - ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + ret = pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_count_sges, &args); if (ret < 0) return false; @@ -728,7 +728,7 @@ static int svc_rdma_xb_linearize(const struct xdr_buf *xdr, * svc_rdma_pull_up_reply_msg - Copy Reply into a single buffer * @rdma: controlling transport * @sctxt: send_ctxt for the Send WR; xprt hdr is already prepared - * @rctxt: Write and Reply chunks provided by client + * @write_pcl: Write chunk list provided by client * @xdr: prepared xdr_buf containing RPC message * * The device is not capable of sending the reply directly. 
@@ -743,7 +743,7 @@ static int svc_rdma_xb_linearize(const struct xdr_buf *xdr, */ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, - const struct svc_rdma_recv_ctxt *rctxt, + const struct svc_rdma_pcl *write_pcl, const struct xdr_buf *xdr) { struct svc_rdma_pullup_data args = { @@ -751,7 +751,7 @@ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, }; int ret; - ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + ret = pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_linearize, &args); if (ret < 0) return ret; @@ -764,7 +764,8 @@ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, /* svc_rdma_map_reply_msg - DMA map the buffer holding RPC message * @rdma: controlling transport * @sctxt: send_ctxt for the Send WR - * @rctxt: Write and Reply chunks provided by client + * @write_pcl: Write chunk list provided by client + * @reply_pcl: Reply chunk provided by client * @xdr: prepared xdr_buf containing RPC message * * Returns: @@ -776,7 +777,8 @@ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, */ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, - const struct svc_rdma_recv_ctxt *rctxt, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, const struct xdr_buf *xdr) { struct svc_rdma_map_data args = { @@ -789,18 +791,18 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; /* If there is a Reply chunk, nothing follows the transport - * header, and we're done here. + * header, so there is nothing to map. */ - if (!pcl_is_empty(&rctxt->rc_reply_pcl)) + if (!pcl_is_empty(reply_pcl)) return 0; /* For pull-up, svc_rdma_send() will sync the transport header. * No additional DMA mapping is necessary. */ - if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr)) - return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr); + if (svc_rdma_pull_up_needed(rdma, sctxt, write_pcl, xdr)) + return svc_rdma_pull_up_reply_msg(rdma, sctxt, write_pcl, xdr); - return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + return pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_dma_map, &args); } @@ -848,7 +850,8 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, { int ret; - ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqstp->rq_res); + ret = svc_rdma_map_reply_msg(rdma, sctxt, &rctxt->rc_write_pcl, + &rctxt->rc_reply_pcl, &rqstp->rq_res); if (ret < 0) return ret; -- cgit v1.2.3 From 877118c667abe0df7e4d7b0607f77806a9d2df91 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Nov 2023 11:40:20 -0500 Subject: svcrdma: Pre-allocate svc_rdma_recv_ctxt objects The original reason for allocating svc_rdma_recv_ctxt objects during Receive completion was to ensure the objects were allocated on the NUMA node closest to the underlying IB device. Since commit c5d68d25bd6b ("svcrdma: Clean up allocation of svc_rdma_recv_ctxt"), however, the device's favored node is explicitly passed to the memory allocator. To enable switching Receive completion to soft IRQ context, move memory allocation out of completion handling, since it can be costly, and it can sleep. A limited number of objects is now allocated at "accept" time. 
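[ A worked example of the sizing in the diff below, with hypothetical numbers: a transport negotiating sc_max_requests = 64 credits with sc_recv_batch = 8 pre-allocates (64 * 2) + 8 = 136 recv_ctxts at accept time: one posted Receive plus one RPC in process per credit, plus one Receive batch of slack. ]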
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 3b05f90a3e50..c8c1c534070b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -204,18 +204,11 @@ struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) node = llist_del_first(&rdma->sc_recv_ctxts); if (!node) - goto out_empty; - ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); + return NULL; -out: + ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); ctxt->rc_page_count = 0; return ctxt; - -out_empty: - ctxt = svc_rdma_recv_ctxt_alloc(rdma); - if (!ctxt) - return NULL; - goto out; } /** @@ -277,7 +270,7 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, rdma->sc_pending_recvs++; } if (!recv_chain) - return false; + return true; ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr); if (ret) @@ -301,10 +294,27 @@ err_free: * svc_rdma_post_recvs - Post initial set of Recv WRs * @rdma: fresh svcxprt_rdma * - * Returns true if successful, otherwise false. + * Return values: + * %true: Receive Queue initialization successful + * %false: memory allocation or DMA error */ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) { + unsigned int total; + + /* For each credit, allocate enough recv_ctxts for one + * posted Receive and one RPC in process. + */ + total = (rdma->sc_max_requests * 2) + rdma->sc_recv_batch; + while (total--) { + struct svc_rdma_recv_ctxt *ctxt; + + ctxt = svc_rdma_recv_ctxt_alloc(rdma); + if (!ctxt) + return false; + llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); + } + return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests); } -- cgit v1.2.3 From 9c7e1a06588ee6962afe0dfe5a398e1d23212005 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Nov 2023 11:40:26 -0500 Subject: svcrdma: Add a utility workqueue to svcrdma To handle work in the background, set up an UNBOUND workqueue for svcrdma. Subsequent patches will make use of it. 
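[ The usage pattern the next two patches adopt, in sketch form with a hypothetical object type: embed a work_struct in the object and let the put path hand the expensive teardown to svcrdma_wq instead of doing it in the completion handler.

    static void obj_release_async(struct work_struct *work)
    {
        struct obj *o = container_of(work, struct obj, o_work);

        /* DMA unmapping and freeing run here, off the
         * completion path, potentially in parallel.
         */
    }

    static void obj_put(struct obj *o)
    {
        INIT_WORK(&o->o_work, obj_release_async);
        queue_work(svcrdma_wq, &o->o_work);
    }

Note the flush_workqueue(svcrdma_wq) call added to __svc_rdma_free() in the diff below; it ensures no deferred release outlives its transport. ]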
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma.c | 32 ++++++++++++++++++++++++-------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 + 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 4ac32895a058..e18c94e816b3 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -65,6 +65,7 @@ extern unsigned int svcrdma_ord; extern unsigned int svcrdma_max_requests; extern unsigned int svcrdma_max_bc_requests; extern unsigned int svcrdma_max_req_size; +extern struct workqueue_struct *svcrdma_wq; extern struct percpu_counter svcrdma_stat_read; extern struct percpu_counter svcrdma_stat_recv; diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index f0d5eeed4c88..f86970733eb0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -256,28 +256,44 @@ out_err: return rc; } +struct workqueue_struct *svcrdma_wq; + void svc_rdma_cleanup(void) { - dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); svc_unreg_xprt_class(&svc_rdma_class); svc_rdma_proc_cleanup(); + if (svcrdma_wq) { + struct workqueue_struct *wq = svcrdma_wq; + + svcrdma_wq = NULL; + destroy_workqueue(wq); + } + + dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); } int svc_rdma_init(void) { + struct workqueue_struct *wq; int rc; - dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); - dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); - dprintk("\tmax_requests : %u\n", svcrdma_max_requests); - dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); - dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); + wq = alloc_workqueue("svcrdma", WQ_UNBOUND, 0); + if (!wq) + return -ENOMEM; rc = svc_rdma_proc_init(); - if (rc) + if (rc) { + destroy_workqueue(wq); return rc; + } - /* Register RDMA with the SVC transport switch */ + svcrdma_wq = wq; svc_reg_xprt_class(&svc_rdma_class); + + dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); + dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); + dprintk("\tmax_requests : %u\n", svcrdma_max_requests); + dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); + dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 2abd895046ee..c046916df007 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -547,6 +547,7 @@ static void __svc_rdma_free(struct work_struct *work) /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_drain_qp(rdma->sc_qp); + flush_workqueue(svcrdma_wq); svc_rdma_flush_recv_queues(rdma); -- cgit v1.2.3 From ae225fe27b931de89b6b1e1bbe6de4de23000850 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Nov 2023 11:40:33 -0500 Subject: svcrdma: Add an async version of svc_rdma_send_ctxt_put() DMA unmapping can take quite some time, so it should not be handled in a single-threaded completion handler. Defer releasing send_ctxts to the recently-added workqueue. With this patch, DMA unmapping can be handled in parallel, and it does not cause head-of-queue blocking of Send completions. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 ++ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 34 +++++++++++++++++++++++++--------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index e18c94e816b3..ab250017b99f 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -152,7 +152,9 @@ struct svc_rdma_recv_ctxt { struct svc_rdma_send_ctxt { struct llist_node sc_node; struct rpc_rdma_cid sc_cid; + struct work_struct sc_work; + struct svcxprt_rdma *sc_rdma; struct ib_send_wr sc_send_wr; struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 45735f74eb86..22c39ba923d2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -143,6 +143,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) svc_rdma_send_cid_init(rdma, &ctxt->sc_cid); + ctxt->sc_rdma = rdma; ctxt->sc_send_wr.next = NULL; ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; ctxt->sc_send_wr.sg_list = ctxt->sc_sges; @@ -223,15 +224,8 @@ out_empty: goto out; } -/** - * svc_rdma_send_ctxt_put - Return send_ctxt to free list - * @rdma: controlling svcxprt_rdma - * @ctxt: object to return to the free list - * - * Pages left in sc_pages are DMA unmapped and released. - */ -void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt) +static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) { struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; @@ -255,6 +249,28 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts); } +static void svc_rdma_send_ctxt_put_async(struct work_struct *work) +{ + struct svc_rdma_send_ctxt *ctxt; + + ctxt = container_of(work, struct svc_rdma_send_ctxt, sc_work); + svc_rdma_send_ctxt_release(ctxt->sc_rdma, ctxt); +} + +/** + * svc_rdma_send_ctxt_put - Return send_ctxt to free list + * @rdma: controlling svcxprt_rdma + * @ctxt: object to return to the free list + * + * Pages left in sc_pages are DMA unmapped and released. + */ +void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + INIT_WORK(&ctxt->sc_work, svc_rdma_send_ctxt_put_async); + queue_work(svcrdma_wq, &ctxt->sc_work); +} + /** * svc_rdma_wake_send_waiters - manage Send Queue accounting * @rdma: controlling transport -- cgit v1.2.3 From f09c36c8dffc7dcf796b862bffdda5753bec84ef Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Nov 2023 11:40:39 -0500 Subject: svcrdma: Add an async version of svc_rdma_write_info_free() DMA unmapping can take quite some time, so it should not be handled in a single-threaded completion handler. Defer releasing write_info structs to the recently-added workqueue. With this patch, DMA unmapping can be handled in parallel, and it does not cause head-of-queue blocking of Write completions. 
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index e460e25a1d6d..de1ec3220aab 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -227,6 +227,7 @@ struct svc_rdma_write_info { unsigned int wi_next_off; struct svc_rdma_chunk_ctxt wi_cc; + struct work_struct wi_work; }; static struct svc_rdma_write_info * @@ -248,12 +249,21 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, return info; } -static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) +static void svc_rdma_write_info_free_async(struct work_struct *work) { + struct svc_rdma_write_info *info; + + info = container_of(work, struct svc_rdma_write_info, wi_work); svc_rdma_cc_release(&info->wi_cc, DMA_TO_DEVICE); kfree(info); } +static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) +{ + INIT_WORK(&info->wi_work, svc_rdma_write_info_free_async); + queue_work(svcrdma_wq, &info->wi_work); +} + /** * svc_rdma_write_done - Write chunk completion * @cq: controlling Completion Queue -- cgit v1.2.3 From bfb81535c2660d2bb8496e5cbb7480693188cd72 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Nov 2023 11:40:46 -0500 Subject: svcrdma: Clean up locking There's no need to protect llist_entry() with a spin lock. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 22c39ba923d2..09f5d0570bc9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -201,10 +201,11 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) spin_lock(&rdma->sc_send_lock); node = llist_del_first(&rdma->sc_send_ctxts); + spin_unlock(&rdma->sc_send_lock); if (!node) goto out_empty; + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); - spin_unlock(&rdma->sc_send_lock); out: rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0); @@ -217,7 +218,6 @@ out: return ctxt; out_empty: - spin_unlock(&rdma->sc_send_lock); ctxt = svc_rdma_send_ctxt_alloc(rdma); if (!ctxt) return NULL; -- cgit v1.2.3 From 907e34a7d01d99d7e10a6090f2bdd247bbc5de9a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 27 Nov 2023 11:33:24 -0500 Subject: svcrdma: Add lockdep class keys for transport locks Two svcrdma-related transport locks can become quite contended. Collate their use and make them easy to find in /proc/lock_stat for better observability. 
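[ Observability note: on a kernel built with CONFIG_LOCK_STAT=y, writing 1 to /proc/sys/kernel/lock_stat starts collection; the classes registered below then show up in /proc/lock_stat under the grep-friendly key names svcrdma_dto_lock, svcrdma_sctx_lock, and svcrdma_rwctx_lock, rather than under names derived from the spin_lock_init() call sites. ]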
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index c046916df007..3826da1c15f3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -125,6 +125,9 @@ static void qp_event_handler(struct ib_event *event, void *context) static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, struct net *net, int node) { + static struct lock_class_key svcrdma_rwctx_lock; + static struct lock_class_key svcrdma_sctx_lock; + static struct lock_class_key svcrdma_dto_lock; struct svcxprt_rdma *cma_xprt; cma_xprt = kzalloc_node(sizeof(*cma_xprt), GFP_KERNEL, node); @@ -141,8 +144,11 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); + lockdep_set_class(&cma_xprt->sc_rq_dto_lock, &svcrdma_dto_lock); spin_lock_init(&cma_xprt->sc_send_lock); + lockdep_set_class(&cma_xprt->sc_send_lock, &svcrdma_sctx_lock); spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); + lockdep_set_class(&cma_xprt->sc_rw_ctxt_lock, &svcrdma_rwctx_lock); /* * Note that this implies that the underlying transport support -- cgit v1.2.3 From be2acb104880dbd5582c898d000cab5f38750bb9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 27 Nov 2023 11:33:30 -0500 Subject: rpcrdma: Introduce a simple cid tracepoint class De-duplicate some code, making it easier to add new tracepoints that report only a completion ID. Signed-off-by: Chuck Lever --- include/trace/events/rpcrdma.h | 93 +++++++++------------------------ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_rw.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 2 +- 5 files changed, 30 insertions(+), 71 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 718df1d9b834..b3445e07c151 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -22,47 +22,37 @@ ** Event classes **/ -DECLARE_EVENT_CLASS(rpcrdma_completion_class, +DECLARE_EVENT_CLASS(rpcrdma_simple_cid_class, TP_PROTO( - const struct ib_wc *wc, const struct rpc_rdma_cid *cid ), - TP_ARGS(wc, cid), + TP_ARGS(cid), TP_STRUCT__entry( __field(u32, cq_id) __field(int, completion_id) - __field(unsigned long, status) - __field(unsigned int, vendor_err) ), TP_fast_assign( __entry->cq_id = cid->ci_queue_id; __entry->completion_id = cid->ci_completion_id; - __entry->status = wc->status; - if (wc->status) - __entry->vendor_err = wc->vendor_err; - else - __entry->vendor_err = 0; ), - TP_printk("cq.id=%u cid=%d status=%s (%lu/0x%x)", - __entry->cq_id, __entry->completion_id, - rdma_show_wc_status(__entry->status), - __entry->status, __entry->vendor_err + TP_printk("cq.id=%d cid=%d", + __entry->cq_id, __entry->completion_id ) ); -#define DEFINE_COMPLETION_EVENT(name) \ - DEFINE_EVENT(rpcrdma_completion_class, name, \ +#define DEFINE_SIMPLE_CID_EVENT(name) \ + DEFINE_EVENT(rpcrdma_simple_cid_class, name, \ TP_PROTO( \ - const struct ib_wc *wc, \ const struct rpc_rdma_cid *cid \ ), \ - TP_ARGS(wc, cid)) + TP_ARGS(cid) \ + ) -DECLARE_EVENT_CLASS(rpcrdma_send_completion_class, +DECLARE_EVENT_CLASS(rpcrdma_completion_class, TP_PROTO( const struct ib_wc *wc, const struct rpc_rdma_cid *cid @@ -73,20 +63,29 @@ DECLARE_EVENT_CLASS(rpcrdma_send_completion_class, TP_STRUCT__entry( __field(u32, cq_id) __field(int, 
completion_id) + __field(unsigned long, status) + __field(unsigned int, vendor_err) ), TP_fast_assign( __entry->cq_id = cid->ci_queue_id; __entry->completion_id = cid->ci_completion_id; + __entry->status = wc->status; + if (wc->status) + __entry->vendor_err = wc->vendor_err; + else + __entry->vendor_err = 0; ), - TP_printk("cq.id=%u cid=%d", - __entry->cq_id, __entry->completion_id + TP_printk("cq.id=%u cid=%d status=%s (%lu/0x%x)", + __entry->cq_id, __entry->completion_id, + rdma_show_wc_status(__entry->status), + __entry->status, __entry->vendor_err ) ); -#define DEFINE_SEND_COMPLETION_EVENT(name) \ - DEFINE_EVENT(rpcrdma_send_completion_class, name, \ +#define DEFINE_COMPLETION_EVENT(name) \ + DEFINE_EVENT(rpcrdma_completion_class, name, \ TP_PROTO( \ const struct ib_wc *wc, \ const struct rpc_rdma_cid *cid \ @@ -978,27 +977,7 @@ TRACE_EVENT(xprtrdma_post_send_err, ) ); -TRACE_EVENT(xprtrdma_post_recv, - TP_PROTO( - const struct rpcrdma_rep *rep - ), - - TP_ARGS(rep), - - TP_STRUCT__entry( - __field(u32, cq_id) - __field(int, completion_id) - ), - - TP_fast_assign( - __entry->cq_id = rep->rr_cid.ci_queue_id; - __entry->completion_id = rep->rr_cid.ci_completion_id; - ), - - TP_printk("cq.id=%d cid=%d", - __entry->cq_id, __entry->completion_id - ) -); +DEFINE_SIMPLE_CID_EVENT(xprtrdma_post_recv); TRACE_EVENT(xprtrdma_post_recvs, TP_PROTO( @@ -2020,31 +1999,11 @@ TRACE_EVENT(svcrdma_post_send, ) ); -DEFINE_SEND_COMPLETION_EVENT(svcrdma_wc_send); +DEFINE_SIMPLE_CID_EVENT(svcrdma_wc_send); DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_send_flush); DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_send_err); -TRACE_EVENT(svcrdma_post_recv, - TP_PROTO( - const struct svc_rdma_recv_ctxt *ctxt - ), - - TP_ARGS(ctxt), - - TP_STRUCT__entry( - __field(u32, cq_id) - __field(int, completion_id) - ), - - TP_fast_assign( - __entry->cq_id = ctxt->rc_cid.ci_queue_id; - __entry->completion_id = ctxt->rc_cid.ci_completion_id; - ), - - TP_printk("cq.id=%d cid=%d", - __entry->cq_id, __entry->completion_id - ) -); +DEFINE_SIMPLE_CID_EVENT(svcrdma_post_recv); DEFINE_RECEIVE_SUCCESS_EVENT(svcrdma_wc_recv); DEFINE_RECEIVE_FLUSH_EVENT(svcrdma_wc_recv_flush); @@ -2153,7 +2112,7 @@ TRACE_EVENT(svcrdma_wc_read, DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_read_flush); DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_read_err); -DEFINE_SEND_COMPLETION_EVENT(svcrdma_wc_write); +DEFINE_SIMPLE_CID_EVENT(svcrdma_wc_write); DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_write_flush); DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_write_err); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index c8c1c534070b..72374033bb2b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -264,7 +264,7 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, if (!ctxt) break; - trace_svcrdma_post_recv(ctxt); + trace_svcrdma_post_recv(&ctxt->rc_cid); ctxt->rc_recv_wr.next = recv_chain; recv_chain = &ctxt->rc_recv_wr; rdma->sc_pending_recvs++; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index de1ec3220aab..db2a4bd2f7ad 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -282,7 +282,7 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) switch (wc->status) { case IB_WC_SUCCESS: - trace_svcrdma_wc_write(wc, &cc->cc_cid); + trace_svcrdma_wc_write(&cc->cc_cid); break; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c 
b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 09f5d0570bc9..31b711deab5e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -305,7 +305,7 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) if (unlikely(wc->status != IB_WC_SUCCESS)) goto flushed; - trace_svcrdma_wc_send(wc, &ctxt->sc_cid); + trace_svcrdma_wc_send(&ctxt->sc_cid); svc_rdma_send_ctxt_put(rdma, ctxt); return; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 28c0771c4e8c..4f8d7efa469f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1364,7 +1364,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) } rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id; - trace_xprtrdma_post_recv(rep); + trace_xprtrdma_post_recv(&rep->rr_cid); rep->rr_recv_wr.next = wr; wr = &rep->rr_recv_wr; --needed; -- cgit v1.2.3 From ad3656bd84e0a91d4505095a0df7fbb6a8e0796a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 27 Nov 2023 11:33:37 -0500 Subject: svcrdma: SQ error tracepoints should report completion IDs Update the Send Queue's error flow tracepoints to report the completion ID of the waiting or failing context. This ties the wait/failure to a particular operation or request, which is a little more useful than knowing only the transport that is about to close. Signed-off-by: Chuck Lever --- include/trace/events/rpcrdma.h | 49 +++++++++++++++++++++-------------- net/sunrpc/xprtrdma/svc_rdma_rw.c | 6 ++--- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 6 ++--- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index b3445e07c151..f1c2022d39ca 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -2143,65 +2143,74 @@ TRACE_EVENT(svcrdma_qp_error, ) ); -DECLARE_EVENT_CLASS(svcrdma_sendqueue_event, +DECLARE_EVENT_CLASS(svcrdma_sendqueue_class, TP_PROTO( - const struct svcxprt_rdma *rdma + const struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid ), - TP_ARGS(rdma), + TP_ARGS(rdma, cid), TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) __field(int, avail) __field(int, depth) - __string(addr, rdma->sc_xprt.xpt_remotebuf) ), TP_fast_assign( + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; __entry->avail = atomic_read(&rdma->sc_sq_avail); __entry->depth = rdma->sc_sq_depth; - __assign_str(addr, rdma->sc_xprt.xpt_remotebuf); ), - TP_printk("addr=%s sc_sq_avail=%d/%d", - __get_str(addr), __entry->avail, __entry->depth + TP_printk("cq.id=%u cid=%d sc_sq_avail=%d/%d", + __entry->cq_id, __entry->completion_id, + __entry->avail, __entry->depth ) ); #define DEFINE_SQ_EVENT(name) \ - DEFINE_EVENT(svcrdma_sendqueue_event, svcrdma_sq_##name,\ - TP_PROTO( \ - const struct svcxprt_rdma *rdma \ - ), \ - TP_ARGS(rdma)) + DEFINE_EVENT(svcrdma_sendqueue_class, name, \ + TP_PROTO( \ + const struct svcxprt_rdma *rdma, \ + const struct rpc_rdma_cid *cid \ + ), \ + TP_ARGS(rdma, cid) \ + ) -DEFINE_SQ_EVENT(full); -DEFINE_SQ_EVENT(retry); +DEFINE_SQ_EVENT(svcrdma_sq_full); +DEFINE_SQ_EVENT(svcrdma_sq_retry); TRACE_EVENT(svcrdma_sq_post_err, TP_PROTO( const struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, int status ), - TP_ARGS(rdma, status), + TP_ARGS(rdma, cid, status), TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) __field(int, avail) __field(int, depth) __field(int, status) - __string(addr, rdma->sc_xprt.xpt_remotebuf) ), 
TP_fast_assign( + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; __entry->avail = atomic_read(&rdma->sc_sq_avail); __entry->depth = rdma->sc_sq_depth; __entry->status = status; - __assign_str(addr, rdma->sc_xprt.xpt_remotebuf); ), - TP_printk("addr=%s sc_sq_avail=%d/%d status=%d", - __get_str(addr), __entry->avail, __entry->depth, - __entry->status + TP_printk("cq.id=%u cid=%d sc_sq_avail=%d/%d status=%d", + __entry->cq_id, __entry->completion_id, + __entry->avail, __entry->depth, __entry->status ) ); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index db2a4bd2f7ad..b06e49cc55fb 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -406,14 +406,14 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) } percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma); + trace_svcrdma_sq_full(rdma, &cc->cc_cid); atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); - trace_svcrdma_sq_retry(rdma); + trace_svcrdma_sq_retry(rdma, &cc->cc_cid); } while (1); - trace_svcrdma_sq_post_err(rdma, ret); + trace_svcrdma_sq_post_err(rdma, &cc->cc_cid, ret); svc_xprt_deferred_close(&rdma->sc_xprt); /* If even one was posted, there will be a completion. */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 31b711deab5e..2ee691c45b85 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -343,13 +343,13 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) while (1) { if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma); + trace_svcrdma_sq_full(rdma, &ctxt->sc_cid); atomic_inc(&rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, atomic_read(&rdma->sc_sq_avail) > 1); if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) return -ENOTCONN; - trace_svcrdma_sq_retry(rdma); + trace_svcrdma_sq_retry(rdma, &ctxt->sc_cid); continue; } @@ -360,7 +360,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) return 0; } - trace_svcrdma_sq_post_err(rdma, ret); + trace_svcrdma_sq_post_err(rdma, &ctxt->sc_cid, ret); svc_xprt_deferred_close(&rdma->sc_xprt); wake_up(&rdma->sc_send_wait); return ret; -- cgit v1.2.3 From 848760a9e701cfab93a71374c1727491b0bc321e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 27 Nov 2023 11:33:43 -0500 Subject: svcrdma: DMA error tracepoints should report completion IDs Update the DMA error flow tracepoints to report the completion ID of the failing context. This ties the wait/failure to a particular operation or request, which is more useful than knowing only the failing transport. 
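For illustration only (an editorial sketch, not part of the patch), the
svc_rdma_rw_ctx_init() call site changes roughly as follows; the
identifiers all come from the hunks below:

    /* Before: only the failing transport was identified */
    trace_svcrdma_dma_map_rw_err(rdma, ctxt->rw_nents, ret);

    /* After: the offset/handle coordinates plus the transport's
     * Send CQ id pin the error to one rdma_rw context
     */
    trace_svcrdma_dma_map_rw_err(rdma, offset, handle,
                                 ctxt->rw_nents, ret);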
Signed-off-by: Chuck Lever --- include/trace/events/rpcrdma.h | 74 +++++++++++++++++++-------------------- net/sunrpc/xprtrdma/svc_rdma_rw.c | 9 ++--- 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index f1c2022d39ca..bba758e5fb1d 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1805,33 +1805,37 @@ DEFINE_SVC_DMA_EVENT(dma_unmap_page); TRACE_EVENT(svcrdma_dma_map_rw_err, TP_PROTO( const struct svcxprt_rdma *rdma, + u64 offset, + u32 handle, unsigned int nents, int status ), - TP_ARGS(rdma, nents, status), + TP_ARGS(rdma, offset, handle, nents, status), TP_STRUCT__entry( - __field(int, status) + __field(u32, cq_id) + __field(u32, handle) + __field(u64, offset) __field(unsigned int, nents) - __string(device, rdma->sc_cm_id->device->name) - __string(addr, rdma->sc_xprt.xpt_remotebuf) + __field(int, status) ), TP_fast_assign( - __entry->status = status; + __entry->cq_id = rdma->sc_sq_cq->res.id; + __entry->handle = handle; + __entry->offset = offset; __entry->nents = nents; - __assign_str(device, rdma->sc_cm_id->device->name); - __assign_str(addr, rdma->sc_xprt.xpt_remotebuf); + __entry->status = status; ), - TP_printk("addr=%s device=%s nents=%u status=%d", - __get_str(addr), __get_str(device), __entry->nents, - __entry->status + TP_printk("cq.id=%u 0x%016llx:0x%08x nents=%u status=%d", + __entry->cq_id, (unsigned long long)__entry->offset, + __entry->handle, __entry->nents, __entry->status ) ); -TRACE_EVENT(svcrdma_no_rwctx_err, +TRACE_EVENT(svcrdma_rwctx_empty, TP_PROTO( const struct svcxprt_rdma *rdma, unsigned int num_sges @@ -1840,79 +1844,75 @@ TRACE_EVENT(svcrdma_no_rwctx_err, TP_ARGS(rdma, num_sges), TP_STRUCT__entry( + __field(u32, cq_id) __field(unsigned int, num_sges) - __string(device, rdma->sc_cm_id->device->name) - __string(addr, rdma->sc_xprt.xpt_remotebuf) ), TP_fast_assign( + __entry->cq_id = rdma->sc_sq_cq->res.id; __entry->num_sges = num_sges; - __assign_str(device, rdma->sc_cm_id->device->name); - __assign_str(addr, rdma->sc_xprt.xpt_remotebuf); ), - TP_printk("addr=%s device=%s num_sges=%d", - __get_str(addr), __get_str(device), __entry->num_sges + TP_printk("cq.id=%u num_sges=%d", + __entry->cq_id, __entry->num_sges ) ); TRACE_EVENT(svcrdma_page_overrun_err, TP_PROTO( - const struct svcxprt_rdma *rdma, - const struct svc_rqst *rqst, + const struct rpc_rdma_cid *cid, unsigned int pageno ), - TP_ARGS(rdma, rqst, pageno), + TP_ARGS(cid, pageno), TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) __field(unsigned int, pageno) - __field(u32, xid) - __string(device, rdma->sc_cm_id->device->name) - __string(addr, rdma->sc_xprt.xpt_remotebuf) ), TP_fast_assign( + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; __entry->pageno = pageno; - __entry->xid = __be32_to_cpu(rqst->rq_xid); - __assign_str(device, rdma->sc_cm_id->device->name); - __assign_str(addr, rdma->sc_xprt.xpt_remotebuf); ), - TP_printk("addr=%s device=%s xid=0x%08x pageno=%u", __get_str(addr), - __get_str(device), __entry->xid, __entry->pageno + TP_printk("cq.id=%u cid=%d pageno=%u", + __entry->cq_id, __entry->completion_id, + __entry->pageno ) ); TRACE_EVENT(svcrdma_small_wrch_err, TP_PROTO( - const struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, unsigned int remaining, unsigned int seg_no, unsigned int num_segs ), - TP_ARGS(rdma, remaining, seg_no, num_segs), + TP_ARGS(cid, remaining, seg_no, num_segs), TP_STRUCT__entry( + 
__field(u32, cq_id) + __field(int, completion_id) __field(unsigned int, remaining) __field(unsigned int, seg_no) __field(unsigned int, num_segs) - __string(device, rdma->sc_cm_id->device->name) - __string(addr, rdma->sc_xprt.xpt_remotebuf) ), TP_fast_assign( + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; __entry->remaining = remaining; __entry->seg_no = seg_no; __entry->num_segs = num_segs; - __assign_str(device, rdma->sc_cm_id->device->name); - __assign_str(addr, rdma->sc_xprt.xpt_remotebuf); ), - TP_printk("addr=%s device=%s remaining=%u seg_no=%u num_segs=%u", - __get_str(addr), __get_str(device), __entry->remaining, - __entry->seg_no, __entry->num_segs + TP_printk("cq.id=%u cid=%d remaining=%u seg_no=%u num_segs=%u", + __entry->cq_id, __entry->completion_id, + __entry->remaining, __entry->seg_no, __entry->num_segs ) ); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index b06e49cc55fb..c06676714417 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -80,7 +80,7 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) out_free: kfree(ctxt); out_noctx: - trace_svcrdma_no_rwctx_err(rdma, sges); + trace_svcrdma_rwctx_empty(rdma, sges); return NULL; } @@ -135,8 +135,9 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, ctxt->rw_sg_table.sgl, ctxt->rw_nents, 0, offset, handle, direction); if (unlikely(ret < 0)) { + trace_svcrdma_dma_map_rw_err(rdma, offset, handle, + ctxt->rw_nents, ret); svc_rdma_put_rw_ctxt(rdma, ctxt); - trace_svcrdma_dma_map_rw_err(rdma, ctxt->rw_nents, ret); } return ret; } @@ -526,7 +527,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, return 0; out_overflow: - trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no, + trace_svcrdma_small_wrch_err(&cc->cc_cid, remaining, info->wi_seg_no, info->wi_chunk->ch_segcount); return -E2BIG; } @@ -766,7 +767,7 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, return 0; out_overrun: - trace_svcrdma_page_overrun_err(cc->cc_rdma, rqstp, info->ri_pageno); + trace_svcrdma_page_overrun_err(&cc->cc_cid, info->ri_pageno); return -EINVAL; } -- cgit v1.2.3 From 2dd6e29a3ea86ce51404589fd99597cd4dd0cd41 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 27 Nov 2023 11:33:50 -0500 Subject: svcrdma: Update some svcrdma DMA-related tracepoints A send/recv_ctxt already records transport-related information in the cq.id, thus there is no need to record the IP addresses of the transport endpoints. 
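A sketch of the resulting call sites (editorial illustration; the
identifiers match the sendto.c hunks below):

    /* DMA map/unmap events are now keyed by the send ctxt's
     * completion ID rather than the transport's endpoint addresses
     */
    trace_svcrdma_dma_map_page(&ctxt->sc_cid, dma_addr, len);
    trace_svcrdma_dma_map_err(&ctxt->sc_cid, dma_addr, len);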
Signed-off-by: Chuck Lever --- include/trace/events/rpcrdma.h | 21 +++++++++++---------- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 10 +++++----- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index bba758e5fb1d..9a3fc6eb09a8 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1762,29 +1762,29 @@ DEFINE_ERROR_EVENT(chunk); DECLARE_EVENT_CLASS(svcrdma_dma_map_class, TP_PROTO( - const struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, u64 dma_addr, u32 length ), - TP_ARGS(rdma, dma_addr, length), + TP_ARGS(cid, dma_addr, length), TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) __field(u64, dma_addr) __field(u32, length) - __string(device, rdma->sc_cm_id->device->name) - __string(addr, rdma->sc_xprt.xpt_remotebuf) ), TP_fast_assign( + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; __entry->dma_addr = dma_addr; __entry->length = length; - __assign_str(device, rdma->sc_cm_id->device->name); - __assign_str(addr, rdma->sc_xprt.xpt_remotebuf); ), - TP_printk("addr=%s device=%s dma_addr=%llu length=%u", - __get_str(addr), __get_str(device), + TP_printk("cq.id=%u cid=%d dma_addr=%llu length=%u", + __entry->cq_id, __entry->completion_id, __entry->dma_addr, __entry->length ) ); @@ -1792,11 +1792,12 @@ DECLARE_EVENT_CLASS(svcrdma_dma_map_class, #define DEFINE_SVC_DMA_EVENT(name) \ DEFINE_EVENT(svcrdma_dma_map_class, svcrdma_##name, \ TP_PROTO( \ - const struct svcxprt_rdma *rdma,\ + const struct rpc_rdma_cid *cid, \ u64 dma_addr, \ u32 length \ ), \ - TP_ARGS(rdma, dma_addr, length)) + TP_ARGS(cid, dma_addr, length) \ + ) DEFINE_SVC_DMA_EVENT(dma_map_page); DEFINE_SVC_DMA_EVENT(dma_map_err); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 2ee691c45b85..9571ed4a74d4 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -237,13 +237,13 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, * remains mapped until @ctxt is destroyed. */ for (i = 1; i < ctxt->sc_send_wr.num_sge; i++) { + trace_svcrdma_dma_unmap_page(&ctxt->sc_cid, + ctxt->sc_sges[i].addr, + ctxt->sc_sges[i].length); ib_dma_unmap_page(device, ctxt->sc_sges[i].addr, ctxt->sc_sges[i].length, DMA_TO_DEVICE); - trace_svcrdma_dma_unmap_page(rdma, - ctxt->sc_sges[i].addr, - ctxt->sc_sges[i].length); } llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts); @@ -550,14 +550,14 @@ static int svc_rdma_page_dma_map(void *data, struct page *page, if (ib_dma_mapping_error(dev, dma_addr)) goto out_maperr; - trace_svcrdma_dma_map_page(rdma, dma_addr, len); + trace_svcrdma_dma_map_page(&ctxt->sc_cid, dma_addr, len); ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr; ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len; ctxt->sc_send_wr.num_sge++; return 0; out_maperr: - trace_svcrdma_dma_map_err(rdma, dma_addr, len); + trace_svcrdma_dma_map_err(&ctxt->sc_cid, dma_addr, len); return -EIO; } -- cgit v1.2.3 From 5ef6c666764151095346c18966b8720146a33719 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:56:24 -0500 Subject: svcrdma: Reduce size of struct svc_rdma_rw_ctxt SG_CHUNK_SIZE is 128, making struct svc_rdma_rw_ctxt + the first SGL array more than 4200 bytes in length, pushing the memory allocation well into order 1. Even so, the RDMA rw core doesn't seem to use more than max_send_sge entries in that array (typically 32 or less), so that is all wasted space. 
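The sizing arithmetic, sketched editorially (this assumes a 64-bit
build where sizeof(struct scatterlist) is 32 bytes; the exact figure
varies with config options):

    /*
     * SG_CHUNK_SIZE * sizeof(struct scatterlist)
     *     = 128 * 32 = 4096 bytes for the inline SGL alone;
     * struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE) therefore
     * lands past 4096 bytes (~4200), an order-1 allocation.
     *
     * With first_sgl_nents = dev->attrs.max_send_sge (say, 32):
     *     32 * 32 = 1024 bytes, comfortably order 0.
     */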
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index c06676714417..69010ab7f0c3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -39,6 +39,7 @@ struct svc_rdma_rw_ctxt { struct list_head rw_list; struct rdma_rw_ctx rw_ctx; unsigned int rw_nents; + unsigned int rw_first_sgl_nents; struct sg_table rw_sg_table; struct scatterlist rw_first_sgl[]; }; @@ -53,6 +54,8 @@ svc_rdma_next_ctxt(struct list_head *list) static struct svc_rdma_rw_ctxt * svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) { + struct ib_device *dev = rdma->sc_cm_id->device; + unsigned int first_sgl_nents = dev->attrs.max_send_sge; struct svc_rdma_rw_ctxt *ctxt; struct llist_node *node; @@ -62,18 +65,19 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) if (node) { ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), - GFP_KERNEL, ibdev_to_node(rdma->sc_cm_id->device)); + ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents), + GFP_KERNEL, ibdev_to_node(dev)); if (!ctxt) goto out_noctx; INIT_LIST_HEAD(&ctxt->rw_list); + ctxt->rw_first_sgl_nents = first_sgl_nents; } ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, ctxt->rw_sg_table.sgl, - SG_CHUNK_SIZE)) + first_sgl_nents)) goto out_free; return ctxt; @@ -87,7 +91,7 @@ out_noctx: static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt, struct llist_head *list) { - sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); + sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents); llist_add(&ctxt->rw_node, list); } -- cgit v1.2.3 From c4fd9f452517402b7f8b768761961178f2b2098c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:56:31 -0500 Subject: svcrdma: Acquire the svcxprt_rdma pointer from the CQ context Enable the removal of the svc_rdma_chunk_ctxt::cc_rdma field in a subsequent patch. 
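This works because the server creates its CQs with the transport as
the CQ's private context at accept time. Roughly (editorial sketch,
not taken from the hunks below):

    newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt,
                                        newxprt->sc_sq_depth,
                                        IB_POLL_WORKQUEUE);

    /* ...so any completion handler can recover the transport: */
    struct svcxprt_rdma *rdma = cq->cq_context;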
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 69010ab7f0c3..6fa818dc5b11 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -278,10 +278,10 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) */ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) { + struct svcxprt_rdma *rdma = cq->cq_context; struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svcxprt_rdma *rdma = cc->cc_rdma; struct svc_rdma_write_info *info = container_of(cc, struct svc_rdma_write_info, wi_cc); @@ -345,6 +345,7 @@ static void svc_rdma_read_info_free(struct svc_rdma_read_info *info) */ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) { + struct svcxprt_rdma *rdma = cq->cq_context; struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); @@ -363,7 +364,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_read_err(wc, &cc->cc_cid); } - svc_rdma_wake_send_waiters(cc->cc_rdma, cc->cc_sqecount); + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); cc->cc_status = wc->status; complete(&cc->cc_done); return; -- cgit v1.2.3 From c3899b71072fb9b830918faa53745edc2f47d3a9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:56:37 -0500 Subject: svcrdma: Explicitly pass the transport into Write chunk I/O paths Enable the eventual removal of the svc_rdma_chunk_ctxt::cc_rdma field. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 6fa818dc5b11..ef2579141c33 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -220,6 +220,8 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, * - Stores arguments for the SGL constructor functions */ struct svc_rdma_write_info { + struct svcxprt_rdma *wi_rdma; + const struct svc_rdma_chunk *wi_chunk; /* write state of this chunk */ @@ -246,6 +248,7 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, if (!info) return info; + info->wi_rdma = rdma; info->wi_chunk = chunk; info->wi_seg_off = 0; info->wi_seg_no = 0; @@ -489,7 +492,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, unsigned int remaining) { struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; - struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svcxprt_rdma *rdma = info->wi_rdma; const struct svc_rdma_segment *seg; struct svc_rdma_rw_ctxt *ctxt; int ret; -- cgit v1.2.3 From 4a68edd93f5c79f3597d3c00642628599b62b2ef Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:56:44 -0500 Subject: svcrdma: Explicitly pass the transport into Read chunk I/O paths Enable the eventual removal of the svc_rdma_chunk_ctxt::cc_rdma field. 
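The shape of the change, as an editorial sketch of one representative
call site (the full set is in the hunks below):

    /* was: svc_rdma_build_read_segment(info, segment) */
    ret = svc_rdma_build_read_segment(rdma, info, segment);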
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 58 ++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index ef2579141c33..cda57a5f8ba0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -712,6 +712,7 @@ out_err: /** * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment + * @rdma: controlling transport * @info: context for ongoing I/O * @segment: co-ordinates of remote memory to be read * @@ -721,7 +722,8 @@ out_err: * %-ENOMEM: allocating a local resources failed * %-EIO: a DMA mapping error occurred */ -static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, +static int svc_rdma_build_read_segment(struct svcxprt_rdma *rdma, + struct svc_rdma_read_info *info, const struct svc_rdma_segment *segment) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; @@ -734,7 +736,7 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, len = segment->rs_length; sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; - ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no); + ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no); if (!ctxt) return -ENOMEM; ctxt->rw_nents = sge_no; @@ -764,7 +766,7 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, goto out_overrun; } - ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, segment->rs_offset, + ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset, segment->rs_handle, DMA_FROM_DEVICE); if (ret < 0) return -EIO; @@ -781,6 +783,7 @@ out_overrun: /** * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk + * @rdma: controlling transport * @info: context for ongoing I/O * @chunk: Read chunk to pull * @@ -790,7 +793,8 @@ out_overrun: * %-ENOMEM: allocating a local resources failed * %-EIO: a DMA mapping error occurred */ -static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info, +static int svc_rdma_build_read_chunk(struct svcxprt_rdma *rdma, + struct svc_rdma_read_info *info, const struct svc_rdma_chunk *chunk) { const struct svc_rdma_segment *segment; @@ -798,7 +802,7 @@ static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info, ret = -EINVAL; pcl_for_each_segment(segment, chunk) { - ret = svc_rdma_build_read_segment(info, segment); + ret = svc_rdma_build_read_segment(rdma, info, segment); if (ret < 0) break; info->ri_totalbytes += segment->rs_length; @@ -858,6 +862,7 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, /** * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks + * @rdma: controlling transport * @info: context for RDMA Reads * * The chunk data lands in rqstp->rq_arg as a series of contiguous pages, @@ -870,7 +875,8 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). 
*/ -static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info) +static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma, + struct svc_rdma_read_info *info) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; @@ -887,7 +893,7 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf return ret; pcl_for_each_chunk(chunk, pcl) { - ret = svc_rdma_build_read_chunk(info, chunk); + ret = svc_rdma_build_read_chunk(rdma, info, chunk); if (ret < 0) return ret; @@ -920,6 +926,7 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf /** * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks + * @rdma: controlling transport * @info: context for RDMA Reads * * The chunk data lands in the page list of rqstp->rq_arg.pages. @@ -935,7 +942,8 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_read_data_item(struct svc_rdma_read_info *info) +static int svc_rdma_read_data_item(struct svcxprt_rdma *rdma, + struct svc_rdma_read_info *info) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; struct xdr_buf *buf = &info->ri_rqst->rq_arg; @@ -944,7 +952,7 @@ static int svc_rdma_read_data_item(struct svc_rdma_read_info *info) int ret; chunk = pcl_first_chunk(&head->rc_read_pcl); - ret = svc_rdma_build_read_chunk(info, chunk); + ret = svc_rdma_build_read_chunk(rdma, info, chunk); if (ret < 0) goto out; @@ -978,6 +986,7 @@ out: /** * svc_rdma_read_chunk_range - Build RDMA Read WQEs for portion of a chunk + * @rdma: controlling transport * @info: context for RDMA Reads * @chunk: parsed Call chunk to pull * @offset: offset of region to pull @@ -990,7 +999,8 @@ out: * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, +static int svc_rdma_read_chunk_range(struct svcxprt_rdma *rdma, + struct svc_rdma_read_info *info, const struct svc_rdma_chunk *chunk, unsigned int offset, unsigned int length) { @@ -1010,7 +1020,7 @@ static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, dummy.rs_length = min_t(u32, length, segment->rs_length) - offset; dummy.rs_offset = segment->rs_offset + offset; - ret = svc_rdma_build_read_segment(info, &dummy); + ret = svc_rdma_build_read_segment(rdma, info, &dummy); if (ret < 0) break; @@ -1023,6 +1033,7 @@ static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, /** * svc_rdma_read_call_chunk - Build RDMA Read WQEs to pull a Long Message + * @rdma: controlling transport * @info: context for RDMA Reads * * Return values: @@ -1032,7 +1043,8 @@ static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). 
*/ -static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) +static int svc_rdma_read_call_chunk(struct svcxprt_rdma *rdma, + struct svc_rdma_read_info *info) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; const struct svc_rdma_chunk *call_chunk = @@ -1043,17 +1055,17 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) int ret; if (pcl_is_empty(pcl)) - return svc_rdma_build_read_chunk(info, call_chunk); + return svc_rdma_build_read_chunk(rdma, info, call_chunk); start = 0; chunk = pcl_first_chunk(pcl); length = chunk->ch_position; - ret = svc_rdma_read_chunk_range(info, call_chunk, start, length); + ret = svc_rdma_read_chunk_range(rdma, info, call_chunk, start, length); if (ret < 0) return ret; pcl_for_each_chunk(chunk, pcl) { - ret = svc_rdma_build_read_chunk(info, chunk); + ret = svc_rdma_build_read_chunk(rdma, info, chunk); if (ret < 0) return ret; @@ -1063,7 +1075,7 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) start += length; length = next->ch_position - info->ri_totalbytes; - ret = svc_rdma_read_chunk_range(info, call_chunk, + ret = svc_rdma_read_chunk_range(rdma, info, call_chunk, start, length); if (ret < 0) return ret; @@ -1071,11 +1083,12 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) start += length; length = call_chunk->ch_length - start; - return svc_rdma_read_chunk_range(info, call_chunk, start, length); + return svc_rdma_read_chunk_range(rdma, info, call_chunk, start, length); } /** * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message + * @rdma: controlling transport * @info: context for RDMA Reads * * The start of the data lands in the first page just after the @@ -1092,12 +1105,13 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info) +static noinline int svc_rdma_read_special(struct svcxprt_rdma *rdma, + struct svc_rdma_read_info *info) { struct xdr_buf *buf = &info->ri_rqst->rq_arg; int ret; - ret = svc_rdma_read_call_chunk(info); + ret = svc_rdma_read_call_chunk(rdma, info); if (ret < 0) goto out; @@ -1156,11 +1170,11 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, if (pcl_is_empty(&head->rc_call_pcl)) { if (head->rc_read_pcl.cl_count == 1) - ret = svc_rdma_read_data_item(info); + ret = svc_rdma_read_data_item(rdma, info); else - ret = svc_rdma_read_multiple_chunks(info); + ret = svc_rdma_read_multiple_chunks(rdma, info); } else - ret = svc_rdma_read_special(info); + ret = svc_rdma_read_special(rdma, info); if (ret < 0) goto out_err; -- cgit v1.2.3 From 83fe6dd6a8165fa5f1a45b832e8fe76850888eff Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:56:50 -0500 Subject: svcrdma: Explicitly pass the transport to svc_rdma_post_chunk_ctxt() Enable the eventual removal of the svc_rdma_chunk_ctxt::cc_rdma field. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index cda57a5f8ba0..c0b64a79197e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -379,9 +379,9 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) * even if one or more WRs are flushed. 
This is true when posting * an rdma_rw_ctx or when posting a single signaled WR. */ -static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) +static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc) { - struct svcxprt_rdma *rdma = cc->cc_rdma; struct ib_send_wr *first_wr; const struct ib_send_wr *bad_wr; struct list_head *tmp; @@ -652,7 +652,7 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, goto out_err; trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(cc); + ret = svc_rdma_post_chunk_ctxt(rdma, cc); if (ret < 0) goto out_err; return xdr->len; @@ -699,7 +699,7 @@ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, goto out_err; trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(cc); + ret = svc_rdma_post_chunk_ctxt(rdma, cc); if (ret < 0) goto out_err; @@ -1180,7 +1180,7 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount); init_completion(&cc->cc_done); - ret = svc_rdma_post_chunk_ctxt(cc); + ret = svc_rdma_post_chunk_ctxt(rdma, cc); if (ret < 0) goto out_err; -- cgit v1.2.3 From bc8fd4e915130ebe515be4d852185d8e90c6111d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:56:57 -0500 Subject: svcrdma: Pass a pointer to the transport to svc_rdma_cc_release() Enable the eventual removal of the svc_rdma_chunk_ctxt::cc_rdma field. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index c0b64a79197e..7676b9df024b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -187,10 +187,10 @@ static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, * that only one atomic llist operation is needed to put them all * back on the free list. */ -static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, +static void svc_rdma_cc_release(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir) { - struct svcxprt_rdma *rdma = cc->cc_rdma; struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; LLIST_HEAD(free); @@ -262,7 +262,7 @@ static void svc_rdma_write_info_free_async(struct work_struct *work) struct svc_rdma_write_info *info; info = container_of(work, struct svc_rdma_write_info, wi_work); - svc_rdma_cc_release(&info->wi_cc, DMA_TO_DEVICE); + svc_rdma_cc_release(info->wi_rdma, &info->wi_cc, DMA_TO_DEVICE); kfree(info); } @@ -334,9 +334,10 @@ svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma) return info; } -static void svc_rdma_read_info_free(struct svc_rdma_read_info *info) +static void svc_rdma_read_info_free(struct svcxprt_rdma *rdma, + struct svc_rdma_read_info *info) { - svc_rdma_cc_release(&info->ri_cc, DMA_FROM_DEVICE); + svc_rdma_cc_release(rdma, &info->ri_cc, DMA_FROM_DEVICE); kfree(info); } @@ -1197,6 +1198,6 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, head->rc_page_count = 0; out_err: - svc_rdma_read_info_free(info); + svc_rdma_read_info_free(rdma, info); return ret; } -- cgit v1.2.3 From 2cc0f23b53050c047fe99ebe73c162268e8dd635 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:03 -0500 Subject: svcrdma: Remove the svc_rdma_chunk_ctxt::cc_rdma field In every instance, the pointer address in that field is now available by other means. 
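After this patch the chunk context reduces to the following (editorial
sketch assembled from the hunk below; every former cc_rdma user now
obtains the transport from its caller or from cq->cq_context):

    struct svc_rdma_chunk_ctxt {
            struct rpc_rdma_cid        cc_cid;
            struct ib_cqe              cc_cqe;
            struct list_head           cc_rwctxts;
            ktime_t                    cc_posttime;
            int                        cc_sqecount;
            enum ib_wc_status          cc_status;
            struct completion          cc_done;
    };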
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 7676b9df024b..cfa5973c9277 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -157,7 +157,6 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt { struct rpc_rdma_cid cc_cid; struct ib_cqe cc_cqe; - struct svcxprt_rdma *cc_rdma; struct list_head cc_rwctxts; ktime_t cc_posttime; int cc_sqecount; @@ -176,7 +175,6 @@ static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc) { svc_rdma_cc_cid_init(rdma, &cc->cc_cid); - cc->cc_rdma = rdma; INIT_LIST_HEAD(&cc->cc_rwctxts); cc->cc_sqecount = 0; -- cgit v1.2.3 From 6a04a4349330c5476adf465159a7f49411091bbe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:09 -0500 Subject: svcrdma: Move struct svc_rdma_chunk_ctxt to svc_rdma.h Prepare for nestling these into the send and recv ctxts so they no longer have to be allocated dynamically. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 15 +++++++++++++++ net/sunrpc/xprtrdma/svc_rdma_rw.c | 18 ------------------ 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index ab250017b99f..50c4f18a9b7f 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -127,6 +127,21 @@ enum { #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD +/* + * A chunk context tracks all I/O for moving one Read or Write + * chunk. This is a set of rdma_rw's that handle data movement + * for all segments of one chunk. + */ +struct svc_rdma_chunk_ctxt { + struct rpc_rdma_cid cc_cid; + struct ib_cqe cc_cqe; + struct list_head cc_rwctxts; + ktime_t cc_posttime; + int cc_sqecount; + enum ib_wc_status cc_status; + struct completion cc_done; +}; + struct svc_rdma_recv_ctxt { struct llist_node rc_node; struct list_head rc_list; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index cfa5973c9277..1de56e9fea91 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -146,24 +146,6 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, return ret; } -/* A chunk context tracks all I/O for moving one Read or Write - * chunk. This is a set of rdma_rw's that handle data movement - * for all segments of one chunk. - * - * These are small, acquired with a single allocator call, and - * no more than one is needed per chunk. They are allocated on - * demand, and not cached. - */ -struct svc_rdma_chunk_ctxt { - struct rpc_rdma_cid cc_cid; - struct ib_cqe cc_cqe; - struct list_head cc_rwctxts; - ktime_t cc_posttime; - int cc_sqecount; - enum ib_wc_status cc_status; - struct completion cc_done; -}; - static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma, struct rpc_rdma_cid *cid) { -- cgit v1.2.3 From b1818412d06fc03605d02dbdd4a7c53dc9e2d5ba Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:16 -0500 Subject: svcrdma: Start moving fields out of struct svc_rdma_read_info Since the request's svc_rdma_recv_ctxt will stay around for the duration of the RDMA Read operation, the contents of struct svc_rdma_read_info can reside in the request's svc_rdma_recv_ctxt rather than being allocated separately. This will eventually save a call to kmalloc() in a hot path. Start this clean-up by moving the Read chunk's svc_rdma_chunk_ctxt. 
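An editorial sketch of where the state lands (excerpted; the
svc_rdma.h hunk below is authoritative):

    struct svc_rdma_recv_ctxt {
            /* ...existing fields elided... */

            /* State for pulling a Read chunk */
            unsigned int               rc_readbytes;
            struct svc_rdma_chunk_ctxt rc_cc;

            /* ...remaining fields elided... */
    };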
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 4 +++ net/sunrpc/xprtrdma/svc_rdma_rw.c | 57 ++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 50c4f18a9b7f..6c7501ae4e29 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -156,6 +156,10 @@ struct svc_rdma_recv_ctxt { u32 rc_inv_rkey; __be32 rc_msgtype; + /* State for pulling a Read chunk */ + unsigned int rc_readbytes; + struct svc_rdma_chunk_ctxt rc_cc; + struct svc_rdma_pcl rc_call_pcl; struct svc_rdma_pcl rc_read_pcl; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 1de56e9fea91..a27b8f338ae5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -294,9 +294,6 @@ struct svc_rdma_read_info { struct svc_rdma_recv_ctxt *ri_readctxt; unsigned int ri_pageno; unsigned int ri_pageoff; - unsigned int ri_totalbytes; - - struct svc_rdma_chunk_ctxt ri_cc; }; static struct svc_rdma_read_info * @@ -304,20 +301,13 @@ svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma) { struct svc_rdma_read_info *info; - info = kmalloc_node(sizeof(*info), GFP_KERNEL, + return kmalloc_node(sizeof(*info), GFP_KERNEL, ibdev_to_node(rdma->sc_cm_id->device)); - if (!info) - return info; - - svc_rdma_cc_init(rdma, &info->ri_cc); - info->ri_cc.cc_cqe.done = svc_rdma_wc_read_done; - return info; } static void svc_rdma_read_info_free(struct svcxprt_rdma *rdma, struct svc_rdma_read_info *info) { - svc_rdma_cc_release(rdma, &info->ri_cc, DMA_FROM_DEVICE); kfree(info); } @@ -333,12 +323,12 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svc_rdma_read_info *info; + struct svc_rdma_recv_ctxt *ctxt; switch (wc->status) { case IB_WC_SUCCESS: - info = container_of(cc, struct svc_rdma_read_info, ri_cc); - trace_svcrdma_wc_read(wc, &cc->cc_cid, info->ri_totalbytes, + ctxt = container_of(cc, struct svc_rdma_recv_ctxt, rc_cc); + trace_svcrdma_wc_read(wc, &cc->cc_cid, ctxt->rc_readbytes, cc->cc_posttime); break; case IB_WC_WR_FLUSH_ERR: @@ -708,7 +698,7 @@ static int svc_rdma_build_read_segment(struct svcxprt_rdma *rdma, const struct svc_rdma_segment *segment) { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; - struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; + struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; struct svc_rqst *rqstp = info->ri_rqst; unsigned int sge_no, seg_len, len; struct svc_rdma_rw_ctxt *ctxt; @@ -778,6 +768,7 @@ static int svc_rdma_build_read_chunk(struct svcxprt_rdma *rdma, struct svc_rdma_read_info *info, const struct svc_rdma_chunk *chunk) { + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; const struct svc_rdma_segment *segment; int ret; @@ -786,7 +777,7 @@ static int svc_rdma_build_read_chunk(struct svcxprt_rdma *rdma, ret = svc_rdma_build_read_segment(rdma, info, segment); if (ret < 0) break; - info->ri_totalbytes += segment->rs_length; + head->rc_readbytes += segment->rs_length; } return ret; } @@ -828,7 +819,7 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, dst = page_address(rqstp->rq_pages[info->ri_pageno]); memcpy(dst + info->ri_pageno, src + offset, page_len); - info->ri_totalbytes += page_len; + head->rc_readbytes += page_len; info->ri_pageoff += page_len; if (info->ri_pageoff == PAGE_SIZE) { info->ri_pageno++; @@ -883,7 
+874,7 @@ static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma, break; start += length; - length = next->ch_position - info->ri_totalbytes; + length = next->ch_position - head->rc_readbytes; ret = svc_rdma_copy_inline_range(info, start, length); if (ret < 0) return ret; @@ -895,13 +886,13 @@ static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma, if (ret < 0) return ret; - buf->len += info->ri_totalbytes; - buf->buflen += info->ri_totalbytes; + buf->len += head->rc_readbytes; + buf->buflen += head->rc_readbytes; buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); - buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, head->rc_readbytes); buf->pages = &info->ri_rqst->rq_pages[1]; - buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; + buf->page_len = head->rc_readbytes - buf->head[0].iov_len; return 0; } @@ -985,6 +976,7 @@ static int svc_rdma_read_chunk_range(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk, unsigned int offset, unsigned int length) { + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; const struct svc_rdma_segment *segment; int ret; @@ -1005,7 +997,7 @@ static int svc_rdma_read_chunk_range(struct svcxprt_rdma *rdma, if (ret < 0) break; - info->ri_totalbytes += dummy.rs_length; + head->rc_readbytes += dummy.rs_length; length -= dummy.rs_length; offset = 0; } @@ -1055,7 +1047,7 @@ static int svc_rdma_read_call_chunk(struct svcxprt_rdma *rdma, break; start += length; - length = next->ch_position - info->ri_totalbytes; + length = next->ch_position - head->rc_readbytes; ret = svc_rdma_read_chunk_range(rdma, info, call_chunk, start, length); if (ret < 0) @@ -1089,6 +1081,7 @@ static int svc_rdma_read_call_chunk(struct svcxprt_rdma *rdma, static noinline int svc_rdma_read_special(struct svcxprt_rdma *rdma, struct svc_rdma_read_info *info) { + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; struct xdr_buf *buf = &info->ri_rqst->rq_arg; int ret; @@ -1096,13 +1089,13 @@ static noinline int svc_rdma_read_special(struct svcxprt_rdma *rdma, if (ret < 0) goto out; - buf->len += info->ri_totalbytes; - buf->buflen += info->ri_totalbytes; + buf->len += head->rc_readbytes; + buf->buflen += head->rc_readbytes; buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); - buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, head->rc_readbytes); buf->pages = &info->ri_rqst->rq_pages[1]; - buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; + buf->page_len = head->rc_readbytes - buf->head[0].iov_len; out: return ret; @@ -1135,19 +1128,20 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head) { + struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; struct svc_rdma_read_info *info; - struct svc_rdma_chunk_ctxt *cc; int ret; info = svc_rdma_read_info_alloc(rdma); if (!info) return -ENOMEM; - cc = &info->ri_cc; info->ri_rqst = rqstp; info->ri_readctxt = head; info->ri_pageno = 0; info->ri_pageoff = 0; - info->ri_totalbytes = 0; + svc_rdma_cc_init(rdma, cc); + cc->cc_cqe.done = svc_rdma_wc_read_done; + head->rc_readbytes = 0; if (pcl_is_empty(&head->rc_call_pcl)) { if (head->rc_read_pcl.cl_count == 1) @@ -1178,6 +1172,7 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, head->rc_page_count = 0; out_err: + svc_rdma_cc_release(rdma, cc, DMA_FROM_DEVICE); svc_rdma_read_info_free(rdma, info); return ret; } 
-- cgit v1.2.3 From 8e122582680c6f8acd686a5a2af9c0e46fe90f2d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:22 -0500 Subject: svcrdma: Move svc_rdma_read_info::ri_pageno to struct svc_rdma_recv_ctxt Further clean up: move the page index field into svc_rdma_recv_ctxt. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_rw.c | 21 +++++++++------------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 6c7501ae4e29..0ea66f73bec2 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -157,6 +157,7 @@ struct svc_rdma_recv_ctxt { __be32 rc_msgtype; /* State for pulling a Read chunk */ + unsigned int rc_curpage; unsigned int rc_readbytes; struct svc_rdma_chunk_ctxt rc_cc; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index a27b8f338ae5..487acb192558 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -292,7 +292,6 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_read_info { struct svc_rqst *ri_rqst; struct svc_rdma_recv_ctxt *ri_readctxt; - unsigned int ri_pageno; unsigned int ri_pageoff; }; @@ -720,20 +719,18 @@ static int svc_rdma_build_read_segment(struct svcxprt_rdma *rdma, if (!info->ri_pageoff) head->rc_page_count++; - sg_set_page(sg, rqstp->rq_pages[info->ri_pageno], + sg_set_page(sg, rqstp->rq_pages[head->rc_curpage], seg_len, info->ri_pageoff); sg = sg_next(sg); info->ri_pageoff += seg_len; if (info->ri_pageoff == PAGE_SIZE) { - info->ri_pageno++; + head->rc_curpage++; info->ri_pageoff = 0; } len -= seg_len; - /* Safety check */ - if (len && - &rqstp->rq_pages[info->ri_pageno + 1] > rqstp->rq_page_end) + if (len && ((head->rc_curpage + 1) > ARRAY_SIZE(rqstp->rq_pages))) goto out_overrun; } @@ -748,7 +745,7 @@ static int svc_rdma_build_read_segment(struct svcxprt_rdma *rdma, return 0; out_overrun: - trace_svcrdma_page_overrun_err(&cc->cc_cid, info->ri_pageno); + trace_svcrdma_page_overrun_err(&cc->cc_cid, head->rc_curpage); return -EINVAL; } @@ -790,7 +787,7 @@ static int svc_rdma_build_read_chunk(struct svcxprt_rdma *rdma, * * Take a page at a time from rqstp->rq_pages and copy the inline * content from the Receive buffer into that page. Update - * info->ri_pageno and info->ri_pageoff so that the next RDMA Read + * head->rc_curpage and info->ri_pageoff so that the next RDMA Read * result will land contiguously with the copied content. 
* * Return values: @@ -816,13 +813,13 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, if (!info->ri_pageoff) head->rc_page_count++; - dst = page_address(rqstp->rq_pages[info->ri_pageno]); - memcpy(dst + info->ri_pageno, src + offset, page_len); + dst = page_address(rqstp->rq_pages[head->rc_curpage]); + memcpy(dst + head->rc_curpage, src + offset, page_len); head->rc_readbytes += page_len; info->ri_pageoff += page_len; if (info->ri_pageoff == PAGE_SIZE) { - info->ri_pageno++; + head->rc_curpage++; info->ri_pageoff = 0; } remaining -= page_len; @@ -1137,10 +1134,10 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, return -ENOMEM; info->ri_rqst = rqstp; info->ri_readctxt = head; - info->ri_pageno = 0; info->ri_pageoff = 0; svc_rdma_cc_init(rdma, cc); cc->cc_cqe.done = svc_rdma_wc_read_done; + head->rc_curpage = 0; head->rc_readbytes = 0; if (pcl_is_empty(&head->rc_call_pcl)) { -- cgit v1.2.3 From 919f6e790ab6cca772fa60c6006162c0a7ebbfc5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:28 -0500 Subject: svcrdma: Move read_info::ri_pageoff into struct svc_rdma_recv_ctxt Further clean up: move the starting byte offset field into svc_rdma_recv_ctxt. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_rw.c | 31 +++++++++++++++---------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 0ea66f73bec2..44a14eaf8c40 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -157,6 +157,7 @@ struct svc_rdma_recv_ctxt { __be32 rc_msgtype; /* State for pulling a Read chunk */ + unsigned int rc_pageoff; unsigned int rc_curpage; unsigned int rc_readbytes; struct svc_rdma_chunk_ctxt rc_cc; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 487acb192558..dbced8970779 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -292,7 +292,6 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_read_info { struct svc_rqst *ri_rqst; struct svc_rdma_recv_ctxt *ri_readctxt; - unsigned int ri_pageoff; }; static struct svc_rdma_read_info * @@ -705,7 +704,7 @@ static int svc_rdma_build_read_segment(struct svcxprt_rdma *rdma, int ret; len = segment->rs_length; - sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; + sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT; ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no); if (!ctxt) return -ENOMEM; @@ -714,19 +713,19 @@ static int svc_rdma_build_read_segment(struct svcxprt_rdma *rdma, sg = ctxt->rw_sg_table.sgl; for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { seg_len = min_t(unsigned int, len, - PAGE_SIZE - info->ri_pageoff); + PAGE_SIZE - head->rc_pageoff); - if (!info->ri_pageoff) + if (!head->rc_pageoff) head->rc_page_count++; sg_set_page(sg, rqstp->rq_pages[head->rc_curpage], - seg_len, info->ri_pageoff); + seg_len, head->rc_pageoff); sg = sg_next(sg); - info->ri_pageoff += seg_len; - if (info->ri_pageoff == PAGE_SIZE) { + head->rc_pageoff += seg_len; + if (head->rc_pageoff == PAGE_SIZE) { head->rc_curpage++; - info->ri_pageoff = 0; + head->rc_pageoff = 0; } len -= seg_len; @@ -787,7 +786,7 @@ static int svc_rdma_build_read_chunk(struct svcxprt_rdma *rdma, * * Take a page at a time from rqstp->rq_pages and copy the inline * content from the Receive buffer into that page. 
Update - * head->rc_curpage and info->ri_pageoff so that the next RDMA Read + * head->rc_curpage and head->rc_pageoff so that the next RDMA Read * result will land contiguously with the copied content. * * Return values: @@ -803,24 +802,24 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, struct svc_rqst *rqstp = info->ri_rqst; unsigned int page_no, numpages; - numpages = PAGE_ALIGN(info->ri_pageoff + remaining) >> PAGE_SHIFT; + numpages = PAGE_ALIGN(head->rc_pageoff + remaining) >> PAGE_SHIFT; for (page_no = 0; page_no < numpages; page_no++) { unsigned int page_len; page_len = min_t(unsigned int, remaining, - PAGE_SIZE - info->ri_pageoff); + PAGE_SIZE - head->rc_pageoff); - if (!info->ri_pageoff) + if (!head->rc_pageoff) head->rc_page_count++; dst = page_address(rqstp->rq_pages[head->rc_curpage]); memcpy(dst + head->rc_curpage, src + offset, page_len); head->rc_readbytes += page_len; - info->ri_pageoff += page_len; - if (info->ri_pageoff == PAGE_SIZE) { + head->rc_pageoff += page_len; + if (head->rc_pageoff == PAGE_SIZE) { head->rc_curpage++; - info->ri_pageoff = 0; + head->rc_pageoff = 0; } remaining -= page_len; offset += page_len; @@ -1134,9 +1133,9 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, return -ENOMEM; info->ri_rqst = rqstp; info->ri_readctxt = head; - info->ri_pageoff = 0; svc_rdma_cc_init(rdma, cc); cc->cc_cqe.done = svc_rdma_wc_read_done; + head->rc_pageoff = 0; head->rc_curpage = 0; head->rc_readbytes = 0; -- cgit v1.2.3 From fc20f19b4df4a46d1003d15d84148a117e8bdf5d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:35 -0500 Subject: svcrdma: Update synopsis of svc_rdma_build_read_segment() Since the RDMA Read I/O state is now contained in the recv_ctxt, svc_rdma_build_read_segment() can use the recv_ctxt to derive that information rather than the other way around. This removes one usage of the ri_readctxt field, enabling its removal in a subsequent patch. At the same time, the use of ri_rqst can similarly be replaced with a passed-in function parameter. Start with build_read_segment() because it is a common utility function at the bottom of the Read chunk path. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 7 +++++++ net/sunrpc/xprtrdma/svc_rdma_rw.c | 17 +++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 44a14eaf8c40..f03f9909fb97 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -116,6 +116,13 @@ struct svcxprt_rdma { /* sc_flags */ #define RDMAXPRT_CONN_PENDING 3 +static inline struct svcxprt_rdma *svc_rdma_rqst_rdma(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + + return container_of(xprt, struct svcxprt_rdma, sc_xprt); +} + /* * Default connection parameters */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index dbced8970779..c2d0e4bb454e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -681,8 +681,8 @@ out_err: /** * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment - * @rdma: controlling transport - * @info: context for ongoing I/O + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * @segment: co-ordinates of remote memory to be read * * Returns: @@ -691,13 +691,12 @@ out_err: * %-ENOMEM: allocating a local resources failed * %-EIO: a DMA mapping error occurred */ -static int svc_rdma_build_read_segment(struct svcxprt_rdma *rdma, - struct svc_rdma_read_info *info, +static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, const struct svc_rdma_segment *segment) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp); struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; - struct svc_rqst *rqstp = info->ri_rqst; unsigned int sge_no, seg_len, len; struct svc_rdma_rw_ctxt *ctxt; struct scatterlist *sg; @@ -770,7 +769,8 @@ static int svc_rdma_build_read_chunk(struct svcxprt_rdma *rdma, ret = -EINVAL; pcl_for_each_segment(segment, chunk) { - ret = svc_rdma_build_read_segment(rdma, info, segment); + ret = svc_rdma_build_read_segment(info->ri_rqst, + info->ri_readctxt, segment); if (ret < 0) break; head->rc_readbytes += segment->rs_length; @@ -989,7 +989,8 @@ static int svc_rdma_read_chunk_range(struct svcxprt_rdma *rdma, dummy.rs_length = min_t(u32, length, segment->rs_length) - offset; dummy.rs_offset = segment->rs_offset + offset; - ret = svc_rdma_build_read_segment(rdma, info, &dummy); + ret = svc_rdma_build_read_segment(info->ri_rqst, + info->ri_readctxt, &dummy); if (ret < 0) break; -- cgit v1.2.3 From 02e8fe1eca4c6ff5bf26718bb732379f3193534a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:41 -0500 Subject: svcrdma: Update synopsis of svc_rdma_build_read_chunk() Since the RDMA Read I/O state is now contained in the recv_ctxt, svc_rdma_build_read_chunk() can use that recv_ctxt to derive that information rather than the other way around. This removes another usage of the ri_readctxt field, enabling its removal in a subsequent patch. 
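The resulting loop, sketched for illustration (identifiers match the
hunks below):

    pcl_for_each_segment(segment, chunk) {
            ret = svc_rdma_build_read_segment(rqstp, head, segment);
            if (ret < 0)
                    break;
            head->rc_readbytes += segment->rs_length;
    }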
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index c2d0e4bb454e..b10341cd1df2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -749,8 +749,8 @@ out_overrun: /** * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk - * @rdma: controlling transport - * @info: context for ongoing I/O + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * @chunk: Read chunk to pull * * Return values: @@ -759,18 +759,16 @@ out_overrun: * %-ENOMEM: allocating a local resources failed * %-EIO: a DMA mapping error occurred */ -static int svc_rdma_build_read_chunk(struct svcxprt_rdma *rdma, - struct svc_rdma_read_info *info, +static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, const struct svc_rdma_chunk *chunk) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; const struct svc_rdma_segment *segment; int ret; ret = -EINVAL; pcl_for_each_segment(segment, chunk) { - ret = svc_rdma_build_read_segment(info->ri_rqst, - info->ri_readctxt, segment); + ret = svc_rdma_build_read_segment(rqstp, head, segment); if (ret < 0) break; head->rc_readbytes += segment->rs_length; @@ -861,7 +859,7 @@ static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma, return ret; pcl_for_each_chunk(chunk, pcl) { - ret = svc_rdma_build_read_chunk(rdma, info, chunk); + ret = svc_rdma_build_read_chunk(info->ri_rqst, head, chunk); if (ret < 0) return ret; @@ -920,7 +918,7 @@ static int svc_rdma_read_data_item(struct svcxprt_rdma *rdma, int ret; chunk = pcl_first_chunk(&head->rc_read_pcl); - ret = svc_rdma_build_read_chunk(rdma, info, chunk); + ret = svc_rdma_build_read_chunk(info->ri_rqst, head, chunk); if (ret < 0) goto out; @@ -1025,7 +1023,8 @@ static int svc_rdma_read_call_chunk(struct svcxprt_rdma *rdma, int ret; if (pcl_is_empty(pcl)) - return svc_rdma_build_read_chunk(rdma, info, call_chunk); + return svc_rdma_build_read_chunk(info->ri_rqst, head, + call_chunk); start = 0; chunk = pcl_first_chunk(pcl); @@ -1035,7 +1034,7 @@ static int svc_rdma_read_call_chunk(struct svcxprt_rdma *rdma, return ret; pcl_for_each_chunk(chunk, pcl) { - ret = svc_rdma_build_read_chunk(rdma, info, chunk); + ret = svc_rdma_build_read_chunk(info->ri_rqst, head, chunk); if (ret < 0) return ret; -- cgit v1.2.3 From c7eb4feb1b21dd08fa32f08ce165b9444b9bfee9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:48 -0500 Subject: svcrdma: Update synopsis of svc_rdma_read_chunk_range() Since the RDMA Read I/O state is now contained in the recv_ctxt, svc_rdma_build_read_chunk_range() can use that recv_ctxt to derive that information rather than the other way around. This removes another usage of the ri_readctxt field, enabling its removal in a subsequent patch. 
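The range-trimming trick at the heart of this function, sketched
editorially: per the hunks below, a scratch segment narrows each
RDMA Read to the requested window before it is built.

    struct svc_rdma_segment dummy;

    dummy.rs_length = min_t(u32, length, segment->rs_length) - offset;
    dummy.rs_offset = segment->rs_offset + offset;
    ret = svc_rdma_build_read_segment(rqstp, head, &dummy);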
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index b10341cd1df2..63546e495cb3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -951,9 +951,9 @@ out: } /** - * svc_rdma_read_chunk_range - Build RDMA Read WQEs for portion of a chunk - * @rdma: controlling transport - * @info: context for RDMA Reads + * svc_rdma_read_chunk_range - Build RDMA Read WRs for portion of a chunk + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * @chunk: parsed Call chunk to pull * @offset: offset of region to pull * @length: length of region to pull * * Return values: * %0: RDMA Read WQEs were successfully built * %-EINVAL: client provided too many chunks or segments, * %-ENOMEM: rdma_rw context pool was exhausted, * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_read_chunk_range(struct svcxprt_rdma *rdma, - struct svc_rdma_read_info *info, +static int svc_rdma_read_chunk_range(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, const struct svc_rdma_chunk *chunk, unsigned int offset, unsigned int length) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; const struct svc_rdma_segment *segment; int ret; @@ -987,8 +986,7 @@ static int svc_rdma_read_chunk_range(struct svcxprt_rdma *rdma, dummy.rs_length = min_t(u32, length, segment->rs_length) - offset; dummy.rs_offset = segment->rs_offset + offset; - ret = svc_rdma_build_read_segment(info->ri_rqst, - info->ri_readctxt, &dummy); + ret = svc_rdma_build_read_segment(rqstp, head, &dummy); if (ret < 0) break; @@ -1029,7 +1027,8 @@ static int svc_rdma_read_call_chunk(struct svcxprt_rdma *rdma, start = 0; chunk = pcl_first_chunk(pcl); length = chunk->ch_position; - ret = svc_rdma_read_chunk_range(rdma, info, call_chunk, start, length); + ret = svc_rdma_read_chunk_range(info->ri_rqst, head, call_chunk, + start, length); if (ret < 0) return ret; @@ -1044,15 +1043,16 @@ static int svc_rdma_read_call_chunk(struct svcxprt_rdma *rdma, start += length; length = next->ch_position - head->rc_readbytes; - ret = svc_rdma_read_chunk_range(rdma, info, call_chunk, - start, length); + ret = svc_rdma_read_chunk_range(info->ri_rqst, head, + call_chunk, start, length); if (ret < 0) return ret; } start += length; length = call_chunk->ch_length - start; - return svc_rdma_read_chunk_range(rdma, info, call_chunk, start, length); + return svc_rdma_read_chunk_range(info->ri_rqst, head, call_chunk, + start, length); } /** -- cgit v1.2.3 From 6e4b9b8643967e1a44d7b326da60b4ecb3a9b858 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:54 -0500 Subject: svcrdma: Update the synopsis of svc_rdma_read_data_item() Since the RDMA Read I/O state is now contained in the recv_ctxt, svc_rdma_read_data_item() can use that recv_ctxt to derive that information rather than the other way around. This removes another usage of the ri_readctxt field, enabling its removal in a subsequent patch. 
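One detail worth keeping in mind while reading the hunks below: the page list for a Read chunk is sized with xdr_align_size(), which rounds the chunk length up to the 4-octet XDR boundary (RFC 8166, s. 3.4.5.2). A sketch of that roundup, equivalent in effect to the kernel helper (the example_ name is illustrative):

    /* Round @n up to the XDR quad boundary; mirrors what the
     * kernel's xdr_align_size() computes (XDR_UNIT is 4).
     */
    static inline size_t example_xdr_align_size(size_t n)
    {
    	const size_t mask = 4 - 1;

    	return (n + mask) & ~mask;
    }

    /* example_xdr_align_size(5) == 8; a chunk length the client
     * already rounded up (e.g. 8) is unchanged.
     */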
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 63546e495cb3..1953f3983695 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -892,8 +892,8 @@ static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma, /** * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks - * @rdma: controlling transport - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * * The chunk data lands in the page list of rqstp->rq_arg.pages. * @@ -908,17 +908,16 @@ static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma, * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_read_data_item(struct svcxprt_rdma *rdma, - struct svc_rdma_read_info *info) +static int svc_rdma_read_data_item(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; - struct xdr_buf *buf = &info->ri_rqst->rq_arg; + struct xdr_buf *buf = &rqstp->rq_arg; struct svc_rdma_chunk *chunk; unsigned int length; int ret; chunk = pcl_first_chunk(&head->rc_read_pcl); - ret = svc_rdma_build_read_chunk(info->ri_rqst, head, chunk); + ret = svc_rdma_build_read_chunk(rqstp, head, chunk); if (ret < 0) goto out; @@ -940,7 +939,7 @@ static int svc_rdma_read_data_item(struct svcxprt_rdma *rdma, * Currently these chunks always start at page offset 0, * thus the rounded-up length never crosses a page boundary. */ - buf->pages = &info->ri_rqst->rq_pages[0]; + buf->pages = &rqstp->rq_pages[0]; length = xdr_align_size(chunk->ch_length); buf->page_len = length; buf->len += length; @@ -1141,7 +1140,7 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, if (pcl_is_empty(&head->rc_call_pcl)) { if (head->rc_read_pcl.cl_count == 1) - ret = svc_rdma_read_data_item(rdma, info); + ret = svc_rdma_read_data_item(rqstp, head); else ret = svc_rdma_read_multiple_chunks(rdma, info); } else -- cgit v1.2.3 From 6518204d2304239507236919f70ecf7ff324fe20 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:58:01 -0500 Subject: svcrdma: Update synopsis of svc_rdma_copy_inline_range() Since the RDMA Read I/O state is now contained in the recv_ctxt, svc_rdma_copy_inline_range() can use that recv_ctxt to derive the read_info rather than the other way around. This removes another usage of the ri_readctxt field, enabling its removal in a subsequent patch. 
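Before the diff, a short sketch of the page arithmetic svc_rdma_copy_inline_range() relies on (illustrative helper name; the worked numbers assume the common 4 KiB PAGE_SIZE):

    #include <linux/mm.h>	/* PAGE_ALIGN(), PAGE_SHIFT */

    /* Number of pages touched when copying @remaining bytes into a
     * buffer whose current page already holds @pageoff bytes.
     * Example with 4 KiB pages: pageoff = 3000 and remaining = 2000
     * give PAGE_ALIGN(5000) = 8192, i.e. two pages.
     */
    static unsigned int example_inline_copy_pages(unsigned int pageoff,
    					      unsigned int remaining)
    {
    	return PAGE_ALIGN(pageoff + remaining) >> PAGE_SHIFT;
    }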
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 1953f3983695..ec546fe094e8 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -778,7 +778,8 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, /** * svc_rdma_copy_inline_range - Copy part of the inline content into pages - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * @offset: offset into the Receive buffer of region to copy * @remaining: length of region to copy * @@ -791,13 +792,12 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, * %0: Inline content was successfully copied * %-EINVAL: offset or length was incorrect */ -static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, +static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, unsigned int offset, unsigned int remaining) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; unsigned char *dst, *src = head->rc_recv_buf; - struct svc_rqst *rqstp = info->ri_rqst; unsigned int page_no, numpages; numpages = PAGE_ALIGN(head->rc_pageoff + remaining) >> PAGE_SHIFT; @@ -846,7 +846,8 @@ static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma, { struct svc_rdma_recv_ctxt *head = info->ri_readctxt; const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; - struct xdr_buf *buf = &info->ri_rqst->rq_arg; + struct svc_rqst *rqstp = info->ri_rqst; + struct xdr_buf *buf = &rqstp->rq_arg; struct svc_rdma_chunk *chunk, *next; unsigned int start, length; int ret; @@ -854,7 +855,7 @@ static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma, start = 0; chunk = pcl_first_chunk(pcl); length = chunk->ch_position; - ret = svc_rdma_copy_inline_range(info, start, length); + ret = svc_rdma_copy_inline_range(rqstp, head, start, length); if (ret < 0) return ret; @@ -869,14 +870,14 @@ static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma, start += length; length = next->ch_position - head->rc_readbytes; - ret = svc_rdma_copy_inline_range(info, start, length); + ret = svc_rdma_copy_inline_range(rqstp, head, start, length); if (ret < 0) return ret; } start += length; length = head->rc_byte_len - start; - ret = svc_rdma_copy_inline_range(info, start, length); + ret = svc_rdma_copy_inline_range(rqstp, head, start, length); if (ret < 0) return ret; -- cgit v1.2.3 From 740a3c895d94fc72494f2a4abf1ab31b6be482e8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:58:07 -0500 Subject: svcrdma: Update synopsis of svc_rdma_read_multiple_chunks() Since the RDMA Read I/O state is now contained in the recv_ctxt, svc_rdma_read_multiple_chunks() can use that recv_ctxt to derive the read_info rather than the other way around. This removes another usage of the ri_readctxt field, enabling its removal in a subsequent patch. 
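The function changed below alternates between two sources while reassembling rq_arg. A layout sketch for a message carrying two Read chunks at XDR positions P1 < P2:

    /*
     * XDR offset:  0 ........ P1 ............. P2 ............ end
     *              [ inline ][ Read chunk 1 ][ inline ][ Read chunk 2 ][ inline ]
     *
     * Each inline span is filled by svc_rdma_copy_inline_range();
     * each Read chunk is pulled by svc_rdma_build_read_chunk().
     * head->rc_readbytes tracks how far reassembly has progressed,
     * which is why the gap lengths are computed against it.
     */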
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index ec546fe094e8..56a8e602706a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -828,8 +828,8 @@ static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp, /** * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks - * @rdma: controlling transport - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * * The chunk data lands in rqstp->rq_arg as a series of contiguous pages, * like an incoming TCP call. @@ -841,12 +841,11 @@ static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp, * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma, - struct svc_rdma_read_info *info) +static noinline int +svc_rdma_read_multiple_chunks(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; - struct svc_rqst *rqstp = info->ri_rqst; struct xdr_buf *buf = &rqstp->rq_arg; struct svc_rdma_chunk *chunk, *next; unsigned int start, length; @@ -860,7 +859,7 @@ static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma, return ret; pcl_for_each_chunk(chunk, pcl) { - ret = svc_rdma_build_read_chunk(info->ri_rqst, head, chunk); + ret = svc_rdma_build_read_chunk(rqstp, head, chunk); if (ret < 0) return ret; @@ -884,9 +883,9 @@ static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma, buf->len += head->rc_readbytes; buf->buflen += head->rc_readbytes; - buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); + buf->head[0].iov_base = page_address(rqstp->rq_pages[0]); buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, head->rc_readbytes); - buf->pages = &info->ri_rqst->rq_pages[1]; + buf->pages = &rqstp->rq_pages[1]; buf->page_len = head->rc_readbytes - buf->head[0].iov_len; return 0; } @@ -1143,7 +1142,7 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, if (head->rc_read_pcl.cl_count == 1) ret = svc_rdma_read_data_item(rqstp, head); else - ret = svc_rdma_read_multiple_chunks(rdma, info); + ret = svc_rdma_read_multiple_chunks(rqstp, head); } else ret = svc_rdma_read_special(rdma, info); if (ret < 0) -- cgit v1.2.3 From 23bab3b22d84793f1f096322d17952156be6f207 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:58:13 -0500 Subject: svcrdma: Update the synopsis of svc_rdma_read_call_chunk() Since the RDMA Read I/O state is now contained in the recv_ctxt, svc_rdma_read_call_chunk() can use that recv_ctxt to derive the read_info rather than the other way around. This removes another usage of the ri_readctxt field, enabling its removal in a subsequent patch. 
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 56a8e602706a..f9d1b0463282 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -998,8 +998,8 @@ static int svc_rdma_read_chunk_range(struct svc_rqst *rqstp, /** * svc_rdma_read_call_chunk - Build RDMA Read WQEs to pull a Long Message - * @rdma: controlling transport - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * * Return values: * %0: RDMA Read WQEs were successfully built @@ -1008,10 +1008,9 @@ static int svc_rdma_read_chunk_range(struct svc_rqst *rqstp, * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_read_call_chunk(struct svcxprt_rdma *rdma, - struct svc_rdma_read_info *info) +static int svc_rdma_read_call_chunk(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; const struct svc_rdma_chunk *call_chunk = pcl_first_chunk(&head->rc_call_pcl); const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; @@ -1020,19 +1019,18 @@ static int svc_rdma_read_call_chunk(struct svcxprt_rdma *rdma, int ret; if (pcl_is_empty(pcl)) - return svc_rdma_build_read_chunk(info->ri_rqst, head, - call_chunk); + return svc_rdma_build_read_chunk(rqstp, head, call_chunk); start = 0; chunk = pcl_first_chunk(pcl); length = chunk->ch_position; - ret = svc_rdma_read_chunk_range(info->ri_rqst, head, call_chunk, + ret = svc_rdma_read_chunk_range(rqstp, head, call_chunk, start, length); if (ret < 0) return ret; pcl_for_each_chunk(chunk, pcl) { - ret = svc_rdma_build_read_chunk(info->ri_rqst, head, chunk); + ret = svc_rdma_build_read_chunk(rqstp, head, chunk); if (ret < 0) return ret; @@ -1042,15 +1040,15 @@ static int svc_rdma_read_call_chunk(struct svcxprt_rdma *rdma, start += length; length = next->ch_position - head->rc_readbytes; - ret = svc_rdma_read_chunk_range(info->ri_rqst, head, - call_chunk, start, length); + ret = svc_rdma_read_chunk_range(rqstp, head, call_chunk, + start, length); if (ret < 0) return ret; } start += length; length = call_chunk->ch_length - start; - return svc_rdma_read_chunk_range(info->ri_rqst, head, call_chunk, + return svc_rdma_read_chunk_range(rqstp, head, call_chunk, start, length); } @@ -1080,7 +1078,7 @@ static noinline int svc_rdma_read_special(struct svcxprt_rdma *rdma, struct xdr_buf *buf = &info->ri_rqst->rq_arg; int ret; - ret = svc_rdma_read_call_chunk(rdma, info); + ret = svc_rdma_read_call_chunk(info->ri_rqst, info->ri_readctxt); if (ret < 0) goto out; -- cgit v1.2.3 From efd02cb0dda6fed89065bb9d6de77d9752e1f0c3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:58:20 -0500 Subject: svcrdma: Update the synopsis of svc_rdma_read_special() Since the RDMA Read I/O state is now contained in the recv_ctxt, svc_rdma_read_special() can use that recv_ctxt to derive the read_info rather than the other way around. This removes another usage of the ri_readctxt field, enabling its removal in a subsequent patch. 
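For orientation, the shape of rq_arg once svc_rdma_read_special() succeeds, per the hunk below: the pulled bytes are presented exactly like an inline receive, with the head iovec in the first page and the remainder in the page list.

    /*
     * rq_arg after a Long Message of rc_readbytes bytes is pulled:
     *
     *   head[0].iov_base = page_address(rq_pages[0])
     *   head[0].iov_len  = min(PAGE_SIZE, rc_readbytes)
     *   pages            = &rq_pages[1]
     *   page_len         = rc_readbytes - head[0].iov_len
     */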
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index f9d1b0463282..a3003c2dc0a2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -1054,8 +1054,8 @@ static int svc_rdma_read_call_chunk(struct svc_rqst *rqstp, /** * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message - * @rdma: controlling transport - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * * The start of the data lands in the first page just after the * Transport header, and the rest lands in rqstp->rq_arg.pages. @@ -1071,23 +1071,22 @@ static int svc_rdma_read_call_chunk(struct svc_rqst *rqstp, * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static noinline int svc_rdma_read_special(struct svcxprt_rdma *rdma, - struct svc_rdma_read_info *info) +static noinline int svc_rdma_read_special(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; - struct xdr_buf *buf = &info->ri_rqst->rq_arg; + struct xdr_buf *buf = &rqstp->rq_arg; int ret; - ret = svc_rdma_read_call_chunk(info->ri_rqst, info->ri_readctxt); + ret = svc_rdma_read_call_chunk(rqstp, head); if (ret < 0) goto out; buf->len += head->rc_readbytes; buf->buflen += head->rc_readbytes; - buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); + buf->head[0].iov_base = page_address(rqstp->rq_pages[0]); buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, head->rc_readbytes); - buf->pages = &info->ri_rqst->rq_pages[1]; + buf->pages = &rqstp->rq_pages[1]; buf->page_len = head->rc_readbytes - buf->head[0].iov_len; out: @@ -1142,7 +1141,7 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, else ret = svc_rdma_read_multiple_chunks(rqstp, head); } else - ret = svc_rdma_read_special(rdma, info); + ret = svc_rdma_read_special(rqstp, head); if (ret < 0) goto out_err; -- cgit v1.2.3 From 57666bbb4eaae187e1e08dd1a9a2db7bcb1fbf96 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:58:26 -0500 Subject: svcrdma: Remove struct svc_rdma_read_info The remaining fields of struct svc_rdma_read_info are no longer referenced. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index a3003c2dc0a2..0ccb21f1089e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -287,28 +287,6 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) svc_rdma_write_info_free(info); } -/* State for pulling a Read chunk. 
- */ -struct svc_rdma_read_info { - struct svc_rqst *ri_rqst; - struct svc_rdma_recv_ctxt *ri_readctxt; -}; - -static struct svc_rdma_read_info * -svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma) -{ - struct svc_rdma_read_info *info; - - return kmalloc_node(sizeof(*info), GFP_KERNEL, - ibdev_to_node(rdma->sc_cm_id->device)); -} - -static void svc_rdma_read_info_free(struct svcxprt_rdma *rdma, - struct svc_rdma_read_info *info) -{ - kfree(info); -} - /** * svc_rdma_wc_read_done - Handle completion of an RDMA Read ctx * @cq: controlling Completion Queue @@ -1121,14 +1099,8 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *head) { struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; - struct svc_rdma_read_info *info; int ret; - info = svc_rdma_read_info_alloc(rdma); - if (!info) - return -ENOMEM; - info->ri_rqst = rqstp; - info->ri_readctxt = head; svc_rdma_cc_init(rdma, cc); cc->cc_cqe.done = svc_rdma_wc_read_done; head->rc_pageoff = 0; @@ -1165,6 +1137,5 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, out_err: svc_rdma_cc_release(rdma, cc, DMA_FROM_DEVICE); - svc_rdma_read_info_free(rdma, info); return ret; } -- cgit v1.2.3 From 018f34051bc9f4908336b3fe9e52931bb8410ced Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:58:33 -0500 Subject: svcrdma: Move the svc_rdma_cc_init() call Now that the chunk_ctxt for Reads is no longer dynamically allocated it can be initialized once for the life of the object that contains it (struct svc_rdma_recv_ctxt). Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 ++ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 1 + net/sunrpc/xprtrdma/svc_rdma_rw.c | 11 ++++++++--- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index f03f9909fb97..051fefde8d51 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -211,6 +211,8 @@ extern int svc_rdma_recvfrom(struct svc_rqst *); /* svc_rdma_rw.c */ extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); +extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc); extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk, const struct xdr_buf *xdr); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 72374033bb2b..392a91dc8a99 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -156,6 +156,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->rc_recv_sge.length = rdma->sc_max_req_size; ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey; ctxt->rc_recv_buf = buffer; + svc_rdma_cc_init(rdma, &ctxt->rc_cc); return ctxt; fail2: diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 0ccb21f1089e..4d2db06ccfd2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -153,8 +153,13 @@ static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma, cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); } -static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, - struct svc_rdma_chunk_ctxt *cc) +/** + * svc_rdma_cc_init - Initialize an svc_rdma_chunk_ctxt + * @rdma: controlling transport instance + * @cc: svc_rdma_chunk_ctxt to be initialized + */ +void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc) { svc_rdma_cc_cid_init(rdma, &cc->cc_cid); @@ -1101,8 
+1106,8 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; int ret; - svc_rdma_cc_init(rdma, cc); cc->cc_cqe.done = svc_rdma_wc_read_done; + cc->cc_sqecount = 0; head->rc_pageoff = 0; head->rc_curpage = 0; head->rc_readbytes = 0; -- cgit v1.2.3 From 28ee0ec8948ac235327a1f5472fc032b308284a3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 11 Dec 2023 10:24:08 -0500 Subject: svcrdma: De-duplicate completion ID initialization helpers Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 24 ++++++++++++++++++++++++ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 7 ------- net/sunrpc/xprtrdma/svc_rdma_rw.c | 9 +-------- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 7 ------- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 051fefde8d51..46f2ce9f810b 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -134,6 +134,30 @@ enum { #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD +/** + * svc_rdma_recv_cid_init - Initialize a Receive Queue completion ID + * @rdma: controlling transport + * @cid: completion ID to initialize + */ +static inline void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_rq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + +/** + * svc_rdma_send_cid_init - Initialize a Send Queue completion ID + * @rdma: controlling transport + * @cid: completion ID to initialize + */ +static inline void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_sq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + /* * A chunk context tracks all I/O for moving one Read or Write * chunk. 
This is a set of rdma_rw's that handle data movement diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 392a91dc8a99..ac6351e292c5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -115,13 +115,6 @@ svc_rdma_next_recv_ctxt(struct list_head *list) rc_list); } -static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma, - struct rpc_rdma_cid *cid) -{ - cid->ci_queue_id = rdma->sc_rq_cq->res.id; - cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); -} - static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 4d2db06ccfd2..eab71f3867fa 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -146,13 +146,6 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, return ret; } -static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma, - struct rpc_rdma_cid *cid) -{ - cid->ci_queue_id = rdma->sc_sq_cq->res.id; - cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); -} - /** * svc_rdma_cc_init - Initialize an svc_rdma_chunk_ctxt * @rdma: controlling transport instance @@ -161,7 +154,7 @@ static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma, void svc_rdma_cc_init(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc) { - svc_rdma_cc_cid_init(rdma, &cc->cc_cid); + svc_rdma_send_cid_init(rdma, &cc->cc_cid); INIT_LIST_HEAD(&cc->cc_rwctxts); cc->cc_sqecount = 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 9571ed4a74d4..c9585e469ca8 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -113,13 +113,6 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); -static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, - struct rpc_rdma_cid *cid) -{ - cid->ci_queue_id = rdma->sc_sq_cq->res.id; - cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); -} - static struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { -- cgit v1.2.3 From 2a95ce479e681b35e385da0f1a6adf7c6240ddce Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 11 Dec 2023 10:24:15 -0500 Subject: svcrdma: Optimize svc_rdma_cc_init() The atomic_inc_return() in svc_rdma_send_cid_init() is expensive. Some svc_rdma_chunk_ctxt's now reside in long-lived container structures. They don't need a fresh completion ID for every I/O operation. 
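Two details make the optimization below safe, and both land in this same patch: the containing objects are now allocated with kzalloc_node() instead of kmalloc_node(), and atomic_inc_return() hands out IDs starting at 1, so a zero ci_completion_id reliably means "not yet assigned". The idiom, restated as a sketch (illustrative function name):

    static void example_cc_init_once(struct svcxprt_rdma *rdma,
    				 struct rpc_rdma_cid *cid)
    {
    	/* Lazy one-time assignment: depends on the containing
    	 * structure being zero-initialized and on 0 never being
    	 * a valid completion ID.
    	 */
    	if (unlikely(!cid->ci_completion_id))
    		svc_rdma_send_cid_init(rdma, cid);
    }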
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_rw.c | 9 +++++---- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index ac6351e292c5..38f01652dc6d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -123,7 +123,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) dma_addr_t addr; void *buffer; - ctxt = kmalloc_node(sizeof(*ctxt), GFP_KERNEL, node); + ctxt = kzalloc_node(sizeof(*ctxt), GFP_KERNEL, node); if (!ctxt) goto fail0; buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index eab71f3867fa..ff54bb268b7d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -154,7 +154,10 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, void svc_rdma_cc_init(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc) { - svc_rdma_send_cid_init(rdma, &cc->cc_cid); + struct rpc_rdma_cid *cid = &cc->cc_cid; + + if (unlikely(!cid->ci_completion_id)) + svc_rdma_send_cid_init(rdma, cid); INIT_LIST_HEAD(&cc->cc_rwctxts); cc->cc_sqecount = 0; @@ -221,15 +224,13 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, { struct svc_rdma_write_info *info; - info = kmalloc_node(sizeof(*info), GFP_KERNEL, + info = kzalloc_node(sizeof(*info), GFP_KERNEL, ibdev_to_node(rdma->sc_cm_id->device)); if (!info) return info; info->wi_rdma = rdma; info->wi_chunk = chunk; - info->wi_seg_off = 0; - info->wi_seg_no = 0; svc_rdma_cc_init(rdma, &info->wi_cc); info->wi_cc.cc_cqe.done = svc_rdma_write_done; return info; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index c9585e469ca8..1a49b7f02041 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -122,7 +122,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) void *buffer; int i; - ctxt = kmalloc_node(struct_size(ctxt, sc_sges, rdma->sc_max_send_sges), + ctxt = kzalloc_node(struct_size(ctxt, sc_sges, rdma->sc_max_send_sges), GFP_KERNEL, node); if (!ctxt) goto fail0; -- cgit v1.2.3 From 913cd7668f17b2b0dad044a179ee540d81eff41b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 11 Dec 2023 10:24:21 -0500 Subject: svcrdma: Remove pointer addresses shown in dprintk() There are a couple of dprintk() call sites in svc_rdma_accept() that show pointer addresses. These days, displayed pointer addresses are hashed and thus have little or no diagnostic value, especially for site administrators. 
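Background for "displayed pointer addresses are hashed": since v4.15 the kernel's vsprintf obscures plain "%p" output with a per-boot hash, so the removed lines could never show a real address anyway. A sketch of the two conversions ("%px" is shown only for contrast; it bypasses hashing and is discouraged outside of debugging):

    #include <linux/printk.h>

    static void example_show_pointer(const void *p)
    {
    	pr_info("obj=%p\n", p);		/* hashed; differs every boot */
    	pr_info("obj=%px\n", p);	/* raw address; leaks kernel layout */
    }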
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 3826da1c15f3..451814eb12b9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -457,8 +457,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.qp_type = IB_QPT_RC; qp_attr.send_cq = newxprt->sc_sq_cq; qp_attr.recv_cq = newxprt->sc_rq_cq; - dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n", - newxprt->sc_cm_id, newxprt->sc_pd); dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n", qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr); dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n", @@ -512,7 +510,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) } #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - dprintk("svcrdma: new connection %p accepted:\n", newxprt); + dprintk("svcrdma: new connection accepted on device %s:\n", dev->name); sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; -- cgit v1.2.3 From b918bfcf370c92ea3b82fa9bb3d017702b5fa4cb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 11 Dec 2023 10:24:28 -0500 Subject: svcrdma: Remove queue-shortening warnings These won't have much diagnostic value for site administrators. Since they can't be disabled, they become noise. What's more, the subsequent rdma_create_qp() call adjusts the Send Queue size (possibly downward) without warning, making the size reported by these pr_warns inaccurate. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 451814eb12b9..040d2ef6400c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -412,8 +412,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests + newxprt->sc_recv_batch; if (rq_depth > dev->attrs.max_qp_wr) { - pr_warn("svcrdma: reducing receive depth to %d\n", - dev->attrs.max_qp_wr); rq_depth = dev->attrs.max_qp_wr; newxprt->sc_recv_batch = 1; newxprt->sc_max_requests = rq_depth - 2; @@ -423,11 +421,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES); ctxts *= newxprt->sc_max_requests; newxprt->sc_sq_depth = rq_depth + ctxts; - if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) { - pr_warn("svcrdma: reducing send depth to %d\n", - dev->attrs.max_qp_wr); + if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; - } atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); newxprt->sc_pd = ib_alloc_pd(dev, 0); -- cgit v1.2.3 From fc2e69db82c1ac506cd7f539a3ab66d51d3380dc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 11 Dec 2023 10:24:34 -0500 Subject: svcrdma: Clean up comment in svc_rdma_accept() The comment that starts "Qualify ..." applies to only some of the following code paragraph. Re-arrange the lines so the comment makes more sense. 
Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 040d2ef6400c..8127c711fa3b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -397,18 +397,22 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dev = newxprt->sc_cm_id->device; newxprt->sc_port_num = newxprt->sc_cm_id->port_num; - /* Qualify the transport resource defaults with the - * capabilities of this particular device */ + newxprt->sc_max_req_size = svcrdma_max_req_size; + newxprt->sc_max_requests = svcrdma_max_requests; + newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; + newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH; + newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); + + /* Qualify the transport's resource defaults with the + * capabilities of this particular device. + */ + /* Transport header, head iovec, tail iovec */ newxprt->sc_max_send_sges = 3; /* Add one SGE per page list entry */ newxprt->sc_max_send_sges += (svcrdma_max_req_size / PAGE_SIZE) + 1; if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) newxprt->sc_max_send_sges = dev->attrs.max_send_sge; - newxprt->sc_max_req_size = svcrdma_max_req_size; - newxprt->sc_max_requests = svcrdma_max_requests; - newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; - newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH; rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests + newxprt->sc_recv_batch; if (rq_depth > dev->attrs.max_qp_wr) { @@ -417,7 +421,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_requests = rq_depth - 2; newxprt->sc_max_bc_requests = 2; } - newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES); ctxts *= newxprt->sc_max_requests; newxprt->sc_sq_depth = rq_depth + ctxts; -- cgit v1.2.3 From 4d9d69db898d05bd063548eee65d16a020676fec Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Dec 2023 17:31:48 -0500 Subject: svcrdma: Add back svc_rdma_recv_ctxt::rc_pages Having an nfsd thread waiting for an RDMA Read completion is problematic if the Read responder (the client) stops responding. We need to go back to handling RDMA Reads by allowing the nfsd thread to return to the svc scheduler, then waking a second thread to finish the RPC message once the Read completion fires. To start with, restore the rc_pages field so that RDMA Read pages can be managed across calls to svc_rdma_recvfrom(). 
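The restored field reintroduces a page-ownership handoff that the rest of this sub-series builds on; piecing together the hunks that follow, the lifecycle is:

    /*
     * rc_pages ownership across an RDMA Read:
     *
     * 1. Reads posted:    payload pages are held in rc_pages and
     *                     counted by rc_page_count.
     * 2. Reads complete:  the pages move into rqstp->rq_pages and
     *                     rc_page_count is reset to 0.
     * 3. Error/teardown:  svc_rdma_recv_ctxt_put() calls
     *                     release_pages(rc_pages, rc_page_count)
     *                     on whatever is left behind.
     */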
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 4 +++- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 5 +++++ net/sunrpc/xprtrdma/svc_rdma_rw.c | 4 +++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 46f2ce9f810b..0f2d7f68ef5d 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -183,7 +183,6 @@ struct svc_rdma_recv_ctxt { void *rc_recv_buf; struct xdr_stream rc_stream; u32 rc_byte_len; - unsigned int rc_page_count; u32 rc_inv_rkey; __be32 rc_msgtype; @@ -199,6 +198,9 @@ struct svc_rdma_recv_ctxt { struct svc_rdma_chunk *rc_cur_result_payload; struct svc_rdma_pcl rc_write_pcl; struct svc_rdma_pcl rc_reply_pcl; + + unsigned int rc_page_count; + struct page *rc_pages[RPCSVC_MAXPAGES]; }; struct svc_rdma_send_ctxt { diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 38f01652dc6d..e363cb1bdbc4 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -214,6 +214,11 @@ struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt) { + /* @rc_page_count is normally zero here, but error flows + * can leave pages in @rc_pages. + */ + release_pages(ctxt->rc_pages, ctxt->rc_page_count); + pcl_free(&ctxt->rc_call_pcl); pcl_free(&ctxt->rc_read_pcl); pcl_free(&ctxt->rc_write_pcl); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index ff54bb268b7d..28a34718dee5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -1131,7 +1131,9 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count]; rqstp->rq_next_page = rqstp->rq_respages + 1; - /* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */ + /* Ensure svc_rdma_recv_ctxt_put() does not release pages + * left in @rc_pages while I/O proceeds. + */ head->rc_page_count = 0; out_err: -- cgit v1.2.3 From a937693a82fd2211c5e52b638959d1486a77d16a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Dec 2023 17:31:54 -0500 Subject: svcrdma: Add back svcxprt_rdma::sc_read_complete_q Having an nfsd thread waiting for an RDMA Read completion is problematic if the Read responder (ie, the client) stops responding. We need to go back to handling RDMA Reads by allowing the nfsd thread to return to the svc scheduler, then waking a second thread to finish the RPC message once the Read completion fires. As a next step, add a list_head upon which completed Reads are queued. A subsequent patch will make use of this queue. 
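The new list turns Read completion into a producer/consumer handoff under sc_rq_dto_lock. A sketch of the producer side as it is eventually wired up at the end of this series (the enqueue lines appear verbatim in the final patch; the example_ function name is illustrative):

    static void example_queue_completed_read(struct svcxprt_rdma *rdma,
    					 struct svc_rdma_recv_ctxt *ctxt)
    {
    	spin_lock(&rdma->sc_rq_dto_lock);
    	list_add_tail(&ctxt->rc_list, &rdma->sc_read_complete_q);
    	set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
    	spin_unlock(&rdma->sc_rq_dto_lock);
    	svc_xprt_enqueue(&rdma->sc_xprt);	/* wake a server thread */
    }

On the consumer side, svc_rdma_recvfrom() drains sc_read_complete_q ahead of sc_rq_dto_q so that finished Reads become complete RPC Calls promptly.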
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 37 +++++++++++++++++++++++++++++++- net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 + 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 0f2d7f68ef5d..c98d29e51b9c 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -98,6 +98,7 @@ struct svcxprt_rdma { u32 sc_pending_recvs; u32 sc_recv_batch; struct list_head sc_rq_dto_q; + struct list_head sc_read_complete_q; spinlock_t sc_rq_dto_lock; struct ib_qp *sc_qp; struct ib_cq *sc_rq_cq; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index e363cb1bdbc4..2de947183a7a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -382,6 +382,10 @@ void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; + while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) { + list_del(&ctxt->rc_list); + svc_rdma_recv_ctxt_put(rdma, ctxt); + } while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) { list_del(&ctxt->rc_list); svc_rdma_recv_ctxt_put(rdma, ctxt); @@ -763,6 +767,30 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, return true; } +static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + int i; + + /* Transfer the Read chunk pages into @rqstp.rq_pages, replacing + * the rq_pages that were already allocated for this rqstp. + */ + release_pages(rqstp->rq_respages, ctxt->rc_page_count); + for (i = 0; i < ctxt->rc_page_count; i++) + rqstp->rq_pages[i] = ctxt->rc_pages[i]; + + /* Update @rqstp's result send buffer to start after the + * last page in the RDMA Read payload. + */ + rqstp->rq_respages = &rqstp->rq_pages[ctxt->rc_page_count]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + /* Prevent svc_rdma_recv_ctxt_put() from releasing the + * pages in ctxt::rc_pages a second time. 
+ */ + ctxt->rc_page_count = 0; +} + /** * svc_rdma_recvfrom - Receive an RPC call * @rqstp: request structure into which to receive an RPC Call @@ -807,8 +835,14 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; - ctxt = NULL; spin_lock(&rdma_xprt->sc_rq_dto_lock); + ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q); + if (ctxt) { + list_del(&ctxt->rc_list); + spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + svc_rdma_read_complete(rqstp, ctxt); + goto complete; + } ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q); if (ctxt) list_del(&ctxt->rc_list); @@ -846,6 +880,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_readfail; } +complete: rqstp->rq_xprt_ctxt = ctxt; rqstp->rq_prot = IPPROTO_MAX; svc_xprt_copy_addrs(rqstp, xprt); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 8127c711fa3b..4f27325ace4a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -137,6 +137,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); + INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); init_llist_head(&cma_xprt->sc_send_ctxts); init_llist_head(&cma_xprt->sc_recv_ctxts); init_llist_head(&cma_xprt->sc_rw_ctxts); -- cgit v1.2.3 From ecba85e951c178e3fe7cea04eebf1035e8168f93 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Dec 2023 17:32:01 -0500 Subject: svcrdma: Copy construction of svc_rqst::rq_arg to rdma_read_complete() Once a set of RDMA Reads are complete, the Read completion handler will poke the transport to trigger a second call to svc_rdma_recvfrom(). recvfrom() will then merge the RDMA Read payloads with the previously received RPC header to form a completed RPC Call message. The new code is copied from the svc_rdma_process_read_list() path. A subsequent patch will make use of this code and remove the code that this was copied from (svc_rdma_rw.c). Signed-off-by: Chuck Lever --- include/trace/events/rpcrdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 93 ++++++++++++++++++++++++++++++++- 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 9a3fc6eb09a8..110c1475c527 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -2112,6 +2112,7 @@ TRACE_EVENT(svcrdma_wc_read, DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_read_flush); DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_read_err); +DEFINE_SIMPLE_CID_EVENT(svcrdma_read_finished); DEFINE_SIMPLE_CID_EVENT(svcrdma_wc_write); DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_write_flush); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 2de947183a7a..034bdd02f925 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -767,10 +767,86 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, return true; } +/* Finish constructing the RPC Call message in rqstp::rq_arg. + * + * The incoming RPC/RDMA message is an RDMA_MSG type message + * with a single Read chunk (only the upper layer data payload + * was conveyed via RDMA Read). 
+ */ +static void svc_rdma_read_complete_one(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + struct svc_rdma_chunk *chunk = pcl_first_chunk(&ctxt->rc_read_pcl); + struct xdr_buf *buf = &rqstp->rq_arg; + unsigned int length; + + /* Split the Receive buffer between the head and tail + * buffers at Read chunk's position. XDR roundup of the + * chunk is not included in either the pagelist or in + * the tail. + */ + buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position; + buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position; + buf->head[0].iov_len = chunk->ch_position; + + /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). + * + * If the client already rounded up the chunk length, the + * length does not change. Otherwise, the length of the page + * list is increased to include XDR round-up. + * + * Currently these chunks always start at page offset 0, + * thus the rounded-up length never crosses a page boundary. + */ + buf->pages = &rqstp->rq_pages[0]; + length = xdr_align_size(chunk->ch_length); + buf->page_len = length; + buf->len += length; + buf->buflen += length; +} + +/* Finish constructing the RPC Call message in rqstp::rq_arg. + * + * The incoming RPC/RDMA message is an RDMA_MSG type message + * with payload in multiple Read chunks and no PZRC. + */ +static void svc_rdma_read_complete_multiple(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + struct xdr_buf *buf = &rqstp->rq_arg; + + buf->len += ctxt->rc_readbytes; + buf->buflen += ctxt->rc_readbytes; + + buf->head[0].iov_base = page_address(rqstp->rq_pages[0]); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, ctxt->rc_readbytes); + buf->pages = &rqstp->rq_pages[1]; + buf->page_len = ctxt->rc_readbytes - buf->head[0].iov_len; +} + +/* Finish constructing the RPC Call message in rqstp::rq_arg. + * + * The incoming RPC/RDMA message is an RDMA_NOMSG type message + * (the RPC message body was conveyed via RDMA Read). + */ +static void svc_rdma_read_complete_pzrc(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + struct xdr_buf *buf = &rqstp->rq_arg; + + buf->len += ctxt->rc_readbytes; + buf->buflen += ctxt->rc_readbytes; + + buf->head[0].iov_base = page_address(rqstp->rq_pages[0]); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, ctxt->rc_readbytes); + buf->pages = &rqstp->rq_pages[1]; + buf->page_len = ctxt->rc_readbytes - buf->head[0].iov_len; +} + static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *ctxt) { - int i; + unsigned int i; /* Transfer the Read chunk pages into @rqstp.rq_pages, replacing * the rq_pages that were already allocated for this rqstp. @@ -789,6 +865,21 @@ static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp, * pages in ctxt::rc_pages a second time. */ ctxt->rc_page_count = 0; + + /* Finish constructing the RPC Call message. The exact + * procedure for that depends on what kind of RPC/RDMA + * chunks were provided by the client. 
+ */ + if (pcl_is_empty(&ctxt->rc_call_pcl)) { + if (ctxt->rc_read_pcl.cl_count == 1) + svc_rdma_read_complete_one(rqstp, ctxt); + else + svc_rdma_read_complete_multiple(rqstp, ctxt); + } else { + svc_rdma_read_complete_pzrc(rqstp, ctxt); + } + + trace_svcrdma_read_finished(&ctxt->rc_cid); } /** -- cgit v1.2.3 From d3dba534100d4e9eb7a5204be97cd6f9ada2066e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Dec 2023 17:32:07 -0500 Subject: svcrdma: Implement multi-stage Read completion again Having an nfsd thread waiting for an RDMA Read completion is problematic if the Read responder (ie, the client) stops responding. We need to go back to handling RDMA Reads by getting the svc scheduler to call svc_rdma_recvfrom() a second time to finish building an RPC message after a Read completion. This is the final patch, and makes several changes that have to happen concurrently: 1. svc_rdma_process_read_list no longer waits for a completion, but simply builds and posts the Read WRs. 2. svc_rdma_read_done() now queues a completed Read on sc_read_complete_q for later processing rather than calling complete(). 3. The completed RPC message is no longer built in the svc_rdma_process_read_list() path. Finishing the message is now done in svc_rdma_recvfrom() when it notices work on the sc_read_complete_q. The "finish building this RPC message" code is removed from the svc_rdma_process_read_list() path. This arrangement avoids the need for an nfsd thread to wait for an RDMA Read non-interruptibly without a timeout. It's basically the same code structure that Tom Tucker used for Read chunks along with some clean-up and modernization. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 6 +- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 36 +++++--- net/sunrpc/xprtrdma/svc_rdma_rw.c | 151 +++++++++++--------------------- 3 files changed, 80 insertions(+), 113 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index c98d29e51b9c..e7595ae62fe2 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -170,8 +170,6 @@ struct svc_rdma_chunk_ctxt { struct list_head cc_rwctxts; ktime_t cc_posttime; int cc_sqecount; - enum ib_wc_status cc_status; - struct completion cc_done; }; struct svc_rdma_recv_ctxt { @@ -191,6 +189,7 @@ struct svc_rdma_recv_ctxt { unsigned int rc_pageoff; unsigned int rc_curpage; unsigned int rc_readbytes; + struct xdr_buf rc_saved_arg; struct svc_rdma_chunk_ctxt rc_cc; struct svc_rdma_pcl rc_call_pcl; @@ -240,6 +239,9 @@ extern int svc_rdma_recvfrom(struct svc_rqst *); extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc); +extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir); extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk, const struct xdr_buf *xdr); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 034bdd02f925..d72953f29258 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -214,6 +214,8 @@ struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt) { + svc_rdma_cc_release(rdma, &ctxt->rc_cc, DMA_FROM_DEVICE); + /* @rc_page_count is normally zero here, but error flows * can leave pages in 
@rc_pages. */ @@ -870,6 +872,7 @@ static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp, * procedure for that depends on what kind of RPC/RDMA * chunks were provided by the client. */ + rqstp->rq_arg = ctxt->rc_saved_arg; if (pcl_is_empty(&ctxt->rc_call_pcl)) { if (ctxt->rc_read_pcl.cl_count == 1) svc_rdma_read_complete_one(rqstp, ctxt); @@ -930,7 +933,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q); if (ctxt) { list_del(&ctxt->rc_list); - spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); + svc_xprt_received(xprt); svc_rdma_read_complete(rqstp, ctxt); goto complete; } @@ -965,11 +969,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) svc_rdma_get_inv_rkey(rdma_xprt, ctxt); if (!pcl_is_empty(&ctxt->rc_read_pcl) || - !pcl_is_empty(&ctxt->rc_call_pcl)) { - ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt); - if (ret < 0) - goto out_readfail; - } + !pcl_is_empty(&ctxt->rc_call_pcl)) + goto out_readlist; complete: rqstp->rq_xprt_ctxt = ctxt; @@ -983,12 +984,23 @@ out_err: svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; -out_readfail: - if (ret == -EINVAL) - svc_rdma_send_error(rdma_xprt, ctxt, ret); - svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); - svc_xprt_deferred_close(xprt); - return -ENOTCONN; +out_readlist: + /* This @rqstp is about to be recycled. Save the work + * already done constructing the Call message in rq_arg + * so it can be restored when the RDMA Reads have + * completed. + */ + ctxt->rc_saved_arg = rqstp->rq_arg; + + ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt); + if (ret < 0) { + if (ret == -EINVAL) + svc_rdma_send_error(rdma_xprt, ctxt, ret); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); + svc_xprt_deferred_close(xprt); + return ret; + } + return 0; out_backchannel: svc_rdma_handle_bc_reply(rqstp, ctxt); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 28a34718dee5..c00fcce61d1e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -163,14 +163,15 @@ void svc_rdma_cc_init(struct svcxprt_rdma *rdma, cc->cc_sqecount = 0; } -/* - * The consumed rw_ctx's are cleaned and placed on a local llist so - * that only one atomic llist operation is needed to put them all - * back on the free list. 
+/** + * svc_rdma_cc_release - Release resources held by a svc_rdma_chunk_ctxt + * @rdma: controlling transport instance + * @cc: svc_rdma_chunk_ctxt to be released + * @dir: DMA direction */ -static void svc_rdma_cc_release(struct svcxprt_rdma *rdma, - struct svc_rdma_chunk_ctxt *cc, - enum dma_data_direction dir) +void svc_rdma_cc_release(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir) { struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; @@ -300,12 +301,21 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); struct svc_rdma_recv_ctxt *ctxt; + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); + + ctxt = container_of(cc, struct svc_rdma_recv_ctxt, rc_cc); switch (wc->status) { case IB_WC_SUCCESS: - ctxt = container_of(cc, struct svc_rdma_recv_ctxt, rc_cc); trace_svcrdma_wc_read(wc, &cc->cc_cid, ctxt->rc_readbytes, cc->cc_posttime); - break; + + spin_lock(&rdma->sc_rq_dto_lock); + list_add_tail(&ctxt->rc_list, &rdma->sc_read_complete_q); + /* the unlock pairs with the smp_rmb in svc_xprt_ready */ + set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); + spin_unlock(&rdma->sc_rq_dto_lock); + svc_xprt_enqueue(&rdma->sc_xprt); + return; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_read_flush(wc, &cc->cc_cid); break; @@ -313,10 +323,13 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_read_err(wc, &cc->cc_cid); } - svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); - cc->cc_status = wc->status; - complete(&cc->cc_done); - return; + /* The RDMA Read has flushed, so the incoming RPC message + * cannot be constructed and must be dropped. Signal the + * loss to the client by closing the connection. + */ + svc_rdma_cc_release(rdma, cc, DMA_FROM_DEVICE); + svc_rdma_recv_ctxt_put(rdma, ctxt); + svc_xprt_deferred_close(&rdma->sc_xprt); } /* @@ -823,7 +836,6 @@ svc_rdma_read_multiple_chunks(struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head) { const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; - struct xdr_buf *buf = &rqstp->rq_arg; struct svc_rdma_chunk *chunk, *next; unsigned int start, length; int ret; @@ -853,18 +865,7 @@ svc_rdma_read_multiple_chunks(struct svc_rqst *rqstp, start += length; length = head->rc_byte_len - start; - ret = svc_rdma_copy_inline_range(rqstp, head, start, length); - if (ret < 0) - return ret; - - buf->len += head->rc_readbytes; - buf->buflen += head->rc_readbytes; - - buf->head[0].iov_base = page_address(rqstp->rq_pages[0]); - buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, head->rc_readbytes); - buf->pages = &rqstp->rq_pages[1]; - buf->page_len = head->rc_readbytes - buf->head[0].iov_len; - return 0; + return svc_rdma_copy_inline_range(rqstp, head, start, length); } /** @@ -888,42 +889,8 @@ svc_rdma_read_multiple_chunks(struct svc_rqst *rqstp, static int svc_rdma_read_data_item(struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head) { - struct xdr_buf *buf = &rqstp->rq_arg; - struct svc_rdma_chunk *chunk; - unsigned int length; - int ret; - - chunk = pcl_first_chunk(&head->rc_read_pcl); - ret = svc_rdma_build_read_chunk(rqstp, head, chunk); - if (ret < 0) - goto out; - - /* Split the Receive buffer between the head and tail - * buffers at Read chunk's position. XDR roundup of the - * chunk is not included in either the pagelist or in - * the tail. 
- */ - buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position; - buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position; - buf->head[0].iov_len = chunk->ch_position; - - /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). - * - * If the client already rounded up the chunk length, the - * length does not change. Otherwise, the length of the page - * list is increased to include XDR round-up. - * - * Currently these chunks always start at page offset 0, - * thus the rounded-up length never crosses a page boundary. - */ - buf->pages = &rqstp->rq_pages[0]; - length = xdr_align_size(chunk->ch_length); - buf->page_len = length; - buf->len += length; - buf->buflen += length; - -out: - return ret; + return svc_rdma_build_read_chunk(rqstp, head, + pcl_first_chunk(&head->rc_read_pcl)); } /** @@ -1051,23 +1018,28 @@ static int svc_rdma_read_call_chunk(struct svc_rqst *rqstp, static noinline int svc_rdma_read_special(struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head) { - struct xdr_buf *buf = &rqstp->rq_arg; - int ret; - - ret = svc_rdma_read_call_chunk(rqstp, head); - if (ret < 0) - goto out; - - buf->len += head->rc_readbytes; - buf->buflen += head->rc_readbytes; + return svc_rdma_read_call_chunk(rqstp, head); +} - buf->head[0].iov_base = page_address(rqstp->rq_pages[0]); - buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, head->rc_readbytes); - buf->pages = &rqstp->rq_pages[1]; - buf->page_len = head->rc_readbytes - buf->head[0].iov_len; +/* Pages under I/O have been copied to head->rc_pages. Ensure that + * svc_xprt_release() does not put them when svc_rdma_recvfrom() + * returns. This has to be done after all Read WRs are constructed + * to properly handle a page that happens to be part of I/O on behalf + * of two different RDMA segments. + * + * Note: if the subsequent post_send fails, these pages have already + * been moved to head->rc_pages and thus will be cleaned up by + * svc_rdma_recv_ctxt_put(). + */ +static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) +{ + unsigned int i; -out: - return ret; + for (i = 0; i < head->rc_page_count; i++) { + head->rc_pages[i] = rqstp->rq_pages[i]; + rqstp->rq_pages[i] = NULL; + } } /** @@ -1113,30 +1085,11 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, ret = svc_rdma_read_multiple_chunks(rqstp, head); } else ret = svc_rdma_read_special(rqstp, head); + svc_rdma_clear_rqst_pages(rqstp, head); if (ret < 0) - goto out_err; + return ret; trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount); - init_completion(&cc->cc_done); ret = svc_rdma_post_chunk_ctxt(rdma, cc); - if (ret < 0) - goto out_err; - - ret = 1; - wait_for_completion(&cc->cc_done); - if (cc->cc_status != IB_WC_SUCCESS) - ret = -EIO; - - /* rq_respages starts after the last arg page */ - rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - - /* Ensure svc_rdma_recv_ctxt_put() does not release pages - * left in @rc_pages while I/O proceeds. - */ - head->rc_page_count = 0; - -out_err: - svc_rdma_cc_release(rdma, cc, DMA_FROM_DEVICE); - return ret; + return ret < 0 ? 
ret : 1; } -- cgit v1.2.3 From 05a4b58301c38fbb81cc10a79f246f3dea0043c5 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Fri, 15 Dec 2023 13:47:15 -0800 Subject: SUNRPC: remove printk when back channel request not found If the client interface is down, or there is a network partition between the client and server that prevents the callback request from reaching the client, TCP on the server will keep re-transmitting the callback for about 9 minutes before giving up and closing the connection. If the connection between the client and the server is re-established before the connection is closed and after the callback timed out (9 secs), then the re-transmitted callback request will arrive at the client. When the server receives the reply to the callback, receive_cb_reply prints the "Got unrecognized reply..." message in the system log since the callback request was already removed from the server xprt's recv_queue. Even though this scenario has no effect on server operation, a malfunctioning or malicious client can fill up the server's system log. Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- net/sunrpc/svcsock.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 998687421fa6..bfb2f78523a8 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1049,18 +1049,14 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) struct rpc_rqst *req = NULL; struct kvec *src, *dst; __be32 *p = (__be32 *)rqstp->rq_arg.head[0].iov_base; - __be32 xid; - __be32 calldir; - - xid = *p++; - calldir = *p; + __be32 xid = *p; if (!bc_xprt) return -EAGAIN; spin_lock(&bc_xprt->queue_lock); req = xprt_lookup_rqst(bc_xprt, xid); if (!req) - goto unlock_notfound; + goto unlock_eagain; memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); /* @@ -1077,12 +1073,6 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) rqstp->rq_arg.len = 0; spin_unlock(&bc_xprt->queue_lock); return 0; -unlock_notfound: - printk(KERN_NOTICE - "%s: Got unrecognized reply: " - "calldir 0x%x xpt_bc_xprt %p xid %08x\n", - __func__, ntohl(calldir), - bc_xprt, ntohl(xid)); unlock_eagain: spin_unlock(&bc_xprt->queue_lock); return -EAGAIN; -- cgit v1.2.3 From 7b207ccd983350a5dedd132b57c666186dd02a7c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 15 Dec 2023 11:56:32 +1100 Subject: svc: don't hold reference for poolstats, only mutex. A future patch will remove refcounting on svc_serv as it is of little use. It is currently used to keep the svc around while the pool_stats file is open. Change this to get the pointer, protected by the mutex, only in seq_start, and then release the mutex in seq_stop. This means that if the nfsd server is stopped and restarted while the pool_stats file is open, then some pool stats info could be from the first instance and some from the second. This might appear odd, but is unlikely to be a problem in practice.
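To illustrate the locking scheme this patch adopts (this sketch is not part of the patch): the mutex is taken in the seq_file ->start callback and released in ->stop, so it is held across every ->next/->show call of a read pass, and the serv pointer is re-checked under the lock. Because the seq_file core always calls ->stop after a successful ->start, even when ->start returned NULL, the unlock is guaranteed to pair with the lock. The my_* names below are hypothetical stand-ins for svc_info and svc_serv, and the ->next/->show callbacks are omitted for brevity.

#include <linux/mutex.h>
#include <linux/seq_file.h>

struct my_pool {
	unsigned long	packets;	/* the per-pool counter we report */
};

struct my_serv {			/* hypothetical analogue of svc_serv */
	unsigned int	nr_pools;
	struct my_pool	*pools;
};

struct my_info {			/* hypothetical analogue of svc_info */
	struct my_serv	*serv;		/* may be NULL after a restart */
	struct mutex	*mutex;		/* guards ->serv */
};

static void *my_stats_start(struct seq_file *m, loff_t *pos)
{
	struct my_info *si = m->private;

	/* Taken here, released in ->stop: held across ->next and ->show */
	mutex_lock(si->mutex);
	if (!*pos)
		return SEQ_START_TOKEN;
	if (!si->serv || *pos > si->serv->nr_pools)
		return NULL;		/* server gone, or ran past the end */
	return &si->serv->pools[*pos - 1];
}

static void my_stats_stop(struct seq_file *m, void *p)
{
	struct my_info *si = m->private;

	mutex_unlock(si->mutex);	/* pairs with the lock in ->start */
}
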
Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 4 +++- fs/nfsd/nfsctl.c | 2 +- fs/nfsd/nfssvc.c | 24 ++---------------------- include/linux/sunrpc/svc.h | 8 +++++++- net/sunrpc/svc_xprt.c | 32 +++++++++++++++++++++++--------- 5 files changed, 36 insertions(+), 34 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ab303a8b77d5..16dbef245dbb 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -123,7 +123,9 @@ struct nfsd_net { u32 clientid_counter; u32 clverifier_counter; - struct svc_serv *nfsd_serv; + struct svc_info nfsd_info; +#define nfsd_serv nfsd_info.serv + /* When a listening socket is added to nfsd, keep_active is set * and this justifies a reference on nfsd_serv. This stops * nfsd_serv from being freed. When the number of threads is diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6a3b385703cc..46a001e81b55 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -183,7 +183,7 @@ static const struct file_operations pool_stats_operations = { .open = nfsd_pool_stats_open, .read = seq_read, .llseek = seq_lseek, - .release = nfsd_pool_stats_release, + .release = seq_release, }; DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ee835bf9ee42..365968737923 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -684,6 +684,7 @@ int nfsd_create_serv(struct net *net) return error; } spin_lock(&nfsd_notifier_lock); + nn->nfsd_info.mutex = &nfsd_mutex; nn->nfsd_serv = serv; spin_unlock(&nfsd_notifier_lock); @@ -1082,28 +1083,7 @@ bool nfssvc_encode_voidres(struct svc_rqst *rqstp, struct xdr_stream *xdr) int nfsd_pool_stats_open(struct inode *inode, struct file *file) { - int ret; struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id); - mutex_lock(&nfsd_mutex); - if (nn->nfsd_serv == NULL) { - mutex_unlock(&nfsd_mutex); - return -ENODEV; - } - svc_get(nn->nfsd_serv); - ret = svc_pool_stats_open(nn->nfsd_serv, file); - mutex_unlock(&nfsd_mutex); - return ret; -} - -int nfsd_pool_stats_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - struct svc_serv *serv = seq->private; - int ret = seq_release(inode, file); - - mutex_lock(&nfsd_mutex); - svc_put(serv); - mutex_unlock(&nfsd_mutex); - return ret; + return svc_pool_stats_open(&nn->nfsd_info, file); } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 544fcfe07479..3bea2840272d 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -97,6 +97,12 @@ struct svc_serv { #endif /* CONFIG_SUNRPC_BACKCHANNEL */ }; +/* This is used by pool_stats to find and lock an svc */ +struct svc_info { + struct svc_serv *serv; + struct mutex *mutex; +}; + /** * svc_get() - increment reference count on a SUNRPC serv * @serv: the svc_serv to have count incremented @@ -431,7 +437,7 @@ void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, int (*threadfn)(void *data)); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); -int svc_pool_stats_open(struct svc_serv *serv, struct file *file); +int svc_pool_stats_open(struct svc_info *si, struct file *file); void svc_process(struct svc_rqst *rqstp); void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp); int svc_register(const struct svc_serv *, struct net *, const int, diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 1b71055fc391..b4a85a227bd7 100644 --- a/net/sunrpc/svc_xprt.c +++ 
b/net/sunrpc/svc_xprt.c @@ -1362,29 +1362,36 @@ int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen) } EXPORT_SYMBOL_GPL(svc_xprt_names); - /*----------------------------------------------------------------------------*/ static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos) { unsigned int pidx = (unsigned int)*pos; - struct svc_serv *serv = m->private; + struct svc_info *si = m->private; dprintk("svc_pool_stats_start, *pidx=%u\n", pidx); + mutex_lock(si->mutex); + if (!pidx) return SEQ_START_TOKEN; - return (pidx > serv->sv_nrpools ? NULL : &serv->sv_pools[pidx-1]); + if (!si->serv) + return NULL; + return pidx > si->serv->sv_nrpools ? NULL + : &si->serv->sv_pools[pidx - 1]; } static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos) { struct svc_pool *pool = p; - struct svc_serv *serv = m->private; + struct svc_info *si = m->private; + struct svc_serv *serv = si->serv; dprintk("svc_pool_stats_next, *pos=%llu\n", *pos); - if (p == SEQ_START_TOKEN) { + if (!serv) { + pool = NULL; + } else if (p == SEQ_START_TOKEN) { pool = &serv->sv_pools[0]; } else { unsigned int pidx = (pool - &serv->sv_pools[0]); @@ -1399,6 +1406,9 @@ static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos) static void svc_pool_stats_stop(struct seq_file *m, void *p) { + struct svc_info *si = m->private; + + mutex_unlock(si->mutex); } static int svc_pool_stats_show(struct seq_file *m, void *p) @@ -1426,14 +1436,18 @@ static const struct seq_operations svc_pool_stats_seq_ops = { .show = svc_pool_stats_show, }; -int svc_pool_stats_open(struct svc_serv *serv, struct file *file) +int svc_pool_stats_open(struct svc_info *info, struct file *file) { + struct seq_file *seq; int err; err = seq_open(file, &svc_pool_stats_seq_ops); - if (!err) - ((struct seq_file *) file->private_data)->private = serv; - return err; + if (err) + return err; + seq = file->private_data; + seq->private = info; + + return 0; } EXPORT_SYMBOL(svc_pool_stats_open); -- cgit v1.2.3 From 1e3577a4521ef33199eea05ce7b9099825848c49 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 15 Dec 2023 11:56:34 +1100 Subject: SUNRPC: discard sv_refcnt, and svc_get/svc_put sv_refcnt is no longer useful. lockd and nfs-cb only ever have the svc active when there is a non-zero number of threads, so sv_refcnt mirrors sv_nrthreads. nfsd also keeps the svc active between when a socket is added and when the first thread is started, but we don't really need a refcount for that. We can simply not destroy the svc while there are any permanent sockets attached. So remove sv_refcnt and the get/put functions. Instead of a final call to svc_put(), call svc_destroy(), which is changed to also store NULL in the passed-in pointer to make it easier to avoid use-after-free situations.
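As a hedged illustration of the new calling convention (not code from this patch): a destroy function that takes the address of the caller's pointer can clear it before freeing, so no stale copy survives. The my_* names below are invented for the example; my_destroy() is only an analogue of the reworked svc_destroy().

#include <linux/slab.h>

struct my_serv {			/* hypothetical analogue of svc_serv */
	unsigned int	sv_nrthreads;
};

/* Consume and clear the caller's pointer, then free the object,
 * so the caller cannot accidentally use the pointer afterwards.
 */
static void my_destroy(struct my_serv **servp)
{
	struct my_serv *serv = *servp;

	*servp = NULL;			/* clear before freeing */
	kfree(serv);
}

/* Caller side: pass the address of the stored pointer, e.g.
 *	if (error < 0) {
 *		my_destroy(&serv);
 *		return error;
 *	}
 * after which 'serv' is NULL and a use-after-free becomes a
 * readily debuggable NULL dereference instead.
 */
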
Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 10 ++++------ fs/nfs/callback.c | 13 ++++++------- fs/nfsd/netns.h | 7 ------- fs/nfsd/nfsctl.c | 13 ++----------- fs/nfsd/nfssvc.c | 26 ++++---------------------- include/linux/sunrpc/svc.h | 27 +-------------------------- net/sunrpc/svc.c | 13 ++++--------- 7 files changed, 21 insertions(+), 88 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 81be07c1d3d1..0d6cb3fdc0e1 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -345,10 +345,10 @@ static int lockd_get(void) serv->sv_maxconn = nlm_max_connections; error = svc_set_num_threads(serv, NULL, 1); - /* The thread now holds the only reference */ - svc_put(serv); - if (error < 0) + if (error < 0) { + svc_destroy(&serv); return error; + } nlmsvc_serv = serv; register_inetaddr_notifier(&lockd_inetaddr_notifier); @@ -372,11 +372,9 @@ static void lockd_put(void) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif - svc_get(nlmsvc_serv); svc_set_num_threads(nlmsvc_serv, NULL, 0); - svc_put(nlmsvc_serv); timer_delete_sync(&nlmsvc_retry); - nlmsvc_serv = NULL; + svc_destroy(&nlmsvc_serv); dprintk("lockd_down: service destroyed\n"); } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 4ffa1f469e90..760d27dd7225 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -187,7 +187,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) * Check whether we're already up and running. */ if (cb_info->serv) - return svc_get(cb_info->serv); + return cb_info->serv; /* * Sanity check: if there's no task, @@ -245,9 +245,10 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) cb_info->users++; err_net: - if (!cb_info->users) - cb_info->serv = NULL; - svc_put(serv); + if (!cb_info->users) { + svc_set_num_threads(cb_info->serv, NULL, 0); + svc_destroy(&cb_info->serv); + } err_create: mutex_unlock(&nfs_callback_mutex); return ret; @@ -271,11 +272,9 @@ void nfs_callback_down(int minorversion, struct net *net) nfs_callback_down_net(minorversion, serv, net); cb_info->users--; if (cb_info->users == 0) { - svc_get(serv); svc_set_num_threads(serv, NULL, 0); - svc_put(serv); dprintk("nfs_callback_down: service destroyed\n"); - cb_info->serv = NULL; + svc_destroy(&cb_info->serv); } mutex_unlock(&nfs_callback_mutex); } diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 16dbef245dbb..74b4360779a1 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -126,13 +126,6 @@ struct nfsd_net { struct svc_info nfsd_info; #define nfsd_serv nfsd_info.serv - /* When a listening socket is added to nfsd, keep_active is set - * and this justifies a reference on nfsd_serv. This stops - * nfsd_serv from being freed. When the number of threads is - * set, keep_active is cleared and the reference is dropped. So - * when the last thread exits, the service will be destroyed. 
- */ - int keep_active; /* * clientid and stateid data for construction of net unique COPY diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 46a001e81b55..5c2fae98d98a 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -711,12 +711,8 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred serv = nn->nfsd_serv; err = svc_addsock(serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred); - if (err < 0 && !serv->sv_nrthreads && !nn->keep_active) + if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) nfsd_last_thread(net); - else if (err >= 0 && !serv->sv_nrthreads && !xchg(&nn->keep_active, 1)) - svc_get(serv); - - svc_put(serv); return err; } @@ -754,10 +750,6 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr if (err < 0 && err != -EAFNOSUPPORT) goto out_close; - if (!serv->sv_nrthreads && !xchg(&nn->keep_active, 1)) - svc_get(serv); - - svc_put(serv); return 0; out_close: xprt = svc_find_xprt(serv, transport, net, PF_INET, port); @@ -766,10 +758,9 @@ out_close: svc_xprt_put(xprt); } out_err: - if (!serv->sv_nrthreads && !nn->keep_active) + if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) nfsd_last_thread(net); - svc_put(serv); return err; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 365968737923..1a295c409343 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -59,15 +59,6 @@ static __be32 nfsd_init_request(struct svc_rqst *, * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks. * - * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a - * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless - * nn->keep_active is set). That number of nfsd threads must - * exist and each must be listed in ->sp_all_threads in some entry of - * ->sv_pools[]. - * - * Each active thread holds a counted reference on nn->nfsd_serv, as does - * the nn->keep_active flag and various transient calls to svc_get(). - * * Finally, the nfsd_mutex also protects some of the global variables that are * accessed when nfsd starts and that are settable via the write_* routines in * nfsctl.c. 
In particular: @@ -572,6 +563,7 @@ void nfsd_last_thread(struct net *net) nfsd_shutdown_net(net); nfsd_export_flush(net); + svc_destroy(&serv); } void nfsd_reset_versions(struct nfsd_net *nn) @@ -646,11 +638,9 @@ void nfsd_shutdown_threads(struct net *net) return; } - svc_get(serv); /* Kill outstanding nfsd threads */ svc_set_num_threads(serv, NULL, 0); nfsd_last_thread(net); - svc_put(serv); mutex_unlock(&nfsd_mutex); } @@ -666,10 +656,9 @@ int nfsd_create_serv(struct net *net) struct svc_serv *serv; WARN_ON(!mutex_is_locked(&nfsd_mutex)); - if (nn->nfsd_serv) { - svc_get(nn->nfsd_serv); + if (nn->nfsd_serv) return 0; - } + if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); @@ -680,7 +669,7 @@ int nfsd_create_serv(struct net *net) serv->sv_maxconn = nn->max_connections; error = svc_bind(serv, net); if (error < 0) { - svc_put(serv); + svc_destroy(&serv); return error; } spin_lock(&nfsd_notifier_lock); @@ -764,7 +753,6 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) nthreads[0] = 1; /* apply the new numbers */ - svc_get(nn->nfsd_serv); for (i = 0; i < n; i++) { err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], @@ -772,7 +760,6 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) if (err) break; } - svc_put(nn->nfsd_serv); return err; } @@ -814,13 +801,8 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) goto out_put; error = serv->sv_nrthreads; out_put: - /* Threads now hold service active */ - if (xchg(&nn->keep_active, 0)) - svc_put(serv); - if (serv->sv_nrthreads == 0) nfsd_last_thread(net); - svc_put(serv); out: mutex_unlock(&nfsd_mutex); return error; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3bea2840272d..8d7888234e9e 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -69,7 +69,6 @@ struct svc_serv { struct svc_program * sv_program; /* RPC program */ struct svc_stat * sv_stats; /* RPC statistics */ spinlock_t sv_lock; - struct kref sv_refcnt; unsigned int sv_nrthreads; /* # of server threads */ unsigned int sv_maxconn; /* max connections allowed or * '0' causing max to be based @@ -103,31 +102,7 @@ struct svc_info { struct mutex *mutex; }; -/** - * svc_get() - increment reference count on a SUNRPC serv - * @serv: the svc_serv to have count incremented - * - * Returns: the svc_serv that was passed in. - */ -static inline struct svc_serv *svc_get(struct svc_serv *serv) -{ - kref_get(&serv->sv_refcnt); - return serv; -} - -void svc_destroy(struct kref *); - -/** - * svc_put - decrement reference count on a SUNRPC serv - * @serv: the svc_serv to have count decremented - * - * When the reference count reaches zero, svc_destroy() - * is called to clean up and free the serv. - */ -static inline void svc_put(struct svc_serv *serv) -{ - kref_put(&serv->sv_refcnt, svc_destroy); -} +void svc_destroy(struct svc_serv **svcp); /* * Maximum payload size supported by a kernel RPC server. diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index fa4e23fa0e09..eb5856e1351d 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -463,7 +463,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, return NULL; serv->sv_name = prog->pg_name; serv->sv_program = prog; - kref_init(&serv->sv_refcnt); serv->sv_stats = prog->pg_stats; if (bufsize > RPCSVC_MAXPAYLOAD) bufsize = RPCSVC_MAXPAYLOAD; @@ -564,11 +563,13 @@ EXPORT_SYMBOL_GPL(svc_create_pooled); * protect sv_permsocks and sv_tempsocks. 
*/ void -svc_destroy(struct kref *ref) +svc_destroy(struct svc_serv **servp) { - struct svc_serv *serv = container_of(ref, struct svc_serv, sv_refcnt); + struct svc_serv *serv = *servp; unsigned int i; + *servp = NULL; + dprintk("svc: svc_destroy(%s)\n", serv->sv_program->pg_name); timer_shutdown_sync(&serv->sv_temptimer); @@ -675,7 +676,6 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) if (!rqstp) return ERR_PTR(-ENOMEM); - svc_get(serv); spin_lock_bh(&serv->sv_lock); serv->sv_nrthreads += 1; spin_unlock_bh(&serv->sv_lock); @@ -935,11 +935,6 @@ svc_exit_thread(struct svc_rqst *rqstp) svc_rqst_free(rqstp); - svc_put(serv); - /* That svc_put() cannot be the last, because the thread - * waiting for SP_VICTIM_REMAINS to clear must hold - * a reference. So it is still safe to access pool. - */ clear_and_wake_up_bit(SP_VICTIM_REMAINS, &pool->sp_flags); } EXPORT_SYMBOL_GPL(svc_exit_thread); -- cgit v1.2.3 From 17419aefcbfd9891863e8b8132f0bca9a6b2984e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 15 Dec 2023 11:56:35 +1100 Subject: nfsd: rename nfsd_last_thread() to nfsd_destroy_serv() As this function now destroys the svc_serv, this is a better name. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 5 +++-- fs/nfsd/nfsd.h | 2 +- fs/nfsd/nfssvc.c | 12 ++++++++---- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 5c2fae98d98a..8e6dbe9e0b65 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -712,7 +712,8 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred err = svc_addsock(serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred); if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) - nfsd_last_thread(net); + nfsd_destroy_serv(net); + return err; } @@ -759,7 +760,7 @@ out_close: } out_err: if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) - nfsd_last_thread(net); + nfsd_destroy_serv(net); return err; } diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 9ed0e08d16c2..304e9728b929 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -148,7 +148,7 @@ int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change); int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change); void nfsd_reset_versions(struct nfsd_net *nn); int nfsd_create_serv(struct net *net); -void nfsd_last_thread(struct net *net); +void nfsd_destroy_serv(struct net *net); extern int nfsd_max_blksize; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 1a295c409343..a667802e08e7 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -532,7 +532,11 @@ static struct notifier_block nfsd_inet6addr_notifier = { /* Only used under nfsd_mutex, so this atomic may be overkill: */ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); -void nfsd_last_thread(struct net *net) +/** + * nfsd_destroy_serv - tear down NFSD's svc_serv for a namespace + * @net: network namespace the NFS service is associated with + */ +void nfsd_destroy_serv(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv = nn->nfsd_serv; @@ -554,7 +558,7 @@ void nfsd_last_thread(struct net *net) /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are - * started, then nfsd_last_thread will be run before any of this + * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb 
information. */ svc_rpcb_cleanup(serv, net); @@ -640,7 +644,7 @@ void nfsd_shutdown_threads(struct net *net) /* Kill outstanding nfsd threads */ svc_set_num_threads(serv, NULL, 0); - nfsd_last_thread(net); + nfsd_destroy_serv(net); mutex_unlock(&nfsd_mutex); } @@ -802,7 +806,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) error = serv->sv_nrthreads; out_put: if (serv->sv_nrthreads == 0) - nfsd_last_thread(net); + nfsd_destroy_serv(net); out: mutex_unlock(&nfsd_mutex); return error; -- cgit v1.2.3
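Taken together, this series replaces refcount-driven teardown with an explicit lifetime rule: the svc_serv is destroyed as soon as no threads are running and no permanent sockets remain. Below is a minimal sketch of that rule, under stated assumptions; the my_* names are hypothetical analogues of nfsd_mutex, nn->nfsd_serv, and nfsd_destroy_serv(), not the kernel's actual symbols.

#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>

struct my_serv {			/* hypothetical analogue of svc_serv */
	unsigned int		sv_nrthreads;	/* running service threads */
	struct list_head	sv_permsocks;	/* permanent listeners */
};

static DEFINE_MUTEX(my_mutex);		/* analogue of nfsd_mutex */
static struct my_serv *my_serv;		/* analogue of nn->nfsd_serv */

static void my_destroy_serv(void)
{
	/* ...would shut down transports, free the serv, and NULL my_serv... */
}

/* Analogue of the condition used in __write_ports_addfd() and
 * __write_ports_addxprt(): with no threads running and no permanent
 * sockets listed, nothing justifies keeping the serv alive, so it is
 * torn down immediately rather than waiting for a refcount to drop.
 */
static void my_check_idle(void)
{
	lockdep_assert_held(&my_mutex);

	if (my_serv->sv_nrthreads == 0 &&
	    list_empty(&my_serv->sv_permsocks))
		my_destroy_serv();
}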