From 06ef26a0e35199cc0445d7d5708533a12af8ff5b Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 6 Apr 2016 11:32:52 -0400 Subject: SUNRPC: init xdr_stream for zero iov_len, page_len An xdr_buf with head[0].iov_len = 0 and page_len = 0 will cause xdr_init_decode() to incorrectly setup the xdr_stream. Specifically, xdr->end is never initialized. Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- net/sunrpc/xdr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 6bdb3865212d..c4f3cc0c0775 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -797,6 +797,8 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) xdr_set_iov(xdr, buf->head, buf->len); else if (buf->page_len != 0) xdr_set_page_base(xdr, 0, buf->len); + else + xdr_set_iov(xdr, buf->head, buf->len); if (p != NULL && p > xdr->p && xdr->end >= p) { xdr->nwords -= p - xdr->p; xdr->p = p; -- cgit v1.2.3 From 3c6e0bc8a14cfc8e1d4ab87f46f77b070c815bf1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 21 Apr 2016 20:51:54 -0400 Subject: sunrpc: plumb gfp_t parm into crcreate operation We need to be able to call the generic_cred creator from different contexts. Add a gfp_t parm to the crcreate operation and to rpcauth_lookup_credcache. For now, we just push the gfp_t parms up one level to the *_lookup_cred functions. Signed-off-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/sunrpc/auth.h | 4 ++-- net/sunrpc/auth.c | 4 ++-- net/sunrpc/auth_generic.c | 6 +++--- net/sunrpc/auth_gss/auth_gss.c | 6 +++--- net/sunrpc/auth_unix.c | 6 +++--- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 6a241a277249..3b616aa7e4d2 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -127,7 +127,7 @@ struct rpc_authops { void (*destroy)(struct rpc_auth *); struct rpc_cred * (*lookup_cred)(struct rpc_auth *, struct auth_cred *, int); - struct rpc_cred * (*crcreate)(struct rpc_auth*, struct auth_cred *, int); + struct rpc_cred * (*crcreate)(struct rpc_auth*, struct auth_cred *, int, gfp_t); int (*list_pseudoflavors)(rpc_authflavor_t *, int); rpc_authflavor_t (*info2flavor)(struct rpcsec_gss_info *); int (*flavor2info)(rpc_authflavor_t, @@ -178,7 +178,7 @@ rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t, int rpcauth_get_gssinfo(rpc_authflavor_t, struct rpcsec_gss_info *); int rpcauth_list_flavors(rpc_authflavor_t *, int); -struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int); +struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int, gfp_t); void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *, int); struct rpc_cred * rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 02f53674dc39..e0bb30fd2ed3 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -543,7 +543,7 @@ rpcauth_cache_enforce_limit(void) */ struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, - int flags) + int flags, gfp_t gfp) { LIST_HEAD(free); struct rpc_cred_cache *cache = auth->au_credcache; @@ -580,7 +580,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (flags & RPCAUTH_LOOKUP_RCU) return 
ERR_PTR(-ECHILD); - new = auth->au_ops->crcreate(auth, acred, flags); + new = auth->au_ops->crcreate(auth, acred, flags, gfp); if (IS_ERR(new)) { cred = new; goto out; diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 41248b1820c7..6ed3e3df43e9 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -77,15 +77,15 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task, static struct rpc_cred * generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - return rpcauth_lookup_credcache(&generic_auth, acred, flags); + return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL); } static struct rpc_cred * -generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) { struct generic_cred *gcred; - gcred = kmalloc(sizeof(*gcred), GFP_KERNEL); + gcred = kmalloc(sizeof(*gcred), gfp); if (gcred == NULL) return ERR_PTR(-ENOMEM); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 15612ffa8d57..e64ae93d5b4f 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1299,11 +1299,11 @@ gss_destroy_cred(struct rpc_cred *cred) static struct rpc_cred * gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - return rpcauth_lookup_credcache(auth, acred, flags); + return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS); } static struct rpc_cred * -gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); struct gss_cred *cred = NULL; @@ -1313,7 +1313,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) __func__, from_kuid(&init_user_ns, acred->uid), auth->au_flavor); - if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS))) + if (!(cred = kzalloc(sizeof(*cred), gfp))) goto out_err; rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops); diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 0d3dd364c22f..9f65452b7cbc 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -52,11 +52,11 @@ unx_destroy(struct rpc_auth *auth) static struct rpc_cred * unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - return rpcauth_lookup_credcache(auth, acred, flags); + return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS); } static struct rpc_cred * -unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) { struct unx_cred *cred; unsigned int groups = 0; @@ -66,7 +66,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) from_kuid(&init_user_ns, acred->uid), from_kgid(&init_user_ns, acred->gid)); - if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS))) + if (!(cred = kmalloc(sizeof(*cred), gfp))) return ERR_PTR(-ENOMEM); rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops); -- cgit v1.2.3 From c065d229e308ede426a3608cf480c61983b36fb8 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 21 Apr 2016 20:51:55 -0400 Subject: sunrpc: add rpc_lookup_generic_cred Add function rpc_lookup_generic_cred, which allows lookups of a generic credential that's not current_cred(). 
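As a rough usage sketch (not part of the original changelog; the caller shown is hypothetical), a filesystem path that must avoid blocking filesystem reclaim could look up a generic credential for an arbitrary uid/gid with an explicit allocation flag:

	struct auth_cred acred = {
		.uid = current_fsuid(),
		.gid = current_fsgid(),
	};
	struct rpc_cred *cred;

	/* The explicit gfp_t lets callers in writeback paths use GFP_NOFS */
	cred = rpc_lookup_generic_cred(&acred, 0, GFP_NOFS);
	if (IS_ERR(cred))
		return PTR_ERR(cred);
	/* ... use the credential ... */
	put_rpccred(cred);
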
[jlayton: add gfp_t parm] Signed-off-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/sunrpc/auth.h | 1 + net/sunrpc/auth_generic.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 3b616aa7e4d2..16bd8f8fef8c 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -167,6 +167,7 @@ void rpc_destroy_authunix(void); struct rpc_cred * rpc_lookup_cred(void); struct rpc_cred * rpc_lookup_cred_nonblock(void); +struct rpc_cred * rpc_lookup_generic_cred(struct auth_cred *, int, gfp_t); struct rpc_cred * rpc_lookup_machine_cred(const char *service_name); int rpcauth_register(const struct rpc_authops *); int rpcauth_unregister(const struct rpc_authops *); diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 6ed3e3df43e9..54dd3fdead54 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -38,6 +38,13 @@ struct rpc_cred *rpc_lookup_cred(void) } EXPORT_SYMBOL_GPL(rpc_lookup_cred); +struct rpc_cred * +rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp) +{ + return rpcauth_lookup_credcache(&generic_auth, acred, flags, gfp); +} +EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred); + struct rpc_cred *rpc_lookup_cred_nonblock(void) { return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU); -- cgit v1.2.3 From 6b26cc8c8ead3636a18bfd9489984983f4ddd6f4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:40:40 -0400 Subject: sunrpc: Advertise maximum backchannel payload size RPC-over-RDMA transports have a limit on how large a backward direction (backchannel) RPC message can be. Ensure that the NFSv4.x CREATE_SESSION operation advertises this limit to servers. Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 10 ++++++---- include/linux/sunrpc/clnt.h | 1 + include/linux/sunrpc/xprt.h | 1 + net/sunrpc/clnt.c | 17 +++++++++++++++++ net/sunrpc/xprtrdma/backchannel.c | 16 ++++++++++++++++ net/sunrpc/xprtrdma/transport.c | 1 + net/sunrpc/xprtrdma/xprt_rdma.h | 1 + net/sunrpc/xprtsock.c | 6 ++++++ 8 files changed, 49 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9795725d708c..196e41e12621 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7371,9 +7371,11 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) * always set csa_cachethis to FALSE because the current implementation * of the back channel DRC only supports caching the CB_SEQUENCE operation. 
*/ -static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) +static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, + struct rpc_clnt *clnt) { unsigned int max_rqst_sz, max_resp_sz; + unsigned int max_bc_payload = rpc_max_bc_payload(clnt); max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; @@ -7391,8 +7393,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->fc_attrs.max_ops, args->fc_attrs.max_reqs); /* Back channel attributes */ - args->bc_attrs.max_rqst_sz = PAGE_SIZE; - args->bc_attrs.max_resp_sz = PAGE_SIZE; + args->bc_attrs.max_rqst_sz = max_bc_payload; + args->bc_attrs.max_resp_sz = max_bc_payload; args->bc_attrs.max_resp_sz_cached = 0; args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS; @@ -7496,7 +7498,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, }; int status; - nfs4_init_channel_attrs(&args); + nfs4_init_channel_attrs(&args, clp->cl_rpcclient); args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 9a7ddbaf116e..19c659d1c0f8 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -176,6 +176,7 @@ void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int); int rpc_protocol(struct rpc_clnt *); struct net * rpc_net_ns(struct rpc_clnt *); size_t rpc_max_payload(struct rpc_clnt *); +size_t rpc_max_bc_payload(struct rpc_clnt *); unsigned long rpc_get_timeout(struct rpc_clnt *clnt); void rpc_force_rebind(struct rpc_clnt *); size_t rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index fb0d212e0d3a..5aa3834619a8 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -142,6 +142,7 @@ struct rpc_xprt_ops { int (*bc_setup)(struct rpc_xprt *xprt, unsigned int min_reqs); int (*bc_up)(struct svc_serv *serv, struct net *net); + size_t (*bc_maxpayload)(struct rpc_xprt *xprt); void (*bc_free_rqst)(struct rpc_rqst *rqst); void (*bc_destroy)(struct rpc_xprt *xprt, unsigned int max_reqs); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7e0c9bf22df8..06b4df9faaa1 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1413,6 +1413,23 @@ size_t rpc_max_payload(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_max_payload); +/** + * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes + * @clnt: RPC client to query + */ +size_t rpc_max_bc_payload(struct rpc_clnt *clnt) +{ + struct rpc_xprt *xprt; + size_t ret; + + rcu_read_lock(); + xprt = rcu_dereference(clnt->cl_xprt); + ret = xprt->ops->bc_maxpayload(xprt); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(rpc_max_bc_payload); + /** * rpc_get_timeout - Get timeout for transport in units of HZ * @clnt: RPC client to query diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 2dcd7640eeb5..87762d976b63 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -191,6 +191,22 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net) return 0; } +/** + * xprt_rdma_bc_maxpayload - Return maximum backchannel message size + * @xprt: transport + * + * Returns maximum size, in bytes, of a backchannel message + */ +size_t xprt_rdma_bc_maxpayload(struct rpc_xprt 
*xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + size_t maxmsg; + + maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize); + return maxmsg - RPCRDMA_HDRLEN_MIN; +} + /** * rpcrdma_bc_marshal_reply - Send backwards direction reply * @rqst: buffer containing RPC reply data diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index b1b009f10ea3..9954342924df 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -707,6 +707,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = { #if defined(CONFIG_SUNRPC_BACKCHANNEL) .bc_setup = xprt_rdma_bc_setup, .bc_up = xprt_rdma_bc_up, + .bc_maxpayload = xprt_rdma_bc_maxpayload, .bc_free_rqst = xprt_rdma_bc_free_rqst, .bc_destroy = xprt_rdma_bc_destroy, #endif diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2ebc743cb96f..7723e5faff4d 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -534,6 +534,7 @@ void xprt_rdma_cleanup(void); #if defined(CONFIG_SUNRPC_BACKCHANNEL) int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); int xprt_rdma_bc_up(struct svc_serv *, struct net *); +size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); int rpcrdma_bc_marshal_reply(struct rpc_rqst *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 65e759569e48..f1faf6b9aaff 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1365,6 +1365,11 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net) return ret; return 0; } + +static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) +{ + return PAGE_SIZE; +} #else static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) @@ -2660,6 +2665,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { #ifdef CONFIG_SUNRPC_BACKCHANNEL .bc_setup = xprt_setup_bc, .bc_up = xs_tcp_bc_up, + .bc_maxpayload = xs_tcp_bc_maxpayload, .bc_free_rqst = xprt_free_bc_rqst, .bc_destroy = xprt_destroy_bc, #endif -- cgit v1.2.3 From 29c554227aeec48cde5c22f911e51763f096e125 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:40:48 -0400 Subject: xprtrdma: Bound the inline threshold values Currently the sysctls that allow setting the inline threshold allow any value to be set. Small values only make the transport run slower. The default 1KB setting is as low as is reasonable. And the logic that decides how to divide a Send buffer between RPC-over-RDMA header and RPC message assumes (but does not check) that the lower bound is not crazy (say, 57 bytes). Send and receive buffers share a page with some control information. Values larger than about 3KB can't be supported, currently. 
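For illustration only, the new constants below define the supported window for the inline thresholds; a hypothetical helper (not part of this patch, which records the limits in the sysctl table's extra1/extra2 fields) would bound a requested value like this:

	/* Hypothetical clamp, shown only to illustrate the new limits */
	static unsigned int rpcrdma_bound_inline(unsigned int size)
	{
		return clamp_t(unsigned int, size,
			       RPCRDMA_MIN_INLINE, RPCRDMA_MAX_INLINE);
	}
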
Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtrdma.h | 4 +++- net/sunrpc/xprtrdma/transport.c | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 767190b01363..39267dc3486a 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -52,7 +52,9 @@ #define RPCRDMA_DEF_SLOT_TABLE (128U) #define RPCRDMA_MAX_SLOT_TABLE (256U) -#define RPCRDMA_DEF_INLINE (1024) /* default inline max */ +#define RPCRDMA_MIN_INLINE (1024) /* min inline thresh */ +#define RPCRDMA_DEF_INLINE (1024) /* default inline thresh */ +#define RPCRDMA_MAX_INLINE (3068) /* max inline thresh */ /* Memory registration strategies, by number. * This is part of a kernel / user space API. Do not remove. */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 9954342924df..16595ff91994 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; +static unsigned int min_inline_size = RPCRDMA_MIN_INLINE; +static unsigned int max_inline_size = RPCRDMA_MAX_INLINE; static unsigned int zero; static unsigned int max_padding = PAGE_SIZE; static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; @@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, + .extra1 = &min_inline_size, + .extra2 = &max_inline_size, }, { .procname = "rdma_max_inline_write", @@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, + .extra1 = &min_inline_size, + .extra2 = &max_inline_size, }, { .procname = "rdma_inline_write_padding", -- cgit v1.2.3 From 949317464bc2baca0ccc69e35a7b5cd3715633a6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:40:56 -0400 Subject: xprtrdma: Limit number of RDMA segments in RPC-over-RDMA headers Send buffer space is shared between the RPC-over-RDMA header and an RPC message. A large RPC-over-RDMA header means less space is available for the associated RPC message, which then has to be moved via an RDMA Read or Write. As more segments are added to the chunk lists, the header increases in size. Typical modern hardware needs only a few segments to convey the maximum payload size, but some devices and registration modes may need a lot of segments to convey data payload. Sometimes so many are needed that the remaining space in the Send buffer is not enough for the RPC message. Sending such a message usually fails. To ensure a transport can always make forward progress, cap the number of RDMA segments that are allowed in chunk lists. This prevents less-capable devices and memory registrations from consuming a large portion of the Send buffer by reducing the maximum data payload that can be conveyed with such devices. For now I choose an arbitrary maximum of 8 RDMA segments. This allows a maximum size RPC-over-RDMA header to fit nicely in the current 1024 byte inline threshold with over 700 bytes remaining for an inline RPC message. The current maximum data payload of NFS READ or WRITE requests is one megabyte. 
To convey that payload on a client with 4KB pages, each chunk segment would need to handle 32 or more data pages. This is well within the capabilities of FMR. For physical registration, the maximum payload size on platforms with 4KB pages is reduced to 32KB. For FRWR, a device's maximum page list depth would need to be at least 34 to support the maximum 1MB payload. A device with a smaller maximum page list depth means the maximum data payload is reduced when using that device. Signed-off-by: Chuck Lever Tested-by: Steve Wise Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 2 +- net/sunrpc/xprtrdma/frwr_ops.c | 2 +- net/sunrpc/xprtrdma/physical_ops.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 22 ---------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 21 ++++++++++++++++++++- 5 files changed, 23 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index b289e106540b..4aeb104d0696 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -48,7 +48,7 @@ static size_t fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) { return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES); + RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); } static int diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index c250924a9fd3..2f375982abf4 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -243,7 +243,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ia *ia = &r_xprt->rx_ia; return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); + RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth); } static void diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 481b9b6f4a15..e16ed54d24ed 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -47,7 +47,7 @@ static size_t physical_op_maxpages(struct rpcrdma_xprt *r_xprt) { return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - rpcrdma_max_segments(r_xprt)); + RPCRDMA_MAX_HDR_SEGS); } static int diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f5ed9f982cd7..9f8d6c1dc7c6 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1271,25 +1271,3 @@ out_rc: rpcrdma_recv_buffer_put(rep); return rc; } - -/* How many chunk list items fit within our inline buffers? - */ -unsigned int -rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - int bytes, segments; - - bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize); - bytes -= RPCRDMA_HDRLEN_MIN; - if (bytes < sizeof(struct rpcrdma_segment) * 2) { - pr_warn("RPC: %s: inline threshold too small\n", - __func__); - return 0; - } - - segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1); - dprintk("RPC: %s: max chunk list size = %d segments\n", - __func__, segments); - return segments; -} diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 7723e5faff4d..00287486c62c 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -144,6 +144,26 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) #define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) +/* To ensure a transport can always make forward progress, + * the number of RDMA segments allowed in header chunk lists + * is capped at 8. 
This prevents less-capable devices and + * memory registrations from overrunning the Send buffer + * while building chunk lists. + * + * Elements of the Read list take up more room than the + * Write list or Reply chunk. 8 read segments means the Read + * list (or Write list or Reply chunk) cannot consume more + * than + * + * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes. + * + * And the fixed part of the header is another 24 bytes. + * + * The smallest inline threshold is 1024 bytes, ensuring that + * at least 750 bytes are available for RPC messages. + */ +#define RPCRDMA_MAX_HDR_SEGS (8) + /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -456,7 +476,6 @@ struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, void rpcrdma_free_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); -unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *); int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); int frwr_alloc_recovery_wq(void); -- cgit v1.2.3 From 302d3deb20682a076e1ab551821cacfdc81c5e4f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:41:05 -0400 Subject: xprtrdma: Prevent inline overflow When deciding whether to send a Call inline, rpcrdma_marshal_req doesn't take into account header bytes consumed by chunk lists. This results in Call messages on the wire that are sometimes larger than the inline threshold. Likewise, when a Write list or Reply chunk is in play, the server's reply has to emit an RDMA Send that includes a larger-than-minimal RPC-over-RDMA header. The actual size of a Call message cannot be estimated until after the chunk lists have been registered. Thus the size of each RPC-over-RDMA header can be estimated only after chunks are registered; but the decision to register chunks is based on the size of that header. Chicken, meet egg. The best a client can do is estimate header size based on the largest header that might occur, and then ensure that inline content is always smaller than that. 
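A minimal sketch of that worst-case estimate, using hypothetical names (the patch itself precomputes the limits once per connection as ri_max_inline_write and ri_max_inline_read):

	/* Hypothetical helper: reserve space for the largest header that
	 * could be marshaled; the RPC message may go inline only if it
	 * fits in whatever remains of the inline threshold.
	 */
	static bool fits_inline(size_t rpc_msg_len, size_t inline_threshold,
				size_t max_hdr_len)
	{
		if (max_hdr_len >= inline_threshold)
			return false;
		return rpc_msg_len <= inline_threshold - max_hdr_len;
	}
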
Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 3 ++ net/sunrpc/xprtrdma/frwr_ops.c | 3 ++ net/sunrpc/xprtrdma/physical_ops.c | 5 ++- net/sunrpc/xprtrdma/rpc_rdma.c | 85 +++++++++++++++++++++++++++++++++----- net/sunrpc/xprtrdma/xprt_rdma.h | 5 +++ 5 files changed, 90 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 4aeb104d0696..009694b0c56e 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -39,6 +39,9 @@ static int fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { + rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, + RPCRDMA_MAX_DATA_SEGS / + RPCRDMA_MAX_FMR_SGES)); return 0; } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 2f375982abf4..41e02e7d9b4c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -231,6 +231,9 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, depth; } + rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, + RPCRDMA_MAX_DATA_SEGS / + ia->ri_max_frmr_depth)); return 0; } diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index e16ed54d24ed..2dc6ec2b006a 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -36,8 +36,11 @@ physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, __func__, PTR_ERR(mr)); return -ENOMEM; } - ia->ri_dma_mr = mr; + + rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int, + RPCRDMA_MAX_DATA_SEGS, + RPCRDMA_MAX_HDR_SEGS)); return 0; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 888823bb6dae..205b81b5ca9e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -61,7 +61,6 @@ enum rpcrdma_chunktype { rpcrdma_replych }; -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static const char transfertypes[][12] = { "pure inline", /* no chunks */ " read chunk", /* some argument via rdma read */ @@ -69,18 +68,72 @@ static const char transfertypes[][12] = { "write chunk", /* some result via rdma write */ "reply chunk" /* entire reply via rdma write */ }; -#endif + +/* Returns size of largest RPC-over-RDMA header in a Call message + * + * The client marshals only one chunk list per Call message. + * The largest list is the Read list. + */ +static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) +{ + unsigned int size; + + /* Fixed header fields and list discriminators */ + size = RPCRDMA_HDRLEN_MIN; + + /* Maximum Read list size */ + maxsegs += 2; /* segment for head and tail buffers */ + size = maxsegs * sizeof(struct rpcrdma_read_chunk); + + dprintk("RPC: %s: max call header size = %u\n", + __func__, size); + return size; +} + +/* Returns size of largest RPC-over-RDMA header in a Reply message + * + * There is only one Write list or one Reply chunk per Reply + * message. The larger list is the Write list. 
+ */ +static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) +{ + unsigned int size; + + /* Fixed header fields and list discriminators */ + size = RPCRDMA_HDRLEN_MIN; + + /* Maximum Write list size */ + maxsegs += 2; /* segment for head and tail buffers */ + size = sizeof(__be32); /* segment count */ + size += maxsegs * sizeof(struct rpcrdma_segment); + size += sizeof(__be32); /* list discriminator */ + + dprintk("RPC: %s: max reply header size = %u\n", + __func__, size); + return size; +} + +void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia, + struct rpcrdma_create_data_internal *cdata, + unsigned int maxsegs) +{ + ia->ri_max_inline_write = cdata->inline_wsize - + rpcrdma_max_call_header_size(maxsegs); + ia->ri_max_inline_read = cdata->inline_rsize - + rpcrdma_max_reply_header_size(maxsegs); +} /* The client can send a request inline as long as the RPCRDMA header * plus the RPC call fit under the transport's inline limit. If the * combined call message size exceeds that limit, the client must use * the read chunk list for this operation. */ -static bool rpcrdma_args_inline(struct rpc_rqst *rqst) +static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) { - unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; - return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); + return rqst->rq_snd_buf.len <= ia->ri_max_inline_write; } /* The client can't know how large the actual reply will be. Thus it @@ -89,11 +142,12 @@ static bool rpcrdma_args_inline(struct rpc_rqst *rqst) * limit, the client must provide a write list or a reply chunk for * this request. */ -static bool rpcrdma_results_inline(struct rpc_rqst *rqst) +static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) { - unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; - return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst); + return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; } static int @@ -492,7 +546,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) */ if (rqst->rq_rcv_buf.flags & XDRBUF_READ) wtype = rpcrdma_writech; - else if (rpcrdma_results_inline(rqst)) + else if (rpcrdma_results_inline(r_xprt, rqst)) wtype = rpcrdma_noch; else wtype = rpcrdma_replych; @@ -511,7 +565,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * that both has a data payload, and whose non-data arguments * by themselves are larger than the inline threshold. */ - if (rpcrdma_args_inline(rqst)) { + if (rpcrdma_args_inline(r_xprt, rqst)) { rtype = rpcrdma_noch; } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { rtype = rpcrdma_readch; @@ -561,6 +615,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (hdrlen < 0) return hdrlen; + if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) + goto out_overflow; + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", __func__, transfertypes[wtype], hdrlen, rpclen, @@ -587,6 +644,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) req->rl_niovs = 2; return 0; + +out_overflow: + pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s\n", + hdrlen, rpclen, transfertypes[wtype]); + /* Terminate this RPC. Chunks registered above will be + * released by xprt_release -> xprt_rmda_free . 
+ */ + return -EIO; } /* diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 00287486c62c..4349e03069c7 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -73,6 +73,8 @@ struct rpcrdma_ia { struct completion ri_done; int ri_async_rc; unsigned int ri_max_frmr_depth; + unsigned int ri_max_inline_write; + unsigned int ri_max_inline_read; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; }; @@ -538,6 +540,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ int rpcrdma_marshal_req(struct rpc_rqst *); +void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *, + struct rpcrdma_create_data_internal *, + unsigned int); /* RPC/RDMA module init - xprtrdma/transport.c */ -- cgit v1.2.3 From cce6deeb56aa8aad1d3154d70ea419cf25967f5c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:41:14 -0400 Subject: xprtrdma: Avoid using Write list for small NFS READ requests Avoid the latency and interrupt overhead of registering a Write chunk when handling NFS READ requests of a few hundred bytes or less. This change does not interoperate with Linux NFS/RDMA servers that do not have commit 9d11b51ce7c1 ('svcrdma: Fix send_reply() scatter/gather set-up'). Commit 9d11b51ce7c1 was introduced in v4.3, and is included in 4.2.y, 4.1.y, and 3.18.y. Oracle bug 22925946 has been filed to request that the above fix be included in the Oracle Linux UEK4 NFS/RDMA server. Red Hat bugzillas 1327280 and 1327554 have been filed to request that RHEL NFS/RDMA server backports include the above fix. Workaround: Replace the "proto=rdma,port=20049" mount options with "proto=tcp" until commit 9d11b51ce7c1 is applied to your NFS server. Signed-off-by: Chuck Lever Tested-by: Steve Wise Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 205b81b5ca9e..0105e65ad17f 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -539,15 +539,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) /* * Chunks needed for results? * - * o Read ops return data as write chunk(s), header as inline. * o If the expected result is under the inline threshold, all ops * return as inline. + * o Large read ops return data as write chunk(s), header as + * inline. * o Large non-read ops return as a single reply chunk. */ - if (rqst->rq_rcv_buf.flags & XDRBUF_READ) - wtype = rpcrdma_writech; - else if (rpcrdma_results_inline(r_xprt, rqst)) + if (rpcrdma_results_inline(r_xprt, rqst)) wtype = rpcrdma_noch; + else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) + wtype = rpcrdma_writech; else wtype = rpcrdma_replych; -- cgit v1.2.3 From 88b18a120332cada6ff4adb9b5b7b6e4bbb653e5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:41:22 -0400 Subject: xprtrdma: Update comments in rpcrdma_marshal_req() Update documenting comments to reflect code changes over the past year. 
Signed-off-by: Chuck Lever Tested-by: Steve Wise Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 0105e65ad17f..c7c9bbbf758c 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -494,13 +494,10 @@ static void rpcrdma_inline_pullup(struct rpc_rqst *rqst) * Marshal a request: the primary job of this routine is to choose * the transfer modes. See comments below. * - * Uses multiple RDMA IOVs for a request: - * [0] -- RPC RDMA header, which uses memory from the *start* of the - * preregistered buffer that already holds the RPC data in - * its middle. - * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. - * [2] -- optional padding. - * [3] -- if padded, header only in [1] and data here. + * Prepares up to two IOVs per Call message: + * + * [0] -- RPC RDMA header + * [1] -- the RPC header/data * * Returns zero on success, otherwise a negative errno. */ @@ -624,13 +621,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) __func__, transfertypes[wtype], hdrlen, rpclen, headerp, base, rdmab_lkey(req->rl_rdmabuf)); - /* - * initialize send_iov's - normally only two: rdma chunk header and - * single preregistered RPC header buffer, but if padding is present, - * then use a preregistered (and zeroed) pad buffer between the RPC - * header and any write data. In all non-rdma cases, any following - * data has been copied into the RPC header buffer. - */ req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); req->rl_send_iov[0].length = hdrlen; req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); -- cgit v1.2.3 From 94f58c58c0b4315542036ce7703adeeaf4764940 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:41:30 -0400 Subject: xprtrdma: Allow Read list and Reply chunk simultaneously rpcrdma_marshal_req() makes a simplifying assumption: that NFS operations with large Call messages have small Reply messages, and vice versa. Therefore with RPC-over-RDMA, only one chunk type is ever needed for each Call/Reply pair, because one direction needs chunks, the other direction will always fit inline. In fact, this assumption is asserted in the code: if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { dprintk("RPC: %s: cannot marshal multiple chunk lists\n", __func__); return -EIO; } But RPCGSS_SEC breaks this assumption. Because krb5i and krb5p perform data transformation on RPC messages before they are transmitted, direct data placement techniques cannot be used, thus RPC messages must be sent via a Long call in both directions. All such calls are sent with a Position Zero Read chunk, and all such replies are handled with a Reply chunk. Thus the client must provide every Call/Reply pair with both a Read list and a Reply chunk. Without any special security in effect, NFSv4 WRITEs may now also use the Read list and provide a Reply chunk. The marshal_req logic was preventing that, meaning an NFSv4 WRITE with a large payload that included a GETATTR result larger than the inline threshold would fail. The code that encodes each chunk list is now completely contained in its own function. There is some code duplication, but the trade-off is that the overall logic should be more clear. Note that all three chunk lists now share the rl_segments array. Some additional per-req accounting is necessary to track this usage. 
For the same reasons that the above simplifying assumption has held true for so long, I don't expect more array elements are needed at this time. Signed-off-by: Chuck Lever Tested-by: Steve Wise Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 327 ++++++++++++++++++++++++++++++++-------- net/sunrpc/xprtrdma/xprt_rdma.h | 5 +- 2 files changed, 272 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c7c9bbbf758c..e80f43d58903 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -62,17 +62,17 @@ enum rpcrdma_chunktype { }; static const char transfertypes[][12] = { - "pure inline", /* no chunks */ - " read chunk", /* some argument via rdma read */ - "*read chunk", /* entire request via rdma read */ - "write chunk", /* some result via rdma write */ + "inline", /* no chunks */ + "read list", /* some argument via rdma read */ + "*read list", /* entire request via rdma read */ + "write list", /* some result via rdma write */ "reply chunk" /* entire reply via rdma write */ }; /* Returns size of largest RPC-over-RDMA header in a Call message * - * The client marshals only one chunk list per Call message. - * The largest list is the Read list. + * The largest Call header contains a full-size Read list and a + * minimal Reply chunk. */ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) { @@ -85,6 +85,11 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) maxsegs += 2; /* segment for head and tail buffers */ size = maxsegs * sizeof(struct rpcrdma_read_chunk); + /* Minimal Read chunk size */ + size += sizeof(__be32); /* segment count */ + size += sizeof(struct rpcrdma_segment); + size += sizeof(__be32); /* list discriminator */ + dprintk("RPC: %s: max call header size = %u\n", __func__, size); return size; @@ -431,6 +436,209 @@ out: return n; } +static inline __be32 * +xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg) +{ + *iptr++ = cpu_to_be32(seg->mr_rkey); + *iptr++ = cpu_to_be32(seg->mr_len); + return xdr_encode_hyper(iptr, seg->mr_base); +} + +/* XDR-encode the Read list. Supports encoding a list of read + * segments that belong to a single read chunk. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * + * Read chunklist (a linked list): + * N elements, position P (same P for all chunks of same arg!): + * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 + * + * Returns a pointer to the XDR word in the RDMA header following + * the end of the Read list, or an error pointer. + */ +static __be32 * +rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, struct rpc_rqst *rqst, + __be32 *iptr, enum rpcrdma_chunktype rtype) +{ + struct rpcrdma_mr_seg *seg = req->rl_nextseg; + unsigned int pos; + int n, nsegs; + + if (rtype == rpcrdma_noch) { + *iptr++ = xdr_zero; /* item not present */ + return iptr; + } + + pos = rqst->rq_snd_buf.head[0].iov_len; + if (rtype == rpcrdma_areadch) + pos = 0; + nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, + RPCRDMA_MAX_SEGS - req->rl_nchunks); + if (nsegs < 0) + return ERR_PTR(nsegs); + + do { + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false); + if (n <= 0) + return ERR_PTR(n); + + *iptr++ = xdr_one; /* item present */ + + /* All read segments in this chunk + * have the same "position". 
+ */ + *iptr++ = cpu_to_be32(pos); + iptr = xdr_encode_rdma_segment(iptr, seg); + + dprintk("RPC: %5u %s: read segment pos %u " + "%d@0x%016llx:0x%08x (%s)\n", + rqst->rq_task->tk_pid, __func__, pos, + seg->mr_len, (unsigned long long)seg->mr_base, + seg->mr_rkey, n < nsegs ? "more" : "last"); + + r_xprt->rx_stats.read_chunk_count++; + req->rl_nchunks++; + seg += n; + nsegs -= n; + } while (nsegs); + req->rl_nextseg = seg; + + /* Finish Read list */ + *iptr++ = xdr_zero; /* Next item not present */ + return iptr; +} + +/* XDR-encode the Write list. Supports encoding a list containing + * one array of plain segments that belong to a single write chunk. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * + * Write chunklist (a list of (one) counted array): + * N elements: + * 1 - N - HLOO - HLOO - ... - HLOO - 0 + * + * Returns a pointer to the XDR word in the RDMA header following + * the end of the Write list, or an error pointer. + */ +static __be32 * +rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, __be32 *iptr, + enum rpcrdma_chunktype wtype) +{ + struct rpcrdma_mr_seg *seg = req->rl_nextseg; + int n, nsegs, nchunks; + __be32 *segcount; + + if (wtype != rpcrdma_writech) { + *iptr++ = xdr_zero; /* no Write list present */ + return iptr; + } + + nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, + rqst->rq_rcv_buf.head[0].iov_len, + wtype, seg, + RPCRDMA_MAX_SEGS - req->rl_nchunks); + if (nsegs < 0) + return ERR_PTR(nsegs); + + *iptr++ = xdr_one; /* Write list present */ + segcount = iptr++; /* save location of segment count */ + + nchunks = 0; + do { + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); + if (n <= 0) + return ERR_PTR(n); + + iptr = xdr_encode_rdma_segment(iptr, seg); + + dprintk("RPC: %5u %s: write segment " + "%d@0x016%llx:0x%08x (%s)\n", + rqst->rq_task->tk_pid, __func__, + seg->mr_len, (unsigned long long)seg->mr_base, + seg->mr_rkey, n < nsegs ? "more" : "last"); + + r_xprt->rx_stats.write_chunk_count++; + r_xprt->rx_stats.total_rdma_request += seg->mr_len; + req->rl_nchunks++; + nchunks++; + seg += n; + nsegs -= n; + } while (nsegs); + req->rl_nextseg = seg; + + /* Update count of segments in this Write chunk */ + *segcount = cpu_to_be32(nchunks); + + /* Finish Write list */ + *iptr++ = xdr_zero; /* Next item not present */ + return iptr; +} + +/* XDR-encode the Reply chunk. Supports encoding an array of plain + * segments that belong to a single write (reply) chunk. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * + * Reply chunk (a counted array): + * N elements: + * 1 - N - HLOO - HLOO - ... - HLOO + * + * Returns a pointer to the XDR word in the RDMA header following + * the end of the Reply chunk, or an error pointer. 
+ */ +static __be32 * +rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, struct rpc_rqst *rqst, + __be32 *iptr, enum rpcrdma_chunktype wtype) +{ + struct rpcrdma_mr_seg *seg = req->rl_nextseg; + int n, nsegs, nchunks; + __be32 *segcount; + + if (wtype != rpcrdma_replych) { + *iptr++ = xdr_zero; /* no Reply chunk present */ + return iptr; + } + + nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, + RPCRDMA_MAX_SEGS - req->rl_nchunks); + if (nsegs < 0) + return ERR_PTR(nsegs); + + *iptr++ = xdr_one; /* Reply chunk present */ + segcount = iptr++; /* save location of segment count */ + + nchunks = 0; + do { + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); + if (n <= 0) + return ERR_PTR(n); + + iptr = xdr_encode_rdma_segment(iptr, seg); + + dprintk("RPC: %5u %s: reply segment " + "%d@0x%016llx:0x%08x (%s)\n", + rqst->rq_task->tk_pid, __func__, + seg->mr_len, (unsigned long long)seg->mr_base, + seg->mr_rkey, n < nsegs ? "more" : "last"); + + r_xprt->rx_stats.reply_chunk_count++; + r_xprt->rx_stats.total_rdma_request += seg->mr_len; + req->rl_nchunks++; + nchunks++; + seg += n; + nsegs -= n; + } while (nsegs); + req->rl_nextseg = seg; + + /* Update count of segments in the Reply chunk */ + *segcount = cpu_to_be32(nchunks); + + return iptr; +} + /* * Copy write data inline. * This function is used for "small" requests. Data which is passed @@ -508,24 +716,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - char *base; - size_t rpclen; - ssize_t hdrlen; enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; + unsigned int pos; + ssize_t hdrlen; + size_t rpclen; + __be32 *iptr; #if defined(CONFIG_SUNRPC_BACKCHANNEL) if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) return rpcrdma_bc_marshal_reply(rqst); #endif - /* - * rpclen gets amount of data in first buffer, which is the - * pre-registered buffer. - */ - base = rqst->rq_svec[0].iov_base; - rpclen = rqst->rq_svec[0].iov_len; - headerp = rdmab_to_msg(req->rl_rdmabuf); /* don't byte-swap XID, it's already done in request */ headerp->rm_xid = rqst->rq_xid; @@ -565,8 +767,12 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) */ if (rpcrdma_args_inline(r_xprt, rqst)) { rtype = rpcrdma_noch; + rpcrdma_inline_pullup(rqst); + rpclen = rqst->rq_svec[0].iov_len; } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { rtype = rpcrdma_readch; + rpclen = rqst->rq_svec[0].iov_len; + rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); } else { r_xprt->rx_stats.nomsg_call_count++; headerp->rm_type = htonl(RDMA_NOMSG); @@ -574,52 +780,49 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) rpclen = 0; } - /* The following simplification is not true forever */ - if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) - wtype = rpcrdma_noch; - if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { - dprintk("RPC: %s: cannot marshal multiple chunk lists\n", - __func__); - return -EIO; - } - - hdrlen = RPCRDMA_HDRLEN_MIN; - - /* - * Pull up any extra send data into the preregistered buffer. - * When padding is in use and applies to the transfer, insert - * it and change the message type. 
+ /* This implementation supports the following combinations + * of chunk lists in one RPC-over-RDMA Call message: + * + * - Read list + * - Write list + * - Reply chunk + * - Read list + Reply chunk + * + * It might not yet support the following combinations: + * + * - Read list + Write list + * + * It does not support the following combinations: + * + * - Write list + Reply chunk + * - Read list + Write list + Reply chunk + * + * This implementation supports only a single chunk in each + * Read or Write list. Thus for example the client cannot + * send a Call message with a Position Zero Read chunk and a + * regular Read chunk at the same time. */ - if (rtype == rpcrdma_noch) { - - rpcrdma_inline_pullup(rqst); - - headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; - headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; - headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; - /* new length after pullup */ - rpclen = rqst->rq_svec[0].iov_len; - } else if (rtype == rpcrdma_readch) - rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); - if (rtype != rpcrdma_noch) { - hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, - headerp, rtype); - wtype = rtype; /* simplify dprintk */ - - } else if (wtype != rpcrdma_noch) { - hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, - headerp, wtype); - } - if (hdrlen < 0) - return hdrlen; + req->rl_nchunks = 0; + req->rl_nextseg = req->rl_segments; + iptr = headerp->rm_body.rm_chunks; + iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); + if (IS_ERR(iptr)) + goto out_unmap; + iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype); + if (IS_ERR(iptr)) + goto out_unmap; + iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype); + if (IS_ERR(iptr)) + goto out_unmap; + hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) goto out_overflow; - dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd" - " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[wtype], hdrlen, rpclen, - headerp, base, rdmab_lkey(req->rl_rdmabuf)); + dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", + rqst->rq_task->tk_pid, __func__, + transfertypes[rtype], transfertypes[wtype], + hdrlen, rpclen); req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); req->rl_send_iov[0].length = hdrlen; @@ -637,12 +840,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) return 0; out_overflow: - pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s\n", - hdrlen, rpclen, transfertypes[wtype]); + pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", + hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); /* Terminate this RPC. Chunks registered above will be * released by xprt_release -> xprt_rmda_free . 
*/ return -EIO; + +out_unmap: + for (pos = 0; req->rl_nchunks--;) + pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, + &req->rl_segments[pos]); + return PTR_ERR(iptr); } /* diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 4349e03069c7..61999b694a15 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -184,7 +184,9 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) */ #define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE) -#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ + +/* data segments + head/tail for Call + head/tail for Reply */ +#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4) struct rpcrdma_buffer; @@ -298,6 +300,7 @@ struct rpcrdma_req { struct rpcrdma_regbuf *rl_rdmabuf; struct rpcrdma_regbuf *rl_sendbuf; struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; + struct rpcrdma_mr_seg *rl_nextseg; struct ib_cqe rl_cqe; struct list_head rl_all; -- cgit v1.2.3 From 3c19409b3d5173cf13adcf53b6423dc139994fc3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:41:39 -0400 Subject: xprtrdma: Remove rpcrdma_create_chunks() rpcrdma_create_chunks() has been replaced, and can be removed. Signed-off-by: Chuck Lever Tested-by: Steve Wise Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 151 ----------------------------------------- 1 file changed, 151 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index e80f43d58903..9ebaf797bdef 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -285,157 +285,6 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, return n; } -/* - * Create read/write chunk lists, and reply chunks, for RDMA - * - * Assume check against THRESHOLD has been done, and chunks are required. - * Assume only encoding one list entry for read|write chunks. The NFSv3 - * protocol is simple enough to allow this as it only has a single "bulk - * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The - * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.) - * - * When used for a single reply chunk (which is a special write - * chunk used for the entire reply, rather than just the data), it - * is used primarily for READDIR and READLINK which would otherwise - * be severely size-limited by a small rdma inline read max. The server - * response will come back as an RDMA Write, followed by a message - * of type RDMA_NOMSG carrying the xid and length. As a result, reply - * chunks do not provide data alignment, however they do not require - * "fixup" (moving the response to the upper layer buffer) either. - * - * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): - * - * Read chunklist (a linked list): - * N elements, position P (same P for all chunks of same arg!): - * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 - * - * Write chunklist (a list of (one) counted array): - * N elements: - * 1 - N - HLOO - HLOO - ... - HLOO - 0 - * - * Reply chunk (a counted array): - * N elements: - * 1 - N - HLOO - HLOO - ... - HLOO - * - * Returns positive RPC/RDMA header size, or negative errno. 
- */ - -static ssize_t -rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, - struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) -{ - struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); - int n, nsegs, nchunks = 0; - unsigned int pos; - struct rpcrdma_mr_seg *seg = req->rl_segments; - struct rpcrdma_read_chunk *cur_rchunk = NULL; - struct rpcrdma_write_array *warray = NULL; - struct rpcrdma_write_chunk *cur_wchunk = NULL; - __be32 *iptr = headerp->rm_body.rm_chunks; - int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); - - if (type == rpcrdma_readch || type == rpcrdma_areadch) { - /* a read chunk - server will RDMA Read our memory */ - cur_rchunk = (struct rpcrdma_read_chunk *) iptr; - } else { - /* a write or reply chunk - server will RDMA Write our memory */ - *iptr++ = xdr_zero; /* encode a NULL read chunk list */ - if (type == rpcrdma_replych) - *iptr++ = xdr_zero; /* a NULL write chunk list */ - warray = (struct rpcrdma_write_array *) iptr; - cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1); - } - - if (type == rpcrdma_replych || type == rpcrdma_areadch) - pos = 0; - else - pos = target->head[0].iov_len; - - nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); - if (nsegs < 0) - return nsegs; - - map = r_xprt->rx_ia.ri_ops->ro_map; - do { - n = map(r_xprt, seg, nsegs, cur_wchunk != NULL); - if (n <= 0) - goto out; - if (cur_rchunk) { /* read */ - cur_rchunk->rc_discrim = xdr_one; - /* all read chunks have the same "position" */ - cur_rchunk->rc_position = cpu_to_be32(pos); - cur_rchunk->rc_target.rs_handle = - cpu_to_be32(seg->mr_rkey); - cur_rchunk->rc_target.rs_length = - cpu_to_be32(seg->mr_len); - xdr_encode_hyper( - (__be32 *)&cur_rchunk->rc_target.rs_offset, - seg->mr_base); - dprintk("RPC: %s: read chunk " - "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__, - seg->mr_len, (unsigned long long)seg->mr_base, - seg->mr_rkey, pos, n < nsegs ? "more" : "last"); - cur_rchunk++; - r_xprt->rx_stats.read_chunk_count++; - } else { /* write/reply */ - cur_wchunk->wc_target.rs_handle = - cpu_to_be32(seg->mr_rkey); - cur_wchunk->wc_target.rs_length = - cpu_to_be32(seg->mr_len); - xdr_encode_hyper( - (__be32 *)&cur_wchunk->wc_target.rs_offset, - seg->mr_base); - dprintk("RPC: %s: %s chunk " - "elem %d@0x%llx:0x%x (%s)\n", __func__, - (type == rpcrdma_replych) ? "reply" : "write", - seg->mr_len, (unsigned long long)seg->mr_base, - seg->mr_rkey, n < nsegs ? "more" : "last"); - cur_wchunk++; - if (type == rpcrdma_replych) - r_xprt->rx_stats.reply_chunk_count++; - else - r_xprt->rx_stats.write_chunk_count++; - r_xprt->rx_stats.total_rdma_request += seg->mr_len; - } - nchunks++; - seg += n; - nsegs -= n; - } while (nsegs); - - /* success. all failures return above */ - req->rl_nchunks = nchunks; - - /* - * finish off header. If write, marshal discrim and nchunks. - */ - if (cur_rchunk) { - iptr = (__be32 *) cur_rchunk; - *iptr++ = xdr_zero; /* finish the read chunk list */ - *iptr++ = xdr_zero; /* encode a NULL write chunk list */ - *iptr++ = xdr_zero; /* encode a NULL reply chunk */ - } else { - warray->wc_discrim = xdr_one; - warray->wc_nchunks = cpu_to_be32(nchunks); - iptr = (__be32 *) cur_wchunk; - if (type == rpcrdma_writech) { - *iptr++ = xdr_zero; /* finish the write chunk list */ - *iptr++ = xdr_zero; /* encode a NULL reply chunk */ - } - } - - /* - * Return header size. 
- */ - return (unsigned char *)iptr - (unsigned char *)headerp; - -out: - for (pos = 0; nchunks--;) - pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, - &req->rl_segments[pos]); - return n; -} - static inline __be32 * xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg) { -- cgit v1.2.3 From 550d7502cf66ccc38bfa4c99f6526e402f918d98 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:41:47 -0400 Subject: xprtrdma: Use core ib_drain_qp() API Clean up: Replace rpcrdma_flush_cqs() and rpcrdma_clean_cqs() with the new ib_drain_qp() API. Signed-off-by: Chuck Lever Reviewed-By: Leon Romanovsky Tested-by: Steve Wise Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 41 ++++++----------------------------------- 1 file changed, 6 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 9f8d6c1dc7c6..b7a5bc1341b4 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -203,15 +203,6 @@ out_fail: goto out_schedule; } -static void -rpcrdma_flush_cqs(struct rpcrdma_ep *ep) -{ - struct ib_wc wc; - - while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0) - rpcrdma_receive_wc(NULL, &wc); -} - static int rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) { @@ -373,23 +364,6 @@ out: return ERR_PTR(rc); } -/* - * Drain any cq, prior to teardown. - */ -static void -rpcrdma_clean_cq(struct ib_cq *cq) -{ - struct ib_wc wc; - int count = 0; - - while (1 == ib_poll_cq(cq, 1, &wc)) - ++count; - - if (count) - dprintk("RPC: %s: flushed %d events (last 0x%x)\n", - __func__, count, wc.opcode); -} - /* * Exported functions. */ @@ -515,7 +489,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, __func__); return -ENOMEM; } - max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS; + max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1; /* check provider's send/recv wr limits */ if (cdata->max_requests > max_qp_wr) @@ -526,11 +500,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_send_wr += 1; /* drain cqe */ rc = ia->ri_ops->ro_open(ia, ep, cdata); if (rc) return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */ ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS; ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; @@ -622,13 +598,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) cancel_delayed_work_sync(&ep->rep_connect_worker); - if (ia->ri_id->qp) - rpcrdma_ep_disconnect(ep, ia); - - rpcrdma_clean_cq(ep->rep_attr.recv_cq); - rpcrdma_clean_cq(ep->rep_attr.send_cq); - if (ia->ri_id->qp) { + rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } @@ -659,7 +630,6 @@ retry: dprintk("RPC: %s: reconnecting...\n", __func__); rpcrdma_ep_disconnect(ep, ia); - rpcrdma_flush_cqs(ep); xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); id = rpcrdma_create_id(xprt, ia, @@ -785,7 +755,6 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { int rc; - rpcrdma_flush_cqs(ep); rc = rdma_disconnect(ia->ri_id); if (!rc) { /* returns without wait if not connected */ @@ -797,6 +766,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 
dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); ep->rep_connected = rc; } + + ib_drain_qp(ia->ri_id->qp); } struct rpcrdma_req * -- cgit v1.2.3 From 55fdfce101a0afe7bb9da17b4edbee049ae1c18d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:41:56 -0400 Subject: xprtrdma: Rename rpcrdma_frwr::sg and sg_nents Clean up: Follow same naming convention as other fields in struct rpcrdma_frwr. Signed-off-by: Chuck Lever Tested-by: Steve Wise Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 36 ++++++++++++++++++------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++-- 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 41e02e7d9b4c..7c22e9e9567c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -152,11 +152,11 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, if (IS_ERR(f->fr_mr)) goto out_mr_err; - f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL); - if (!f->sg) + f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL); + if (!f->fr_sg) goto out_list_err; - sg_init_table(f->sg, depth); + sg_init_table(f->fr_sg, depth); init_completion(&f->fr_linv_done); @@ -185,7 +185,7 @@ __frwr_release(struct rpcrdma_mw *r) if (rc) dprintk("RPC: %s: ib_dereg_mr status %i\n", __func__, rc); - kfree(r->frmr.sg); + kfree(r->frmr.fr_sg); } static int @@ -399,12 +399,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, for (i = 0; i < nsegs;) { if (seg->mr_page) - sg_set_page(&frmr->sg[i], + sg_set_page(&frmr->fr_sg[i], seg->mr_page, seg->mr_len, offset_in_page(seg->mr_offset)); else - sg_set_buf(&frmr->sg[i], seg->mr_offset, + sg_set_buf(&frmr->fr_sg[i], seg->mr_offset, seg->mr_len); ++seg; @@ -415,25 +415,25 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - frmr->sg_nents = i; + frmr->fr_nents = i; - dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction); + dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction); if (!dma_nents) { pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n", - __func__, frmr->sg, frmr->sg_nents); + __func__, frmr->fr_sg, frmr->fr_nents); return -ENOMEM; } - n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, PAGE_SIZE); - if (unlikely(n != frmr->sg_nents)) { + n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, PAGE_SIZE); + if (unlikely(n != frmr->fr_nents)) { pr_err("RPC: %s: failed to map mr %p (%u/%u)\n", - __func__, frmr->fr_mr, n, frmr->sg_nents); + __func__, frmr->fr_mr, n, frmr->fr_nents); rc = n < 0 ? 
n : -EINVAL; goto out_senderr; } dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", - __func__, mw, frmr->sg_nents, mr->length); + __func__, mw, frmr->fr_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); @@ -459,14 +459,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, seg1->rl_mw = mw; seg1->mr_rkey = mr->rkey; seg1->mr_base = mr->iova; - seg1->mr_nsegs = frmr->sg_nents; + seg1->mr_nsegs = frmr->fr_nents; seg1->mr_len = mr->length; - return frmr->sg_nents; + return frmr->fr_nents; out_senderr: dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); - ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction); + ib_dma_unmap_sg(device, frmr->fr_sg, dma_nents, direction); __frwr_queue_recovery(mw); return rc; } @@ -500,7 +500,7 @@ __frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, seg->rl_mw = NULL; - ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir); + ib_dma_unmap_sg(device, f->fr_sg, f->fr_nents, seg->mr_dir); if (!rc) rpcrdma_put_mw(r_xprt, mw); @@ -611,7 +611,7 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); - ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); + ib_dma_unmap_sg(ia->ri_device, frmr->fr_sg, frmr->fr_nents, seg1->mr_dir); read_lock(&ia->ri_qplock); rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr); read_unlock(&ia->ri_qplock); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 61999b694a15..612e3dec10ad 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -222,8 +222,8 @@ enum rpcrdma_frmr_state { }; struct rpcrdma_frmr { - struct scatterlist *sg; - int sg_nents; + struct scatterlist *fr_sg; + int fr_nents; struct ib_mr *fr_mr; struct ib_cqe fr_cqe; enum rpcrdma_frmr_state fr_state; -- cgit v1.2.3 From a3aa8b2b84a59ddd5f624aae9ee0f8b3333793e8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:42:04 -0400 Subject: xprtrdma: Save I/O direction in struct rpcrdma_frwr Move the I/O direction field from rpcrdma_mr_seg into the rpcrdma_frmr. This makes it possible to DMA-unmap the frwr long after an RPC has exited and its rpcrdma_mr_seg array has been released and re-used. This might occur if an RPC times out while waiting for a new connection to be established.
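A rough sketch of why the move helps (illustrative only, not code from this series; the fr_sg, fr_nents and fr_dir names follow the diffs below): once the direction travels with the frmr, a deferred unmap needs nothing from the request's rpcrdma_mr_seg array, which may already have been recycled.

/* Sketch: unmap using only state carried in the rpcrdma_frmr itself,
 * so it remains valid after req->rl_segments has been re-used.
 */
static void example_frwr_deferred_unmap(struct ib_device *device,
					struct rpcrdma_frmr *f)
{
	ib_dma_unmap_sg(device, f->fr_sg, f->fr_nents, f->fr_dir);
}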
Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 6 +++--- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 7c22e9e9567c..e1e6ac142d48 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -416,6 +416,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, break; } frmr->fr_nents = i; + frmr->fr_dir = direction; dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction); if (!dma_nents) { @@ -455,7 +456,6 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (rc) goto out_senderr; - seg1->mr_dir = direction; seg1->rl_mw = mw; seg1->mr_rkey = mr->rkey; seg1->mr_base = mr->iova; @@ -500,7 +500,7 @@ __frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, seg->rl_mw = NULL; - ib_dma_unmap_sg(device, f->fr_sg, f->fr_nents, seg->mr_dir); + ib_dma_unmap_sg(device, f->fr_sg, f->fr_nents, f->fr_dir); if (!rc) rpcrdma_put_mw(r_xprt, mw); @@ -611,7 +611,7 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); - ib_dma_unmap_sg(ia->ri_device, frmr->fr_sg, frmr->fr_nents, seg1->mr_dir); + ib_dma_unmap_sg(ia->ri_device, frmr->fr_sg, frmr->fr_nents, frmr->fr_dir); read_lock(&ia->ri_qplock); rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr); read_unlock(&ia->ri_qplock); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 612e3dec10ad..b5793cb59d5c 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -224,6 +224,7 @@ enum rpcrdma_frmr_state { struct rpcrdma_frmr { struct scatterlist *fr_sg; int fr_nents; + enum dma_data_direction fr_dir; struct ib_mr *fr_mr; struct ib_cqe fr_cqe; enum rpcrdma_frmr_state fr_state; -- cgit v1.2.3 From d7a21c1bed54adcf96e1713019a6ad87e90fbbc3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:42:12 -0400 Subject: xprtrdma: Reset MRs in frwr_op_unmap_sync() frwr_op_unmap_sync() is now invoked in a workqueue context, the same as __frwr_queue_recovery(). There's no need to defer MR reset if posting LOCAL_INV MRs fails. This means that even when ib_post_send() fails (which should occur very rarely) the invalidation and DMA unmapping steps are still done in the correct order. 
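The ordering this patch preserves can be summarized in a short sketch (illustrative only; it reuses the __frwr_reset_mr() helper and field names introduced in the diffs below): the MR is fenced first, its pages are DMA-unmapped second, and only then is the MW released for reuse.

/* Sketch of the error-path ordering: fence the MR (here by resetting
 * it), then DMA-unmap its pages, then return the MW to the pool.
 */
static void example_fence_unmap_release(struct rpcrdma_xprt *r_xprt,
					struct rpcrdma_mw *mw)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_frmr *f = &mw->frmr;

	__frwr_reset_mr(ia, mw);	/* the old rkey can no longer be used */
	ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
	rpcrdma_put_mw(r_xprt, mw);	/* only now is the MW safe to reuse */
}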
Signed-off-by: Chuck Lever Tested-by: Steve Wise Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 98 ++++++++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index e1e6ac142d48..ce245dc4acab 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -98,6 +98,32 @@ frwr_destroy_recovery_wq(void) destroy_workqueue(wq); } +static int +__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) +{ + struct rpcrdma_frmr *f = &r->frmr; + int rc; + + rc = ib_dereg_mr(f->fr_mr); + if (rc) { + pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", + rc, r); + return rc; + } + + f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, + ia->ri_max_frmr_depth); + if (IS_ERR(f->fr_mr)) { + pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", + PTR_ERR(f->fr_mr), r); + return PTR_ERR(f->fr_mr); + } + + dprintk("RPC: %s: recovered FRMR %p\n", __func__, r); + f->fr_state = FRMR_IS_INVALID; + return 0; +} + /* Deferred reset of a single FRMR. Generate a fresh rkey by * replacing the MR. * @@ -111,24 +137,15 @@ __frwr_recovery_worker(struct work_struct *work) struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, frmr.fr_work); struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt; - unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; - struct ib_pd *pd = r_xprt->rx_ia.ri_pd; - - if (ib_dereg_mr(r->frmr.fr_mr)) - goto out_fail; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + int rc; - r->frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); - if (IS_ERR(r->frmr.fr_mr)) - goto out_fail; + rc = __frwr_reset_mr(ia, r); + if (rc) + return; - dprintk("RPC: %s: recovered FRMR %p\n", __func__, r); - r->frmr.fr_state = FRMR_IS_INVALID; rpcrdma_put_mw(r_xprt, r); return; - -out_fail: - pr_warn("RPC: %s: FRMR %p unrecovered\n", - __func__, r); } /* A broken MR was discovered in a context that can't sleep. @@ -490,24 +507,6 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) return invalidate_wr; } -static void -__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int rc) -{ - struct ib_device *device = r_xprt->rx_ia.ri_device; - struct rpcrdma_mw *mw = seg->rl_mw; - struct rpcrdma_frmr *f = &mw->frmr; - - seg->rl_mw = NULL; - - ib_dma_unmap_sg(device, f->fr_sg, f->fr_nents, f->fr_dir); - - if (!rc) - rpcrdma_put_mw(r_xprt, mw); - else - __frwr_queue_recovery(mw); -} - /* Invalidate all memory regions that were registered for "req". * * Sleeps until it is safe for the host CPU to access the @@ -521,6 +520,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) struct rpcrdma_mr_seg *seg; unsigned int i, nchunks; struct rpcrdma_frmr *f; + struct rpcrdma_mw *mw; int rc; dprintk("RPC: %s: req %p\n", __func__, req); @@ -561,11 +561,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless ri_id->qp is a valid pointer. 
*/ rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr); - if (rc) { - pr_warn("%s: ib_post_send failed %i\n", __func__, rc); - rdma_disconnect(ia->ri_id); - goto unmap; - } + if (rc) + goto reset_mrs; wait_for_completion(&f->fr_linv_done); @@ -575,14 +572,39 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) unmap: for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { seg = &req->rl_segments[i]; + mw = seg->rl_mw; + seg->rl_mw = NULL; - __frwr_dma_unmap(r_xprt, seg, rc); + ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, + f->fr_dir); + rpcrdma_put_mw(r_xprt, mw); i += seg->mr_nsegs; seg->mr_nsegs = 0; } req->rl_nchunks = 0; + return; + +reset_mrs: + pr_warn("%s: ib_post_send failed %i\n", __func__, rc); + + /* Find and reset the MRs in the LOCAL_INV WRs that did not + * get posted. This is synchronous, and slow. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + mw = seg->rl_mw; + f = &mw->frmr; + + if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { + __frwr_reset_mr(ia, mw); + bad_wr = bad_wr->next; + } + + i += seg->mr_nsegs; + } + goto unmap; } /* Post a LOCAL_INV Work Request to prevent further remote access -- cgit v1.2.3 From 660bb497d0ae0c9e6be5beaff7ba17bfa5c9718c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:42:21 -0400 Subject: xprtrdma: Refactor the FRWR recovery worker Maintain the order of invalidation and DMA unmapping when doing a background MR reset. Signed-off-by: Chuck Lever Tested-by: Steve Wise Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index ce245dc4acab..4e0a5c1abea4 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -124,6 +124,21 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) return 0; } +static void +__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_frmr *f = &mw->frmr; + int rc; + + rc = __frwr_reset_mr(ia, mw); + ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir); + if (rc) + return; + + rpcrdma_put_mw(r_xprt, mw); +} + /* Deferred reset of a single FRMR. Generate a fresh rkey by * replacing the MR. * @@ -136,15 +151,8 @@ __frwr_recovery_worker(struct work_struct *work) { struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, frmr.fr_work); - struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc; - - rc = __frwr_reset_mr(ia, r); - if (rc) - return; - rpcrdma_put_mw(r_xprt, r); + __frwr_reset_and_unmap(r->frmr.fr_xprt, r); return; } @@ -483,7 +491,6 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, out_senderr: dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); - ib_dma_unmap_sg(device, frmr->fr_sg, dma_nents, direction); __frwr_queue_recovery(mw); return rc; } -- cgit v1.2.3 From 766656b022a629201b6e183c7837160cd0919286 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:42:29 -0400 Subject: xprtrdma: Move fr_xprt and fr_worker to struct rpcrdma_mw In a subsequent patch, the fr_xprt and fr_worker fields will be needed by another memory registration mode. Move them into the generic rpcrdma_mw structure that wraps struct rpcrdma_frmr. 
Signed-off-by: Chuck Lever Tested-by: Steve Wise Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 10 +++++----- net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 4e0a5c1abea4..1251a1d4d92f 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -150,9 +150,9 @@ static void __frwr_recovery_worker(struct work_struct *work) { struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, - frmr.fr_work); + mw_work); - __frwr_reset_and_unmap(r->frmr.fr_xprt, r); + __frwr_reset_and_unmap(r->mw_xprt, r); return; } @@ -162,8 +162,8 @@ __frwr_recovery_worker(struct work_struct *work) static void __frwr_queue_recovery(struct rpcrdma_mw *r) { - INIT_WORK(&r->frmr.fr_work, __frwr_recovery_worker); - queue_work(frwr_recovery_wq, &r->frmr.fr_work); + INIT_WORK(&r->mw_work, __frwr_recovery_worker); + queue_work(frwr_recovery_wq, &r->mw_work); } static int @@ -378,9 +378,9 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt) return rc; } + r->mw_xprt = r_xprt; list_add(&r->mw_list, &buf->rb_mws); list_add(&r->mw_all, &buf->rb_all); - r->frmr.fr_xprt = r_xprt; } return 0; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index b5793cb59d5c..97c90a8f5e01 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -229,8 +229,6 @@ struct rpcrdma_frmr { struct ib_cqe fr_cqe; enum rpcrdma_frmr_state fr_state; struct completion fr_linv_done; - struct work_struct fr_work; - struct rpcrdma_xprt *fr_xprt; union { struct ib_reg_wr fr_regwr; struct ib_send_wr fr_invwr; @@ -247,6 +245,8 @@ struct rpcrdma_mw { struct rpcrdma_fmr fmr; struct rpcrdma_frmr frmr; }; + struct work_struct mw_work; + struct rpcrdma_xprt *mw_xprt; struct list_head mw_list; struct list_head mw_all; }; -- cgit v1.2.3 From 763bc230b63c9106fd0b0a75ba5052fe3d68b786 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:42:38 -0400 Subject: xprtrdma: Refactor __fmr_dma_unmap() Separate the DMA unmap operation from freeing the MW. In a subsequent patch they will not always be done at the same time, and they are not related operations (except by order; freeing the MW must be the last step during invalidation). Signed-off-by: Chuck Lever Tested-by: Steve Wise Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 009694b0c56e..9d50f3a5732a 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -186,15 +186,10 @@ static void __fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) { struct ib_device *device = r_xprt->rx_ia.ri_device; - struct rpcrdma_mw *mw = seg->rl_mw; int nsegs = seg->mr_nsegs; - seg->rl_mw = NULL; - while (nsegs--) rpcrdma_unmap_one(device, seg++); - - rpcrdma_put_mw(r_xprt, mw); } /* Invalidate all memory regions that were registered for "req". 
@@ -237,9 +232,11 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) seg = &req->rl_segments[i]; __fmr_dma_unmap(r_xprt, seg); + rpcrdma_put_mw(r_xprt, seg->rl_mw); i += seg->mr_nsegs; seg->mr_nsegs = 0; + seg->rl_mw = NULL; } req->rl_nchunks = 0; -- cgit v1.2.3 From ead3f26e359e12ac8d90baff8ed399b85e82fe5b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:42:46 -0400 Subject: xprtrdma: Add ro_unmap_safe memreg method There needs to be a safe method of releasing registered memory resources when an RPC terminates. Safe can mean a number of things: + Doesn't have to sleep + Doesn't rely on having a QP in RTS ro_unmap_safe will be that safe method. It can be used in cases where synchronous memory invalidation can deadlock, or needs to have an active QP. The important case is fencing an RPC's memory regions after it is signaled (^C) and before it exits. If this is not done, there is a window where the server can write an RPC reply into memory that the client has released and re-used for some other purpose. Note that this is a full solution for FRWR, but FMR and physical still have some gaps where a particularly bad server can wreak some havoc on the client. These gaps are not made worse by this patch and are expected to be exceptionally rare and timing-based. They are noted in documenting comments. Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 105 +++++++++++++++++++++++++++++++++---- net/sunrpc/xprtrdma/frwr_ops.c | 27 ++++++++++ net/sunrpc/xprtrdma/physical_ops.c | 20 +++++++ net/sunrpc/xprtrdma/rpc_rdma.c | 5 +- net/sunrpc/xprtrdma/transport.c | 9 ++-- net/sunrpc/xprtrdma/xprt_rdma.h | 3 ++ 6 files changed, 150 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 9d50f3a5732a..a658dcffba71 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -35,6 +35,64 @@ /* Maximum scatter/gather per FMR */ #define RPCRDMA_MAX_FMR_SGES (64) +static struct workqueue_struct *fmr_recovery_wq; + +#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND) + +int +fmr_alloc_recovery_wq(void) +{ + fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0); + return !fmr_recovery_wq ? -ENOMEM : 0; +} + +void +fmr_destroy_recovery_wq(void) +{ + struct workqueue_struct *wq; + + if (!fmr_recovery_wq) + return; + + wq = fmr_recovery_wq; + fmr_recovery_wq = NULL; + destroy_workqueue(wq); +} + +static int +__fmr_unmap(struct rpcrdma_mw *mw) +{ + LIST_HEAD(l); + + list_add(&mw->fmr.fmr->list, &l); + return ib_unmap_fmr(&l); +} + +/* Deferred reset of a single FMR. Generate a fresh rkey by + * replacing the MR. There's no recovery if this fails. + */ +static void +__fmr_recovery_worker(struct work_struct *work) +{ + struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw, + mw_work); + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + + __fmr_unmap(mw); + rpcrdma_put_mw(r_xprt, mw); + return; +} + +/* A broken MR was discovered in a context that can't sleep. + * Defer recovery to the recovery worker. 
+ */ +static void +__fmr_queue_recovery(struct rpcrdma_mw *mw) +{ + INIT_WORK(&mw->mw_work, __fmr_recovery_worker); + queue_work(fmr_recovery_wq, &mw->mw_work); +} + static int fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) @@ -92,6 +150,7 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt) if (IS_ERR(r->fmr.fmr)) goto out_fmr_err; + r->mw_xprt = r_xprt; list_add(&r->mw_list, &buf->rb_mws); list_add(&r->mw_all, &buf->rb_all); } @@ -107,15 +166,6 @@ out: return rc; } -static int -__fmr_unmap(struct rpcrdma_mw *r) -{ - LIST_HEAD(l); - - list_add(&r->fmr.fmr->list, &l); - return ib_unmap_fmr(&l); -} - /* Use the ib_map_phys_fmr() verb to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ @@ -242,6 +292,42 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) req->rl_nchunks = 0; } +/* Use a slow, safe mechanism to invalidate all memory regions + * that were registered for "req". + * + * In the asynchronous case, DMA unmapping occurs first here + * because the rpcrdma_mr_seg is released immediately after this + * call. It's contents won't be available in __fmr_dma_unmap later. + * FIXME. + */ +static void +fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + bool sync) +{ + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mw *mw; + unsigned int i; + + for (i = 0; req->rl_nchunks; req->rl_nchunks--) { + seg = &req->rl_segments[i]; + mw = seg->rl_mw; + + if (sync) { + /* ORDER */ + __fmr_unmap(mw); + __fmr_dma_unmap(r_xprt, seg); + rpcrdma_put_mw(r_xprt, mw); + } else { + __fmr_dma_unmap(r_xprt, seg); + __fmr_queue_recovery(mw); + } + + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + seg->rl_mw = NULL; + } +} + /* Use the ib_unmap_fmr() verb to prevent further remote * access via RDMA READ or RDMA WRITE. */ @@ -295,6 +381,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_unmap_sync = fmr_op_unmap_sync, + .ro_unmap_safe = fmr_op_unmap_safe, .ro_unmap = fmr_op_unmap, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 1251a1d4d92f..79ba32373b15 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -614,6 +614,32 @@ reset_mrs: goto unmap; } +/* Use a slow, safe mechanism to invalidate all memory regions + * that were registered for "req". + */ +static void +frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + bool sync) +{ + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mw *mw; + unsigned int i; + + for (i = 0; req->rl_nchunks; req->rl_nchunks--) { + seg = &req->rl_segments[i]; + mw = seg->rl_mw; + + if (sync) + __frwr_reset_and_unmap(r_xprt, mw); + else + __frwr_queue_recovery(mw); + + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + seg->rl_mw = NULL; + } +} + /* Post a LOCAL_INV Work Request to prevent further remote access * via RDMA READ or RDMA WRITE. 
*/ @@ -675,6 +701,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, .ro_unmap_sync = frwr_op_unmap_sync, + .ro_unmap_safe = frwr_op_unmap_safe, .ro_unmap = frwr_op_unmap, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 2dc6ec2b006a..95ef3a71f086 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -97,6 +97,25 @@ physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) rpcrdma_unmap_one(device, &req->rl_segments[i++]); } +/* Use a slow, safe mechanism to invalidate all memory regions + * that were registered for "req". + * + * For physical memory registration, there is no good way to + * fence a single MR that has been advertised to the server. The + * client has already handed the server an R_key that cannot be + * invalidated and is shared by all MRs on this connection. + * Tearing down the PD might be the only safe choice, but it's + * not clear that a freshly acquired DMA R_key would be different + * than the one used by the PD that was just destroyed. + * FIXME. + */ +static void +physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + bool sync) +{ + physical_op_unmap_sync(r_xprt, req); +} + static void physical_op_destroy(struct rpcrdma_buffer *buf) { @@ -105,6 +124,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_map = physical_op_map, .ro_unmap_sync = physical_op_unmap_sync, + .ro_unmap_safe = physical_op_unmap_safe, .ro_unmap = physical_op_unmap, .ro_open = physical_op_open, .ro_maxpages = physical_op_maxpages, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 9ebaf797bdef..35a81096e83d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -567,7 +567,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpcrdma_req *req = rpcr_to_rdmar(rqst); enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; - unsigned int pos; ssize_t hdrlen; size_t rpclen; __be32 *iptr; @@ -697,9 +696,7 @@ out_overflow: return -EIO; out_unmap: - for (pos = 0; req->rl_nchunks--;) - pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, - &req->rl_segments[pos]); + r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); return PTR_ERR(iptr); } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 16595ff91994..99d2e5b72726 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -514,6 +514,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ + req->rl_task = task; return req->rl_sendbuf->rg_base; out_rdmabuf: @@ -570,7 +571,6 @@ xprt_rdma_free(void *buffer) struct rpcrdma_req *req; struct rpcrdma_xprt *r_xprt; struct rpcrdma_regbuf *rb; - int i; if (buffer == NULL) return; @@ -584,11 +584,8 @@ xprt_rdma_free(void *buffer) dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - for (i = 0; req->rl_nchunks;) { - --req->rl_nchunks; - i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, - &req->rl_segments[i]); - } + r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, + !RPC_IS_ASYNC(req->rl_task)); rpcrdma_buffer_put(req); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 
97c90a8f5e01..59b647eefc99 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -295,6 +295,7 @@ struct rpcrdma_req { unsigned int rl_niovs; unsigned int rl_nchunks; unsigned int rl_connect_cookie; + struct rpc_task *rl_task; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; @@ -400,6 +401,8 @@ struct rpcrdma_memreg_ops { struct rpcrdma_req *); int (*ro_unmap)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *); + void (*ro_unmap_safe)(struct rpcrdma_xprt *, + struct rpcrdma_req *, bool); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); -- cgit v1.2.3 From 0b043b9fb5dabcb6f187136cc685b26a7f8bcdb1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:42:54 -0400 Subject: xprtrdma: Remove ro_unmap() from all registration modes Clean up: The ro_unmap method is no longer used. Signed-off-by: Chuck Lever Tested-by: Steve Wise Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 31 --------------------------- net/sunrpc/xprtrdma/frwr_ops.c | 43 -------------------------------------- net/sunrpc/xprtrdma/physical_ops.c | 12 ----------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 -- 4 files changed, 88 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index a658dcffba71..6326ebe8b595 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -328,36 +328,6 @@ fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, } } -/* Use the ib_unmap_fmr() verb to prevent further remote - * access via RDMA READ or RDMA WRITE. - */ -static int -fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mr_seg *seg1 = seg; - struct rpcrdma_mw *mw = seg1->rl_mw; - int rc, nsegs = seg->mr_nsegs; - - dprintk("RPC: %s: FMR %p\n", __func__, mw); - - seg1->rl_mw = NULL; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia->ri_device, seg++); - rc = __fmr_unmap(mw); - if (rc) - goto out_err; - rpcrdma_put_mw(r_xprt, mw); - return nsegs; - -out_err: - /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy - * will attempt to release it when the transport is destroyed. - */ - dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc); - return nsegs; -} - static void fmr_op_destroy(struct rpcrdma_buffer *buf) { @@ -382,7 +352,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_unmap_sync = fmr_op_unmap_sync, .ro_unmap_safe = fmr_op_unmap_safe, - .ro_unmap = fmr_op_unmap, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, .ro_init = fmr_op_init, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 79ba32373b15..a192b91ad67e 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -640,48 +640,6 @@ frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, } } -/* Post a LOCAL_INV Work Request to prevent further remote access - * via RDMA READ or RDMA WRITE. 
- */ -static int -frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mw *mw = seg1->rl_mw; - struct rpcrdma_frmr *frmr = &mw->frmr; - struct ib_send_wr *invalidate_wr, *bad_wr; - int rc, nsegs = seg->mr_nsegs; - - dprintk("RPC: %s: FRMR %p\n", __func__, mw); - - seg1->rl_mw = NULL; - frmr->fr_state = FRMR_IS_INVALID; - invalidate_wr = &mw->frmr.fr_invwr; - - memset(invalidate_wr, 0, sizeof(*invalidate_wr)); - frmr->fr_cqe.done = frwr_wc_localinv; - invalidate_wr->wr_cqe = &frmr->fr_cqe; - invalidate_wr->opcode = IB_WR_LOCAL_INV; - invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - ib_dma_unmap_sg(ia->ri_device, frmr->fr_sg, frmr->fr_nents, frmr->fr_dir); - read_lock(&ia->ri_qplock); - rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr); - read_unlock(&ia->ri_qplock); - if (rc) - goto out_err; - - rpcrdma_put_mw(r_xprt, mw); - return nsegs; - -out_err: - dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); - __frwr_queue_recovery(mw); - return nsegs; -} - static void frwr_op_destroy(struct rpcrdma_buffer *buf) { @@ -702,7 +660,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, .ro_unmap_sync = frwr_op_unmap_sync, .ro_unmap_safe = frwr_op_unmap_safe, - .ro_unmap = frwr_op_unmap, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, .ro_init = frwr_op_init, diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 95ef3a71f086..3750596cc432 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -74,17 +74,6 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, return 1; } -/* Unmap a memory region, but leave it registered. - */ -static int -physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - - rpcrdma_unmap_one(ia->ri_device, seg); - return 1; -} - /* DMA unmap all memory regions that were mapped for "req". */ static void @@ -125,7 +114,6 @@ const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_map = physical_op_map, .ro_unmap_sync = physical_op_unmap_sync, .ro_unmap_safe = physical_op_unmap_safe, - .ro_unmap = physical_op_unmap, .ro_open = physical_op_open, .ro_maxpages = physical_op_maxpages, .ro_init = physical_op_init, diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 59b647eefc99..512bbdcc7d76 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -399,8 +399,6 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mr_seg *, int, bool); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct rpcrdma_req *); - int (*ro_unmap)(struct rpcrdma_xprt *, - struct rpcrdma_mr_seg *); void (*ro_unmap_safe)(struct rpcrdma_xprt *, struct rpcrdma_req *, bool); int (*ro_open)(struct rpcrdma_ia *, -- cgit v1.2.3 From b2dde94bfa374b9e851756ab8191209f1a604e0a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 May 2016 14:43:03 -0400 Subject: xprtrdma: Faster server reboot recovery In a cluster failover scenario, it is desirable for the client to attempt to reconnect quickly, as an alternate NFS server is already waiting to take over for the down server. The client can't see that a server IP address has moved to a new server until the existing connection is gone. 
For fabrics and devices where it is meaningful, set a definite upper bound on the amount of time before it is determined that a connection is no longer valid. This allows the RPC client to detect connection loss in a timely manner, then perform a fresh resolution of the server GUID in case it has changed (cluster failover). Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b7a5bc1341b4..be66f658308c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -554,6 +554,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.recv_cq = recvcq; /* Initialize cma parameters */ + memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); /* RPC/RDMA does not use private data */ ep->rep_remote_cma.private_data = NULL; @@ -567,7 +568,16 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_remote_cma.responder_resources = ia->ri_device->attrs.max_qp_rd_atom; - ep->rep_remote_cma.retry_count = 7; + /* Limit transport retries so client can detect server + * GID changes quickly. RPC layer handles re-establishing + * transport connection and retransmission. + */ + ep->rep_remote_cma.retry_count = 6; + + /* RPC-over-RDMA handles its own flow control. In addition, + * make all RNR NAKs visible so we know that RPC-over-RDMA + * flow control is working correctly (no NAKs should be seen). + */ ep->rep_remote_cma.flow_control = 0; ep->rep_remote_cma.rnr_retry_count = 0; -- cgit v1.2.3 From 6e14a92c363e1f42c6e1339da8413fcfbe1bdc3a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 4 May 2016 10:41:48 -0400 Subject: xprtrdma: Remove qplock Clean up. After "xprtrdma: Remove ro_unmap() from all registration modes", there are no longer any sites that take rpcrdma_ia::qplock for read. The one site that takes it for write is always single-threaded. It is safe to remove it.
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 3 --- net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 2 files changed, 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index be66f658308c..b044d98a1370 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -433,7 +433,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) dprintk("RPC: %s: memory registration strategy is '%s'\n", __func__, ia->ri_ops->ro_displayname); - rwlock_init(&ia->ri_qplock); return 0; out3: @@ -672,10 +671,8 @@ retry: goto out; } - write_lock(&ia->ri_qplock); old = ia->ri_id; ia->ri_id = id; - write_unlock(&ia->ri_qplock); rdma_destroy_qp(old); rpcrdma_destroy_id(old); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 512bbdcc7d76..95cdc66225ee 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -65,7 +65,6 @@ */ struct rpcrdma_ia { const struct rpcrdma_memreg_ops *ri_ops; - rwlock_t ri_qplock; struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; -- cgit v1.2.3 From 9a8f6b5ea275ff01fc8ef3b8630a3d4ed6b0a362 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 16 May 2016 17:42:42 -0400 Subject: SUNRPC: Ensure get_rpccred() and put_rpccred() can take NULL arguments Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/auth.h | 3 ++- net/sunrpc/auth.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 6f36b2bf3e05..899791573a40 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -202,7 +202,8 @@ char * rpcauth_stringify_acceptor(struct rpc_cred *); static inline struct rpc_cred * get_rpccred(struct rpc_cred *cred) { - atomic_inc(&cred->cr_count); + if (cred != NULL) + atomic_inc(&cred->cr_count); return cred; } diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index e0bb30fd2ed3..040ff627c18a 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -703,8 +703,7 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) new = rpcauth_bind_new_cred(task, lookupflags); if (IS_ERR(new)) return PTR_ERR(new); - if (req->rq_cred != NULL) - put_rpccred(req->rq_cred); + put_rpccred(req->rq_cred); req->rq_cred = new; return 0; } @@ -712,6 +711,8 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) void put_rpccred(struct rpc_cred *cred) { + if (cred == NULL) + return; /* Fast path for unhashed credentials */ if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) { if (atomic_dec_and_test(&cred->cr_count)) -- cgit v1.2.3
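With both helpers now tolerating NULL, callers that juggle credential pointers no longer need to guard either side of a swap. A minimal sketch (hypothetical caller, not part of the series):

/* Hypothetical caller: replace a credential reference without
 * NULL checks on either the old or the new pointer.
 */
static void example_replace_cred(struct rpc_cred **credp, struct rpc_cred *new)
{
	struct rpc_cred *old = *credp;

	*credp = get_rpccred(new);	/* returns NULL if new is NULL */
	put_rpccred(old);		/* returns early if old is NULL */
}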