From d737e5d418706abf32f6de68c3e09958516d422f Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Tue, 9 Feb 2021 16:04:15 -0500
Subject: SUNRPC: Set TCP_CORK until the transmit queue is empty

When we have multiple RPC requests queued up, it makes sense to set the
TCP_CORK option while the transmit queue is non-empty.

Signed-off-by: Trond Myklebust
---
 net/sunrpc/xprt.c     | 2 ++
 net/sunrpc/xprtsock.c | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 691ccf8049a4..a853f75d4968 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1352,6 +1352,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
 	list_add_tail(&req->rq_xmit, &xprt->xmit_queue);
 	INIT_LIST_HEAD(&req->rq_xmit2);
 out:
+	atomic_long_inc(&xprt->xmit_queuelen);
 	set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
 	spin_unlock(&xprt->queue_lock);
 }
@@ -1381,6 +1382,7 @@ xprt_request_dequeue_transmit_locked(struct rpc_task *task)
 		}
 	} else
 		list_del(&req->rq_xmit2);
+	atomic_long_dec(&req->rq_xprt->xmit_queuelen);
 }

 /**
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e35760f238a4..a64f5ed1edb4 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1018,6 +1018,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 	 * to cope with writespace callbacks arriving _after_ we have
 	 * called sendmsg(). */
 	req->rq_xtime = ktime_get();
+	tcp_sock_set_cork(transport->inet, true);
 	while (1) {
 		status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
 					   transport->xmit.offset, rm, &sent);
@@ -1032,6 +1033,8 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 		if (likely(req->rq_bytes_sent >= msglen)) {
 			req->rq_xmit_bytes_sent += transport->xmit.offset;
 			transport->xmit.offset = 0;
+			if (atomic_long_read(&xprt->xmit_queuelen) == 1)
+				tcp_sock_set_cork(transport->inet, false);
 			return 0;
 		}
@@ -2163,6 +2166,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		}

 		xs_tcp_set_socket_timeouts(xprt, sock);
+		tcp_sock_set_nodelay(sk);

 		write_lock_bh(&sk->sk_callback_lock);
@@ -2177,7 +2181,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)

 		/* socket options */
 		sock_reset_flag(sk, SOCK_LINGER);
-		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;

 		xprt_clear_connected(xprt);
--
cgit v1.2.3

From 6b996476f364009e9be43e98f5bca11e5ec95b2d Mon Sep 17 00:00:00 2001
From: Eryu Guan
Date: Mon, 22 Mar 2021 13:29:04 +0800
Subject: sunrpc: honor rpc_task's timeout value in rpcb_create()

Currently the rpcbind client is created without setting an rpc timeout
(thus it uses the default value). But if the rpc_task already has a
customized timeout in its tk_client field, that value is ignored as
well. Use the same timeout setting, rpc_task->tk_client->cl_timeout,
for the rpcbind connection.
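For illustration, the timeout that now propagates is an ordinary
struct rpc_timeout. A minimal sketch, assuming a hypothetical mount
with timeo=600,retrans=3 (the struct and field names are real; the
values are invented):

	/* Hypothetical schedule the rpcbind query inherits after this
	 * patch; previously the rpcbind client always used its
	 * built-in default timeout.
	 */
	static const struct rpc_timeout example_timeo = {
		.to_initval = 60 * HZ,	/* timeo=600 is in tenths of a second */
		.to_maxval  = 60 * HZ,
		.to_retries = 3,
	};

rpcb_create() now carries this value through rpc_create_args.timeout,
so rpc_create() applies it when the rpcbind client is set up.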
Signed-off-by: Eryu Guan
Signed-off-by: Trond Myklebust
---
 net/sunrpc/rpcb_clnt.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 38fe2ce8a5aa..647b323cc1d5 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -344,13 +344,15 @@ static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename,
 				    const char *hostname,
 				    struct sockaddr *srvaddr, size_t salen,
 				    int proto, u32 version,
-				    const struct cred *cred)
+				    const struct cred *cred,
+				    const struct rpc_timeout *timeo)
 {
 	struct rpc_create_args args = {
 		.net		= net,
 		.protocol	= proto,
 		.address	= srvaddr,
 		.addrsize	= salen,
+		.timeout	= timeo,
 		.servername	= hostname,
 		.nodename	= nodename,
 		.program	= &rpcb_program,
@@ -705,7 +707,8 @@ void rpcb_getport_async(struct rpc_task *task)
 				clnt->cl_nodename,
 				xprt->servername, sap, salen,
 				xprt->prot, bind_version,
-				clnt->cl_cred);
+				clnt->cl_cred,
+				task->tk_client->cl_timeout);
 	if (IS_ERR(rpcb_clnt)) {
 		status = PTR_ERR(rpcb_clnt);
 		goto bailout_nofree;
--
cgit v1.2.3

From 98b5cee37389b899de044bc4aac56e6ff33dbd4d Mon Sep 17 00:00:00 2001
From: Benjamin Coddington
Date: Mon, 22 Mar 2021 14:37:01 -0400
Subject: SUNRPC: Ensure the transport backchannel association

If the server sends CB_ calls on a connection that is not associated
with the backchannel, refuse to process the call and shut down the
connection. This avoids a NULL dereference crash in
xprt_complete_bc_request(). There's not much more we can do in this
situation unless we want to look into allowing all connections to be
associated with the fore and back channel.

Signed-off-by: Benjamin Coddington
Signed-off-by: Trond Myklebust
---
 net/sunrpc/xprtsock.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index a64f5ed1edb4..47aa47a2b07c 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -558,6 +558,10 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
 	struct rpc_rqst *req;
 	ssize_t ret;

+	/* Is this transport associated with the backchannel? */
+	if (!xprt->bc_serv)
+		return -ESHUTDOWN;
+
 	/* Look up and lock the request corresponding to the given XID */
 	req = xprt_lookup_bc_request(xprt, transport->recv.xid);
 	if (!req) {
--
cgit v1.2.3

From 7638e0bfaed1b653d3ca663e560e9ffb44bb1030 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Wed, 31 Mar 2021 13:22:14 -0400
Subject: SUNRPC: Move fault injection call sites

I've hit some crashes that occur in the xprt_rdma_inject_disconnect
path. It appears that, for some providers, rdma_disconnect() can take
so long that the transport can disconnect and release its hardware
resources while rdma_disconnect() is still running, resulting in a UAF
in the provider. The transport's fault injection method may depend on
the stability of transport data structures. That means it needs to be
invoked only from contexts that hold the transport write lock.
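As a rough sketch of the invariant this establishes (the helper below
is illustrative, not part of the patch; xprt->snd_task is the field
that records the current write-lock holder):

	/* The injection hook may trigger rdma_disconnect() and touch
	 * hardware resources, so it must run only while the caller
	 * still owns the transport write lock and the transport's
	 * data structures cannot be torn down underneath it.
	 */
	static void fault_injection_point(struct rpc_xprt *xprt,
					  struct rpc_task *task)
	{
		WARN_ON_ONCE(xprt->snd_task != task); /* lock held? */
		xprt_inject_disconnect(xprt);
	}

Hence the moves below: out of call_allocate() and xprt_release(),
which run without the lock, and into xprt_end_transmit(), which still
holds it.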
Fixes: 4a0682583988 ("SUNRPC: Transport fault injection") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 1 - net/sunrpc/xprt.c | 6 ++++-- net/sunrpc/xprtrdma/transport.c | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 612f0a641f4c..c2a01125be1a 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1799,7 +1799,6 @@ call_allocate(struct rpc_task *task) status = xprt->ops->buf_alloc(task); trace_rpc_buf_alloc(task, status); - xprt_inject_disconnect(xprt); if (status == 0) return; if (status != -ENOMEM) { diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a853f75d4968..80c94ead4aa0 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1485,7 +1485,10 @@ bool xprt_prepare_transmit(struct rpc_task *task) void xprt_end_transmit(struct rpc_task *task) { - xprt_release_write(task->tk_rqstp->rq_xprt, task); + struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; + + xprt_inject_disconnect(xprt); + xprt_release_write(xprt, task); } /** @@ -1887,7 +1890,6 @@ void xprt_release(struct rpc_task *task) spin_unlock(&xprt->transport_lock); if (req->rq_buffer) xprt->ops->buf_free(task); - xprt_inject_disconnect(xprt); xdr_free_bvec(&req->rq_rcv_buf); xdr_free_bvec(&req->rq_snd_buf); if (req->rq_cred != NULL) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 78d29d1bcc20..09953597d055 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -262,8 +262,10 @@ xprt_rdma_connect_worker(struct work_struct *work) * xprt_rdma_inject_disconnect - inject a connection fault * @xprt: transport context * - * If @xprt is connected, disconnect it to simulate spurious connection - * loss. + * If @xprt is connected, disconnect it to simulate spurious + * connection loss. Caller must hold @xprt's send lock to + * ensure that data structures and hardware resources are + * stable during the rdma_disconnect() call. */ static void xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) -- cgit v1.2.3 From e936a5970ef596ff48fca72aa8200955753c543f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 31 Mar 2021 13:22:27 -0400 Subject: SUNRPC: Add tracepoint that fires when an RPC is retransmitted A separate tracepoint can be left enabled all the time to capture rare but important retransmission events. So for example: kworker/u26:3-568 [009] 156.967933: xprt_retransmit: task:44093@5 xid=0xa25dbc79 nfsv3 WRITE ntrans=2 Or, for example, enable all nfs and nfs4 tracepoints, and set up a trigger to disable tracing when xprt_retransmit fires to capture everything that leads up to it. 
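Concretely, with tracefs mounted at /sys/kernel/tracing, the workflow
described above might look like this (standard ftrace event and
trigger interface; adjust paths for your system):

	# keep the rare-event tracepoint armed permanently
	echo 1 > /sys/kernel/tracing/events/sunrpc/xprt_retransmit/enable

	# or: capture everything leading up to a retransmit
	echo 1 > /sys/kernel/tracing/events/nfs/enable
	echo 1 > /sys/kernel/tracing/events/nfs4/enable
	echo traceoff > /sys/kernel/tracing/events/sunrpc/xprt_retransmit/trigger

The traceoff trigger stops the ring buffer the moment a retransmit
fires, preserving the events that led up to it.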
Signed-off-by: Chuck Lever
Signed-off-by: Trond Myklebust
---
 include/trace/events/sunrpc.h | 40 ++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/xprt.c             |  4 +++-
 2 files changed, 43 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 036eb1f5c133..430b42f2351f 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1079,6 +1079,46 @@ TRACE_EVENT(xprt_transmit,
 		__entry->seqno, __entry->status)
 );

+TRACE_EVENT(xprt_retransmit,
+	TP_PROTO(
+		const struct rpc_rqst *rqst
+	),
+
+	TP_ARGS(rqst),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, xid)
+		__field(int, ntrans)
+		__field(int, version)
+		__string(progname,
+			 rqst->rq_task->tk_client->cl_program->name)
+		__string(procedure,
+			 rqst->rq_task->tk_msg.rpc_proc->p_name)
+	),
+
+	TP_fast_assign(
+		struct rpc_task *task = rqst->rq_task;
+
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client ?
+			task->tk_client->cl_clid : -1;
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+		__entry->ntrans = rqst->rq_ntrans;
+		__assign_str(progname,
+			     task->tk_client->cl_program->name);
+		__entry->version = task->tk_client->cl_vers;
+		__assign_str(procedure, task->tk_msg.rpc_proc->p_name);
+	),
+
+	TP_printk(
+		"task:%u@%u xid=0x%08x %sv%d %s ntrans=%d",
+		__entry->task_id, __entry->client_id, __entry->xid,
+		__get_str(progname), __entry->version, __get_str(procedure),
+		__entry->ntrans)
+);
+
 TRACE_EVENT(xprt_ping,
 	TP_PROTO(const struct rpc_xprt *xprt, int status),
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 80c94ead4aa0..67039ee443eb 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1542,8 +1542,10 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
 		return status;
 	}

-	if (is_retrans)
+	if (is_retrans) {
 		task->tk_client->cl_stats->rpcretrans++;
+		trace_xprt_retransmit(req);
+	}

 	xprt_inject_disconnect(xprt);
--
cgit v1.2.3

From 6cf23783f750634e10daeede48b0f5f5d64ebf3a Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Wed, 31 Mar 2021 16:03:08 -0400
Subject: SUNRPC: Remove trace_xprt_transmit_queued

This tracepoint can crash when dereferencing snd_task because when
some transports connect, they put a cookie in that field instead of a
pointer to an rpc_task.
BUG: KASAN: use-after-free in trace_event_raw_event_xprt_writelock_event+0x141/0x18e [sunrpc]
Read of size 2 at addr ffff8881a83bd3a0 by task git/331872

CPU: 11 PID: 331872 Comm: git Tainted: G S 5.12.0-rc2-00007-g3ab6e585a7f9 #1453
Hardware name: Supermicro SYS-6028R-T/X10DRi, BIOS 1.1a 10/16/2015
Call Trace:
 dump_stack+0x9c/0xcf
 print_address_description.constprop.0+0x18/0x239
 kasan_report+0x174/0x1b0
 trace_event_raw_event_xprt_writelock_event+0x141/0x18e [sunrpc]
 xprt_prepare_transmit+0x8e/0xc1 [sunrpc]
 call_transmit+0x4d/0xc6 [sunrpc]

Fixes: 9ce07ae5eb1d ("SUNRPC: Replace dprintk() call site in xprt_prepare_transmit")
Signed-off-by: Chuck Lever
Signed-off-by: Trond Myklebust
---
 include/trace/events/sunrpc.h | 1 -
 net/sunrpc/xprt.c             | 2 --
 2 files changed, 3 deletions(-)

(limited to 'net')

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 430b42f2351f..3c3c4c843c62 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1181,7 +1181,6 @@ DECLARE_EVENT_CLASS(xprt_writelock_event,

 DEFINE_WRITELOCK_EVENT(reserve_xprt);
 DEFINE_WRITELOCK_EVENT(release_xprt);
-DEFINE_WRITELOCK_EVENT(transmit_queued);

 DECLARE_EVENT_CLASS(xprt_cong_event,
 	TP_PROTO(
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 67039ee443eb..baba7d14825a 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1471,8 +1471,6 @@ bool xprt_prepare_transmit(struct rpc_task *task)
 	struct rpc_xprt *xprt = req->rq_xprt;

 	if (!xprt_lock_write(xprt, task)) {
-		trace_xprt_transmit_queued(xprt, task);
-
 		/* Race breaker: someone may have transmitted us */
 		if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
 			rpc_wake_up_queued_task_set_status(&xprt->sending,
--
cgit v1.2.3

From 09252177d5f924f404551b4b4eded5daa7f04a3a Mon Sep 17 00:00:00 2001
From: Chris Dion
Date: Sun, 4 Apr 2021 21:29:26 -0400
Subject: SUNRPC: Handle major timeout in xprt_adjust_timeout()

Currently if a major timeout value is reached, but the minor value has
not been reached, an ETIMEDOUT will not be sent back to the caller.
This can occur if the v4 server is not responding to requests and
retrans is configured larger than the default of two.

For example, a TCP mount with a configured timeout value of 50 and a
retransmission count of 3 to a v4 server which is not responding:

1. Initial value and increment set to 5s, maxval set to 20s, retries at 3
2. Major timeout is set to 20s, minor timeout set to 5s initially
3. xprt_adjust_timeout() is called after 5s, retry with 10s timeout,
   minor timeout is bumped to 10s
4. And again after another 10s, 15s total time with minor timeout set
   to 15s
5. After 20s total time xprt_adjust_timeout() is called as major timeout
   is reached, but skipped because the minor timeout is not reached
   - After this time the CPU spins continually calling
     xprt_adjust_timeout() and returning 0 for 10 seconds.
     As seen on perf sched:
     39243.913182 [0005] mount.nfs[3794] 4607.938 0.017 9746.863
6. This continues until the 15s minor timeout condition is reached (in
   this case for 10 seconds).
After which the ETIMEDOUT is processed back to the caller, the CPU
spinning stops, and normal operations continue.

Fixes: 7de62bc09fe6 ("SUNRPC dont update timeout value on connection reset")
Signed-off-by: Chris Dion
Signed-off-by: Trond Myklebust
---
 net/sunrpc/xprt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index baba7d14825a..e5b5a960a69b 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -698,9 +698,9 @@ int xprt_adjust_timeout(struct rpc_rqst *req)
 	const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout;
 	int status = 0;

-	if (time_before(jiffies, req->rq_minortimeo))
-		return status;
 	if (time_before(jiffies, req->rq_majortimeo)) {
+		if (time_before(jiffies, req->rq_minortimeo))
+			return status;
 		if (to->to_exponential)
 			req->rq_timeout <<= 1;
 		else
--
cgit v1.2.3

From 32e6b68167f1d446111c973d57e6f52aee11897a Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Mon, 19 Apr 2021 14:02:03 -0400
Subject: xprtrdma: Avoid Receive Queue wrapping

Commit e340c2d6ef2a ("xprtrdma: Reduce the doorbell rate (Receive)")
increased the number of Receive WRs that are posted by the client, but
did not increase the size of the Receive Queue allocated during
transport set-up.

This is usually not an issue because RPCRDMA_BACKWARD_WRS is defined
as (32) when SUNRPC_BACKCHANNEL is defined. In cases where it isn't,
there is a real risk of Receive Queue wrapping.

Fixes: e340c2d6ef2a ("xprtrdma: Reduce the doorbell rate (Receive)")
Signed-off-by: Chuck Lever
Reviewed-by: Tom Talpey
Signed-off-by: Trond Myklebust
---
 net/sunrpc/xprtrdma/frwr_ops.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 766a1048a48a..132df9b59ab4 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -257,6 +257,7 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
 	ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
 	ep->re_attr.cap.max_recv_wr = ep->re_max_requests;
 	ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+	ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH;
 	ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */

 	ep->re_max_rdma_segs =
--
cgit v1.2.3

From 15788d1d1077ebe029c48842c738876516d85076 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Mon, 19 Apr 2021 14:02:09 -0400
Subject: xprtrdma: Do not refresh Receive Queue while it is draining

Currently the Receive completion handler refreshes the Receive Queue
whenever a successful Receive completion occurs.

On disconnect, xprtrdma drains the Receive Queue. The first few
Receive completions after a disconnect are typically successful, until
the first flushed Receive.

This means the Receive completion handler continues to post more
Receive WRs after the drain sentinel has been posted. The late-posted
Receives flush after the drain sentinel has completed, leading to a
crash later in rpcrdma_xprt_disconnect().

To prevent this crash, xprtrdma has to ensure that the Receive handler
stops posting Receives before ib_drain_rq() posts its drain sentinel.
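The patch below gates posting with an atomic counter plus a
completion. A minimal userspace model of that handshake (assuming C11
atomics and pthreads; names are illustrative, not the kernel API):

	#include <stdatomic.h>
	#include <pthread.h>

	static atomic_int gate;		/* models ep->re_receiving */
	static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
	static int poster_left;

	static void post_recvs(void)	/* Receive completion path */
	{
		if (atomic_fetch_add(&gate, 1) + 1 > 1)
			return;		/* drain in progress: do not post */
		/* ... build and post Receive WRs here ... */
		if (atomic_fetch_sub(&gate, 1) - 1 > 0) {
			pthread_mutex_lock(&lk);
			poster_left = 1;	/* a drainer is waiting on us */
			pthread_cond_signal(&cv);
			pthread_mutex_unlock(&lk);
		}
	}

	static void drain(void)		/* disconnect path */
	{
		if (atomic_fetch_add(&gate, 1) + 1 > 1) {
			pthread_mutex_lock(&lk);
			while (!poster_left)	/* wait for the poster to leave */
				pthread_cond_wait(&cv, &lk);
			pthread_mutex_unlock(&lk);
		}
		/* no poster can slip in now; post the drain sentinel */
	}

Because drain() bumps the counter and never decrements it, every
later post_recvs() call sees a value greater than one and backs off.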
Suggested-by: Tom Talpey Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 13 +++++++++++++ net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 14 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index ec912cf9c618..d8ed69442219 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -101,6 +101,12 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ep *ep = r_xprt->rx_ep; struct rdma_cm_id *id = ep->re_id; + /* Wait for rpcrdma_post_recvs() to leave its critical + * section. + */ + if (atomic_inc_return(&ep->re_receiving) > 1) + wait_for_completion(&ep->re_done); + /* Flush Receives, then wait for deferred Reply work * to complete. */ @@ -414,6 +420,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) __module_get(THIS_MODULE); device = id->device; ep->re_id = id; + reinit_completion(&ep->re_done); ep->re_max_requests = r_xprt->rx_xprt.max_reqs; ep->re_inline_send = xprt_rdma_max_inline_write; @@ -1385,6 +1392,9 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!temp) needed += RPCRDMA_MAX_RECV_BATCH; + if (atomic_inc_return(&ep->re_receiving) > 1) + goto out; + /* fast path: all needed reps can be found on the free list */ wr = NULL; while (needed) { @@ -1410,6 +1420,9 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) rc = ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); + if (atomic_dec_return(&ep->re_receiving) > 0) + complete(&ep->re_done); + out: trace_xprtrdma_post_recvs(r_xprt, count, rc); if (rc) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index fe3be985e239..31404326f29f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -83,6 +83,7 @@ struct rpcrdma_ep { unsigned int re_max_inline_recv; int re_async_rc; int re_connect_status; + atomic_t re_receiving; atomic_t re_force_disconnect; struct ib_qp_init_attr re_attr; wait_queue_head_t re_connect_wait; -- cgit v1.2.3 From 5030c9a938f875f31932928632e1597f03e79ace Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:16 -0400 Subject: xprtrdma: Put flushed Receives on free list instead of destroying them Defer destruction of an rpcrdma_rep until transport tear-down to preserve the rb_all_reps list while Receives flush. 
Signed-off-by: Chuck Lever Reviewed-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index d8ed69442219..1b599a623eea 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -80,6 +80,8 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, struct rpcrdma_sendctx *sc); static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt); static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, + struct rpcrdma_rep *rep); static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); @@ -211,7 +213,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) out_flushed: rpcrdma_flush_disconnect(r_xprt, wc); - rpcrdma_rep_destroy(rep); + rpcrdma_rep_put(&r_xprt->rx_buf, rep); } static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep, -- cgit v1.2.3 From eaf86e8cc85c4abf3e4a2a0d3f59af613d2bacab Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 24 Apr 2021 15:02:28 -0400 Subject: xprtrdma: Improve locking around rpcrdma_rep destruction Currently rpcrdma_reps_destroy() assumes that, at transport tear-down, the content of the rb_free_reps list is the same as the content of the rb_all_reps list. Although that is usually true, using the rb_all_reps list should be more reliable because of the way it's managed. And, rpcrdma_reps_unmap() uses rb_all_reps; these two functions should both traverse the "all" list. Ensure that all rpcrdma_reps are always destroyed whether they are on the rep free list or not. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1b599a623eea..2bc4589c37ce 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1007,16 +1007,23 @@ out: return NULL; } -/* No locking needed here. This function is invoked only by the - * Receive completion handler, or during transport shutdown. 
- */ -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) +static void rpcrdma_rep_free(struct rpcrdma_rep *rep) { - list_del(&rep->rr_all); rpcrdma_regbuf_free(rep->rr_rdmabuf); kfree(rep); } +static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) +{ + struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf; + + spin_lock(&buf->rb_lock); + list_del(&rep->rr_all); + spin_unlock(&buf->rb_lock); + + rpcrdma_rep_free(rep); +} + static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) { struct llist_node *node; @@ -1049,8 +1056,18 @@ static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_rep *rep; - while ((rep = rpcrdma_rep_get_locked(buf)) != NULL) - rpcrdma_rep_destroy(rep); + spin_lock(&buf->rb_lock); + while ((rep = list_first_entry_or_null(&buf->rb_all_reps, + struct rpcrdma_rep, + rr_all)) != NULL) { + list_del(&rep->rr_all); + spin_unlock(&buf->rb_lock); + + rpcrdma_rep_free(rep); + + spin_lock(&buf->rb_lock); + } + spin_unlock(&buf->rb_lock); } /** -- cgit v1.2.3 From 8b5292be6880025cb3789cc811d19b4b8f0bf786 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:28 -0400 Subject: xprtrdma: Improve commentary around rpcrdma_reps_unmap() Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 2bc4589c37ce..baf4b8c99d4c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1041,6 +1041,10 @@ static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, llist_add(&rep->rr_node, &buf->rb_free_reps); } +/* Caller must ensure the QP is quiescent (RQ is drained) before + * invoking this function, to guarantee rb_all_reps is not + * changing. + */ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; @@ -1048,7 +1052,7 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) list_for_each_entry(rep, &buf->rb_all_reps, rr_all) { rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); - rep->rr_temp = true; + rep->rr_temp = true; /* Mark this rep for destruction */ } } -- cgit v1.2.3 From 9e3ca33b62d4878f6ae39776abb6deebb37db597 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:35 -0400 Subject: xprtrdma: Improve locking around rpcrdma_rep creation Defensive clean up: Protect the rb_all_reps list during rep creation. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index baf4b8c99d4c..95ce9328e5d2 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -963,13 +963,11 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt) rpcrdma_req_reset(req); } -/* No locking needed here. This function is called only by the - * Receive completion handler. 
- */ static noinline struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp) { + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; rep = kzalloc(sizeof(*rep), GFP_KERNEL); @@ -996,7 +994,10 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; rep->rr_recv_wr.num_sge = 1; rep->rr_temp = temp; - list_add(&rep->rr_all, &r_xprt->rx_buf.rb_all_reps); + + spin_lock(&buf->rb_lock); + list_add(&rep->rr_all, &buf->rb_all_reps); + spin_unlock(&buf->rb_lock); return rep; out_free_regbuf: -- cgit v1.2.3 From 35d8b10a25884050bb3b0149b62c3818ec59f77c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:41 -0400 Subject: xprtrdma: Fix cwnd update ordering After a reconnect, the reply handler is opening the cwnd (and thus enabling more RPC Calls to be sent) /before/ rpcrdma_post_recvs() can post enough Receive WRs to receive their replies. This causes an RNR and the new connection is lost immediately. The race is most clearly exposed when KASAN and disconnect injection are enabled. This slows down rpcrdma_rep_create() enough to allow the send side to post a bunch of RPC Calls before the Receive completion handler can invoke ib_post_recv(). Fixes: 2ae50ad68cd7 ("xprtrdma: Close window between waking RPC senders and posting Receives") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/rpc_rdma.c | 3 ++- net/sunrpc/xprtrdma/verbs.c | 10 +++++----- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 292f066d006e..21ddd78a8c35 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1430,9 +1430,10 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_ep->re_max_requests) credits = r_xprt->rx_ep->re_max_requests; + rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1), + false); if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); - rpcrdma_post_recvs(r_xprt, false); req = rpcr_to_rdmar(rqst); if (unlikely(req->rl_reply)) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 95ce9328e5d2..de96fcede7d4 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -544,7 +544,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) * outstanding Receives. 
*/ rpcrdma_ep_get(ep); - rpcrdma_post_recvs(r_xprt, true); + rpcrdma_post_recvs(r_xprt, 1, true); rc = rdma_connect(ep->re_id, &ep->re_remote_cma); if (rc) @@ -1395,21 +1395,21 @@ int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /** * rpcrdma_post_recvs - Refill the Receive Queue * @r_xprt: controlling transport instance - * @temp: mark Receive buffers to be deleted after use + * @needed: current credit grant + * @temp: mark Receive buffers to be deleted after one use * */ -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_recv_wr *wr, *bad_wr; struct rpcrdma_rep *rep; - int needed, count, rc; + int count, rc; rc = 0; count = 0; - needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); if (likely(ep->re_receive_count > needed)) goto out; needed -= ep->re_receive_count; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 31404326f29f..2504f67af63e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -462,7 +462,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp); /* * Buffer calls - xprtrdma/verbs.c -- cgit v1.2.3 From c35ca60d490e32b7e7d21f344693ea29d4f4a9d3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:47 -0400 Subject: xprtrdma: Delete rpcrdma_recv_buffer_put() Clean up: The name recv_buffer_put() is a vestige of older code, and the function is just a wrapper for the newer rpcrdma_rep_put(). In most of the existing call sites, a pointer to the owning rpcrdma_buffer is already available. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/backchannel.c | 4 +++- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++-- net/sunrpc/xprtrdma/verbs.c | 24 ++++++++---------------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 4 files changed, 14 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index a249837d6a55..1151efd09b27 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -155,9 +155,11 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) { struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_rep *rep = req->rl_reply; struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - rpcrdma_recv_buffer_put(req->rl_reply); + rpcrdma_rep_put(&r_xprt->rx_buf, rep); req->rl_reply = NULL; spin_lock(&xprt->bc_pa_lock); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 21ddd78a8c35..be4e888e78a3 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1437,7 +1437,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) req = rpcr_to_rdmar(rqst); if (unlikely(req->rl_reply)) - rpcrdma_recv_buffer_put(req->rl_reply); + rpcrdma_rep_put(buf, req->rl_reply); req->rl_reply = rep; rep->rr_rqst = rqst; @@ -1465,5 +1465,5 @@ out_shortreply: trace_xprtrdma_reply_short_err(rep); out: - rpcrdma_recv_buffer_put(rep); + rpcrdma_rep_put(buf, rep); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index de96fcede7d4..5002d86a55ee 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -80,8 +80,6 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, struct rpcrdma_sendctx *sc); static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt); static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); -static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, - struct rpcrdma_rep *rep); static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); @@ -1036,8 +1034,13 @@ static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) return llist_entry(node, struct rpcrdma_rep, rr_node); } -static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, - struct rpcrdma_rep *rep) +/** + * rpcrdma_rep_put - Release rpcrdma_rep back to free list + * @buf: buffer pool + * @rep: rep to release + * + */ +void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep) { llist_add(&rep->rr_node, &buf->rb_free_reps); } @@ -1252,17 +1255,6 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) spin_unlock(&buffers->rb_lock); } -/** - * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list - * @rep: rep to release - * - * Used after error conditions. - */ -void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) -{ - rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep); -} - /* Returns a pointer to a rpcrdma_regbuf object, or NULL. 
* * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for @@ -1455,7 +1447,7 @@ out: rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr); wr = wr->next; - rpcrdma_recv_buffer_put(rep); + rpcrdma_rep_put(buf, rep); --count; } } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2504f67af63e..9c5039d4634a 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -481,7 +481,7 @@ void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req); -void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); +void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep); bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags); -- cgit v1.2.3 From 1363e6388c363d0433f9aa4e2f33efe047572687 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:54 -0400 Subject: xprtrdma: rpcrdma_mr_pop() already does list_del_init() The rpcrdma_mr_pop() earlier in the function has already cleared out mr_list, so it must not be done again in the error path. Fixes: 847568942f93 ("xprtrdma: Remove fr_state") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 132df9b59ab4..aca2228095db 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -576,7 +576,6 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) mr = container_of(frwr, struct rpcrdma_mr, frwr); bad_wr = bad_wr->next; - list_del_init(&mr->mr_list); frwr_mr_recycle(mr); } } -- cgit v1.2.3 From f912af77e2c1ba25bd40534668b10da5b20f686a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:00 -0400 Subject: xprtrdma: Rename frwr_release_mr() Clean up: To be consistent with other functions in this source file, follow the naming convention of putting the object being acted upon before the action itself. 
Signed-off-by: Chuck Lever
Signed-off-by: Trond Myklebust
---
 net/sunrpc/xprtrdma/frwr_ops.c  | 6 +++---
 net/sunrpc/xprtrdma/verbs.c     | 4 ++--
 net/sunrpc/xprtrdma/xprt_rdma.h | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index aca2228095db..bfc057fdb75f 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -50,11 +50,11 @@
 #endif

 /**
- * frwr_release_mr - Destroy one MR
+ * frwr_mr_release - Destroy one MR
  * @mr: MR allocated by frwr_mr_init
  *
  */
-void frwr_release_mr(struct rpcrdma_mr *mr)
+void frwr_mr_release(struct rpcrdma_mr *mr)
 {
 	int rc;

@@ -88,7 +88,7 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr)
 	r_xprt->rx_stats.mrs_recycled++;
 	spin_unlock(&r_xprt->rx_buf.rb_lock);

-	frwr_release_mr(mr);
+	frwr_mr_release(mr);
 }

 static void frwr_mr_put(struct rpcrdma_mr *mr)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 5002d86a55ee..f97b03129d99 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1138,7 +1138,7 @@ void rpcrdma_req_destroy(struct rpcrdma_req *req)
 		list_del(&mr->mr_all);
 		spin_unlock(&buf->rb_lock);

-		frwr_release_mr(mr);
+		frwr_mr_release(mr);
 	}

 	rpcrdma_regbuf_free(req->rl_recvbuf);
@@ -1169,7 +1169,7 @@ static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt)
 		list_del(&mr->mr_all);
 		spin_unlock(&buf->rb_lock);

-		frwr_release_mr(mr);
+		frwr_mr_release(mr);

 		spin_lock(&buf->rb_lock);
 	}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 9c5039d4634a..1b187d1dee8a 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -528,7 +528,7 @@ rpcrdma_data_dir(bool writing)
 void frwr_reset(struct rpcrdma_req *req);
 int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device);
 int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr);
-void frwr_release_mr(struct rpcrdma_mr *mr);
+void frwr_mr_release(struct rpcrdma_mr *mr);
 struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 				struct rpcrdma_mr_seg *seg,
 				int nsegs, bool writing, __be32 xid,
--
cgit v1.2.3

From 44438ad9ae22277a261f9fa4fdc6387a8ff50f2e Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Mon, 19 Apr 2021 14:03:06 -0400
Subject: xprtrdma: Clarify use of barrier in frwr_wc_localinv_done()

Clean up: The comment and the placement of the memory barrier are
confusing. Humans want to read the function statements from head to
tail.
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index bfc057fdb75f..af85cec0ce31 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -592,14 +592,16 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); - struct rpcrdma_rep *rep = mr->mr_req->rl_reply; + struct rpcrdma_rep *rep; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid); - frwr_mr_done(wc, mr); - /* Ensure @rep is generated before frwr_mr_done */ + /* Ensure that @rep is generated before the MR is released */ + rep = mr->mr_req->rl_reply; smp_rmb(); + + frwr_mr_done(wc, mr); rpcrdma_complete_rqst(rep); rpcrdma_flush_disconnect(cq->cq_context, wc); -- cgit v1.2.3 From e4b52ca01315ad53df41877708428c1c41c1444d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:12 -0400 Subject: xprtrdma: Do not recycle MR after FastReg/LocalInv flushes Better not to touch MRs involved in a flush or post error until the Send and Receive Queues are drained and the transport is fully quiescent. Simply don't insert such MRs back onto the free list. They remain on mr_all and will be released when the connection is torn down. I had thought that recycling would prevent hardware resources from being tied up for a long time. However, since v5.7, a transport disconnect destroys the QP and other hardware-owned resources. The MRs get cleaned up nicely at that point. 
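In outline, the completion-path policy reduces to this (annotated
sketch of frwr_mr_done() as modified below):

	static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr)
	{
		/* Success: the MR is quiescent and can be reused. */
		if (likely(wc->status == IB_WC_SUCCESS))
			frwr_mr_put(mr);
		/* Flush or error: do nothing here. The MR stays on the
		 * mr_all list and is released by rpcrdma_mrs_destroy()
		 * once the QP has been drained and destroyed.
		 */
	}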
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 1 - net/sunrpc/xprtrdma/frwr_ops.c | 69 +++++++++++------------------------------- 2 files changed, 17 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index c838e7ac1c2d..e38e745d13b0 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1014,7 +1014,6 @@ DEFINE_MR_EVENT(localinv); DEFINE_MR_EVENT(map); DEFINE_ANON_MR_EVENT(unmap); -DEFINE_ANON_MR_EVENT(recycle); TRACE_EVENT(xprtrdma_dma_maperr, TP_PROTO( diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index af85cec0ce31..27087dc8ba3c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -49,6 +49,16 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) +{ + if (mr->mr_device) { + trace_xprtrdma_mr_unmap(mr); + ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents, + mr->mr_dir); + mr->mr_device = NULL; + } +} + /** * frwr_mr_release - Destroy one MR * @mr: MR allocated by frwr_mr_init @@ -58,6 +68,8 @@ void frwr_mr_release(struct rpcrdma_mr *mr) { int rc; + frwr_mr_unmap(mr->mr_xprt, mr); + rc = ib_dereg_mr(mr->frwr.fr_mr); if (rc) trace_xprtrdma_frwr_dereg(mr, rc); @@ -65,32 +77,6 @@ void frwr_mr_release(struct rpcrdma_mr *mr) kfree(mr); } -static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) -{ - if (mr->mr_device) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents, - mr->mr_dir); - mr->mr_device = NULL; - } -} - -static void frwr_mr_recycle(struct rpcrdma_mr *mr) -{ - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - - trace_xprtrdma_mr_recycle(mr); - - frwr_mr_unmap(r_xprt, mr); - - spin_lock(&r_xprt->rx_buf.rb_lock); - list_del(&mr->mr_all); - r_xprt->rx_stats.mrs_recycled++; - spin_unlock(&r_xprt->rx_buf.rb_lock); - - frwr_mr_release(mr); -} - static void frwr_mr_put(struct rpcrdma_mr *mr) { frwr_mr_unmap(mr->mr_xprt, mr); @@ -365,6 +351,7 @@ out_mapmr_err: * @cq: completion queue * @wc: WCE for a completed FastReg WR * + * Each flushed MR gets destroyed after the QP has drained. */ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) { @@ -374,7 +361,6 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid); - /* The MR will get recycled when the associated req is retransmitted */ rpcrdma_flush_disconnect(cq->cq_context, wc); } @@ -448,9 +434,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr) { - if (wc->status != IB_WC_SUCCESS) - frwr_mr_recycle(mr); - else + if (likely(wc->status == IB_WC_SUCCESS)) frwr_mr_put(mr); } @@ -567,17 +551,8 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) if (!rc) return; - /* Recycle MRs in the LOCAL_INV chain that did not get posted. - */ + /* On error, the MRs get destroyed once the QP has drained. 
*/ trace_xprtrdma_post_linv_err(req, rc); - while (bad_wr) { - frwr = container_of(bad_wr, struct rpcrdma_frwr, - fr_invwr); - mr = container_of(frwr, struct rpcrdma_mr, frwr); - bad_wr = bad_wr->next; - - frwr_mr_recycle(mr); - } } /** @@ -621,7 +596,6 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, *last, **prev; struct rpcrdma_ep *ep = r_xprt->rx_ep; - const struct ib_send_wr *bad_wr; struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; int rc; @@ -663,21 +637,12 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * replaces the QP. The RPC reply handler won't call us * unless re_id->qp is a valid pointer. */ - bad_wr = NULL; - rc = ib_post_send(ep->re_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, NULL); if (!rc) return; - /* Recycle MRs in the LOCAL_INV chain that did not get posted. - */ + /* On error, the MRs get destroyed once the QP has drained. */ trace_xprtrdma_post_linv_err(req, rc); - while (bad_wr) { - frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); - mr = container_of(frwr, struct rpcrdma_mr, frwr); - bad_wr = bad_wr->next; - - frwr_mr_recycle(mr); - } /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will -- cgit v1.2.3 From 8a053433de00380a9c5758d94c7c2ec2e25321fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:19 -0400 Subject: xprtrdma: Do not wake RPC consumer on a failed LocalInv Throw away any reply where the LocalInv flushes or could not be posted. The registered memory region is in an unknown state until the disconnect completes. rpcrdma_xprt_disconnect() will find and release the MR. No need to put it back on the MR free list in this case. The client retransmits pending RPC requests once it reestablishes a fresh connection, so a replacement reply should be forthcoming on the next connection instance. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 17 +++++++++++------ net/sunrpc/xprtrdma/rpc_rdma.c | 32 +++++++++++++++++++++++++++++--- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 3 files changed, 41 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 27087dc8ba3c..951ae20485f3 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -576,10 +576,14 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) rep = mr->mr_req->rl_reply; smp_rmb(); - frwr_mr_done(wc, mr); + if (wc->status != IB_WC_SUCCESS) { + if (rep) + rpcrdma_unpin_rqst(rep); + rpcrdma_flush_disconnect(cq->cq_context, wc); + return; + } + frwr_mr_put(mr); rpcrdma_complete_rqst(rep); - - rpcrdma_flush_disconnect(cq->cq_context, wc); } /** @@ -645,8 +649,9 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) trace_xprtrdma_post_linv_err(req, rc); /* The final LOCAL_INV WR in the chain is supposed to - * do the wake. If it was never posted, the wake will - * not happen, so wake here in that case. + * do the wake. If it was never posted, the wake does + * not happen. Unpin the rqst in preparation for its + * retransmission. 
 	 */
-	rpcrdma_complete_rqst(req->rl_reply);
+	rpcrdma_unpin_rqst(req->rl_reply);
 }
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index be4e888e78a3..649f7d8b9733 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1326,9 +1326,35 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
 	return -EIO;
 }

-/* Perform XID lookup, reconstruction of the RPC reply, and
- * RPC completion while holding the transport lock to ensure
- * the rep, rqst, and rq_task pointers remain stable.
+/**
+ * rpcrdma_unpin_rqst - Release rqst without completing it
+ * @rep: RPC/RDMA Receive context
+ *
+ * This is done when a connection is lost so that a Reply
+ * can be dropped and its matching Call can be subsequently
+ * retransmitted on a new connection.
+ */
+void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep)
+{
+	struct rpc_xprt *xprt = &rep->rr_rxprt->rx_xprt;
+	struct rpc_rqst *rqst = rep->rr_rqst;
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+
+	req->rl_reply = NULL;
+	rep->rr_rqst = NULL;
+
+	spin_lock(&xprt->queue_lock);
+	xprt_unpin_rqst(rqst);
+	spin_unlock(&xprt->queue_lock);
+}
+
+/**
+ * rpcrdma_complete_rqst - Pass completed rqst back to RPC
+ * @rep: RPC/RDMA Receive context
+ *
+ * Reconstruct the RPC reply and complete the transaction
+ * while @rqst is still pinned to ensure the rep, rqst, and
+ * rq_task pointers remain stable.
  */
 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
 {
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 1b187d1dee8a..bb8aba390b88 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -561,6 +561,7 @@ int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
 void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep);
 void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt);
 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
+void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep);
 void rpcrdma_reply_handler(struct rpcrdma_rep *rep);

 static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
--
cgit v1.2.3

From b3ce7a25f44f03d481d12a17768cfce18b942ec2 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Mon, 19 Apr 2021 14:03:25 -0400
Subject: xprtrdma: Avoid Send Queue wrapping

Send WRs can be signalled or unsignalled. A signalled Send WR always
has a matching Send completion, while an unsignalled Send has a
completion only if the Send WR fails.

xprtrdma has a Send accounting mechanism that is designed to reduce
the number of signalled Send WRs. This in turn mitigates the interrupt
rate of the underlying device.

RDMA consumers can't leave all Sends unsignalled, however, because
providers rely on Send completions to maintain their Send Queue head
and tail pointers. xprtrdma counts the number of unsignalled Send WRs
that have been posted to ensure that Sends are signalled often enough
to prevent the Send Queue from wrapping.

This mechanism neglected to account for FastReg WRs, which are posted
on the Send Queue but never signalled. As a result, the Send Queue
wrapped on occasion, resulting in duplicate completions of FastReg and
LocalInv WRs.
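To make the accounting concrete, a small userspace model of the
budget check (an assumption-laden sketch: it mirrors the logic added
to frwr_send() below, minus the "reply still holds a reference"
condition, and the names are invented):

	#include <stdbool.h>

	struct ep_model {
		unsigned int send_batch;	/* models ep->re_send_batch */
		unsigned int send_count;	/* unsignalled budget left */
	};

	/* num_wrs counts the Send WR itself plus every FastReg WR
	 * chained in front of it -- the WRs the old code failed to
	 * charge against the budget.
	 */
	static bool send_would_signal(struct ep_model *ep, unsigned int num_wrs)
	{
		if (num_wrs > ep->send_count) {
			/* Budget exhausted: signal this Send, then
			 * reset the budget the way the patch does.
			 */
			unsigned int rest = num_wrs - ep->send_count;

			ep->send_count = rest < ep->send_batch ?
					 rest : ep->send_batch;
			return true;
		}
		ep->send_count -= num_wrs;	/* all left unsignalled */
		return false;
	}

Because every WR consumed from the Send Queue is now charged against
the budget, a signalled completion occurs at least roughly once every
re_send_batch WRs, and the queue can no longer wrap.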
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 17 +++++++++++++++-- net/sunrpc/xprtrdma/verbs.c | 16 +--------------- 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 951ae20485f3..43a412ea337a 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -390,11 +390,13 @@ static void frwr_cid_init(struct rpcrdma_ep *ep, */ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { + struct ib_send_wr *post_wr, *send_wr = &req->rl_wr; struct rpcrdma_ep *ep = r_xprt->rx_ep; - struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; + unsigned int num_wrs; - post_wr = &req->rl_wr; + num_wrs = 1; + post_wr = send_wr; list_for_each_entry(mr, &req->rl_registered, mr_list) { struct rpcrdma_frwr *frwr; @@ -409,8 +411,19 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr->fr_regwr.wr.send_flags = 0; post_wr = &frwr->fr_regwr.wr; + ++num_wrs; } + if ((kref_read(&req->rl_kref) > 1) || num_wrs > ep->re_send_count) { + send_wr->send_flags |= IB_SEND_SIGNALED; + ep->re_send_count = min_t(unsigned int, ep->re_send_batch, + num_wrs - ep->re_send_count); + } else { + send_wr->send_flags &= ~IB_SEND_SIGNALED; + ep->re_send_count -= num_wrs; + } + + trace_xprtrdma_post_send(req); return ib_post_send(ep->re_id->qp, post_wr, NULL); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f97b03129d99..72b24dca96c1 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1365,21 +1365,7 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) */ int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { - struct ib_send_wr *send_wr = &req->rl_wr; - struct rpcrdma_ep *ep = r_xprt->rx_ep; - int rc; - - if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) { - send_wr->send_flags |= IB_SEND_SIGNALED; - ep->re_send_count = ep->re_send_batch; - } else { - send_wr->send_flags &= ~IB_SEND_SIGNALED; - --ep->re_send_count; - } - - trace_xprtrdma_post_send(req); - rc = frwr_send(r_xprt, req); - if (rc) + if (frwr_send(r_xprt, req)) return -ENOTCONN; return 0; } -- cgit v1.2.3 From 4ddd0fc32c94fbb77a8c0728dc507b2bdcc67edc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:31 -0400 Subject: xprtrdma: Add tracepoints showing FastReg WRs and remote invalidation The Send signaling logic is a little subtle, so add some observability around it. For every xprtrdma_mr_fastreg event, there should be an xprtrdma_mr_localinv or xprtrdma_mr_reminv event. When these tracepoints are enabled, we can see exactly when an MR is DMA-mapped, registered, invalidated (either locally or remotely) and then DMA-unmapped. kworker/u25:2-190 [000] 787.979512: xprtrdma_mr_map: task:351@5 mr.id=4 nents=2 5608@0x8679e0c8f6f56000:0x00000503 (TO_DEVICE) kworker/u25:2-190 [000] 787.979515: xprtrdma_chunk_read: task:351@5 pos=148 5608@0x8679e0c8f6f56000:0x00000503 (last) kworker/u25:2-190 [000] 787.979519: xprtrdma_marshal: task:351@5 xid=0x8679e0c8: hdr=52 xdr=148/5608/0 read list/inline kworker/u25:2-190 [000] 787.979525: xprtrdma_mr_fastreg: task:351@5 mr.id=4 nents=2 5608@0x8679e0c8f6f56000:0x00000503 (TO_DEVICE) kworker/u25:2-190 [000] 787.979526: xprtrdma_post_send: task:351@5 cq.id=0 cid=73 (2 SGEs) ... 
kworker/5:1H-219 [005] 787.980567: xprtrdma_wc_receive: cq.id=1 cid=161 status=SUCCESS (0/0x0) received=164
kworker/5:1H-219 [005] 787.980571: xprtrdma_post_recvs: peer=[192.168.100.55]:20049 r_xprt=0xffff8884974d4000: 0 new recvs, 70 active (rc 0)
kworker/5:1H-219 [005] 787.980573: xprtrdma_reply: task:351@5 xid=0x8679e0c8 credits=64
kworker/5:1H-219 [005] 787.980576: xprtrdma_mr_reminv: task:351@5 mr.id=4 nents=2 5608@0x8679e0c8f6f56000:0x00000503 (TO_DEVICE)
kworker/5:1H-219 [005] 787.980577: xprtrdma_mr_unmap: mr.id=4 nents=2 5608@0x8679e0c8f6f56000:0x00000503 (TO_DEVICE)

Note that I've moved the xprtrdma_post_send tracepoint so that the
event always appears after the xprtrdma_mr_fastreg tracepoint.
Otherwise the event log looks counterintuitive (FastReg is always
supposed to happen before Send).

Signed-off-by: Chuck Lever
Signed-off-by: Trond Myklebust
---
 include/trace/events/rpcrdma.h | 2 ++
 net/sunrpc/xprtrdma/frwr_ops.c | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'net')

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index e38e745d13b0..9462326b3535 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -1010,7 +1010,9 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
 	)
 );

+DEFINE_MR_EVENT(fastreg);
 DEFINE_MR_EVENT(localinv);
+DEFINE_MR_EVENT(reminv);
 DEFINE_MR_EVENT(map);

 DEFINE_ANON_MR_EVENT(unmap);
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 43a412ea337a..0f47c1e24037 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -400,6 +400,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	list_for_each_entry(mr, &req->rl_registered, mr_list) {
 		struct rpcrdma_frwr *frwr;

+		trace_xprtrdma_mr_fastreg(mr);
 		frwr = &mr->frwr;

 		frwr->fr_cqe.done = frwr_wc_fastreg;
@@ -440,6 +441,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
 	list_for_each_entry(mr, mrs, mr_list)
 		if (mr->mr_handle == rep->rr_inv_rkey) {
 			list_del_init(&mr->mr_list);
+			trace_xprtrdma_mr_reminv(mr);
 			frwr_mr_put(mr);
 			break;	/* only one invalidated MR per RPC */
 		}
--
cgit v1.2.3

From e1648eb23d839bd4b9f2999296d5e81dcd93311f Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Mon, 19 Apr 2021 14:03:50 -0400
Subject: xprtrdma: Remove the RPC/RDMA QP event handler

Clean up: The handler only recorded a trace event. If indeed no action
is needed by the RPC/RDMA consumer, then the event can be ignored.
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 32 -------------------------------- net/sunrpc/xprtrdma/verbs.c | 18 ------------------ 2 files changed, 50 deletions(-) (limited to 'net') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index e38b8e33be2d..ef6166b840e7 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -577,38 +577,6 @@ TRACE_EVENT(xprtrdma_op_set_cto, ) ); -TRACE_EVENT(xprtrdma_qp_event, - TP_PROTO( - const struct rpcrdma_ep *ep, - const struct ib_event *event - ), - - TP_ARGS(ep, event), - - TP_STRUCT__entry( - __field(unsigned long, event) - __string(name, event->device->name) - __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) - __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) - ), - - TP_fast_assign( - const struct rdma_cm_id *id = ep->re_id; - - __entry->event = event->event; - __assign_str(name, event->device->name); - memcpy(__entry->srcaddr, &id->route.addr.src_addr, - sizeof(struct sockaddr_in6)); - memcpy(__entry->dstaddr, &id->route.addr.dst_addr, - sizeof(struct sockaddr_in6)); - ), - - TP_printk("%pISpc -> %pISpc device=%s %s (%lu)", - __entry->srcaddr, __entry->dstaddr, __get_str(name), - rdma_show_ib_event(__entry->event), __entry->event - ) -); - /** ** Call events **/ diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 72b24dca96c1..1e965a380896 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -120,22 +120,6 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) rpcrdma_ep_put(ep); } -/** - * rpcrdma_qp_event_handler - Handle one QP event (error notification) - * @event: details of the event - * @context: ep that owns QP where event occurred - * - * Called from the RDMA provider (device driver) possibly in an interrupt - * context. The QP is always destroyed before the ID, so the ID will be - * reliably available when this handler is invoked. - */ -static void rpcrdma_qp_event_handler(struct ib_event *event, void *context) -{ - struct rpcrdma_ep *ep = context; - - trace_xprtrdma_qp_event(ep, event); -} - /* Ensure xprt_force_disconnect() is invoked exactly once when a * connection is closed or lost. (The important thing is it needs * to be invoked "at least" once). @@ -431,8 +415,6 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests); - ep->re_attr.event_handler = rpcrdma_qp_event_handler; - ep->re_attr.qp_context = ep; ep->re_attr.srq = NULL; ep->re_attr.cap.max_inline_data = 0; ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR; -- cgit v1.2.3 From 0a26d10e300204f2a064e44fb181323bc6d986eb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:56 -0400 Subject: xprtrdma: Move fr_cid to struct rpcrdma_mr Clean up (for several purposes): - The MR's cid is initialized sooner so that tracepoints can show something reasonable even if the MR is never posted. - The MR's res.id doesn't change so the cid won't change either. Initializing the cid once is sufficient. - struct rpcrdma_frwr is going away soon. 
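For reference, the ID pair being moved is the small tracing handle
consumed by the wc_* tracepoints. In sketch form, matching the
initialization in the hunk below (the comments are annotations, not
part of the header):

	/* One (queue, completion) ID pair per MR. Both values come
	 * from the RDMA core's resource tracker (res.id) and are
	 * fixed for the MR's lifetime, which is why a single
	 * initialization in frwr_mr_init() is now sufficient.
	 */
	struct rpc_rdma_cid {
		u32 ci_queue_id;	/* ep->re_attr.send_cq->res.id */
		u32 ci_completion_id;	/* mr->frwr.fr_mr->res.id */
	};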
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 31 +++++++++++++++---------------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 0f47c1e24037..d3c18c776bf9 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -49,6 +49,15 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +static void frwr_cid_init(struct rpcrdma_ep *ep, + struct rpcrdma_mr *mr) +{ + struct rpc_rdma_cid *cid = &mr->mr_cid; + + cid->ci_queue_id = ep->re_attr.send_cq->res.id; + cid->ci_completion_id = mr->frwr.fr_mr->res.id; +} + static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { if (mr->mr_device) { @@ -134,6 +143,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) mr->mr_device = NULL; INIT_LIST_HEAD(&mr->mr_list); init_completion(&mr->frwr.fr_linv_done); + frwr_cid_init(ep, mr); sg_init_table(sg, depth); mr->mr_sg = sg; @@ -358,22 +368,14 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); + struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid); + trace_xprtrdma_wc_fastreg(wc, &mr->mr_cid); rpcrdma_flush_disconnect(cq->cq_context, wc); } -static void frwr_cid_init(struct rpcrdma_ep *ep, - struct rpcrdma_frwr *frwr) -{ - struct rpc_rdma_cid *cid = &frwr->fr_cid; - - cid->ci_queue_id = ep->re_attr.send_cq->res.id; - cid->ci_completion_id = frwr->fr_mr->res.id; -} - /** * frwr_send - post Send WRs containing the RPC Call message * @r_xprt: controlling transport instance @@ -404,7 +406,6 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_fastreg; - frwr_cid_init(ep, frwr); frwr->fr_regwr.wr.next = post_wr; frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; frwr->fr_regwr.wr.num_sge = 0; @@ -467,7 +468,7 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li(wc, &frwr->fr_cid); + trace_xprtrdma_wc_li(wc, &mr->mr_cid); frwr_mr_done(wc, mr); rpcrdma_flush_disconnect(cq->cq_context, wc); @@ -488,7 +489,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid); + trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid); frwr_mr_done(wc, mr); complete(&frwr->fr_linv_done); @@ -529,7 +530,6 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; - frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; @@ -585,7 +585,7 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_rep *rep; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid); + trace_xprtrdma_wc_li_done(wc, &mr->mr_cid); /* Ensure that @rep is generated before the MR is released */ rep = mr->mr_req->rl_reply; @@ -631,7 +631,6 @@ void 
frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; - frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index bb8aba390b88..0cf073f0ee64 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -232,7 +232,6 @@ struct rpcrdma_sendctx { struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; - struct rpc_rdma_cid fr_cid; struct completion fr_linv_done; union { struct ib_reg_wr fr_regwr; @@ -254,6 +253,7 @@ struct rpcrdma_mr { u32 mr_length; u64 mr_offset; struct list_head mr_all; + struct rpc_rdma_cid mr_cid; }; /* -- cgit v1.2.3 From e10fa96d347488d1fd278e84f52ba7b25067cc71 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:04:03 -0400 Subject: xprtrdma: Move cqe to struct rpcrdma_mr Clean up. - Simplify variable initialization in the completion handlers. - Move another field out of struct rpcrdma_frwr. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 35 +++++++++++++++-------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 2 files changed, 16 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index d3c18c776bf9..2a886a28d82b 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -366,9 +366,7 @@ out_mapmr_err: static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_fastreg(wc, &mr->mr_cid); @@ -405,9 +403,9 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) trace_xprtrdma_mr_fastreg(mr); frwr = &mr->frwr; - frwr->fr_cqe.done = frwr_wc_fastreg; + mr->mr_cqe.done = frwr_wc_fastreg; frwr->fr_regwr.wr.next = post_wr; - frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; + frwr->fr_regwr.wr.wr_cqe = &mr->mr_cqe; frwr->fr_regwr.wr.num_sge = 0; frwr->fr_regwr.wr.opcode = IB_WR_REG_MR; frwr->fr_regwr.wr.send_flags = 0; @@ -463,9 +461,7 @@ static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr) static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li(wc, &mr->mr_cid); @@ -484,9 +480,8 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); + struct rpcrdma_frwr *frwr = &mr->frwr; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid); @@ -529,16 +524,17 @@ void frwr_unmap_sync(struct rpcrdma_xprt 
*r_xprt, struct rpcrdma_req *req) r_xprt->rx_stats.local_inv_needed++; frwr = &mr->frwr; - frwr->fr_cqe.done = frwr_wc_localinv; last = &frwr->fr_invwr; last->next = NULL; - last->wr_cqe = &frwr->fr_cqe; + last->wr_cqe = &mr->mr_cqe; last->sg_list = NULL; last->num_sge = 0; last->opcode = IB_WR_LOCAL_INV; last->send_flags = IB_SEND_SIGNALED; last->ex.invalidate_rkey = mr->mr_handle; + last->wr_cqe->done = frwr_wc_localinv; + *prev = last; prev = &last->next; } @@ -547,7 +543,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * last WR in the chain completes, all WRs in the chain * are complete. */ - frwr->fr_cqe.done = frwr_wc_localinv_wake; + last->wr_cqe->done = frwr_wc_localinv_wake; reinit_completion(&frwr->fr_linv_done); /* Transport disconnect drains the receive CQ before it @@ -579,9 +575,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); struct rpcrdma_rep *rep; /* WARNING: Only wr_cqe and status are reliable at this point */ @@ -630,16 +624,17 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) r_xprt->rx_stats.local_inv_needed++; frwr = &mr->frwr; - frwr->fr_cqe.done = frwr_wc_localinv; last = &frwr->fr_invwr; last->next = NULL; - last->wr_cqe = &frwr->fr_cqe; + last->wr_cqe = &mr->mr_cqe; last->sg_list = NULL; last->num_sge = 0; last->opcode = IB_WR_LOCAL_INV; last->send_flags = IB_SEND_SIGNALED; last->ex.invalidate_rkey = mr->mr_handle; + last->wr_cqe->done = frwr_wc_localinv; + *prev = last; prev = &last->next; } @@ -649,7 +644,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * are complete. The last completion will wake up the * RPC waiter. */ - frwr->fr_cqe.done = frwr_wc_localinv_done; + last->wr_cqe->done = frwr_wc_localinv_done; /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0cf073f0ee64..f72b69c3f0ea 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -231,7 +231,6 @@ struct rpcrdma_sendctx { */ struct rpcrdma_frwr { struct ib_mr *fr_mr; - struct ib_cqe fr_cqe; struct completion fr_linv_done; union { struct ib_reg_wr fr_regwr; @@ -247,6 +246,7 @@ struct rpcrdma_mr { struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; + struct ib_cqe mr_cqe; struct rpcrdma_frwr frwr; struct rpcrdma_xprt *mr_xprt; u32 mr_handle; -- cgit v1.2.3 From 9a301cafc8619c7f30032d314da6e65d9d913d57 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:04:09 -0400 Subject: xprtrdma: Move fr_linv_done field to struct rpcrdma_mr Clean up: Move more of struct rpcrdma_frwr into its parent. 
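With the cqe embedded in struct rpcrdma_mr, every completion handler recovers its MR directly from the work completion. A minimal sketch of the container_of() step the handlers now share (the helper name is illustrative; the handlers open-code it):

	static struct rpcrdma_mr *mr_from_wc(struct ib_wc *wc)
	{
		/* wr_cqe points at the mr_cqe member embedded in the MR */
		return container_of(wc->wr_cqe, struct rpcrdma_mr, mr_cqe);
	}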
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 9 ++++----- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 2a886a28d82b..7d7a64d2ca4a 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -142,7 +142,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) mr->frwr.fr_mr = frmr; mr->mr_device = NULL; INIT_LIST_HEAD(&mr->mr_list); - init_completion(&mr->frwr.fr_linv_done); + init_completion(&mr->mr_linv_done); frwr_cid_init(ep, mr); sg_init_table(sg, depth); @@ -481,12 +481,11 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); - struct rpcrdma_frwr *frwr = &mr->frwr; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid); frwr_mr_done(wc, mr); - complete(&frwr->fr_linv_done); + complete(&mr->mr_linv_done); rpcrdma_flush_disconnect(cq->cq_context, wc); } @@ -544,7 +543,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * are complete. */ last->wr_cqe->done = frwr_wc_localinv_wake; - reinit_completion(&frwr->fr_linv_done); + reinit_completion(&mr->mr_linv_done); /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us @@ -558,7 +557,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * not happen, so don't wait in that case. */ if (bad_wr != first) - wait_for_completion(&frwr->fr_linv_done); + wait_for_completion(&mr->mr_linv_done); if (!rc) return; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index f72b69c3f0ea..1374054b73fd 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -231,7 +231,6 @@ struct rpcrdma_sendctx { */ struct rpcrdma_frwr { struct ib_mr *fr_mr; - struct completion fr_linv_done; union { struct ib_reg_wr fr_regwr; struct ib_send_wr fr_invwr; @@ -247,6 +246,7 @@ struct rpcrdma_mr { int mr_nents; enum dma_data_direction mr_dir; struct ib_cqe mr_cqe; + struct completion mr_linv_done; struct rpcrdma_frwr frwr; struct rpcrdma_xprt *mr_xprt; u32 mr_handle; -- cgit v1.2.3 From dcff9ed209aa6ad8fc575c7fccf6496fef44e869 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:04:15 -0400 Subject: xprtrdma: Move the Work Request union to struct rpcrdma_mr Clean up. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 28 +++++++++------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 8 ++++---- 2 files changed, 13 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 7d7a64d2ca4a..bb2b9c607c71 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -333,7 +333,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, key = (u8)(ibmr->rkey & 0x000000FF); ib_update_fast_reg_key(ibmr, ++key); - reg_wr = &mr->frwr.fr_regwr; + reg_wr = &mr->mr_regwr; reg_wr->mr = ibmr; reg_wr->key = ibmr->rkey; reg_wr->access = writing ? 
@@ -398,19 +398,15 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) num_wrs = 1; post_wr = send_wr; list_for_each_entry(mr, &req->rl_registered, mr_list) { - struct rpcrdma_frwr *frwr; - trace_xprtrdma_mr_fastreg(mr); - frwr = &mr->frwr; mr->mr_cqe.done = frwr_wc_fastreg; - frwr->fr_regwr.wr.next = post_wr; - frwr->fr_regwr.wr.wr_cqe = &mr->mr_cqe; - frwr->fr_regwr.wr.num_sge = 0; - frwr->fr_regwr.wr.opcode = IB_WR_REG_MR; - frwr->fr_regwr.wr.send_flags = 0; - - post_wr = &frwr->fr_regwr.wr; + mr->mr_regwr.wr.next = post_wr; + mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe; + mr->mr_regwr.wr.num_sge = 0; + mr->mr_regwr.wr.opcode = IB_WR_REG_MR; + mr->mr_regwr.wr.send_flags = 0; + post_wr = &mr->mr_regwr.wr; ++num_wrs; } @@ -506,7 +502,6 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) struct ib_send_wr *first, **prev, *last; struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; - struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; int rc; @@ -515,15 +510,13 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ - frwr = NULL; prev = &first; while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; - frwr = &mr->frwr; - last = &frwr->fr_invwr; + last = &mr->mr_invwr; last->next = NULL; last->wr_cqe = &mr->mr_cqe; last->sg_list = NULL; @@ -608,22 +601,19 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, *last, **prev; struct rpcrdma_ep *ep = r_xprt->rx_ep; - struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; int rc; /* Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ - frwr = NULL; prev = &first; while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; - frwr = &mr->frwr; - last = &frwr->fr_invwr; + last = &mr->mr_invwr; last->next = NULL; last->wr_cqe = &mr->mr_cqe; last->sg_list = NULL; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 1374054b73fd..99ef83d673d6 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -231,10 +231,6 @@ struct rpcrdma_sendctx { */ struct rpcrdma_frwr { struct ib_mr *fr_mr; - union { - struct ib_reg_wr fr_regwr; - struct ib_send_wr fr_invwr; - }; }; struct rpcrdma_req; @@ -247,6 +243,10 @@ struct rpcrdma_mr { enum dma_data_direction mr_dir; struct ib_cqe mr_cqe; struct completion mr_linv_done; + union { + struct ib_reg_wr mr_regwr; + struct ib_send_wr mr_invwr; + }; struct rpcrdma_frwr frwr; struct rpcrdma_xprt *mr_xprt; u32 mr_handle; -- cgit v1.2.3 From 13bcf7e32a0181095cd62010579869e87aacb332 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:04:21 -0400 Subject: xprtrdma: Move fr_mr field to struct rpcrdma_mr Clean up: The last remaining field in struct rpcrdma_frwr has been removed, so the struct can be eliminated. 
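Pieced together from the hunks in this series, the consolidated MR ends up looking roughly like this (abbreviated):

	struct rpcrdma_mr {
		struct list_head	mr_list;
		struct rpcrdma_req	*mr_req;
		struct ib_mr		*mr_ibmr;
		struct ib_device	*mr_device;
		struct ib_cqe		mr_cqe;
		struct completion	mr_linv_done;
		union {
			struct ib_reg_wr	mr_regwr;
			struct ib_send_wr	mr_invwr;
		};
		/* ... scatterlist, mr_handle, mr_length, mr_cid, etc. ... */
	};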
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 12 ++++++------ net/sunrpc/xprtrdma/frwr_ops.c | 8 ++++---- net/sunrpc/xprtrdma/xprt_rdma.h | 7 ++----- 3 files changed, 12 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index ef6166b840e7..bd55908c1bef 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -379,7 +379,7 @@ DECLARE_EVENT_CLASS(xprtrdma_mr_class, __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; @@ -420,7 +420,7 @@ DECLARE_EVENT_CLASS(xprtrdma_anonymous_mr_class, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; @@ -903,7 +903,7 @@ TRACE_EVENT(xprtrdma_frwr_alloc, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->rc = rc; ), @@ -931,7 +931,7 @@ TRACE_EVENT(xprtrdma_frwr_dereg, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; @@ -964,7 +964,7 @@ TRACE_EVENT(xprtrdma_frwr_sgerr, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->addr = mr->mr_sg->dma_address; __entry->dir = mr->mr_dir; __entry->nents = sg_nents; @@ -994,7 +994,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->addr = mr->mr_sg->dma_address; __entry->dir = mr->mr_dir; __entry->num_mapped = num_mapped; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index bb2b9c607c71..285d73246fc2 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -55,7 +55,7 @@ static void frwr_cid_init(struct rpcrdma_ep *ep, struct rpc_rdma_cid *cid = &mr->mr_cid; cid->ci_queue_id = ep->re_attr.send_cq->res.id; - cid->ci_completion_id = mr->frwr.fr_mr->res.id; + cid->ci_completion_id = mr->mr_ibmr->res.id; } static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) @@ -79,7 +79,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr) frwr_mr_unmap(mr->mr_xprt, mr); - rc = ib_dereg_mr(mr->frwr.fr_mr); + rc = ib_dereg_mr(mr->mr_ibmr); if (rc) trace_xprtrdma_frwr_dereg(mr, rc); kfree(mr->mr_sg); @@ -139,7 +139,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) goto out_list_err; mr->mr_xprt = r_xprt; - mr->frwr.fr_mr = frmr; + mr->mr_ibmr = frmr; mr->mr_device = NULL; INIT_LIST_HEAD(&mr->mr_list); init_completion(&mr->mr_linv_done); @@ -323,7 +323,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, goto out_dmamap_err; mr->mr_device = ep->re_id->device; - ibmr = mr->frwr.fr_mr; + ibmr = mr->mr_ibmr; n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); if (n != dma_nents) goto out_mapmr_err; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 99ef83d673d6..436ad7312614 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -229,14 +229,12 @@ struct rpcrdma_sendctx { * An external memory region is 
any buffer or page that is registered * on the fly (ie, not pre-registered). */ -struct rpcrdma_frwr { - struct ib_mr *fr_mr; -}; - struct rpcrdma_req; struct rpcrdma_mr { struct list_head mr_list; struct rpcrdma_req *mr_req; + + struct ib_mr *mr_ibmr; struct ib_device *mr_device; struct scatterlist *mr_sg; int mr_nents; @@ -247,7 +245,6 @@ struct rpcrdma_mr { struct ib_reg_wr mr_regwr; struct ib_send_wr mr_invwr; }; - struct rpcrdma_frwr frwr; struct rpcrdma_xprt *mr_xprt; u32 mr_handle; u32 mr_length; -- cgit v1.2.3 From f8f7e0fb22b2e75be55f2f0c13e229e75b0eac07 Mon Sep 17 00:00:00 2001 From: Baptiste Lepers Date: Sat, 1 May 2021 14:10:51 +1000 Subject: sunrpc: Fix misplaced barrier in call_decode Fix a misplaced barrier in call_decode. The struct rpc_rqst is modified as follows by xprt_complete_rqst: req->rq_private_buf.len = copied; /* Ensure all writes are done before we update */ /* req->rq_reply_bytes_recvd */ smp_wmb(); req->rq_reply_bytes_recvd = copied; And currently read as follows by call_decode: smp_rmb(); // misplaced if (!req->rq_reply_bytes_recvd) goto out; req->rq_rcv_buf.len = req->rq_private_buf.len; This patch places the smp_rmb after the if to ensure that rq_reply_bytes_recvd and rq_private_buf.len are read in order. Fixes: 9ba828861c56a ("SUNRPC: Don't try to parse incomplete RPC messages") Signed-off-by: Baptiste Lepers Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index c2a01125be1a..f555d335e910 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2456,12 +2456,6 @@ call_decode(struct rpc_task *task) task->tk_flags &= ~RPC_CALL_MAJORSEEN; } - /* - * Ensure that we see all writes made by xprt_complete_rqst() - * before it changed req->rq_reply_bytes_recvd. - */ - smp_rmb(); - /* * Did we ever call xprt_complete_rqst()? If not, we should assume * the message is incomplete. @@ -2470,6 +2464,11 @@ call_decode(struct rpc_task *task) if (!req->rq_reply_bytes_recvd) goto out; + /* Ensure that we see all writes made by xprt_complete_rqst() + * before it changed req->rq_reply_bytes_recvd. + */ + smp_rmb(); + req->rq_rcv_buf.len = req->rq_private_buf.len; trace_rpc_xdr_recvfrom(task, &req->rq_rcv_buf); -- cgit v1.2.3 From 9e895cd9649abe4392c59d14e31b0f5667d082d2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 1 May 2021 15:38:02 -0400 Subject: xprtrdma: Fix a NULL dereference in frwr_unmap_sync() The normal mechanism that invalidates and unmaps MRs is frwr_unmap_async(). frwr_unmap_sync() is used only when an RPC Reply bearing Write or Reply chunks has been lost (ie, almost never). Coverity found that after commit 9a301cafc861 ("xprtrdma: Move fr_linv_done field to struct rpcrdma_mr"), the while() loop in frwr_unmap_sync() exits only once @mr is NULL, unconditionally causing subsequent dereferences of @mr to Oops. I've tested this fix by creating a client that skips invoking frwr_unmap_async() when RPC Replies complete. That forces all invalidation tasks to fall upon frwr_unmap_sync(). Simple workloads with this fix applied to the adulterated client work as designed. 
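A sketch of the failure mode and the fix: the pop loop necessarily exits with @mr set to NULL, so the last MR must be re-derived from the final chained LOCAL_INV Work Request before @mr is dereferenced again:

	while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
		/* ... chain mr->mr_invwr onto the list to be posted ... */
	}
	/* @mr is now NULL; recover the last MR so that the
	 * reinit_completion(&mr->mr_linv_done) that follows is safe
	 */
	mr = container_of(last, struct rpcrdma_mr, mr_invwr);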
Reported-by: coverity-bot Addresses-Coverity-ID: 1504556 ("Null pointer dereferences") Fixes: 9a301cafc861 ("xprtrdma: Move fr_linv_done field to struct rpcrdma_mr") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 285d73246fc2..229fcc9a9064 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -530,6 +530,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) *prev = last; prev = &last->next; } + mr = container_of(last, struct rpcrdma_mr, mr_invwr); /* Strong send queue ordering guarantees that when the * last WR in the chain completes, all WRs in the chain -- cgit v1.2.3