From 5cd8b0d4dd96eece89f0b4ad623028057edd3245 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 9 Oct 2019 12:58:08 -0400 Subject: SUNRPC: Eliminate log noise in call_reserveresult Sep 11 16:35:20 manet kernel: call_reserveresult: unrecognized error -512, exiting Diagnostic error messages such as this likely have no value for NFS client administrators. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f7f78566be46..a72829766b50 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1676,8 +1676,6 @@ call_reserveresult(struct rpc_task *task) return; } - printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n", - __func__, status); rpc_call_rpcerror(task, -EIO); return; } @@ -1686,11 +1684,8 @@ call_reserveresult(struct rpc_task *task) * Even though there was an error, we may have acquired * a request slot somehow. Make sure not to leak it. */ - if (task->tk_rqstp) { - printk(KERN_ERR "%s: status=%d, request allocated anyway\n", - __func__, status); + if (task->tk_rqstp) xprt_release(task); - } switch (status) { case -ENOMEM: @@ -1699,14 +1694,9 @@ call_reserveresult(struct rpc_task *task) case -EAGAIN: /* woken up; retry */ task->tk_action = call_retry_reserve; return; - case -EIO: /* probably a shutdown */ - break; default: - printk(KERN_ERR "%s: unrecognized error %d, exiting\n", - __func__, status); - break; + rpc_call_rpcerror(task, status); } - rpc_call_rpcerror(task, status); } /* -- cgit v1.2.3 From bf7ca707ae60045342e145c88a83bbe00f66775f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 9 Oct 2019 12:58:14 -0400 Subject: SUNRPC: Add trace points to observe transport congestion control To help debug problems with RPC/RDMA credit management, replace dprintk() call sites in the transport send lock paths with trace events. Similar trace points are defined for the non-congestion paths. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 93 +++++++++++++++++++++++++++++++++++++++++++ net/sunrpc/xprt.c | 22 +++++----- 2 files changed, 106 insertions(+), 9 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index ffa3c51dbb1a..378233fe5ac7 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -777,6 +777,99 @@ TRACE_EVENT(xprt_ping, __get_str(addr), __get_str(port), __entry->status) ); +DECLARE_EVENT_CLASS(xprt_writelock_event, + TP_PROTO( + const struct rpc_xprt *xprt, const struct rpc_task *task + ), + + TP_ARGS(xprt, task), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(unsigned int, snd_task_id) + ), + + TP_fast_assign( + if (task) { + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; + } else { + __entry->task_id = -1; + __entry->client_id = -1; + } + __entry->snd_task_id = xprt->snd_task ? + xprt->snd_task->tk_pid : -1; + ), + + TP_printk("task:%u@%u snd_task:%u", + __entry->task_id, __entry->client_id, + __entry->snd_task_id) +); + +#define DEFINE_WRITELOCK_EVENT(name) \ + DEFINE_EVENT(xprt_writelock_event, xprt_##name, \ + TP_PROTO( \ + const struct rpc_xprt *xprt, \ + const struct rpc_task *task \ + ), \ + TP_ARGS(xprt, task)) + +DEFINE_WRITELOCK_EVENT(reserve_xprt); +DEFINE_WRITELOCK_EVENT(release_xprt); + +DECLARE_EVENT_CLASS(xprt_cong_event, + TP_PROTO( + const struct rpc_xprt *xprt, const struct rpc_task *task + ), + + TP_ARGS(xprt, task), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(unsigned int, snd_task_id) + __field(unsigned long, cong) + __field(unsigned long, cwnd) + __field(bool, wait) + ), + + TP_fast_assign( + if (task) { + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; + } else { + __entry->task_id = -1; + __entry->client_id = -1; + } + __entry->snd_task_id = xprt->snd_task ? + xprt->snd_task->tk_pid : -1; + __entry->cong = xprt->cong; + __entry->cwnd = xprt->cwnd; + __entry->wait = test_bit(XPRT_CWND_WAIT, &xprt->state); + ), + + TP_printk("task:%u@%u snd_task:%u cong=%lu cwnd=%lu%s", + __entry->task_id, __entry->client_id, + __entry->snd_task_id, __entry->cong, __entry->cwnd, + __entry->wait ? " (wait)" : "") +); + +#define DEFINE_CONG_EVENT(name) \ + DEFINE_EVENT(xprt_cong_event, xprt_##name, \ + TP_PROTO( \ + const struct rpc_xprt *xprt, \ + const struct rpc_task *task \ + ), \ + TP_ARGS(xprt, task)) + +DEFINE_CONG_EVENT(reserve_cong); +DEFINE_CONG_EVENT(release_cong); +DEFINE_CONG_EVENT(get_cong); +DEFINE_CONG_EVENT(put_cong); + TRACE_EVENT(xs_stream_read_data, TP_PROTO(struct rpc_xprt *xprt, ssize_t err, size_t total), diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 8a45b3ccc313..fcbeb56c82cd 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -205,20 +205,20 @@ int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) - return 1; + goto out_locked; goto out_sleep; } if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) goto out_unlock; xprt->snd_task = task; +out_locked: + trace_xprt_reserve_xprt(xprt, task); return 1; out_unlock: xprt_clear_locked(xprt); out_sleep: - dprintk("RPC: %5u failed to lock transport %p\n", - task->tk_pid, xprt); task->tk_status = -EAGAIN; if (RPC_IS_SOFT(task)) rpc_sleep_on_timeout(&xprt->sending, task, NULL, @@ -269,23 +269,22 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) - return 1; + goto out_locked; goto out_sleep; } if (req == NULL) { xprt->snd_task = task; - return 1; + goto out_locked; } if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) goto out_unlock; if (!xprt_need_congestion_window_wait(xprt)) { xprt->snd_task = task; - return 1; + goto out_locked; } out_unlock: xprt_clear_locked(xprt); out_sleep: - dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); task->tk_status = -EAGAIN; if (RPC_IS_SOFT(task)) rpc_sleep_on_timeout(&xprt->sending, task, NULL, @@ -293,6 +292,9 @@ out_sleep: else rpc_sleep_on(&xprt->sending, task, NULL); return 0; +out_locked: + trace_xprt_reserve_cong(xprt, task); + return 1; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); @@ -357,6 +359,7 @@ void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } + trace_xprt_release_xprt(xprt, task); } EXPORT_SYMBOL_GPL(xprt_release_xprt); @@ -374,6 +377,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } + trace_xprt_release_cong(xprt, task); } EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); @@ -395,8 +399,7 @@ __xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) { if (req->rq_cong) return 1; - dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n", - req->rq_task->tk_pid, xprt->cong, xprt->cwnd); + trace_xprt_get_cong(xprt, req->rq_task); if (RPCXPRT_CONGESTED(xprt)) { xprt_set_congestion_window_wait(xprt); return 0; @@ -418,6 +421,7 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) req->rq_cong = 0; xprt->cong -= RPC_CWNDSCALE; xprt_test_and_clear_congestion_window_wait(xprt); + trace_xprt_put_cong(xprt, req->rq_task); __xprt_lock_write_next_cong(xprt); } -- cgit v1.2.3 From 4b93dab36f28e673725e5e6123ebfccf7697f96a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 9 Oct 2019 13:07:21 -0400 Subject: xprtrdma: Add unique trace points for posting Local Invalidate WRs When adding frwr_unmap_async way back when, I re-used the existing trace_xprtrdma_post_send() trace point to record the return code of ib_post_send. Unfortunately there are some cases where re-using that trace point causes a crash. Instead, construct a trace point specific to posting Local Invalidate WRs that will always be safe to use in that context, and will act as a trace log eye-catcher for Local Invalidation. Fixes: 847568942f93 ("xprtrdma: Remove fr_state") Fixes: d8099feda483 ("xprtrdma: Reduce context switching due ... ") Signed-off-by: Chuck Lever Tested-by: Bill Baker Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 25 +++++++++++++++++++++++++ net/sunrpc/xprtrdma/frwr_ops.c | 4 ++-- 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index a13830616107..7fd11ec1c9a4 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -735,6 +735,31 @@ TRACE_EVENT(xprtrdma_post_recvs, ) ); +TRACE_EVENT(xprtrdma_post_linv, + TP_PROTO( + const struct rpcrdma_req *req, + int status + ), + + TP_ARGS(req, status), + + TP_STRUCT__entry( + __field(const void *, req) + __field(int, status) + __field(u32, xid) + ), + + TP_fast_assign( + __entry->req = req; + __entry->status = status; + __entry->xid = be32_to_cpu(req->rl_slot.rq_xid); + ), + + TP_printk("req=%p xid=0x%08x status=%d", + __entry->req, __entry->xid, __entry->status + ) +); + /** ** Completion events **/ diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 30065a28628c..9901a811f598 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -570,7 +570,6 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ bad_wr = NULL; rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); - trace_xprtrdma_post_send(req, rc); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will @@ -583,6 +582,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Recycle MRs in the LOCAL_INV chain that did not get posted. */ + trace_xprtrdma_post_linv(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); @@ -673,12 +673,12 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ bad_wr = NULL; rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); - trace_xprtrdma_post_send(req, rc); if (!rc) return; /* Recycle MRs in the LOCAL_INV chain that did not get posted. */ + trace_xprtrdma_post_linv(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); mr = container_of(frwr, struct rpcrdma_mr, frwr); -- cgit v1.2.3 From a31b2f939219dd9bffdf01a45bd91f209f8cc369 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 9 Oct 2019 13:07:27 -0400 Subject: xprtrdma: Connection becomes unstable after a reconnect This is because xprt_request_get_cong() is allowing more than one RPC Call to be transmitted before the first Receive on the new connection. The first Receive fills the Receive Queue based on the server's credit grant. Before that Receive, there is only a single Receive WR posted because the client doesn't know the server's credit grant. Solution is to clear rq_cong on all outstanding rpc_rqsts when the the cwnd is reset. This is because an RPC/RDMA credit is good for one connection instance only. Fixes: 75891f502f5f ("SUNRPC: Support for congestion control ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 3 +++ net/sunrpc/xprtrdma/verbs.c | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 160558b4135e..c67d465dc062 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -428,8 +428,11 @@ void xprt_rdma_close(struct rpc_xprt *xprt) /* Prepare @xprt for the next connection by reinitializing * its credit grant to one (see RFC 8166, Section 3.3.3). */ + spin_lock(&xprt->transport_lock); r_xprt->rx_buf.rb_credits = 1; + xprt->cong = 0; xprt->cwnd = RPC_CWNDSHIFT; + spin_unlock(&xprt->transport_lock); out: xprt->reestablish_timeout = 0; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3a907537e2cf..f4b136504e96 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -75,6 +75,7 @@ * internal functions */ static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); +static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); @@ -780,6 +781,7 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) trace_xprtrdma_disconnect(r_xprt, rc); rpcrdma_xprt_drain(r_xprt); + rpcrdma_reqs_reset(r_xprt); } /* Fixed-size circular FIFO queue. This implementation is wait-free and @@ -1042,6 +1044,26 @@ out1: return NULL; } +/** + * rpcrdma_reqs_reset - Reset all reqs owned by a transport + * @r_xprt: controlling transport instance + * + * ASSUMPTION: the rb_allreqs list is stable for the duration, + * and thus can be walked without holding rb_lock. Eg. the + * caller is holding the transport send lock to exclude + * device removal or disconnection. + */ +static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req; + + list_for_each_entry(req, &buf->rb_allreqs, rl_all) { + /* Credits are valid only for one connection */ + req->rl_slot.rq_cong = 0; + } +} + static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp) { -- cgit v1.2.3 From eea63ca7ffa1f3a4a0b02b902ec51eab2d4e9df4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 9 Oct 2019 13:07:32 -0400 Subject: xprtrdma: Initialize rb_credits in one place Clean up/code de-duplication. Nit: RPC_CWNDSHIFT is incorrect as the initial value for xprt->cwnd. This mistake does not appear to have operational consequences, since the cwnd value is replaced with a valid value upon the first Receive completion. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 42 +++++++++++++++++++++++++++++++++++------ net/sunrpc/xprtrdma/transport.c | 9 --------- net/sunrpc/xprtrdma/verbs.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 4 files changed, 38 insertions(+), 16 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index b86b5fd62d9f..f1e3639d2050 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -916,6 +916,40 @@ out_err: return ret; } +static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt, + struct rpcrdma_buffer *buf, + u32 grant) +{ + buf->rb_credits = grant; + xprt->cwnd = grant << RPC_CWNDSHIFT; +} + +static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + spin_lock(&xprt->transport_lock); + __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, grant); + spin_unlock(&xprt->transport_lock); +} + +/** + * rpcrdma_reset_cwnd - Reset the xprt's congestion window + * @r_xprt: controlling transport instance + * + * Prepare @r_xprt for the next connection by reinitializing + * its credit grant to one (see RFC 8166, Section 3.3.3). + */ +void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + spin_lock(&xprt->transport_lock); + xprt->cong = 0; + __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1); + spin_unlock(&xprt->transport_lock); +} + /** * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs * @rqst: controlling RPC request @@ -1356,12 +1390,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) credits = 1; /* don't deadlock */ else if (credits > buf->rb_max_requests) credits = buf->rb_max_requests; - if (buf->rb_credits != credits) { - spin_lock(&xprt->transport_lock); - buf->rb_credits = credits; - xprt->cwnd = credits << RPC_CWNDSHIFT; - spin_unlock(&xprt->transport_lock); - } + if (buf->rb_credits != credits) + rpcrdma_update_cwnd(r_xprt, credits); req = rpcr_to_rdmar(rqst); if (req->rl_reply) { diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index c67d465dc062..0711308277eb 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -425,15 +425,6 @@ void xprt_rdma_close(struct rpc_xprt *xprt) return; rpcrdma_ep_disconnect(ep, ia); - /* Prepare @xprt for the next connection by reinitializing - * its credit grant to one (see RFC 8166, Section 3.3.3). - */ - spin_lock(&xprt->transport_lock); - r_xprt->rx_buf.rb_credits = 1; - xprt->cong = 0; - xprt->cwnd = RPC_CWNDSHIFT; - spin_unlock(&xprt->transport_lock); - out: xprt->reestablish_timeout = 0; ++xprt->connect_cookie; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f4b136504e96..97bc15e287b2 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -727,6 +727,7 @@ retry: ep->rep_connected = 0; xprt_clear_connected(xprt); + rpcrdma_reset_cwnd(r_xprt); rpcrdma_post_recvs(r_xprt, true); rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); @@ -1163,7 +1164,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) list_add(&req->rl_list, &buf->rb_send_bufs); } - buf->rb_credits = 1; init_llist_head(&buf->rb_free_reps); rc = rpcrdma_sendctxs_create(r_xprt); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 65e6b0eb862e..b170cf52c638 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -576,6 +576,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); +void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt); void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); void rpcrdma_reply_handler(struct rpcrdma_rep *rep); -- cgit v1.2.3 From 2ae50ad68cd79224198b525f7bd645c9da98b6ff Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 9 Oct 2019 13:07:38 -0400 Subject: xprtrdma: Close window between waking RPC senders and posting Receives A recent clean up attempted to separate Receive handling and RPC Reply processing, in the name of clean layering. Unfortunately, we can't do this because the Receive Queue has to be refilled _after_ the most recent credit update from the responder is parsed from the transport header, but _before_ we wake up the next RPC sender. That is right in the middle of rpcrdma_reply_handler(). Usually this isn't a problem because current responder implementations don't vary their credit grant. The one exception is when a connection is established: the grant goes from one to a much larger number on the first Receive. The requester MUST post enough Receives right then so that any outstanding requests can be sent without risking RNR and connection loss. Fixes: 6ceea36890a0 ("xprtrdma: Refactor Receive accounting") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 1 + net/sunrpc/xprtrdma/verbs.c | 11 +++++++---- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index f1e3639d2050..7c125e6cca4f 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1392,6 +1392,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) credits = buf->rb_max_requests; if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); + rpcrdma_post_recvs(r_xprt, false); req = rpcr_to_rdmar(rqst); if (req->rl_reply) { diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 97bc15e287b2..3a1a31c3d791 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -85,7 +85,6 @@ rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags); static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); -static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); /* Wait for outstanding transport work to finish. ib_drain_qp * handles the drains in the wrong order for us, so open code @@ -171,7 +170,6 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rdmab_addr(rep->rr_rdmabuf), wc->byte_len, DMA_FROM_DEVICE); - rpcrdma_post_recvs(r_xprt, false); rpcrdma_reply_handler(rep); return; @@ -1477,8 +1475,13 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, return 0; } -static void -rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) +/** + * rpcrdma_post_recvs - Refill the Receive Queue + * @r_xprt: controlling transport instance + * @temp: mark Receive buffers to be deleted after use + * + */ +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ep *ep = &r_xprt->rx_ep; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index b170cf52c638..2f89cfccdb48 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -474,6 +474,7 @@ void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_req *); +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); /* * Buffer calls - xprtrdma/verbs.c -- cgit v1.2.3 From c3700780a096fc66467c81076ddf7f3f11d639b5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 9 Oct 2019 13:07:43 -0400 Subject: xprtrdma: Fix MR list handling Close some holes introduced by commit 6dc6ec9e04c4 ("xprtrdma: Cache free MRs in each rpcrdma_req") that could result in list corruption. In addition, the result that is tabulated in @count is no longer used, so @count is removed. Fixes: 6dc6ec9e04c4 ("xprtrdma: Cache free MRs in each rpcrdma_req") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3a1a31c3d791..82361e7bbb51 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -79,7 +79,6 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); -static void rpcrdma_mr_free(struct rpcrdma_mr *mr); static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags); @@ -966,7 +965,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) mr->mr_xprt = r_xprt; spin_lock(&buf->rb_lock); - list_add(&mr->mr_list, &buf->rb_mrs); + rpcrdma_mr_push(mr, &buf->rb_mrs); list_add(&mr->mr_all, &buf->rb_all_mrs); spin_unlock(&buf->rb_lock); } @@ -1183,10 +1182,19 @@ out: */ void rpcrdma_req_destroy(struct rpcrdma_req *req) { + struct rpcrdma_mr *mr; + list_del(&req->rl_all); - while (!list_empty(&req->rl_free_mrs)) - rpcrdma_mr_free(rpcrdma_mr_pop(&req->rl_free_mrs)); + while ((mr = rpcrdma_mr_pop(&req->rl_free_mrs))) { + struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf; + + spin_lock(&buf->rb_lock); + list_del(&mr->mr_all); + spin_unlock(&buf->rb_lock); + + frwr_release_mr(mr); + } rpcrdma_regbuf_free(req->rl_recvbuf); rpcrdma_regbuf_free(req->rl_sendbuf); @@ -1194,24 +1202,28 @@ void rpcrdma_req_destroy(struct rpcrdma_req *req) kfree(req); } -static void -rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) +/** + * rpcrdma_mrs_destroy - Release all of a transport's MRs + * @buf: controlling buffer instance + * + * Relies on caller holding the transport send lock to protect + * removing mr->mr_list from req->rl_free_mrs safely. + */ +static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); struct rpcrdma_mr *mr; - unsigned int count; - count = 0; spin_lock(&buf->rb_lock); while ((mr = list_first_entry_or_null(&buf->rb_all_mrs, struct rpcrdma_mr, mr_all)) != NULL) { + list_del(&mr->mr_list); list_del(&mr->mr_all); spin_unlock(&buf->rb_lock); frwr_release_mr(mr); - count++; spin_lock(&buf->rb_lock); } spin_unlock(&buf->rb_lock); @@ -1284,17 +1296,6 @@ void rpcrdma_mr_put(struct rpcrdma_mr *mr) rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); } -static void rpcrdma_mr_free(struct rpcrdma_mr *mr) -{ - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - - mr->mr_req = NULL; - spin_lock(&buf->rb_lock); - rpcrdma_mr_push(mr, &buf->rb_mrs); - spin_unlock(&buf->rb_lock); -} - /** * rpcrdma_buffer_get - Get a request buffer * @buffers: Buffer pool from which to obtain a buffer -- cgit v1.2.3 From 9d2da4ff00f37de17fc25c23e50463b58b9e8fec Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 9 Oct 2019 13:07:48 -0400 Subject: xprtrdma: Manage MRs in context of a single connection MRs are now allocated on demand so we can safely throw them away on disconnect. This way an idle transport can disconnect and it won't pin hardware MR resources. Two additional changes: - Now that all MRs are destroyed on disconnect, there's no need to check during header marshaling if a req has MRs to recycle. Each req is sent only once per connection, and now rl_registered is guaranteed to be empty when rpcrdma_marshal_req is invoked. - Because MRs are now destroyed in a WQ_MEM_RECLAIM context, they also must be allocated in a WQ_MEM_RECLAIM context. This reduces the likelihood that device driver memory allocation will trigger memory reclaim during NFS writeback. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 24 ++-------------- net/sunrpc/xprtrdma/rpc_rdma.c | 9 +----- net/sunrpc/xprtrdma/verbs.c | 62 ++++++++++++++++++++++++----------------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 4 files changed, 40 insertions(+), 57 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 9901a811f598..37ba82dc2474 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -36,8 +36,8 @@ * connect worker from running concurrently. * * When the underlying transport disconnects, MRs that are in flight - * are flushed and are likely unusable. Thus all flushed MRs are - * destroyed. New MRs are created on demand. + * are flushed and are likely unusable. Thus all MRs are destroyed. + * New MRs are created on demand. */ #include @@ -119,20 +119,6 @@ frwr_mr_recycle_worker(struct work_struct *work) frwr_mr_recycle(mr->mr_xprt, mr); } -/* frwr_recycle - Discard MRs - * @req: request to reset - * - * Used after a reconnect. These MRs could be in flight, we can't - * tell. Safe thing to do is release them. - */ -void frwr_recycle(struct rpcrdma_req *req) -{ - struct rpcrdma_mr *mr; - - while ((mr = rpcrdma_mr_pop(&req->rl_registered))) - frwr_mr_recycle(mr->mr_xprt, mr); -} - /* frwr_reset - Place MRs back on the free list * @req: request to reset * @@ -166,9 +152,6 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) struct ib_mr *frmr; int rc; - /* NB: ib_alloc_mr and device drivers typically allocate - * memory with GFP_KERNEL. - */ frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); if (IS_ERR(frmr)) goto out_mr_err; @@ -440,9 +423,6 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) post_wr = &frwr->fr_regwr.wr; } - /* If ib_post_send fails, the next ->send_request for - * @req will queue these MRs for recovery. - */ return ib_post_send(ia->ri_id->qp, post_wr, NULL); } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 7c125e6cca4f..7b1358284242 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -363,8 +363,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, out_getmr_err: trace_xprtrdma_nomrs(req); xprt_wait_for_buffer_space(&r_xprt->rx_xprt); - if (r_xprt->rx_ep.rep_connected != -ENODEV) - schedule_work(&r_xprt->rx_buf.rb_refresh_worker); + rpcrdma_mrs_refresh(r_xprt); return ERR_PTR(-EAGAIN); } @@ -863,12 +862,6 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) rtype = rpcrdma_areadch; } - /* If this is a retransmit, discard previously registered - * chunks. Very likely the connection has been replaced, - * so these registrations are invalid and unusable. - */ - frwr_recycle(req); - /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: * diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 82361e7bbb51..c79d8620d9c8 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -78,7 +78,7 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); -static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); +static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags); @@ -407,8 +407,6 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_req *req; - cancel_work_sync(&buf->rb_refresh_worker); - /* This is similar to rpcrdma_ep_destroy, but: * - Don't cancel the connect worker. * - Don't call rpcrdma_ep_disconnect, which waits @@ -435,7 +433,7 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); rpcrdma_regbuf_dma_unmap(req->rl_recvbuf); } - rpcrdma_mrs_destroy(buf); + rpcrdma_mrs_destroy(r_xprt); ib_dealloc_pd(ia->ri_pd); ia->ri_pd = NULL; @@ -628,8 +626,6 @@ static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, pr_err("rpcrdma: rdma_create_qp returned %d\n", err); goto out3; } - - rpcrdma_mrs_create(r_xprt); return 0; out3: @@ -703,7 +699,6 @@ retry: memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr)); switch (ep->rep_connected) { case 0: - dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr); if (rc) { rc = -ENETUNREACH; @@ -741,7 +736,7 @@ retry: goto out; } - dprintk("RPC: %s: connected\n", __func__); + rpcrdma_mrs_create(r_xprt); out: if (rc) @@ -756,11 +751,8 @@ out_noupdate: * @ep: endpoint to disconnect * @ia: associated interface adapter * - * This is separate from destroy to facilitate the ability - * to reconnect without recreating the endpoint. - * - * This call is not reentrant, and must not be made in parallel - * on the same endpoint. + * Caller serializes. Either the transport send lock is held, + * or we're being called to destroy the transport. */ void rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) @@ -780,6 +772,7 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) rpcrdma_xprt_drain(r_xprt); rpcrdma_reqs_reset(r_xprt); + rpcrdma_mrs_destroy(r_xprt); } /* Fixed-size circular FIFO queue. This implementation is wait-free and @@ -986,6 +979,28 @@ rpcrdma_mr_refresh_worker(struct work_struct *work) xprt_write_space(&r_xprt->rx_xprt); } +/** + * rpcrdma_mrs_refresh - Wake the MR refresh worker + * @r_xprt: controlling transport instance + * + */ +void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + + /* If there is no underlying device, it's no use to + * wake the refresh worker. + */ + if (ep->rep_connected != -ENODEV) { + /* The work is scheduled on a WQ_MEM_RECLAIM + * workqueue in order to prevent MR allocation + * from recursing into NFS during direct reclaim. + */ + queue_work(xprtiod_workqueue, &buf->rb_refresh_worker); + } +} + /** * rpcrdma_req_create - Allocate an rpcrdma_req object * @r_xprt: controlling r_xprt @@ -1145,8 +1160,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) INIT_LIST_HEAD(&buf->rb_all_mrs); INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); - rpcrdma_mrs_create(r_xprt); - INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); @@ -1177,8 +1190,8 @@ out: * rpcrdma_req_destroy - Destroy an rpcrdma_req object * @req: unused object to be destroyed * - * This function assumes that the caller prevents concurrent device - * unload and transport tear-down. + * Relies on caller holding the transport send lock to protect + * removing req->rl_all from buf->rb_all_reqs safely. */ void rpcrdma_req_destroy(struct rpcrdma_req *req) { @@ -1204,17 +1217,18 @@ void rpcrdma_req_destroy(struct rpcrdma_req *req) /** * rpcrdma_mrs_destroy - Release all of a transport's MRs - * @buf: controlling buffer instance + * @r_xprt: controlling transport instance * * Relies on caller holding the transport send lock to protect * removing mr->mr_list from req->rl_free_mrs safely. */ -static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) +static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, - rx_buf); + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_mr *mr; + cancel_work_sync(&buf->rb_refresh_worker); + spin_lock(&buf->rb_lock); while ((mr = list_first_entry_or_null(&buf->rb_all_mrs, struct rpcrdma_mr, @@ -1224,10 +1238,10 @@ static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) spin_unlock(&buf->rb_lock); frwr_release_mr(mr); + spin_lock(&buf->rb_lock); } spin_unlock(&buf->rb_lock); - r_xprt->rx_stats.mrs_allocated = 0; } /** @@ -1241,8 +1255,6 @@ static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - cancel_work_sync(&buf->rb_refresh_worker); - rpcrdma_sendctxs_destroy(buf); rpcrdma_reps_destroy(buf); @@ -1254,8 +1266,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) list_del(&req->rl_list); rpcrdma_req_destroy(req); } - - rpcrdma_mrs_destroy(buf); } /** diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2f89cfccdb48..a7ef9653bafd 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -488,6 +488,7 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); +void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt); static inline void rpcrdma_mr_recycle(struct rpcrdma_mr *mr) @@ -543,7 +544,6 @@ rpcrdma_data_dir(bool writing) /* Memory registration calls xprtrdma/frwr_ops.c */ bool frwr_is_supported(struct ib_device *device); -void frwr_recycle(struct rpcrdma_req *req); void frwr_reset(struct rpcrdma_req *req); int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep); int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); -- cgit v1.2.3 From 15d9b015d3d1c997893472cb42d9f225a60a9219 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2019 14:31:09 -0400 Subject: xprtrdma: Ensure ri_id is stable during MR recycling ia->ri_id is replaced during a reconnect. The connect_worker runs with the transport send lock held to prevent ri_id from being dereferenced by the send_request path during this process. Currently, however, there is no guarantee that ia->ri_id is stable in the MR recycling worker, which operates in the background and is not serialized with the connect_worker in any way. But now that Local_Inv completions are being done in process context, we can handle the recycling operation there instead of deferring the recycling work to another process. Because the disconnect path drains all work before allowing tear down to proceed, it is guaranteed that Local Invalidations complete only while the ri_id pointer is stable. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 23 ++++++----------------- net/sunrpc/xprtrdma/xprt_rdma.h | 7 ------- 2 files changed, 6 insertions(+), 24 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 37ba82dc2474..5cd871568c67 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -88,8 +88,10 @@ void frwr_release_mr(struct rpcrdma_mr *mr) kfree(mr); } -static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) +static void frwr_mr_recycle(struct rpcrdma_mr *mr) { + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + trace_xprtrdma_mr_recycle(mr); if (mr->mr_dir != DMA_NONE) { @@ -107,18 +109,6 @@ static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) frwr_release_mr(mr); } -/* MRs are dynamically allocated, so simply clean up and release the MR. - * A replacement MR will subsequently be allocated on demand. - */ -static void -frwr_mr_recycle_worker(struct work_struct *work) -{ - struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, - mr_recycle); - - frwr_mr_recycle(mr->mr_xprt, mr); -} - /* frwr_reset - Place MRs back on the free list * @req: request to reset * @@ -163,7 +153,6 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) mr->frwr.fr_mr = frmr; mr->mr_dir = DMA_NONE; INIT_LIST_HEAD(&mr->mr_list); - INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker); init_completion(&mr->frwr.fr_linv_done); sg_init_table(sg, depth); @@ -448,7 +437,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) { if (wc->status != IB_WC_SUCCESS) - rpcrdma_mr_recycle(mr); + frwr_mr_recycle(mr); else rpcrdma_mr_put(mr); } @@ -570,7 +559,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) bad_wr = bad_wr->next; list_del_init(&mr->mr_list); - rpcrdma_mr_recycle(mr); + frwr_mr_recycle(mr); } } @@ -664,7 +653,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) mr = container_of(frwr, struct rpcrdma_mr, frwr); bad_wr = bad_wr->next; - rpcrdma_mr_recycle(mr); + frwr_mr_recycle(mr); } /* The final LOCAL_INV WR in the chain is supposed to diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index a7ef9653bafd..b8e768d55cb0 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -257,7 +257,6 @@ struct rpcrdma_mr { u32 mr_handle; u32 mr_length; u64 mr_offset; - struct work_struct mr_recycle; struct list_head mr_all; }; @@ -490,12 +489,6 @@ struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt); -static inline void -rpcrdma_mr_recycle(struct rpcrdma_mr *mr) -{ - schedule_work(&mr->mr_recycle); -} - struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req); -- cgit v1.2.3 From f995879ec4aa8b50c3924fda3014b0ab9acad7bd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2019 14:31:18 -0400 Subject: xprtrdma: Remove rpcrdma_sendctx::sc_xprt Micro-optimization: Save eight bytes in a frequently allocated structure. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 19 ++++++++++--------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 -- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c79d8620d9c8..3ab086aa69bb 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -74,7 +74,8 @@ /* * internal functions */ -static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); +static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_sendctx *sc); static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); @@ -124,7 +125,7 @@ rpcrdma_qp_event_handler(struct ib_event *event, void *context) /** * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC - * @cq: completion queue (ignored) + * @cq: completion queue * @wc: completed WR * */ @@ -137,7 +138,7 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_send(sc, wc); - rpcrdma_sendctx_put_locked(sc); + rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc); } /** @@ -518,7 +519,7 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) init_waitqueue_head(&ep->rep_connect_wait); ep->rep_receive_count = 0; - sendcq = ib_alloc_cq_any(ia->ri_id->device, NULL, + sendcq = ib_alloc_cq_any(ia->ri_id->device, r_xprt, ep->rep_attr.cap.max_send_wr + 1, IB_POLL_WORKQUEUE); if (IS_ERR(sendcq)) { @@ -840,7 +841,6 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) if (!sc) return -ENOMEM; - sc->sc_xprt = r_xprt; buf->rb_sc_ctxs[i] = sc; } @@ -903,6 +903,7 @@ out_emptyq: /** * rpcrdma_sendctx_put_locked - Release a send context + * @r_xprt: controlling transport instance * @sc: send context to release * * Usage: Called from Send completion to return a sendctxt @@ -910,10 +911,10 @@ out_emptyq: * * The caller serializes calls to this function (per transport). */ -static void -rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) +static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_sendctx *sc) { - struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; unsigned long next_tail; /* Unmap SGEs of previously completed but unsignaled @@ -931,7 +932,7 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) /* Paired with READ_ONCE */ smp_store_release(&buf->rb_sc_tail, next_tail); - xprt_write_space(&sc->sc_xprt->rx_xprt); + xprt_write_space(&r_xprt->rx_xprt); } static void diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index b8e768d55cb0..4897c09d6f61 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -218,12 +218,10 @@ enum { /* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes */ struct rpcrdma_req; -struct rpcrdma_xprt; struct rpcrdma_sendctx { struct ib_send_wr sc_wr; struct ib_cqe sc_cqe; struct ib_device *sc_device; - struct rpcrdma_xprt *sc_xprt; struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; struct ib_sge sc_sges[]; -- cgit v1.2.3 From b5cde6aa882dfb40a2b29c1c7371fdc3655c51ce Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2019 14:31:27 -0400 Subject: xprtrdma: Remove rpcrdma_sendctx::sc_device Micro-optimization: Save eight bytes in a frequently allocated structure. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++-- net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 7b1358284242..1941b2261ca5 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -564,6 +564,7 @@ static void rpcrdma_sendctx_done(struct kref *kref) */ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) { + struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf; struct ib_sge *sge; if (!sc->sc_unmap_count) @@ -575,7 +576,7 @@ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) */ for (sge = &sc->sc_sges[2]; sc->sc_unmap_count; ++sge, --sc->sc_unmap_count) - ib_dma_unmap_page(sc->sc_device, sge->addr, sge->length, + ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length, DMA_TO_DEVICE); kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done); @@ -625,7 +626,6 @@ static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt, */ if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) goto out_regbuf; - sc->sc_device = rdmab_device(rb); sge_no = 1; sge[sge_no].addr = rdmab_addr(rb); sge[sge_no].length = xdr->head[0].iov_len; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 4897c09d6f61..0e5b7f373077 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -221,7 +221,6 @@ struct rpcrdma_req; struct rpcrdma_sendctx { struct ib_send_wr sc_wr; struct ib_cqe sc_cqe; - struct ib_device *sc_device; struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; struct ib_sge sc_sges[]; -- cgit v1.2.3 From dc15c3d5f16808f7c171b55da6a82a5c0f279647 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2019 14:31:35 -0400 Subject: xprtrdma: Move the rpcrdma_sendctx::sc_wr field Clean up: This field is not needed in the Send completion handler, so it can be moved to struct rpcrdma_req to reduce the size of struct rpcrdma_sendctx, and to reduce the amount of memory that is sloshed between the sending process and the Send completion process. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 5 ++--- net/sunrpc/xprtrdma/frwr_ops.c | 2 +- net/sunrpc/xprtrdma/rpc_rdma.c | 9 ++++++--- net/sunrpc/xprtrdma/verbs.c | 5 +---- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 5 files changed, 11 insertions(+), 12 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 7fd11ec1c9a4..f8edab91e09c 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -667,9 +667,8 @@ TRACE_EVENT(xprtrdma_post_send, __entry->client_id = rqst->rq_task->tk_client ? rqst->rq_task->tk_client->cl_clid : -1; __entry->req = req; - __entry->num_sge = req->rl_sendctx->sc_wr.num_sge; - __entry->signaled = req->rl_sendctx->sc_wr.send_flags & - IB_SEND_SIGNALED; + __entry->num_sge = req->rl_wr.num_sge; + __entry->signaled = req->rl_wr.send_flags & IB_SEND_SIGNALED; __entry->status = status; ), diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 5cd871568c67..523722be6a16 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -396,7 +396,7 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; - post_wr = &req->rl_sendctx->sc_wr; + post_wr = &req->rl_wr; list_for_each_entry(mr, &req->rl_registered, mr_list) { struct rpcrdma_frwr *frwr; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 1941b2261ca5..53cd2e3cf003 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -599,7 +599,7 @@ static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt, ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, DMA_TO_DEVICE); - sc->sc_wr.num_sge++; + req->rl_wr.num_sge++; return true; out_regbuf: @@ -711,7 +711,7 @@ map_tail: } out: - sc->sc_wr.num_sge += sge_no; + req->rl_wr.num_sge += sge_no; if (sc->sc_unmap_count) kref_get(&req->rl_kref); return true; @@ -752,10 +752,13 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt); if (!req->rl_sendctx) goto err; - req->rl_sendctx->sc_wr.num_sge = 0; req->rl_sendctx->sc_unmap_count = 0; req->rl_sendctx->sc_req = req; kref_init(&req->rl_kref); + req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe; + req->rl_wr.sg_list = req->rl_sendctx->sc_sges; + req->rl_wr.num_sge = 0; + req->rl_wr.opcode = IB_WR_SEND; ret = -EIO; if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen)) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3ab086aa69bb..2f4658255343 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -811,9 +811,6 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia) if (!sc) return NULL; - sc->sc_wr.wr_cqe = &sc->sc_cqe; - sc->sc_wr.sg_list = sc->sc_sges; - sc->sc_wr.opcode = IB_WR_SEND; sc->sc_cqe.done = rpcrdma_wc_send; return sc; } @@ -1469,7 +1466,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_req *req) { - struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr; + struct ib_send_wr *send_wr = &req->rl_wr; int rc; if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0e5b7f373077..cdd6a3d43e0f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -219,7 +219,6 @@ enum { */ struct rpcrdma_req; struct rpcrdma_sendctx { - struct ib_send_wr sc_wr; struct ib_cqe sc_cqe; struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; @@ -314,6 +313,7 @@ struct rpcrdma_req { struct rpcrdma_rep *rl_reply; struct xdr_stream rl_stream; struct xdr_buf rl_hdrbuf; + struct ib_send_wr rl_wr; struct rpcrdma_sendctx *rl_sendctx; struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */ -- cgit v1.2.3 From d6764bbd7763fa9d669bba7fc5a50a4bdd8f591b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2019 14:31:44 -0400 Subject: xprtrdma: Refactor rpcrdma_prepare_msg_sges() Refactor: Replace spaghetti with code that makes it plain what needs to be done for each rtype. This makes it easier to add features and optimizations. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 263 +++++++++++++++++++++++------------------ 1 file changed, 146 insertions(+), 117 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 53cd2e3cf003..a441dbf9f198 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -589,148 +589,162 @@ static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt, { struct rpcrdma_sendctx *sc = req->rl_sendctx; struct rpcrdma_regbuf *rb = req->rl_rdmabuf; - struct ib_sge *sge = sc->sc_sges; + struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) - goto out_regbuf; + return false; sge->addr = rdmab_addr(rb); sge->length = len; sge->lkey = rdmab_lkey(rb); ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, DMA_TO_DEVICE); - req->rl_wr.num_sge++; return true; - -out_regbuf: - pr_err("rpcrdma: failed to DMA map a Send buffer\n"); - return false; } -/* Prepare the Send SGEs. The head and tail iovec, and each entry - * in the page list, gets its own SGE. +/* The head iovec is straightforward, as it is usually already + * DMA-mapped. Sync the content that has changed. */ -static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, - struct xdr_buf *xdr, - enum rpcrdma_chunktype rtype) +static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, unsigned int len) { struct rpcrdma_sendctx *sc = req->rl_sendctx; - unsigned int sge_no, page_base, len, remaining; + struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; struct rpcrdma_regbuf *rb = req->rl_sendbuf; - struct ib_sge *sge = sc->sc_sges; - struct page *page, **ppages; - /* The head iovec is straightforward, as it is already - * DMA-mapped. Sync the content that has changed. - */ if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) - goto out_regbuf; - sge_no = 1; - sge[sge_no].addr = rdmab_addr(rb); - sge[sge_no].length = xdr->head[0].iov_len; - sge[sge_no].lkey = rdmab_lkey(rb); - ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr, - sge[sge_no].length, DMA_TO_DEVICE); - - /* If there is a Read chunk, the page list is being handled - * via explicit RDMA, and thus is skipped here. However, the - * tail iovec may include an XDR pad for the page list, as - * well as additional content, and may not reside in the - * same page as the head iovec. - */ - if (rtype == rpcrdma_readch) { - len = xdr->tail[0].iov_len; + return false; - /* Do not include the tail if it is only an XDR pad */ - if (len < 4) - goto out; + sge->addr = rdmab_addr(rb); + sge->length = len; + sge->lkey = rdmab_lkey(rb); - page = virt_to_page(xdr->tail[0].iov_base); - page_base = offset_in_page(xdr->tail[0].iov_base); + ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, + DMA_TO_DEVICE); + return true; +} - /* If the content in the page list is an odd length, - * xdr_write_pages() has added a pad at the beginning - * of the tail iovec. Force the tail's non-pad content - * to land at the next XDR position in the Send message. - */ - page_base += len & 3; - len -= len & 3; - goto map_tail; - } +/* If there is a page list present, DMA map and prepare an + * SGE for each page to be sent. + */ +static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + struct rpcrdma_sendctx *sc = req->rl_sendctx; + struct rpcrdma_regbuf *rb = req->rl_sendbuf; + unsigned int page_base, len, remaining; + struct page **ppages; + struct ib_sge *sge; - /* If there is a page list present, temporarily DMA map - * and prepare an SGE for each page to be sent. - */ - if (xdr->page_len) { - ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); - page_base = offset_in_page(xdr->page_base); - remaining = xdr->page_len; - while (remaining) { - sge_no++; - if (sge_no > RPCRDMA_MAX_SEND_SGES - 2) - goto out_mapping_overflow; - - len = min_t(u32, PAGE_SIZE - page_base, remaining); - sge[sge_no].addr = - ib_dma_map_page(rdmab_device(rb), *ppages, - page_base, len, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdmab_device(rb), - sge[sge_no].addr)) - goto out_mapping_err; - sge[sge_no].length = len; - sge[sge_no].lkey = rdmab_lkey(rb); - - sc->sc_unmap_count++; - ppages++; - remaining -= len; - page_base = 0; - } - } + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_base = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + sge = &sc->sc_sges[req->rl_wr.num_sge++]; + len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); + sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages, + page_base, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdmab_device(rb), sge->addr)) + goto out_mapping_err; - /* The tail iovec is not always constructed in the same - * page where the head iovec resides (see, for example, - * gss_wrap_req_priv). To neatly accommodate that case, - * DMA map it separately. - */ - if (xdr->tail[0].iov_len) { - page = virt_to_page(xdr->tail[0].iov_base); - page_base = offset_in_page(xdr->tail[0].iov_base); - len = xdr->tail[0].iov_len; + sge->length = len; + sge->lkey = rdmab_lkey(rb); -map_tail: - sge_no++; - sge[sge_no].addr = - ib_dma_map_page(rdmab_device(rb), page, page_base, len, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdmab_device(rb), sge[sge_no].addr)) - goto out_mapping_err; - sge[sge_no].length = len; - sge[sge_no].lkey = rdmab_lkey(rb); sc->sc_unmap_count++; + ppages++; + remaining -= len; + page_base = 0; } -out: - req->rl_wr.num_sge += sge_no; - if (sc->sc_unmap_count) - kref_get(&req->rl_kref); return true; -out_regbuf: - pr_err("rpcrdma: failed to DMA map a Send buffer\n"); +out_mapping_err: + trace_xprtrdma_dma_maperr(sge->addr); return false; +} -out_mapping_overflow: - rpcrdma_sendctx_unmap(sc); - pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); - return false; +/* The tail iovec may include an XDR pad for the page list, + * as well as additional content, and may not reside in the + * same page as the head iovec. + */ +static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req, + struct xdr_buf *xdr, + unsigned int page_base, unsigned int len) +{ + struct rpcrdma_sendctx *sc = req->rl_sendctx; + struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; + struct rpcrdma_regbuf *rb = req->rl_sendbuf; + struct page *page = virt_to_page(xdr->tail[0].iov_base); + + sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdmab_device(rb), sge->addr)) + goto out_mapping_err; + + sge->length = len; + sge->lkey = rdmab_lkey(rb); + ++sc->sc_unmap_count; + return true; out_mapping_err: - rpcrdma_sendctx_unmap(sc); - trace_xprtrdma_dma_maperr(sge[sge_no].addr); + trace_xprtrdma_dma_maperr(sge->addr); return false; } +static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + struct kvec *tail = &xdr->tail[0]; + + if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) + return false; + if (xdr->page_len) + if (!rpcrdma_prepare_pagelist(req, xdr)) + return false; + if (tail->iov_len) + if (!rpcrdma_prepare_tail_iov(req, xdr, + offset_in_page(tail->iov_base), + tail->iov_len)) + return false; + + if (req->rl_sendctx->sc_unmap_count) + kref_get(&req->rl_kref); + return true; +} + +static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) + return false; + + /* If there is a Read chunk, the page list is being handled + * via explicit RDMA, and thus is skipped here. + */ + + /* Do not include the tail if it is only an XDR pad */ + if (xdr->tail[0].iov_len > 3) { + unsigned int page_base, len; + + /* If the content in the page list is an odd length, + * xdr_write_pages() adds a pad at the beginning of + * the tail iovec. Force the tail's non-pad content to + * land at the next XDR position in the Send message. + */ + page_base = offset_in_page(xdr->tail[0].iov_base); + len = xdr->tail[0].iov_len; + page_base += len & 3; + len -= len & 3; + if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len)) + return false; + kref_get(&req->rl_kref); + } + + return true; +} + /** * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR * @r_xprt: controlling transport @@ -741,17 +755,17 @@ out_mapping_err: * * Returns 0 on success; otherwise a negative errno is returned. */ -int -rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, u32 hdrlen, - struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) +inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 hdrlen, + struct xdr_buf *xdr, + enum rpcrdma_chunktype rtype) { int ret; ret = -EAGAIN; req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt); if (!req->rl_sendctx) - goto err; + goto out_nosc; req->rl_sendctx->sc_unmap_count = 0; req->rl_sendctx->sc_req = req; kref_init(&req->rl_kref); @@ -762,13 +776,28 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, ret = -EIO; if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen)) - goto err; - if (rtype != rpcrdma_areadch) - if (!rpcrdma_prepare_msg_sges(r_xprt, req, xdr, rtype)) - goto err; + goto out_unmap; + + switch (rtype) { + case rpcrdma_noch: + if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr)) + goto out_unmap; + break; + case rpcrdma_readch: + if (!rpcrdma_prepare_readch(r_xprt, req, xdr)) + goto out_unmap; + break; + case rpcrdma_areadch: + break; + default: + goto out_unmap; + } + return 0; -err: +out_unmap: + rpcrdma_sendctx_unmap(req->rl_sendctx); +out_nosc: trace_xprtrdma_prepsend_failed(&req->rl_slot, ret); return ret; } -- cgit v1.2.3 From 614f3c96d7e5efd1c4dc699524857130a52c6a7f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2019 14:31:53 -0400 Subject: xprtrdma: Pull up sometimes On some platforms, DMA mapping part of a page is more costly than copying bytes. Restore the pull-up code and use that when we think it's going to be faster. The heuristic for now is to pull-up when the size of the RPC message body fits in the buffer underlying the head iovec. Indeed, not involving the I/O MMU can help the RPC/RDMA transport scale better for tiny I/Os across more RDMA devices. This is because interaction with the I/O MMU is eliminated, as is handling a Send completion, for each of these small I/Os. Without the explicit unmapping, the NIC no longer needs to do a costly internal TLB shoot down for buffers that are just a handful of bytes. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 4 ++ net/sunrpc/xprtrdma/backchannel.c | 2 +- net/sunrpc/xprtrdma/rpc_rdma.c | 82 ++++++++++++++++++++++++++++++++++++--- net/sunrpc/xprtrdma/verbs.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 2 + 5 files changed, 85 insertions(+), 7 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index f8edab91e09c..213c72585a5f 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -532,6 +532,8 @@ DEFINE_WRCH_EVENT(write); DEFINE_WRCH_EVENT(reply); TRACE_DEFINE_ENUM(rpcrdma_noch); +TRACE_DEFINE_ENUM(rpcrdma_noch_pullup); +TRACE_DEFINE_ENUM(rpcrdma_noch_mapped); TRACE_DEFINE_ENUM(rpcrdma_readch); TRACE_DEFINE_ENUM(rpcrdma_areadch); TRACE_DEFINE_ENUM(rpcrdma_writech); @@ -540,6 +542,8 @@ TRACE_DEFINE_ENUM(rpcrdma_replych); #define xprtrdma_show_chunktype(x) \ __print_symbolic(x, \ { rpcrdma_noch, "inline" }, \ + { rpcrdma_noch_pullup, "pullup" }, \ + { rpcrdma_noch_mapped, "mapped" }, \ { rpcrdma_readch, "read list" }, \ { rpcrdma_areadch, "*read list" }, \ { rpcrdma_writech, "write list" }, \ diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 50e075fcdd8f..11685245546a 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -79,7 +79,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) *p = xdr_zero; if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, - &rqst->rq_snd_buf, rpcrdma_noch)) + &rqst->rq_snd_buf, rpcrdma_noch_pullup)) return -EIO; trace_xprtrdma_cb_reply(rqst); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index a441dbf9f198..4ad88893e964 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -392,7 +392,7 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, unsigned int pos; int nsegs; - if (rtype == rpcrdma_noch) + if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) goto done; pos = rqst->rq_snd_buf.head[0].iov_len; @@ -691,6 +691,72 @@ out_mapping_err: return false; } +/* Copy the tail to the end of the head buffer. + */ +static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + unsigned char *dst; + + dst = (unsigned char *)xdr->head[0].iov_base; + dst += xdr->head[0].iov_len + xdr->page_len; + memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len); + r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len; +} + +/* Copy pagelist content into the head buffer. + */ +static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + unsigned int len, page_base, remaining; + struct page **ppages; + unsigned char *src, *dst; + + dst = (unsigned char *)xdr->head[0].iov_base; + dst += xdr->head[0].iov_len; + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_base = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + src = page_address(*ppages); + src += page_base; + len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); + memcpy(dst, src, len); + r_xprt->rx_stats.pullup_copy_count += len; + + ppages++; + dst += len; + remaining -= len; + page_base = 0; + } +} + +/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it. + * When the head, pagelist, and tail are small, a pull-up copy + * is considerably less costly than DMA mapping the components + * of @xdr. + * + * Assumptions: + * - the caller has already verified that the total length + * of the RPC Call body will fit into @rl_sendbuf. + */ +static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + if (unlikely(xdr->tail[0].iov_len)) + rpcrdma_pullup_tail_iov(r_xprt, req, xdr); + + if (unlikely(xdr->page_len)) + rpcrdma_pullup_pagelist(r_xprt, req, xdr); + + /* The whole RPC message resides in the head iovec now */ + return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len); +} + static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct xdr_buf *xdr) @@ -779,7 +845,11 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, goto out_unmap; switch (rtype) { - case rpcrdma_noch: + case rpcrdma_noch_pullup: + if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr)) + goto out_unmap; + break; + case rpcrdma_noch_mapped: if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr)) goto out_unmap; break; @@ -827,6 +897,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct xdr_stream *xdr = &req->rl_stream; enum rpcrdma_chunktype rtype, wtype; + struct xdr_buf *buf = &rqst->rq_snd_buf; bool ddp_allowed; __be32 *p; int ret; @@ -884,8 +955,9 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) */ if (rpcrdma_args_inline(r_xprt, rqst)) { *p++ = rdma_msg; - rtype = rpcrdma_noch; - } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { + rtype = buf->len < rdmab_length(req->rl_sendbuf) ? + rpcrdma_noch_pullup : rpcrdma_noch_mapped; + } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) { *p++ = rdma_msg; rtype = rpcrdma_readch; } else { @@ -927,7 +999,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) goto out_err; ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len, - &rqst->rq_snd_buf, rtype); + buf, rtype); if (ret) goto out_err; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 2f4658255343..a514e2c89ac3 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1165,7 +1165,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; - req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE, + req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2, GFP_KERNEL); if (!req) goto out; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index cdd6a3d43e0f..5d15140a0266 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -554,6 +554,8 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); enum rpcrdma_chunktype { rpcrdma_noch = 0, + rpcrdma_noch_pullup, + rpcrdma_noch_mapped, rpcrdma_readch, rpcrdma_areadch, rpcrdma_writech, -- cgit v1.2.3 From 6cb28687fd1db3f94b35c2a7b37bf468f945244a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 23 Oct 2019 10:01:52 -0400 Subject: xprtrdma: Wake tasks after connect worker fails Pending tasks are currently never awoken when the connect worker fails. The reason is that XPRT_CONNECTED is always clear after a failure return of rpcrdma_ep_connect, thus the xprt_test_and_clear_connected() check in xprt_rdma_connect_worker() always fails. - xprt_rdma_close always clears XPRT_CONNECTED. - rpcrdma_ep_connect always clears XPRT_CONNECTED. After reviewing the TCP connect worker, it appears that there's no need for extra test_and_set paranoia in xprt_rdma_connect_worker. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 0711308277eb..361e59146807 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -243,16 +243,13 @@ xprt_rdma_connect_worker(struct work_struct *work) rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); xprt_clear_connecting(xprt); if (r_xprt->rx_ep.rep_connected > 0) { - if (!xprt_test_and_set_connected(xprt)) { - xprt->stat.connect_count++; - xprt->stat.connect_time += (long)jiffies - - xprt->stat.connect_start; - xprt_wake_pending_tasks(xprt, -EAGAIN); - } - } else { - if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, rc); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; + xprt_set_connected(xprt); + rc = -EAGAIN; } + xprt_wake_pending_tasks(xprt, rc); } /** -- cgit v1.2.3 From 7b020f17bbd34c219419b634d9efb9e93a3af4c2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 23 Oct 2019 10:01:58 -0400 Subject: xprtrdma: Report the computed connect delay For debugging, the op_connect trace point should report the computed connect delay. We can then ensure that the delay is computed at the proper times, for example. As a further clean-up, remove a few low-value "heartbeat" trace points in the connect path. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 77 ++++++++++++++++++++++++++++++----------- net/sunrpc/xprtrdma/transport.c | 3 +- net/sunrpc/xprtrdma/verbs.c | 13 ++----- 3 files changed, 60 insertions(+), 33 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 213c72585a5f..99e3c61609b2 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -85,6 +85,44 @@ DECLARE_EVENT_CLASS(xprtrdma_rxprt, ), \ TP_ARGS(r_xprt)) +DECLARE_EVENT_CLASS(xprtrdma_connect_class, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + int rc + ), + + TP_ARGS(r_xprt, rc), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __field(int, rc) + __field(int, connect_status) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __entry->rc = rc; + __entry->connect_status = r_xprt->rx_ep.rep_connected; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connect status=%d", + __get_str(addr), __get_str(port), __entry->r_xprt, + __entry->rc, __entry->connect_status + ) +); + +#define DEFINE_CONN_EVENT(name) \ + DEFINE_EVENT(xprtrdma_connect_class, xprtrdma_##name, \ + TP_PROTO( \ + const struct rpcrdma_xprt *r_xprt, \ + int rc \ + ), \ + TP_ARGS(r_xprt, rc)) + DECLARE_EVENT_CLASS(xprtrdma_rdch_event, TP_PROTO( const struct rpc_task *task, @@ -333,47 +371,44 @@ TRACE_EVENT(xprtrdma_cm_event, ) ); -TRACE_EVENT(xprtrdma_disconnect, +DEFINE_CONN_EVENT(connect); +DEFINE_CONN_EVENT(disconnect); + +DEFINE_RXPRT_EVENT(xprtrdma_create); +DEFINE_RXPRT_EVENT(xprtrdma_op_destroy); +DEFINE_RXPRT_EVENT(xprtrdma_remove); +DEFINE_RXPRT_EVENT(xprtrdma_reinsert); +DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc); +DEFINE_RXPRT_EVENT(xprtrdma_op_close); + +TRACE_EVENT(xprtrdma_op_connect, TP_PROTO( const struct rpcrdma_xprt *r_xprt, - int status + unsigned long delay ), - TP_ARGS(r_xprt, status), + TP_ARGS(r_xprt, delay), TP_STRUCT__entry( __field(const void *, r_xprt) - __field(int, status) - __field(int, connected) + __field(unsigned long, delay) __string(addr, rpcrdma_addrstr(r_xprt)) __string(port, rpcrdma_portstr(r_xprt)) ), TP_fast_assign( __entry->r_xprt = r_xprt; - __entry->status = status; - __entry->connected = r_xprt->rx_ep.rep_connected; + __entry->delay = delay; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: status=%d %sconnected", - __get_str(addr), __get_str(port), - __entry->r_xprt, __entry->status, - __entry->connected == 1 ? "still " : "dis" + TP_printk("peer=[%s]:%s r_xprt=%p delay=%lu", + __get_str(addr), __get_str(port), __entry->r_xprt, + __entry->delay ) ); -DEFINE_RXPRT_EVENT(xprtrdma_conn_start); -DEFINE_RXPRT_EVENT(xprtrdma_conn_tout); -DEFINE_RXPRT_EVENT(xprtrdma_create); -DEFINE_RXPRT_EVENT(xprtrdma_op_destroy); -DEFINE_RXPRT_EVENT(xprtrdma_remove); -DEFINE_RXPRT_EVENT(xprtrdma_reinsert); -DEFINE_RXPRT_EVENT(xprtrdma_reconnect); -DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc); -DEFINE_RXPRT_EVENT(xprtrdma_op_close); -DEFINE_RXPRT_EVENT(xprtrdma_op_connect); TRACE_EVENT(xprtrdma_op_set_cto, TP_PROTO( diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 361e59146807..ce263e6f779c 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -527,13 +527,12 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); unsigned long delay; - trace_xprtrdma_op_connect(r_xprt); - delay = 0; if (r_xprt->rx_ep.rep_connected != 0) { delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); } + trace_xprtrdma_op_connect(r_xprt, delay); queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker, delay); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index a514e2c89ac3..92bdf053716a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -297,8 +297,6 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) struct rdma_cm_id *id; int rc; - trace_xprtrdma_conn_start(xprt); - init_completion(&ia->ri_done); init_completion(&ia->ri_remove_done); @@ -314,10 +312,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) if (rc) goto out; rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); - if (rc < 0) { - trace_xprtrdma_conn_tout(xprt); + if (rc < 0) goto out; - } rc = ia->ri_async_rc; if (rc) @@ -328,10 +324,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) if (rc) goto out; rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); - if (rc < 0) { - trace_xprtrdma_conn_tout(xprt); + if (rc < 0) goto out; - } rc = ia->ri_async_rc; if (rc) goto out; @@ -644,8 +638,6 @@ static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rdma_cm_id *id, *old; int err, rc; - trace_xprtrdma_reconnect(r_xprt); - rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia); rc = -EHOSTUNREACH; @@ -744,6 +736,7 @@ out: ep->rep_connected = rc; out_noupdate: + trace_xprtrdma_connect(r_xprt, rc); return rc; } -- cgit v1.2.3 From d4957f01d29b2a01200117fc04b9faaa52aca4bf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 23 Oct 2019 10:02:03 -0400 Subject: xprtrdma: Refine trace_xprtrdma_fixup Slightly reduce overhead and display more useful information. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 60 +++++++++--------------------------------- net/sunrpc/xprtrdma/rpc_rdma.c | 5 ++-- 2 files changed, 15 insertions(+), 50 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 99e3c61609b2..542177c5896f 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1084,66 +1084,32 @@ DEFINE_REPLY_EVENT(xprtrdma_reply_hdr); TRACE_EVENT(xprtrdma_fixup, TP_PROTO( const struct rpc_rqst *rqst, - int len, - int hdrlen + unsigned long fixup ), - TP_ARGS(rqst, len, hdrlen), + TP_ARGS(rqst, fixup), TP_STRUCT__entry( __field(unsigned int, task_id) __field(unsigned int, client_id) - __field(const void *, base) - __field(int, len) - __field(int, hdrlen) - ), - - TP_fast_assign( - __entry->task_id = rqst->rq_task->tk_pid; - __entry->client_id = rqst->rq_task->tk_client->cl_clid; - __entry->base = rqst->rq_rcv_buf.head[0].iov_base; - __entry->len = len; - __entry->hdrlen = hdrlen; - ), - - TP_printk("task:%u@%u base=%p len=%d hdrlen=%d", - __entry->task_id, __entry->client_id, - __entry->base, __entry->len, __entry->hdrlen - ) -); - -TRACE_EVENT(xprtrdma_fixup_pg, - TP_PROTO( - const struct rpc_rqst *rqst, - int pageno, - const void *pos, - int len, - int curlen - ), - - TP_ARGS(rqst, pageno, pos, len, curlen), - - TP_STRUCT__entry( - __field(unsigned int, task_id) - __field(unsigned int, client_id) - __field(const void *, pos) - __field(int, pageno) - __field(int, len) - __field(int, curlen) + __field(unsigned long, fixup) + __field(size_t, headlen) + __field(unsigned int, pagelen) + __field(size_t, taillen) ), TP_fast_assign( __entry->task_id = rqst->rq_task->tk_pid; __entry->client_id = rqst->rq_task->tk_client->cl_clid; - __entry->pos = pos; - __entry->pageno = pageno; - __entry->len = len; - __entry->curlen = curlen; + __entry->fixup = fixup; + __entry->headlen = rqst->rq_rcv_buf.head[0].iov_len; + __entry->pagelen = rqst->rq_rcv_buf.page_len; + __entry->taillen = rqst->rq_rcv_buf.tail[0].iov_len; ), - TP_printk("task:%u@%u pageno=%d pos=%p len=%d curlen=%d", - __entry->task_id, __entry->client_id, - __entry->pageno, __entry->pos, __entry->len, __entry->curlen + TP_printk("task:%u@%u fixup=%lu xdr=%zu/%u/%zu", + __entry->task_id, __entry->client_id, __entry->fixup, + __entry->headlen, __entry->pagelen, __entry->taillen ) ); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 4ad88893e964..26d334c83b38 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1086,7 +1086,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) curlen = rqst->rq_rcv_buf.head[0].iov_len; if (curlen > copy_len) curlen = copy_len; - trace_xprtrdma_fixup(rqst, copy_len, curlen); srcp += curlen; copy_len -= curlen; @@ -1106,8 +1105,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) if (curlen > pagelist_len) curlen = pagelist_len; - trace_xprtrdma_fixup_pg(rqst, i, srcp, - copy_len, curlen); destp = kmap_atomic(ppages[i]); memcpy(destp + page_base, srcp, curlen); flush_dcache_page(ppages[i]); @@ -1139,6 +1136,8 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) rqst->rq_private_buf.tail[0].iov_base = srcp; } + if (fixup_copy_count) + trace_xprtrdma_fixup(rqst, fixup_copy_count); return fixup_copy_count; } -- cgit v1.2.3 From f54c870d326aa02b73b68d2e0a503ec81dd3a4e4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 23 Oct 2019 10:02:09 -0400 Subject: xprtrdma: Replace dprintk() in rpcrdma_update_connect_private() Clean up: Use a single trace point to record each connection's negotiated inline thresholds and the computed maximum byte size of transport headers. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 36 ++++++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/rpc_rdma.c | 4 ---- net/sunrpc/xprtrdma/verbs.c | 21 ++++++++++----------- 3 files changed, 46 insertions(+), 15 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 542177c5896f..0ca118bcc5ce 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -371,6 +371,42 @@ TRACE_EVENT(xprtrdma_cm_event, ) ); +TRACE_EVENT(xprtrdma_inline_thresh, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt + ), + + TP_ARGS(r_xprt), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __field(unsigned int, inline_send) + __field(unsigned int, inline_recv) + __field(unsigned int, max_send) + __field(unsigned int, max_recv) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + const struct rpcrdma_ep *ep = &r_xprt->rx_ep; + + __entry->r_xprt = r_xprt; + __entry->inline_send = ep->rep_inline_send; + __entry->inline_recv = ep->rep_inline_recv; + __entry->max_send = ep->rep_max_inline_send; + __entry->max_recv = ep->rep_max_inline_recv; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s r_xprt=%p neg send/recv=%u/%u, calc send/recv=%u/%u", + __get_str(addr), __get_str(port), __entry->r_xprt, + __entry->inline_send, __entry->inline_recv, + __entry->max_send, __entry->max_recv + ) +); + DEFINE_CONN_EVENT(connect); DEFINE_CONN_EVENT(disconnect); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 26d334c83b38..aec3beb93b25 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -78,8 +78,6 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) size += rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ - dprintk("RPC: %s: max call header size = %u\n", - __func__, size); return size; } @@ -100,8 +98,6 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ - dprintk("RPC: %s: max reply header size = %u\n", - __func__, size); return size; } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 92bdf053716a..77c7dd7f05e8 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -177,11 +177,11 @@ out_flushed: rpcrdma_recv_buffer_put(rep); } -static void -rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, - struct rdma_conn_param *param) +static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, + struct rdma_conn_param *param) { const struct rpcrdma_connect_private *pmsg = param->private_data; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; unsigned int rsize, wsize; /* Default settings for RPC-over-RDMA Version One */ @@ -197,13 +197,11 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); } - if (rsize < r_xprt->rx_ep.rep_inline_recv) - r_xprt->rx_ep.rep_inline_recv = rsize; - if (wsize < r_xprt->rx_ep.rep_inline_send) - r_xprt->rx_ep.rep_inline_send = wsize; - dprintk("RPC: %s: max send %u, max recv %u\n", __func__, - r_xprt->rx_ep.rep_inline_send, - r_xprt->rx_ep.rep_inline_recv); + if (rsize < ep->rep_inline_recv) + ep->rep_inline_recv = rsize; + if (wsize < ep->rep_inline_send) + ep->rep_inline_send = wsize; + rpcrdma_set_max_header_sizes(r_xprt); } @@ -257,7 +255,8 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_ESTABLISHED: ++xprt->connect_cookie; ep->rep_connected = 1; - rpcrdma_update_connect_private(r_xprt, &event->param.conn); + rpcrdma_update_cm_private(r_xprt, &event->param.conn); + trace_xprtrdma_inline_thresh(r_xprt); wake_up_all(&ep->rep_connect_wait); break; case RDMA_CM_EVENT_CONNECT_ERROR: -- cgit v1.2.3 From a52c23b8b207d676d6cdf531af482a79fa622b9d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 23 Oct 2019 10:02:14 -0400 Subject: xprtrdma: Replace dprintk in xprt_rdma_set_port Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 1 + net/sunrpc/xprtrdma/transport.c | 9 +++------ 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 0ca118bcc5ce..69a8278e5cef 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -416,6 +416,7 @@ DEFINE_RXPRT_EVENT(xprtrdma_remove); DEFINE_RXPRT_EVENT(xprtrdma_reinsert); DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc); DEFINE_RXPRT_EVENT(xprtrdma_op_close); +DEFINE_RXPRT_EVENT(xprtrdma_op_setport); TRACE_EVENT(xprtrdma_op_connect, TP_PROTO( diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index ce263e6f779c..7395eb2cfdeb 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -441,12 +441,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) struct sockaddr *sap = (struct sockaddr *)&xprt->addr; char buf[8]; - dprintk("RPC: %s: setting port for xprt %p (%s:%s) to %u\n", - __func__, xprt, - xprt->address_strings[RPC_DISPLAY_ADDR], - xprt->address_strings[RPC_DISPLAY_PORT], - port); - rpc_set_port(sap, port); kfree(xprt->address_strings[RPC_DISPLAY_PORT]); @@ -456,6 +450,9 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); snprintf(buf, sizeof(buf), "%4hx", port); xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); + + trace_xprtrdma_op_setport(container_of(xprt, struct rpcrdma_xprt, + rx_xprt)); } /** -- cgit v1.2.3