diff options
author | Jakub Kicinski <kuba@kernel.org> | 2023-10-25 22:23:36 +0300 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2023-10-25 22:23:37 +0300 |
commit | 8846f9a04b10b7f61214425409838d764df7080d (patch) | |
tree | cc10ea2e27fb56390fbe54cc50afd95a8f64cfb4 /net/mptcp/sockopt.c | |
parent | aad36cd32982d59470de6365f97f154c5af5d1d2 (diff) | |
parent | 8005184fd1ca6aeb3fea36f4eb9463fc1b90c114 (diff) | |
download | linux-8846f9a04b10b7f61214425409838d764df7080d.tar.xz |
Merge branch 'mptcp-features-and-fixes-for-v6-7'
Mat Martineau says:
====================
mptcp: Features and fixes for v6.7
Patch 1 adds a configurable timeout for the MPTCP connection when all
subflows are closed, to support break-before-make use cases.
Patch 2 is a fix for a 1-byte error in rx data counters with MPTCP
fastopen connections.
Patch 3 is a minor code cleanup.
Patches 4 & 5 add handling of rcvlowat for MPTCP sockets, with a
prerequisite patch to use a common scaling ratio between TCP and MPTCP.
Patch 6 improves efficiency of memory copying in MPTCP transmit code.
Patch 7 refactors syncing of socket options from the MPTCP socket to
its subflows.
Patches 8 & 9 help the MPTCP packet scheduler perform well by changing
the handling of notsent_lowat in subflows and how available buffer space
is calculated for MPTCP-level sends.
====================
Link: https://lore.kernel.org/r/20231023-send-net-next-20231023-2-v1-0-9dc60939d371@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/mptcp/sockopt.c')
-rw-r--r-- | net/mptcp/sockopt.c | 65 |
1 files changed, 47 insertions, 18 deletions
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 59bd5e114392..574e221bb765 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -95,6 +95,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in case SO_SNDBUFFORCE: ssk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); + mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf; break; case SO_RCVBUF: case SO_RCVBUFFORCE: @@ -1415,8 +1416,10 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) if (sk->sk_userlocks & tx_rx_locks) { ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks; - if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) + if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) { WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); + mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf; + } if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); } @@ -1444,37 +1447,63 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); } -static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) -{ - bool slow = lock_sock_fast(ssk); - - sync_socket_options(msk, ssk); - - unlock_sock_fast(ssk, slow); -} - -void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) +void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); msk_owned_by_me(msk); + ssk->sk_rcvlowat = 0; + + /* subflows must ignore any latency-related settings: will not affect + * the user-space - only the msk is relevant - but will foul the + * mptcp scheduler + */ + tcp_sk(ssk)->notsent_lowat = UINT_MAX; + if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) { - __mptcp_sockopt_sync(msk, ssk); + sync_socket_options(msk, ssk); subflow->setsockopt_seq = msk->setsockopt_seq; } } -void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk) +/* unfortunately this is different enough from the tcp version so + * that we can't factor it out + */ +int mptcp_set_rcvlowat(struct sock *sk, int val) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct mptcp_subflow_context *subflow; + int space, cap; - msk_owned_by_me(msk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + cap = sk->sk_rcvbuf >> 1; + else + cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; + val = min(val, cap); + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); - if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) { - sync_socket_options(msk, ssk); + /* Check if we need to signal EPOLLIN right now */ + if (mptcp_epollin_ready(sk)) + sk->sk_data_ready(sk); - subflow->setsockopt_seq = msk->setsockopt_seq; + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + return 0; + + space = __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, val); + if (space <= sk->sk_rcvbuf) + return 0; + + /* propagate the rcvbuf changes to all the subflows */ + WRITE_ONCE(sk->sk_rcvbuf, space); + mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + + slow = lock_sock_fast(ssk); + WRITE_ONCE(ssk->sk_rcvbuf, space); + tcp_sk(ssk)->window_clamp = val; + unlock_sock_fast(ssk, slow); } + return 0; } |