diff options
author | Jakub Kicinski <kuba@kernel.org> | 2022-11-30 07:24:31 +0300 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2022-11-30 07:24:33 +0300 |
commit | 7f0c940be5c5f52b0a7acaf2b55df73337f5c7a8 (patch) | |
tree | 917eebb20ba632172411d69cd576d4d57efc5f89 | |
parent | f2bb566f5c977ff010baaa9e5e14d9a75b06e5f2 (diff) | |
parent | ca7ae89160434cd045a4795a235eb16587bd8f73 (diff) | |
download | linux-7f0c940be5c5f52b0a7acaf2b55df73337f5c7a8.tar.xz |
Merge branch 'mptcp-msg_fastopen-and-tfo-listener-side-support'
Matthieu Baerts says:
====================
mptcp: MSG_FASTOPEN and TFO listener side support
Before this series, only the initiator of a connection was able to combine
both TCP FastOpen and MPTCP when using TCP_FASTOPEN_CONNECT socket option.
These new patches here add (in theory) the full support of TFO with MPTCP,
which means:
- MSG_FASTOPEN sendmsg flag support (patch 1/8)
- TFO support for the listener side (patches 2-5/8)
- TCP_FASTOPEN socket option (patch 6/8)
- TCP_FASTOPEN_KEY socket option (patch 7/8)
To support TFO for the server side, a few preparation patches are needed
(patches 2 to 5/8). Some of them were inspired by a previous work from
Benjamin Hesmans.
Note that TFO support with MPTCP has been validated with selftests
(patch 8/8) but also with Packetdrill tests running with a modified
but still very WIP version supporting MPTCP. Both the modified tool
and the tests are available online:
https://github.com/multipath-tcp/packetdrill/
====================
Link: https://lore.kernel.org/r/20221125222958.958636-1-matthieu.baerts@tessares.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r-- | net/mptcp/Makefile | 2 | ||||
-rw-r--r-- | net/mptcp/fastopen.c | 73 | ||||
-rw-r--r-- | net/mptcp/options.c | 25 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 39 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 28 | ||||
-rw-r--r-- | net/mptcp/sockopt.c | 9 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 105 | ||||
-rw-r--r-- | tools/testing/selftests/net/mptcp/mptcp_connect.c | 171 | ||||
-rwxr-xr-x | tools/testing/selftests/net/mptcp/mptcp_connect.sh | 21 |
9 files changed, 358 insertions, 115 deletions
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 6e7df47c9584..a3829ce548f9 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ - mib.o pm_netlink.o sockopt.o pm_userspace.o + mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c new file mode 100644 index 000000000000..d237d142171c --- /dev/null +++ b/net/mptcp/fastopen.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* MPTCP Fast Open Mechanism + * + * Copyright (c) 2021-2022, Dmytro SHYTYI + */ + +#include "protocol.h" + +void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, + struct request_sock *req) +{ + struct sock *ssk = subflow->tcp_sock; + struct sock *sk = subflow->conn; + struct sk_buff *skb; + struct tcp_sock *tp; + + tp = tcp_sk(ssk); + + subflow->is_mptfo = 1; + + skb = skb_peek(&ssk->sk_receive_queue); + if (WARN_ON_ONCE(!skb)) + return; + + /* dequeue the skb from sk receive queue */ + __skb_unlink(skb, &ssk->sk_receive_queue); + skb_ext_reset(skb); + skb_orphan(skb); + + /* We copy the fastopen data, but that don't belong to the mptcp sequence + * space, need to offset it in the subflow sequence, see mptcp_subflow_get_map_offset() + */ + tp->copied_seq += skb->len; + subflow->ssn_offset += skb->len; + + /* initialize a dummy sequence number, we will update it at MPC + * completion, if needed + */ + MPTCP_SKB_CB(skb)->map_seq = -skb->len; + MPTCP_SKB_CB(skb)->end_seq = 0; + MPTCP_SKB_CB(skb)->offset = 0; + MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + + mptcp_data_lock(sk); + + mptcp_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); + + sk->sk_data_ready(sk); + + mptcp_data_unlock(sk); +} + +void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, + const struct mptcp_options_received *mp_opt) +{ + struct sock *sk = (struct sock *)msk; + struct sk_buff *skb; + + mptcp_data_lock(sk); + skb = skb_peek_tail(&sk->sk_receive_queue); + if (skb) { + WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq); + pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx", sk, + MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq, + MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq); + MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq; + MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq; + } + + pr_debug("msk=%p ack_seq=%llx", msk, msk->ack_seq); + mptcp_data_unlock(sk); +} diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 30d289044e71..5ded85e2c374 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -26,6 +26,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, { u8 subtype = *ptr >> 4; int expected_opsize; + u16 subopt; u8 version; u8 flags; u8 i; @@ -38,11 +39,15 @@ static void mptcp_parse_option(const struct sk_buff *skb, expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; else expected_opsize = TCPOLEN_MPTCP_MPC_ACK; + subopt = OPTION_MPTCP_MPC_ACK; } else { - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) { expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; - else + subopt = OPTION_MPTCP_MPC_SYNACK; + } else { expected_opsize = TCPOLEN_MPTCP_MPC_SYN; + subopt = OPTION_MPTCP_MPC_SYN; + } } /* Cfr RFC 8684 Section 3.3.0: @@ -85,7 +90,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0); - mp_opt->suboptions |= OPTIONS_MPTCP_MPC; + mp_opt->suboptions |= subopt; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { mp_opt->sndr_key = get_unaligned_be64(ptr); ptr += 8; @@ -934,7 +939,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) && !subflow->request_join) tcp_send_ack(ssk); - goto fully_established; + goto check_notify; } /* we must process OoO packets before the first subflow is fully @@ -945,17 +950,20 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) { if (subflow->mp_join) goto reset; + if (subflow->is_mptfo && mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) + goto set_fully_established; return subflow->mp_capable; } - if (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || - ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo)) { + if (subflow->remote_key_valid && + (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || + ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo))) { /* subflows are fully established as soon as we get any * additional ack, including ADD_ADDR. */ subflow->fully_established = 1; WRITE_ONCE(msk->fully_established, true); - goto fully_established; + goto check_notify; } /* If the first established packet does not contain MP_CAPABLE + data @@ -974,11 +982,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, if (mp_opt->deny_join_id0) WRITE_ONCE(msk->pm.remote_deny_join_id0, true); +set_fully_established: if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); mptcp_subflow_fully_established(subflow, mp_opt); -fully_established: +check_notify: /* if the subflow is not already linked into the conn_list, we can't * notify the PM: this subflow is still on the listener queue * and the PM possibly acquiring the subflow lock could race with diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3722a8580b61..b0d387be500a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -36,15 +36,6 @@ struct mptcp6_sock { }; #endif -struct mptcp_skb_cb { - u64 map_seq; - u64 end_seq; - u32 offset; - u8 has_rxtstamp:1; -}; - -#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) - enum { MPTCP_CMSG_TS = BIT(0), MPTCP_CMSG_INQ = BIT(1), @@ -200,7 +191,7 @@ static void mptcp_rfree(struct sk_buff *skb) mptcp_rmem_uncharge(sk, len); } -static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) +void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; @@ -1711,17 +1702,14 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int ret = 0; long timeo; - /* we don't support FASTOPEN yet */ - if (msg->msg_flags & MSG_FASTOPEN) - return -EOPNOTSUPP; - /* silently ignore everything else */ - msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL; + msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN; lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); - if (unlikely(ssock && inet_sk(ssock->sk)->defer_connect)) { + if (unlikely(ssock && (inet_sk(ssock->sk)->defer_connect || + msg->msg_flags & MSG_FASTOPEN))) { int copied_syn = 0; ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn); @@ -3048,7 +3036,6 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); struct mptcp_sock *msk; - u64 ack_seq; if (!nsk) return NULL; @@ -3074,15 +3061,6 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; - if (mp_opt->suboptions & OPTIONS_MPTCP_MPC) { - msk->can_ack = true; - msk->remote_key = mp_opt->sndr_key; - mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); - ack_seq++; - WRITE_ONCE(msk->ack_seq, ack_seq); - atomic64_set(&msk->rcv_wnd_sent, ack_seq); - } - sock_reset_flag(nsk, SOCK_RCU_FREE); /* will be fully established after successful MPC subflow creation */ inet_sk_state_store(nsk, TCP_SYN_RECV); @@ -3355,7 +3333,6 @@ void mptcp_finish_connect(struct sock *ssk) struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; struct sock *sk; - u64 ack_seq; subflow = mptcp_subflow_ctx(ssk); sk = subflow->conn; @@ -3363,22 +3340,16 @@ void mptcp_finish_connect(struct sock *ssk) pr_debug("msk=%p, token=%u", sk, subflow->token); - mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); - ack_seq++; - subflow->map_seq = ack_seq; + subflow->map_seq = subflow->iasn; subflow->map_subflow_seq = 1; /* the socket is not connected yet, no msk/subflow ops can access/race * accessing the field below */ - WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->local_key, subflow->local_key); WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); - WRITE_ONCE(msk->ack_seq, ack_seq); - WRITE_ONCE(msk->can_ack, 1); WRITE_ONCE(msk->snd_una, msk->write_seq); - atomic64_set(&msk->rcv_wnd_sent, ack_seq); mptcp_pm_new_connection(msk, ssk, 0); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 6a09ab99a12d..8b4379a2cd85 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -126,6 +126,15 @@ #define MPTCP_CONNECTED 6 #define MPTCP_RESET_SCHEDULER 7 +struct mptcp_skb_cb { + u64 map_seq; + u64 end_seq; + u32 offset; + u8 has_rxtstamp:1; +}; + +#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) + static inline bool before64(__u64 seq1, __u64 seq2) { return (__s64)(seq1 - seq2) < 0; @@ -467,17 +476,22 @@ struct mptcp_subflow_context { send_fastclose : 1, send_infinite_map : 1, rx_eof : 1, - can_ack : 1, /* only after processing the remote a key */ + remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ local_id_valid : 1, /* local_id is correctly initialized */ - valid_csum_seen : 1; /* at least one csum validated */ + valid_csum_seen : 1, /* at least one csum validated */ + is_mptfo : 1, /* subflow is doing TFO */ + __unused : 8; enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; u32 local_nonce; u32 remote_token; - u8 hmac[MPTCPOPT_HMAC_LEN]; + union { + u8 hmac[MPTCPOPT_HMAC_LEN]; /* MPJ subflow only */ + u64 iasn; /* initial ack sequence number, MPC subflows only */ + }; u8 local_id; u8 remote_id; u8 reset_seen:1; @@ -603,7 +617,7 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net); int mptcp_get_pm_type(const struct net *net); void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, - struct mptcp_options_received *mp_opt); + const struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); void mptcp_check_and_set_pending(struct sock *sk); void __mptcp_push_pending(struct sock *sk, unsigned int flags); @@ -619,6 +633,7 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); +void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port); @@ -826,6 +841,11 @@ void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_ void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); +void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, + const struct mptcp_options_received *mp_opt); +void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, + struct request_sock *req); + static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index f62f6483ef77..a47423ebb33a 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -559,7 +559,9 @@ static bool mptcp_supported_sockopt(int level, int optname) case TCP_NOTSENT_LOWAT: case TCP_TX_DELAY: case TCP_INQ: + case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: return true; } @@ -569,9 +571,6 @@ static bool mptcp_supported_sockopt(int level, int optname) /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS, * TCP_REPAIR_WINDOW are not supported, better avoid this mess */ - /* TCP_FASTOPEN_KEY, TCP_FASTOPEN are not supported because - * fastopen for the listener side is currently unsupported - */ } return false; } @@ -801,7 +800,9 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, /* See tcp.c: TCP_DEFER_ACCEPT does not fail */ mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); return 0; + case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); @@ -1166,7 +1167,9 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, case TCP_INFO: case TCP_CC_INFO: case TCP_DEFER_ACCEPT: + case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index f3c336872475..29904303f5c2 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -307,7 +307,48 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk, return NULL; } +static void subflow_prep_synack(const struct sock *sk, struct request_sock *req, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct inet_request_sock *ireq = inet_rsk(req); + + /* clear tstamp_ok, as needed depending on cookie */ + if (foc && foc->len > -1) + ireq->tstamp_ok = 0; + + if (synack_type == TCP_SYNACK_FASTOPEN) + mptcp_fastopen_subflow_synack_set_params(subflow, req); +} + +static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst, + struct flowi *fl, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) +{ + subflow_prep_synack(sk, req, foc, synack_type); + + return tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc, + synack_type, syn_skb); +} + #if IS_ENABLED(CONFIG_MPTCP_IPV6) +static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst, + struct flowi *fl, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) +{ + subflow_prep_synack(sk, req, foc, synack_type); + + return tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc, + synack_type, syn_skb); +} + static struct dst_entry *subflow_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, @@ -392,11 +433,33 @@ static void mptcp_set_connected(struct sock *sk) mptcp_data_unlock(sk); } +static void subflow_set_remote_key(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + const struct mptcp_options_received *mp_opt) +{ + /* active MPC subflow will reach here multiple times: + * at subflow_finish_connect() time and at 4th ack time + */ + if (subflow->remote_key_valid) + return; + + subflow->remote_key_valid = 1; + subflow->remote_key = mp_opt->sndr_key; + mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn); + subflow->iasn++; + + WRITE_ONCE(msk->remote_key, subflow->remote_key); + WRITE_ONCE(msk->ack_seq, subflow->iasn); + WRITE_ONCE(msk->can_ack, true); + atomic64_set(&msk->rcv_wnd_sent, subflow->iasn); +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_options_received mp_opt; struct sock *parent = subflow->conn; + struct mptcp_sock *msk; subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); @@ -404,6 +467,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (subflow->conn_finished) return; + msk = mptcp_sk(parent); mptcp_propagate_sndbuf(parent, sk); subflow->rel_write_seq = 1; subflow->conn_finished = 1; @@ -416,19 +480,16 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); mptcp_do_fallback(sk); - pr_fallback(mptcp_sk(subflow->conn)); + pr_fallback(msk); goto fallback; } if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD) - WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true); + WRITE_ONCE(msk->csum_enabled, true); if (mp_opt.deny_join_id0) - WRITE_ONCE(mptcp_sk(parent)->pm.remote_deny_join_id0, true); + WRITE_ONCE(msk->pm.remote_deny_join_id0, true); subflow->mp_capable = 1; - subflow->can_ack = 1; - subflow->remote_key = mp_opt.sndr_key; - pr_debug("subflow=%p, remote_key=%llu", subflow, - subflow->remote_key); + subflow_set_remote_key(msk, subflow, &mp_opt); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); mptcp_set_connected(parent); @@ -466,7 +527,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); - if (subflow_use_different_dport(mptcp_sk(parent), sk)) { + if (subflow_use_different_dport(msk, sk)) { pr_debug("synack inet_dport=%d %d", ntohs(inet_sk(sk)->inet_dport), ntohs(inet_sk(parent)->inet_dport)); @@ -474,7 +535,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } } else if (mptcp_check_fallback(sk)) { fallback: - mptcp_rcv_space_init(mptcp_sk(parent), sk); + mptcp_rcv_space_init(msk, sk); mptcp_set_connected(parent); } return; @@ -637,14 +698,16 @@ static void subflow_drop_ctx(struct sock *ssk) } void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, - struct mptcp_options_received *mp_opt) + const struct mptcp_options_received *mp_opt) { struct mptcp_sock *msk = mptcp_sk(subflow->conn); - subflow->remote_key = mp_opt->sndr_key; + subflow_set_remote_key(msk, subflow, mp_opt); subflow->fully_established = 1; - subflow->can_ack = 1; WRITE_ONCE(msk->fully_established, true); + + if (subflow->is_mptfo) + mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); } static struct sock *subflow_syn_recv_sock(const struct sock *sk, @@ -760,7 +823,7 @@ create_child: /* with OoO packets we can reach here without ingress * mpc option */ - if (mp_opt.suboptions & OPTIONS_MPTCP_MPC) + if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) mptcp_subflow_fully_established(ctx, &mp_opt); } else if (ctx->mp_join) { struct mptcp_sock *owner; @@ -1198,16 +1261,8 @@ static bool subflow_check_data_avail(struct sock *ssk) if (WARN_ON_ONCE(!skb)) goto no_data; - /* if msk lacks the remote key, this subflow must provide an - * MP_CAPABLE-based mapping - */ - if (unlikely(!READ_ONCE(msk->can_ack))) { - if (!subflow->mpc_map) - goto fallback; - WRITE_ONCE(msk->remote_key, subflow->remote_key); - WRITE_ONCE(msk->ack_seq, subflow->map_seq); - WRITE_ONCE(msk->can_ack, true); - } + if (unlikely(!READ_ONCE(msk->can_ack))) + goto fallback; old_ack = READ_ONCE(msk->ack_seq); ack_seq = mptcp_subflow_get_mapped_dsn(subflow); @@ -1480,6 +1535,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id, &flags, &ifindex); + subflow->remote_key_valid = 1; subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; subflow->token = msk->token; @@ -1873,6 +1929,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; new_ctx->fully_established = 1; + new_ctx->remote_key_valid = 1; new_ctx->backup = subflow_req->backup; new_ctx->remote_id = subflow_req->remote_id; new_ctx->token = subflow_req->token; @@ -1929,6 +1986,7 @@ void __init mptcp_subflow_init(void) subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; + subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack; subflow_specific = ipv4_specific; subflow_specific.conn_request = subflow_v4_conn_request; @@ -1942,6 +2000,7 @@ void __init mptcp_subflow_init(void) #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; + subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack; subflow_v6_specific = ipv6_specific; subflow_v6_specific.conn_request = subflow_v6_conn_request; diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index e54653ea2ed4..8a8266957bc5 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -83,6 +83,7 @@ struct cfg_cmsg_types { struct cfg_sockopt_types { unsigned int transparent:1; + unsigned int mptfo:1; }; struct tcp_inq_state { @@ -90,6 +91,13 @@ struct tcp_inq_state { bool expect_eof; }; +struct wstate { + char buf[8192]; + unsigned int len; + unsigned int off; + unsigned int total_len; +}; + static struct tcp_inq_state tcp_inq; static struct cfg_cmsg_types cfg_cmsg_types; @@ -232,6 +240,14 @@ static void set_transparent(int fd, int pf) } } +static void set_mptfo(int fd, int pf) +{ + int qlen = 25; + + if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) == -1) + perror("TCP_FASTOPEN"); +} + static int do_ulp_so(int sock, const char *name) { return setsockopt(sock, IPPROTO_TCP, TCP_ULP, name, strlen(name)); @@ -300,6 +316,9 @@ static int sock_listen_mptcp(const char * const listenaddr, if (cfg_sockopt_types.transparent) set_transparent(sock, pf); + if (cfg_sockopt_types.mptfo) + set_mptfo(sock, pf); + if (bind(sock, a->ai_addr, a->ai_addrlen) == 0) break; /* success */ @@ -330,13 +349,15 @@ static int sock_listen_mptcp(const char * const listenaddr, static int sock_connect_mptcp(const char * const remoteaddr, const char * const port, int proto, - struct addrinfo **peer) + struct addrinfo **peer, + int infd, struct wstate *winfo) { struct addrinfo hints = { .ai_protocol = IPPROTO_TCP, .ai_socktype = SOCK_STREAM, }; struct addrinfo *a, *addr; + int syn_copied = 0; int sock = -1; hints.ai_family = pf; @@ -354,14 +375,34 @@ static int sock_connect_mptcp(const char * const remoteaddr, if (cfg_mark) set_mark(sock, cfg_mark); - if (connect(sock, a->ai_addr, a->ai_addrlen) == 0) { - *peer = a; - break; /* success */ + if (cfg_sockopt_types.mptfo) { + if (!winfo->total_len) + winfo->total_len = winfo->len = read(infd, winfo->buf, + sizeof(winfo->buf)); + + syn_copied = sendto(sock, winfo->buf, winfo->len, MSG_FASTOPEN, + a->ai_addr, a->ai_addrlen); + if (syn_copied >= 0) { + winfo->off = syn_copied; + winfo->len -= syn_copied; + *peer = a; + break; /* success */ + } + } else { + if (connect(sock, a->ai_addr, a->ai_addrlen) == 0) { + *peer = a; + break; /* success */ + } + } + if (cfg_sockopt_types.mptfo) { + perror("sendto()"); + close(sock); + sock = -1; + } else { + perror("connect()"); + close(sock); + sock = -1; } - - perror("connect()"); - close(sock); - sock = -1; } freeaddrinfo(addr); @@ -571,14 +612,14 @@ static void shut_wr(int fd) shutdown(fd, SHUT_WR); } -static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after_out) +static int copyfd_io_poll(int infd, int peerfd, int outfd, + bool *in_closed_after_out, struct wstate *winfo) { struct pollfd fds = { .fd = peerfd, .events = POLLIN | POLLOUT, }; - unsigned int woff = 0, wlen = 0, total_wlen = 0, total_rlen = 0; - char wbuf[8192]; + unsigned int total_wlen = 0, total_rlen = 0; set_nonblock(peerfd, true); @@ -638,19 +679,19 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after } if (fds.revents & POLLOUT) { - if (wlen == 0) { - woff = 0; - wlen = read(infd, wbuf, sizeof(wbuf)); + if (winfo->len == 0) { + winfo->off = 0; + winfo->len = read(infd, winfo->buf, sizeof(winfo->buf)); } - if (wlen > 0) { + if (winfo->len > 0) { ssize_t bw; /* limit the total amount of written data to the trunc value */ - if (cfg_truncate > 0 && wlen + total_wlen > cfg_truncate) - wlen = cfg_truncate - total_wlen; + if (cfg_truncate > 0 && winfo->len + total_wlen > cfg_truncate) + winfo->len = cfg_truncate - total_wlen; - bw = do_rnd_write(peerfd, wbuf + woff, wlen); + bw = do_rnd_write(peerfd, winfo->buf + winfo->off, winfo->len); if (bw < 0) { if (cfg_rcv_trunc) return 0; @@ -658,10 +699,10 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after return 111; } - woff += bw; - wlen -= bw; + winfo->off += bw; + winfo->len -= bw; total_wlen += bw; - } else if (wlen == 0) { + } else if (winfo->len == 0) { /* We have no more data to send. */ fds.events &= ~POLLOUT; @@ -717,10 +758,26 @@ static int do_recvfile(int infd, int outfd) return (int)r; } -static int do_mmap(int infd, int outfd, unsigned int size) +static int spool_buf(int fd, struct wstate *winfo) +{ + while (winfo->len) { + int ret = write(fd, winfo->buf + winfo->off, winfo->len); + + if (ret < 0) { + perror("write"); + return 4; + } + winfo->off += ret; + winfo->len -= ret; + } + return 0; +} + +static int do_mmap(int infd, int outfd, unsigned int size, + struct wstate *winfo) { char *inbuf = mmap(NULL, size, PROT_READ, MAP_SHARED, infd, 0); - ssize_t ret = 0, off = 0; + ssize_t ret = 0, off = winfo->total_len; size_t rem; if (inbuf == MAP_FAILED) { @@ -728,7 +785,11 @@ static int do_mmap(int infd, int outfd, unsigned int size) return 1; } - rem = size; + ret = spool_buf(outfd, winfo); + if (ret < 0) + return ret; + + rem = size - winfo->total_len; while (rem > 0) { ret = write(outfd, inbuf + off, rem); @@ -772,8 +833,16 @@ static int get_infd_size(int fd) return (int)count; } -static int do_sendfile(int infd, int outfd, unsigned int count) +static int do_sendfile(int infd, int outfd, unsigned int count, + struct wstate *winfo) { + int ret = spool_buf(outfd, winfo); + + if (ret < 0) + return ret; + + count -= winfo->total_len; + while (count > 0) { ssize_t r; @@ -790,7 +859,8 @@ static int do_sendfile(int infd, int outfd, unsigned int count) } static int copyfd_io_mmap(int infd, int peerfd, int outfd, - unsigned int size, bool *in_closed_after_out) + unsigned int size, bool *in_closed_after_out, + struct wstate *winfo) { int err; @@ -799,9 +869,9 @@ static int copyfd_io_mmap(int infd, int peerfd, int outfd, if (err) return err; - err = do_mmap(infd, peerfd, size); + err = do_mmap(infd, peerfd, size, winfo); } else { - err = do_mmap(infd, peerfd, size); + err = do_mmap(infd, peerfd, size, winfo); if (err) return err; @@ -815,7 +885,7 @@ static int copyfd_io_mmap(int infd, int peerfd, int outfd, } static int copyfd_io_sendfile(int infd, int peerfd, int outfd, - unsigned int size, bool *in_closed_after_out) + unsigned int size, bool *in_closed_after_out, struct wstate *winfo) { int err; @@ -824,9 +894,9 @@ static int copyfd_io_sendfile(int infd, int peerfd, int outfd, if (err) return err; - err = do_sendfile(infd, peerfd, size); + err = do_sendfile(infd, peerfd, size, winfo); } else { - err = do_sendfile(infd, peerfd, size); + err = do_sendfile(infd, peerfd, size, winfo); if (err) return err; @@ -839,7 +909,7 @@ static int copyfd_io_sendfile(int infd, int peerfd, int outfd, return err; } -static int copyfd_io(int infd, int peerfd, int outfd, bool close_peerfd) +static int copyfd_io(int infd, int peerfd, int outfd, bool close_peerfd, struct wstate *winfo) { bool in_closed_after_out = false; struct timespec start, end; @@ -851,21 +921,24 @@ static int copyfd_io(int infd, int peerfd, int outfd, bool close_peerfd) switch (cfg_mode) { case CFG_MODE_POLL: - ret = copyfd_io_poll(infd, peerfd, outfd, &in_closed_after_out); + ret = copyfd_io_poll(infd, peerfd, outfd, &in_closed_after_out, + winfo); break; case CFG_MODE_MMAP: file_size = get_infd_size(infd); if (file_size < 0) return file_size; - ret = copyfd_io_mmap(infd, peerfd, outfd, file_size, &in_closed_after_out); + ret = copyfd_io_mmap(infd, peerfd, outfd, file_size, + &in_closed_after_out, winfo); break; case CFG_MODE_SENDFILE: file_size = get_infd_size(infd); if (file_size < 0) return file_size; - ret = copyfd_io_sendfile(infd, peerfd, outfd, file_size, &in_closed_after_out); + ret = copyfd_io_sendfile(infd, peerfd, outfd, file_size, + &in_closed_after_out, winfo); break; default: @@ -999,6 +1072,7 @@ static void maybe_close(int fd) int main_loop_s(int listensock) { struct sockaddr_storage ss; + struct wstate winfo; struct pollfd polls; socklen_t salen; int remotesock; @@ -1033,7 +1107,8 @@ again: SOCK_TEST_TCPULP(remotesock, 0); - copyfd_io(fd, remotesock, 1, true); + memset(&winfo, 0, sizeof(winfo)); + copyfd_io(fd, remotesock, 1, true, &winfo); } else { perror("accept"); return 1; @@ -1130,6 +1205,11 @@ static void parse_setsock_options(const char *name) return; } + if (strncmp(name, "MPTFO", len) == 0) { + cfg_sockopt_types.mptfo = 1; + return; + } + fprintf(stderr, "Unrecognized setsockopt option %s\n", name); exit(1); } @@ -1166,11 +1246,18 @@ void xdisconnect(int fd, int addrlen) int main_loop(void) { - int fd, ret, fd_in = 0; + int fd = 0, ret, fd_in = 0; struct addrinfo *peer; + struct wstate winfo; + + if (cfg_input && cfg_sockopt_types.mptfo) { + fd_in = open(cfg_input, O_RDONLY); + if (fd < 0) + xerror("can't open %s:%d", cfg_input, errno); + } - /* listener is ready. */ - fd = sock_connect_mptcp(cfg_host, cfg_port, cfg_sock_proto, &peer); + memset(&winfo, 0, sizeof(winfo)); + fd = sock_connect_mptcp(cfg_host, cfg_port, cfg_sock_proto, &peer, fd_in, &winfo); if (fd < 0) return 2; @@ -1186,14 +1273,13 @@ again: if (cfg_cmsg_types.cmsg_enabled) apply_cmsg_types(fd, &cfg_cmsg_types); - if (cfg_input) { + if (cfg_input && !cfg_sockopt_types.mptfo) { fd_in = open(cfg_input, O_RDONLY); if (fd < 0) xerror("can't open %s:%d", cfg_input, errno); } - /* close the client socket open only if we are not going to reconnect */ - ret = copyfd_io(fd_in, fd, 1, 0); + ret = copyfd_io(fd_in, fd, 1, 0, &winfo); if (ret) return ret; @@ -1210,6 +1296,7 @@ again: xerror("can't reconnect: %d", errno); if (cfg_input) close(fd_in); + memset(&winfo, 0, sizeof(winfo)); goto again; } else { close(fd); diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 621af6895f4d..60198b91a530 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -762,6 +762,23 @@ run_tests_peekmode() run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 "-P ${peekmode}" } +run_tests_mptfo() +{ + echo "INFO: with MPTFO start" + ip netns exec "$ns1" sysctl -q net.ipv4.tcp_fastopen=2 + ip netns exec "$ns2" sysctl -q net.ipv4.tcp_fastopen=1 + + run_tests_lo "$ns1" "$ns2" 10.0.1.1 0 "-o MPTFO" + run_tests_lo "$ns1" "$ns2" 10.0.1.1 0 "-o MPTFO" + + run_tests_lo "$ns1" "$ns2" dead:beef:1::1 0 "-o MPTFO" + run_tests_lo "$ns1" "$ns2" dead:beef:1::1 0 "-o MPTFO" + + ip netns exec "$ns1" sysctl -q net.ipv4.tcp_fastopen=0 + ip netns exec "$ns2" sysctl -q net.ipv4.tcp_fastopen=0 + echo "INFO: with MPTFO end" +} + run_tests_disconnect() { local peekmode="$1" @@ -901,6 +918,10 @@ run_tests_peekmode "saveWithPeek" run_tests_peekmode "saveAfterPeek" stop_if_error "Tests with peek mode have failed" +# MPTFO (MultiPath TCP Fatopen tests) +run_tests_mptfo +stop_if_error "Tests with MPTFO have failed" + # connect to ns4 ip address, ns2 should intercept/proxy run_test_transparent 10.0.3.1 "tproxy ipv4" run_test_transparent dead:beef:3::1 "tproxy ipv6" |