diff options
Diffstat (limited to 'net/mptcp')
-rw-r--r-- | net/mptcp/Kconfig | 24 | ||||
-rw-r--r-- | net/mptcp/Makefile | 6 | ||||
-rw-r--r-- | net/mptcp/crypto.c | 63 | ||||
-rw-r--r-- | net/mptcp/crypto_test.c | 72 | ||||
-rw-r--r-- | net/mptcp/mptcp_diag.c | 169 | ||||
-rw-r--r-- | net/mptcp/options.c | 9 | ||||
-rw-r--r-- | net/mptcp/pm.c | 46 | ||||
-rw-r--r-- | net/mptcp/pm_netlink.c | 2 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 526 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 84 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 131 | ||||
-rw-r--r-- | net/mptcp/token.c | 339 | ||||
-rw-r--r-- | net/mptcp/token_test.c | 140 |
13 files changed, 1131 insertions, 480 deletions
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index a9ed3bf1d93f..698bc3525160 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -13,17 +13,29 @@ config MPTCP if MPTCP +config INET_MPTCP_DIAG + depends on INET_DIAG + def_tristate INET_DIAG + config MPTCP_IPV6 bool "MPTCP: IPv6 support for Multipath TCP" select IPV6 default y -config MPTCP_HMAC_TEST - bool "Tests for MPTCP HMAC implementation" +endif + +config MPTCP_KUNIT_TESTS + tristate "This builds the MPTCP KUnit tests" if !KUNIT_ALL_TESTS + select MPTCP + depends on KUNIT + default KUNIT_ALL_TESTS help - This option enable boot time self-test for the HMAC implementation - used by the MPTCP code + Currently covers the MPTCP crypto and token helpers. + Only useful for kernel devs running KUnit test harness and are not + for inclusion into a production build. - Say N if you are unsure. + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. -endif diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index baa0640527c7..2360cbd27d59 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -3,3 +3,9 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ mib.o pm_netlink.o + +obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o + +mptcp_crypto_test-objs := crypto_test.o +mptcp_token_test-objs := token_test.o +obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index 3d980713a9e2..6c4ea979dfd4 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -87,65 +87,6 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) sha256_final(&state, (u8 *)hmac); } -#ifdef CONFIG_MPTCP_HMAC_TEST -struct test_cast { - char *key; - char *msg; - char *result; -}; - -/* we can't reuse RFC 4231 test vectors, as we have constraint on the - * input and key size. - */ -static struct test_cast tests[] = { - { - .key = "0b0b0b0b0b0b0b0b", - .msg = "48692054", - .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa", - }, - { - .key = "aaaaaaaaaaaaaaaa", - .msg = "dddddddd", - .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9", - }, - { - .key = "0102030405060708", - .msg = "cdcdcdcd", - .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d", - }, -}; - -static int __init test_mptcp_crypto(void) -{ - char hmac[32], hmac_hex[65]; - u32 nonce1, nonce2; - u64 key1, key2; - u8 msg[8]; - int i, j; - - for (i = 0; i < ARRAY_SIZE(tests); ++i) { - /* mptcp hmap will convert to be before computing the hmac */ - key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0])); - key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8])); - nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); - nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); - - put_unaligned_be32(nonce1, &msg[0]); - put_unaligned_be32(nonce2, &msg[4]); - - mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); - for (j = 0; j < 32; ++j) - sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); - hmac_hex[64] = 0; - - if (memcmp(hmac_hex, tests[i].result, 64)) - pr_err("test %d failed, got %s expected %s", i, - hmac_hex, tests[i].result); - else - pr_info("test %d [ ok ]", i); - } - return 0; -} - -late_initcall(test_mptcp_crypto); +#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS) +EXPORT_SYMBOL_GPL(mptcp_crypto_hmac_sha); #endif diff --git a/net/mptcp/crypto_test.c b/net/mptcp/crypto_test.c new file mode 100644 index 000000000000..017248dea038 --- /dev/null +++ b/net/mptcp/crypto_test.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <kunit/test.h> + +#include "protocol.h" + +struct test_case { + char *key; + char *msg; + char *result; +}; + +/* we can't reuse RFC 4231 test vectors, as we have constraint on the + * input and key size. + */ +static struct test_case tests[] = { + { + .key = "0b0b0b0b0b0b0b0b", + .msg = "48692054", + .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa", + }, + { + .key = "aaaaaaaaaaaaaaaa", + .msg = "dddddddd", + .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9", + }, + { + .key = "0102030405060708", + .msg = "cdcdcdcd", + .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d", + }, +}; + +static void mptcp_crypto_test_basic(struct kunit *test) +{ + char hmac[32], hmac_hex[65]; + u32 nonce1, nonce2; + u64 key1, key2; + u8 msg[8]; + int i, j; + + for (i = 0; i < ARRAY_SIZE(tests); ++i) { + /* mptcp hmap will convert to be before computing the hmac */ + key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0])); + key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8])); + nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); + nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); + + put_unaligned_be32(nonce1, &msg[0]); + put_unaligned_be32(nonce2, &msg[4]); + + mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); + for (j = 0; j < 32; ++j) + sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); + hmac_hex[64] = 0; + + KUNIT_EXPECT_STREQ(test, &hmac_hex[0], tests[i].result); + } +} + +static struct kunit_case mptcp_crypto_test_cases[] = { + KUNIT_CASE(mptcp_crypto_test_basic), + {} +}; + +static struct kunit_suite mptcp_crypto_suite = { + .name = "mptcp-crypto", + .test_cases = mptcp_crypto_test_cases, +}; + +kunit_test_suite(mptcp_crypto_suite); + +MODULE_LICENSE("GPL"); diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c new file mode 100644 index 000000000000..5f390a97f556 --- /dev/null +++ b/net/mptcp/mptcp_diag.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* MPTCP socket monitoring support + * + * Copyright (c) 2020 Red Hat + * + * Author: Paolo Abeni <pabeni@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/net.h> +#include <linux/inet_diag.h> +#include <net/netlink.h> +#include <uapi/linux/mptcp.h> +#include "protocol.h" + +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + struct nlattr *bc, bool net_admin) +{ + if (!inet_diag_bc_sk(bc, sk)) + return 0; + + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI, + net_admin); +} + +static int mptcp_diag_dump_one(struct netlink_callback *cb, + const struct inet_diag_req_v2 *req) +{ + struct sk_buff *in_skb = cb->skb; + struct mptcp_sock *msk = NULL; + struct sk_buff *rep; + int err = -ENOENT; + struct net *net; + struct sock *sk; + + net = sock_net(in_skb->sk); + msk = mptcp_token_get_sock(req->id.idiag_cookie[0]); + if (!msk) + goto out_nosk; + + err = -ENOMEM; + sk = (struct sock *)msk; + rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct mptcp_info)) + + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, + GFP_KERNEL); + if (!rep) + goto out; + + err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(rep); + goto out; + } + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, + MSG_DONTWAIT); + if (err > 0) + err = 0; +out: + sock_put(sk); + +out_nosk: + return err; +} + +static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r) +{ + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; + struct mptcp_sock *msk; + struct nlattr *bc; + + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; + + while ((msk = mptcp_token_iter_next(net, &cb->args[0], &cb->args[1])) != + NULL) { + struct inet_sock *inet = (struct inet_sock *)msk; + struct sock *sk = (struct sock *)msk; + int ret = 0; + + if (!(r->idiag_states & (1 << sk->sk_state))) + goto next; + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next; + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next; + if (r->id.idiag_dport != inet->inet_dport && + r->id.idiag_dport) + goto next; + + ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); +next: + sock_put(sk); + if (ret < 0) { + /* will retry on the same position */ + cb->args[1]--; + break; + } + cond_resched(); + } +} + +static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_info *info = _info; + u32 flags = 0; + bool slow; + u8 val; + + r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_wqueue = sk_wmem_alloc_get(sk); + if (!info) + return; + + slow = lock_sock_fast(sk); + info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); + info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); + info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max); + val = READ_ONCE(msk->pm.add_addr_signal_max); + info->mptcpi_add_addr_signal_max = val; + val = READ_ONCE(msk->pm.add_addr_accept_max); + info->mptcpi_add_addr_accepted_max = val; + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) + flags |= MPTCP_INFO_FLAG_FALLBACK; + if (READ_ONCE(msk->can_ack)) + flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; + info->mptcpi_flags = flags; + info->mptcpi_token = READ_ONCE(msk->token); + info->mptcpi_write_seq = READ_ONCE(msk->write_seq); + info->mptcpi_snd_una = atomic64_read(&msk->snd_una); + info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + unlock_sock_fast(sk, slow); +} + +static const struct inet_diag_handler mptcp_diag_handler = { + .dump = mptcp_diag_dump, + .dump_one = mptcp_diag_dump_one, + .idiag_get_info = mptcp_diag_get_info, + .idiag_type = IPPROTO_MPTCP, + .idiag_info_size = sizeof(struct mptcp_info), +}; + +static int __init mptcp_diag_init(void) +{ + return inet_diag_register(&mptcp_diag_handler); +} + +static void __exit mptcp_diag_exit(void) +{ + inet_diag_unregister(&mptcp_diag_handler); +} + +module_init(mptcp_diag_init); +module_exit(mptcp_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 8f940be42f98..19707c07efc1 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -624,6 +624,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, opts->suboptions = 0; + if (unlikely(mptcp_check_fallback(sk))) + return false; + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, @@ -714,7 +717,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, */ if (!mp_opt->mp_capable) { subflow->mp_capable = 0; - tcp_sk(sk)->is_mptcp = 0; + pr_fallback(msk); + __mptcp_do_fallback(msk); return false; } @@ -814,6 +818,9 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; + if (__mptcp_check_fallback(msk)) + return; + mptcp_get_options(skb, &mp_opt); if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 977d9c8b1453..a8ad20559aaa 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -10,8 +10,6 @@ #include <net/mptcp.h> #include "protocol.h" -static struct workqueue_struct *pm_wq; - /* path manager command handlers */ int mptcp_pm_announce_addr(struct mptcp_sock *msk, @@ -78,7 +76,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, return false; msk->pm.status |= BIT(new_status); - if (queue_work(pm_wq, &msk->pm.work)) + if (schedule_work(&msk->work)) sock_hold((struct sock *)msk); return true; } @@ -181,35 +179,6 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) return mptcp_pm_nl_get_local_id(msk, skc); } -static void pm_worker(struct work_struct *work) -{ - struct mptcp_pm_data *pm = container_of(work, struct mptcp_pm_data, - work); - struct mptcp_sock *msk = container_of(pm, struct mptcp_sock, pm); - struct sock *sk = (struct sock *)msk; - - lock_sock(sk); - spin_lock_bh(&msk->pm.lock); - - pr_debug("msk=%p status=%x", msk, pm->status); - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); - mptcp_pm_nl_add_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); - mptcp_pm_nl_fully_established(msk); - } - if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); - mptcp_pm_nl_subflow_established(msk); - } - - spin_unlock_bh(&msk->pm.lock); - release_sock(sk); - sock_put(sk); -} - void mptcp_pm_data_init(struct mptcp_sock *msk) { msk->pm.add_addr_signaled = 0; @@ -223,22 +192,11 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) msk->pm.status = 0; spin_lock_init(&msk->pm.lock); - INIT_WORK(&msk->pm.work, pm_worker); mptcp_pm_nl_data_init(msk); } -void mptcp_pm_close(struct mptcp_sock *msk) -{ - if (cancel_work_sync(&msk->pm.work)) - sock_put((struct sock *)msk); -} - -void mptcp_pm_init(void) +void __init mptcp_pm_init(void) { - pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); - if (!pm_wq) - panic("Failed to allocate workqueue"); - mptcp_pm_nl_init(); } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index b78edf237ba0..c8820c4156e6 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -851,7 +851,7 @@ static struct pernet_operations mptcp_pm_pernet_ops = { .size = sizeof(struct pm_nl_pernet), }; -void mptcp_pm_nl_init(void) +void __init mptcp_pm_nl_init(void) { if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) panic("Failed to register MPTCP PM pernet subsystem.\n"); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3980fbb6f31e..dbe43e0cd734 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -52,18 +52,10 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) return msk->subflow; } -static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk) -{ - return msk->first && !sk_is_mptcp(msk->first); -} - -static struct socket *mptcp_is_tcpsk(struct sock *sk) +static bool mptcp_is_tcpsk(struct sock *sk) { struct socket *sock = sk->sk_socket; - if (sock->sk != sk) - return NULL; - if (unlikely(sk->sk_prot == &tcp_prot)) { /* we are being invoked after mptcp_accept() has * accepted a non-mp-capable flow: sk is a tcp_sk, @@ -73,59 +65,37 @@ static struct socket *mptcp_is_tcpsk(struct sock *sk) * bypass mptcp. */ sock->ops = &inet_stream_ops; - return sock; + return true; #if IS_ENABLED(CONFIG_MPTCP_IPV6) } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { sock->ops = &inet6_stream_ops; - return sock; + return true; #endif } - return NULL; + return false; } -static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) +static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { - struct socket *sock; - sock_owned_by_me((const struct sock *)msk); - sock = mptcp_is_tcpsk((struct sock *)msk); - if (unlikely(sock)) - return sock; - - if (likely(!__mptcp_needs_tcp_fallback(msk))) + if (likely(!__mptcp_check_fallback(msk))) return NULL; - return msk->subflow; + return msk->first; } -static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) -{ - return !msk->first; -} - -static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) +static int __mptcp_socket_create(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; struct socket *ssock; int err; - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) - return ssock; - - ssock = __mptcp_nmpc_socket(msk); - if (ssock) - goto set_state; - - if (!__mptcp_can_create_subflow(msk)) - return ERR_PTR(-EINVAL); - err = mptcp_subflow_create_socket(sk, &ssock); if (err) - return ERR_PTR(err); + return err; msk->first = ssock->sk; msk->subflow = ssock; @@ -133,10 +103,12 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) list_add(&subflow->node, &msk->conn_list); subflow->request_mptcp = 1; -set_state: - if (state != MPTCP_SAME_STATE) - inet_sk_state_store(sk, state); - return ssock; + /* accept() will wait on first subflow sk_wq, and we always wakes up + * via msk->sk_socket + */ + RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq); + + return 0; } static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, @@ -207,13 +179,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, return false; } - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf); - - if (rcvbuf > sk->sk_rcvbuf) - sk->sk_rcvbuf = rcvbuf; - } - tp = tcp_sk(ssk); do { u32 map_remaining, offset; @@ -229,6 +194,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, if (!skb) break; + if (__mptcp_check_fallback(msk)) { + /* if we are running under the workqueue, TCP could have + * collapsed skbs between dummy map creation and now + * be sure to adjust the size + */ + map_remaining = skb->len; + subflow->map_data_len = skb->len; + } + offset = seq - TCP_SKB_CB(skb)->seq; fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (fin) { @@ -466,8 +440,15 @@ static void mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; - u64 snd_una = atomic64_read(&msk->snd_una); bool cleaned = false; + u64 snd_una; + + /* on fallback we just need to ignore snd_una, as this is really + * plain TCP + */ + if (__mptcp_check_fallback(msk)) + atomic64_set(&msk->snd_una, msk->write_seq); + snd_una = atomic64_read(&msk->snd_una); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) @@ -740,7 +721,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; - struct socket *ssock; size_t copied = 0; struct sock *ssk; bool tx_ok; @@ -759,15 +739,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; } -fallback: - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) { - release_sock(sk); - pr_debug("fallback passthrough"); - ret = sock_sendmsg(ssock, msg); - return ret >= 0 ? ret + copied : (copied ? copied : ret); - } - pfrag = sk_page_frag(sk); restart: mptcp_clean_una(sk); @@ -819,17 +790,6 @@ wait_for_sndbuf: } break; } - if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) { - /* Can happen for passive sockets: - * 3WHS negotiated MPTCP, but first packet after is - * plain TCP (e.g. due to middlebox filtering unknown - * options). - * - * Fall back to TCP. - */ - release_sock(ssk); - goto fallback; - } copied += ret; @@ -949,6 +909,100 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, return copied; } +/* receive buffer autotuning. See tcp_rcv_space_adjust for more information. + * + * Only difference: Use highest rtt estimate of the subflows in use. + */ +static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + u32 time, advmss = 1; + u64 rtt_us, mstamp; + + sock_owned_by_me(sk); + + if (copied <= 0) + return; + + msk->rcvq_space.copied += copied; + + mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); + time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); + + rtt_us = msk->rcvq_space.rtt_us; + if (rtt_us && time < (rtt_us >> 3)) + return; + + rtt_us = 0; + mptcp_for_each_subflow(msk, subflow) { + const struct tcp_sock *tp; + u64 sf_rtt_us; + u32 sf_advmss; + + tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); + + sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); + sf_advmss = READ_ONCE(tp->advmss); + + rtt_us = max(sf_rtt_us, rtt_us); + advmss = max(sf_advmss, advmss); + } + + msk->rcvq_space.rtt_us = rtt_us; + if (time < (rtt_us >> 3) || rtt_us == 0) + return; + + if (msk->rcvq_space.copied <= msk->rcvq_space.space) + goto new_measure; + + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int rcvmem, rcvbuf; + u64 rcvwin, grow; + + rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; + + grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); + + do_div(grow, msk->rcvq_space.space); + rcvwin += (grow << 1); + + rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); + while (tcp_win_from_space(sk, rcvmem) < advmss) + rcvmem += 128; + + do_div(rcvwin, advmss); + rcvbuf = min_t(u64, rcvwin * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + + if (rcvbuf > sk->sk_rcvbuf) { + u32 window_clamp; + + window_clamp = tcp_win_from_space(sk, rcvbuf); + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + + /* Make subflows follow along. If we do not do this, we + * get drops at subflow level if skbs can't be moved to + * the mptcp rx queue fast enough (announced rcv_win can + * exceed ssk->sk_rcvbuf). + */ + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk; + + ssk = mptcp_subflow_tcp_sock(subflow); + WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); + tcp_sk(ssk)->window_clamp = window_clamp; + } + } + } + + msk->rcvq_space.space = msk->rcvq_space.copied; +new_measure: + msk->rcvq_space.copied = 0; + msk->rcvq_space.time = mstamp; +} + static bool __mptcp_move_skbs(struct mptcp_sock *msk) { unsigned int moved = 0; @@ -972,7 +1026,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; int copied = 0; int target; long timeo; @@ -981,16 +1034,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return -EOPNOTSUPP; lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) { -fallback: - release_sock(sk); - pr_debug("fallback-read subflow=%p", - mptcp_subflow_ctx(ssock->sk)); - copied = sock_recvmsg(ssock, msg, flags); - return copied; - } - timeo = sock_rcvtimeo(sk, nonblock); len = min_t(size_t, len, INT_MAX); @@ -1056,9 +1099,6 @@ fallback: pr_debug("block timeout %ld", timeo); mptcp_wait_data(sk, &timeo); - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) - goto fallback; } if (skb_queue_empty(&sk->sk_receive_queue)) { @@ -1075,6 +1115,8 @@ fallback: set_bit(MPTCP_DATA_READY, &msk->flags); } out_err: + mptcp_rcv_space_adjust(msk, copied); + release_sock(sk); return copied; } @@ -1172,6 +1214,29 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) return 0; } +static void pm_work(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + + spin_lock_bh(&msk->pm.lock); + + pr_debug("msk=%p status=%x", msk, pm->status); + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); + mptcp_pm_nl_add_addr_received(msk); + } + if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); + mptcp_pm_nl_fully_established(msk); + } + if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); + mptcp_pm_nl_subflow_established(msk); + } + + spin_unlock_bh(&msk->pm.lock); +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -1188,6 +1253,9 @@ static void mptcp_worker(struct work_struct *work) __mptcp_flush_join_list(msk); __mptcp_move_skbs(msk); + if (msk->pm.status) + pm_work(msk); + if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); @@ -1283,7 +1351,12 @@ static int mptcp_init_sock(struct sock *sk) if (ret) return ret; + ret = __mptcp_socket_create(mptcp_sk(sk)); + if (ret) + return ret; + sk_sockets_allocated_inc(sk); + sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2]; return 0; @@ -1335,8 +1408,6 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how, break; } - /* Wake up anyone sleeping in poll. */ - ssk->sk_state_change(ssk); release_sock(ssk); } @@ -1375,7 +1446,6 @@ static void mptcp_close(struct sock *sk, long timeout) } mptcp_cancel_work(sk); - mptcp_pm_close(msk); __skb_queue_purge(&sk->sk_receive_queue); @@ -1448,20 +1518,6 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->token = subflow_req->token; msk->subflow = NULL; - if (unlikely(mptcp_token_new_accept(subflow_req->token, nsk))) { - nsk->sk_state = TCP_CLOSE; - bh_unlock_sock(nsk); - - /* we can't call into mptcp_close() here - possible BH context - * free the sock directly. - * sk_clone_lock() sets nsk refcnt to two, hence call sk_free() - * too. - */ - sk_common_release(nsk); - sk_free(nsk); - return NULL; - } - msk->write_seq = subflow_req->idsn + 1; atomic64_set(&msk->snd_una, msk->write_seq); if (mp_opt->mp_capable) { @@ -1482,6 +1538,22 @@ struct sock *mptcp_sk_clone(const struct sock *sk, return nsk; } +void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) +{ + const struct tcp_sock *tp = tcp_sk(ssk); + + msk->rcvq_space.copied = 0; + msk->rcvq_space.rtt_us = 0; + + msk->rcvq_space.time = tp->tcp_mstamp; + + /* initial rcv_space offering made to peer */ + msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, + TCP_INIT_CWND * tp->advmss); + if (msk->rcvq_space.space == 0) + msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; +} + static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, bool kern) { @@ -1501,7 +1573,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, return NULL; pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); - if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; @@ -1531,6 +1602,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, list_add(&subflow->node, &msk->conn_list); inet_sk_state_store(newsk, TCP_ESTABLISHED); + mptcp_rcv_space_init(msk, ssk); bh_unlock_sock(new_mptcp_sock); __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); @@ -1547,21 +1619,82 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - mptcp_token_destroy(msk->token); + mptcp_token_destroy(msk); if (msk->cached_ext) __skb_ext_put(msk->cached_ext); sk_sockets_allocated_dec(sk); } +static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int ret; + + switch (optname) { + case SO_REUSEPORT: + case SO_REUSEADDR: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); + if (ret == 0) { + if (optname == SO_REUSEPORT) + sk->sk_reuseport = ssock->sk->sk_reuseport; + else if (optname == SO_REUSEADDR) + sk->sk_reuse = ssock->sk->sk_reuse; + } + release_sock(sk); + return ret; + } + + return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); +} + +static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + int ret = -EOPNOTSUPP; + struct socket *ssock; + + switch (optname) { + case IPV6_V6ONLY: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); + if (ret == 0) + sk->sk_ipv6only = ssock->sk->sk_ipv6only; + + release_sock(sk); + break; + } + + return ret; +} + static int mptcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; + struct sock *ssk; pr_debug("msk=%p", msk); + if (level == SOL_SOCKET) + return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); + /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the * MPTCP-level socket to configure the subflows until the subflow @@ -1569,11 +1702,13 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname, * to the one remaining subflow. */ lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); + ssk = __mptcp_tcp_fallback(msk); release_sock(sk); - if (ssock) - return tcp_setsockopt(ssock->sk, level, optname, optval, - optlen); + if (ssk) + return tcp_setsockopt(ssk, level, optname, optval, optlen); + + if (level == SOL_IPV6) + return mptcp_setsockopt_v6(msk, optname, optval, optlen); return -EOPNOTSUPP; } @@ -1582,7 +1717,7 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; + struct sock *ssk; pr_debug("msk=%p", msk); @@ -1593,11 +1728,10 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, * to the one remaining subflow. */ lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); + ssk = __mptcp_tcp_fallback(msk); release_sock(sk); - if (ssock) - return tcp_getsockopt(ssock->sk, level, optname, optval, - option); + if (ssk) + return tcp_getsockopt(ssk, level, optname, optval, option); return -EOPNOTSUPP; } @@ -1636,6 +1770,20 @@ static void mptcp_release_cb(struct sock *sk) } } +static int mptcp_hash(struct sock *sk) +{ + /* should never be called, + * we hash the TCP subflows not the master socket + */ + WARN_ON_ONCE(1); + return 0; +} + +static void mptcp_unhash(struct sock *sk) +{ + /* called from sk_common_release(), but nothing to do here */ +} + static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1660,12 +1808,6 @@ void mptcp_finish_connect(struct sock *ssk) sk = subflow->conn; msk = mptcp_sk(sk); - if (!subflow->mp_capable) { - MPTCP_INC_STATS(sock_net(sk), - MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); - return; - } - pr_debug("msk=%p, token=%u", sk, subflow->token); mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); @@ -1679,13 +1821,14 @@ void mptcp_finish_connect(struct sock *ssk) */ WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->local_key, subflow->local_key); - WRITE_ONCE(msk->token, subflow->token); WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->ack_seq, ack_seq); WRITE_ONCE(msk->can_ack, 1); atomic64_set(&msk->snd_una, msk->write_seq); mptcp_pm_new_connection(msk, 0); + + mptcp_rcv_space_init(msk, ssk); } static void mptcp_sock_graft(struct sock *sk, struct socket *parent) @@ -1761,8 +1904,8 @@ static struct proto mptcp_prot = { .sendmsg = mptcp_sendmsg, .recvmsg = mptcp_recvmsg, .release_cb = mptcp_release_cb, - .hash = inet_hash, - .unhash = inet_unhash, + .hash = mptcp_hash, + .unhash = mptcp_unhash, .get_port = mptcp_get_port, .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, @@ -1771,6 +1914,7 @@ static struct proto mptcp_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), + .slab_flags = SLAB_TYPESAFE_BY_RCU, .no_autobind = true, }; @@ -1781,9 +1925,9 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int err; lock_sock(sock->sk); - ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + err = -EINVAL; goto unlock; } @@ -1800,6 +1944,7 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct mptcp_subflow_context *subflow; struct socket *ssock; int err; @@ -1812,19 +1957,24 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto do_connect; } - ssock = __mptcp_socket_create(msk, TCP_SYN_SENT); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + err = -EINVAL; goto unlock; } + mptcp_token_destroy(msk); + inet_sk_state_store(sock->sk, TCP_SYN_SENT); + subflow = mptcp_subflow_ctx(ssock->sk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) - mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0; + subflow->request_mptcp = 0; #endif + if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) + subflow->request_mptcp = 0; do_connect: err = ssock->ops->connect(ssock, uaddr, addr_len, flags); @@ -1843,42 +1993,6 @@ unlock: return err; } -static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) -{ - if (sock->sk->sk_prot == &tcp_prot) { - /* we are being invoked from __sys_accept4, after - * mptcp_accept() has just accepted a non-mp-capable - * flow: sk is a tcp_sk, not an mptcp one. - * - * Hand the socket over to tcp so all further socket ops - * bypass mptcp. - */ - sock->ops = &inet_stream_ops; - } - - return inet_getname(sock, uaddr, peer); -} - -#if IS_ENABLED(CONFIG_MPTCP_IPV6) -static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) -{ - if (sock->sk->sk_prot == &tcpv6_prot) { - /* we are being invoked from __sys_accept4 after - * mptcp_accept() has accepted a non-mp-capable - * subflow: sk is a tcp_sk, not mptcp. - * - * Hand the socket over to tcp so all further - * socket ops bypass mptcp. - */ - sock->ops = &inet6_stream_ops; - } - - return inet6_getname(sock, uaddr, peer); -} -#endif - static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); @@ -1888,12 +2002,14 @@ static int mptcp_listen(struct socket *sock, int backlog) pr_debug("msk=%p", msk); lock_sock(sock->sk); - ssock = __mptcp_socket_create(msk, TCP_LISTEN); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + err = -EINVAL; goto unlock; } + mptcp_token_destroy(msk); + inet_sk_state_store(sock->sk, TCP_LISTEN); sock_set_flag(sock->sk, SOCK_RCU_FREE); err = ssock->ops->listen(ssock, backlog); @@ -1906,15 +2022,6 @@ unlock: return err; } -static bool is_tcp_proto(const struct proto *p) -{ -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - return p == &tcp_prot || p == &tcpv6_prot; -#else - return p == &tcp_prot; -#endif -} - static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { @@ -1932,11 +2039,12 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssock) goto unlock_fail; + clear_bit(MPTCP_DATA_READY, &msk->flags); sock_hold(ssock->sk); release_sock(sock->sk); err = ssock->ops->accept(sock, newsock, flags, kern); - if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) { + if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { struct mptcp_sock *msk = mptcp_sk(newsock->sk); struct mptcp_subflow_context *subflow; @@ -1952,6 +2060,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, } } + if (inet_csk_listen_poll(ssock->sk)) + set_bit(MPTCP_DATA_READY, &msk->flags); sock_put(ssock->sk); return err; @@ -1960,39 +2070,36 @@ unlock_fail: return -EINVAL; } +static __poll_t mptcp_check_readable(struct mptcp_sock *msk) +{ + return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : + 0; +} + static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { struct sock *sk = sock->sk; struct mptcp_sock *msk; - struct socket *ssock; __poll_t mask = 0; + int state; msk = mptcp_sk(sk); - lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); - if (!ssock) - ssock = __mptcp_nmpc_socket(msk); - if (ssock) { - mask = ssock->ops->poll(file, ssock, wait); - release_sock(sk); - return mask; - } - - release_sock(sk); sock_poll_wait(file, sock, wait); - lock_sock(sk); - if (test_bit(MPTCP_DATA_READY, &msk->flags)) - mask = EPOLLIN | EPOLLRDNORM; - if (sk_stream_is_writeable(sk) && - test_bit(MPTCP_SEND_SPACE, &msk->flags)) - mask |= EPOLLOUT | EPOLLWRNORM; + state = inet_sk_state_load(sk); + if (state == TCP_LISTEN) + return mptcp_check_readable(msk); + + if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { + mask |= mptcp_check_readable(msk); + if (sk_stream_is_writeable(sk) && + test_bit(MPTCP_SEND_SPACE, &msk->flags)) + mask |= EPOLLOUT | EPOLLWRNORM; + } if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - release_sock(sk); - return mask; } @@ -2000,18 +2107,11 @@ static int mptcp_shutdown(struct socket *sock, int how) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct mptcp_subflow_context *subflow; - struct socket *ssock; int ret = 0; pr_debug("sk=%p, how=%d", msk, how); lock_sock(sock->sk); - ssock = __mptcp_tcp_fallback(msk); - if (ssock) { - release_sock(sock->sk); - return inet_shutdown(ssock, how); - } - if (how == SHUT_WR || how == SHUT_RDWR) inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); @@ -2037,6 +2137,9 @@ static int mptcp_shutdown(struct socket *sock, int how) mptcp_subflow_shutdown(tcp_sk, how, 1, msk->write_seq); } + /* Wake up anyone sleeping in poll. */ + sock->sk->sk_state_change(sock->sk); + out_unlock: release_sock(sock->sk); @@ -2051,7 +2154,7 @@ static const struct proto_ops mptcp_stream_ops = { .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, - .getname = mptcp_v4_getname, + .getname = inet_getname, .poll = mptcp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, @@ -2077,7 +2180,7 @@ static struct inet_protosw mptcp_protosw = { .flags = INET_PROTOSW_ICSK, }; -void mptcp_proto_init(void) +void __init mptcp_proto_init(void) { mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; @@ -2086,6 +2189,7 @@ void mptcp_proto_init(void) mptcp_subflow_init(); mptcp_pm_init(); + mptcp_token_init(); if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); @@ -2104,7 +2208,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, - .getname = mptcp_v6_getname, + .getname = inet6_getname, .poll = mptcp_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, @@ -2139,7 +2243,7 @@ static struct inet_protosw mptcp_v6_protosw = { .flags = INET_PROTOSW_ICSK, }; -int mptcp_proto_v6_init(void) +int __init mptcp_proto_v6_init(void) { int err; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index c6eeaf3e8dcb..e5baaef5ec89 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -89,6 +89,7 @@ #define MPTCP_SEND_SPACE 1 #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 +#define MPTCP_FALLBACK_DONE 4 struct mptcp_options_received { u64 sndr_key; @@ -173,8 +174,6 @@ struct mptcp_pm_data { u8 local_addr_max; u8 subflows_max; u8 status; - - struct work_struct work; }; struct mptcp_data_frag { @@ -208,6 +207,12 @@ struct mptcp_sock { struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct sock *first; struct mptcp_pm_data pm; + struct { + u32 space; /* bytes copied in last measurement window */ + u32 copied; /* bytes copied in this measurement window */ + u64 time; /* start time of measurement window */ + u64 rtt_us; /* last maximum rtt of subflows */ + } rcvq_space; }; #define mptcp_for_each_subflow(__msk, __subflow) \ @@ -250,6 +255,7 @@ struct mptcp_subflow_request_sock { u32 local_nonce; u32 remote_nonce; struct mptcp_sock *msk; + struct hlist_nulls_node token_node; }; static inline struct mptcp_subflow_request_sock * @@ -337,7 +343,7 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) int mptcp_is_enabled(struct net *net); bool mptcp_subflow_data_available(struct sock *sk); -void mptcp_subflow_init(void); +void __init mptcp_subflow_init(void); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, int ifindex, @@ -355,14 +361,9 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk, inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } -extern const struct inet_connection_sock_af_ops ipv4_specific; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) -extern const struct inet_connection_sock_af_ops ipv6_specific; -#endif - -void mptcp_proto_init(void); +void __init mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) -int mptcp_proto_v6_init(void); +int __init mptcp_proto_v6_init(void); #endif struct sock *mptcp_sk_clone(const struct sock *sk, @@ -372,17 +373,27 @@ void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); +void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); void mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); +void __init mptcp_token_init(void); +static inline void mptcp_token_init_request(struct request_sock *req) +{ + mptcp_subflow_rsk(req)->token_node.pprev = NULL; +} + int mptcp_token_new_request(struct request_sock *req); -void mptcp_token_destroy_request(u32 token); +void mptcp_token_destroy_request(struct request_sock *req); int mptcp_token_new_connect(struct sock *sk); -int mptcp_token_new_accept(u32 token, struct sock *conn); +void mptcp_token_accept(struct mptcp_subflow_request_sock *r, + struct mptcp_sock *msk); struct mptcp_sock *mptcp_token_get_sock(u32 token); -void mptcp_token_destroy(u32 token); +struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, + long *s_num); +void mptcp_token_destroy(struct mptcp_sock *msk); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) @@ -399,9 +410,8 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); -void mptcp_pm_init(void); +void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); -void mptcp_pm_close(struct mptcp_sock *msk); void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); @@ -433,7 +443,7 @@ bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_addr_info *saddr); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); -void mptcp_pm_nl_init(void); +void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_data_init(struct mptcp_sock *msk); void mptcp_pm_nl_fully_established(struct mptcp_sock *msk); void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk); @@ -454,4 +464,46 @@ static inline bool before64(__u64 seq1, __u64 seq2) void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); +static inline bool __mptcp_check_fallback(struct mptcp_sock *msk) +{ + return test_bit(MPTCP_FALLBACK_DONE, &msk->flags); +} + +static inline bool mptcp_check_fallback(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + return __mptcp_check_fallback(msk); +} + +static inline void __mptcp_do_fallback(struct mptcp_sock *msk) +{ + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) { + pr_debug("TCP fallback already done (msk=%p)", msk); + return; + } + set_bit(MPTCP_FALLBACK_DONE, &msk->flags); +} + +static inline void mptcp_do_fallback(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + __mptcp_do_fallback(msk); +} + +#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) + +static inline bool subflow_simultaneous_connect(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = subflow->conn; + + return sk->sk_state == TCP_ESTABLISHED && + !mptcp_sk(parent)->pm.server_side && + !subflow->conn_finished; +} + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3838a0b3a21f..9f7f3772c13c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -29,40 +29,6 @@ static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); } -static int subflow_rebuild_header(struct sock *sk) -{ - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - int local_id, err = 0; - - if (subflow->request_mptcp && !subflow->token) { - pr_debug("subflow=%p", sk); - err = mptcp_token_new_connect(sk); - } else if (subflow->request_join && !subflow->local_nonce) { - struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn; - - pr_debug("subflow=%p", sk); - - do { - get_random_bytes(&subflow->local_nonce, sizeof(u32)); - } while (!subflow->local_nonce); - - if (subflow->local_id) - goto out; - - local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); - if (local_id < 0) - return -EINVAL; - - subflow->local_id = local_id; - } - -out: - if (err) - return err; - - return subflow->icsk_af_ops->rebuild_header(sk); -} - static void subflow_req_destructor(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); @@ -72,8 +38,7 @@ static void subflow_req_destructor(struct request_sock *req) if (subflow_req->msk) sock_put((struct sock *)subflow_req->msk); - if (subflow_req->mp_capable) - mptcp_token_destroy_request(subflow_req->token); + mptcp_token_destroy_request(req); tcp_request_sock_ops.destructor(req); } @@ -135,6 +100,7 @@ static void subflow_init_req(struct request_sock *req, subflow_req->mp_capable = 0; subflow_req->mp_join = 0; subflow_req->msk = NULL; + mptcp_token_init_request(req); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of @@ -222,7 +188,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_options_received mp_opt; struct sock *parent = subflow->conn; - struct tcp_sock *tp = tcp_sk(sk); subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); @@ -236,6 +201,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) return; subflow->conn_finished = 1; + subflow->ssn_offset = TCP_SKB_CB(skb)->seq; + pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp && mp_opt.mp_capable) { @@ -250,22 +217,23 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->remote_nonce = mp_opt.nonce; pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, subflow->thmac, subflow->remote_nonce); - } else if (subflow->request_mptcp) { - tp->is_mptcp = 0; + } else { + if (subflow->request_mptcp) + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); + mptcp_do_fallback(sk); + pr_fallback(mptcp_sk(subflow->conn)); } - if (!tp->is_mptcp) + if (mptcp_check_fallback(sk)) { + mptcp_rcv_space_init(mptcp_sk(parent), sk); return; + } if (subflow->mp_capable) { pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), subflow->remote_key); mptcp_finish_connect(sk); - - if (skb) { - pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); - subflow->ssn_offset = TCP_SKB_CB(skb)->seq; - } } else if (subflow->mp_join) { u8 hmac[SHA256_DIGEST_SIZE]; @@ -285,9 +253,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); - if (skb) - subflow->ssn_offset = TCP_SKB_CB(skb)->seq; - if (!mptcp_finish_join(sk)) goto do_reset; @@ -386,7 +351,7 @@ static void mptcp_sock_destruct(struct sock *sk) sock_orphan(sk); } - mptcp_token_destroy(mptcp_sk(sk)->token); + mptcp_token_destroy(mptcp_sk(sk)); inet_sock_destruct(sk); } @@ -505,6 +470,7 @@ create_child: */ new_msk->sk_destruct = mptcp_sock_destruct; mptcp_pm_new_connection(mptcp_sk(new_msk), 1); + mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); ctx->conn = new_msk; new_msk = NULL; @@ -562,7 +528,8 @@ enum mapping_status { MAPPING_OK, MAPPING_INVALID, MAPPING_EMPTY, - MAPPING_DATA_FIN + MAPPING_DATA_FIN, + MAPPING_DUMMY }; static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) @@ -626,6 +593,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk) if (!skb) return MAPPING_EMPTY; + if (mptcp_check_fallback(ssk)) + return MAPPING_DUMMY; + mpext = mptcp_get_ext(skb); if (!mpext || !mpext->use_map) { if (!subflow->map_valid && !skb->len) { @@ -767,6 +737,16 @@ static bool subflow_check_data_avail(struct sock *ssk) ssk->sk_err = EBADMSG; goto fatal; } + if (status == MAPPING_DUMMY) { + __mptcp_do_fallback(msk); + skb = skb_peek(&ssk->sk_receive_queue); + subflow->map_valid = 1; + subflow->map_seq = READ_ONCE(msk->ack_seq); + subflow->map_data_len = skb->len; + subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - + subflow->ssn_offset; + return true; + } if (status != MAPPING_OK) return false; @@ -890,14 +870,18 @@ static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; + struct mptcp_sock *msk; - if (!subflow->mp_capable && !subflow->mp_join) { - subflow->tcp_data_ready(sk); - + msk = mptcp_sk(parent); + if ((1 << inet_sk_state_load(sk)) & (TCPF_LISTEN | TCPF_CLOSE)) { + set_bit(MPTCP_DATA_READY, &msk->flags); parent->sk_data_ready(parent); return; } + WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && + !subflow->mp_join); + if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); } @@ -974,7 +958,9 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; struct sockaddr_storage addr; + int local_id = loc->id; struct socket *sf; + struct sock *ssk; u32 remote_token; int addrlen; int err; @@ -986,7 +972,20 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, if (err) return err; - subflow = mptcp_subflow_ctx(sf->sk); + ssk = sf->sk; + subflow = mptcp_subflow_ctx(ssk); + do { + get_random_bytes(&subflow->local_nonce, sizeof(u32)); + } while (!subflow->local_nonce); + + if (!local_id) { + err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk); + if (err < 0) + goto failed; + + local_id = err; + } + subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; subflow->token = msk->token; @@ -997,15 +996,16 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, if (loc->family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif - sf->sk->sk_bound_dev_if = ifindex; + ssk->sk_bound_dev_if = ifindex; err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); if (err) goto failed; mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); - pr_debug("msk=%p remote_token=%u", msk, remote_token); + pr_debug("msk=%p remote_token=%u local_id=%d", msk, remote_token, + local_id); subflow->remote_token = remote_token; - subflow->local_id = loc->id; + subflow->local_id = local_id; subflow->request_join = 1; subflow->request_bkup = 1; mptcp_info2sockaddr(remote, &addr); @@ -1118,11 +1118,22 @@ static void subflow_state_change(struct sock *sk) __subflow_state_change(sk); + if (subflow_simultaneous_connect(sk)) { + mptcp_do_fallback(sk); + mptcp_rcv_space_init(mptcp_sk(parent), sk); + pr_fallback(mptcp_sk(parent)); + subflow->conn_finished = 1; + if (inet_sk_state_load(parent) == TCP_SYN_SENT) { + inet_sk_state_store(parent, TCP_ESTABLISHED); + parent->sk_state_change(parent); + } + } + /* as recvmsg() does not acquire the subflow socket for ssk selection * a fin packet carrying a DSS can be unnoticed if we don't trigger * the data available machinery here. */ - if (subflow->mp_capable && mptcp_subflow_data_available(sk)) + if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); if (!(parent->sk_shutdown & RCV_SHUTDOWN) && @@ -1255,7 +1266,7 @@ static int subflow_ops_init(struct request_sock_ops *subflow_ops) return 0; } -void mptcp_subflow_init(void) +void __init mptcp_subflow_init(void) { subflow_request_sock_ops = tcp_request_sock_ops; if (subflow_ops_init(&subflow_request_sock_ops) != 0) @@ -1268,7 +1279,6 @@ void mptcp_subflow_init(void) subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; - subflow_specific.rebuild_header = subflow_rebuild_header; #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; @@ -1278,7 +1288,6 @@ void mptcp_subflow_init(void) subflow_v6_specific.conn_request = subflow_v6_conn_request; subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; - subflow_v6_specific.rebuild_header = subflow_rebuild_header; subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 33352dd99d4d..7d8106026081 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -24,7 +24,7 @@ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/radix-tree.h> +#include <linux/memblock.h> #include <linux/ip.h> #include <linux/tcp.h> #include <net/sock.h> @@ -33,10 +33,55 @@ #include <net/mptcp.h> #include "protocol.h" -static RADIX_TREE(token_tree, GFP_ATOMIC); -static RADIX_TREE(token_req_tree, GFP_ATOMIC); -static DEFINE_SPINLOCK(token_tree_lock); -static int token_used __read_mostly; +#define TOKEN_MAX_RETRIES 4 +#define TOKEN_MAX_CHAIN_LEN 4 + +struct token_bucket { + spinlock_t lock; + int chain_len; + struct hlist_nulls_head req_chain; + struct hlist_nulls_head msk_chain; +}; + +static struct token_bucket *token_hash __read_mostly; +static unsigned int token_mask __read_mostly; + +static struct token_bucket *token_bucket(u32 token) +{ + return &token_hash[token & token_mask]; +} + +/* called with bucket lock held */ +static struct mptcp_subflow_request_sock * +__token_lookup_req(struct token_bucket *t, u32 token) +{ + struct mptcp_subflow_request_sock *req; + struct hlist_nulls_node *pos; + + hlist_nulls_for_each_entry_rcu(req, pos, &t->req_chain, token_node) + if (req->token == token) + return req; + return NULL; +} + +/* called with bucket lock held */ +static struct mptcp_sock * +__token_lookup_msk(struct token_bucket *t, u32 token) +{ + struct hlist_nulls_node *pos; + struct sock *sk; + + sk_nulls_for_each_rcu(sk, pos, &t->msk_chain) + if (mptcp_sk(sk)->token == token) + return mptcp_sk(sk); + return NULL; +} + +static bool __token_bucket_busy(struct token_bucket *t, u32 token) +{ + return !token || t->chain_len >= TOKEN_MAX_CHAIN_LEN || + __token_lookup_req(t, token) || __token_lookup_msk(t, token); +} /** * mptcp_token_new_request - create new key/idsn/token for subflow_request @@ -52,30 +97,32 @@ static int token_used __read_mostly; int mptcp_token_new_request(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); - int err; - - while (1) { - u32 token; - - mptcp_crypto_key_gen_sha(&subflow_req->local_key, - &subflow_req->token, - &subflow_req->idsn); - pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", - req, subflow_req->local_key, subflow_req->token, - subflow_req->idsn); - - token = subflow_req->token; - spin_lock_bh(&token_tree_lock); - if (!radix_tree_lookup(&token_req_tree, token) && - !radix_tree_lookup(&token_tree, token)) - break; - spin_unlock_bh(&token_tree_lock); + int retries = TOKEN_MAX_RETRIES; + struct token_bucket *bucket; + u32 token; + +again: + mptcp_crypto_key_gen_sha(&subflow_req->local_key, + &subflow_req->token, + &subflow_req->idsn); + pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", + req, subflow_req->local_key, subflow_req->token, + subflow_req->idsn); + + token = subflow_req->token; + bucket = token_bucket(token); + spin_lock_bh(&bucket->lock); + if (__token_bucket_busy(bucket, token)) { + spin_unlock_bh(&bucket->lock); + if (!--retries) + return -EBUSY; + goto again; } - err = radix_tree_insert(&token_req_tree, - subflow_req->token, &token_used); - spin_unlock_bh(&token_tree_lock); - return err; + hlist_nulls_add_head_rcu(&subflow_req->token_node, &bucket->req_chain); + bucket->chain_len++; + spin_unlock_bh(&bucket->lock); + return 0; } /** @@ -97,48 +144,56 @@ int mptcp_token_new_request(struct request_sock *req) int mptcp_token_new_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct sock *mptcp_sock = subflow->conn; - int err; - - while (1) { - u32 token; + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + int retries = TOKEN_MAX_RETRIES; + struct token_bucket *bucket; - mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, - &subflow->idsn); + pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", + sk, subflow->local_key, subflow->token, subflow->idsn); - pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", - sk, subflow->local_key, subflow->token, subflow->idsn); +again: + mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, + &subflow->idsn); - token = subflow->token; - spin_lock_bh(&token_tree_lock); - if (!radix_tree_lookup(&token_req_tree, token) && - !radix_tree_lookup(&token_tree, token)) - break; - spin_unlock_bh(&token_tree_lock); + bucket = token_bucket(subflow->token); + spin_lock_bh(&bucket->lock); + if (__token_bucket_busy(bucket, subflow->token)) { + spin_unlock_bh(&bucket->lock); + if (!--retries) + return -EBUSY; + goto again; } - err = radix_tree_insert(&token_tree, subflow->token, mptcp_sock); - spin_unlock_bh(&token_tree_lock); - return err; + WRITE_ONCE(msk->token, subflow->token); + __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); + bucket->chain_len++; + spin_unlock_bh(&bucket->lock); + return 0; } /** - * mptcp_token_new_accept - insert token for later processing - * @token: the token to insert to the tree - * @conn: the just cloned socket linked to the new connection + * mptcp_token_accept - replace a req sk with full sock in token hash + * @req: the request socket to be removed + * @msk: the just cloned socket linked to the new connection * * Called when a SYN packet creates a new logical connection, i.e. * is not a join request. */ -int mptcp_token_new_accept(u32 token, struct sock *conn) +void mptcp_token_accept(struct mptcp_subflow_request_sock *req, + struct mptcp_sock *msk) { - int err; + struct mptcp_subflow_request_sock *pos; + struct token_bucket *bucket; - spin_lock_bh(&token_tree_lock); - err = radix_tree_insert(&token_tree, token, conn); - spin_unlock_bh(&token_tree_lock); + bucket = token_bucket(req->token); + spin_lock_bh(&bucket->lock); - return err; + /* pedantic lookup check for the moved token */ + pos = __token_lookup_req(bucket, req->token); + if (!WARN_ON_ONCE(pos != req)) + hlist_nulls_del_init_rcu(&req->token_node); + __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); + spin_unlock_bh(&bucket->lock); } /** @@ -152,45 +207,171 @@ int mptcp_token_new_accept(u32 token, struct sock *conn) */ struct mptcp_sock *mptcp_token_get_sock(u32 token) { - struct sock *conn; - - spin_lock_bh(&token_tree_lock); - conn = radix_tree_lookup(&token_tree, token); - if (conn) { - /* token still reserved? */ - if (conn == (struct sock *)&token_used) - conn = NULL; - else - sock_hold(conn); + struct hlist_nulls_node *pos; + struct token_bucket *bucket; + struct mptcp_sock *msk; + struct sock *sk; + + rcu_read_lock(); + bucket = token_bucket(token); + +again: + sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { + msk = mptcp_sk(sk); + if (READ_ONCE(msk->token) != token) + continue; + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + goto not_found; + if (READ_ONCE(msk->token) != token) { + sock_put(sk); + goto again; + } + goto found; + } + if (get_nulls_value(pos) != (token & token_mask)) + goto again; + +not_found: + msk = NULL; + +found: + rcu_read_unlock(); + return msk; +} +EXPORT_SYMBOL_GPL(mptcp_token_get_sock); + +/** + * mptcp_token_iter_next - iterate over the token container from given pos + * @net: namespace to be iterated + * @s_slot: start slot number + * @s_num: start number inside the given lock + * + * This function returns the first mptcp connection structure found inside the + * token container starting from the specified position, or NULL. + * + * On successful iteration, the iterator is move to the next position and the + * the acquires a reference to the returned socket. + */ +struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, + long *s_num) +{ + struct mptcp_sock *ret = NULL; + struct hlist_nulls_node *pos; + int slot, num; + + for (slot = *s_slot; slot <= token_mask; *s_num = 0, slot++) { + struct token_bucket *bucket = &token_hash[slot]; + struct sock *sk; + + num = 0; + + if (hlist_nulls_empty(&bucket->msk_chain)) + continue; + + rcu_read_lock(); + sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { + ++num; + if (!net_eq(sock_net(sk), net)) + continue; + + if (num <= *s_num) + continue; + + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + continue; + + if (!net_eq(sock_net(sk), net)) { + sock_put(sk); + continue; + } + + ret = mptcp_sk(sk); + rcu_read_unlock(); + goto out; + } + rcu_read_unlock(); } - spin_unlock_bh(&token_tree_lock); - return mptcp_sk(conn); +out: + *s_slot = slot; + *s_num = num; + return ret; } +EXPORT_SYMBOL_GPL(mptcp_token_iter_next); /** * mptcp_token_destroy_request - remove mptcp connection/token - * @token: token of mptcp connection to remove + * @req: mptcp request socket dropping the token * - * Remove not-yet-fully-established incoming connection identified - * by @token. + * Remove the token associated to @req. */ -void mptcp_token_destroy_request(u32 token) +void mptcp_token_destroy_request(struct request_sock *req) { - spin_lock_bh(&token_tree_lock); - radix_tree_delete(&token_req_tree, token); - spin_unlock_bh(&token_tree_lock); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_subflow_request_sock *pos; + struct token_bucket *bucket; + + if (hlist_nulls_unhashed(&subflow_req->token_node)) + return; + + bucket = token_bucket(subflow_req->token); + spin_lock_bh(&bucket->lock); + pos = __token_lookup_req(bucket, subflow_req->token); + if (!WARN_ON_ONCE(pos != subflow_req)) { + hlist_nulls_del_init_rcu(&pos->token_node); + bucket->chain_len--; + } + spin_unlock_bh(&bucket->lock); } /** * mptcp_token_destroy - remove mptcp connection/token - * @token: token of mptcp connection to remove + * @msk: mptcp connection dropping the token * - * Remove the connection identified by @token. + * Remove the token associated to @msk */ -void mptcp_token_destroy(u32 token) +void mptcp_token_destroy(struct mptcp_sock *msk) { - spin_lock_bh(&token_tree_lock); - radix_tree_delete(&token_tree, token); - spin_unlock_bh(&token_tree_lock); + struct token_bucket *bucket; + struct mptcp_sock *pos; + + if (sk_unhashed((struct sock *)msk)) + return; + + bucket = token_bucket(msk->token); + spin_lock_bh(&bucket->lock); + pos = __token_lookup_msk(bucket, msk->token); + if (!WARN_ON_ONCE(pos != msk)) { + __sk_nulls_del_node_init_rcu((struct sock *)pos); + bucket->chain_len--; + } + spin_unlock_bh(&bucket->lock); } + +void __init mptcp_token_init(void) +{ + int i; + + token_hash = alloc_large_system_hash("MPTCP token", + sizeof(struct token_bucket), + 0, + 20,/* one slot per 1MB of memory */ + 0, + NULL, + &token_mask, + 0, + 64 * 1024); + for (i = 0; i < token_mask + 1; ++i) { + INIT_HLIST_NULLS_HEAD(&token_hash[i].req_chain, i); + INIT_HLIST_NULLS_HEAD(&token_hash[i].msk_chain, i); + spin_lock_init(&token_hash[i].lock); + } +} + +#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS) +EXPORT_SYMBOL_GPL(mptcp_token_new_request); +EXPORT_SYMBOL_GPL(mptcp_token_new_connect); +EXPORT_SYMBOL_GPL(mptcp_token_accept); +EXPORT_SYMBOL_GPL(mptcp_token_destroy_request); +EXPORT_SYMBOL_GPL(mptcp_token_destroy); +#endif diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c new file mode 100644 index 000000000000..e1bd6f0a0676 --- /dev/null +++ b/net/mptcp/token_test.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <kunit/test.h> + +#include "protocol.h" + +static struct mptcp_subflow_request_sock *build_req_sock(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req; + + req = kunit_kzalloc(test, sizeof(struct mptcp_subflow_request_sock), + GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, req); + mptcp_token_init_request((struct request_sock *)req); + return req; +} + +static void mptcp_token_test_req_basic(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req = build_req_sock(test); + struct mptcp_sock *null_msk = NULL; + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_request((struct request_sock *)req)); + KUNIT_EXPECT_NE(test, 0, (int)req->token); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(req->token)); + + /* cleanup */ + mptcp_token_destroy_request((struct request_sock *)req); +} + +static struct inet_connection_sock *build_icsk(struct kunit *test) +{ + struct inet_connection_sock *icsk; + + icsk = kunit_kzalloc(test, sizeof(struct inet_connection_sock), + GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, icsk); + return icsk; +} + +static struct mptcp_subflow_context *build_ctx(struct kunit *test) +{ + struct mptcp_subflow_context *ctx; + + ctx = kunit_kzalloc(test, sizeof(struct mptcp_subflow_context), + GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, ctx); + return ctx; +} + +static struct mptcp_sock *build_msk(struct kunit *test) +{ + struct mptcp_sock *msk; + + msk = kunit_kzalloc(test, sizeof(struct mptcp_sock), GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk); + refcount_set(&((struct sock *)msk)->sk_refcnt, 1); + return msk; +} + +static void mptcp_token_test_msk_basic(struct kunit *test) +{ + struct inet_connection_sock *icsk = build_icsk(test); + struct mptcp_subflow_context *ctx = build_ctx(test); + struct mptcp_sock *msk = build_msk(test); + struct mptcp_sock *null_msk = NULL; + struct sock *sk; + + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + ctx->conn = (struct sock *)msk; + sk = (struct sock *)msk; + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_connect((struct sock *)icsk)); + KUNIT_EXPECT_NE(test, 0, (int)ctx->token); + KUNIT_EXPECT_EQ(test, ctx->token, msk->token); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(ctx->token)); + KUNIT_EXPECT_EQ(test, 2, (int)refcount_read(&sk->sk_refcnt)); + + mptcp_token_destroy(msk); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(ctx->token)); +} + +static void mptcp_token_test_accept(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req = build_req_sock(test); + struct mptcp_sock *msk = build_msk(test); + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_request((struct request_sock *)req)); + msk->token = req->token; + mptcp_token_accept(req, msk); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + + /* this is now a no-op */ + mptcp_token_destroy_request((struct request_sock *)req); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + + /* cleanup */ + mptcp_token_destroy(msk); +} + +static void mptcp_token_test_destroyed(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req = build_req_sock(test); + struct mptcp_sock *msk = build_msk(test); + struct mptcp_sock *null_msk = NULL; + struct sock *sk; + + sk = (struct sock *)msk; + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_request((struct request_sock *)req)); + msk->token = req->token; + mptcp_token_accept(req, msk); + + /* simulate race on removal */ + refcount_set(&sk->sk_refcnt, 0); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(msk->token)); + + /* cleanup */ + mptcp_token_destroy(msk); +} + +static struct kunit_case mptcp_token_test_cases[] = { + KUNIT_CASE(mptcp_token_test_req_basic), + KUNIT_CASE(mptcp_token_test_msk_basic), + KUNIT_CASE(mptcp_token_test_accept), + KUNIT_CASE(mptcp_token_test_destroyed), + {} +}; + +static struct kunit_suite mptcp_token_suite = { + .name = "mptcp-token", + .test_cases = mptcp_token_test_cases, +}; + +kunit_test_suite(mptcp_token_suite); + +MODULE_LICENSE("GPL"); |