summary refs log tree commit diff
path: root/net/mptcp
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp')
-rw-r--r-- net/mptcp/Kconfig 24
-rw-r--r-- net/mptcp/Makefile 6
-rw-r--r-- net/mptcp/crypto.c 63
-rw-r--r-- net/mptcp/crypto_test.c 72
-rw-r--r-- net/mptcp/mptcp_diag.c 169
-rw-r--r-- net/mptcp/options.c 9
-rw-r--r-- net/mptcp/pm.c 46
-rw-r--r-- net/mptcp/pm_netlink.c 2
-rw-r--r-- net/mptcp/protocol.c 526
-rw-r--r-- net/mptcp/protocol.h 84
-rw-r--r-- net/mptcp/subflow.c 131
-rw-r--r-- net/mptcp/token.c 339
-rw-r--r-- net/mptcp/token_test.c 140
13 files changed, 1131 insertions, 480 deletions
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
index a9ed3bf1d93f..698bc3525160 100644
--- a/net/mptcp/Kconfig
+++ b/net/mptcp/Kconfig
@@ -13,17 +13,29 @@ config MPTCP
if MPTCP
+config INET_MPTCP_DIAG
+ depends on INET_DIAG
+ def_tristate INET_DIAG
+
config MPTCP_IPV6
bool "MPTCP: IPv6 support for Multipath TCP"
select IPV6
default y
-config MPTCP_HMAC_TEST
- bool "Tests for MPTCP HMAC implementation"
+endif
+
+config MPTCP_KUNIT_TESTS
+ tristate "This builds the MPTCP KUnit tests" if !KUNIT_ALL_TESTS
+ select MPTCP
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
help
- This option enable boot time self-test for the HMAC implementation
- used by the MPTCP code
+ Currently covers the MPTCP crypto and token helpers.
+ Only useful for kernel devs running the KUnit test harness; it is not
+ meant for inclusion into a production build.
- Say N if you are unsure.
+ For more information on KUnit and unit tests in general please refer
+ to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+ If unsure, say N.
-endif
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
index baa0640527c7..2360cbd27d59 100644
--- a/net/mptcp/Makefile
+++ b/net/mptcp/Makefile
@@ -3,3 +3,9 @@ obj-$(CONFIG_MPTCP) += mptcp.o
mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
mib.o pm_netlink.o
+
+obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
+
+mptcp_crypto_test-objs := crypto_test.o
+mptcp_token_test-objs := token_test.o
+obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
index 3d980713a9e2..6c4ea979dfd4 100644
--- a/net/mptcp/crypto.c
+++ b/net/mptcp/crypto.c
@@ -87,65 +87,6 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
sha256_final(&state, (u8 *)hmac);
}
-#ifdef CONFIG_MPTCP_HMAC_TEST
-struct test_cast {
- char *key;
- char *msg;
- char *result;
-};
-
-/* we can't reuse RFC 4231 test vectors, as we have constraint on the
- * input and key size.
- */
-static struct test_cast tests[] = {
- {
- .key = "0b0b0b0b0b0b0b0b",
- .msg = "48692054",
- .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa",
- },
- {
- .key = "aaaaaaaaaaaaaaaa",
- .msg = "dddddddd",
- .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9",
- },
- {
- .key = "0102030405060708",
- .msg = "cdcdcdcd",
- .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d",
- },
-};
-
-static int __init test_mptcp_crypto(void)
-{
- char hmac[32], hmac_hex[65];
- u32 nonce1, nonce2;
- u64 key1, key2;
- u8 msg[8];
- int i, j;
-
- for (i = 0; i < ARRAY_SIZE(tests); ++i) {
- /* mptcp hmap will convert to be before computing the hmac */
- key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0]));
- key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8]));
- nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0]));
- nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4]));
-
- put_unaligned_be32(nonce1, &msg[0]);
- put_unaligned_be32(nonce2, &msg[4]);
-
- mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
- for (j = 0; j < 32; ++j)
- sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff);
- hmac_hex[64] = 0;
-
- if (memcmp(hmac_hex, tests[i].result, 64))
- pr_err("test %d failed, got %s expected %s", i,
- hmac_hex, tests[i].result);
- else
- pr_info("test %d [ ok ]", i);
- }
- return 0;
-}
-
-late_initcall(test_mptcp_crypto);
+#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS)
+EXPORT_SYMBOL_GPL(mptcp_crypto_hmac_sha);
#endif
diff --git a/net/mptcp/crypto_test.c b/net/mptcp/crypto_test.c
new file mode 100644
index 000000000000..017248dea038
--- /dev/null
+++ b/net/mptcp/crypto_test.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <kunit/test.h>
+
+#include "protocol.h"
+
+struct test_case {
+ char *key;
+ char *msg;
+ char *result;
+};
+
+/* we can't reuse RFC 4231 test vectors, as we have constraint on the
+ * input and key size.
+ */
+static struct test_case tests[] = {
+ {
+ .key = "0b0b0b0b0b0b0b0b",
+ .msg = "48692054",
+ .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa",
+ },
+ {
+ .key = "aaaaaaaaaaaaaaaa",
+ .msg = "dddddddd",
+ .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9",
+ },
+ {
+ .key = "0102030405060708",
+ .msg = "cdcdcdcd",
+ .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d",
+ },
+};
+
+static void mptcp_crypto_test_basic(struct kunit *test)
+{
+ char hmac[32], hmac_hex[65];
+ u32 nonce1, nonce2;
+ u64 key1, key2;
+ u8 msg[8];
+ int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(tests); ++i) {
+ /* mptcp hmac will convert to big-endian before computing the hmac */
+ key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0]));
+ key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8]));
+ nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0]));
+ nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4]));
+
+ put_unaligned_be32(nonce1, &msg[0]);
+ put_unaligned_be32(nonce2, &msg[4]);
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
+ for (j = 0; j < 32; ++j)
+ sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff);
+ hmac_hex[64] = 0;
+
+ KUNIT_EXPECT_STREQ(test, &hmac_hex[0], tests[i].result);
+ }
+}
+
+static struct kunit_case mptcp_crypto_test_cases[] = {
+ KUNIT_CASE(mptcp_crypto_test_basic),
+ {}
+};
+
+static struct kunit_suite mptcp_crypto_suite = {
+ .name = "mptcp-crypto",
+ .test_cases = mptcp_crypto_test_cases,
+};
+
+kunit_test_suite(mptcp_crypto_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
new file mode 100644
index 000000000000..5f390a97f556
--- /dev/null
+++ b/net/mptcp/mptcp_diag.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+/* MPTCP socket monitoring support
+ *
+ * Copyright (c) 2020 Red Hat
+ *
+ * Author: Paolo Abeni <pabeni@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/inet_diag.h>
+#include <net/netlink.h>
+#include <uapi/linux/mptcp.h>
+#include "protocol.h"
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req,
+ struct nlattr *bc, bool net_admin)
+{
+ if (!inet_diag_bc_sk(bc, sk))
+ return 0;
+
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI,
+ net_admin);
+}
+
+static int mptcp_diag_dump_one(struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req)
+{
+ struct sk_buff *in_skb = cb->skb;
+ struct mptcp_sock *msk = NULL;
+ struct sk_buff *rep;
+ int err = -ENOENT;
+ struct net *net;
+ struct sock *sk;
+
+ net = sock_net(in_skb->sk);
+ msk = mptcp_token_get_sock(req->id.idiag_cookie[0]);
+ if (!msk)
+ goto out_nosk;
+
+ err = -ENOMEM;
+ sk = (struct sock *)msk;
+ rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
+ inet_diag_msg_attrs_size() +
+ nla_total_size(sizeof(struct mptcp_info)) +
+ nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
+ GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(rep);
+ goto out;
+ }
+ err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+out:
+ sock_put(sk);
+
+out_nosk:
+ return err;
+}
+
+static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r)
+{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct net *net = sock_net(skb->sk);
+ struct inet_diag_dump_data *cb_data;
+ struct mptcp_sock *msk;
+ struct nlattr *bc;
+
+ cb_data = cb->data;
+ bc = cb_data->inet_diag_nla_bc;
+
+ while ((msk = mptcp_token_iter_next(net, &cb->args[0], &cb->args[1])) !=
+ NULL) {
+ struct inet_sock *inet = (struct inet_sock *)msk;
+ struct sock *sk = (struct sock *)msk;
+ int ret = 0;
+
+ if (!(r->idiag_states & (1 << sk->sk_state)))
+ goto next;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next;
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next;
+ if (r->id.idiag_dport != inet->inet_dport &&
+ r->id.idiag_dport)
+ goto next;
+
+ ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin);
+next:
+ sock_put(sk);
+ if (ret < 0) {
+ /* will retry on the same position */
+ cb->args[1]--;
+ break;
+ }
+ cond_resched();
+ }
+}
+
+static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *_info)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_info *info = _info;
+ u32 flags = 0;
+ bool slow;
+ u8 val;
+
+ r->idiag_rqueue = sk_rmem_alloc_get(sk);
+ r->idiag_wqueue = sk_wmem_alloc_get(sk);
+ if (!info)
+ return;
+
+ slow = lock_sock_fast(sk);
+ info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
+ info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
+ info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
+ info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max);
+ val = READ_ONCE(msk->pm.add_addr_signal_max);
+ info->mptcpi_add_addr_signal_max = val;
+ val = READ_ONCE(msk->pm.add_addr_accept_max);
+ info->mptcpi_add_addr_accepted_max = val;
+ if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags))
+ flags |= MPTCP_INFO_FLAG_FALLBACK;
+ if (READ_ONCE(msk->can_ack))
+ flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
+ info->mptcpi_flags = flags;
+ info->mptcpi_token = READ_ONCE(msk->token);
+ info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
+ info->mptcpi_snd_una = atomic64_read(&msk->snd_una);
+ info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
+ unlock_sock_fast(sk, slow);
+}
+
+static const struct inet_diag_handler mptcp_diag_handler = {
+ .dump = mptcp_diag_dump,
+ .dump_one = mptcp_diag_dump_one,
+ .idiag_get_info = mptcp_diag_get_info,
+ .idiag_type = IPPROTO_MPTCP,
+ .idiag_info_size = sizeof(struct mptcp_info),
+};
+
+static int __init mptcp_diag_init(void)
+{
+ return inet_diag_register(&mptcp_diag_handler);
+}
+
+static void __exit mptcp_diag_exit(void)
+{
+ inet_diag_unregister(&mptcp_diag_handler);
+}
+
+module_init(mptcp_diag_init);
+module_exit(mptcp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 8f940be42f98..19707c07efc1 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -624,6 +624,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
opts->suboptions = 0;
+ if (unlikely(mptcp_check_fallback(sk)))
+ return false;
+
if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
ret = true;
else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
@@ -714,7 +717,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
*/
if (!mp_opt->mp_capable) {
subflow->mp_capable = 0;
- tcp_sk(sk)->is_mptcp = 0;
+ pr_fallback(msk);
+ __mptcp_do_fallback(msk);
return false;
}
@@ -814,6 +818,9 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
struct mptcp_options_received mp_opt;
struct mptcp_ext *mpext;
+ if (__mptcp_check_fallback(msk))
+ return;
+
mptcp_get_options(skb, &mp_opt);
if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
return;
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 977d9c8b1453..a8ad20559aaa 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -10,8 +10,6 @@
#include <net/mptcp.h>
#include "protocol.h"
-static struct workqueue_struct *pm_wq;
-
/* path manager command handlers */
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
@@ -78,7 +76,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
return false;
msk->pm.status |= BIT(new_status);
- if (queue_work(pm_wq, &msk->pm.work))
+ if (schedule_work(&msk->work))
sock_hold((struct sock *)msk);
return true;
}
@@ -181,35 +179,6 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
return mptcp_pm_nl_get_local_id(msk, skc);
}
-static void pm_worker(struct work_struct *work)
-{
- struct mptcp_pm_data *pm = container_of(work, struct mptcp_pm_data,
- work);
- struct mptcp_sock *msk = container_of(pm, struct mptcp_sock, pm);
- struct sock *sk = (struct sock *)msk;
-
- lock_sock(sk);
- spin_lock_bh(&msk->pm.lock);
-
- pr_debug("msk=%p status=%x", msk, pm->status);
- if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
- pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
- mptcp_pm_nl_add_addr_received(msk);
- }
- if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
- pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
- mptcp_pm_nl_fully_established(msk);
- }
- if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
- pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
- mptcp_pm_nl_subflow_established(msk);
- }
-
- spin_unlock_bh(&msk->pm.lock);
- release_sock(sk);
- sock_put(sk);
-}
-
void mptcp_pm_data_init(struct mptcp_sock *msk)
{
msk->pm.add_addr_signaled = 0;
@@ -223,22 +192,11 @@ void mptcp_pm_data_init(struct mptcp_sock *msk)
msk->pm.status = 0;
spin_lock_init(&msk->pm.lock);
- INIT_WORK(&msk->pm.work, pm_worker);
mptcp_pm_nl_data_init(msk);
}
-void mptcp_pm_close(struct mptcp_sock *msk)
-{
- if (cancel_work_sync(&msk->pm.work))
- sock_put((struct sock *)msk);
-}
-
-void mptcp_pm_init(void)
+void __init mptcp_pm_init(void)
{
- pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
- if (!pm_wq)
- panic("Failed to allocate workqueue");
-
mptcp_pm_nl_init();
}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index b78edf237ba0..c8820c4156e6 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -851,7 +851,7 @@ static struct pernet_operations mptcp_pm_pernet_ops = {
.size = sizeof(struct pm_nl_pernet),
};
-void mptcp_pm_nl_init(void)
+void __init mptcp_pm_nl_init(void)
{
if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0)
panic("Failed to register MPTCP PM pernet subsystem.\n");
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 3980fbb6f31e..dbe43e0cd734 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -52,18 +52,10 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
return msk->subflow;
}
-static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
-{
- return msk->first && !sk_is_mptcp(msk->first);
-}
-
-static struct socket *mptcp_is_tcpsk(struct sock *sk)
+static bool mptcp_is_tcpsk(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
- if (sock->sk != sk)
- return NULL;
-
if (unlikely(sk->sk_prot == &tcp_prot)) {
/* we are being invoked after mptcp_accept() has
* accepted a non-mp-capable flow: sk is a tcp_sk,
@@ -73,59 +65,37 @@ static struct socket *mptcp_is_tcpsk(struct sock *sk)
* bypass mptcp.
*/
sock->ops = &inet_stream_ops;
- return sock;
+ return true;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
} else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
sock->ops = &inet6_stream_ops;
- return sock;
+ return true;
#endif
}
- return NULL;
+ return false;
}
-static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
+static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
- struct socket *sock;
-
sock_owned_by_me((const struct sock *)msk);
- sock = mptcp_is_tcpsk((struct sock *)msk);
- if (unlikely(sock))
- return sock;
-
- if (likely(!__mptcp_needs_tcp_fallback(msk)))
+ if (likely(!__mptcp_check_fallback(msk)))
return NULL;
- return msk->subflow;
+ return msk->first;
}
-static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk)
-{
- return !msk->first;
-}
-
-static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
+static int __mptcp_socket_create(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
struct socket *ssock;
int err;
- ssock = __mptcp_tcp_fallback(msk);
- if (unlikely(ssock))
- return ssock;
-
- ssock = __mptcp_nmpc_socket(msk);
- if (ssock)
- goto set_state;
-
- if (!__mptcp_can_create_subflow(msk))
- return ERR_PTR(-EINVAL);
-
err = mptcp_subflow_create_socket(sk, &ssock);
if (err)
- return ERR_PTR(err);
+ return err;
msk->first = ssock->sk;
msk->subflow = ssock;
@@ -133,10 +103,12 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
list_add(&subflow->node, &msk->conn_list);
subflow->request_mptcp = 1;
-set_state:
- if (state != MPTCP_SAME_STATE)
- inet_sk_state_store(sk, state);
- return ssock;
+ /* accept() will wait on first subflow sk_wq, and we always wake up
+ * via msk->sk_socket
+ */
+ RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);
+
+ return 0;
}
static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
@@ -207,13 +179,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
return false;
}
- if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf);
-
- if (rcvbuf > sk->sk_rcvbuf)
- sk->sk_rcvbuf = rcvbuf;
- }
-
tp = tcp_sk(ssk);
do {
u32 map_remaining, offset;
@@ -229,6 +194,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
if (!skb)
break;
+ if (__mptcp_check_fallback(msk)) {
+ /* if we are running under the workqueue, TCP could have
+ * collapsed skbs between dummy map creation and now;
+ * be sure to adjust the size
+ */
+ map_remaining = skb->len;
+ subflow->map_data_len = skb->len;
+ }
+
offset = seq - TCP_SKB_CB(skb)->seq;
fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
if (fin) {
@@ -466,8 +440,15 @@ static void mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
- u64 snd_una = atomic64_read(&msk->snd_una);
bool cleaned = false;
+ u64 snd_una;
+
+ /* on fallback we just need to ignore snd_una, as this is really
+ * plain TCP
+ */
+ if (__mptcp_check_fallback(msk))
+ atomic64_set(&msk->snd_una, msk->write_seq);
+ snd_una = atomic64_read(&msk->snd_una);
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
@@ -740,7 +721,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int mss_now = 0, size_goal = 0, ret = 0;
struct mptcp_sock *msk = mptcp_sk(sk);
struct page_frag *pfrag;
- struct socket *ssock;
size_t copied = 0;
struct sock *ssk;
bool tx_ok;
@@ -759,15 +739,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto out;
}
-fallback:
- ssock = __mptcp_tcp_fallback(msk);
- if (unlikely(ssock)) {
- release_sock(sk);
- pr_debug("fallback passthrough");
- ret = sock_sendmsg(ssock, msg);
- return ret >= 0 ? ret + copied : (copied ? copied : ret);
- }
-
pfrag = sk_page_frag(sk);
restart:
mptcp_clean_una(sk);
@@ -819,17 +790,6 @@ wait_for_sndbuf:
}
break;
}
- if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
- /* Can happen for passive sockets:
- * 3WHS negotiated MPTCP, but first packet after is
- * plain TCP (e.g. due to middlebox filtering unknown
- * options).
- *
- * Fall back to TCP.
- */
- release_sock(ssk);
- goto fallback;
- }
copied += ret;
@@ -949,6 +909,100 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
return copied;
}
+/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
+ *
+ * Only difference: Use highest rtt estimate of the subflows in use.
+ */
+static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ u32 time, advmss = 1;
+ u64 rtt_us, mstamp;
+
+ sock_owned_by_me(sk);
+
+ if (copied <= 0)
+ return;
+
+ msk->rcvq_space.copied += copied;
+
+ mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
+ time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);
+
+ rtt_us = msk->rcvq_space.rtt_us;
+ if (rtt_us && time < (rtt_us >> 3))
+ return;
+
+ rtt_us = 0;
+ mptcp_for_each_subflow(msk, subflow) {
+ const struct tcp_sock *tp;
+ u64 sf_rtt_us;
+ u32 sf_advmss;
+
+ tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));
+
+ sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
+ sf_advmss = READ_ONCE(tp->advmss);
+
+ rtt_us = max(sf_rtt_us, rtt_us);
+ advmss = max(sf_advmss, advmss);
+ }
+
+ msk->rcvq_space.rtt_us = rtt_us;
+ if (time < (rtt_us >> 3) || rtt_us == 0)
+ return;
+
+ if (msk->rcvq_space.copied <= msk->rcvq_space.space)
+ goto new_measure;
+
+ if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+ int rcvmem, rcvbuf;
+ u64 rcvwin, grow;
+
+ rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
+
+ grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
+
+ do_div(grow, msk->rcvq_space.space);
+ rcvwin += (grow << 1);
+
+ rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
+ while (tcp_win_from_space(sk, rcvmem) < advmss)
+ rcvmem += 128;
+
+ do_div(rcvwin, advmss);
+ rcvbuf = min_t(u64, rcvwin * rcvmem,
+ sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+
+ if (rcvbuf > sk->sk_rcvbuf) {
+ u32 window_clamp;
+
+ window_clamp = tcp_win_from_space(sk, rcvbuf);
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+
+ /* Make subflows follow along. If we do not do this, we
+ * get drops at subflow level if skbs can't be moved to
+ * the mptcp rx queue fast enough (announced rcv_win can
+ * exceed ssk->sk_rcvbuf).
+ */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk;
+
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
+ tcp_sk(ssk)->window_clamp = window_clamp;
+ }
+ }
+ }
+
+ msk->rcvq_space.space = msk->rcvq_space.copied;
+new_measure:
+ msk->rcvq_space.copied = 0;
+ msk->rcvq_space.time = mstamp;
+}
+
static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
unsigned int moved = 0;
@@ -972,7 +1026,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- struct socket *ssock;
int copied = 0;
int target;
long timeo;
@@ -981,16 +1034,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
return -EOPNOTSUPP;
lock_sock(sk);
- ssock = __mptcp_tcp_fallback(msk);
- if (unlikely(ssock)) {
-fallback:
- release_sock(sk);
- pr_debug("fallback-read subflow=%p",
- mptcp_subflow_ctx(ssock->sk));
- copied = sock_recvmsg(ssock, msg, flags);
- return copied;
- }
-
timeo = sock_rcvtimeo(sk, nonblock);
len = min_t(size_t, len, INT_MAX);
@@ -1056,9 +1099,6 @@ fallback:
pr_debug("block timeout %ld", timeo);
mptcp_wait_data(sk, &timeo);
- ssock = __mptcp_tcp_fallback(msk);
- if (unlikely(ssock))
- goto fallback;
}
if (skb_queue_empty(&sk->sk_receive_queue)) {
@@ -1075,6 +1115,8 @@ fallback:
set_bit(MPTCP_DATA_READY, &msk->flags);
}
out_err:
+ mptcp_rcv_space_adjust(msk, copied);
+
release_sock(sk);
return copied;
}
@@ -1172,6 +1214,29 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
return 0;
}
+static void pm_work(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ spin_lock_bh(&msk->pm.lock);
+
+ pr_debug("msk=%p status=%x", msk, pm->status);
+ if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
+ pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
+ mptcp_pm_nl_add_addr_received(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
+ mptcp_pm_nl_fully_established(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
+ mptcp_pm_nl_subflow_established(msk);
+ }
+
+ spin_unlock_bh(&msk->pm.lock);
+}
+
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
@@ -1188,6 +1253,9 @@ static void mptcp_worker(struct work_struct *work)
__mptcp_flush_join_list(msk);
__mptcp_move_skbs(msk);
+ if (msk->pm.status)
+ pm_work(msk);
+
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
mptcp_check_for_eof(msk);
@@ -1283,7 +1351,12 @@ static int mptcp_init_sock(struct sock *sk)
if (ret)
return ret;
+ ret = __mptcp_socket_create(mptcp_sk(sk));
+ if (ret)
+ return ret;
+
sk_sockets_allocated_inc(sk);
+ sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
return 0;
@@ -1335,8 +1408,6 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how,
break;
}
- /* Wake up anyone sleeping in poll. */
- ssk->sk_state_change(ssk);
release_sock(ssk);
}
@@ -1375,7 +1446,6 @@ static void mptcp_close(struct sock *sk, long timeout)
}
mptcp_cancel_work(sk);
- mptcp_pm_close(msk);
__skb_queue_purge(&sk->sk_receive_queue);
@@ -1448,20 +1518,6 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->token = subflow_req->token;
msk->subflow = NULL;
- if (unlikely(mptcp_token_new_accept(subflow_req->token, nsk))) {
- nsk->sk_state = TCP_CLOSE;
- bh_unlock_sock(nsk);
-
- /* we can't call into mptcp_close() here - possible BH context
- * free the sock directly.
- * sk_clone_lock() sets nsk refcnt to two, hence call sk_free()
- * too.
- */
- sk_common_release(nsk);
- sk_free(nsk);
- return NULL;
- }
-
msk->write_seq = subflow_req->idsn + 1;
atomic64_set(&msk->snd_una, msk->write_seq);
if (mp_opt->mp_capable) {
@@ -1482,6 +1538,22 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
return nsk;
}
+void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
+{
+ const struct tcp_sock *tp = tcp_sk(ssk);
+
+ msk->rcvq_space.copied = 0;
+ msk->rcvq_space.rtt_us = 0;
+
+ msk->rcvq_space.time = tp->tcp_mstamp;
+
+ /* initial rcv_space offering made to peer */
+ msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
+ TCP_INIT_CWND * tp->advmss);
+ if (msk->rcvq_space.space == 0)
+ msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
+}
+
static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
bool kern)
{
@@ -1501,7 +1573,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
return NULL;
pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
-
if (sk_is_mptcp(newsk)) {
struct mptcp_subflow_context *subflow;
struct sock *new_mptcp_sock;
@@ -1531,6 +1602,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
list_add(&subflow->node, &msk->conn_list);
inet_sk_state_store(newsk, TCP_ESTABLISHED);
+ mptcp_rcv_space_init(msk, ssk);
bh_unlock_sock(new_mptcp_sock);
__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
@@ -1547,21 +1619,82 @@ static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- mptcp_token_destroy(msk->token);
+ mptcp_token_destroy(msk);
if (msk->cached_ext)
__skb_ext_put(msk->cached_ext);
sk_sockets_allocated_dec(sk);
}
+static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct socket *ssock;
+ int ret;
+
+ switch (optname) {
+ case SO_REUSEPORT:
+ case SO_REUSEADDR:
+ lock_sock(sk);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+
+ ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
+ if (ret == 0) {
+ if (optname == SO_REUSEPORT)
+ sk->sk_reuseport = ssock->sk->sk_reuseport;
+ else if (optname == SO_REUSEADDR)
+ sk->sk_reuse = ssock->sk->sk_reuse;
+ }
+ release_sock(sk);
+ return ret;
+ }
+
+ return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
+}
+
+static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = (struct sock *)msk;
+ int ret = -EOPNOTSUPP;
+ struct socket *ssock;
+
+ switch (optname) {
+ case IPV6_V6ONLY:
+ lock_sock(sk);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+
+ ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
+ if (ret == 0)
+ sk->sk_ipv6only = ssock->sk->sk_ipv6only;
+
+ release_sock(sk);
+ break;
+ }
+
+ return ret;
+}
+
static int mptcp_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, unsigned int optlen)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- struct socket *ssock;
+ struct sock *ssk;
pr_debug("msk=%p", msk);
+ if (level == SOL_SOCKET)
+ return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);
+
/* @@ the meaning of setsockopt() when the socket is connected and
* there are multiple subflows is not yet defined. It is up to the
* MPTCP-level socket to configure the subflows until the subflow
@@ -1569,11 +1702,13 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname,
* to the one remaining subflow.
*/
lock_sock(sk);
- ssock = __mptcp_tcp_fallback(msk);
+ ssk = __mptcp_tcp_fallback(msk);
release_sock(sk);
- if (ssock)
- return tcp_setsockopt(ssock->sk, level, optname, optval,
- optlen);
+ if (ssk)
+ return tcp_setsockopt(ssk, level, optname, optval, optlen);
+
+ if (level == SOL_IPV6)
+ return mptcp_setsockopt_v6(msk, optname, optval, optlen);
return -EOPNOTSUPP;
}
@@ -1582,7 +1717,7 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *option)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- struct socket *ssock;
+ struct sock *ssk;
pr_debug("msk=%p", msk);
@@ -1593,11 +1728,10 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
* to the one remaining subflow.
*/
lock_sock(sk);
- ssock = __mptcp_tcp_fallback(msk);
+ ssk = __mptcp_tcp_fallback(msk);
release_sock(sk);
- if (ssock)
- return tcp_getsockopt(ssock->sk, level, optname, optval,
- option);
+ if (ssk)
+ return tcp_getsockopt(ssk, level, optname, optval, option);
return -EOPNOTSUPP;
}
@@ -1636,6 +1770,20 @@ static void mptcp_release_cb(struct sock *sk)
}
}
+static int mptcp_hash(struct sock *sk)
+{
+ /* should never be called,
+ * we hash the TCP subflows not the master socket
+ */
+ WARN_ON_ONCE(1);
+ return 0;
+}
+
+static void mptcp_unhash(struct sock *sk)
+{
+ /* called from sk_common_release(), but nothing to do here */
+}
+
static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1660,12 +1808,6 @@ void mptcp_finish_connect(struct sock *ssk)
sk = subflow->conn;
msk = mptcp_sk(sk);
- if (!subflow->mp_capable) {
- MPTCP_INC_STATS(sock_net(sk),
- MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
- return;
- }
-
pr_debug("msk=%p, token=%u", sk, subflow->token);
mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
@@ -1679,13 +1821,14 @@ void mptcp_finish_connect(struct sock *ssk)
*/
WRITE_ONCE(msk->remote_key, subflow->remote_key);
WRITE_ONCE(msk->local_key, subflow->local_key);
- WRITE_ONCE(msk->token, subflow->token);
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
WRITE_ONCE(msk->ack_seq, ack_seq);
WRITE_ONCE(msk->can_ack, 1);
atomic64_set(&msk->snd_una, msk->write_seq);
mptcp_pm_new_connection(msk, 0);
+
+ mptcp_rcv_space_init(msk, ssk);
}
static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
@@ -1761,8 +1904,8 @@ static struct proto mptcp_prot = {
.sendmsg = mptcp_sendmsg,
.recvmsg = mptcp_recvmsg,
.release_cb = mptcp_release_cb,
- .hash = inet_hash,
- .unhash = inet_unhash,
+ .hash = mptcp_hash,
+ .unhash = mptcp_unhash,
.get_port = mptcp_get_port,
.sockets_allocated = &mptcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
@@ -1771,6 +1914,7 @@ static struct proto mptcp_prot = {
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_mem = sysctl_tcp_mem,
.obj_size = sizeof(struct mptcp_sock),
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.no_autobind = true,
};
@@ -1781,9 +1925,9 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
int err;
lock_sock(sock->sk);
- ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
- if (IS_ERR(ssock)) {
- err = PTR_ERR(ssock);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock) {
+ err = -EINVAL;
goto unlock;
}
@@ -1800,6 +1944,7 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct mptcp_subflow_context *subflow;
struct socket *ssock;
int err;
@@ -1812,19 +1957,24 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
goto do_connect;
}
- ssock = __mptcp_socket_create(msk, TCP_SYN_SENT);
- if (IS_ERR(ssock)) {
- err = PTR_ERR(ssock);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock) {
+ err = -EINVAL;
goto unlock;
}
+ mptcp_token_destroy(msk);
+ inet_sk_state_store(sock->sk, TCP_SYN_SENT);
+ subflow = mptcp_subflow_ctx(ssock->sk);
#ifdef CONFIG_TCP_MD5SIG
/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
* TCP option space.
*/
if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
- mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0;
+ subflow->request_mptcp = 0;
#endif
+ if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
+ subflow->request_mptcp = 0;
do_connect:
err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
@@ -1843,42 +1993,6 @@ unlock:
return err;
}
-static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr,
- int peer)
-{
- if (sock->sk->sk_prot == &tcp_prot) {
- /* we are being invoked from __sys_accept4, after
- * mptcp_accept() has just accepted a non-mp-capable
- * flow: sk is a tcp_sk, not an mptcp one.
- *
- * Hand the socket over to tcp so all further socket ops
- * bypass mptcp.
- */
- sock->ops = &inet_stream_ops;
- }
-
- return inet_getname(sock, uaddr, peer);
-}
-
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
-static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr,
- int peer)
-{
- if (sock->sk->sk_prot == &tcpv6_prot) {
- /* we are being invoked from __sys_accept4 after
- * mptcp_accept() has accepted a non-mp-capable
- * subflow: sk is a tcp_sk, not mptcp.
- *
- * Hand the socket over to tcp so all further
- * socket ops bypass mptcp.
- */
- sock->ops = &inet6_stream_ops;
- }
-
- return inet6_getname(sock, uaddr, peer);
-}
-#endif
-
static int mptcp_listen(struct socket *sock, int backlog)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
@@ -1888,12 +2002,14 @@ static int mptcp_listen(struct socket *sock, int backlog)
pr_debug("msk=%p", msk);
lock_sock(sock->sk);
- ssock = __mptcp_socket_create(msk, TCP_LISTEN);
- if (IS_ERR(ssock)) {
- err = PTR_ERR(ssock);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock) {
+ err = -EINVAL;
goto unlock;
}
+ mptcp_token_destroy(msk);
+ inet_sk_state_store(sock->sk, TCP_LISTEN);
sock_set_flag(sock->sk, SOCK_RCU_FREE);
err = ssock->ops->listen(ssock, backlog);
@@ -1906,15 +2022,6 @@ unlock:
return err;
}
-static bool is_tcp_proto(const struct proto *p)
-{
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- return p == &tcp_prot || p == &tcpv6_prot;
-#else
- return p == &tcp_prot;
-#endif
-}
-
static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
int flags, bool kern)
{
@@ -1932,11 +2039,12 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
if (!ssock)
goto unlock_fail;
+ clear_bit(MPTCP_DATA_READY, &msk->flags);
sock_hold(ssock->sk);
release_sock(sock->sk);
err = ssock->ops->accept(sock, newsock, flags, kern);
- if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) {
+ if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
struct mptcp_sock *msk = mptcp_sk(newsock->sk);
struct mptcp_subflow_context *subflow;
@@ -1952,6 +2060,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
}
}
+ if (inet_csk_listen_poll(ssock->sk))
+ set_bit(MPTCP_DATA_READY, &msk->flags);
sock_put(ssock->sk);
return err;
@@ -1960,39 +2070,36 @@ unlock_fail:
return -EINVAL;
}
+/* Translate the msk MPTCP_DATA_READY flag into poll bits: returns
+ * EPOLLIN | EPOLLRDNORM when the flag is set, 0 otherwise.
+ */
+static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
+{
+ return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM :
+ 0;
+}
+
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait)
{
struct sock *sk = sock->sk;
struct mptcp_sock *msk;
- struct socket *ssock;
__poll_t mask = 0;
+ int state;
msk = mptcp_sk(sk);
- lock_sock(sk);
- ssock = __mptcp_tcp_fallback(msk);
- if (!ssock)
- ssock = __mptcp_nmpc_socket(msk);
- if (ssock) {
- mask = ssock->ops->poll(file, ssock, wait);
- release_sock(sk);
- return mask;
- }
-
- release_sock(sk);
sock_poll_wait(file, sock, wait);
- lock_sock(sk);
- if (test_bit(MPTCP_DATA_READY, &msk->flags))
- mask = EPOLLIN | EPOLLRDNORM;
- if (sk_stream_is_writeable(sk) &&
- test_bit(MPTCP_SEND_SPACE, &msk->flags))
- mask |= EPOLLOUT | EPOLLWRNORM;
+ state = inet_sk_state_load(sk);
+ if (state == TCP_LISTEN)
+ return mptcp_check_readable(msk);
+
+ if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
+ mask |= mptcp_check_readable(msk);
+ if (sk_stream_is_writeable(sk) &&
+ test_bit(MPTCP_SEND_SPACE, &msk->flags))
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ }
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
- release_sock(sk);
-
return mask;
}
@@ -2000,18 +2107,11 @@ static int mptcp_shutdown(struct socket *sock, int how)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
struct mptcp_subflow_context *subflow;
- struct socket *ssock;
int ret = 0;
pr_debug("sk=%p, how=%d", msk, how);
lock_sock(sock->sk);
- ssock = __mptcp_tcp_fallback(msk);
- if (ssock) {
- release_sock(sock->sk);
- return inet_shutdown(ssock, how);
- }
-
if (how == SHUT_WR || how == SHUT_RDWR)
inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
@@ -2037,6 +2137,9 @@ static int mptcp_shutdown(struct socket *sock, int how)
mptcp_subflow_shutdown(tcp_sk, how, 1, msk->write_seq);
}
+ /* Wake up anyone sleeping in poll. */
+ sock->sk->sk_state_change(sock->sk);
+
out_unlock:
release_sock(sock->sk);
@@ -2051,7 +2154,7 @@ static const struct proto_ops mptcp_stream_ops = {
.connect = mptcp_stream_connect,
.socketpair = sock_no_socketpair,
.accept = mptcp_stream_accept,
- .getname = mptcp_v4_getname,
+ .getname = inet_getname,
.poll = mptcp_poll,
.ioctl = inet_ioctl,
.gettstamp = sock_gettstamp,
@@ -2077,7 +2180,7 @@ static struct inet_protosw mptcp_protosw = {
.flags = INET_PROTOSW_ICSK,
};
-void mptcp_proto_init(void)
+void __init mptcp_proto_init(void)
{
mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
@@ -2086,6 +2189,7 @@ void mptcp_proto_init(void)
mptcp_subflow_init();
mptcp_pm_init();
+ mptcp_token_init();
if (proto_register(&mptcp_prot, 1) != 0)
panic("Failed to register MPTCP proto.\n");
@@ -2104,7 +2208,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
.connect = mptcp_stream_connect,
.socketpair = sock_no_socketpair,
.accept = mptcp_stream_accept,
- .getname = mptcp_v6_getname,
+ .getname = inet6_getname,
.poll = mptcp_poll,
.ioctl = inet6_ioctl,
.gettstamp = sock_gettstamp,
@@ -2139,7 +2243,7 @@ static struct inet_protosw mptcp_v6_protosw = {
.flags = INET_PROTOSW_ICSK,
};
-int mptcp_proto_v6_init(void)
+int __init mptcp_proto_v6_init(void)
{
int err;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index c6eeaf3e8dcb..e5baaef5ec89 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -89,6 +89,7 @@
#define MPTCP_SEND_SPACE 1
#define MPTCP_WORK_RTX 2
#define MPTCP_WORK_EOF 3
+#define MPTCP_FALLBACK_DONE 4
struct mptcp_options_received {
u64 sndr_key;
@@ -173,8 +174,6 @@ struct mptcp_pm_data {
u8 local_addr_max;
u8 subflows_max;
u8 status;
-
- struct work_struct work;
};
struct mptcp_data_frag {
@@ -208,6 +207,12 @@ struct mptcp_sock {
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
struct sock *first;
struct mptcp_pm_data pm;
+ struct {
+ u32 space; /* bytes copied in last measurement window */
+ u32 copied; /* bytes copied in this measurement window */
+ u64 time; /* start time of measurement window */
+ u64 rtt_us; /* last maximum rtt of subflows */
+ } rcvq_space;
};
#define mptcp_for_each_subflow(__msk, __subflow) \
@@ -250,6 +255,7 @@ struct mptcp_subflow_request_sock {
u32 local_nonce;
u32 remote_nonce;
struct mptcp_sock *msk;
+ struct hlist_nulls_node token_node;
};
static inline struct mptcp_subflow_request_sock *
@@ -337,7 +343,7 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
int mptcp_is_enabled(struct net *net);
bool mptcp_subflow_data_available(struct sock *sk);
-void mptcp_subflow_init(void);
+void __init mptcp_subflow_init(void);
/* called with sk socket lock held */
int __mptcp_subflow_connect(struct sock *sk, int ifindex,
@@ -355,14 +361,9 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops;
}
-extern const struct inet_connection_sock_af_ops ipv4_specific;
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
-extern const struct inet_connection_sock_af_ops ipv6_specific;
-#endif
-
-void mptcp_proto_init(void);
+void __init mptcp_proto_init(void);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
-int mptcp_proto_v6_init(void);
+int __init mptcp_proto_v6_init(void);
#endif
struct sock *mptcp_sk_clone(const struct sock *sk,
@@ -372,17 +373,27 @@ void mptcp_get_options(const struct sk_buff *skb,
struct mptcp_options_received *mp_opt);
void mptcp_finish_connect(struct sock *sk);
+void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
void mptcp_data_acked(struct sock *sk);
void mptcp_subflow_eof(struct sock *sk);
+void __init mptcp_token_init(void);
+static inline void mptcp_token_init_request(struct request_sock *req)
+{
+ mptcp_subflow_rsk(req)->token_node.pprev = NULL;
+}
+
int mptcp_token_new_request(struct request_sock *req);
-void mptcp_token_destroy_request(u32 token);
+void mptcp_token_destroy_request(struct request_sock *req);
int mptcp_token_new_connect(struct sock *sk);
-int mptcp_token_new_accept(u32 token, struct sock *conn);
+void mptcp_token_accept(struct mptcp_subflow_request_sock *r,
+ struct mptcp_sock *msk);
struct mptcp_sock *mptcp_token_get_sock(u32 token);
-void mptcp_token_destroy(u32 token);
+struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot,
+ long *s_num);
+void mptcp_token_destroy(struct mptcp_sock *msk);
void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);
static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
@@ -399,9 +410,8 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
-void mptcp_pm_init(void);
+void __init mptcp_pm_init(void);
void mptcp_pm_data_init(struct mptcp_sock *msk);
-void mptcp_pm_close(struct mptcp_sock *msk);
void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side);
void mptcp_pm_fully_established(struct mptcp_sock *msk);
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
@@ -433,7 +443,7 @@ bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
struct mptcp_addr_info *saddr);
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
-void mptcp_pm_nl_init(void);
+void __init mptcp_pm_nl_init(void);
void mptcp_pm_nl_data_init(struct mptcp_sock *msk);
void mptcp_pm_nl_fully_established(struct mptcp_sock *msk);
void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk);
@@ -454,4 +464,46 @@ static inline bool before64(__u64 seq1, __u64 seq2)
void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
+static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
+{
+ return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
+}
+
+static inline bool mptcp_check_fallback(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+ return __mptcp_check_fallback(msk);
+}
+
+/* Mark @msk as fallen back by setting MPTCP_FALLBACK_DONE in msk->flags.
+ * If the flag is already set, only a debug message is emitted.
+ */
+static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
+{
+ if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) {
+ pr_debug("TCP fallback already done (msk=%p)", msk);
+ return;
+ }
+ set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
+}
+
+static inline void mptcp_do_fallback(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+ __mptcp_do_fallback(msk);
+}
+
+#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a)
+
+/* Return true when subflow @sk is already in TCP_ESTABLISHED state, the
+ * parent msk is not flagged as server side and the subflow has not yet set
+ * conn_finished.
+ * NOTE(review): presumably this combination only arises from a TCP
+ * simultaneous open, as the name suggests - confirm.
+ */
+static inline bool subflow_simultaneous_connect(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = subflow->conn;
+
+ return sk->sk_state == TCP_ESTABLISHED &&
+ !mptcp_sk(parent)->pm.server_side &&
+ !subflow->conn_finished;
+}
+
#endif /* __MPTCP_PROTOCOL_H */
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 3838a0b3a21f..9f7f3772c13c 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -29,40 +29,6 @@ static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}
-static int subflow_rebuild_header(struct sock *sk)
-{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- int local_id, err = 0;
-
- if (subflow->request_mptcp && !subflow->token) {
- pr_debug("subflow=%p", sk);
- err = mptcp_token_new_connect(sk);
- } else if (subflow->request_join && !subflow->local_nonce) {
- struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;
-
- pr_debug("subflow=%p", sk);
-
- do {
- get_random_bytes(&subflow->local_nonce, sizeof(u32));
- } while (!subflow->local_nonce);
-
- if (subflow->local_id)
- goto out;
-
- local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
- if (local_id < 0)
- return -EINVAL;
-
- subflow->local_id = local_id;
- }
-
-out:
- if (err)
- return err;
-
- return subflow->icsk_af_ops->rebuild_header(sk);
-}
-
static void subflow_req_destructor(struct request_sock *req)
{
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
@@ -72,8 +38,7 @@ static void subflow_req_destructor(struct request_sock *req)
if (subflow_req->msk)
sock_put((struct sock *)subflow_req->msk);
- if (subflow_req->mp_capable)
- mptcp_token_destroy_request(subflow_req->token);
+ mptcp_token_destroy_request(req);
tcp_request_sock_ops.destructor(req);
}
@@ -135,6 +100,7 @@ static void subflow_init_req(struct request_sock *req,
subflow_req->mp_capable = 0;
subflow_req->mp_join = 0;
subflow_req->msk = NULL;
+ mptcp_token_init_request(req);
#ifdef CONFIG_TCP_MD5SIG
/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
@@ -222,7 +188,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_options_received mp_opt;
struct sock *parent = subflow->conn;
- struct tcp_sock *tp = tcp_sk(sk);
subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
@@ -236,6 +201,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
return;
subflow->conn_finished = 1;
+ subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
+ pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
mptcp_get_options(skb, &mp_opt);
if (subflow->request_mptcp && mp_opt.mp_capable) {
@@ -250,22 +217,23 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->remote_nonce = mp_opt.nonce;
pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
subflow->thmac, subflow->remote_nonce);
- } else if (subflow->request_mptcp) {
- tp->is_mptcp = 0;
+ } else {
+ if (subflow->request_mptcp)
+ MPTCP_INC_STATS(sock_net(sk),
+ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
+ mptcp_do_fallback(sk);
+ pr_fallback(mptcp_sk(subflow->conn));
}
- if (!tp->is_mptcp)
+ if (mptcp_check_fallback(sk)) {
+ mptcp_rcv_space_init(mptcp_sk(parent), sk);
return;
+ }
if (subflow->mp_capable) {
pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
subflow->remote_key);
mptcp_finish_connect(sk);
-
- if (skb) {
- pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
- subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
- }
} else if (subflow->mp_join) {
u8 hmac[SHA256_DIGEST_SIZE];
@@ -285,9 +253,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);
- if (skb)
- subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
-
if (!mptcp_finish_join(sk))
goto do_reset;
@@ -386,7 +351,7 @@ static void mptcp_sock_destruct(struct sock *sk)
sock_orphan(sk);
}
- mptcp_token_destroy(mptcp_sk(sk)->token);
+ mptcp_token_destroy(mptcp_sk(sk));
inet_sock_destruct(sk);
}
@@ -505,6 +470,7 @@ create_child:
*/
new_msk->sk_destruct = mptcp_sock_destruct;
mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
+ mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
ctx->conn = new_msk;
new_msk = NULL;
@@ -562,7 +528,8 @@ enum mapping_status {
MAPPING_OK,
MAPPING_INVALID,
MAPPING_EMPTY,
- MAPPING_DATA_FIN
+ MAPPING_DATA_FIN,
+ MAPPING_DUMMY
};
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
@@ -626,6 +593,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
if (!skb)
return MAPPING_EMPTY;
+ if (mptcp_check_fallback(ssk))
+ return MAPPING_DUMMY;
+
mpext = mptcp_get_ext(skb);
if (!mpext || !mpext->use_map) {
if (!subflow->map_valid && !skb->len) {
@@ -767,6 +737,16 @@ static bool subflow_check_data_avail(struct sock *ssk)
ssk->sk_err = EBADMSG;
goto fatal;
}
+ if (status == MAPPING_DUMMY) {
+ __mptcp_do_fallback(msk);
+ skb = skb_peek(&ssk->sk_receive_queue);
+ subflow->map_valid = 1;
+ subflow->map_seq = READ_ONCE(msk->ack_seq);
+ subflow->map_data_len = skb->len;
+ subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
+ subflow->ssn_offset;
+ return true;
+ }
if (status != MAPPING_OK)
return false;
@@ -890,14 +870,18 @@ static void subflow_data_ready(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn;
+ struct mptcp_sock *msk;
- if (!subflow->mp_capable && !subflow->mp_join) {
- subflow->tcp_data_ready(sk);
-
+ msk = mptcp_sk(parent);
+ if ((1 << inet_sk_state_load(sk)) & (TCPF_LISTEN | TCPF_CLOSE)) {
+ set_bit(MPTCP_DATA_READY, &msk->flags);
parent->sk_data_ready(parent);
return;
}
+ WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
+ !subflow->mp_join);
+
if (mptcp_subflow_data_available(sk))
mptcp_data_ready(parent, sk);
}
@@ -974,7 +958,9 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_subflow_context *subflow;
struct sockaddr_storage addr;
+ int local_id = loc->id;
struct socket *sf;
+ struct sock *ssk;
u32 remote_token;
int addrlen;
int err;
@@ -986,7 +972,20 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
if (err)
return err;
- subflow = mptcp_subflow_ctx(sf->sk);
+ ssk = sf->sk;
+ subflow = mptcp_subflow_ctx(ssk);
+ do {
+ get_random_bytes(&subflow->local_nonce, sizeof(u32));
+ } while (!subflow->local_nonce);
+
+ if (!local_id) {
+ err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
+ if (err < 0)
+ goto failed;
+
+ local_id = err;
+ }
+
subflow->remote_key = msk->remote_key;
subflow->local_key = msk->local_key;
subflow->token = msk->token;
@@ -997,15 +996,16 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
if (loc->family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
- sf->sk->sk_bound_dev_if = ifindex;
+ ssk->sk_bound_dev_if = ifindex;
err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
if (err)
goto failed;
mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
- pr_debug("msk=%p remote_token=%u", msk, remote_token);
+ pr_debug("msk=%p remote_token=%u local_id=%d", msk, remote_token,
+ local_id);
subflow->remote_token = remote_token;
- subflow->local_id = loc->id;
+ subflow->local_id = local_id;
subflow->request_join = 1;
subflow->request_bkup = 1;
mptcp_info2sockaddr(remote, &addr);
@@ -1118,11 +1118,22 @@ static void subflow_state_change(struct sock *sk)
__subflow_state_change(sk);
+ if (subflow_simultaneous_connect(sk)) {
+ mptcp_do_fallback(sk);
+ mptcp_rcv_space_init(mptcp_sk(parent), sk);
+ pr_fallback(mptcp_sk(parent));
+ subflow->conn_finished = 1;
+ if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
+ inet_sk_state_store(parent, TCP_ESTABLISHED);
+ parent->sk_state_change(parent);
+ }
+ }
+
/* as recvmsg() does not acquire the subflow socket for ssk selection
* a fin packet carrying a DSS can be unnoticed if we don't trigger
* the data available machinery here.
*/
- if (subflow->mp_capable && mptcp_subflow_data_available(sk))
+ if (mptcp_subflow_data_available(sk))
mptcp_data_ready(parent, sk);
if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
@@ -1255,7 +1266,7 @@ static int subflow_ops_init(struct request_sock_ops *subflow_ops)
return 0;
}
-void mptcp_subflow_init(void)
+void __init mptcp_subflow_init(void)
{
subflow_request_sock_ops = tcp_request_sock_ops;
if (subflow_ops_init(&subflow_request_sock_ops) != 0)
@@ -1268,7 +1279,6 @@ void mptcp_subflow_init(void)
subflow_specific.conn_request = subflow_v4_conn_request;
subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
subflow_specific.sk_rx_dst_set = subflow_finish_connect;
- subflow_specific.rebuild_header = subflow_rebuild_header;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
@@ -1278,7 +1288,6 @@ void mptcp_subflow_init(void)
subflow_v6_specific.conn_request = subflow_v6_conn_request;
subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
- subflow_v6_specific.rebuild_header = subflow_rebuild_header;
subflow_v6m_specific = subflow_v6_specific;
subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index 33352dd99d4d..7d8106026081 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -24,7 +24,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/radix-tree.h>
+#include <linux/memblock.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <net/sock.h>
@@ -33,10 +33,55 @@
#include <net/mptcp.h>
#include "protocol.h"
-static RADIX_TREE(token_tree, GFP_ATOMIC);
-static RADIX_TREE(token_req_tree, GFP_ATOMIC);
-static DEFINE_SPINLOCK(token_tree_lock);
-static int token_used __read_mostly;
+#define TOKEN_MAX_RETRIES 4
+#define TOKEN_MAX_CHAIN_LEN 4
+
+struct token_bucket {
+ spinlock_t lock;
+ int chain_len;
+ struct hlist_nulls_head req_chain;
+ struct hlist_nulls_head msk_chain;
+};
+
+static struct token_bucket *token_hash __read_mostly;
+static unsigned int token_mask __read_mostly;
+
+static struct token_bucket *token_bucket(u32 token)
+{
+ return &token_hash[token & token_mask];
+}
+
+/* called with bucket lock held */
+static struct mptcp_subflow_request_sock *
+__token_lookup_req(struct token_bucket *t, u32 token)
+{
+ struct mptcp_subflow_request_sock *req;
+ struct hlist_nulls_node *pos;
+
+ hlist_nulls_for_each_entry_rcu(req, pos, &t->req_chain, token_node)
+ if (req->token == token)
+ return req;
+ return NULL;
+}
+
+/* called with bucket lock held */
+static struct mptcp_sock *
+__token_lookup_msk(struct token_bucket *t, u32 token)
+{
+ struct hlist_nulls_node *pos;
+ struct sock *sk;
+
+ sk_nulls_for_each_rcu(sk, pos, &t->msk_chain)
+ if (mptcp_sk(sk)->token == token)
+ return mptcp_sk(sk);
+ return NULL;
+}
+
+/* Return true when @token can not be stored in bucket @t: the token is 0,
+ * the bucket already accounts for TOKEN_MAX_CHAIN_LEN entries, or a request
+ * socket or msk carrying the same token is already present in the bucket.
+ * The callers in this file invoke it with the bucket lock held.
+ */
+static bool __token_bucket_busy(struct token_bucket *t, u32 token)
+{
+ return !token || t->chain_len >= TOKEN_MAX_CHAIN_LEN ||
+ __token_lookup_req(t, token) || __token_lookup_msk(t, token);
+}
/**
* mptcp_token_new_request - create new key/idsn/token for subflow_request
@@ -52,30 +97,32 @@ static int token_used __read_mostly;
int mptcp_token_new_request(struct request_sock *req)
{
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
- int err;
-
- while (1) {
- u32 token;
-
- mptcp_crypto_key_gen_sha(&subflow_req->local_key,
- &subflow_req->token,
- &subflow_req->idsn);
- pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n",
- req, subflow_req->local_key, subflow_req->token,
- subflow_req->idsn);
-
- token = subflow_req->token;
- spin_lock_bh(&token_tree_lock);
- if (!radix_tree_lookup(&token_req_tree, token) &&
- !radix_tree_lookup(&token_tree, token))
- break;
- spin_unlock_bh(&token_tree_lock);
+ int retries = TOKEN_MAX_RETRIES;
+ struct token_bucket *bucket;
+ u32 token;
+
+again:
+ mptcp_crypto_key_gen_sha(&subflow_req->local_key,
+ &subflow_req->token,
+ &subflow_req->idsn);
+ pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n",
+ req, subflow_req->local_key, subflow_req->token,
+ subflow_req->idsn);
+
+ token = subflow_req->token;
+ bucket = token_bucket(token);
+ spin_lock_bh(&bucket->lock);
+ if (__token_bucket_busy(bucket, token)) {
+ spin_unlock_bh(&bucket->lock);
+ if (!--retries)
+ return -EBUSY;
+ goto again;
}
- err = radix_tree_insert(&token_req_tree,
- subflow_req->token, &token_used);
- spin_unlock_bh(&token_tree_lock);
- return err;
+ hlist_nulls_add_head_rcu(&subflow_req->token_node, &bucket->req_chain);
+ bucket->chain_len++;
+ spin_unlock_bh(&bucket->lock);
+ return 0;
}
/**
@@ -97,48 +144,56 @@ int mptcp_token_new_request(struct request_sock *req)
int mptcp_token_new_connect(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct sock *mptcp_sock = subflow->conn;
- int err;
-
- while (1) {
- u32 token;
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ int retries = TOKEN_MAX_RETRIES;
+ struct token_bucket *bucket;
- mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token,
- &subflow->idsn);
- pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n",
- sk, subflow->local_key, subflow->token, subflow->idsn);
+again:
+ mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token,
+ &subflow->idsn);
+ /* log only once key, token and idsn have actually been generated,
+ * so the message reports the values of the current attempt
+ */
+ pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n",
+ sk, subflow->local_key, subflow->token, subflow->idsn);
- token = subflow->token;
- spin_lock_bh(&token_tree_lock);
- if (!radix_tree_lookup(&token_req_tree, token) &&
- !radix_tree_lookup(&token_tree, token))
- break;
- spin_unlock_bh(&token_tree_lock);
+ bucket = token_bucket(subflow->token);
+ spin_lock_bh(&bucket->lock);
+ if (__token_bucket_busy(bucket, subflow->token)) {
+ /* token unusable: retry with a fresh key, giving up with
+ * -EBUSY after TOKEN_MAX_RETRIES attempts
+ */
+ spin_unlock_bh(&bucket->lock);
+ if (!--retries)
+ return -EBUSY;
+ goto again;
}
- err = radix_tree_insert(&token_tree, subflow->token, mptcp_sock);
- spin_unlock_bh(&token_tree_lock);
- return err;
+ /* publish the token on the msk and link the msk into the bucket */
+ WRITE_ONCE(msk->token, subflow->token);
+ __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain);
+ bucket->chain_len++;
+ spin_unlock_bh(&bucket->lock);
+ return 0;
}
/**
- * mptcp_token_new_accept - insert token for later processing
- * @token: the token to insert to the tree
- * @conn: the just cloned socket linked to the new connection
+ * mptcp_token_accept - replace a req sk with full sock in token hash
+ * @req: the request socket to be removed
+ * @msk: the just cloned socket linked to the new connection
*
* Called when a SYN packet creates a new logical connection, i.e.
* is not a join request.
*/
-int mptcp_token_new_accept(u32 token, struct sock *conn)
+void mptcp_token_accept(struct mptcp_subflow_request_sock *req,
+ struct mptcp_sock *msk)
{
- int err;
+ struct mptcp_subflow_request_sock *pos;
+ struct token_bucket *bucket;
- spin_lock_bh(&token_tree_lock);
- err = radix_tree_insert(&token_tree, token, conn);
- spin_unlock_bh(&token_tree_lock);
+ bucket = token_bucket(req->token);
+ spin_lock_bh(&bucket->lock);
- return err;
+ /* pedantic lookup check for the moved token */
+ pos = __token_lookup_req(bucket, req->token);
+ if (!WARN_ON_ONCE(pos != req))
+ hlist_nulls_del_init_rcu(&req->token_node);
+ __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain);
+ spin_unlock_bh(&bucket->lock);
}
/**
@@ -152,45 +207,171 @@ int mptcp_token_new_accept(u32 token, struct sock *conn)
*/
struct mptcp_sock *mptcp_token_get_sock(u32 token)
{
- struct sock *conn;
-
- spin_lock_bh(&token_tree_lock);
- conn = radix_tree_lookup(&token_tree, token);
- if (conn) {
- /* token still reserved? */
- if (conn == (struct sock *)&token_used)
- conn = NULL;
- else
- sock_hold(conn);
+ struct hlist_nulls_node *pos;
+ struct token_bucket *bucket;
+ struct mptcp_sock *msk;
+ struct sock *sk;
+
+ rcu_read_lock();
+ bucket = token_bucket(token);
+
+again:
+ sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) {
+ msk = mptcp_sk(sk);
+ if (READ_ONCE(msk->token) != token)
+ continue;
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ goto not_found;
+ if (READ_ONCE(msk->token) != token) {
+ sock_put(sk);
+ goto again;
+ }
+ goto found;
+ }
+ if (get_nulls_value(pos) != (token & token_mask))
+ goto again;
+
+not_found:
+ msk = NULL;
+
+found:
+ rcu_read_unlock();
+ return msk;
+}
+EXPORT_SYMBOL_GPL(mptcp_token_get_sock);
+
+/**
+ * mptcp_token_iter_next - iterate over the token container from given pos
+ * @net: namespace to be iterated
+ * @s_slot: start slot number
+ * @s_num: start number inside the given slot
+ *
+ * This function returns the first mptcp connection structure found inside the
+ * token container starting from the specified position, or NULL.
+ *
+ * On successful iteration, the iterator is moved to the next position and
+ * the caller acquires a reference to the returned socket.
+ */
+struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot,
+ long *s_num)
+{
+ struct mptcp_sock *ret = NULL;
+ struct hlist_nulls_node *pos;
+ /* num must start at 0: when *s_slot is already past token_mask the
+ * loop below never runs, and num is still written back to *s_num
+ */
+ int slot, num = 0;
+
+ for (slot = *s_slot; slot <= token_mask; *s_num = 0, slot++) {
+ struct token_bucket *bucket = &token_hash[slot];
+ struct sock *sk;
+
+ /* position counter restarts for every slot */
+ num = 0;
+
+ if (hlist_nulls_empty(&bucket->msk_chain))
+ continue;
+
+ rcu_read_lock();
+ sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) {
+ ++num;
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
+ /* skip the entries up to the requested start position */
+ if (num <= *s_num)
+ continue;
+
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ continue;
+
+ /* re-check the namespace now that a reference is held */
+ if (!net_eq(sock_net(sk), net)) {
+ sock_put(sk);
+ continue;
+ }
+
+ ret = mptcp_sk(sk);
+ rcu_read_unlock();
+ goto out;
+ }
+ rcu_read_unlock();
+ }
- spin_unlock_bh(&token_tree_lock);
- return mptcp_sk(conn);
+out:
+ *s_slot = slot;
+ *s_num = num;
+ return ret;
}
+EXPORT_SYMBOL_GPL(mptcp_token_iter_next);
/**
* mptcp_token_destroy_request - remove mptcp connection/token
- * @token: token of mptcp connection to remove
+ * @req: mptcp request socket dropping the token
*
- * Remove not-yet-fully-established incoming connection identified
- * by @token.
+ * Remove the token associated to @req.
*/
-void mptcp_token_destroy_request(u32 token)
+void mptcp_token_destroy_request(struct request_sock *req)
{
- spin_lock_bh(&token_tree_lock);
- radix_tree_delete(&token_req_tree, token);
- spin_unlock_bh(&token_tree_lock);
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct mptcp_subflow_request_sock *pos;
+ struct token_bucket *bucket;
+
+ if (hlist_nulls_unhashed(&subflow_req->token_node))
+ return;
+
+ bucket = token_bucket(subflow_req->token);
+ spin_lock_bh(&bucket->lock);
+ pos = __token_lookup_req(bucket, subflow_req->token);
+ if (!WARN_ON_ONCE(pos != subflow_req)) {
+ hlist_nulls_del_init_rcu(&pos->token_node);
+ bucket->chain_len--;
+ }
+ spin_unlock_bh(&bucket->lock);
}
/**
* mptcp_token_destroy - remove mptcp connection/token
- * @token: token of mptcp connection to remove
+ * @msk: mptcp connection dropping the token
*
- * Remove the connection identified by @token.
+ * Remove the token associated to @msk
*/
-void mptcp_token_destroy(u32 token)
+void mptcp_token_destroy(struct mptcp_sock *msk)
{
- spin_lock_bh(&token_tree_lock);
- radix_tree_delete(&token_tree, token);
- spin_unlock_bh(&token_tree_lock);
+ struct token_bucket *bucket;
+ struct mptcp_sock *pos;
+
+ if (sk_unhashed((struct sock *)msk))
+ return;
+
+ bucket = token_bucket(msk->token);
+ spin_lock_bh(&bucket->lock);
+ pos = __token_lookup_msk(bucket, msk->token);
+ if (!WARN_ON_ONCE(pos != msk)) {
+ __sk_nulls_del_node_init_rcu((struct sock *)pos);
+ bucket->chain_len--;
+ }
+ spin_unlock_bh(&bucket->lock);
}
+
+/* Allocate the token hash table through alloc_large_system_hash(), which
+ * also fills in token_mask, then initialize every bucket: both nulls chains
+ * use the bucket index as their nulls value and the per-bucket spinlock is
+ * set up.
+ */
+void __init mptcp_token_init(void)
+{
+ int i;
+
+ token_hash = alloc_large_system_hash("MPTCP token",
+ sizeof(struct token_bucket),
+ 0,
+ 20,/* one slot per 1MB of memory */
+ 0,
+ NULL,
+ &token_mask,
+ 0,
+ 64 * 1024);
+ for (i = 0; i < token_mask + 1; ++i) {
+ /* the nulls value is the bucket index; mptcp_token_get_sock()
+ * compares it against (token & token_mask) after a chain walk
+ */
+ INIT_HLIST_NULLS_HEAD(&token_hash[i].req_chain, i);
+ INIT_HLIST_NULLS_HEAD(&token_hash[i].msk_chain, i);
+ spin_lock_init(&token_hash[i].lock);
+ }
+}
+
+#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS)
+EXPORT_SYMBOL_GPL(mptcp_token_new_request);
+EXPORT_SYMBOL_GPL(mptcp_token_new_connect);
+EXPORT_SYMBOL_GPL(mptcp_token_accept);
+EXPORT_SYMBOL_GPL(mptcp_token_destroy_request);
+EXPORT_SYMBOL_GPL(mptcp_token_destroy);
+#endif
diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c
new file mode 100644
index 000000000000..e1bd6f0a0676
--- /dev/null
+++ b/net/mptcp/token_test.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <kunit/test.h>
+
+#include "protocol.h"
+
+/* Allocate a zeroed, test-managed subflow request socket and run
+ * mptcp_token_init_request() on it.
+ *
+ * Aborts the current test case if the allocation fails, so the returned
+ * pointer is always valid.
+ */
+static struct mptcp_subflow_request_sock *build_req_sock(struct kunit *test)
+{
+	struct mptcp_subflow_request_sock *req;
+
+	req = kunit_kzalloc(test, sizeof(*req), GFP_USER);
+	/* ASSERT rather than EXPECT: the request is used right below and by
+	 * every caller, so the test case must stop here on failure
+	 */
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, req);
+	mptcp_token_init_request((struct request_sock *)req);
+	return req;
+}
+
+/* A new request gets a non-zero token, and mptcp_token_get_sock() must
+ * not return any msk for that token.
+ */
+static void mptcp_token_test_req_basic(struct kunit *test)
+{
+	struct mptcp_subflow_request_sock *req;
+	struct mptcp_sock *null_msk = NULL;
+	struct request_sock *rsk;
+
+	req = build_req_sock(test);
+	rsk = (struct request_sock *)req;
+
+	KUNIT_ASSERT_EQ(test, 0,
+			mptcp_token_new_request((struct request_sock *)req));
+	KUNIT_EXPECT_NE(test, 0, (int)req->token);
+	KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(req->token));
+
+	/* remove the request token created above */
+	mptcp_token_destroy_request(rsk);
+}
+
+/* Allocate a zeroed, test-managed inet_connection_sock.
+ *
+ * Aborts the current test case if the allocation fails, so callers may
+ * dereference the result without checking it.
+ */
+static struct inet_connection_sock *build_icsk(struct kunit *test)
+{
+	struct inet_connection_sock *icsk;
+
+	icsk = kunit_kzalloc(test, sizeof(*icsk), GFP_USER);
+	/* ASSERT rather than EXPECT: callers use the pointer unconditionally */
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, icsk);
+	return icsk;
+}
+
+/* Allocate a zeroed, test-managed MPTCP subflow context.
+ *
+ * Aborts the current test case if the allocation fails, so callers may
+ * dereference the result without checking it.
+ */
+static struct mptcp_subflow_context *build_ctx(struct kunit *test)
+{
+	struct mptcp_subflow_context *ctx;
+
+	ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_USER);
+	/* ASSERT rather than EXPECT: callers use the pointer unconditionally */
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx);
+	return ctx;
+}
+
+/* Allocate a zeroed, test-managed mptcp_sock whose socket reference
+ * count starts at 1.
+ *
+ * Aborts the current test case if the allocation fails, so the returned
+ * pointer is always valid.
+ */
+static struct mptcp_sock *build_msk(struct kunit *test)
+{
+	struct mptcp_sock *msk;
+
+	msk = kunit_kzalloc(test, sizeof(*msk), GFP_USER);
+	/* ASSERT rather than EXPECT: msk is dereferenced on the next line */
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, msk);
+	refcount_set(&((struct sock *)msk)->sk_refcnt, 1);
+	return msk;
+}
+
+/* mptcp_token_new_connect() must store the same non-zero token in the
+ * subflow context and in the msk; mptcp_token_get_sock() must return the
+ * msk for it until mptcp_token_destroy() is called.
+ */
+static void mptcp_token_test_msk_basic(struct kunit *test)
+{
+	struct inet_connection_sock *icsk = build_icsk(test);
+	struct mptcp_subflow_context *ctx = build_ctx(test);
+	struct mptcp_sock *msk = build_msk(test);
+	struct sock *sk = (struct sock *)msk;
+	struct mptcp_sock *null_msk = NULL;
+
+	/* link subflow socket -> context -> msk by hand */
+	ctx->conn = sk;
+	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
+
+	KUNIT_ASSERT_EQ(test, 0,
+			mptcp_token_new_connect((struct sock *)icsk));
+	KUNIT_EXPECT_NE(test, 0, (int)ctx->token);
+	KUNIT_EXPECT_EQ(test, ctx->token, msk->token);
+	KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(ctx->token));
+	/* build_msk() set the count to 1; the extra reference is presumably
+	 * the one taken by the lookup above
+	 */
+	KUNIT_EXPECT_EQ(test, 2, (int)refcount_read(&sk->sk_refcnt));
+
+	/* once destroyed, the token must no longer resolve to the msk */
+	mptcp_token_destroy(msk);
+	KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(ctx->token));
+}
+
+/* After mptcp_token_accept() the token must resolve to the msk, and
+ * destroying the originating request must not change that.
+ */
+static void mptcp_token_test_accept(struct kunit *test)
+{
+	struct mptcp_subflow_request_sock *req;
+	struct request_sock *rsk;
+	struct mptcp_sock *msk;
+
+	req = build_req_sock(test);
+	msk = build_msk(test);
+	rsk = (struct request_sock *)req;
+
+	KUNIT_ASSERT_EQ(test, 0,
+			mptcp_token_new_request((struct request_sock *)req));
+
+	/* hand the request token over to the msk */
+	msk->token = req->token;
+	mptcp_token_accept(req, msk);
+	KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token));
+
+	/* the request was already accepted: the msk lookup must still work */
+	mptcp_token_destroy_request(rsk);
+	KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token));
+
+	/* remove the msk token again */
+	mptcp_token_destroy(msk);
+}
+
+/* A token whose msk has a zero reference count must not be returned by
+ * mptcp_token_get_sock(), even though the msk was accepted earlier.
+ */
+static void mptcp_token_test_destroyed(struct kunit *test)
+{
+	struct mptcp_subflow_request_sock *req;
+	struct mptcp_sock *null_msk = NULL;
+	struct mptcp_sock *msk;
+	struct sock *sk;
+
+	req = build_req_sock(test);
+	msk = build_msk(test);
+	sk = (struct sock *)msk;
+
+	KUNIT_ASSERT_EQ(test, 0,
+			mptcp_token_new_request((struct request_sock *)req));
+
+	/* hand the request token over to the msk */
+	msk->token = req->token;
+	mptcp_token_accept(req, msk);
+
+	/* mimic a lookup racing with removal: no references are left */
+	refcount_set(&sk->sk_refcnt, 0);
+	KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(msk->token));
+
+	/* remove the msk token again */
+	mptcp_token_destroy(msk);
+}
+
+/* Test cases of the "mptcp-token" suite; the empty entry ends the list */
+static struct kunit_case mptcp_token_test_cases[] = {
+ KUNIT_CASE(mptcp_token_test_req_basic),
+ KUNIT_CASE(mptcp_token_test_msk_basic),
+ KUNIT_CASE(mptcp_token_test_accept),
+ KUNIT_CASE(mptcp_token_test_destroyed),
+ {}
+};
+
+/* Suite descriptor binding the name "mptcp-token" to the cases above */
+static struct kunit_suite mptcp_token_suite = {
+ .name = "mptcp-token",
+ .test_cases = mptcp_token_test_cases,
+};
+
+/* register the suite with KUnit */
+kunit_test_suite(mptcp_token_suite);
+
+MODULE_LICENSE("GPL");