From b03d2142bea8cf7407a0a668ce8f5f115bd226c4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Jun 2020 15:31:12 -0700 Subject: tcp: move ipv6_specific declaration to remove a warning ipv6_specific should be declared in tcp include files, not mptcp. This removes the following warning : CHECK net/ipv6/tcp_ipv6.c net/ipv6/tcp_ipv6.c:78:42: warning: symbol 'ipv6_specific' was not declared. Should it be static? Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/mptcp/protocol.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index db56535dfc29..d4294b6d23e4 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -355,9 +355,6 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk, } extern const struct inet_connection_sock_af_ops ipv4_specific; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) -extern const struct inet_connection_sock_af_ops ipv6_specific; -#endif void mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) -- cgit v1.2.3 From 9b9e2f250e3e6f59ad07e6d03838c27a100e0042 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Jun 2020 15:31:13 -0700 Subject: tcp: move ipv4_specific to tcp include file Declare ipv4_specific once, in tcp.h were it belongs. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ include/net/transp_v6.h | 3 --- net/mptcp/protocol.h | 2 -- 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'net/mptcp') diff --git a/include/net/tcp.h b/include/net/tcp.h index e6920ae0765c..b0f0f93c681c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -907,6 +907,8 @@ static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) TCP_SKB_CB(skb)->bpf.sk_redir = NULL; } +extern const struct inet_connection_sock_af_ops ipv4_specific; + #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, * as TCP moves IP6CB into a different location in skb->cb[] diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index a8f6020f1196..da06613c9603 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -56,9 +56,6 @@ ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, #define LOOPBACK4_IPV6 cpu_to_be32(0x7f000006) -/* address family specific functions */ -extern const struct inet_connection_sock_af_ops ipv4_specific; - void inet6_destroy_sock(struct sock *sk); #define IPV6_SEQ_DGRAM_HEADER \ diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d4294b6d23e4..06661781c9af 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -354,8 +354,6 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk, inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } -extern const struct inet_connection_sock_af_ops ipv4_specific; - void mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) int mptcp_proto_v6_init(void); -- cgit v1.2.3 From d39dceca388ad0e4f748836806349ebe09282283 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Jun 2020 19:29:59 +0200 Subject: mptcp: add __init annotation on setup functions Add the missing annotation in some setup-only functions. Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/mptcp/pm.c | 2 +- net/mptcp/pm_netlink.c | 2 +- net/mptcp/protocol.c | 4 ++-- net/mptcp/protocol.h | 10 +++++----- net/mptcp/subflow.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 977d9c8b1453..7de09fdd42a3 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -234,7 +234,7 @@ void mptcp_pm_close(struct mptcp_sock *msk) sock_put((struct sock *)msk); } -void mptcp_pm_init(void) +void __init mptcp_pm_init(void) { pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); if (!pm_wq) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index b78edf237ba0..c8820c4156e6 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -851,7 +851,7 @@ static struct pernet_operations mptcp_pm_pernet_ops = { .size = sizeof(struct pm_nl_pernet), }; -void mptcp_pm_nl_init(void) +void __init mptcp_pm_nl_init(void) { if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) panic("Failed to register MPTCP PM pernet subsystem.\n"); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3980fbb6f31e..9163a05b9e46 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2077,7 +2077,7 @@ static struct inet_protosw mptcp_protosw = { .flags = INET_PROTOSW_ICSK, }; -void mptcp_proto_init(void) +void __init mptcp_proto_init(void) { mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; @@ -2139,7 +2139,7 @@ static struct inet_protosw mptcp_v6_protosw = { .flags = INET_PROTOSW_ICSK, }; -int mptcp_proto_v6_init(void) +int __init mptcp_proto_v6_init(void) { int err; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 482f53aed30a..571d39a1a17c 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -337,7 +337,7 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) int mptcp_is_enabled(struct net *net); bool mptcp_subflow_data_available(struct sock *sk); -void mptcp_subflow_init(void); +void __init mptcp_subflow_init(void); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, int ifindex, @@ -355,9 +355,9 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk, inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } -void mptcp_proto_init(void); +void __init mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) -int mptcp_proto_v6_init(void); +int __init mptcp_proto_v6_init(void); #endif struct sock *mptcp_sk_clone(const struct sock *sk, @@ -394,7 +394,7 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); -void mptcp_pm_init(void); +void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); void mptcp_pm_close(struct mptcp_sock *msk); void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side); @@ -428,7 +428,7 @@ bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_addr_info *saddr); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); -void mptcp_pm_nl_init(void); +void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_data_init(struct mptcp_sock *msk); void mptcp_pm_nl_fully_established(struct mptcp_sock *msk); void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3838a0b3a21f..c2389ba2d4ee 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1255,7 +1255,7 @@ static int subflow_ops_init(struct request_sock_ops *subflow_ops) return 0; } -void 
mptcp_subflow_init(void) +void __init mptcp_subflow_init(void) { subflow_request_sock_ops = tcp_request_sock_ops; if (subflow_ops_init(&subflow_request_sock_ops) != 0) -- cgit v1.2.3 From 2c5ebd001d4f0c64a2dfda94eb1d9b31a8863c8d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Jun 2020 19:30:00 +0200 Subject: mptcp: refactor token container Replace the radix tree with a hash table allocated at boot time. The radix tree has some shortcomings: a single lock is contended by all the MPTCP operations, the lookup currently uses that lock, and traversing all the items would require the lock, too. With a hash table we instead trade a little memory to address all of the above: a per-bucket lock is used. To hash the MPTCP sockets, we re-use the msk's sk_node entry: the MPTCP sockets are never hashed by the stack. Replace the existing hash proto callbacks with a dummy implementation, annotating the above constraint. Additionally, refactor the token creation code to: - limit the number of consecutive attempts to a fixed maximum. Hitting a hash bucket with a long chain is considered a failed attempt - ensure accept() can no longer fail due to token management - fall back to TCP if token creation fails at connect() time (previously the connection was closed) v1 -> v2: - fix "no newline at end of file" - Jakub Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 45 +++++---- net/mptcp/protocol.h | 14 ++- net/mptcp/subflow.c | 19 ++-- net/mptcp/token.c | 271 ++++++++++++++++++++++++++++++++++++--------------- 4 files changed, 236 insertions(+), 113 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9163a05b9e46..be09fd525f8f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1448,20 +1448,6 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->token = subflow_req->token; msk->subflow = NULL; - if (unlikely(mptcp_token_new_accept(subflow_req->token, nsk))) { - nsk->sk_state = TCP_CLOSE; - bh_unlock_sock(nsk); - - /* we can't call into mptcp_close() here - possible BH context * free the sock directly. * sk_clone_lock() sets nsk refcnt to two, hence call sk_free() * too. 
- */ - sk_common_release(nsk); - sk_free(nsk); - return NULL; - } - msk->write_seq = subflow_req->idsn + 1; atomic64_set(&msk->snd_una, msk->write_seq); if (mp_opt->mp_capable) { @@ -1547,7 +1533,7 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - mptcp_token_destroy(msk->token); + mptcp_token_destroy(msk); if (msk->cached_ext) __skb_ext_put(msk->cached_ext); @@ -1636,6 +1622,20 @@ static void mptcp_release_cb(struct sock *sk) } } +static int mptcp_hash(struct sock *sk) +{ + /* should never be called, + * we hash the TCP subflows not the master socket + */ + WARN_ON_ONCE(1); + return 0; +} + +static void mptcp_unhash(struct sock *sk) +{ + /* called from sk_common_release(), but nothing to do here */ +} + static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1679,7 +1679,6 @@ void mptcp_finish_connect(struct sock *ssk) */ WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->local_key, subflow->local_key); - WRITE_ONCE(msk->token, subflow->token); WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->ack_seq, ack_seq); WRITE_ONCE(msk->can_ack, 1); @@ -1761,8 +1760,8 @@ static struct proto mptcp_prot = { .sendmsg = mptcp_sendmsg, .recvmsg = mptcp_recvmsg, .release_cb = mptcp_release_cb, - .hash = inet_hash, - .unhash = inet_unhash, + .hash = mptcp_hash, + .unhash = mptcp_unhash, .get_port = mptcp_get_port, .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, @@ -1771,6 +1770,7 @@ static struct proto mptcp_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), + .slab_flags = SLAB_TYPESAFE_BY_RCU, .no_autobind = true, }; @@ -1800,6 +1800,7 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct mptcp_subflow_context *subflow; struct socket *ssock; int err; @@ -1812,19 +1813,23 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto do_connect; } + mptcp_token_destroy(msk); ssock = __mptcp_socket_create(msk, TCP_SYN_SENT); if (IS_ERR(ssock)) { err = PTR_ERR(ssock); goto unlock; } + subflow = mptcp_subflow_ctx(ssock->sk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. 
*/ if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) - mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0; + subflow->request_mptcp = 0; #endif + if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) + subflow->request_mptcp = 0; do_connect: err = ssock->ops->connect(ssock, uaddr, addr_len, flags); @@ -1888,6 +1893,7 @@ static int mptcp_listen(struct socket *sock, int backlog) pr_debug("msk=%p", msk); lock_sock(sock->sk); + mptcp_token_destroy(msk); ssock = __mptcp_socket_create(msk, TCP_LISTEN); if (IS_ERR(ssock)) { err = PTR_ERR(ssock); @@ -2086,6 +2092,7 @@ void __init mptcp_proto_init(void) mptcp_subflow_init(); mptcp_pm_init(); + mptcp_token_init(); if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 571d39a1a17c..c05552e5fa23 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -250,6 +250,7 @@ struct mptcp_subflow_request_sock { u32 local_nonce; u32 remote_nonce; struct mptcp_sock *msk; + struct hlist_nulls_node token_node; }; static inline struct mptcp_subflow_request_sock * @@ -372,12 +373,19 @@ bool mptcp_finish_join(struct sock *sk); void mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); +void __init mptcp_token_init(void); +static inline void mptcp_token_init_request(struct request_sock *req) +{ + mptcp_subflow_rsk(req)->token_node.pprev = NULL; +} + int mptcp_token_new_request(struct request_sock *req); -void mptcp_token_destroy_request(u32 token); +void mptcp_token_destroy_request(struct request_sock *req); int mptcp_token_new_connect(struct sock *sk); -int mptcp_token_new_accept(u32 token, struct sock *conn); +void mptcp_token_accept(struct mptcp_subflow_request_sock *r, + struct mptcp_sock *msk); struct mptcp_sock *mptcp_token_get_sock(u32 token); -void mptcp_token_destroy(u32 token); +void mptcp_token_destroy(struct mptcp_sock *msk); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index c2389ba2d4ee..102db8c88e97 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -32,12 +32,9 @@ static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, static int subflow_rebuild_header(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - int local_id, err = 0; + int local_id; - if (subflow->request_mptcp && !subflow->token) { - pr_debug("subflow=%p", sk); - err = mptcp_token_new_connect(sk); - } else if (subflow->request_join && !subflow->local_nonce) { + if (subflow->request_join && !subflow->local_nonce) { struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn; pr_debug("subflow=%p", sk); @@ -57,9 +54,6 @@ static int subflow_rebuild_header(struct sock *sk) } out: - if (err) - return err; - return subflow->icsk_af_ops->rebuild_header(sk); } @@ -72,8 +66,7 @@ static void subflow_req_destructor(struct request_sock *req) if (subflow_req->msk) sock_put((struct sock *)subflow_req->msk); - if (subflow_req->mp_capable) - mptcp_token_destroy_request(subflow_req->token); + mptcp_token_destroy_request(req); tcp_request_sock_ops.destructor(req); } @@ -135,6 +128,7 @@ static void subflow_init_req(struct request_sock *req, subflow_req->mp_capable = 0; subflow_req->mp_join = 0; subflow_req->msk = NULL; + mptcp_token_init_request(req); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of @@ -250,7 +244,7 @@ static void 
subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->remote_nonce = mp_opt.nonce; pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, subflow->thmac, subflow->remote_nonce); - } else if (subflow->request_mptcp) { + } else { tp->is_mptcp = 0; } @@ -386,7 +380,7 @@ static void mptcp_sock_destruct(struct sock *sk) sock_orphan(sk); } - mptcp_token_destroy(mptcp_sk(sk)->token); + mptcp_token_destroy(mptcp_sk(sk)); inet_sock_destruct(sk); } @@ -505,6 +499,7 @@ create_child: */ new_msk->sk_destruct = mptcp_sock_destruct; mptcp_pm_new_connection(mptcp_sk(new_msk), 1); + mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); ctx->conn = new_msk; new_msk = NULL; diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 33352dd99d4d..9c0771774815 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include @@ -33,10 +33,55 @@ #include #include "protocol.h" -static RADIX_TREE(token_tree, GFP_ATOMIC); -static RADIX_TREE(token_req_tree, GFP_ATOMIC); -static DEFINE_SPINLOCK(token_tree_lock); -static int token_used __read_mostly; +#define TOKEN_MAX_RETRIES 4 +#define TOKEN_MAX_CHAIN_LEN 4 + +struct token_bucket { + spinlock_t lock; + int chain_len; + struct hlist_nulls_head req_chain; + struct hlist_nulls_head msk_chain; +}; + +static struct token_bucket *token_hash __read_mostly; +static unsigned int token_mask __read_mostly; + +static struct token_bucket *token_bucket(u32 token) +{ + return &token_hash[token & token_mask]; +} + +/* called with bucket lock held */ +static struct mptcp_subflow_request_sock * +__token_lookup_req(struct token_bucket *t, u32 token) +{ + struct mptcp_subflow_request_sock *req; + struct hlist_nulls_node *pos; + + hlist_nulls_for_each_entry_rcu(req, pos, &t->req_chain, token_node) + if (req->token == token) + return req; + return NULL; +} + +/* called with bucket lock held */ +static struct mptcp_sock * +__token_lookup_msk(struct token_bucket *t, u32 token) +{ + struct hlist_nulls_node *pos; + struct sock *sk; + + sk_nulls_for_each_rcu(sk, pos, &t->msk_chain) + if (mptcp_sk(sk)->token == token) + return mptcp_sk(sk); + return NULL; +} + +static bool __token_bucket_busy(struct token_bucket *t, u32 token) +{ + return !token || t->chain_len >= TOKEN_MAX_CHAIN_LEN || + __token_lookup_req(t, token) || __token_lookup_msk(t, token); +} /** * mptcp_token_new_request - create new key/idsn/token for subflow_request @@ -52,30 +97,32 @@ static int token_used __read_mostly; int mptcp_token_new_request(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); - int err; - - while (1) { - u32 token; - - mptcp_crypto_key_gen_sha(&subflow_req->local_key, - &subflow_req->token, - &subflow_req->idsn); - pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", - req, subflow_req->local_key, subflow_req->token, - subflow_req->idsn); - - token = subflow_req->token; - spin_lock_bh(&token_tree_lock); - if (!radix_tree_lookup(&token_req_tree, token) && - !radix_tree_lookup(&token_tree, token)) - break; - spin_unlock_bh(&token_tree_lock); + int retries = TOKEN_MAX_RETRIES; + struct token_bucket *bucket; + u32 token; + +again: + mptcp_crypto_key_gen_sha(&subflow_req->local_key, + &subflow_req->token, + &subflow_req->idsn); + pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", + req, subflow_req->local_key, subflow_req->token, + subflow_req->idsn); + + token = subflow_req->token; + bucket = token_bucket(token); + spin_lock_bh(&bucket->lock); + if 
(__token_bucket_busy(bucket, token)) { + spin_unlock_bh(&bucket->lock); + if (!--retries) + return -EBUSY; + goto again; } - err = radix_tree_insert(&token_req_tree, - subflow_req->token, &token_used); - spin_unlock_bh(&token_tree_lock); - return err; + hlist_nulls_add_head_rcu(&subflow_req->token_node, &bucket->req_chain); + bucket->chain_len++; + spin_unlock_bh(&bucket->lock); + return 0; } /** @@ -97,48 +144,56 @@ int mptcp_token_new_request(struct request_sock *req) int mptcp_token_new_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct sock *mptcp_sock = subflow->conn; - int err; - - while (1) { - u32 token; + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + int retries = TOKEN_MAX_RETRIES; + struct token_bucket *bucket; - mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, - &subflow->idsn); + pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", + sk, subflow->local_key, subflow->token, subflow->idsn); - pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", - sk, subflow->local_key, subflow->token, subflow->idsn); +again: + mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, + &subflow->idsn); - token = subflow->token; - spin_lock_bh(&token_tree_lock); - if (!radix_tree_lookup(&token_req_tree, token) && - !radix_tree_lookup(&token_tree, token)) - break; - spin_unlock_bh(&token_tree_lock); + bucket = token_bucket(subflow->token); + spin_lock_bh(&bucket->lock); + if (__token_bucket_busy(bucket, subflow->token)) { + spin_unlock_bh(&bucket->lock); + if (!--retries) + return -EBUSY; + goto again; } - err = radix_tree_insert(&token_tree, subflow->token, mptcp_sock); - spin_unlock_bh(&token_tree_lock); - return err; + WRITE_ONCE(msk->token, subflow->token); + __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); + bucket->chain_len++; + spin_unlock_bh(&bucket->lock); + return 0; } /** - * mptcp_token_new_accept - insert token for later processing - * @token: the token to insert to the tree - * @conn: the just cloned socket linked to the new connection + * mptcp_token_accept - replace a req sk with full sock in token hash + * @req: the request socket to be removed + * @msk: the just cloned socket linked to the new connection * * Called when a SYN packet creates a new logical connection, i.e. * is not a join request. */ -int mptcp_token_new_accept(u32 token, struct sock *conn) +void mptcp_token_accept(struct mptcp_subflow_request_sock *req, + struct mptcp_sock *msk) { - int err; + struct mptcp_subflow_request_sock *pos; + struct token_bucket *bucket; - spin_lock_bh(&token_tree_lock); - err = radix_tree_insert(&token_tree, token, conn); - spin_unlock_bh(&token_tree_lock); + bucket = token_bucket(req->token); + spin_lock_bh(&bucket->lock); - return err; + /* pedantic lookup check for the moved token */ + pos = __token_lookup_req(bucket, req->token); + if (!WARN_ON_ONCE(pos != req)) + hlist_nulls_del_init_rcu(&req->token_node); + __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); + spin_unlock_bh(&bucket->lock); } /** @@ -152,45 +207,103 @@ int mptcp_token_new_accept(u32 token, struct sock *conn) */ struct mptcp_sock *mptcp_token_get_sock(u32 token) { - struct sock *conn; - - spin_lock_bh(&token_tree_lock); - conn = radix_tree_lookup(&token_tree, token); - if (conn) { - /* token still reserved? 
*/ - if (conn == (struct sock *)&token_used) - conn = NULL; - else - sock_hold(conn); + struct hlist_nulls_node *pos; + struct token_bucket *bucket; + struct mptcp_sock *msk; + struct sock *sk; + + rcu_read_lock(); + bucket = token_bucket(token); + +again: + sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { + msk = mptcp_sk(sk); + if (READ_ONCE(msk->token) != token) + continue; + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + goto not_found; + if (READ_ONCE(msk->token) != token) { + sock_put(sk); + goto again; + } + goto found; } - spin_unlock_bh(&token_tree_lock); + if (get_nulls_value(pos) != (token & token_mask)) + goto again; + +not_found: + msk = NULL; - return mptcp_sk(conn); +found: + rcu_read_unlock(); + return msk; } /** * mptcp_token_destroy_request - remove mptcp connection/token - * @token: token of mptcp connection to remove + * @req: mptcp request socket dropping the token * - * Remove not-yet-fully-established incoming connection identified - * by @token. + * Remove the token associated to @req. */ -void mptcp_token_destroy_request(u32 token) +void mptcp_token_destroy_request(struct request_sock *req) { - spin_lock_bh(&token_tree_lock); - radix_tree_delete(&token_req_tree, token); - spin_unlock_bh(&token_tree_lock); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_subflow_request_sock *pos; + struct token_bucket *bucket; + + if (hlist_nulls_unhashed(&subflow_req->token_node)) + return; + + bucket = token_bucket(subflow_req->token); + spin_lock_bh(&bucket->lock); + pos = __token_lookup_req(bucket, subflow_req->token); + if (!WARN_ON_ONCE(pos != subflow_req)) { + hlist_nulls_del_init_rcu(&pos->token_node); + bucket->chain_len--; + } + spin_unlock_bh(&bucket->lock); } /** * mptcp_token_destroy - remove mptcp connection/token - * @token: token of mptcp connection to remove + * @msk: mptcp connection dropping the token * - * Remove the connection identified by @token. + * Remove the token associated to @msk */ -void mptcp_token_destroy(u32 token) +void mptcp_token_destroy(struct mptcp_sock *msk) { - spin_lock_bh(&token_tree_lock); - radix_tree_delete(&token_tree, token); - spin_unlock_bh(&token_tree_lock); + struct token_bucket *bucket; + struct mptcp_sock *pos; + + if (sk_unhashed((struct sock *)msk)) + return; + + bucket = token_bucket(msk->token); + spin_lock_bh(&bucket->lock); + pos = __token_lookup_msk(bucket, msk->token); + if (!WARN_ON_ONCE(pos != msk)) { + __sk_nulls_del_node_init_rcu((struct sock *)pos); + bucket->chain_len--; + } + spin_unlock_bh(&bucket->lock); +} + +void __init mptcp_token_init(void) +{ + int i; + + token_hash = alloc_large_system_hash("MPTCP token", + sizeof(struct token_bucket), + 0, + 20,/* one slot per 1MB of memory */ + 0, + NULL, + &token_mask, + 0, + 64 * 1024); + for (i = 0; i < token_mask + 1; ++i) { + INIT_HLIST_NULLS_HEAD(&token_hash[i].req_chain, i); + INIT_HLIST_NULLS_HEAD(&token_hash[i].msk_chain, i); + spin_lock_init(&token_hash[i].lock); + } } -- cgit v1.2.3 From a00a582203dbc43ea311a50e979038fc0c8ee19f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Jun 2020 19:30:01 +0200 Subject: mptcp: move crypto test to KUNIT Currently MPTCP uses a custom hook to execute unit tests at boot time. Let's use the KUNIT framework instead. Additionally move the relevant code to a separate file and export the function needed by the test when self-tests are built as a module. 
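For reference, a rough sketch of one way to build and run the new suite with the in-tree KUnit wrapper; the kunit.py path is the standard tools/testing/kunit location, while the exact .kunitconfig placement and the extra CONFIG_* lines below are assumptions that may need adjusting for a given tree:

$ cat <<EOF >> .kunitconfig
CONFIG_KUNIT=y
CONFIG_NET=y
CONFIG_INET=y
CONFIG_MPTCP=y
CONFIG_MPTCP_KUNIT_TESTS=y
EOF
$ ./tools/testing/kunit/kunit.py run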
Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/Kconfig | 20 +++++++++----- net/mptcp/Makefile | 3 +++ net/mptcp/crypto.c | 63 ++----------------------------------------- net/mptcp/crypto_test.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 67 deletions(-) create mode 100644 net/mptcp/crypto_test.c (limited to 'net/mptcp') diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index a9ed3bf1d93f..d7d5f9349366 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -18,12 +18,20 @@ config MPTCP_IPV6 select IPV6 default y -config MPTCP_HMAC_TEST - bool "Tests for MPTCP HMAC implementation" +endif + +config MPTCP_KUNIT_TESTS + tristate "This builds the MPTCP KUnit tests" if !KUNIT_ALL_TESTS + select MPTCP + depends on KUNIT + default KUNIT_ALL_TESTS help - This option enable boot time self-test for the HMAC implementation - used by the MPTCP code + Currently covers the MPTCP crypto helpers. + Only useful for kernel devs running KUnit test harness and are not + for inclusion into a production build. - Say N if you are unsure. + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. -endif diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index baa0640527c7..f9039804207b 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -3,3 +3,6 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ mib.o pm_netlink.o + +mptcp_crypto_test-objs := crypto_test.o +obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o \ No newline at end of file diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index 3d980713a9e2..6c4ea979dfd4 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -87,65 +87,6 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) sha256_final(&state, (u8 *)hmac); } -#ifdef CONFIG_MPTCP_HMAC_TEST -struct test_cast { - char *key; - char *msg; - char *result; -}; - -/* we can't reuse RFC 4231 test vectors, as we have constraint on the - * input and key size. 
- */ -static struct test_cast tests[] = { - { - .key = "0b0b0b0b0b0b0b0b", - .msg = "48692054", - .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa", - }, - { - .key = "aaaaaaaaaaaaaaaa", - .msg = "dddddddd", - .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9", - }, - { - .key = "0102030405060708", - .msg = "cdcdcdcd", - .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d", - }, -}; - -static int __init test_mptcp_crypto(void) -{ - char hmac[32], hmac_hex[65]; - u32 nonce1, nonce2; - u64 key1, key2; - u8 msg[8]; - int i, j; - - for (i = 0; i < ARRAY_SIZE(tests); ++i) { - /* mptcp hmap will convert to be before computing the hmac */ - key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0])); - key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8])); - nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); - nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); - - put_unaligned_be32(nonce1, &msg[0]); - put_unaligned_be32(nonce2, &msg[4]); - - mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); - for (j = 0; j < 32; ++j) - sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); - hmac_hex[64] = 0; - - if (memcmp(hmac_hex, tests[i].result, 64)) - pr_err("test %d failed, got %s expected %s", i, - hmac_hex, tests[i].result); - else - pr_info("test %d [ ok ]", i); - } - return 0; -} - -late_initcall(test_mptcp_crypto); +#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS) +EXPORT_SYMBOL_GPL(mptcp_crypto_hmac_sha); #endif diff --git a/net/mptcp/crypto_test.c b/net/mptcp/crypto_test.c new file mode 100644 index 000000000000..017248dea038 --- /dev/null +++ b/net/mptcp/crypto_test.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "protocol.h" + +struct test_case { + char *key; + char *msg; + char *result; +}; + +/* we can't reuse RFC 4231 test vectors, as we have constraint on the + * input and key size. 
+ */ +static struct test_case tests[] = { + { + .key = "0b0b0b0b0b0b0b0b", + .msg = "48692054", + .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa", + }, + { + .key = "aaaaaaaaaaaaaaaa", + .msg = "dddddddd", + .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9", + }, + { + .key = "0102030405060708", + .msg = "cdcdcdcd", + .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d", + }, +}; + +static void mptcp_crypto_test_basic(struct kunit *test) +{ + char hmac[32], hmac_hex[65]; + u32 nonce1, nonce2; + u64 key1, key2; + u8 msg[8]; + int i, j; + + for (i = 0; i < ARRAY_SIZE(tests); ++i) { + /* mptcp hmap will convert to be before computing the hmac */ + key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0])); + key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8])); + nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); + nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); + + put_unaligned_be32(nonce1, &msg[0]); + put_unaligned_be32(nonce2, &msg[4]); + + mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); + for (j = 0; j < 32; ++j) + sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); + hmac_hex[64] = 0; + + KUNIT_EXPECT_STREQ(test, &hmac_hex[0], tests[i].result); + } +} + +static struct kunit_case mptcp_crypto_test_cases[] = { + KUNIT_CASE(mptcp_crypto_test_basic), + {} +}; + +static struct kunit_suite mptcp_crypto_suite = { + .name = "mptcp-crypto", + .test_cases = mptcp_crypto_test_cases, +}; + +kunit_test_suite(mptcp_crypto_suite); + +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From a8ee9c9b58199be2692f7eb761d7a01749f79655 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Jun 2020 19:30:02 +0200 Subject: mptcp: introduce token KUNIT self-tests Unit tests for the internal MPTCP token APIs, using KUNIT v1 -> v2: - use the correct RCU annotation when initializing icsk ulp - fix a few checkpatch issues Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/Kconfig | 2 +- net/mptcp/Makefile | 3 +- net/mptcp/token.c | 9 ++++ net/mptcp/token_test.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 net/mptcp/token_test.c (limited to 'net/mptcp') diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index d7d5f9349366..af84fce70bb0 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -26,7 +26,7 @@ config MPTCP_KUNIT_TESTS depends on KUNIT default KUNIT_ALL_TESTS help - Currently covers the MPTCP crypto helpers. + Currently covers the MPTCP crypto and token helpers. Only useful for kernel devs running KUnit test harness and are not for inclusion into a production build. 
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index f9039804207b..c53f9b845523 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -5,4 +5,5 @@ mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ mib.o pm_netlink.o mptcp_crypto_test-objs := crypto_test.o -obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o \ No newline at end of file +mptcp_token_test-objs := token_test.o +obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 9c0771774815..66a4990bd897 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -307,3 +307,12 @@ void __init mptcp_token_init(void) spin_lock_init(&token_hash[i].lock); } } + +#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS) +EXPORT_SYMBOL_GPL(mptcp_token_new_request); +EXPORT_SYMBOL_GPL(mptcp_token_new_connect); +EXPORT_SYMBOL_GPL(mptcp_token_accept); +EXPORT_SYMBOL_GPL(mptcp_token_get_sock); +EXPORT_SYMBOL_GPL(mptcp_token_destroy_request); +EXPORT_SYMBOL_GPL(mptcp_token_destroy); +#endif diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c new file mode 100644 index 000000000000..e1bd6f0a0676 --- /dev/null +++ b/net/mptcp/token_test.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "protocol.h" + +static struct mptcp_subflow_request_sock *build_req_sock(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req; + + req = kunit_kzalloc(test, sizeof(struct mptcp_subflow_request_sock), + GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, req); + mptcp_token_init_request((struct request_sock *)req); + return req; +} + +static void mptcp_token_test_req_basic(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req = build_req_sock(test); + struct mptcp_sock *null_msk = NULL; + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_request((struct request_sock *)req)); + KUNIT_EXPECT_NE(test, 0, (int)req->token); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(req->token)); + + /* cleanup */ + mptcp_token_destroy_request((struct request_sock *)req); +} + +static struct inet_connection_sock *build_icsk(struct kunit *test) +{ + struct inet_connection_sock *icsk; + + icsk = kunit_kzalloc(test, sizeof(struct inet_connection_sock), + GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, icsk); + return icsk; +} + +static struct mptcp_subflow_context *build_ctx(struct kunit *test) +{ + struct mptcp_subflow_context *ctx; + + ctx = kunit_kzalloc(test, sizeof(struct mptcp_subflow_context), + GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, ctx); + return ctx; +} + +static struct mptcp_sock *build_msk(struct kunit *test) +{ + struct mptcp_sock *msk; + + msk = kunit_kzalloc(test, sizeof(struct mptcp_sock), GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk); + refcount_set(&((struct sock *)msk)->sk_refcnt, 1); + return msk; +} + +static void mptcp_token_test_msk_basic(struct kunit *test) +{ + struct inet_connection_sock *icsk = build_icsk(test); + struct mptcp_subflow_context *ctx = build_ctx(test); + struct mptcp_sock *msk = build_msk(test); + struct mptcp_sock *null_msk = NULL; + struct sock *sk; + + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + ctx->conn = (struct sock *)msk; + sk = (struct sock *)msk; + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_connect((struct sock *)icsk)); + KUNIT_EXPECT_NE(test, 0, (int)ctx->token); + KUNIT_EXPECT_EQ(test, ctx->token, msk->token); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(ctx->token)); + KUNIT_EXPECT_EQ(test, 2, (int)refcount_read(&sk->sk_refcnt)); + + 
mptcp_token_destroy(msk); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(ctx->token)); +} + +static void mptcp_token_test_accept(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req = build_req_sock(test); + struct mptcp_sock *msk = build_msk(test); + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_request((struct request_sock *)req)); + msk->token = req->token; + mptcp_token_accept(req, msk); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + + /* this is now a no-op */ + mptcp_token_destroy_request((struct request_sock *)req); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + + /* cleanup */ + mptcp_token_destroy(msk); +} + +static void mptcp_token_test_destroyed(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req = build_req_sock(test); + struct mptcp_sock *msk = build_msk(test); + struct mptcp_sock *null_msk = NULL; + struct sock *sk; + + sk = (struct sock *)msk; + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_request((struct request_sock *)req)); + msk->token = req->token; + mptcp_token_accept(req, msk); + + /* simulate race on removal */ + refcount_set(&sk->sk_refcnt, 0); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(msk->token)); + + /* cleanup */ + mptcp_token_destroy(msk); +} + +static struct kunit_case mptcp_token_test_cases[] = { + KUNIT_CASE(mptcp_token_test_req_basic), + KUNIT_CASE(mptcp_token_test_msk_basic), + KUNIT_CASE(mptcp_token_test_accept), + KUNIT_CASE(mptcp_token_test_destroyed), + {} +}; + +static struct kunit_suite mptcp_token_suite = { + .name = "mptcp-token", + .test_cases = mptcp_token_test_cases, +}; + +kunit_test_suite(mptcp_token_suite); + +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From e1ff9e82e2ea53d01540692a85c16a77e1089537 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 29 Jun 2020 22:26:20 +0200 Subject: net: mptcp: improve fallback to TCP Keep using MPTCP sockets and a use "dummy mapping" in case of fallback to regular TCP. When fallback is triggered, skip addition of the MPTCP option on send. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/11 Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/22 Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Davide Caratti Reviewed-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/mptcp/options.c | 9 ++++- net/mptcp/protocol.c | 98 ++++++++++++++-------------------------------------- net/mptcp/protocol.h | 33 ++++++++++++++++++ net/mptcp/subflow.c | 47 ++++++++++++++++--------- 4 files changed, 98 insertions(+), 89 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index df9a51425c6f..b96d3660562f 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -624,6 +624,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, opts->suboptions = 0; + if (unlikely(mptcp_check_fallback(sk))) + return false; + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, @@ -714,7 +717,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, */ if (!mp_opt->mp_capable) { subflow->mp_capable = 0; - tcp_sk(sk)->is_mptcp = 0; + pr_fallback(msk); + __mptcp_do_fallback(msk); return false; } @@ -814,6 +818,9 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; + if (__mptcp_check_fallback(msk)) + return; + mptcp_get_options(skb, &mp_opt); if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index be09fd525f8f..84ae96be9837 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -52,11 +52,6 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) return msk->subflow; } -static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk) -{ - return msk->first && !sk_is_mptcp(msk->first); -} - static struct socket *mptcp_is_tcpsk(struct sock *sk) { struct socket *sock = sk->sk_socket; @@ -94,7 +89,7 @@ static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) if (unlikely(sock)) return sock; - if (likely(!__mptcp_needs_tcp_fallback(msk))) + if (likely(!__mptcp_check_fallback(msk))) return NULL; return msk->subflow; @@ -133,6 +128,11 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) list_add(&subflow->node, &msk->conn_list); subflow->request_mptcp = 1; + /* accept() will wait on first subflow sk_wq, and we always wakes up + * via msk->sk_socket + */ + RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq); + set_state: if (state != MPTCP_SAME_STATE) inet_sk_state_store(sk, state); @@ -229,6 +229,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, if (!skb) break; + if (__mptcp_check_fallback(msk)) { + /* if we are running under the workqueue, TCP could have + * collapsed skbs between dummy map creation and now + * be sure to adjust the size + */ + map_remaining = skb->len; + subflow->map_data_len = skb->len; + } + offset = seq - TCP_SKB_CB(skb)->seq; fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (fin) { @@ -466,8 +475,15 @@ static void mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; - u64 snd_una = atomic64_read(&msk->snd_una); bool cleaned = false; + u64 snd_una; + + /* on fallback we just need to ignore snd_una, as this is really + * plain TCP + */ + if (__mptcp_check_fallback(msk)) + atomic64_set(&msk->snd_una, msk->write_seq); + snd_una = atomic64_read(&msk->snd_una); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) @@ -740,7 +756,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int 
mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; - struct socket *ssock; size_t copied = 0; struct sock *ssk; bool tx_ok; @@ -759,15 +774,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; } -fallback: - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) { - release_sock(sk); - pr_debug("fallback passthrough"); - ret = sock_sendmsg(ssock, msg); - return ret >= 0 ? ret + copied : (copied ? copied : ret); - } - pfrag = sk_page_frag(sk); restart: mptcp_clean_una(sk); @@ -819,17 +825,6 @@ wait_for_sndbuf: } break; } - if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) { - /* Can happen for passive sockets: - * 3WHS negotiated MPTCP, but first packet after is - * plain TCP (e.g. due to middlebox filtering unknown - * options). - * - * Fall back to TCP. - */ - release_sock(ssk); - goto fallback; - } copied += ret; @@ -972,7 +967,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; int copied = 0; int target; long timeo; @@ -981,16 +975,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return -EOPNOTSUPP; lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) { -fallback: - release_sock(sk); - pr_debug("fallback-read subflow=%p", - mptcp_subflow_ctx(ssock->sk)); - copied = sock_recvmsg(ssock, msg, flags); - return copied; - } - timeo = sock_rcvtimeo(sk, nonblock); len = min_t(size_t, len, INT_MAX); @@ -1056,9 +1040,6 @@ fallback: pr_debug("block timeout %ld", timeo); mptcp_wait_data(sk, &timeo); - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) - goto fallback; } if (skb_queue_empty(&sk->sk_receive_queue)) { @@ -1335,8 +1316,6 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how, break; } - /* Wake up anyone sleeping in poll. 
*/ - ssk->sk_state_change(ssk); release_sock(ssk); } @@ -1660,12 +1639,6 @@ void mptcp_finish_connect(struct sock *ssk) sk = subflow->conn; msk = mptcp_sk(sk); - if (!subflow->mp_capable) { - MPTCP_INC_STATS(sock_net(sk), - MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); - return; - } - pr_debug("msk=%p, token=%u", sk, subflow->token); mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); @@ -1971,23 +1944,10 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, { struct sock *sk = sock->sk; struct mptcp_sock *msk; - struct socket *ssock; __poll_t mask = 0; msk = mptcp_sk(sk); - lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); - if (!ssock) - ssock = __mptcp_nmpc_socket(msk); - if (ssock) { - mask = ssock->ops->poll(file, ssock, wait); - release_sock(sk); - return mask; - } - - release_sock(sk); sock_poll_wait(file, sock, wait); - lock_sock(sk); if (test_bit(MPTCP_DATA_READY, &msk->flags)) mask = EPOLLIN | EPOLLRDNORM; @@ -1997,8 +1957,6 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - release_sock(sk); - return mask; } @@ -2006,18 +1964,11 @@ static int mptcp_shutdown(struct socket *sock, int how) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct mptcp_subflow_context *subflow; - struct socket *ssock; int ret = 0; pr_debug("sk=%p, how=%d", msk, how); lock_sock(sock->sk); - ssock = __mptcp_tcp_fallback(msk); - if (ssock) { - release_sock(sock->sk); - return inet_shutdown(ssock, how); - } - if (how == SHUT_WR || how == SHUT_RDWR) inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); @@ -2043,6 +1994,9 @@ static int mptcp_shutdown(struct socket *sock, int how) mptcp_subflow_shutdown(tcp_sk, how, 1, msk->write_seq); } + /* Wake up anyone sleeping in poll. 
*/ + sock->sk->sk_state_change(sock->sk); + out_unlock: release_sock(sock->sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index c05552e5fa23..a709df659ae0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -89,6 +89,7 @@ #define MPTCP_SEND_SPACE 1 #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 +#define MPTCP_FALLBACK_DONE 4 struct mptcp_options_received { u64 sndr_key; @@ -457,4 +458,36 @@ static inline bool before64(__u64 seq1, __u64 seq2) void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); +static inline bool __mptcp_check_fallback(struct mptcp_sock *msk) +{ + return test_bit(MPTCP_FALLBACK_DONE, &msk->flags); +} + +static inline bool mptcp_check_fallback(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + return __mptcp_check_fallback(msk); +} + +static inline void __mptcp_do_fallback(struct mptcp_sock *msk) +{ + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) { + pr_debug("TCP fallback already done (msk=%p)", msk); + return; + } + set_bit(MPTCP_FALLBACK_DONE, &msk->flags); +} + +static inline void mptcp_do_fallback(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + __mptcp_do_fallback(msk); +} + +#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 102db8c88e97..cb8a42ff4646 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -216,7 +216,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_options_received mp_opt; struct sock *parent = subflow->conn; - struct tcp_sock *tp = tcp_sk(sk); subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); @@ -230,6 +229,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) return; subflow->conn_finished = 1; + subflow->ssn_offset = TCP_SKB_CB(skb)->seq; + pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp && mp_opt.mp_capable) { @@ -245,21 +246,20 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, subflow->thmac, subflow->remote_nonce); } else { - tp->is_mptcp = 0; + if (subflow->request_mptcp) + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); + mptcp_do_fallback(sk); + pr_fallback(mptcp_sk(subflow->conn)); } - if (!tp->is_mptcp) + if (mptcp_check_fallback(sk)) return; if (subflow->mp_capable) { pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), subflow->remote_key); mptcp_finish_connect(sk); - - if (skb) { - pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); - subflow->ssn_offset = TCP_SKB_CB(skb)->seq; - } } else if (subflow->mp_join) { u8 hmac[SHA256_DIGEST_SIZE]; @@ -279,9 +279,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); - if (skb) - subflow->ssn_offset = TCP_SKB_CB(skb)->seq; - if (!mptcp_finish_join(sk)) goto do_reset; @@ -557,7 +554,8 @@ enum mapping_status { MAPPING_OK, MAPPING_INVALID, MAPPING_EMPTY, - MAPPING_DATA_FIN + MAPPING_DATA_FIN, + MAPPING_DUMMY }; static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) @@ -621,6 +619,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk) 
if (!skb) return MAPPING_EMPTY; + if (mptcp_check_fallback(ssk)) + return MAPPING_DUMMY; + mpext = mptcp_get_ext(skb); if (!mpext || !mpext->use_map) { if (!subflow->map_valid && !skb->len) { @@ -762,6 +763,16 @@ static bool subflow_check_data_avail(struct sock *ssk) ssk->sk_err = EBADMSG; goto fatal; } + if (status == MAPPING_DUMMY) { + __mptcp_do_fallback(msk); + skb = skb_peek(&ssk->sk_receive_queue); + subflow->map_valid = 1; + subflow->map_seq = READ_ONCE(msk->ack_seq); + subflow->map_data_len = skb->len; + subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - + subflow->ssn_offset; + return true; + } if (status != MAPPING_OK) return false; @@ -885,14 +896,18 @@ static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; + struct mptcp_sock *msk; - if (!subflow->mp_capable && !subflow->mp_join) { - subflow->tcp_data_ready(sk); - + msk = mptcp_sk(parent); + if (inet_sk_state_load(sk) == TCP_LISTEN) { + set_bit(MPTCP_DATA_READY, &msk->flags); parent->sk_data_ready(parent); return; } + WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && + !subflow->mp_join); + if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); } @@ -1117,7 +1132,7 @@ static void subflow_state_change(struct sock *sk) * a fin packet carrying a DSS can be unnoticed if we don't trigger * the data available machinery here. */ - if (subflow->mp_capable && mptcp_subflow_data_available(sk)) + if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); if (!(parent->sk_shutdown & RCV_SHUTDOWN) && -- cgit v1.2.3 From 8fd738049ac3d67a937d36577763b47180aae1ad Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 29 Jun 2020 22:26:21 +0200 Subject: mptcp: fallback in case of simultaneous connect when a MPTCP client tries to connect to itself, tcp_finish_connect() is never reached. Because of this, depending on the socket current state, multiple faulty behaviours can be observed: 1) a WARN_ON() in subflow_data_ready() is hit WARNING: CPU: 2 PID: 882 at net/mptcp/subflow.c:911 subflow_data_ready+0x18b/0x230 [...] CPU: 2 PID: 882 Comm: gh35 Not tainted 5.7.0+ #187 [...] RIP: 0010:subflow_data_ready+0x18b/0x230 [...] 
Call Trace: tcp_data_queue+0xd2f/0x4250 tcp_rcv_state_process+0xb1c/0x49d3 tcp_v4_do_rcv+0x2bc/0x790 __release_sock+0x153/0x2d0 release_sock+0x4f/0x170 mptcp_shutdown+0x167/0x4e0 __sys_shutdown+0xe6/0x180 __x64_sys_shutdown+0x50/0x70 do_syscall_64+0x9a/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 2) client is stuck forever in mptcp_sendmsg() because the socket is not TCP_ESTABLISHED crash> bt 4847 PID: 4847 TASK: ffff88814b2fb100 CPU: 1 COMMAND: "gh35" #0 [ffff8881376ff680] __schedule at ffffffff97248da4 #1 [ffff8881376ff778] schedule at ffffffff9724a34f #2 [ffff8881376ff7a0] schedule_timeout at ffffffff97252ba0 #3 [ffff8881376ff8a8] wait_woken at ffffffff958ab4ba #4 [ffff8881376ff940] sk_stream_wait_connect at ffffffff96c2d859 #5 [ffff8881376ffa28] mptcp_sendmsg at ffffffff97207fca #6 [ffff8881376ffbc0] sock_sendmsg at ffffffff96be1b5b #7 [ffff8881376ffbe8] sock_write_iter at ffffffff96be1daa #8 [ffff8881376ffce8] new_sync_write at ffffffff95e5cb52 #9 [ffff8881376ffe50] vfs_write at ffffffff95e6547f #10 [ffff8881376ffe90] ksys_write at ffffffff95e65d26 #11 [ffff8881376fff28] do_syscall_64 at ffffffff956088ba #12 [ffff8881376fff50] entry_SYSCALL_64_after_hwframe at ffffffff9740008c RIP: 00007f126f6956ed RSP: 00007ffc2a320278 RFLAGS: 00000217 RAX: ffffffffffffffda RBX: 0000000020000044 RCX: 00007f126f6956ed RDX: 0000000000000004 RSI: 00000000004007b8 RDI: 0000000000000003 RBP: 00007ffc2a3202a0 R8: 0000000000400720 R9: 0000000000400720 R10: 0000000000400720 R11: 0000000000000217 R12: 00000000004004b0 R13: 00007ffc2a320380 R14: 0000000000000000 R15: 0000000000000000 ORIG_RAX: 0000000000000001 CS: 0033 SS: 002b 3) tcpdump captures show that DSS is exchanged even when MP_CAPABLE handshake didn't complete. $ tcpdump -tnnr bad.pcap IP 127.0.0.1.20000 > 127.0.0.1.20000: Flags [S], seq 3208913911, win 65483, options [mss 65495,sackOK,TS val 3291706876 ecr 3291694721,nop,wscale 7,mptcp capable v1], length 0 IP 127.0.0.1.20000 > 127.0.0.1.20000: Flags [S.], seq 3208913911, ack 3208913912, win 65483, options [mss 65495,sackOK,TS val 3291706876 ecr 3291706876,nop,wscale 7,mptcp capable v1], length 0 IP 127.0.0.1.20000 > 127.0.0.1.20000: Flags [.], ack 1, win 512, options [nop,nop,TS val 3291706876 ecr 3291706876], length 0 IP 127.0.0.1.20000 > 127.0.0.1.20000: Flags [F.], seq 1, ack 1, win 512, options [nop,nop,TS val 3291707876 ecr 3291706876,mptcp dss fin seq 0 subseq 0 len 1,nop,nop], length 0 IP 127.0.0.1.20000 > 127.0.0.1.20000: Flags [.], ack 2, win 512, options [nop,nop,TS val 3291707876 ecr 3291707876], length 0 force a fallback to TCP in these cases, and adjust the main socket state to avoid hanging in mptcp_sendmsg(). Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/35 Reported-by: Christoph Paasch Suggested-by: Paolo Abeni Signed-off-by: Davide Caratti Reviewed-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/mptcp/protocol.h | 10 ++++++++++ net/mptcp/subflow.c | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a709df659ae0..1d05d9841b5c 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -490,4 +490,14 @@ static inline void mptcp_do_fallback(struct sock *sk) #define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) +static inline bool subflow_simultaneous_connect(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = subflow->conn; + + return sk->sk_state == TCP_ESTABLISHED && + !mptcp_sk(parent)->pm.server_side && + !subflow->conn_finished; +} + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index cb8a42ff4646..548f9e347ff5 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1128,6 +1128,16 @@ static void subflow_state_change(struct sock *sk) __subflow_state_change(sk); + if (subflow_simultaneous_connect(sk)) { + mptcp_do_fallback(sk); + pr_fallback(mptcp_sk(parent)); + subflow->conn_finished = 1; + if (inet_sk_state_load(parent) == TCP_SYN_SENT) { + inet_sk_state_store(parent, TCP_ESTABLISHED); + parent->sk_state_change(parent); + } + } + /* as recvmsg() does not acquire the subflow socket for ssk selection * a fin packet carrying a DSS can be unnoticed if we don't trigger * the data available machinery here. -- cgit v1.2.3 From d2f77c53342e3e13ce1fd7f9638ee200026e14f4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 29 Jun 2020 22:26:22 +0200 Subject: mptcp: check for plain TCP sock at accept time This cleanup the code a bit and avoid corrupted states on weird syscall sequence (accept(), connect()). Signed-off-by: Paolo Abeni Signed-off-by: Davide Caratti Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 69 ++++++---------------------------------------------- 1 file changed, 7 insertions(+), 62 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 84ae96be9837..dbeb6fe374f5 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -52,13 +52,10 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) return msk->subflow; } -static struct socket *mptcp_is_tcpsk(struct sock *sk) +static bool mptcp_is_tcpsk(struct sock *sk) { struct socket *sock = sk->sk_socket; - if (sock->sk != sk) - return NULL; - if (unlikely(sk->sk_prot == &tcp_prot)) { /* we are being invoked after mptcp_accept() has * accepted a non-mp-capable flow: sk is a tcp_sk, @@ -68,27 +65,21 @@ static struct socket *mptcp_is_tcpsk(struct sock *sk) * bypass mptcp. 
*/ sock->ops = &inet_stream_ops; - return sock; + return true; #if IS_ENABLED(CONFIG_MPTCP_IPV6) } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { sock->ops = &inet6_stream_ops; - return sock; + return true; #endif } - return NULL; + return false; } static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) { - struct socket *sock; - sock_owned_by_me((const struct sock *)msk); - sock = mptcp_is_tcpsk((struct sock *)msk); - if (unlikely(sock)) - return sock; - if (likely(!__mptcp_check_fallback(msk))) return NULL; @@ -1466,7 +1457,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, return NULL; pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); - if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; @@ -1821,42 +1811,6 @@ unlock: return err; } -static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) -{ - if (sock->sk->sk_prot == &tcp_prot) { - /* we are being invoked from __sys_accept4, after - * mptcp_accept() has just accepted a non-mp-capable - * flow: sk is a tcp_sk, not an mptcp one. - * - * Hand the socket over to tcp so all further socket ops - * bypass mptcp. - */ - sock->ops = &inet_stream_ops; - } - - return inet_getname(sock, uaddr, peer); -} - -#if IS_ENABLED(CONFIG_MPTCP_IPV6) -static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) -{ - if (sock->sk->sk_prot == &tcpv6_prot) { - /* we are being invoked from __sys_accept4 after - * mptcp_accept() has accepted a non-mp-capable - * subflow: sk is a tcp_sk, not mptcp. - * - * Hand the socket over to tcp so all further - * socket ops bypass mptcp. - */ - sock->ops = &inet6_stream_ops; - } - - return inet6_getname(sock, uaddr, peer); -} -#endif - static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); @@ -1885,15 +1839,6 @@ unlock: return err; } -static bool is_tcp_proto(const struct proto *p) -{ -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - return p == &tcp_prot || p == &tcpv6_prot; -#else - return p == &tcp_prot; -#endif -} - static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { @@ -1915,7 +1860,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, release_sock(sock->sk); err = ssock->ops->accept(sock, newsock, flags, kern); - if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) { + if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { struct mptcp_sock *msk = mptcp_sk(newsock->sk); struct mptcp_subflow_context *subflow; @@ -2011,7 +1956,7 @@ static const struct proto_ops mptcp_stream_ops = { .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, - .getname = mptcp_v4_getname, + .getname = inet_getname, .poll = mptcp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, @@ -2065,7 +2010,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, - .getname = mptcp_v6_getname, + .getname = inet6_getname, .poll = mptcp_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, -- cgit v1.2.3 From fa68018dc45e3faee9d866d5dc484d141e8f1093 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 29 Jun 2020 22:26:23 +0200 Subject: mptcp: create first subflow at msk creation time This cleans the code a bit and makes the behavior more consistent. Signed-off-by: Paolo Abeni Signed-off-by: Davide Caratti Reviewed-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 53 ++++++++++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 33 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index dbeb6fe374f5..ad619bda71cc 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -86,32 +86,16 @@ static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) return msk->subflow; } -static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) -{ - return !msk->first; -} - -static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) +static int __mptcp_socket_create(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; struct socket *ssock; int err; - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) - return ssock; - - ssock = __mptcp_nmpc_socket(msk); - if (ssock) - goto set_state; - - if (!__mptcp_can_create_subflow(msk)) - return ERR_PTR(-EINVAL); - err = mptcp_subflow_create_socket(sk, &ssock); if (err) - return ERR_PTR(err); + return err; msk->first = ssock->sk; msk->subflow = ssock; @@ -124,10 +108,7 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) */ RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq); -set_state: - if (state != MPTCP_SAME_STATE) - inet_sk_state_store(sk, state); - return ssock; + return 0; } static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, @@ -1255,6 +1236,10 @@ static int mptcp_init_sock(struct sock *sk) if (ret) return ret; + ret = __mptcp_socket_create(mptcp_sk(sk)); + if (ret) + return ret; + sk_sockets_allocated_inc(sk); sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2]; @@ -1744,9 +1729,9 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int err; lock_sock(sock->sk); - ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + err = -EINVAL; goto unlock; } @@ -1776,13 +1761,14 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto do_connect; } - mptcp_token_destroy(msk); - ssock = __mptcp_socket_create(msk, TCP_SYN_SENT); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + err = -EINVAL; goto unlock; } + mptcp_token_destroy(msk); + inet_sk_state_store(sock->sk, TCP_SYN_SENT); subflow = mptcp_subflow_ctx(ssock->sk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of @@ -1820,13 +1806,14 @@ static int mptcp_listen(struct socket *sock, int backlog) pr_debug("msk=%p", msk); lock_sock(sock->sk); - mptcp_token_destroy(msk); - ssock = __mptcp_socket_create(msk, TCP_LISTEN); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + err = -EINVAL; goto unlock; } + mptcp_token_destroy(msk); + inet_sk_state_store(sock->sk, TCP_LISTEN); sock_set_flag(sock->sk, SOCK_RCU_FREE); err = ssock->ops->listen(ssock, backlog); -- cgit v1.2.3 From 76660afbb7a1ac6bef0be34c4c6e76d7e07b74d7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 29 Jun 2020 22:26:24 +0200 Subject: mptcp: __mptcp_tcp_fallback() returns a struct sock Currently __mptcp_tcp_fallback() always return NULL on incoming connections, because MPTCP does not create the additional socket for the first subflow. 
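The reworked helper ends up as a simple two-line test; roughly, as an illustration only (the comments are added here for clarity, the real hunk is further down):

    /* Sketch: why returning the first subflow sock covers both cases.
     * An msk created via connect()/listen() owns an extra struct socket
     * (msk->subflow) wrapping the first subflow, but an msk built by
     * accept() does not, so msk->subflow is NULL there.  The first
     * subflow's struct sock (msk->first) exists in both cases.
     */
    static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
    {
            sock_owned_by_me((const struct sock *)msk);

            if (likely(!__mptcp_check_fallback(msk)))
                    return NULL;    /* no fallback: keep the MPTCP paths */

            return msk->first;      /* set for outgoing and accepted msks */
    }

Callers such as mptcp_setsockopt()/mptcp_getsockopt() can then relay options straight to tcp_setsockopt()/tcp_getsockopt() on the returned sock, as the diff below shows.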
Since the previous commit no __mptcp_tcp_fallback() caller needs a struct socket, so let __mptcp_tcp_fallback() return the first subflow sock and cope correctly even with incoming connections. Signed-off-by: Paolo Abeni Signed-off-by: Davide Caratti Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ad619bda71cc..f2b2bd37e371 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -76,14 +76,14 @@ static bool mptcp_is_tcpsk(struct sock *sk) return false; } -static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) +static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { sock_owned_by_me((const struct sock *)msk); if (likely(!__mptcp_check_fallback(msk))) return NULL; - return msk->subflow; + return msk->first; } static int __mptcp_socket_create(struct mptcp_sock *msk) @@ -1498,7 +1498,7 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; + struct sock *ssk; pr_debug("msk=%p", msk); @@ -1509,11 +1509,10 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname, * to the one remaining subflow. */ lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); + ssk = __mptcp_tcp_fallback(msk); release_sock(sk); - if (ssock) - return tcp_setsockopt(ssock->sk, level, optname, optval, - optlen); + if (ssk) + return tcp_setsockopt(ssk, level, optname, optval, optlen); return -EOPNOTSUPP; } @@ -1522,7 +1521,7 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; + struct sock *ssk; pr_debug("msk=%p", msk); @@ -1533,11 +1532,10 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, * to the one remaining subflow. */ lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); + ssk = __mptcp_tcp_fallback(msk); release_sock(sk); - if (ssock) - return tcp_getsockopt(ssock->sk, level, optname, optval, - option); + if (ssk) + return tcp_getsockopt(ssk, level, optname, optval, option); return -EOPNOTSUPP; } -- cgit v1.2.3 From 8a05661b2b266b6dc45af255b3037b00ef31d85d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 29 Jun 2020 22:26:25 +0200 Subject: mptcp: close poll() races mptcp_poll always return POLLOUT for unblocking connect(), ensure that the socket is a suitable state. The MPTCP_DATA_READY bit is never cleared on accept: ensure we don't leave mptcp_accept() with an empty accept queue and such bit set. Signed-off-by: Paolo Abeni Signed-off-by: Davide Caratti Reviewed-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f2b2bd37e371..28ec26d97f96 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1841,6 +1841,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssock) goto unlock_fail; + clear_bit(MPTCP_DATA_READY, &msk->flags); sock_hold(ssock->sk); release_sock(sock->sk); @@ -1861,6 +1862,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, } } + if (inet_csk_listen_poll(ssock->sk)) + set_bit(MPTCP_DATA_READY, &msk->flags); sock_put(ssock->sk); return err; @@ -1869,21 +1872,33 @@ unlock_fail: return -EINVAL; } +static __poll_t mptcp_check_readable(struct mptcp_sock *msk) +{ + return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : + 0; +} + static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { struct sock *sk = sock->sk; struct mptcp_sock *msk; __poll_t mask = 0; + int state; msk = mptcp_sk(sk); sock_poll_wait(file, sock, wait); - if (test_bit(MPTCP_DATA_READY, &msk->flags)) - mask = EPOLLIN | EPOLLRDNORM; - if (sk_stream_is_writeable(sk) && - test_bit(MPTCP_SEND_SPACE, &msk->flags)) - mask |= EPOLLOUT | EPOLLWRNORM; + state = inet_sk_state_load(sk); + if (state == TCP_LISTEN) + return mptcp_check_readable(msk); + + if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { + mask |= mptcp_check_readable(msk); + if (sk_stream_is_writeable(sk) && + test_bit(MPTCP_SEND_SPACE, &msk->flags)) + mask |= EPOLLOUT | EPOLLWRNORM; + } if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; -- cgit v1.2.3 From 6bad912b7e5ab51c23d8fa8362ca2d4ceeebdb74 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Jun 2020 16:38:26 +0200 Subject: mptcp: do nonce initialization at subflow creation time This clean-up the code a bit, reduces the number of used hooks and indirect call requested, and allow better error reporting from __mptcp_subflow_connect() Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/mptcp/subflow.c | 54 ++++++++++++++++++++--------------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 548f9e347ff5..664aa9158363 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -29,34 +29,6 @@ static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); } -static int subflow_rebuild_header(struct sock *sk) -{ - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - int local_id; - - if (subflow->request_join && !subflow->local_nonce) { - struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn; - - pr_debug("subflow=%p", sk); - - do { - get_random_bytes(&subflow->local_nonce, sizeof(u32)); - } while (!subflow->local_nonce); - - if (subflow->local_id) - goto out; - - local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); - if (local_id < 0) - return -EINVAL; - - subflow->local_id = local_id; - } - -out: - return subflow->icsk_af_ops->rebuild_header(sk); -} - static void subflow_req_destructor(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); @@ -984,7 +956,9 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; struct sockaddr_storage addr; + int local_id = loc->id; struct socket *sf; + struct sock *ssk; u32 remote_token; int addrlen; int err; @@ -996,7 +970,20 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, if (err) return err; - subflow = mptcp_subflow_ctx(sf->sk); + ssk = sf->sk; + subflow = mptcp_subflow_ctx(ssk); + do { + get_random_bytes(&subflow->local_nonce, sizeof(u32)); + } while (!subflow->local_nonce); + + if (!local_id) { + err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk); + if (err < 0) + goto failed; + + local_id = err; + } + subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; subflow->token = msk->token; @@ -1007,15 +994,16 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, if (loc->family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif - sf->sk->sk_bound_dev_if = ifindex; + ssk->sk_bound_dev_if = ifindex; err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); if (err) goto failed; mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); - pr_debug("msk=%p remote_token=%u", msk, remote_token); + pr_debug("msk=%p remote_token=%u local_id=%d", msk, remote_token, + local_id); subflow->remote_token = remote_token; - subflow->local_id = loc->id; + subflow->local_id = local_id; subflow->request_join = 1; subflow->request_bkup = 1; mptcp_info2sockaddr(remote, &addr); @@ -1288,7 +1276,6 @@ void __init mptcp_subflow_init(void) subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; - subflow_specific.rebuild_header = subflow_rebuild_header; #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; @@ -1298,7 +1285,6 @@ void __init mptcp_subflow_init(void) subflow_v6_specific.conn_request = subflow_v6_conn_request; subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; - subflow_v6_specific.rebuild_header = subflow_rebuild_header; subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; -- cgit v1.2.3 From 
a6b118febbab3f6454057612b355d0b667c1fafa Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 30 Jun 2020 21:24:45 +0200 Subject: mptcp: add receive buffer auto-tuning When mptcp is used, userspace doesn't read from the tcp (subflow) socket but from the parent (mptcp) socket receive queue. skbs are moved from the subflow socket to the mptcp rx queue either from 'data_ready' callback (if mptcp socket can be locked), a work queue, or the socket receive function. This means tcp_rcv_space_adjust() is never called and thus no receive buffer size auto-tuning is done. An earlier (not merged) patch added tcp_rcv_space_adjust() calls to the function that moves skbs from subflow to mptcp socket. While this enabled autotuning, it also meant tuning was done even if userspace was reading the mptcp socket very slowly. This adds mptcp_rcv_space_adjust() and calls it after userspace has read data from the mptcp socket rx queue. Its very similar to tcp_rcv_space_adjust, with two differences: 1. The rtt estimate is the largest one observed on a subflow 2. The rcvbuf size and window clamp of all subflows is adjusted to the mptcp-level rcvbuf. Otherwise, we get spurious drops at tcp (subflow) socket level if the skbs are not moved to the mptcp socket fast enough. Before: time mptcp_connect.sh -t -f $((4*1024*1024)) -d 300 -l 0.01% -r 0 -e "" -m mmap [..] ns4 MPTCP -> ns3 (10.0.3.2:10108 ) MPTCP (duration 40823ms) [ OK ] ns4 MPTCP -> ns3 (10.0.3.2:10109 ) TCP (duration 23119ms) [ OK ] ns4 TCP -> ns3 (10.0.3.2:10110 ) MPTCP (duration 5421ms) [ OK ] ns4 MPTCP -> ns3 (dead:beef:3::2:10111) MPTCP (duration 41446ms) [ OK ] ns4 MPTCP -> ns3 (dead:beef:3::2:10112) TCP (duration 23427ms) [ OK ] ns4 TCP -> ns3 (dead:beef:3::2:10113) MPTCP (duration 5426ms) [ OK ] Time: 1396 seconds After: ns4 MPTCP -> ns3 (10.0.3.2:10108 ) MPTCP (duration 5417ms) [ OK ] ns4 MPTCP -> ns3 (10.0.3.2:10109 ) TCP (duration 5427ms) [ OK ] ns4 TCP -> ns3 (10.0.3.2:10110 ) MPTCP (duration 5422ms) [ OK ] ns4 MPTCP -> ns3 (dead:beef:3::2:10111) MPTCP (duration 5415ms) [ OK ] ns4 MPTCP -> ns3 (dead:beef:3::2:10112) TCP (duration 5422ms) [ OK ] ns4 TCP -> ns3 (dead:beef:3::2:10113) MPTCP (duration 5423ms) [ OK ] Time: 296 seconds Signed-off-by: Florian Westphal Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++--- net/mptcp/protocol.h | 7 +++ net/mptcp/subflow.c | 5 ++- 3 files changed, 127 insertions(+), 8 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 28ec26d97f96..fa137a9c42d1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -179,13 +179,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, return false; } - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf); - - if (rcvbuf > sk->sk_rcvbuf) - sk->sk_rcvbuf = rcvbuf; - } - tp = tcp_sk(ssk); do { u32 map_remaining, offset; @@ -916,6 +909,100 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, return copied; } +/* receive buffer autotuning. See tcp_rcv_space_adjust for more information. + * + * Only difference: Use highest rtt estimate of the subflows in use. 
+ */ +static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + u32 time, advmss = 1; + u64 rtt_us, mstamp; + + sock_owned_by_me(sk); + + if (copied <= 0) + return; + + msk->rcvq_space.copied += copied; + + mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); + time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); + + rtt_us = msk->rcvq_space.rtt_us; + if (rtt_us && time < (rtt_us >> 3)) + return; + + rtt_us = 0; + mptcp_for_each_subflow(msk, subflow) { + const struct tcp_sock *tp; + u64 sf_rtt_us; + u32 sf_advmss; + + tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); + + sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); + sf_advmss = READ_ONCE(tp->advmss); + + rtt_us = max(sf_rtt_us, rtt_us); + advmss = max(sf_advmss, advmss); + } + + msk->rcvq_space.rtt_us = rtt_us; + if (time < (rtt_us >> 3) || rtt_us == 0) + return; + + if (msk->rcvq_space.copied <= msk->rcvq_space.space) + goto new_measure; + + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int rcvmem, rcvbuf; + u64 rcvwin, grow; + + rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; + + grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); + + do_div(grow, msk->rcvq_space.space); + rcvwin += (grow << 1); + + rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); + while (tcp_win_from_space(sk, rcvmem) < advmss) + rcvmem += 128; + + do_div(rcvwin, advmss); + rcvbuf = min_t(u64, rcvwin * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + + if (rcvbuf > sk->sk_rcvbuf) { + u32 window_clamp; + + window_clamp = tcp_win_from_space(sk, rcvbuf); + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + + /* Make subflows follow along. If we do not do this, we + * get drops at subflow level if skbs can't be moved to + * the mptcp rx queue fast enough (announced rcv_win can + * exceed ssk->sk_rcvbuf). 
+ */ + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk; + + ssk = mptcp_subflow_tcp_sock(subflow); + WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); + tcp_sk(ssk)->window_clamp = window_clamp; + } + } + } + + msk->rcvq_space.space = msk->rcvq_space.copied; +new_measure: + msk->rcvq_space.copied = 0; + msk->rcvq_space.time = mstamp; +} + static bool __mptcp_move_skbs(struct mptcp_sock *msk) { unsigned int moved = 0; @@ -1028,6 +1115,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, set_bit(MPTCP_DATA_READY, &msk->flags); } out_err: + mptcp_rcv_space_adjust(msk, copied); + release_sock(sk); return copied; } @@ -1241,6 +1330,7 @@ static int mptcp_init_sock(struct sock *sk) return ret; sk_sockets_allocated_inc(sk); + sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2]; return 0; @@ -1423,6 +1513,22 @@ struct sock *mptcp_sk_clone(const struct sock *sk, return nsk; } +void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) +{ + const struct tcp_sock *tp = tcp_sk(ssk); + + msk->rcvq_space.copied = 0; + msk->rcvq_space.rtt_us = 0; + + msk->rcvq_space.time = tp->tcp_mstamp; + + /* initial rcv_space offering made to peer */ + msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, + TCP_INIT_CWND * tp->advmss); + if (msk->rcvq_space.space == 0) + msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; +} + static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, bool kern) { @@ -1471,6 +1577,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, list_add(&subflow->node, &msk->conn_list); inet_sk_state_store(newsk, TCP_ESTABLISHED); + mptcp_rcv_space_init(msk, ssk); bh_unlock_sock(new_mptcp_sock); __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); @@ -1631,6 +1738,8 @@ void mptcp_finish_connect(struct sock *ssk) atomic64_set(&msk->snd_una, msk->write_seq); mptcp_pm_new_connection(msk, 0); + + mptcp_rcv_space_init(msk, ssk); } static void mptcp_sock_graft(struct sock *sk, struct socket *parent) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 1d05d9841b5c..a6412ff0fddb 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -209,6 +209,12 @@ struct mptcp_sock { struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct sock *first; struct mptcp_pm_data pm; + struct { + u32 space; /* bytes copied in last measurement window */ + u32 copied; /* bytes copied in this measurement window */ + u64 time; /* start time of measurement window */ + u64 rtt_us; /* last maximum rtt of subflows */ + } rcvq_space; }; #define mptcp_for_each_subflow(__msk, __subflow) \ @@ -369,6 +375,7 @@ void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); +void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); void mptcp_data_acked(struct sock *sk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 664aa9158363..e1e19c76e267 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -225,8 +225,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) pr_fallback(mptcp_sk(subflow->conn)); } - if (mptcp_check_fallback(sk)) + if (mptcp_check_fallback(sk)) { + mptcp_rcv_space_init(mptcp_sk(parent), sk); return; + } if (subflow->mp_capable) { pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), @@ -1118,6 +1120,7 
@@ static void subflow_state_change(struct sock *sk) if (subflow_simultaneous_connect(sk)) { mptcp_do_fallback(sk); + mptcp_rcv_space_init(mptcp_sk(parent), sk); pr_fallback(mptcp_sk(parent)); subflow->conn_finished = 1; if (inet_sk_state_load(parent) == TCP_SYN_SENT) { -- cgit v1.2.3 From 83f0c10bc36f956102ce4a33c5fe596ae9891297 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 5 Jul 2020 01:30:15 +0200 Subject: net: use mptcp setsockopt function for SOL_SOCKET on mptcp sockets setsockopt(mptcp_fd, SOL_SOCKET, ...)... appears to work (returns 0), but it has no effect -- this is because the MPTCP layer never has a chance to copy the settings to the subflow socket. Skip the generic handling for the mptcp case and instead call the mptcp specific handler instead for SOL_SOCKET too. Next patch adds more specific handling for SOL_SOCKET to mptcp. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 3 +++ net/socket.c | 13 ++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index fa137a9c42d1..320f306ea85c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1609,6 +1609,9 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname, pr_debug("msk=%p", msk); + if (level == SOL_SOCKET) + return sock_setsockopt(sk->sk_socket, level, optname, optval, optlen); + /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the * MPTCP-level socket to configure the subflows until the subflow diff --git a/net/socket.c b/net/socket.c index 976426d03f09..d87812a9ed4b 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2080,6 +2080,17 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); } +static bool sock_use_custom_sol_socket(const struct socket *sock) +{ + const struct sock *sk = sock->sk; + + /* Use sock->ops->setsockopt() for MPTCP */ + return IS_ENABLED(CONFIG_MPTCP) && + sk->sk_protocol == IPPROTO_MPTCP && + sk->sk_type == SOCK_STREAM && + (sk->sk_family == AF_INET || sk->sk_family == AF_INET6); +} + /* * Set a socket option. Because we don't know the option lengths we have * to pass the user mode parameter for the protocols to sort out. @@ -2118,7 +2129,7 @@ static int __sys_setsockopt(int fd, int level, int optname, optval = (char __user __force *)kernel_optval; } - if (level == SOL_SOCKET) + if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock)) err = sock_setsockopt(sock, level, optname, optval, optlen); -- cgit v1.2.3 From fd1452d8ef988d228f5265147fde1017084404e4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 5 Jul 2020 01:30:16 +0200 Subject: mptcp: add REUSEADDR/REUSEPORT support This will e.g. make 'sshd restart' work when MPTCP is used, as we will now set this option on the listener socket instead of only the mptcp socket (where it has no effect). We still need to copy the setting to the master socket so that a subsequent getsockopt() returns the expected value. Reported-by: Christoph Paasch Suggested-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 320f306ea85c..612f6d49f1bb 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1601,6 +1601,37 @@ static void mptcp_destroy(struct sock *sk) sk_sockets_allocated_dec(sk); } +static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int ret; + + switch (optname) { + case SO_REUSEPORT: + case SO_REUSEADDR: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); + if (ret == 0) { + if (optname == SO_REUSEPORT) + sk->sk_reuseport = ssock->sk->sk_reuseport; + else if (optname == SO_REUSEADDR) + sk->sk_reuse = ssock->sk->sk_reuse; + } + release_sock(sk); + return ret; + } + + return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); +} + static int mptcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { @@ -1610,7 +1641,7 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname, pr_debug("msk=%p", msk); if (level == SOL_SOCKET) - return sock_setsockopt(sk->sk_socket, level, optname, optval, optlen); + return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the -- cgit v1.2.3 From c9b95a13598750e2840d99322f844ec0ff9e6246 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 5 Jul 2020 01:30:17 +0200 Subject: mptcp: support IPV6_V6ONLY setsockopt Without this, Opensshd fails to open an ipv6 socket listening socket: error: setsockopt IPV6_V6ONLY: Operation not supported error: Bind to port 22 on :: failed: Address already in use. Opensshd opens an ipv4 and and ipv6 listening socket, but because IPV6_V6ONLY setsockopt fails, the port number is already in use. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 612f6d49f1bb..3ab060e30038 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1632,6 +1632,33 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); } +static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + int ret = -EOPNOTSUPP; + struct socket *ssock; + + switch (optname) { + case IPV6_V6ONLY: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); + if (ret == 0) + sk->sk_ipv6only = ssock->sk->sk_ipv6only; + + release_sock(sk); + break; + } + + return ret; +} + static int mptcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { @@ -1655,6 +1682,9 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname, if (ssk) return tcp_setsockopt(ssk, level, optname, optval, optlen); + if (level == SOL_IPV6) + return mptcp_setsockopt_v6(msk, optname, optval, optlen); + return -EOPNOTSUPP; } -- cgit v1.2.3 From d47a72152097d7be7cfc453d205196c0aa976c33 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 6 Jul 2020 21:06:12 +0200 Subject: mptcp: fix race in subflow_data_ready() syzkaller was able to make the kernel reach subflow_data_ready() for a server subflow that was closed before subflow_finish_connect() completed. In these cases we can avoid using the path for regular/fallback MPTCP data, and just wake the main socket, to avoid the following warning: WARNING: CPU: 0 PID: 9370 at net/mptcp/subflow.c:885 subflow_data_ready+0x1e6/0x290 net/mptcp/subflow.c:885 Kernel panic - not syncing: panic_on_warn set ... 
CPU: 0 PID: 9370 Comm: syz-executor.0 Not tainted 5.7.0 #106 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xb7/0xfe lib/dump_stack.c:118 panic+0x29e/0x692 kernel/panic.c:221 __warn.cold+0x2f/0x3d kernel/panic.c:582 report_bug+0x28b/0x2f0 lib/bug.c:195 fixup_bug arch/x86/kernel/traps.c:105 [inline] fixup_bug arch/x86/kernel/traps.c:100 [inline] do_error_trap+0x10f/0x180 arch/x86/kernel/traps.c:197 do_invalid_op+0x32/0x40 arch/x86/kernel/traps.c:216 invalid_op+0x1e/0x30 arch/x86/entry/entry_64.S:1027 RIP: 0010:subflow_data_ready+0x1e6/0x290 net/mptcp/subflow.c:885 Code: 04 02 84 c0 74 06 0f 8e 91 00 00 00 41 0f b6 5e 48 31 ff 83 e3 18 89 de e8 37 ec 3d fe 84 db 0f 85 65 ff ff ff e8 fa ea 3d fe <0f> 0b e9 59 ff ff ff e8 ee ea 3d fe 48 89 ee 4c 89 ef e8 f3 77 ff RSP: 0018:ffff88811b2099b0 EFLAGS: 00010206 RAX: ffff888111197000 RBX: 0000000000000000 RCX: ffffffff82fbc609 RDX: 0000000000000100 RSI: ffffffff82fbc616 RDI: 0000000000000001 RBP: ffff8881111bc800 R08: ffff888111197000 R09: ffffed10222a82af R10: ffff888111541577 R11: ffffed10222a82ae R12: 1ffff11023641336 R13: ffff888111541000 R14: ffff88810fd4ca00 R15: ffff888111541570 tcp_child_process+0x754/0x920 net/ipv4/tcp_minisocks.c:841 tcp_v4_do_rcv+0x749/0x8b0 net/ipv4/tcp_ipv4.c:1642 tcp_v4_rcv+0x2666/0x2e60 net/ipv4/tcp_ipv4.c:1999 ip_protocol_deliver_rcu+0x29/0x1f0 net/ipv4/ip_input.c:204 ip_local_deliver_finish net/ipv4/ip_input.c:231 [inline] NF_HOOK include/linux/netfilter.h:421 [inline] ip_local_deliver+0x2da/0x390 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:441 [inline] ip_rcv_finish net/ipv4/ip_input.c:428 [inline] ip_rcv_finish net/ipv4/ip_input.c:414 [inline] NF_HOOK include/linux/netfilter.h:421 [inline] ip_rcv+0xef/0x140 net/ipv4/ip_input.c:539 __netif_receive_skb_one_core+0x197/0x1e0 net/core/dev.c:5268 __netif_receive_skb+0x27/0x1c0 net/core/dev.c:5382 process_backlog+0x1e5/0x6d0 net/core/dev.c:6226 napi_poll net/core/dev.c:6671 [inline] net_rx_action+0x3e3/0xd70 net/core/dev.c:6739 __do_softirq+0x18c/0x634 kernel/softirq.c:292 do_softirq_own_stack+0x2a/0x40 arch/x86/entry/entry_64.S:1082 do_softirq.part.0+0x26/0x30 kernel/softirq.c:337 do_softirq arch/x86/include/asm/preempt.h:26 [inline] __local_bh_enable_ip+0x46/0x50 kernel/softirq.c:189 local_bh_enable include/linux/bottom_half.h:32 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:723 [inline] ip_finish_output2+0x78a/0x19c0 net/ipv4/ip_output.c:229 __ip_finish_output+0x471/0x720 net/ipv4/ip_output.c:306 dst_output include/net/dst.h:435 [inline] ip_local_out+0x181/0x1e0 net/ipv4/ip_output.c:125 __ip_queue_xmit+0x7a1/0x14e0 net/ipv4/ip_output.c:530 __tcp_transmit_skb+0x19dc/0x35e0 net/ipv4/tcp_output.c:1238 __tcp_send_ack.part.0+0x3c2/0x5b0 net/ipv4/tcp_output.c:3785 __tcp_send_ack net/ipv4/tcp_output.c:3791 [inline] tcp_send_ack+0x7d/0xa0 net/ipv4/tcp_output.c:3791 tcp_rcv_synsent_state_process net/ipv4/tcp_input.c:6040 [inline] tcp_rcv_state_process+0x36a4/0x49c2 net/ipv4/tcp_input.c:6209 tcp_v4_do_rcv+0x343/0x8b0 net/ipv4/tcp_ipv4.c:1651 sk_backlog_rcv include/net/sock.h:996 [inline] __release_sock+0x1ad/0x310 net/core/sock.c:2548 release_sock+0x54/0x1a0 net/core/sock.c:3064 inet_wait_for_connect net/ipv4/af_inet.c:594 [inline] __inet_stream_connect+0x57e/0xd50 net/ipv4/af_inet.c:686 inet_stream_connect+0x53/0xa0 net/ipv4/af_inet.c:725 mptcp_stream_connect+0x171/0x5f0 net/mptcp/protocol.c:1920 __sys_connect_file 
net/socket.c:1854 [inline] __sys_connect+0x267/0x2f0 net/socket.c:1871 __do_sys_connect net/socket.c:1882 [inline] __se_sys_connect net/socket.c:1879 [inline] __x64_sys_connect+0x6f/0xb0 net/socket.c:1879 do_syscall_64+0xb7/0x3d0 arch/x86/entry/common.c:295 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fb577d06469 Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ff 49 2b 00 f7 d8 64 89 01 48 RSP: 002b:00007fb5783d5dd8 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 000000000068bfa0 RCX: 00007fb577d06469 RDX: 000000000000004d RSI: 0000000020000040 RDI: 0000000000000003 RBP: 00000000ffffffff R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000041427c R14: 00007fb5783d65c0 R15: 0000000000000003 Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/39 Reported-by: Christoph Paasch Fixes: e1ff9e82e2ea ("net: mptcp: improve fallback to TCP") Suggested-by: Paolo Abeni Signed-off-by: Davide Caratti Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e1e19c76e267..9f7f3772c13c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -873,7 +873,7 @@ static void subflow_data_ready(struct sock *sk) struct mptcp_sock *msk; msk = mptcp_sk(parent); - if (inet_sk_state_load(sk) == TCP_LISTEN) { + if ((1 << inet_sk_state_load(sk)) & (TCPF_LISTEN | TCPF_CLOSE)) { set_bit(MPTCP_DATA_READY, &msk->flags); parent->sk_data_ready(parent); return; -- cgit v1.2.3 From b416268b7a819c0508ed0dc81461e513b110f2ac Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 7 Jul 2020 14:40:48 +0200 Subject: mptcp: use mptcp worker for path management We can re-use the existing work queue to handle path management instead of a dedicated work queue. Just move pm_worker to protocol.c, call it from the mptcp worker and get rid of the msk lock (already held). Signed-off-by: Florian Westphal Reviewed-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/mptcp/pm.c | 44 +------------------------------------------- net/mptcp/protocol.c | 27 ++++++++++++++++++++++++++- net/mptcp/protocol.h | 3 --- 3 files changed, 27 insertions(+), 47 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 7de09fdd42a3..a8ad20559aaa 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -10,8 +10,6 @@ #include #include "protocol.h" -static struct workqueue_struct *pm_wq; - /* path manager command handlers */ int mptcp_pm_announce_addr(struct mptcp_sock *msk, @@ -78,7 +76,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, return false; msk->pm.status |= BIT(new_status); - if (queue_work(pm_wq, &msk->pm.work)) + if (schedule_work(&msk->work)) sock_hold((struct sock *)msk); return true; } @@ -181,35 +179,6 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) return mptcp_pm_nl_get_local_id(msk, skc); } -static void pm_worker(struct work_struct *work) -{ - struct mptcp_pm_data *pm = container_of(work, struct mptcp_pm_data, - work); - struct mptcp_sock *msk = container_of(pm, struct mptcp_sock, pm); - struct sock *sk = (struct sock *)msk; - - lock_sock(sk); - spin_lock_bh(&msk->pm.lock); - - pr_debug("msk=%p status=%x", msk, pm->status); - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); - mptcp_pm_nl_add_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); - mptcp_pm_nl_fully_established(msk); - } - if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); - mptcp_pm_nl_subflow_established(msk); - } - - spin_unlock_bh(&msk->pm.lock); - release_sock(sk); - sock_put(sk); -} - void mptcp_pm_data_init(struct mptcp_sock *msk) { msk->pm.add_addr_signaled = 0; @@ -223,22 +192,11 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) msk->pm.status = 0; spin_lock_init(&msk->pm.lock); - INIT_WORK(&msk->pm.work, pm_worker); mptcp_pm_nl_data_init(msk); } -void mptcp_pm_close(struct mptcp_sock *msk) -{ - if (cancel_work_sync(&msk->pm.work)) - sock_put((struct sock *)msk); -} - void __init mptcp_pm_init(void) { - pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); - if (!pm_wq) - panic("Failed to allocate workqueue"); - mptcp_pm_nl_init(); } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3ab060e30038..dbe43e0cd734 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1214,6 +1214,29 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) return 0; } +static void pm_work(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + + spin_lock_bh(&msk->pm.lock); + + pr_debug("msk=%p status=%x", msk, pm->status); + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); + mptcp_pm_nl_add_addr_received(msk); + } + if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); + mptcp_pm_nl_fully_established(msk); + } + if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); + mptcp_pm_nl_subflow_established(msk); + } + + spin_unlock_bh(&msk->pm.lock); +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -1230,6 +1253,9 @@ static void mptcp_worker(struct work_struct *work) __mptcp_flush_join_list(msk); __mptcp_move_skbs(msk); + if (msk->pm.status) + pm_work(msk); + if 
(test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); @@ -1420,7 +1446,6 @@ static void mptcp_close(struct sock *sk, long timeout) } mptcp_cancel_work(sk); - mptcp_pm_close(msk); __skb_queue_purge(&sk->sk_receive_queue); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a6412ff0fddb..39bfec3f1586 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -174,8 +174,6 @@ struct mptcp_pm_data { u8 local_addr_max; u8 subflows_max; u8 status; - - struct work_struct work; }; struct mptcp_data_frag { @@ -412,7 +410,6 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); -void mptcp_pm_close(struct mptcp_sock *msk); void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); -- cgit v1.2.3 From 96d890daad05a3e47e914451f07b79275b325c95 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Jul 2020 15:12:40 +0200 Subject: mptcp: add msk interations helper mptcp_token_iter_next() allow traversing all the MPTCP sockets inside the token container belonging to the given network namespace with a quite standard iterator semantic. That will be used by the next patch, but keep the API generic, as we plan to use this later for PM's sake. Additionally export mptcp_token_get_sock(), as it also will be used by the diag module. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/protocol.h | 2 ++ net/mptcp/token.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 39bfec3f1586..e5baaef5ec89 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -391,6 +391,8 @@ int mptcp_token_new_connect(struct sock *sk); void mptcp_token_accept(struct mptcp_subflow_request_sock *r, struct mptcp_sock *msk); struct mptcp_sock *mptcp_token_get_sock(u32 token); +struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, + long *s_num); void mptcp_token_destroy(struct mptcp_sock *msk); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 66a4990bd897..7d8106026081 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -238,6 +238,66 @@ found: rcu_read_unlock(); return msk; } +EXPORT_SYMBOL_GPL(mptcp_token_get_sock); + +/** + * mptcp_token_iter_next - iterate over the token container from given pos + * @net: namespace to be iterated + * @s_slot: start slot number + * @s_num: start number inside the given lock + * + * This function returns the first mptcp connection structure found inside the + * token container starting from the specified position, or NULL. + * + * On successful iteration, the iterator is move to the next position and the + * the acquires a reference to the returned socket. 
+ */ +struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, + long *s_num) +{ + struct mptcp_sock *ret = NULL; + struct hlist_nulls_node *pos; + int slot, num; + + for (slot = *s_slot; slot <= token_mask; *s_num = 0, slot++) { + struct token_bucket *bucket = &token_hash[slot]; + struct sock *sk; + + num = 0; + + if (hlist_nulls_empty(&bucket->msk_chain)) + continue; + + rcu_read_lock(); + sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { + ++num; + if (!net_eq(sock_net(sk), net)) + continue; + + if (num <= *s_num) + continue; + + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + continue; + + if (!net_eq(sock_net(sk), net)) { + sock_put(sk); + continue; + } + + ret = mptcp_sk(sk); + rcu_read_unlock(); + goto out; + } + rcu_read_unlock(); + } + +out: + *s_slot = slot; + *s_num = num; + return ret; +} +EXPORT_SYMBOL_GPL(mptcp_token_iter_next); /** * mptcp_token_destroy_request - remove mptcp connection/token @@ -312,7 +372,6 @@ void __init mptcp_token_init(void) EXPORT_SYMBOL_GPL(mptcp_token_new_request); EXPORT_SYMBOL_GPL(mptcp_token_new_connect); EXPORT_SYMBOL_GPL(mptcp_token_accept); -EXPORT_SYMBOL_GPL(mptcp_token_get_sock); EXPORT_SYMBOL_GPL(mptcp_token_destroy_request); EXPORT_SYMBOL_GPL(mptcp_token_destroy); #endif -- cgit v1.2.3 From ac3b45f6095452a9731f8825be1513d326dbfa15 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Jul 2020 15:12:41 +0200 Subject: mptcp: add MPTCP socket diag interface exposes basic inet socket attribute, plus some MPTCP socket fields comprising PM status and MPTCP-level sequence numbers. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 17 +++++ net/mptcp/Kconfig | 4 ++ net/mptcp/Makefile | 2 + net/mptcp/mptcp_diag.c | 169 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 192 insertions(+) create mode 100644 net/mptcp/mptcp_diag.c (limited to 'net/mptcp') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 5f2c77082d9e..9762660df741 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -86,4 +86,21 @@ enum { __MPTCP_PM_CMD_AFTER_LAST }; +#define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) +#define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) + +struct mptcp_info { + __u8 mptcpi_subflows; + __u8 mptcpi_add_addr_signal; + __u8 mptcpi_add_addr_accepted; + __u8 mptcpi_subflows_max; + __u8 mptcpi_add_addr_signal_max; + __u8 mptcpi_add_addr_accepted_max; + __u32 mptcpi_flags; + __u32 mptcpi_token; + __u64 mptcpi_write_seq; + __u64 mptcpi_snd_una; + __u64 mptcpi_rcv_nxt; +}; + #endif /* _UAPI_MPTCP_H */ diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index af84fce70bb0..698bc3525160 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -13,6 +13,10 @@ config MPTCP if MPTCP +config INET_MPTCP_DIAG + depends on INET_DIAG + def_tristate INET_DIAG + config MPTCP_IPV6 bool "MPTCP: IPv6 support for Multipath TCP" select IPV6 diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index c53f9b845523..2360cbd27d59 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -4,6 +4,8 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ mib.o pm_netlink.o +obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o + mptcp_crypto_test-objs := crypto_test.o mptcp_token_test-objs := token_test.o obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c new file mode 100644 index 000000000000..5f390a97f556 
--- /dev/null +++ b/net/mptcp/mptcp_diag.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* MPTCP socket monitoring support + * + * Copyright (c) 2020 Red Hat + * + * Author: Paolo Abeni + */ + +#include +#include +#include +#include +#include +#include "protocol.h" + +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + struct nlattr *bc, bool net_admin) +{ + if (!inet_diag_bc_sk(bc, sk)) + return 0; + + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI, + net_admin); +} + +static int mptcp_diag_dump_one(struct netlink_callback *cb, + const struct inet_diag_req_v2 *req) +{ + struct sk_buff *in_skb = cb->skb; + struct mptcp_sock *msk = NULL; + struct sk_buff *rep; + int err = -ENOENT; + struct net *net; + struct sock *sk; + + net = sock_net(in_skb->sk); + msk = mptcp_token_get_sock(req->id.idiag_cookie[0]); + if (!msk) + goto out_nosk; + + err = -ENOMEM; + sk = (struct sock *)msk; + rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct mptcp_info)) + + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, + GFP_KERNEL); + if (!rep) + goto out; + + err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(rep); + goto out; + } + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, + MSG_DONTWAIT); + if (err > 0) + err = 0; +out: + sock_put(sk); + +out_nosk: + return err; +} + +static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r) +{ + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; + struct mptcp_sock *msk; + struct nlattr *bc; + + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; + + while ((msk = mptcp_token_iter_next(net, &cb->args[0], &cb->args[1])) != + NULL) { + struct inet_sock *inet = (struct inet_sock *)msk; + struct sock *sk = (struct sock *)msk; + int ret = 0; + + if (!(r->idiag_states & (1 << sk->sk_state))) + goto next; + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next; + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next; + if (r->id.idiag_dport != inet->inet_dport && + r->id.idiag_dport) + goto next; + + ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); +next: + sock_put(sk); + if (ret < 0) { + /* will retry on the same position */ + cb->args[1]--; + break; + } + cond_resched(); + } +} + +static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_info *info = _info; + u32 flags = 0; + bool slow; + u8 val; + + r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_wqueue = sk_wmem_alloc_get(sk); + if (!info) + return; + + slow = lock_sock_fast(sk); + info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); + info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); + info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max); + val = READ_ONCE(msk->pm.add_addr_signal_max); + info->mptcpi_add_addr_signal_max = val; + val = READ_ONCE(msk->pm.add_addr_accept_max); + info->mptcpi_add_addr_accepted_max = val; + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) + flags |= MPTCP_INFO_FLAG_FALLBACK; + if 
(READ_ONCE(msk->can_ack)) + flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; + info->mptcpi_flags = flags; + info->mptcpi_token = READ_ONCE(msk->token); + info->mptcpi_write_seq = READ_ONCE(msk->write_seq); + info->mptcpi_snd_una = atomic64_read(&msk->snd_una); + info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + unlock_sock_fast(sk, slow); +} + +static const struct inet_diag_handler mptcp_diag_handler = { + .dump = mptcp_diag_dump, + .dump_one = mptcp_diag_dump_one, + .idiag_get_info = mptcp_diag_get_info, + .idiag_type = IPPROTO_MPTCP, + .idiag_info_size = sizeof(struct mptcp_info), +}; + +static int __init mptcp_diag_init(void) +{ + return inet_diag_register(&mptcp_diag_handler); +} + +static void __exit mptcp_diag_exit(void) +{ + inet_diag_unregister(&mptcp_diag_handler); +} + +module_init(mptcp_diag_init); +module_exit(mptcp_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */); -- cgit v1.2.3 From 8c728940487945e25cdfe020d58da42143aa98c1 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 15 Jul 2020 22:27:05 +0200 Subject: mptcp: silence warning in subflow_data_ready() since commit d47a72152097 ("mptcp: fix race in subflow_data_ready()"), it is possible to observe a regression in MP_JOIN kselftests. For sockets in TCP_CLOSE state, it's not sufficient to just wake up the main socket: we also need to ensure that received data are made available to the reader. Silence the WARN_ON_ONCE() in these cases: it preserves the syzkaller fix and restores kselftests when they are ran as follows: # while true; do > make KBUILD_OUTPUT=/tmp/kselftest TARGETS=net/mptcp kselftest > done Reported-by: Florian Westphal Fixes: d47a72152097 ("mptcp: fix race in subflow_data_ready()") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/47 Signed-off-by: Davide Caratti Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9f7f3772c13c..519122e66f17 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -869,18 +869,19 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + u16 state = 1 << inet_sk_state_load(sk); struct sock *parent = subflow->conn; struct mptcp_sock *msk; msk = mptcp_sk(parent); - if ((1 << inet_sk_state_load(sk)) & (TCPF_LISTEN | TCPF_CLOSE)) { + if (state & TCPF_LISTEN) { set_bit(MPTCP_DATA_READY, &msk->flags); parent->sk_data_ready(parent); return; } WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && - !subflow->mp_join); + !subflow->mp_join && !(state & TCPF_CLOSE)); if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); -- cgit v1.2.3 From 8c918ffbbad49454ed26c53eb1b90bf98bb5e394 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jul 2020 08:23:14 +0200 Subject: net: remove compat_sock_common_{get,set}sockopt Add the compat handling to sock_common_{get,set}sockopt instead, keyed of in_compat_syscall(). This allow to remove the now unused ->compat_{get,set}sockopt methods from struct proto_ops. Signed-off-by: Christoph Hellwig Acked-by: Matthieu Baerts Acked-by: Stefan Schmidt Signed-off-by: David S. 
Miller --- include/linux/net.h | 6 ------ include/net/sock.h | 4 ---- net/core/sock.c | 30 ++++++------------------------ net/dccp/ipv4.c | 4 ---- net/dccp/ipv6.c | 2 -- net/ieee802154/socket.c | 8 -------- net/ipv4/af_inet.c | 6 ------ net/ipv6/af_inet6.c | 4 ---- net/ipv6/ipv6_sockglue.c | 12 ++---------- net/ipv6/raw.c | 2 -- net/l2tp/l2tp_ip.c | 4 ---- net/l2tp/l2tp_ip6.c | 2 -- net/mptcp/protocol.c | 6 ------ net/phonet/socket.c | 8 -------- net/sctp/ipv6.c | 2 -- net/sctp/protocol.c | 4 ---- 16 files changed, 8 insertions(+), 96 deletions(-) (limited to 'net/mptcp') diff --git a/include/linux/net.h b/include/linux/net.h index 016a9c5faa34..858ff1d98154 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -165,12 +165,6 @@ struct proto_ops { int optname, char __user *optval, unsigned int optlen); int (*getsockopt)(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); -#ifdef CONFIG_COMPAT - int (*compat_setsockopt)(struct socket *sock, int level, - int optname, char __user *optval, unsigned int optlen); - int (*compat_getsockopt)(struct socket *sock, int level, - int optname, char __user *optval, int __user *optlen); -#endif void (*show_fdinfo)(struct seq_file *m, struct socket *sock); int (*sendmsg) (struct socket *sock, struct msghdr *m, size_t total_len); diff --git a/include/net/sock.h b/include/net/sock.h index 4bf884165148..1fd7cf5fc751 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1744,10 +1744,6 @@ int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int sock_common_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen); -int compat_sock_common_getsockopt(struct socket *sock, int level, - int optname, char __user *optval, int __user *optlen); -int compat_sock_common_setsockopt(struct socket *sock, int level, - int optname, char __user *optval, unsigned int optlen); void sk_common_release(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index e085df794825..018404d17626 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3199,23 +3199,14 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; - return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(sock_common_getsockopt); - #ifdef CONFIG_COMPAT -int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - struct sock *sk = sock->sk; - - if (sk->sk_prot->compat_getsockopt != NULL) + if (in_compat_syscal() && sk->sk_prot->compat_getsockopt) return sk->sk_prot->compat_getsockopt(sk, level, optname, optval, optlen); +#endif return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); } -EXPORT_SYMBOL(compat_sock_common_getsockopt); -#endif +EXPORT_SYMBOL(sock_common_getsockopt); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) @@ -3240,23 +3231,14 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; - return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(sock_common_setsockopt); - #ifdef CONFIG_COMPAT -int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - struct sock *sk = sock->sk; - - if (sk->sk_prot->compat_setsockopt != NULL) + if (in_compat_syscall() && sk->sk_prot->compat_setsockopt) return sk->sk_prot->compat_setsockopt(sk, 
level, optname, optval, optlen); +#endif return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); } -EXPORT_SYMBOL(compat_sock_common_setsockopt); -#endif +EXPORT_SYMBOL(sock_common_setsockopt); void sk_common_release(struct sock *sk) { diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index a7e989919c53..316cc5ac0da7 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -999,10 +999,6 @@ static const struct proto_ops inet_dccp_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; static struct inet_protosw dccp_v4_protosw = { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 650187d68851..b50f85a72cd5 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1083,8 +1083,6 @@ static const struct proto_ops inet6_dccp_ops = { .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index d93d4531aa9b..94ae9662133e 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -423,10 +423,6 @@ static const struct proto_ops ieee802154_raw_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; /* DGRAM Sockets (802.15.4 dataframes) */ @@ -986,10 +982,6 @@ static const struct proto_ops ieee802154_dgram_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; /* Create a socket. 
Initialise the socket, blank the addresses diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ff141d630bdf..4307503a6f0b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1043,8 +1043,6 @@ const struct proto_ops inet_stream_ops = { .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif .set_rcvlowat = tcp_set_rcvlowat, @@ -1073,8 +1071,6 @@ const struct proto_ops inet_dgram_ops = { .sendpage = inet_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif }; @@ -1105,8 +1101,6 @@ static const struct proto_ops inet_sockraw_ops = { .mmap = sock_no_mmap, .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif }; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b304b882e031..0306509ab063 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -688,8 +688,6 @@ const struct proto_ops inet6_stream_ops = { .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif .set_rcvlowat = tcp_set_rcvlowat, }; @@ -717,8 +715,6 @@ const struct proto_ops inet6_dgram_ops = { .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 20576e87a5f7..6ab44ec2c369 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -914,12 +914,8 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, { int err; - if (level == SOL_IP && sk->sk_type != SOCK_RAW) { - if (udp_prot.compat_setsockopt != NULL) - return udp_prot.compat_setsockopt(sk, level, optname, - optval, optlen); + if (level == SOL_IP && sk->sk_type != SOCK_RAW) return udp_prot.setsockopt(sk, level, optname, optval, optlen); - } if (level != SOL_IPV6) return -ENOPROTOOPT; @@ -1480,12 +1476,8 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, { int err; - if (level == SOL_IP && sk->sk_type != SOCK_RAW) { - if (udp_prot.compat_getsockopt != NULL) - return udp_prot.compat_getsockopt(sk, level, optname, - optval, optlen); + if (level == SOL_IP && sk->sk_type != SOCK_RAW) return udp_prot.getsockopt(sk, level, optname, optval, optlen); - } if (level != SOL_IPV6) return -ENOPROTOOPT; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 8ef5a7b30524..e23c6b461758 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1378,8 +1378,6 @@ const struct proto_ops inet6_sockraw_ops = { .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 955662a6dee7..f8d7412cfb3d 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -638,10 +638,6 @@ static const struct proto_ops l2tp_ip_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = 
sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; static struct inet_protosw l2tp_ip_protosw = { diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 526ed2c24dd5..2cdc0b7a7a43 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -773,8 +773,6 @@ static const struct proto_ops l2tp_ip6_ops = { .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index dbe43e0cd734..f0b0b503c262 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2166,10 +2166,6 @@ static const struct proto_ops mptcp_stream_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; static struct inet_protosw mptcp_protosw = { @@ -2222,8 +2218,6 @@ static const struct proto_ops mptcp_v6_stream_ops = { .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 76d499f6af9a..87c60f83c180 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -441,10 +441,6 @@ const struct proto_ops phonet_dgram_ops = { .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, .getsockopt = sock_no_getsockopt, -#ifdef CONFIG_COMPAT - .compat_setsockopt = sock_no_setsockopt, - .compat_getsockopt = sock_no_getsockopt, -#endif .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, @@ -466,10 +462,6 @@ const struct proto_ops phonet_stream_ops = { .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index ccfa0ab3e7f4..ebda31b7747d 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -1033,8 +1033,6 @@ static const struct proto_ops inet6_seqpacket_ops = { .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index cde29f3c7fb3..8d25cc464efd 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1036,10 +1036,6 @@ static const struct proto_ops inet_seqpacket_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; /* Registration with AF_INET family. */ -- cgit v1.2.3 From c1d069e3bfc9e4021f087016578fbff209f493fd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 Jul 2020 21:08:54 +0200 Subject: mptcp: move helper to where its used Only used in token.c. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/mptcp/protocol.h | 11 ----------- net/mptcp/token.c | 12 ++++++++++++ 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e5baaef5ec89..6e114c09e5b4 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -396,17 +396,6 @@ struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, void mptcp_token_destroy(struct mptcp_sock *msk); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); -static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) -{ - /* we might consider a faster version that computes the key as a - * hash of some information available in the MPTCP socket. Use - * random data at the moment, as it's probably the safest option - * in case multiple sockets are opened in different namespaces at - * the same time. - */ - get_random_bytes(key, sizeof(u64)); - mptcp_crypto_key_sha(*key, token, idsn); -} void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 7d8106026081..b25b390dbbff 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -83,6 +83,18 @@ static bool __token_bucket_busy(struct token_bucket *t, u32 token) __token_lookup_req(t, token) || __token_lookup_msk(t, token); } +static void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) +{ + /* we might consider a faster version that computes the key as a + * hash of some information available in the MPTCP socket. Use + * random data at the moment, as it's probably the safest option + * in case multiple sockets are opened in different namespaces at + * the same time. + */ + get_random_bytes(key, sizeof(u64)); + mptcp_crypto_key_sha(*key, token, idsn); +} + /** * mptcp_token_new_request - create new key/idsn/token for subflow_request * @req: the request socket -- cgit v1.2.3 From 6ab301c98f174a8c25d5351b977a1113e2f1fb91 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 22 Jul 2020 17:20:50 +0200 Subject: mptcp: zero token hash at creation time. Otherwise the 'chain_len' filed will carry random values, some token creation calls will fail due to excessive chain length, causing unexpected fallback to TCP. Fixes: 2c5ebd001d4f ("mptcp: refactor token container") Reviewed-by: Mat Martineau Tested-by: Christoph Paasch Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/token.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/token.c b/net/mptcp/token.c index b25b390dbbff..97cfc45bcc4f 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -368,7 +368,7 @@ void __init mptcp_token_init(void) sizeof(struct token_bucket), 0, 20,/* one slot per 1MB of memory */ - 0, + HASH_ZERO, NULL, &token_mask, 0, -- cgit v1.2.3 From b0977bb268db1df6decd3405903ca500721cdc5f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 23 Jul 2020 13:02:29 +0200 Subject: subflow: always init 'rel_write_seq' Currently we do not init the subflow write sequence for MP_JOIN subflows. This will cause bad mapping being generated as soon as we will use non backup subflow. Reviewed-by: Mat Martineau Tested-by: Christoph Paasch Signed-off-by: Paolo Abeni Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 1 - net/mptcp/subflow.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f0b0b503c262..59c0eef807b3 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1814,7 +1814,6 @@ void mptcp_finish_connect(struct sock *ssk) ack_seq++; subflow->map_seq = ack_seq; subflow->map_subflow_seq = 1; - subflow->rel_write_seq = 1; /* the socket is not connected yet, no msk/subflow ops can access/race * accessing the field below diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 519122e66f17..84e70806b250 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -200,6 +200,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (subflow->conn_finished) return; + subflow->rel_write_seq = 1; subflow->conn_finished = 1; subflow->ssn_offset = TCP_SKB_CB(skb)->seq; pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); -- cgit v1.2.3 From 53eb4c383deb97b59f1755fe3035ec7992488375 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 23 Jul 2020 13:02:30 +0200 Subject: mptcp: avoid data corruption on reinsert When updating a partially acked data fragment, we actually corrupt it. This is irrelevant till we send data on a single subflow, as retransmitted data, if any are discarded by the peer as duplicate, but it will cause data corruption as soon as we will start creating non backup subflows. Reviewed-by: Mat Martineau Tested-by: Christoph Paasch Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 59c0eef807b3..254e6ef2b4e0 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -460,15 +460,20 @@ static void mptcp_clean_una(struct sock *sk) dfrag = mptcp_rtx_head(sk); if (dfrag && after64(snd_una, dfrag->data_seq)) { - u64 delta = dfrag->data_seq + dfrag->data_len - snd_una; + u64 delta = snd_una - dfrag->data_seq; + + if (WARN_ON_ONCE(delta > dfrag->data_len)) + goto out; dfrag->data_seq += delta; + dfrag->offset += delta; dfrag->data_len -= delta; dfrag_uncharge(sk, delta); cleaned = true; } +out: if (cleaned) { sk_mem_reclaim_partial(sk); -- cgit v1.2.3 From 0235d075a592dfde575df81f150feb0d95a5ef5c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 23 Jul 2020 13:02:31 +0200 Subject: mptcp: mark as fallback even early ones In the unlikely event of a failure at connect time, we currently clear the request_mptcp flag - so that the MPC handshake is not started at all, but the msk is not explicitly marked as fallback. This would lead to later insertion of wrong DSS options in the xmitted packets, in violation of RFC specs and possibly fooling the peer. Fixes: e1ff9e82e2ea ("net: mptcp: improve fallback to TCP") Reviewed-by: Mat Martineau Tested-by: Christoph Paasch Signed-off-by: Paolo Abeni Signed-off-by: David S. 
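The arithmetic behind this fix is easier to see in isolation. The sketch below is a self-contained userspace model, not the kernel code: struct frag, its fields and clean_partially_acked() are illustrative stand-ins that only mirror the dfrag fields touched in the diff that follows. It shows the corrected bookkeeping (delta is the span already acknowledged, and the payload offset must advance together with the sequence number) and, in a comment, why the removed expression corrupted the fragment.

/*
 * Simplified model only -- not struct mptcp_data_frag.  The fields mirror
 * the ones the patch below adjusts: data_seq (MPTCP-level sequence of the
 * first byte), offset (position of that byte in the backing page) and
 * data_len (bytes not yet acknowledged).
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct frag {
	uint64_t data_seq;
	uint32_t offset;
	uint32_t data_len;
};

/* Advance the fragment past the bytes covered by the cumulative ack. */
static void clean_partially_acked(struct frag *f, uint64_t snd_una)
{
	uint64_t delta = snd_una - f->data_seq;	/* bytes newly acked */

	assert(delta <= f->data_len);		/* mirrors the WARN_ON_ONCE */
	f->data_seq += delta;
	f->offset   += delta;			/* keep pointing at live payload */
	f->data_len -= delta;
}

int main(void)
{
	struct frag f = { .data_seq = 1000, .offset = 0, .data_len = 500 };

	/*
	 * Peer acknowledged up to 1200, i.e. 200 of the 500 bytes.  The
	 * removed code computed data_seq + data_len - snd_una = 300 (the
	 * remaining length, not the acked span) and never moved offset,
	 * so the fragment kept pointing at already-acked payload.
	 */
	clean_partially_acked(&f, 1200);
	printf("seq=%llu offset=%u len=%u\n",
	       (unsigned long long)f.data_seq, f.offset, f.data_len);
	return 0;	/* prints seq=1200 offset=200 len=300 */
}

Signed-off-by: David S.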
Miller --- net/mptcp/protocol.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 254e6ef2b4e0..2936413171be 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1944,6 +1944,13 @@ unlock: return err; } +static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + subflow->request_mptcp = 0; + __mptcp_do_fallback(msk); +} + static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { @@ -1975,10 +1982,10 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, * TCP option space. */ if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) - subflow->request_mptcp = 0; + mptcp_subflow_early_fallback(msk, subflow); #endif if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) - subflow->request_mptcp = 0; + mptcp_subflow_early_fallback(msk, subflow); do_connect: err = ssock->ops->connect(ssock, uaddr, addr_len, flags); -- cgit v1.2.3 From b93df08ccda326ef89a6e80fb796588b9a30a980 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 23 Jul 2020 13:02:32 +0200 Subject: mptcp: explicitly track the fully established status Currently accepted msk sockets become established only after accept() returns the new sk to user-space. As MP_JOIN request are refused as per RFC spec on non fully established socket, the above causes mp_join self-tests instabilities. This change lets the msk entering the established status as soon as it receives the 3rd ack and propagates the first subflow fully established status on the msk socket. Finally we can change the subflow acceptance condition to take in account both the sock state and the msk fully established flag. Reviewed-by: Mat Martineau Tested-by: Christoph Paasch Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/options.c | 5 ++--- net/mptcp/protocol.c | 4 ++-- net/mptcp/protocol.h | 8 ++++++++ net/mptcp/subflow.c | 23 +++++++++++++++++++---- 4 files changed, 31 insertions(+), 9 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 19707c07efc1..3bc56eb608d8 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -709,6 +709,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, * additional ack. 
*/ subflow->fully_established = 1; + WRITE_ONCE(msk->fully_established, true); goto fully_established; } @@ -724,9 +725,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); - subflow->fully_established = 1; - subflow->remote_key = mp_opt->sndr_key; - subflow->can_ack = 1; + mptcp_subflow_fully_established(subflow, mp_opt); fully_established: if (likely(subflow->pm_notified)) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2936413171be..979dfcd2aa14 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1522,6 +1522,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->local_key = subflow_req->local_key; msk->token = subflow_req->token; msk->subflow = NULL; + WRITE_ONCE(msk->fully_established, false); msk->write_seq = subflow_req->idsn + 1; atomic64_set(&msk->snd_una, msk->write_seq); @@ -1605,7 +1606,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, newsk = new_mptcp_sock; mptcp_copy_inaddrs(newsk, ssk); list_add(&subflow->node, &msk->conn_list); - inet_sk_state_store(newsk, TCP_ESTABLISHED); mptcp_rcv_space_init(msk, ssk); bh_unlock_sock(new_mptcp_sock); @@ -1855,7 +1855,7 @@ bool mptcp_finish_join(struct sock *sk) pr_debug("msk=%p, subflow=%p", msk, subflow); /* mptcp socket already closing? */ - if (inet_sk_state_load(parent) != TCP_ESTABLISHED) + if (!mptcp_is_fully_established(parent)) return false; if (!msk->pm.server_side) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 6e114c09e5b4..67634b595466 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -198,6 +198,7 @@ struct mptcp_sock { u32 token; unsigned long flags; bool can_ack; + bool fully_established; spinlock_t join_list_lock; struct work_struct work; struct list_head conn_list; @@ -342,6 +343,8 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) } int mptcp_is_enabled(struct net *net); +void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, + struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); @@ -373,6 +376,11 @@ void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); +static inline bool mptcp_is_fully_established(struct sock *sk) +{ + return inet_sk_state_load(sk) == TCP_ESTABLISHED && + READ_ONCE(mptcp_sk(sk)->fully_established); +} void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 84e70806b250..ea81842fc3b2 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -387,6 +387,17 @@ static void subflow_drop_ctx(struct sock *ssk) kfree_rcu(ctx, rcu); } +void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, + struct mptcp_options_received *mp_opt) +{ + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + subflow->remote_key = mp_opt->sndr_key; + subflow->fully_established = 1; + subflow->can_ack = 1; + WRITE_ONCE(msk->fully_established, true); +} + static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -466,6 +477,11 @@ create_child: } if (ctx->mp_capable) { + /* this can't race with mptcp_close(), as the msk is + * not yet exposted to user-space 
+ */ + inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED); + /* new mpc subflow takes ownership of the newly * created mptcp socket */ @@ -478,9 +494,8 @@ create_child: /* with OoO packets we can reach here without ingress * mpc option */ - ctx->remote_key = mp_opt.sndr_key; - ctx->fully_established = mp_opt.mp_capable; - ctx->can_ack = mp_opt.mp_capable; + if (mp_opt.mp_capable) + mptcp_subflow_fully_established(ctx, &mp_opt); } else if (ctx->mp_join) { struct mptcp_sock *owner; @@ -967,7 +982,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, int addrlen; int err; - if (sk->sk_state != TCP_ESTABLISHED) + if (!mptcp_is_fully_established(sk)) return -ENOTCONN; err = mptcp_subflow_create_socket(sk, &sf); -- cgit v1.2.3 From fa25e815d963115eb06036a8f6a50e724bc259e2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 23 Jul 2020 13:02:33 +0200 Subject: mptcp: cleanup subflow_finish_connect() The mentioned function has several unneeded branches, handle each case - MP_CAPABLE, MP_JOIN, fallback - under a single conditional and drop quite a bit of duplicate code. Reviewed-by: Mat Martineau Tested-by: Christoph Paasch Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 56 ++++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ea81842fc3b2..7f3ef1840df5 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -206,44 +206,34 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); mptcp_get_options(skb, &mp_opt); - if (subflow->request_mptcp && mp_opt.mp_capable) { + if (subflow->request_mptcp) { + if (!mp_opt.mp_capable) { + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); + mptcp_do_fallback(sk); + pr_fallback(mptcp_sk(subflow->conn)); + goto fallback; + } + subflow->mp_capable = 1; subflow->can_ack = 1; subflow->remote_key = mp_opt.sndr_key; pr_debug("subflow=%p, remote_key=%llu", subflow, subflow->remote_key); - } else if (subflow->request_join && mp_opt.mp_join) { - subflow->mp_join = 1; + mptcp_finish_connect(sk); + } else if (subflow->request_join) { + u8 hmac[SHA256_DIGEST_SIZE]; + + if (!mp_opt.mp_join) + goto do_reset; + subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, subflow->thmac, subflow->remote_nonce); - } else { - if (subflow->request_mptcp) - MPTCP_INC_STATS(sock_net(sk), - MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); - mptcp_do_fallback(sk); - pr_fallback(mptcp_sk(subflow->conn)); - } - if (mptcp_check_fallback(sk)) { - mptcp_rcv_space_init(mptcp_sk(parent), sk); - return; - } - - if (subflow->mp_capable) { - pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), - subflow->remote_key); - mptcp_finish_connect(sk); - } else if (subflow->mp_join) { - u8 hmac[SHA256_DIGEST_SIZE]; - - pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", - subflow, subflow->thmac, - subflow->remote_nonce); if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); - subflow->mp_join = 0; goto do_reset; } @@ -251,18 +241,22 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->local_nonce, subflow->remote_nonce, hmac); - memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); if (!mptcp_finish_join(sk)) goto do_reset; + subflow->mp_join = 1; 
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); - } else { -do_reset: - tcp_send_active_reset(sk, GFP_ATOMIC); - tcp_done(sk); + } else if (mptcp_check_fallback(sk)) { +fallback: + mptcp_rcv_space_init(mptcp_sk(parent), sk); } + return; + +do_reset: + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_done(sk); } static struct request_sock_ops subflow_request_sock_ops; -- cgit v1.2.3 From b7514694ed2952684a1e4fc44d83682140fd8cef Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 23 Jul 2020 13:02:34 +0200 Subject: subflow: explicitly check for plain tcp rsk When syncookie are in use, the TCP stack may feed into subflow_syn_recv_sock() plain TCP request sockets. We can't access mptcp_subflow_request_sock-specific fields on such sockets. Explicitly check the rsk ops to do safe accesses. Reviewed-by: Mat Martineau Tested-by: Christoph Paasch Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 7f3ef1840df5..3ef445f59556 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -415,7 +415,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, /* hopefully temporary handling for MP_JOIN+syncookie */ subflow_req = mptcp_subflow_rsk(req); - fallback_is_fatal = subflow_req->mp_join; + fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join; fallback = !tcp_rsk(req)->is_mptcp; if (fallback) goto create_child; -- cgit v1.2.3 From 97e617518cbc318113b034a5fb33f49c81701278 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 23 Jul 2020 13:02:35 +0200 Subject: subflow: use rsk_ops->send_reset() tcp_send_active_reset() is more prone to transient errors (memory allocation or xmit queue full): in stress conditions the kernel may drop the egress packet, and the client will be stuck. Reviewed-by: Mat Martineau Tested-by: Christoph Paasch Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3ef445f59556..ada04df6f99f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -524,9 +524,9 @@ out: dispose_child: subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; - tcp_send_active_reset(child, GFP_ATOMIC); inet_csk_prepare_for_destroy_sock(child); tcp_done(child); + req->rsk_ops->send_reset(sk, skb); /* The last child reference will be released by the caller */ return child; -- cgit v1.2.3 From 4cf8b7e48a09745145881b311fe6a9154ba69ebc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 23 Jul 2020 13:02:36 +0200 Subject: subflow: introduce and use mptcp_can_accept_new_subflow() So that we can easily perform some basic PM-related adimission checks before creating the child socket. Reviewed-by: Mat Martineau Tested-by: Christoph Paasch Signed-off-by: Paolo Abeni Signed-off-by: David S. 
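Taken together with the earlier "explicitly track the fully established status" patch, the admission logic reduces to a small predicate. The following is a simplified userspace model: msk_model, is_fully_established() and can_accept_new_subflow() are illustrative names, not the kernel symbols (the in-tree helpers are mptcp_is_fully_established() and mptcp_can_accept_new_subflow(), shown in the diffs). It only spells out the two conditions a new MP_JOIN subflow must pass before a child socket is created.

/*
 * Userspace model with illustrative names -- not the kernel structures.
 */
#include <stdbool.h>
#include <stdio.h>

enum conn_state { STATE_SYN_SENT, STATE_ESTABLISHED, STATE_CLOSED };

struct msk_model {
	enum conn_state state;		/* inet_sk_state_load() in the kernel */
	bool fully_established;		/* set once the third ACK is seen */
	bool pm_accept_subflow;		/* path manager still has quota */
};

static bool is_fully_established(const struct msk_model *msk)
{
	return msk->state == STATE_ESTABLISHED && msk->fully_established;
}

static bool can_accept_new_subflow(const struct msk_model *msk)
{
	return is_fully_established(msk) && msk->pm_accept_subflow;
}

int main(void)
{
	struct msk_model msk = {
		.state = STATE_ESTABLISHED,
		.fully_established = false,	/* third ACK not seen yet */
		.pm_accept_subflow = true,
	};

	printf("accept MP_JOIN? %d\n", can_accept_new_subflow(&msk));	/* 0 */
	msk.fully_established = true;
	printf("accept MP_JOIN? %d\n", can_accept_new_subflow(&msk));	/* 1 */
	return 0;
}

Signed-off-by: David S.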
Miller --- net/mptcp/subflow.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ada04df6f99f..e645483d1200 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -53,6 +53,12 @@ static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); } +static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk) +{ + return mptcp_is_fully_established((void *)msk) && + READ_ONCE(msk->pm.accept_subflow); +} + /* validate received token and create truncated hmac and nonce for SYN-ACK */ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req, const struct sk_buff *skb) @@ -443,6 +449,7 @@ create_msk: } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); if (!mp_opt.mp_join || + !mptcp_can_accept_new_subflow(subflow_req->msk) || !subflow_hmac_valid(req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); fallback = true; -- cgit v1.2.3 From c8c1bbb6eb498109286739f8b6090e99313dd104 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jul 2020 08:08:50 +0200 Subject: net: switch sock_set_timeout to sockptr_t Pass a sockptr_t to prepare for set_fs-less handling of the kernel pointer from bpf-cgroup. Signed-off-by: Christoph Hellwig Acked-by: Matthieu Baerts Signed-off-by: David S. Miller --- include/net/sock.h | 3 ++- net/core/sock.c | 26 ++++++++++++-------------- net/mptcp/protocol.c | 6 ++++-- net/socket.c | 3 ++- 4 files changed, 20 insertions(+), 18 deletions(-) (limited to 'net/mptcp') diff --git a/include/net/sock.h b/include/net/sock.h index 62e18fc8ac9f..bfb2fe2fc368 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -1669,7 +1670,7 @@ void sock_pfree(struct sk_buff *skb); #endif int sock_setsockopt(struct socket *sock, int level, int op, - char __user *optval, unsigned int optlen); + sockptr_t optval, unsigned int optlen); int sock_getsockopt(struct socket *sock, int level, int op, char __user *optval, int __user *optlen); diff --git a/net/core/sock.c b/net/core/sock.c index 8b9eddaff868..1444d7d53ba2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -826,7 +826,7 @@ EXPORT_SYMBOL(sock_set_rcvbuf); */ int sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock_txtime sk_txtime; struct sock *sk = sock->sk; @@ -840,12 +840,12 @@ int sock_setsockopt(struct socket *sock, int level, int optname, */ if (optname == SO_BINDTODEVICE) - return sock_setbindtodevice(sk, USER_SOCKPTR(optval), optlen); + return sock_setbindtodevice(sk, optval, optlen); if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 
1 : 0; @@ -958,7 +958,7 @@ set_sndbuf: ret = -EINVAL; /* 1003.1g */ break; } - if (copy_from_user(&ling, optval, sizeof(ling))) { + if (copy_from_sockptr(&ling, optval, sizeof(ling))) { ret = -EFAULT; break; } @@ -1052,21 +1052,20 @@ set_sndbuf: case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_rcvtimeo, USER_SOCKPTR(optval), + ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_sndtimeo, USER_SOCKPTR(optval), + ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); break; case SO_ATTACH_FILTER: { struct sock_fprog fprog; - ret = copy_bpf_fprog_from_user(&fprog, USER_SOCKPTR(optval), - optlen); + ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_attach_filter(&fprog, sk); break; @@ -1077,7 +1076,7 @@ set_sndbuf: u32 ufd; ret = -EFAULT; - if (copy_from_user(&ufd, optval, sizeof(ufd))) + if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_attach_bpf(ufd, sk); @@ -1087,8 +1086,7 @@ set_sndbuf: case SO_ATTACH_REUSEPORT_CBPF: { struct sock_fprog fprog; - ret = copy_bpf_fprog_from_user(&fprog, USER_SOCKPTR(optval), - optlen); + ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_reuseport_attach_filter(&fprog, sk); break; @@ -1099,7 +1097,7 @@ set_sndbuf: u32 ufd; ret = -EFAULT; - if (copy_from_user(&ufd, optval, sizeof(ufd))) + if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_reuseport_attach_bpf(ufd, sk); @@ -1179,7 +1177,7 @@ set_sndbuf: if (sizeof(ulval) != sizeof(val) && optlen >= sizeof(ulval) && - get_user(ulval, (unsigned long __user *)optval)) { + copy_from_sockptr(&ulval, optval, sizeof(ulval))) { ret = -EFAULT; break; } @@ -1222,7 +1220,7 @@ set_sndbuf: if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; break; - } else if (copy_from_user(&sk_txtime, optval, + } else if (copy_from_sockptr(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; break; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 979dfcd2aa14..7246847efa90 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1648,7 +1648,8 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return -EINVAL; } - ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); + ret = sock_setsockopt(ssock, SOL_SOCKET, optname, + USER_SOCKPTR(optval), optlen); if (ret == 0) { if (optname == SO_REUSEPORT) sk->sk_reuseport = ssock->sk->sk_reuseport; @@ -1659,7 +1660,8 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return ret; } - return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); + return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, + USER_SOCKPTR(optval), optlen); } static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, diff --git a/net/socket.c b/net/socket.c index 93846568c2fb..c97f83d879ae 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2130,7 +2130,8 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *optval, } if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock)) - err = sock_setsockopt(sock, level, optname, optval, optlen); + err = sock_setsockopt(sock, level, optname, + USER_SOCKPTR(optval), optlen); else if (unlikely(!sock->ops->setsockopt)) err = -EOPNOTSUPP; else -- cgit v1.2.3 From a7b75c5a8c41445f33efb663887ff5f5c3b4454b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jul 2020 08:09:07 
+0200 Subject: net: pass a sockptr_t into ->setsockopt Rework the remaining setsockopt code to pass a sockptr_t instead of a plain user pointer. This removes the last remaining set_fs(KERNEL_DS) outside of architecture specific code. Signed-off-by: Christoph Hellwig Acked-by: Stefan Schmidt [ieee802154] Acked-by: Matthieu Baerts Signed-off-by: David S. Miller --- crypto/af_alg.c | 7 +++--- drivers/crypto/chelsio/chtls/chtls_main.c | 18 +++++++------- drivers/isdn/mISDN/socket.c | 4 ++-- include/linux/net.h | 4 +++- include/net/inet_connection_sock.h | 3 ++- include/net/ip.h | 2 +- include/net/ipv6.h | 4 ++-- include/net/sctp/structs.h | 2 +- include/net/sock.h | 4 ++-- include/net/tcp.h | 4 ++-- net/atm/common.c | 6 ++--- net/atm/common.h | 2 +- net/atm/pvc.c | 2 +- net/atm/svc.c | 6 ++--- net/ax25/af_ax25.c | 6 ++--- net/bluetooth/hci_sock.c | 8 +++---- net/bluetooth/l2cap_sock.c | 22 ++++++++--------- net/bluetooth/rfcomm/sock.c | 12 ++++++---- net/bluetooth/sco.c | 6 ++--- net/caif/caif_socket.c | 8 +++---- net/can/j1939/socket.c | 12 +++++----- net/can/raw.c | 16 ++++++------- net/core/sock.c | 2 +- net/dccp/dccp.h | 2 +- net/dccp/proto.c | 20 ++++++++-------- net/decnet/af_decnet.c | 16 +++++++------ net/ieee802154/socket.c | 6 ++--- net/ipv4/ip_sockglue.c | 13 ++++------- net/ipv4/raw.c | 8 +++---- net/ipv4/tcp.c | 5 ++-- net/ipv4/udp.c | 6 ++--- net/ipv4/udp_impl.h | 4 ++-- net/ipv6/ipv6_sockglue.c | 10 ++++---- net/ipv6/raw.c | 10 ++++---- net/ipv6/udp.c | 6 ++--- net/ipv6/udp_impl.h | 4 ++-- net/iucv/af_iucv.c | 4 ++-- net/kcm/kcmsock.c | 6 ++--- net/l2tp/l2tp_ppp.c | 4 ++-- net/llc/af_llc.c | 4 ++-- net/mptcp/protocol.c | 12 ++++------ net/netlink/af_netlink.c | 4 ++-- net/netrom/af_netrom.c | 4 ++-- net/nfc/llcp_sock.c | 6 ++--- net/packet/af_packet.c | 39 ++++++++++++++++--------------- net/phonet/pep.c | 4 ++-- net/rds/af_rds.c | 30 +++++++++++------------- net/rds/rdma.c | 14 +++++------ net/rds/rds.h | 6 ++--- net/rose/af_rose.c | 4 ++-- net/rxrpc/af_rxrpc.c | 8 +++---- net/rxrpc/ar-internal.h | 4 ++-- net/rxrpc/key.c | 9 ++++--- net/sctp/socket.c | 4 ++-- net/smc/af_smc.c | 4 ++-- net/socket.c | 23 ++++++------------ net/tipc/socket.c | 8 +++---- net/tls/tls_main.c | 17 +++++++------- net/vmw_vsock/af_vsock.c | 4 ++-- net/x25/af_x25.c | 4 ++-- net/xdp/xsk.c | 8 +++---- 61 files changed, 246 insertions(+), 258 deletions(-) (limited to 'net/mptcp') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 29f71428520b..892242a42c3e 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -197,8 +197,7 @@ unlock: return err; } -static int alg_setkey(struct sock *sk, char __user *ukey, - unsigned int keylen) +static int alg_setkey(struct sock *sk, sockptr_t ukey, unsigned int keylen) { struct alg_sock *ask = alg_sk(sk); const struct af_alg_type *type = ask->type; @@ -210,7 +209,7 @@ static int alg_setkey(struct sock *sk, char __user *ukey, return -ENOMEM; err = -EFAULT; - if (copy_from_user(key, ukey, keylen)) + if (copy_from_sockptr(key, ukey, keylen)) goto out; err = type->setkey(ask->private, key, keylen); @@ -222,7 +221,7 @@ out: } static int alg_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); diff --git a/drivers/crypto/chelsio/chtls/chtls_main.c b/drivers/crypto/chelsio/chtls/chtls_main.c index d98b89d0fa6e..c3058dcdb33c 100644 --- a/drivers/crypto/chelsio/chtls/chtls_main.c +++ b/drivers/crypto/chelsio/chtls/chtls_main.c @@ 
-488,7 +488,7 @@ static int chtls_getsockopt(struct sock *sk, int level, int optname, } static int do_chtls_setsockopt(struct sock *sk, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct tls_crypto_info *crypto_info, tmp_crypto_info; struct chtls_sock *csk; @@ -498,12 +498,12 @@ static int do_chtls_setsockopt(struct sock *sk, int optname, csk = rcu_dereference_sk_user_data(sk); - if (!optval || optlen < sizeof(*crypto_info)) { + if (sockptr_is_null(optval) || optlen < sizeof(*crypto_info)) { rc = -EINVAL; goto out; } - rc = copy_from_user(&tmp_crypto_info, optval, sizeof(*crypto_info)); + rc = copy_from_sockptr(&tmp_crypto_info, optval, sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto out; @@ -525,8 +525,9 @@ static int do_chtls_setsockopt(struct sock *sk, int optname, /* Obtain version and type from previous copy */ crypto_info[0] = tmp_crypto_info; /* Now copy the following data */ - rc = copy_from_user((char *)crypto_info + sizeof(*crypto_info), - optval + sizeof(*crypto_info), + sockptr_advance(optval, sizeof(*crypto_info)); + rc = copy_from_sockptr((char *)crypto_info + sizeof(*crypto_info), + optval, sizeof(struct tls12_crypto_info_aes_gcm_128) - sizeof(*crypto_info)); @@ -541,8 +542,9 @@ static int do_chtls_setsockopt(struct sock *sk, int optname, } case TLS_CIPHER_AES_GCM_256: { crypto_info[0] = tmp_crypto_info; - rc = copy_from_user((char *)crypto_info + sizeof(*crypto_info), - optval + sizeof(*crypto_info), + sockptr_advance(optval, sizeof(*crypto_info)); + rc = copy_from_sockptr((char *)crypto_info + sizeof(*crypto_info), + optval, sizeof(struct tls12_crypto_info_aes_gcm_256) - sizeof(*crypto_info)); @@ -565,7 +567,7 @@ out: } static int chtls_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct tls_context *ctx = tls_get_ctx(sk); diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 1b2b91479107..2835daae9e9f 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -401,7 +401,7 @@ data_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } static int data_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int len) + sockptr_t optval, unsigned int len) { struct sock *sk = sock->sk; int err = 0, opt = 0; @@ -414,7 +414,7 @@ static int data_sock_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case MISDN_TIME_STAMP: - if (get_user(opt, (int __user *)optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(int))) { err = -EFAULT; break; } diff --git a/include/linux/net.h b/include/linux/net.h index 858ff1d98154..d48ff1180879 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -162,7 +163,8 @@ struct proto_ops { int (*listen) (struct socket *sock, int len); int (*shutdown) (struct socket *sock, int flags); int (*setsockopt)(struct socket *sock, int level, - int optname, char __user *optval, unsigned int optlen); + int optname, sockptr_t optval, + unsigned int optlen); int (*getsockopt)(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); void (*show_fdinfo)(struct seq_file *m, struct socket *sock); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 157c60cca0ca..1e209ce7d1bd 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h 
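Since the rest of this patch is a mechanical conversion, it may help to restate the pattern being applied before the per-file hunks continue. The sketch below is a userspace model with illustrative names (sockptr_model_t, copy_from_sockptr_model(), toy_setsockopt()), not the kernel API; the real type lives in include/linux/sockptr.h, where copy_from_sockptr() uses copy_from_user() for user pointers and memcpy() for kernel ones, which is what lets callers such as the bpf-cgroup path hand in kernel buffers without set_fs(KERNEL_DS).

/*
 * Userspace model of the sockptr_t idea -- illustrative only.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

typedef struct {
	void *ptr;		/* user or kernel address in the real thing */
	bool  is_kernel;	/* selects the copy routine in the kernel */
} sockptr_model_t;

static sockptr_model_t USER_SOCKPTR_MODEL(void *p)
{
	return (sockptr_model_t){ .ptr = p, .is_kernel = false };
}

/*
 * Model of copy_from_sockptr(): in the kernel this is copy_from_user()
 * for user pointers and memcpy() for kernel ones; here both are memcpy().
 */
static int copy_from_sockptr_model(void *dst, sockptr_model_t src, size_t n)
{
	if (!src.ptr)
		return -1;
	memcpy(dst, src.ptr, n);
	return 0;
}

/*
 * A setsockopt-style handler written against the abstraction: it no
 * longer cares where the option buffer came from.
 */
static int toy_setsockopt(sockptr_model_t optval, unsigned int optlen)
{
	int val;

	if (optlen < sizeof(int))
		return -1;
	if (copy_from_sockptr_model(&val, optval, sizeof(val)))
		return -1;
	printf("option value: %d\n", val);
	return 0;
}

int main(void)
{
	int opt = 7;

	return toy_setsockopt(USER_SOCKPTR_MODEL(&opt), sizeof(opt));
}
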
@@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -45,7 +46,7 @@ struct inet_connection_sock_af_ops { u16 net_frag_header_len; u16 sockaddr_len; int (*setsockopt)(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); + sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); diff --git a/include/net/ip.h b/include/net/ip.h index d66ad3a95220..b09c48d862cc 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -722,7 +722,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, int tlen, int offset); int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6); -int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, +int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 4c9d89b5d732..bd1f396cc9c7 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1084,8 +1084,8 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, * socket options (ipv6_sockglue.c) */ -int ipv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); +int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 233bbf7df5d6..b33f1aefad09 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -431,7 +431,7 @@ struct sctp_af { int (*setsockopt) (struct sock *sk, int level, int optname, - char __user *optval, + sockptr_t optval, unsigned int optlen); int (*getsockopt) (struct sock *sk, int level, diff --git a/include/net/sock.h b/include/net/sock.h index bfb2fe2fc368..2cc3ba667908 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1141,7 +1141,7 @@ struct proto { void (*destroy)(struct sock *sk); void (*shutdown)(struct sock *sk, int how); int (*setsockopt)(struct sock *sk, int level, - int optname, char __user *optval, + int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, @@ -1734,7 +1734,7 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname, int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen); + sockptr_t optval, unsigned int optlen); void sk_common_release(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index e3c8e1d82021..e0c35d56091f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -399,8 +399,8 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); -int tcp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); +int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); void 
tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, diff --git a/net/atm/common.c b/net/atm/common.c index 9b28f1fb3c69..84367b844b14 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -745,7 +745,7 @@ static int check_qos(const struct atm_qos *qos) } int vcc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct atm_vcc *vcc; unsigned long value; @@ -760,7 +760,7 @@ int vcc_setsockopt(struct socket *sock, int level, int optname, { struct atm_qos qos; - if (copy_from_user(&qos, optval, sizeof(qos))) + if (copy_from_sockptr(&qos, optval, sizeof(qos))) return -EFAULT; error = check_qos(&qos); if (error) @@ -774,7 +774,7 @@ int vcc_setsockopt(struct socket *sock, int level, int optname, return 0; } case SO_SETCLP: - if (get_user(value, (unsigned long __user *)optval)) + if (copy_from_sockptr(&value, optval, sizeof(value))) return -EFAULT; if (value) vcc->atm_options |= ATM_ATMOPT_CLP; diff --git a/net/atm/common.h b/net/atm/common.h index 5850649068bb..a1e56e8de698 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -21,7 +21,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait); int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen); + sockptr_t optval, unsigned int optlen); int vcc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); void vcc_process_recv_queue(struct atm_vcc *vcc); diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 02bd2a436bdf..53e7d3f39e26 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -63,7 +63,7 @@ static int pvc_connect(struct socket *sock, struct sockaddr *sockaddr, } static int pvc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int error; diff --git a/net/atm/svc.c b/net/atm/svc.c index ba144d035e3d..4a02bcaad279 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -451,7 +451,7 @@ int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos) } static int svc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); @@ -464,7 +464,7 @@ static int svc_setsockopt(struct socket *sock, int level, int optname, error = -EINVAL; goto out; } - if (copy_from_user(&vcc->sap, optval, optlen)) { + if (copy_from_sockptr(&vcc->sap, optval, optlen)) { error = -EFAULT; goto out; } @@ -475,7 +475,7 @@ static int svc_setsockopt(struct socket *sock, int level, int optname, error = -EINVAL; goto out; } - if (get_user(value, (int __user *)optval)) { + if (copy_from_sockptr(&value, optval, sizeof(int))) { error = -EFAULT; goto out; } diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index fd91cd34f25e..17bf31a89692 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -528,7 +528,7 @@ ax25_cb *ax25_create_cb(void) */ static int ax25_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; ax25_cb *ax25; @@ -543,7 +543,7 @@ static int 
ax25_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(unsigned int)) return -EINVAL; - if (get_user(opt, (unsigned int __user *)optval)) + if (copy_from_sockptr(&opt, optval, sizeof(unsigned int))) return -EFAULT; lock_sock(sk); @@ -640,7 +640,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, memset(devname, 0, sizeof(devname)); - if (copy_from_user(devname, optval, optlen)) { + if (copy_from_sockptr(devname, optval, optlen)) { res = -EFAULT; break; } diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index caf38a8ea6a8..d5eff27d5b1e 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1842,7 +1842,7 @@ drop: } static int hci_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int len) + sockptr_t optval, unsigned int len) { struct hci_ufilter uf = { .opcode = 0 }; struct sock *sk = sock->sk; @@ -1862,7 +1862,7 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case HCI_DATA_DIR: - if (get_user(opt, (int __user *)optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(opt))) { err = -EFAULT; break; } @@ -1874,7 +1874,7 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, break; case HCI_TIME_STAMP: - if (get_user(opt, (int __user *)optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(opt))) { err = -EFAULT; break; } @@ -1896,7 +1896,7 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, } len = min_t(unsigned int, len, sizeof(uf)); - if (copy_from_user(&uf, optval, len)) { + if (copy_from_sockptr(&uf, optval, len)) { err = -EFAULT; break; } diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index a995d2c51fa7..a3d104123f38 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -703,7 +703,7 @@ static bool l2cap_valid_mtu(struct l2cap_chan *chan, u16 mtu) } static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; @@ -736,7 +736,7 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, opts.txwin_size = chan->tx_win; len = min_t(unsigned int, sizeof(opts), optlen); - if (copy_from_user((char *) &opts, optval, len)) { + if (copy_from_sockptr(&opts, optval, len)) { err = -EFAULT; break; } @@ -782,7 +782,7 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; case L2CAP_LM: - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -859,7 +859,7 @@ static int l2cap_set_mode(struct l2cap_chan *chan, u8 mode) } static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; @@ -891,7 +891,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, sec.level = BT_SECURITY_LOW; len = min_t(unsigned int, sizeof(sec), optlen); - if (copy_from_user((char *) &sec, optval, len)) { + if (copy_from_sockptr(&sec, optval, len)) { err = -EFAULT; break; } @@ -939,7 +939,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = 
-EFAULT; break; } @@ -954,7 +954,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_FLUSHABLE: - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -990,7 +990,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, pwr.force_active = BT_POWER_FORCE_ACTIVE_ON; len = min_t(unsigned int, sizeof(pwr), optlen); - if (copy_from_user((char *) &pwr, optval, len)) { + if (copy_from_sockptr(&pwr, optval, len)) { err = -EFAULT; break; } @@ -1002,7 +1002,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_CHANNEL_POLICY: - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -1050,7 +1050,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u16 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u16))) { err = -EFAULT; break; } @@ -1081,7 +1081,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u8 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u8))) { err = -EFAULT; break; } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index df14eebe80da..dba4ea0e1b0d 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -644,7 +644,8 @@ static int rfcomm_sock_recvmsg(struct socket *sock, struct msghdr *msg, return len; } -static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen) +static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0; @@ -656,7 +657,7 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __u switch (optname) { case RFCOMM_LM: - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -685,7 +686,8 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __u return err; } -static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct bt_security sec; @@ -713,7 +715,7 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c sec.level = BT_SECURITY_LOW; len = min_t(unsigned int, sizeof(sec), optlen); - if (copy_from_user((char *) &sec, optval, len)) { + if (copy_from_sockptr(&sec, optval, len)) { err = -EFAULT; break; } @@ -732,7 +734,7 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index c8c3d38cdc7b..37260baf7150 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -791,7 +791,7 @@ static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg, } static int sco_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int len, err = 0; @@ -810,7 +810,7 @@ static int 
sco_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -831,7 +831,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, voice.setting = sco_pi(sk)->setting; len = min_t(unsigned int, sizeof(voice), optlen); - if (copy_from_user((char *)&voice, optval, len)) { + if (copy_from_sockptr(&voice, optval, len)) { err = -EFAULT; break; } diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index b94ecd931002..3ad0a1df6712 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -669,8 +669,8 @@ out_err: return sent ? : err; } -static int setsockopt(struct socket *sock, - int lvl, int opt, char __user *ov, unsigned int ol) +static int setsockopt(struct socket *sock, int lvl, int opt, sockptr_t ov, + unsigned int ol) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); @@ -685,7 +685,7 @@ static int setsockopt(struct socket *sock, return -EINVAL; if (lvl != SOL_CAIF) goto bad_sol; - if (copy_from_user(&linksel, ov, sizeof(int))) + if (copy_from_sockptr(&linksel, ov, sizeof(int))) return -EINVAL; lock_sock(&(cf_sk->sk)); cf_sk->conn_req.link_selector = linksel; @@ -699,7 +699,7 @@ static int setsockopt(struct socket *sock, return -ENOPROTOOPT; lock_sock(&(cf_sk->sk)); if (ol > sizeof(cf_sk->conn_req.param.data) || - copy_from_user(&cf_sk->conn_req.param.data, ov, ol)) { + copy_from_sockptr(&cf_sk->conn_req.param.data, ov, ol)) { release_sock(&cf_sk->sk); return -EINVAL; } diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index f7587428febd..78ff9b3f1d40 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -627,14 +627,14 @@ static int j1939_sk_release(struct socket *sock) return 0; } -static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, char __user *optval, +static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, sockptr_t optval, unsigned int optlen, int flag) { int tmp; if (optlen != sizeof(tmp)) return -EINVAL; - if (copy_from_user(&tmp, optval, optlen)) + if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; lock_sock(&jsk->sk); if (tmp) @@ -646,7 +646,7 @@ static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, char __user *optval, } static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); @@ -658,7 +658,7 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case SO_J1939_FILTER: - if (optval) { + if (!sockptr_is_null(optval)) { struct j1939_filter *f; int c; @@ -670,7 +670,7 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, return -EINVAL; count = optlen / sizeof(*filters); - filters = memdup_user(optval, optlen); + filters = memdup_sockptr(optval, optlen); if (IS_ERR(filters)) return PTR_ERR(filters); @@ -703,7 +703,7 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, case SO_J1939_SEND_PRIO: if (optlen != sizeof(tmp)) return -EINVAL; - if (copy_from_user(&tmp, optval, optlen)) + if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; if (tmp < 0 || tmp > 7) return -EDOM; diff --git a/net/can/raw.c b/net/can/raw.c index 59c039d73c6d..94a9405658dc 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -485,7 +485,7 @@ static int raw_getname(struct 
socket *sock, struct sockaddr *uaddr, } static int raw_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); @@ -511,11 +511,11 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (count > 1) { /* filter does not fit into dfilter => alloc space */ - filter = memdup_user(optval, optlen); + filter = memdup_sockptr(optval, optlen); if (IS_ERR(filter)) return PTR_ERR(filter); } else if (count == 1) { - if (copy_from_user(&sfilter, optval, sizeof(sfilter))) + if (copy_from_sockptr(&sfilter, optval, sizeof(sfilter))) return -EFAULT; } @@ -568,7 +568,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(err_mask)) return -EINVAL; - if (copy_from_user(&err_mask, optval, optlen)) + if (copy_from_sockptr(&err_mask, optval, optlen)) return -EFAULT; err_mask &= CAN_ERR_MASK; @@ -607,7 +607,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(ro->loopback)) return -EINVAL; - if (copy_from_user(&ro->loopback, optval, optlen)) + if (copy_from_sockptr(&ro->loopback, optval, optlen)) return -EFAULT; break; @@ -616,7 +616,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(ro->recv_own_msgs)) return -EINVAL; - if (copy_from_user(&ro->recv_own_msgs, optval, optlen)) + if (copy_from_sockptr(&ro->recv_own_msgs, optval, optlen)) return -EFAULT; break; @@ -625,7 +625,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(ro->fd_frames)) return -EINVAL; - if (copy_from_user(&ro->fd_frames, optval, optlen)) + if (copy_from_sockptr(&ro->fd_frames, optval, optlen)) return -EFAULT; break; @@ -634,7 +634,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(ro->join_filters)) return -EINVAL; - if (copy_from_user(&ro->join_filters, optval, optlen)) + if (copy_from_sockptr(&ro->join_filters, optval, optlen)) return -EFAULT; break; diff --git a/net/core/sock.c b/net/core/sock.c index 1444d7d53ba2..2c5dd1397775 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3211,7 +3211,7 @@ EXPORT_SYMBOL(sock_common_recvmsg); * Set socket options on an inet socket. 
*/ int sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 434eea91b767..9cc9d1ee6cdb 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -295,7 +295,7 @@ int dccp_disconnect(struct sock *sk, int flags); int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int dccp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); + sockptr_t optval, unsigned int optlen); int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 9e453611107f..2e9e8449698f 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -411,7 +411,7 @@ out: EXPORT_SYMBOL_GPL(dccp_ioctl); static int dccp_setsockopt_service(struct sock *sk, const __be32 service, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_service_list *sl = NULL; @@ -426,9 +426,9 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service, return -ENOMEM; sl->dccpsl_nr = optlen / sizeof(u32) - 1; - if (copy_from_user(sl->dccpsl_list, - optval + sizeof(service), - optlen - sizeof(service)) || + sockptr_advance(optval, sizeof(service)); + if (copy_from_sockptr(sl->dccpsl_list, optval, + optlen - sizeof(service)) || dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { kfree(sl); return -EFAULT; @@ -482,7 +482,7 @@ static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) } static int dccp_setsockopt_ccid(struct sock *sk, int type, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { u8 *val; int rc = 0; @@ -490,7 +490,7 @@ static int dccp_setsockopt_ccid(struct sock *sk, int type, if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) return -EINVAL; - val = memdup_user(optval, optlen); + val = memdup_sockptr(optval, optlen); if (IS_ERR(val)) return PTR_ERR(val); @@ -507,7 +507,7 @@ static int dccp_setsockopt_ccid(struct sock *sk, int type, } static int do_dccp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); int val, err = 0; @@ -529,7 +529,7 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, if (optlen < (int)sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; if (optname == DCCP_SOCKOPT_SERVICE) @@ -572,8 +572,8 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, return err; } -int dccp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { if (level != SOL_DCCP) return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 7d51ab608fb3..3b53d766789d 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -150,7 +150,8 @@ static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; static atomic_long_t decnet_memory_allocated; -static int 
__dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags); +static int __dn_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen, int flags); static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); static struct hlist_head *dn_find_list(struct sock *sk) @@ -1320,7 +1321,8 @@ out: return err; } -static int dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +static int dn_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err; @@ -1332,14 +1334,14 @@ static int dn_setsockopt(struct socket *sock, int level, int optname, char __use /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != DSO_LINKINFO && optname != DSO_STREAM && optname != DSO_SEQPACKET) - err = nf_setsockopt(sk, PF_DECnet, optname, - USER_SOCKPTR(optval), optlen); + err = nf_setsockopt(sk, PF_DECnet, optname, optval, optlen); #endif return err; } -static int __dn_setsockopt(struct socket *sock, int level,int optname, char __user *optval, unsigned int optlen, int flags) +static int __dn_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen, int flags) { struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); @@ -1355,13 +1357,13 @@ static int __dn_setsockopt(struct socket *sock, int level,int optname, char __us } u; int err; - if (optlen && !optval) + if (optlen && sockptr_is_null(optval)) return -EINVAL; if (optlen > sizeof(u)) return -EINVAL; - if (copy_from_user(&u, optval, optlen)) + if (copy_from_sockptr(&u, optval, optlen)) return -EFAULT; switch (optname) { diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 94ae9662133e..a45a0401adc5 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -382,7 +382,7 @@ static int raw_getsockopt(struct sock *sk, int level, int optname, } static int raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { return -EOPNOTSUPP; } @@ -872,7 +872,7 @@ static int dgram_getsockopt(struct sock *sk, int level, int optname, } static int dgram_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct dgram_sock *ro = dgram_sk(sk); struct net *net = sock_net(sk); @@ -882,7 +882,7 @@ static int dgram_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f7f1507b89fe..8dc027e54c5b 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1401,21 +1401,19 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); } -int ip_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, unsigned int optlen) +int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { int err; if (level != SOL_IP) return -ENOPROTOOPT; - err = do_ip_setsockopt(sk, level, optname, USER_SOCKPTR(optval), - optlen); + err = do_ip_setsockopt(sk, level, optname, optval, optlen); #if 
IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_SET_REPLACE && optname < BPFILTER_IPT_SET_MAX) - err = bpfilter_ip_set_sockopt(sk, optname, USER_SOCKPTR(optval), - optlen); + err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen); #endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ @@ -1423,8 +1421,7 @@ int ip_setsockopt(struct sock *sk, int level, optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY && !ip_mroute_opt(optname)) - err = nf_setsockopt(sk, PF_INET, optname, USER_SOCKPTR(optval), - optlen); + err = nf_setsockopt(sk, PF_INET, optname, optval, optlen); #endif return err; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2a57d633b31e..6fd4330287c2 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -809,11 +809,11 @@ static int raw_sk_init(struct sock *sk) return 0; } -static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen) +static int raw_seticmpfilter(struct sock *sk, sockptr_t optval, int optlen) { if (optlen > sizeof(struct icmp_filter)) optlen = sizeof(struct icmp_filter); - if (copy_from_user(&raw_sk(sk)->filter, optval, optlen)) + if (copy_from_sockptr(&raw_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; } @@ -838,7 +838,7 @@ out: return ret; } static int do_raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) @@ -850,7 +850,7 @@ static int do_raw_setsockopt(struct sock *sk, int level, int optname, } static int raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { if (level != SOL_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71cbc61c335f..27de9380ed14 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3323,7 +3323,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, return err; } -int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, +int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3331,8 +3331,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, if (level != SOL_TCP) return icsk->icsk_af_ops->setsockopt(sk, level, optname, optval, optlen); - return do_tcp_setsockopt(sk, level, optname, USER_SOCKPTR(optval), - optlen); + return do_tcp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(tcp_setsockopt); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c6cb2d09dbc7..5a6a2f6d86b9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2703,12 +2703,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, } EXPORT_SYMBOL(udp_lib_setsockopt); -int udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) return udp_lib_setsockopt(sk, level, optname, - USER_SOCKPTR(optval), optlen, + optval, optlen, udp_push_pending_frames); return ip_setsockopt(sk, level, optname, optval, optlen); } diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index ab313702c87f..2878d8285caf 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -12,8 +12,8 @@ int 
__udp4_lib_err(struct sk_buff *, u32, struct udp_table *); int udp_v4_get_port(struct sock *sk, unsigned short snum); void udp_v4_rehash(struct sock *sk); -int udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); +int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index dcd000a5a9b1..d2282f5c9760 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -980,8 +980,8 @@ e_inval: return -EINVAL; } -int ipv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { int err; @@ -991,14 +991,12 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, if (level != SOL_IPV6) return -ENOPROTOOPT; - err = do_ipv6_setsockopt(sk, level, optname, USER_SOCKPTR(optval), - optlen); + err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && optname != IPV6_XFRM_POLICY) - err = nf_setsockopt(sk, PF_INET6, optname, USER_SOCKPTR(optval), - optlen); + err = nf_setsockopt(sk, PF_INET6, optname, optval, optlen); #endif return err; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 594e01ad670a..874f01cd7aec 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -972,13 +972,13 @@ do_confirm: } static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + sockptr_t optval, int optlen) { switch (optname) { case ICMPV6_FILTER: if (optlen > sizeof(struct icmp6_filter)) optlen = sizeof(struct icmp6_filter); - if (copy_from_user(&raw6_sk(sk)->filter, optval, optlen)) + if (copy_from_sockptr(&raw6_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; default: @@ -1015,12 +1015,12 @@ static int rawv6_geticmpfilter(struct sock *sk, int level, int optname, static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct raw6_sock *rp = raw6_sk(sk); int val; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { @@ -1062,7 +1062,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, } static int rawv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { switch (level) { case SOL_RAW: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2df1e6c9d7cb..15818e18655d 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1618,12 +1618,12 @@ void udpv6_destroy_sock(struct sock *sk) /* * Socket option code for UDP */ -int udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) return udp_lib_setsockopt(sk, level, optname, - USER_SOCKPTR(optval), optlen, + optval, optlen, udp_v6_push_pending_frames); return ipv6_setsockopt(sk, level, optname, optval, optlen); } diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 
30dfb6f1b762..b2fcc46c1630 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -17,8 +17,8 @@ void udp_v6_rehash(struct sock *sk); int udpv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); -int udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); +int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index ee0add15497d..6ee9851ac7c6 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1494,7 +1494,7 @@ static int iucv_sock_release(struct socket *sock) /* getsockopt and setsockopt */ static int iucv_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); @@ -1507,7 +1507,7 @@ static int iucv_sock_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *) optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; rc = 0; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 56fac24a627a..56dad9565bc9 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1265,7 +1265,7 @@ static void kcm_recv_enable(struct kcm_sock *kcm) } static int kcm_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct kcm_sock *kcm = kcm_sk(sock->sk); int val, valbool; @@ -1277,8 +1277,8 @@ static int kcm_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; valbool = val ? 1 : 0; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index e58fe7e3b884..4389df66af35 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1242,7 +1242,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk, * session or the special tunnel type. */ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2tp_session *session; @@ -1256,7 +1256,7 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; err = -ENOTCONN; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 6140a3e46c26..7180979114e4 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1053,7 +1053,7 @@ static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, * Set various connection specific parameters. 
*/ static int llc_ui_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); @@ -1063,7 +1063,7 @@ static int llc_ui_setsockopt(struct socket *sock, int level, int optname, lock_sock(sk); if (unlikely(level != SOL_LLC || optlen != sizeof(int))) goto out; - rc = get_user(opt, (int __user *)optval); + rc = copy_from_sockptr(&opt, optval, sizeof(opt)); if (rc) goto out; rc = -EINVAL; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 7246847efa90..2891ae8a1028 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1632,7 +1632,7 @@ static void mptcp_destroy(struct sock *sk) } static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; struct socket *ssock; @@ -1648,8 +1648,7 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return -EINVAL; } - ret = sock_setsockopt(ssock, SOL_SOCKET, optname, - USER_SOCKPTR(optval), optlen); + ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); if (ret == 0) { if (optname == SO_REUSEPORT) sk->sk_reuseport = ssock->sk->sk_reuseport; @@ -1660,12 +1659,11 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return ret; } - return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, - USER_SOCKPTR(optval), optlen); + return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); } static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; int ret = -EOPNOTSUPP; @@ -1692,7 +1690,7 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, } static int mptcp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 3cd58f0c2de4..d8921b833744 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1621,7 +1621,7 @@ static void netlink_update_socket_mc(struct netlink_sock *nlk, } static int netlink_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); @@ -1632,7 +1632,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; if (optlen >= sizeof(int) && - get_user(val, (unsigned int __user *)optval)) + copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index f90ef6934b8f..6d16e1ab1a8a 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -294,7 +294,7 @@ void nr_destroy_socket(struct sock *sk) */ static int nr_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); @@ -306,7 +306,7 @@ static int nr_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(unsigned int)) return -EINVAL; - if (get_user(opt, (unsigned int __user *)optval)) + if 
(copy_from_sockptr(&opt, optval, sizeof(unsigned int))) return -EFAULT; switch (optname) { diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 6da1e2334bb6..d257ed3b732a 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -218,7 +218,7 @@ error: } static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); @@ -241,7 +241,7 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -263,7 +263,7 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d8d4f78f78e4..0b8160d1a6e0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1558,7 +1558,7 @@ static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data, return 0; } -static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, +static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; @@ -1568,7 +1568,7 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, return -EPERM; if (len != sizeof(fd)) return -EINVAL; - if (copy_from_user(&fd, data, len)) + if (copy_from_sockptr(&fd, data, len)) return -EFAULT; new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); @@ -1579,12 +1579,12 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, return 0; } -static int fanout_set_data(struct packet_sock *po, char __user *data, +static int fanout_set_data(struct packet_sock *po, sockptr_t data, unsigned int len) { switch (po->fanout->type) { case PACKET_FANOUT_CBPF: - return fanout_set_data_cbpf(po, USER_SOCKPTR(data), len); + return fanout_set_data_cbpf(po, data, len); case PACKET_FANOUT_EBPF: return fanout_set_data_ebpf(po, data, len); default: @@ -3652,7 +3652,8 @@ static void packet_flush_mclist(struct sock *sk) } static int -packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, + unsigned int optlen) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); @@ -3672,7 +3673,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return -EINVAL; if (len > sizeof(mreq)) len = sizeof(mreq); - if (copy_from_user(&mreq, optval, len)) + if (copy_from_sockptr(&mreq, optval, len)) return -EFAULT; if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) return -EINVAL; @@ -3703,7 +3704,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen < len) { ret = -EINVAL; } else { - if (copy_from_user(&req_u.req, optval, len)) + if (copy_from_sockptr(&req_u.req, optval, len)) ret = -EFAULT; else ret = packet_set_ring(sk, &req_u, 0, @@ -3718,7 +3719,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; pkt_sk(sk)->copy_thresh = val; @@ -3730,7 
+3731,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: @@ -3756,7 +3757,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; @@ -3776,7 +3777,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3795,7 +3796,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen < sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3809,7 +3810,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen < sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3825,7 +3826,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return -EINVAL; if (optlen < sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3844,7 +3845,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; po->tp_tstamp = val; @@ -3856,7 +3857,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; return fanout_add(sk, val & 0xffff, val >> 16); @@ -3874,7 +3875,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val < 0 || val > 1) return -EINVAL; @@ -3888,7 +3889,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3907,7 +3908,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; po->xmit = val ? 
packet_direct_xmit : dev_queue_xmit; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 4577e43cb777..e47d09aca4af 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -975,7 +975,7 @@ static int pep_init(struct sock *sk) } static int pep_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct pep_sock *pn = pep_sk(sk); int val = 0, err = 0; @@ -983,7 +983,7 @@ static int pep_setsockopt(struct sock *sk, int level, int optname, if (level != SOL_PNPIPE) return -ENOPROTOOPT; if (optlen >= sizeof(int)) { - if (get_user(val, (int __user *) optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; } diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 1a5bf3fa4578..b239120dd9ca 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -290,8 +290,7 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return 0; } -static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, - int len) +static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len) { struct sockaddr_in6 sin6; struct sockaddr_in sin; @@ -308,14 +307,15 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, goto out; } else if (len < sizeof(struct sockaddr_in6)) { /* Assume IPv4 */ - if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) { + if (copy_from_sockptr(&sin, optval, + sizeof(struct sockaddr_in))) { ret = -EFAULT; goto out; } ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr); sin6.sin6_port = sin.sin_port; } else { - if (copy_from_user(&sin6, optval, + if (copy_from_sockptr(&sin6, optval, sizeof(struct sockaddr_in6))) { ret = -EFAULT; goto out; @@ -327,21 +327,20 @@ out: return ret; } -static int rds_set_bool_option(unsigned char *optvar, char __user *optval, +static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval, int optlen) { int value; if (optlen < sizeof(int)) return -EINVAL; - if (get_user(value, (int __user *) optval)) + if (copy_from_sockptr(&value, optval, sizeof(int))) return -EFAULT; *optvar = !!value; return 0; } -static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, - int optlen) +static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen) { int ret; @@ -358,8 +357,7 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, return ret; } -static int rds_set_transport(struct rds_sock *rs, char __user *optval, - int optlen) +static int rds_set_transport(struct rds_sock *rs, sockptr_t optval, int optlen) { int t_type; @@ -369,7 +367,7 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval, if (optlen != sizeof(int)) return -EINVAL; - if (copy_from_user(&t_type, (int __user *)optval, sizeof(t_type))) + if (copy_from_sockptr(&t_type, optval, sizeof(t_type))) return -EFAULT; if (t_type < 0 || t_type >= RDS_TRANS_COUNT) @@ -380,7 +378,7 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval, return rs->rs_transport ? 0 : -ENOPROTOOPT; } -static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, +static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval, int optlen, int optname) { int val, valbool; @@ -388,7 +386,7 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, if (optlen != sizeof(int)) return -EFAULT; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; valbool = val ? 
1 : 0; @@ -404,7 +402,7 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, return 0; } -static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, +static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_rx_trace_so trace; @@ -413,7 +411,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, if (optlen != sizeof(struct rds_rx_trace_so)) return -EFAULT; - if (copy_from_user(&trace, optval, sizeof(trace))) + if (copy_from_sockptr(&trace, optval, sizeof(trace))) return -EFAULT; if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX) @@ -432,7 +430,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, } static int rds_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); int ret; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index a7ae11846cd7..ccdd304eae0a 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -353,21 +353,20 @@ out: return ret; } -int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen) +int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_args args; if (optlen != sizeof(struct rds_get_mr_args)) return -EINVAL; - if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval, - sizeof(struct rds_get_mr_args))) + if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_args))) return -EFAULT; return __rds_rdma_map(rs, &args, NULL, NULL, NULL); } -int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) +int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_for_dest_args args; struct rds_get_mr_args new_args; @@ -375,7 +374,7 @@ int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) if (optlen != sizeof(struct rds_get_mr_for_dest_args)) return -EINVAL; - if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval, + if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_for_dest_args))) return -EFAULT; @@ -394,7 +393,7 @@ int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) /* * Free the MR indicated by the given R_Key */ -int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen) +int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_free_mr_args args; struct rds_mr *mr; @@ -403,8 +402,7 @@ int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen) if (optlen != sizeof(struct rds_free_mr_args)) return -EINVAL; - if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval, - sizeof(struct rds_free_mr_args))) + if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args))) return -EFAULT; /* Special case - a null cookie means flush all unused MRs */ diff --git a/net/rds/rds.h b/net/rds/rds.h index 106e862996b9..d35d1fc39807 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -924,9 +924,9 @@ int rds_send_pong(struct rds_conn_path *cp, __be16 dport); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); -int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); -int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); -int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); +int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen); +int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int 
optlen); +int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen); void rds_rdma_drop_keys(struct rds_sock *rs); int rds_rdma_extra_size(struct rds_rdma_args *args, struct rds_iov_vector *iov); diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ce85656ac9c1..cf7d974e0f61 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -365,7 +365,7 @@ void rose_destroy_socket(struct sock *sk) */ static int rose_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); @@ -377,7 +377,7 @@ static int rose_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(opt, (int __user *)optval)) + if (copy_from_sockptr(&opt, optval, sizeof(int))) return -EFAULT; switch (optname) { diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index cd7d0d204c74..e6725a6de015 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -588,7 +588,7 @@ EXPORT_SYMBOL(rxrpc_sock_set_min_security_level); * set RxRPC socket options */ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); unsigned int min_sec_level; @@ -639,8 +639,8 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; - ret = get_user(min_sec_level, - (unsigned int __user *) optval); + ret = copy_from_sockptr(&min_sec_level, optval, + sizeof(unsigned int)); if (ret < 0) goto error; ret = -EINVAL; @@ -658,7 +658,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, if (rx->sk.sk_state != RXRPC_SERVER_BOUND2) goto error; ret = -EFAULT; - if (copy_from_user(service_upgrade, optval, + if (copy_from_sockptr(service_upgrade, optval, sizeof(service_upgrade)) != 0) goto error; ret = -EINVAL; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9a2139ebd67d..6d29a3603a3e 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -909,8 +909,8 @@ extern const struct rxrpc_security rxrpc_no_security; extern struct key_type key_type_rxrpc; extern struct key_type key_type_rxrpc_s; -int rxrpc_request_key(struct rxrpc_sock *, char __user *, int); -int rxrpc_server_keyring(struct rxrpc_sock *, char __user *, int); +int rxrpc_request_key(struct rxrpc_sock *, sockptr_t , int); +int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int); int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t, u32); diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 0c98313dd7a8..94c3df392651 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -896,7 +896,7 @@ static void rxrpc_describe(const struct key *key, struct seq_file *m) /* * grab the security key for a socket */ -int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) +int rxrpc_request_key(struct rxrpc_sock *rx, sockptr_t optval, int optlen) { struct key *key; char *description; @@ -906,7 +906,7 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) if (optlen <= 0 || optlen > PAGE_SIZE - 1) return -EINVAL; - description = memdup_user_nul(optval, optlen); + description = memdup_sockptr_nul(optval, optlen); if (IS_ERR(description)) return PTR_ERR(description); @@ -926,8 +926,7 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user 
*optval, int optlen) /* * grab the security keyring for a server socket */ -int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, - int optlen) +int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) { struct key *key; char *description; @@ -937,7 +936,7 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, if (optlen <= 0 || optlen > PAGE_SIZE - 1) return -EINVAL; - description = memdup_user_nul(optval, optlen); + description = memdup_sockptr_nul(optval, optlen); if (IS_ERR(description)) return PTR_ERR(description); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9a767f359718..144808dfea9e 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4429,7 +4429,7 @@ out: * optlen - the size of the buffer. */ static int sctp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { void *kopt = NULL; int retval = 0; @@ -4449,7 +4449,7 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, } if (optlen > 0) { - kopt = memdup_user(optval, optlen); + kopt = memdup_sockptr(optval, optlen); if (IS_ERR(kopt)) return PTR_ERR(kopt); } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9711c9e0e515..4ac1d4de6676 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1731,7 +1731,7 @@ out: } static int smc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -1754,7 +1754,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); diff --git a/net/socket.c b/net/socket.c index c97f83d879ae..e44b8ac47f6f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2094,10 +2094,10 @@ static bool sock_use_custom_sol_socket(const struct socket *sock) * Set a socket option. Because we don't know the option lengths we have * to pass the user mode parameter for the protocols to sort out. 
*/ -int __sys_setsockopt(int fd, int level, int optname, char __user *optval, +int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, int optlen) { - mm_segment_t oldfs = get_fs(); + sockptr_t optval = USER_SOCKPTR(user_optval); char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2115,7 +2115,7 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *optval, if (!in_compat_syscall()) err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname, - optval, &optlen, + user_optval, &optlen, &kernel_optval); if (err < 0) goto out_put; @@ -2124,25 +2124,16 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *optval, goto out_put; } - if (kernel_optval) { - set_fs(KERNEL_DS); - optval = (char __user __force *)kernel_optval; - } - + if (kernel_optval) + optval = KERNEL_SOCKPTR(kernel_optval); if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock)) - err = sock_setsockopt(sock, level, optname, - USER_SOCKPTR(optval), optlen); + err = sock_setsockopt(sock, level, optname, optval, optlen); else if (unlikely(!sock->ops->setsockopt)) err = -EOPNOTSUPP; else err = sock->ops->setsockopt(sock, level, optname, optval, optlen); - - if (kernel_optval) { - set_fs(oldfs); - kfree(kernel_optval); - } - + kfree(kernel_optval); out_put: fput_light(sock->file, fput_needed); return err; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index fc388cef6471..07419f36116a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3103,7 +3103,7 @@ static int tipc_sk_leave(struct tipc_sock *tsk) * Returns 0 on success, errno otherwise */ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, - char __user *ov, unsigned int ol) + sockptr_t ov, unsigned int ol) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); @@ -3124,17 +3124,17 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_NODELAY: if (ol < sizeof(value)) return -EINVAL; - if (get_user(value, (u32 __user *)ov)) + if (copy_from_sockptr(&value, ov, sizeof(u32))) return -EFAULT; break; case TIPC_GROUP_JOIN: if (ol < sizeof(mreq)) return -EINVAL; - if (copy_from_user(&mreq, ov, sizeof(mreq))) + if (copy_from_sockptr(&mreq, ov, sizeof(mreq))) return -EFAULT; break; default: - if (ov || ol) + if (!sockptr_is_null(ov) || ol) return -EINVAL; } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index ec10041c6b7d..d77f7d821130 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -450,7 +450,7 @@ static int tls_getsockopt(struct sock *sk, int level, int optname, return do_tls_getsockopt(sk, optname, optval, optlen); } -static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, +static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, unsigned int optlen, int tx) { struct tls_crypto_info *crypto_info; @@ -460,7 +460,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, int rc = 0; int conf; - if (!optval || (optlen < sizeof(*crypto_info))) { + if (sockptr_is_null(optval) || (optlen < sizeof(*crypto_info))) { rc = -EINVAL; goto out; } @@ -479,7 +479,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, goto out; } - rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info)); + rc = copy_from_sockptr(crypto_info, optval, sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto err_crypto_info; @@ -522,8 +522,9 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, goto err_crypto_info; } - rc = copy_from_user(crypto_info + 1, optval + 
sizeof(*crypto_info), - optlen - sizeof(*crypto_info)); + sockptr_advance(optval, sizeof(*crypto_info)); + rc = copy_from_sockptr(crypto_info + 1, optval, + optlen - sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto err_crypto_info; @@ -579,8 +580,8 @@ out: return rc; } -static int do_tls_setsockopt(struct sock *sk, int optname, - char __user *optval, unsigned int optlen) +static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, + unsigned int optlen) { int rc = 0; @@ -600,7 +601,7 @@ static int do_tls_setsockopt(struct sock *sk, int optname, } static int tls_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct tls_context *ctx = tls_get_ctx(sk); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index df204c6761c4..27bbcfad9c17 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1517,7 +1517,7 @@ static void vsock_update_buffer_size(struct vsock_sock *vsk, static int vsock_stream_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, + sockptr_t optval, unsigned int optlen) { int err; @@ -1535,7 +1535,7 @@ static int vsock_stream_setsockopt(struct socket *sock, err = -EINVAL; \ goto exit; \ } \ - if (copy_from_user(&_v, optval, sizeof(_v)) != 0) { \ + if (copy_from_sockptr(&_v, optval, sizeof(_v)) != 0) { \ err = -EFAULT; \ goto exit; \ } \ diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index d5b09bbff375..0bbb283f23c9 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -431,7 +431,7 @@ void x25_destroy_socket_from_timer(struct sock *sk) */ static int x25_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { int opt; struct sock *sk = sock->sk; @@ -445,7 +445,7 @@ static int x25_setsockopt(struct socket *sock, int level, int optname, goto out; rc = -EFAULT; - if (get_user(opt, (int __user *)optval)) + if (copy_from_sockptr(&opt, optval, sizeof(int))) goto out; if (opt) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 26e3bba8c204..2e94a7e94671 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -702,7 +702,7 @@ struct xdp_umem_reg_v1 { }; static int xsk_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); @@ -720,7 +720,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(entries)) return -EINVAL; - if (copy_from_user(&entries, optval, sizeof(entries))) + if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); @@ -747,7 +747,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, else if (optlen < sizeof(mr)) mr_size = sizeof(struct xdp_umem_reg_v1); - if (copy_from_user(&mr, optval, mr_size)) + if (copy_from_sockptr(&mr, optval, mr_size)) return -EFAULT; mutex_lock(&xs->mutex); @@ -774,7 +774,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xsk_queue **q; int entries; - if (copy_from_user(&entries, optval, sizeof(entries))) + if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); -- cgit v1.2.3 From 0bac966a1f2ae0e3cbc259c5bb10aab7bbcf8f4b Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 28 Jul 2020 15:11:59 -0700 Subject: mptcp: Allow DATA_FIN in headers without TCP FIN RFC 8684-compliant 
DATA_FIN needs to be sent and ack'd before subflows are closed with TCP FIN, so write DATA_FIN DSS headers whenever their transmission has been enabled by the MPTCP connection-level socket. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 3bc56eb608d8..0b122b2a9c69 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -482,17 +482,10 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, struct mptcp_sock *msk; unsigned int ack_size; bool ret = false; - u8 tcp_fin; - if (skb) { - mpext = mptcp_get_ext(skb); - tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; - } else { - mpext = NULL; - tcp_fin = 0; - } + mpext = skb ? mptcp_get_ext(skb) : NULL; - if (!skb || (mpext && mpext->use_map) || tcp_fin) { + if (!skb || (mpext && mpext->use_map) || subflow->data_fin_tx_enable) { unsigned int map_size; map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; @@ -502,7 +495,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, if (mpext) opts->ext_copy = *mpext; - if (skb && tcp_fin && subflow->data_fin_tx_enable) + if (skb && subflow->data_fin_tx_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); ret = true; } -- cgit v1.2.3 From 57baaf2875404b555587391608da1625863086fa Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 28 Jul 2020 15:12:00 -0700 Subject: mptcp: Return EPIPE if sending is shut down during a sendmsg A MPTCP socket where sending has been shut down should not attempt to send additional data, since DATA_FIN has already been sent. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2891ae8a1028..b3c3dbc89b3f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -748,6 +748,11 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) restart: mptcp_clean_una(sk); + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { + ret = -EPIPE; + goto out; + } + wait_for_sndbuf: __mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); -- cgit v1.2.3 From 242e63f651e94da5fa3cbe6ae0a62dd219226418 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 28 Jul 2020 15:12:01 -0700 Subject: mptcp: Remove outdated and incorrect comment mptcp_close() acquires the msk lock, so it clearly should not be held before the function is called. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b3c3dbc89b3f..7d7e0fa17219 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1421,7 +1421,6 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how, release_sock(ssk); } -/* Called with msk lock held, releases such lock before returning */ static void mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow, *tmp; -- cgit v1.2.3 From 7279da6145bbb2e41a61def5d9bca5b65f12de9d Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 28 Jul 2020 15:12:02 -0700 Subject: mptcp: Use MPTCP-level flag for sending DATA_FIN Since DATA_FIN information is the same for every subflow, store it only in the mptcp_sock. Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/mptcp/options.c | 18 ++++++++++++------ net/mptcp/protocol.c | 21 +++++---------------- net/mptcp/protocol.h | 3 +-- 3 files changed, 18 insertions(+), 24 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 0b122b2a9c69..f157cb7e14c0 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -451,6 +451,8 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_ext *ext) { + u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq); + if (!ext->use_map || !skb->len) { /* RFC6824 requires a DSS mapping with specific values * if DATA_FIN is set but no data payload is mapped @@ -458,10 +460,13 @@ static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, ext->data_fin = 1; ext->use_map = 1; ext->dsn64 = 1; - ext->data_seq = subflow->data_fin_tx_seq; + /* The write_seq value has already been incremented, so + * the actual sequence number for the DATA_FIN is one less. + */ + ext->data_seq = data_fin_tx_seq - 1; ext->subflow_seq = 0; ext->data_len = 1; - } else if (ext->data_seq + ext->data_len == subflow->data_fin_tx_seq) { + } else if (ext->data_seq + ext->data_len == data_fin_tx_seq) { /* If there's an existing DSS mapping and it is the * final mapping, DATA_FIN consumes 1 additional byte of * mapping space. @@ -477,15 +482,17 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int dss_size = 0; + u64 snd_data_fin_enable; struct mptcp_ext *mpext; - struct mptcp_sock *msk; unsigned int ack_size; bool ret = false; mpext = skb ? 
mptcp_get_ext(skb) : NULL; + snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable); - if (!skb || (mpext && mpext->use_map) || subflow->data_fin_tx_enable) { + if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size; map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; @@ -495,7 +502,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, if (mpext) opts->ext_copy = *mpext; - if (skb && subflow->data_fin_tx_enable) + if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); ret = true; } @@ -504,7 +511,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, * if the first subflow may have the already the remote key handy */ opts->ext_copy.use_ack = 0; - msk = mptcp_sk(subflow->conn); if (!READ_ONCE(msk->can_ack)) { *size = ALIGN(dss_size, 4); return ret; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 7d7e0fa17219..dd403ba3679a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1391,8 +1391,7 @@ static void mptcp_cancel_work(struct sock *sk) sock_put(sk); } -static void mptcp_subflow_shutdown(struct sock *ssk, int how, - bool data_fin_tx_enable, u64 data_fin_tx_seq) +static void mptcp_subflow_shutdown(struct sock *ssk, int how) { lock_sock(ssk); @@ -1405,14 +1404,6 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how, tcp_disconnect(ssk, O_NONBLOCK); break; default: - if (data_fin_tx_enable) { - struct mptcp_subflow_context *subflow; - - subflow = mptcp_subflow_ctx(ssk); - subflow->data_fin_tx_seq = data_fin_tx_seq; - subflow->data_fin_tx_enable = 1; - } - ssk->sk_shutdown |= how; tcp_shutdown(ssk, how); break; @@ -1426,7 +1417,6 @@ static void mptcp_close(struct sock *sk, long timeout) struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); LIST_HEAD(conn_list); - u64 data_fin_tx_seq; lock_sock(sk); @@ -1440,7 +1430,7 @@ static void mptcp_close(struct sock *sk, long timeout) spin_unlock_bh(&msk->join_list_lock); list_splice_init(&msk->conn_list, &conn_list); - data_fin_tx_seq = msk->write_seq; + msk->snd_data_fin_enable = 1; __mptcp_clear_xmit(sk); @@ -1448,9 +1438,6 @@ static void mptcp_close(struct sock *sk, long timeout) list_for_each_entry_safe(subflow, tmp, &conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - subflow->data_fin_tx_seq = data_fin_tx_seq; - subflow->data_fin_tx_enable = 1; __mptcp_close_ssk(sk, ssk, subflow, timeout); } @@ -2146,10 +2133,12 @@ static int mptcp_shutdown(struct socket *sock, int how) } __mptcp_flush_join_list(msk); + msk->snd_data_fin_enable = 1; + mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); - mptcp_subflow_shutdown(tcp_sk, how, 1, msk->write_seq); + mptcp_subflow_shutdown(tcp_sk, how); } /* Wake up anyone sleeping in poll. 
*/ diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 67634b595466..3f49cc105772 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -199,6 +199,7 @@ struct mptcp_sock { unsigned long flags; bool can_ack; bool fully_established; + bool snd_data_fin_enable; spinlock_t join_list_lock; struct work_struct work; struct list_head conn_list; @@ -291,10 +292,8 @@ struct mptcp_subflow_context { backup : 1, data_avail : 1, rx_eof : 1, - data_fin_tx_enable : 1, use_64bit_ack : 1, /* Set when we received a 64-bit DSN */ can_ack : 1; /* only after processing the remote a key */ - u64 data_fin_tx_seq; u32 remote_nonce; u64 thmac; u32 local_nonce; -- cgit v1.2.3 From 3721b9b64676b3377a966f3d96acafd70bb32dd9 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 28 Jul 2020 15:12:03 -0700 Subject: mptcp: Track received DATA_FIN sequence number and add related helpers Incoming DATA_FIN headers need to propagate the presence of the DATA_FIN bit and the associated sequence number to the MPTCP layer, even when arriving on a bare ACK that does not get added to the receive queue. Add structure members to store the DATA_FIN information and helpers to set and check those values. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 16 ++++++++ net/mptcp/protocol.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++----- net/mptcp/protocol.h | 3 ++ 3 files changed, 115 insertions(+), 10 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index f157cb7e14c0..38583d1b9b5f 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -782,6 +782,22 @@ static void update_una(struct mptcp_sock *msk, } } +bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq) +{ + /* Skip if DATA_FIN was already received. + * If updating simultaneously with the recvmsg loop, values + * should match. If they mismatch, the peer is misbehaving and + * we will prefer the most recent information. + */ + if (READ_ONCE(msk->rcv_data_fin) || !READ_ONCE(msk->first)) + return false; + + WRITE_ONCE(msk->rcv_data_fin_seq, data_fin_seq); + WRITE_ONCE(msk->rcv_data_fin, 1); + + return true; +} + static bool add_addr_hmac_valid(struct mptcp_sock *msk, struct mptcp_options_received *mp_opt) { diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index dd403ba3679a..e1c71bfd61a3 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -16,6 +16,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include #endif @@ -163,6 +164,101 @@ static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk, return mptcp_subflow_data_available(ssk); } +static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (READ_ONCE(msk->rcv_data_fin) && + ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { + u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); + + if (msk->ack_seq == rcv_data_fin_seq) { + if (seq) + *seq = rcv_data_fin_seq; + + return true; + } + } + + return false; +} + +static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) +{ + long tout = ssk && inet_csk(ssk)->icsk_pending ? + inet_csk(ssk)->icsk_timeout - jiffies : 0; + + if (tout <= 0) + tout = mptcp_sk(sk)->timer_ival; + mptcp_sk(sk)->timer_ival = tout > 0 ? 
tout : TCP_RTO_MIN; +} + +static void mptcp_check_data_fin(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + u64 rcv_data_fin_seq; + + if (__mptcp_check_fallback(msk) || !msk->first) + return; + + /* Need to ack a DATA_FIN received from a peer while this side + * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. + * msk->rcv_data_fin was set when parsing the incoming options + * at the subflow level and the msk lock was not held, so this + * is the first opportunity to act on the DATA_FIN and change + * the msk state. + * + * If we are caught up to the sequence number of the incoming + * DATA_FIN, send the DATA_ACK now and do state transition. If + * not caught up, do nothing and let the recv code send DATA_ACK + * when catching up. + */ + + if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { + struct mptcp_subflow_context *subflow; + + msk->ack_seq++; + WRITE_ONCE(msk->rcv_data_fin, 0); + + sk->sk_shutdown |= RCV_SHUTDOWN; + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + inet_sk_state_store(sk, TCP_CLOSE_WAIT); + break; + case TCP_FIN_WAIT1: + inet_sk_state_store(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + inet_sk_state_store(sk, TCP_CLOSE); + // @@ Close subflows now? + break; + default: + /* Other states not expected */ + WARN_ON_ONCE(1); + break; + } + + mptcp_set_timeout(sk, NULL); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + tcp_send_ack(ssk); + release_sock(ssk); + } + + sk->sk_state_change(sk); + + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + } +} + static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sock *ssk, unsigned int *bytes) @@ -303,16 +399,6 @@ static void __mptcp_flush_join_list(struct mptcp_sock *msk) spin_unlock_bh(&msk->join_list_lock); } -static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) -{ - long tout = ssk && inet_csk(ssk)->icsk_pending ? - inet_csk(ssk)->icsk_timeout - jiffies : 0; - - if (tout <= 0) - tout = mptcp_sk(sk)->timer_ival; - mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; -} - static bool mptcp_timer_pending(struct sock *sk) { return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 3f49cc105772..beb34b8a5363 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -193,12 +193,14 @@ struct mptcp_sock { u64 remote_key; u64 write_seq; u64 ack_seq; + u64 rcv_data_fin_seq; atomic64_t snd_una; unsigned long timer_ival; u32 token; unsigned long flags; bool can_ack; bool fully_established; + bool rcv_data_fin; bool snd_data_fin_enable; spinlock_t join_list_lock; struct work_struct work; @@ -385,6 +387,7 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); void mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); +bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq); void __init mptcp_token_init(void); static inline void mptcp_token_init_request(struct request_sock *req) -- cgit v1.2.3 From 6920b851584cc69a61ebf2cff3948bb153bcef20 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 28 Jul 2020 15:12:04 -0700 Subject: mptcp: Add mptcp_close_state() helper This will be used to transition to the appropriate state on close and determine if a DATA_FIN needs to be sent for that state transition. 
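For illustration only, a minimal sketch (not part of this patch) of how a
close-path caller could consume the TCP_ACTION_FIN bit returned by the new
helper, by analogy with tcp_close_state()/tcp_send_fin(); the function name
below is hypothetical, and snd_data_fin_enable is the MPTCP-level flag
introduced earlier in this series:

	/* Hypothetical caller sketch, not part of this patch. */
	static void mptcp_example_close_path(struct sock *sk)
	{
		struct mptcp_sock *msk = mptcp_sk(sk);

		/* mptcp_close_state() moves sk to the next close state and
		 * returns TCP_ACTION_FIN when a DATA_FIN must be signalled.
		 */
		if (mptcp_close_state(sk))
			WRITE_ONCE(msk->snd_data_fin_enable, 1);
	}
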
Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e1c71bfd61a3..51370b69e30b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1498,6 +1498,33 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how) release_sock(ssk); } +static const unsigned char new_state[16] = { + /* current state: new state: action: */ + [0 /* (Invalid) */] = TCP_CLOSE, + [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_SYN_SENT] = TCP_CLOSE, + [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, + [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, + [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */ + [TCP_CLOSE] = TCP_CLOSE, + [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, + [TCP_LAST_ACK] = TCP_LAST_ACK, + [TCP_LISTEN] = TCP_CLOSE, + [TCP_CLOSING] = TCP_CLOSING, + [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ +}; + +static int mptcp_close_state(struct sock *sk) +{ + int next = (int)new_state[sk->sk_state]; + int ns = next & TCP_STATE_MASK; + + inet_sk_state_store(sk, ns); + + return next & TCP_ACTION_FIN; +} + static void mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow, *tmp; -- cgit v1.2.3 From 16a9a9da17234797b01ca05024d33269872a0ae0 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 28 Jul 2020 15:12:05 -0700 Subject: mptcp: Add helper to process acks of DATA_FIN After DATA_FIN has been sent, the peer will acknowledge it. An ack of the relevant MPTCP-level sequence number will update the MPTCP connection state appropriately. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 54 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 8 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 51370b69e30b..b3350830e14d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -143,6 +143,14 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->offset = offset; } +static void mptcp_stop_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + mptcp_sk(sk)->timer_ival = 0; +} + /* both sockets must be locked */ static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk, struct sock *ssk) @@ -164,6 +172,42 @@ static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk, return mptcp_subflow_data_available(ssk); } +static void mptcp_check_data_fin_ack(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (__mptcp_check_fallback(msk)) + return; + + /* Look for an acknowledged DATA_FIN */ + if (((1 << sk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && + msk->write_seq == atomic64_read(&msk->snd_una)) { + mptcp_stop_timer(sk); + + WRITE_ONCE(msk->snd_data_fin_enable, 0); + + switch (sk->sk_state) { + case TCP_FIN_WAIT1: + inet_sk_state_store(sk, TCP_FIN_WAIT2); + sk->sk_state_change(sk); + break; + case TCP_CLOSING: + fallthrough; + case TCP_LAST_ACK: + inet_sk_state_store(sk, TCP_CLOSE); + sk->sk_state_change(sk); + break; + } + + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + } +} + static bool mptcp_pending_data_fin(struct sock *sk, u64 
*seq) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -222,6 +266,8 @@ static void mptcp_check_data_fin(struct sock *sk) WRITE_ONCE(msk->rcv_data_fin, 0); sk->sk_shutdown |= RCV_SHUTDOWN; + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); switch (sk->sk_state) { case TCP_ESTABLISHED: @@ -455,14 +501,6 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) } } -static void mptcp_stop_timer(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); - mptcp_sk(sk)->timer_ival = 0; -} - static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) { const struct sock *sk = (const struct sock *)msk; -- cgit v1.2.3 From 43b54c6ee382f026fc93babf5301ec79e1c9614a Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 28 Jul 2020 15:12:06 -0700 Subject: mptcp: Use full MPTCP-level disconnect state machine RFC 8684 appendix D describes the connection state machine for MPTCP. This patch implements the DATA_FIN / DATA_ACK exchanges and MPTCP-level socket state changes described in that appendix, rather than simply sending DATA_FIN along with TCP FIN when disconnecting subflows. DATA_FIN is now sent and acknowledged before shutting down the subflows. Received DATA_FIN information (if not part of a data packet) is written to the MPTCP socket when the incoming DSS option is parsed by the subflow, and the MPTCP worker is scheduled to process the flag. DATA_FIN received as part of a full DSS mapping will be handled when the mapping is processed. The DATA_FIN is acknowledged by the worker if the reader is caught up. If there is still data to be moved to the MPTCP-level queue, ack_seq will be incremented to account for the DATA_FIN when it reaches the end of the stream and a DATA_ACK will be sent to the peer. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 11 +++++++ net/mptcp/protocol.c | 87 +++++++++++++++++++++++++++++++++++++++++++--------- net/mptcp/subflow.c | 11 +++++-- 3 files changed, 92 insertions(+), 17 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 38583d1b9b5f..b4458ecd01f8 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -868,6 +868,17 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, if (mp_opt.use_ack) update_una(msk, &mp_opt); + /* Zero-length packets, like bare ACKs carrying a DATA_FIN, are + * dropped by the caller and not propagated to the MPTCP layer. + * Copy the DATA_FIN information now. + */ + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { + if (mp_opt.data_fin && mp_opt.data_len == 1 && + mptcp_update_rcv_data_fin(msk, mp_opt.data_seq) && + schedule_work(&msk->work)) + sock_hold(subflow->conn); + } + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) return; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b3350830e14d..f264ea15e081 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -381,6 +381,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, *bytes = moved; + /* If the moves have caught up with the DATA_FIN sequence number + * it's time to ack the DATA_FIN and change socket state, but + * this is not a good place to change state. Let the workqueue + * do it. 
+ */ + if (mptcp_pending_data_fin(sk, NULL) && + schedule_work(&msk->work)) + sock_hold(sk); + return done; } @@ -466,7 +475,8 @@ void mptcp_data_acked(struct sock *sk) { mptcp_reset_timer(sk); - if (!sk_stream_is_writeable(sk) && + if ((!sk_stream_is_writeable(sk) || + (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && schedule_work(&mptcp_sk(sk)->work)) sock_hold(sk); } @@ -1384,6 +1394,7 @@ static void mptcp_worker(struct work_struct *work) lock_sock(sk); mptcp_clean_una(sk); + mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); __mptcp_move_skbs(msk); @@ -1393,6 +1404,8 @@ static void mptcp_worker(struct work_struct *work) if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); + mptcp_check_data_fin(sk); + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) goto unlock; @@ -1515,7 +1528,7 @@ static void mptcp_cancel_work(struct sock *sk) sock_put(sk); } -static void mptcp_subflow_shutdown(struct sock *ssk, int how) +static void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) { lock_sock(ssk); @@ -1528,8 +1541,15 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how) tcp_disconnect(ssk, O_NONBLOCK); break; default: - ssk->sk_shutdown |= how; - tcp_shutdown(ssk, how); + if (__mptcp_check_fallback(mptcp_sk(sk))) { + pr_debug("Fallback"); + ssk->sk_shutdown |= how; + tcp_shutdown(ssk, how); + } else { + pr_debug("Sending DATA_FIN on subflow %p", ssk); + mptcp_set_timeout(sk, ssk); + tcp_send_ack(ssk); + } break; } @@ -1570,9 +1590,35 @@ static void mptcp_close(struct sock *sk, long timeout) LIST_HEAD(conn_list); lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + if (sk->sk_state == TCP_LISTEN) { + inet_sk_state_store(sk, TCP_CLOSE); + goto cleanup; + } else if (sk->sk_state == TCP_CLOSE) { + goto cleanup; + } + + if (__mptcp_check_fallback(msk)) { + goto update_state; + } else if (mptcp_close_state(sk)) { + pr_debug("Sending DATA_FIN sk=%p", sk); + WRITE_ONCE(msk->write_seq, msk->write_seq + 1); + WRITE_ONCE(msk->snd_data_fin_enable, 1); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK); + } + } + sk_stream_wait_close(sk, timeout); + +update_state: inet_sk_state_store(sk, TCP_CLOSE); +cleanup: /* be sure to always acquire the join list lock, to sync vs * mptcp_finish_join(). */ @@ -1581,8 +1627,6 @@ static void mptcp_close(struct sock *sk, long timeout) spin_unlock_bh(&msk->join_list_lock); list_splice_init(&msk->conn_list, &conn_list); - msk->snd_data_fin_enable = 1; - __mptcp_clear_xmit(sk); release_sock(sk); @@ -2265,11 +2309,8 @@ static int mptcp_shutdown(struct socket *sock, int how) pr_debug("sk=%p, how=%d", msk, how); lock_sock(sock->sk); - if (how == SHUT_WR || how == SHUT_RDWR) - inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); how++; - if ((how & ~SHUTDOWN_MASK) || !how) { ret = -EINVAL; goto out_unlock; @@ -2283,13 +2324,31 @@ static int mptcp_shutdown(struct socket *sock, int how) sock->state = SS_CONNECTED; } - __mptcp_flush_join_list(msk); - msk->snd_data_fin_enable = 1; + /* If we've already sent a FIN, or it's a closed state, skip this. 
*/ + if (__mptcp_check_fallback(msk)) { + if (how == SHUT_WR || how == SHUT_RDWR) + inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); - mptcp_subflow_shutdown(tcp_sk, how); + mptcp_subflow_shutdown(sock->sk, tcp_sk, how); + } + } else if ((how & SEND_SHUTDOWN) && + ((1 << sock->sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | + TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) && + mptcp_close_state(sock->sk)) { + __mptcp_flush_join_list(msk); + + WRITE_ONCE(msk->write_seq, msk->write_seq + 1); + WRITE_ONCE(msk->snd_data_fin_enable, 1); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + mptcp_subflow_shutdown(sock->sk, tcp_sk, how); + } } /* Wake up anyone sleeping in poll. */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e645483d1200..7ab2a52ad150 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -598,7 +598,8 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) return true; } -static enum mapping_status get_mapping_status(struct sock *ssk) +static enum mapping_status get_mapping_status(struct sock *ssk, + struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_ext *mpext; @@ -648,7 +649,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk) if (mpext->data_fin == 1) { if (data_len == 1) { - pr_debug("DATA_FIN with no payload"); + mptcp_update_rcv_data_fin(msk, mpext->data_seq); + pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq); if (subflow->map_valid) { /* A DATA_FIN might arrive in a DSS * option before the previous mapping @@ -660,6 +662,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk) } else { return MAPPING_DATA_FIN; } + } else { + mptcp_update_rcv_data_fin(msk, mpext->data_seq + data_len); + pr_debug("DATA_FIN with mapping seq=%llu", mpext->data_seq + data_len); } /* Adjust for DATA_FIN using 1 byte of sequence space */ @@ -748,7 +753,7 @@ static bool subflow_check_data_avail(struct sock *ssk) u64 ack_seq; u64 old_ack; - status = get_mapping_status(ssk); + status = get_mapping_status(ssk, msk); pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status); if (status == MAPPING_INVALID) { ssk->sk_err = EBADMSG; -- cgit v1.2.3 From 067a0b3dc52f0f79b9fe64ff8d9bcbb0ffbcf8fc Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 28 Jul 2020 15:12:07 -0700 Subject: mptcp: Only use subflow EOF signaling on fallback connections The MPTCP state machine handles disconnections on non-fallback connections, but the mptcp_sock still needs to get notified when fallback subflows disconnect. Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/mptcp/subflow.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 7ab2a52ad150..1c8482bc2ce5 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1159,7 +1159,8 @@ static void subflow_state_change(struct sock *sk) if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); - if (!(parent->sk_shutdown & RCV_SHUTDOWN) && + if (__mptcp_check_fallback(mptcp_sk(parent)) && + !(parent->sk_shutdown & RCV_SHUTDOWN) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; mptcp_subflow_eof(parent); -- cgit v1.2.3 From 06827b348b1d43850a63c3e490fe9712c124fa0c Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 28 Jul 2020 15:12:08 -0700 Subject: mptcp: Skip unnecessary skb extension allocation for bare acks Bare TCP ack skbs are freed right after MPTCP sees them, so the work to allocate, zero, and populate the MPTCP skb extension is wasted. Detect these skbs and do not add skb extensions to them. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index b4458ecd01f8..7fa822b55c34 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -868,15 +868,18 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, if (mp_opt.use_ack) update_una(msk, &mp_opt); - /* Zero-length packets, like bare ACKs carrying a DATA_FIN, are - * dropped by the caller and not propagated to the MPTCP layer. - * Copy the DATA_FIN information now. + /* Zero-data-length packets are dropped by the caller and not + * propagated to the MPTCP layer, so the skb extension does not + * need to be allocated or populated. DATA_FIN information, if + * present, needs to be updated here before the skb is freed. */ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { if (mp_opt.data_fin && mp_opt.data_len == 1 && mptcp_update_rcv_data_fin(msk, mp_opt.data_seq) && schedule_work(&msk->work)) sock_hold(subflow->conn); + + return; } mpext = skb_ext_add(skb, SKB_EXT_MPTCP); -- cgit v1.2.3 From c75293925f24630326abdf79751d980ec3878f65 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 28 Jul 2020 15:12:09 -0700 Subject: mptcp: Safely read sequence number when lock isn't held The MPTCP socket's write_seq member should be read with READ_ONCE() when the msk lock is not held. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f264ea15e081..f2455a68d231 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1269,7 +1269,7 @@ static void mptcp_retransmit_handler(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (atomic64_read(&msk->snd_una) == msk->write_seq) { + if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->write_seq)) { mptcp_stop_timer(sk); } else { set_bit(MPTCP_WORK_RTX, &msk->flags); -- cgit v1.2.3 From 721e9089905ab7aebd5364b86b5f068f632a0e49 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 28 Jul 2020 15:12:10 -0700 Subject: mptcp: Safely store sequence number when sending data The MPTCP socket's write_seq member can be read without the msk lock held, so use WRITE_ONCE() to store it. Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f2455a68d231..687f0bea2b35 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -793,7 +793,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, out: if (!retransmission) pfrag->offset += frag_truesize; - *write_seq += ret; + WRITE_ONCE(*write_seq, *write_seq + ret); mptcp_subflow_ctx(ssk)->rel_write_seq += ret; return ret; -- cgit v1.2.3 From 535fb8152f313dd5d30ef84ce55b01ad9cbae3cf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 30 Jul 2020 21:25:51 +0200 Subject: mptcp: token: move retry to caller Once syncookie support is added, no state will be stored anymore when the syn/ack is generated in syncookie mode. When the ACK comes back, the generated key will be taken from the TCP ACK, the token is re-generated and inserted into the token tree. This means we can't retry with a new key when the token is already taken in the syncookie case. Therefore, move the retry logic to the caller to prepare for syncookie support in mptcp. Signed-off-by: Florian Westphal Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 9 ++++++++- net/mptcp/token.c | 12 ++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 1c8482bc2ce5..9feb87880d1c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -126,11 +126,18 @@ static void subflow_init_req(struct request_sock *req, } if (mp_opt.mp_capable && listener->request_mptcp) { - int err; + int err, retries = 4; + +again: + do { + get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key)); + } while (subflow_req->local_key == 0); err = mptcp_token_new_request(req); if (err == 0) subflow_req->mp_capable = 1; + else if (retries-- > 0) + goto again; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; } else if (mp_opt.mp_join && listener->request_mptcp) { diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 97cfc45bcc4f..f82410c54653 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -109,14 +109,12 @@ static void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) int mptcp_token_new_request(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); - int retries = TOKEN_MAX_RETRIES; struct token_bucket *bucket; u32 token; -again: - mptcp_crypto_key_gen_sha(&subflow_req->local_key, - &subflow_req->token, - &subflow_req->idsn); + mptcp_crypto_key_sha(subflow_req->local_key, + &subflow_req->token, + &subflow_req->idsn); pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", req, subflow_req->local_key, subflow_req->token, subflow_req->idsn); @@ -126,9 +124,7 @@ again: spin_lock_bh(&bucket->lock); if (__token_bucket_busy(bucket, token)) { spin_unlock_bh(&bucket->lock); - if (!--retries) - return -EBUSY; - goto again; + return -EBUSY; } hlist_nulls_add_head_rcu(&subflow_req->token_node, &bucket->req_chain); -- cgit v1.2.3 From 78d8b7bc4b32e2d32ac19d3b217166224c4342d0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 30 Jul 2020 21:25:52 +0200 Subject: mptcp: subflow: split subflow_init_req When syncookie support is added, we will need to add a variant of subflow_init_req() helper. It will do almost same thing except that it will not compute/add a token to the mptcp token tree. 
To avoid excess copy&paste, this commit splits away part of the code into a new helper, __subflow_init_req, that can then be re-used from the 'no insert' function added in a followup change. Signed-off-by: Florian Westphal Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9feb87880d1c..091e305a81c8 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -91,17 +91,9 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req, return msk; } -static void subflow_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener) { - struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); - struct mptcp_options_received mp_opt; - - pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); - - mptcp_get_options(skb, &mp_opt); subflow_req->mp_capable = 0; subflow_req->mp_join = 0; @@ -113,9 +105,29 @@ static void subflow_init_req(struct request_sock *req, * TCP option space. */ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) - return; + return -EINVAL; #endif + return 0; +} + +static void subflow_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_options_received mp_opt; + int ret; + + pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); + + ret = __subflow_init_req(req, sk_listener); + if (ret) + return; + + mptcp_get_options(skb, &mp_opt); + if (mp_opt.mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); -- cgit v1.2.3 From 08b8d080982fec354173d3fd28a3106a719b8950 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 30 Jul 2020 21:25:53 +0200 Subject: mptcp: rename and export mptcp_subflow_request_sock_ops syncookie code path needs to create an mptcp request sock. Prepare for this and add mptcp prefix plus needed export of ops struct. Signed-off-by: Florian Westphal Reviewed-by: Mat Martineau Signed-off-by: David S. 
Miller --- include/net/mptcp.h | 1 + net/mptcp/subflow.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net/mptcp') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 02158c257bd4..76eb915bf91c 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -58,6 +58,7 @@ struct mptcp_out_options { }; #ifdef CONFIG_MPTCP +extern struct request_sock_ops mptcp_subflow_request_sock_ops; void mptcp_init(void); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 091e305a81c8..9b11d2b6ff4d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -284,7 +284,8 @@ do_reset: tcp_done(sk); } -static struct request_sock_ops subflow_request_sock_ops; +struct request_sock_ops mptcp_subflow_request_sock_ops; +EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops); static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -297,7 +298,7 @@ static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; - return tcp_conn_request(&subflow_request_sock_ops, + return tcp_conn_request(&mptcp_subflow_request_sock_ops, &subflow_request_sock_ipv4_ops, sk, skb); drop: @@ -322,7 +323,7 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (!ipv6_unicast_destination(skb)) goto drop; - return tcp_conn_request(&subflow_request_sock_ops, + return tcp_conn_request(&mptcp_subflow_request_sock_ops, &subflow_request_sock_ipv6_ops, sk, skb); drop: @@ -1311,8 +1312,8 @@ static int subflow_ops_init(struct request_sock_ops *subflow_ops) void __init mptcp_subflow_init(void) { - subflow_request_sock_ops = tcp_request_sock_ops; - if (subflow_ops_init(&subflow_request_sock_ops) != 0) + mptcp_subflow_request_sock_ops = tcp_request_sock_ops; + if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0) panic("MPTCP: failed to init subflow request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; -- cgit v1.2.3 From c83a47e50d8fd3825a4758158e9edd5acdc74185 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 30 Jul 2020 21:25:54 +0200 Subject: mptcp: subflow: add mptcp_subflow_init_cookie_req helper Will be used to initialize the mptcp request socket when a MP_CAPABLE request was handled in syncookie mode, i.e. when a TCP ACK containing a MP_CAPABLE option is a valid syncookie value. Normally (non-cookie case), MPTCP will generate a unique 32 bit connection ID and stores it in the MPTCP token storage to be able to retrieve the mptcp socket for subflow joining. In syncookie case, we do not want to store any state, so just generate the unique ID and use it in the reply. This means there is a small window where another connection could generate the same token. When Cookie ACK comes back, we check that the token has not been registered in the mean time. If it was, the connection needs to fall back to TCP. Changes in v2: - use req->syncookie instead of passing 'want_cookie' arg to ->init_req() (Eric Dumazet) Signed-off-by: Florian Westphal Reviewed-by: Mat Martineau Signed-off-by: David S. 
Miller --- include/net/mptcp.h | 10 ++++++++++ net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- net/mptcp/token.c | 26 ++++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 76eb915bf91c..3525d2822abe 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -131,6 +131,9 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, } void mptcp_seq_show(struct seq_file *seq); +int mptcp_subflow_init_cookie_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb); #else static inline void mptcp_init(void) @@ -200,6 +203,13 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { } static inline void mptcp_seq_show(struct seq_file *seq) { } + +static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + return 0; /* TCP fallback */ +} #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index beb34b8a5363..d76d3b40d69e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -400,6 +400,7 @@ void mptcp_token_destroy_request(struct request_sock *req); int mptcp_token_new_connect(struct sock *sk); void mptcp_token_accept(struct mptcp_subflow_request_sock *r, struct mptcp_sock *msk); +bool mptcp_token_exists(u32 token); struct mptcp_sock *mptcp_token_get_sock(u32 token); struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9b11d2b6ff4d..3d346572d4c9 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -140,18 +140,31 @@ static void subflow_init_req(struct request_sock *req, if (mp_opt.mp_capable && listener->request_mptcp) { int err, retries = 4; + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; again: do { get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key)); } while (subflow_req->local_key == 0); + if (unlikely(req->syncookie)) { + mptcp_crypto_key_sha(subflow_req->local_key, + &subflow_req->token, + &subflow_req->idsn); + if (mptcp_token_exists(subflow_req->token)) { + if (retries-- > 0) + goto again; + } else { + subflow_req->mp_capable = 1; + } + return; + } + err = mptcp_token_new_request(req); if (err == 0) subflow_req->mp_capable = 1; else if (retries-- > 0) goto again; - subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; } else if (mp_opt.mp_join && listener->request_mptcp) { subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; subflow_req->mp_join = 1; @@ -165,6 +178,41 @@ again: } } +int mptcp_subflow_init_cookie_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_options_received mp_opt; + int err; + + err = __subflow_init_req(req, sk_listener); + if (err) + return err; + + mptcp_get_options(skb, &mp_opt); + + if (mp_opt.mp_capable && mp_opt.mp_join) + return -EINVAL; + + if (mp_opt.mp_capable && listener->request_mptcp) { + if (mp_opt.sndr_key == 0) + return -EINVAL; + + subflow_req->local_key = mp_opt.rcvr_key; + err = mptcp_token_new_request(req); + if (err) + return err; + + subflow_req->mp_capable = 1; + subflow_req->ssn_offset 
= TCP_SKB_CB(skb)->seq - 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); + static void subflow_v4_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) diff --git a/net/mptcp/token.c b/net/mptcp/token.c index f82410c54653..8b47c4bb1c6b 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -204,6 +204,32 @@ void mptcp_token_accept(struct mptcp_subflow_request_sock *req, spin_unlock_bh(&bucket->lock); } +bool mptcp_token_exists(u32 token) +{ + struct hlist_nulls_node *pos; + struct token_bucket *bucket; + struct mptcp_sock *msk; + struct sock *sk; + + rcu_read_lock(); + bucket = token_bucket(token); + +again: + sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { + msk = mptcp_sk(sk); + if (READ_ONCE(msk->token) == token) + goto found; + } + if (get_nulls_value(pos) != (token & token_mask)) + goto again; + + rcu_read_unlock(); + return false; +found: + rcu_read_unlock(); + return true; +} + /** * mptcp_token_get_sock - retrieve mptcp connection sock using its token * @token: token of the mptcp connection to retrieve -- cgit v1.2.3 From 9466a1ccebbe54ac57fb8a89c2b4b854826546a8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 30 Jul 2020 21:25:56 +0200 Subject: mptcp: enable JOIN requests even if cookies are in use JOIN requests do not work in syncookie mode -- for HMAC validation, the peers nonce and the mptcp token (to obtain the desired connection socket the join is for) are required, but this information is only present in the initial syn. So either we need to drop all JOIN requests once a listening socket enters syncookie mode, or we need to store enough state to reconstruct the request socket later. This adds a state table (1024 entries) to store the data present in the MP_JOIN syn request and the random nonce used for the cookie syn/ack. When a MP_JOIN ACK passed cookie validation, the table is consulted to rebuild the request socket from it. An alternate approach would be to "cancel" syn-cookie mode and force MP_JOIN to always use a syn queue entry. However, doing so brings the backlog over the configured queue limit. v2: use req->syncookie, not (removed) want_cookie arg Suggested-by: Paolo Abeni Signed-off-by: Florian Westphal Reviewed-by: Mat Martineau Signed-off-by: David S. 
Miller --- net/ipv4/syncookies.c | 6 +++ net/mptcp/Makefile | 1 + net/mptcp/ctrl.c | 1 + net/mptcp/protocol.h | 20 ++++++++ net/mptcp/subflow.c | 14 ++++++ net/mptcp/syncookies.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 174 insertions(+) create mode 100644 net/mptcp/syncookies.c (limited to 'net/mptcp') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 54838ee2e8d4..11b20474be83 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -212,6 +212,12 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, refcount_set(&req->rsk_refcnt, 1); tcp_sk(child)->tsoffset = tsoff; sock_rps_save_rxhash(child, skb); + + if (tcp_rsk(req)->drop_req) { + refcount_set(&req->rsk_refcnt, 2); + return child; + } + if (inet_csk_reqsk_queue_add(sk, req, child)) return child; diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 2360cbd27d59..a611968be4d7 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ mib.o pm_netlink.o +obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o mptcp_crypto_test-objs := crypto_test.o diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 8e39585d37f3..54b888f94009 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -112,6 +112,7 @@ static struct pernet_operations mptcp_pernet_ops = { void __init mptcp_init(void) { + mptcp_join_cookie_init(); mptcp_proto_init(); if (register_pernet_subsys(&mptcp_pernet_ops) < 0) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d76d3b40d69e..60b27d44c184 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -506,4 +506,24 @@ static inline bool subflow_simultaneous_connect(struct sock *sk) !subflow->conn_finished; } +#ifdef CONFIG_SYN_COOKIES +void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb); +bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb); +void __init mptcp_join_cookie_init(void); +#else +static inline void +subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb) {} +static inline bool +mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb) +{ + return false; +} + +static inline void mptcp_join_cookie_init(void) {} +#endif + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3d346572d4c9..a4cc4591bd4e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -173,6 +173,12 @@ again: subflow_req->token = mp_opt.token; subflow_req->remote_nonce = mp_opt.nonce; subflow_req->msk = subflow_token_join_request(req, skb); + + if (unlikely(req->syncookie) && subflow_req->msk) { + if (mptcp_can_accept_new_subflow(subflow_req->msk)) + subflow_init_req_cookie_join_save(subflow_req, skb); + } + pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } @@ -207,6 +213,14 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, subflow_req->mp_capable = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; + } else if (mp_opt.mp_join && listener->request_mptcp) { + if (!mptcp_token_join_cookie_init_state(subflow_req, skb)) + return -EINVAL; + + if (mptcp_can_accept_new_subflow(subflow_req->msk)) + subflow_req->mp_join = 1; + + subflow_req->ssn_offset = 
TCP_SKB_CB(skb)->seq - 1; } return 0; diff --git a/net/mptcp/syncookies.c b/net/mptcp/syncookies.c new file mode 100644 index 000000000000..6eb992789b50 --- /dev/null +++ b/net/mptcp/syncookies.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "protocol.h" + +/* Syncookies do not work for JOIN requests. + * + * Unlike MP_CAPABLE, where the ACK cookie contains the needed MPTCP + * options to reconstruct the initial syn state, MP_JOIN does not contain + * the token to obtain the mptcp socket nor the server-generated nonce + * that was used in the cookie SYN/ACK response. + * + * Keep a small best effort state table to store the syn/synack data, + * indexed by skb hash. + * + * A MP_JOIN SYN packet handled by syn cookies is only stored if the 32bit + * token matches a known mptcp connection that can still accept more subflows. + * + * There is no timeout handling -- state is only re-constructed + * when the TCP ACK passed the cookie validation check. + */ + +struct join_entry { + u32 token; + u32 remote_nonce; + u32 local_nonce; + u8 join_id; + u8 local_id; + u8 backup; + u8 valid; +}; + +#define COOKIE_JOIN_SLOTS 1024 + +static struct join_entry join_entries[COOKIE_JOIN_SLOTS] __cacheline_aligned_in_smp; +static spinlock_t join_entry_locks[COOKIE_JOIN_SLOTS] __cacheline_aligned_in_smp; + +static u32 mptcp_join_entry_hash(struct sk_buff *skb, struct net *net) +{ + u32 i = skb_get_hash(skb) ^ net_hash_mix(net); + + return i % ARRAY_SIZE(join_entries); +} + +static void mptcp_join_store_state(struct join_entry *entry, + const struct mptcp_subflow_request_sock *subflow_req) +{ + entry->token = subflow_req->token; + entry->remote_nonce = subflow_req->remote_nonce; + entry->local_nonce = subflow_req->local_nonce; + entry->backup = subflow_req->backup; + entry->join_id = subflow_req->remote_id; + entry->local_id = subflow_req->local_id; + entry->valid = 1; +} + +void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb) +{ + struct net *net = read_pnet(&subflow_req->sk.req.ireq_net); + u32 i = mptcp_join_entry_hash(skb, net); + + /* No use in waiting if other cpu is already using this slot -- + * would overwrite the data that got stored. + */ + spin_lock_bh(&join_entry_locks[i]); + mptcp_join_store_state(&join_entries[i], subflow_req); + spin_unlock_bh(&join_entry_locks[i]); +} + +/* Called for a cookie-ack with MP_JOIN option present. + * Look up the saved state based on skb hash & check token matches msk + * in same netns. + * + * Caller will check msk can still accept another subflow. The hmac + * present in the cookie ACK mptcp option space will be checked later. + */ +bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb) +{ + struct net *net = read_pnet(&subflow_req->sk.req.ireq_net); + u32 i = mptcp_join_entry_hash(skb, net); + struct mptcp_sock *msk; + struct join_entry *e; + + e = &join_entries[i]; + + spin_lock_bh(&join_entry_locks[i]); + + if (e->valid == 0) { + spin_unlock_bh(&join_entry_locks[i]); + return false; + } + + e->valid = 0; + + msk = mptcp_token_get_sock(e->token); + if (!msk) { + spin_unlock_bh(&join_entry_locks[i]); + return false; + } + + /* If this fails, the token got re-used in the mean time by another + * mptcp socket in a different netns, i.e. entry is outdated. 
+ */ + if (!net_eq(sock_net((struct sock *)msk), net)) + goto err_put; + + subflow_req->remote_nonce = e->remote_nonce; + subflow_req->local_nonce = e->local_nonce; + subflow_req->backup = e->backup; + subflow_req->remote_id = e->join_id; + subflow_req->token = e->token; + subflow_req->msk = msk; + spin_unlock_bh(&join_entry_locks[i]); + return true; + +err_put: + spin_unlock_bh(&join_entry_locks[i]); + sock_put((struct sock *)msk); + return false; +} + +void __init mptcp_join_cookie_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(join_entry_locks); i++) + spin_lock_init(&join_entry_locks[i]); + + BUILD_BUG_ON(ARRAY_SIZE(join_entry_locks) != ARRAY_SIZE(join_entries)); +} -- cgit v1.2.3 From 7126bd5c8bcbc015cf89864cf71d750e8f33f924 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 1 Aug 2020 16:39:59 +0200 Subject: mptcp: fix syncookie build error on UP kernel test robot says: net/mptcp/syncookies.c: In function 'mptcp_join_cookie_init': include/linux/kernel.h:47:38: warning: division by zero [-Wdiv-by-zero] #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) I forgot that spinock_t size is 0 on UP, so ARRAY_SIZE cannot be used. Fixes: 9466a1ccebbe54 ("mptcp: enable JOIN requests even if cookies are in use") Reported-by: kernel test robot Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/syncookies.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/syncookies.c b/net/mptcp/syncookies.c index 6eb992789b50..abe0fd099746 100644 --- a/net/mptcp/syncookies.c +++ b/net/mptcp/syncookies.c @@ -125,8 +125,6 @@ void __init mptcp_join_cookie_init(void) { int i; - for (i = 0; i < ARRAY_SIZE(join_entry_locks); i++) + for (i = 0; i < COOKIE_JOIN_SLOTS; i++) spin_lock_init(&join_entry_locks[i]); - - BUILD_BUG_ON(ARRAY_SIZE(join_entry_locks) != ARRAY_SIZE(join_entries)); } -- cgit v1.2.3 From 190f8b060ee38fcea885e08b2fe0e3fdd428a618 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 3 Aug 2020 21:00:44 +0800 Subject: mptcp: use mptcp_for_each_subflow in mptcp_stream_accept Use mptcp_for_each_subflow in mptcp_stream_accept instead of open-coding. Signed-off-by: Geliang Tang Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d3fe7296e1c9..400824eabf73 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2249,7 +2249,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, * This is needed so NOSPACE flag can be set from tcp stack. */ __mptcp_flush_join_list(msk); - list_for_each_entry(subflow, &msk->conn_list, node) { + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!ssk->sk_socket) -- cgit v1.2.3 From 8555c6bfd5fddb1cf363d3cd157d70a1bb27f718 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 3 Aug 2020 18:40:39 +0200 Subject: mptcp: fix bogus sendmsg() return code under pressure In case of memory pressure, mptcp_sendmsg() may call sk_stream_wait_memory() after succesfully xmitting some bytes. If the latter fails we currently return to the user-space the error code, ignoring the succeful xmit. Address the issue always checking for the xmitted bytes before mptcp_sendmsg() completes. Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 400824eabf73..8c1d1a595701 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -984,7 +984,6 @@ wait_for_sndbuf: mptcp_set_timeout(sk, ssk); if (copied) { - ret = copied; tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal); @@ -997,7 +996,7 @@ wait_for_sndbuf: release_sock(ssk); out: release_sock(sk); - return ret; + return copied ? : ret; } static void mptcp_wait_data(struct sock *sk, long *timeo) -- cgit v1.2.3 From adf7341064982de923a1f8a11bcdec48be6b3004 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 4 Aug 2020 18:31:06 +0200 Subject: mptcp: be careful on subflow creation Nicolas reported the following oops: [ 1521.392541] BUG: kernel NULL pointer dereference, address: 00000000000000c0 [ 1521.394189] #PF: supervisor read access in kernel mode [ 1521.395376] #PF: error_code(0x0000) - not-present page [ 1521.396607] PGD 0 P4D 0 [ 1521.397156] Oops: 0000 [#1] SMP PTI [ 1521.398020] CPU: 0 PID: 22986 Comm: kworker/0:2 Not tainted 5.8.0-rc4+ #109 [ 1521.399618] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 1521.401728] Workqueue: events mptcp_worker [ 1521.402651] RIP: 0010:mptcp_subflow_create_socket+0xf1/0x1c0 [ 1521.403954] Code: 24 08 89 44 24 04 48 8b 7a 18 e8 2a 48 d4 ff 8b 44 24 04 85 c0 75 7a 48 8b 8b 78 02 00 00 48 8b 54 24 08 48 8d bb 80 00 00 00 <48> 8b 89 c0 00 00 00 48 89 8a c0 00 00 00 48 8b 8b 78 02 00 00 8b [ 1521.408201] RSP: 0000:ffffabc4002d3c60 EFLAGS: 00010246 [ 1521.409433] RAX: 0000000000000000 RBX: ffffa0b9ad8c9a00 RCX: 0000000000000000 [ 1521.411096] RDX: ffffa0b9ae78a300 RSI: 00000000fffffe01 RDI: ffffa0b9ad8c9a80 [ 1521.412734] RBP: ffffa0b9adff2e80 R08: ffffa0b9af02d640 R09: ffffa0b9ad923a00 [ 1521.414333] R10: ffffabc4007139f8 R11: fefefefefefefeff R12: ffffabc4002d3cb0 [ 1521.415918] R13: ffffa0b9ad91fa58 R14: ffffa0b9ad8c9f9c R15: 0000000000000000 [ 1521.417592] FS: 0000000000000000(0000) GS:ffffa0b9af000000(0000) knlGS:0000000000000000 [ 1521.419490] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1521.420839] CR2: 00000000000000c0 CR3: 000000002951e006 CR4: 0000000000160ef0 [ 1521.422511] Call Trace: [ 1521.423103] __mptcp_subflow_connect+0x94/0x1f0 [ 1521.425376] mptcp_pm_create_subflow_or_signal_addr+0x200/0x2a0 [ 1521.426736] mptcp_worker+0x31b/0x390 [ 1521.431324] process_one_work+0x1fc/0x3f0 [ 1521.432268] worker_thread+0x2d/0x3b0 [ 1521.434197] kthread+0x117/0x130 [ 1521.435783] ret_from_fork+0x22/0x30 on some unconventional configuration. The MPTCP protocol is trying to create a subflow for an unaccepted server socket. That is allowed by the RFC, even if subflow creation will likely fail. Unaccepted sockets have still a NULL sk_socket field, avoid the issue by failing earlier. Reported-and-tested-by: Nicolas Rybowski Fixes: 7d14b0d2b9b3 ("mptcp: set correct vfs info for subflows") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: David S. 
Miller --- net/mptcp/subflow.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a4cc4591bd4e..96f4f2fe50ad 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1137,6 +1137,12 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) struct socket *sf; int err; + /* un-accepted server sockets can reach here - on bad configuration + * bail early to avoid greater trouble later + */ + if (unlikely(!sk->sk_socket)) + return -EINVAL; + err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP, &sf); if (err) -- cgit v1.2.3
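
The "mptcp: Add mptcp_close_state() helper" patch above reuses TCP's table-driven close idiom: the current socket state indexes a lookup table whose entry packs the next state together with an action bit saying whether a (DATA_)FIN must be emitted for that transition. Below is a minimal user-space sketch of that pattern only; the state names, mask values and the main() driver are simplified assumptions for illustration, not the kernel's TCP_STATE_MASK/TCP_ACTION_FIN definitions.

```c
/* Sketch of a table-driven close transition: map the current state to
 * (next state | action bit). All names and values here are illustrative.
 */
#include <stdio.h>

enum state {
	ST_CLOSE,	/* 0: also the fallback for unlisted states */
	ST_ESTABLISHED,
	ST_FIN_WAIT1,
	ST_FIN_WAIT2,
	ST_CLOSE_WAIT,
	ST_LAST_ACK,
	ST_CLOSING,
	ST_MAX
};

#define STATE_MASK	0x0f	/* low nibble: next state */
#define ACTION_FIN	0x10	/* flag: a (DATA_)FIN must be sent */

static const unsigned char next_state[ST_MAX] = {
	[ST_ESTABLISHED] = ST_FIN_WAIT1 | ACTION_FIN,
	[ST_CLOSE_WAIT]  = ST_LAST_ACK  | ACTION_FIN,
	[ST_FIN_WAIT1]   = ST_FIN_WAIT1,
	[ST_FIN_WAIT2]   = ST_FIN_WAIT2,
	[ST_LAST_ACK]    = ST_LAST_ACK,
	[ST_CLOSING]     = ST_CLOSING,
	/* everything else (SYN states, LISTEN, ...) falls back to ST_CLOSE */
};

/* Move *cur to its close state; return non-zero if a FIN is required. */
static int close_state(enum state *cur)
{
	unsigned char next = next_state[*cur];

	*cur = (enum state)(next & STATE_MASK);
	return next & ACTION_FIN;
}

int main(void)
{
	enum state s = ST_ESTABLISHED;
	int send_fin = close_state(&s);

	printf("next state=%d send_fin=%d\n", (int)s, !!send_fin);
	return 0;
}
```

In the patches above, the same kind of return value is what lets mptcp_close() and mptcp_shutdown() decide whether to advance write_seq, set snd_data_fin_enable and push a DATA_FIN on each subflow.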
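
The "mptcp: enable JOIN requests even if cookies are in use" patch above keeps MP_JOIN SYN data in a fixed-size, hash-indexed table with one lock per slot, no timeout handling, and deliberately best-effort semantics: a newer request may overwrite an older slot. The following is a minimal user-space sketch of that slot-table pattern; the entry layout, slot count, hash and pthread locking are illustrative assumptions, not the kernel code.

```c
/* Best-effort, hash-indexed slot cache: writers may silently overwrite a
 * slot, readers consume (invalidate) an entry on a successful lookup.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SLOTS 1024

struct join_entry {
	uint32_t token;
	uint32_t remote_nonce;
	uint32_t local_nonce;
	uint8_t  valid;
};

static struct join_entry entries[SLOTS];
static pthread_mutex_t locks[SLOTS];

static uint32_t slot_hash(uint32_t skb_hash)
{
	/* stand-in for the kernel's skb_get_hash() ^ net_hash_mix() */
	return skb_hash % SLOTS;
}

/* Store unconditionally: an older entry in the same slot is overwritten,
 * which is acceptable for a best-effort cache with no timeout handling.
 */
static void join_store(uint32_t skb_hash, const struct join_entry *e)
{
	uint32_t i = slot_hash(skb_hash);

	pthread_mutex_lock(&locks[i]);
	entries[i] = *e;
	entries[i].valid = 1;
	pthread_mutex_unlock(&locks[i]);
}

/* Consume the entry if present and the token matches; return false when
 * the slot is empty or was reused by another request in the meantime.
 */
static bool join_lookup(uint32_t skb_hash, uint32_t token, struct join_entry *out)
{
	uint32_t i = slot_hash(skb_hash);
	bool hit = false;

	pthread_mutex_lock(&locks[i]);
	if (entries[i].valid && entries[i].token == token) {
		*out = entries[i];
		entries[i].valid = 0;
		hit = true;
	}
	pthread_mutex_unlock(&locks[i]);
	return hit;
}

static void join_table_init(void)
{
	for (int i = 0; i < SLOTS; i++)
		pthread_mutex_init(&locks[i], NULL);
}

int main(void)
{
	struct join_entry e = { .token = 0xabcd, .remote_nonce = 1, .local_nonce = 2 };
	struct join_entry found;

	join_table_init();
	join_store(42, &e);
	printf("hit=%d\n", join_lookup(42, 0xabcd, &found));
	return 0;
}
```

On a hit, the saved nonces let the request socket be rebuilt when the cookie ACK arrives; on a miss the MP_JOIN cannot be reconstructed, which is the trade-off the commit message accepts instead of storing full request-socket state.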