Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r-- | net/mptcp/protocol.c | 257
1 file changed, 151 insertions(+), 106 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 06da6ad31c87..a57f3eab7b6a 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -45,11 +45,14 @@ static struct percpu_counter mptcp_sockets_allocated;
 static void __mptcp_destroy_sock(struct sock *sk);
 static void __mptcp_check_send_data_fin(struct sock *sk);
 
+DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
+static struct net_device mptcp_napi_dev;
+
 /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
  * completed yet or has failed, return the subflow socket.
  * Otherwise return NULL.
  */
-static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
+struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
 {
 	if (!msk->subflow || READ_ONCE(msk->can_ack))
 		return NULL;
@@ -114,11 +117,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
 	list_add(&subflow->node, &msk->conn_list);
 	sock_hold(ssock->sk);
 	subflow->request_mptcp = 1;
-
-	/* accept() will wait on first subflow sk_wq, and we always wakes up
-	 * via msk->sk_socket
-	 */
-	RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);
+	mptcp_sock_graft(msk->first, sk->sk_socket);
 
 	return 0;
 }
@@ -743,10 +742,14 @@ wake:
 
 void __mptcp_flush_join_list(struct mptcp_sock *msk)
 {
+	struct mptcp_subflow_context *subflow;
+
 	if (likely(list_empty(&msk->join_list)))
 		return;
 
 	spin_lock_bh(&msk->join_list_lock);
+	list_for_each_entry(subflow, &msk->join_list, node)
+		mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
 	list_splice_tail_init(&msk->join_list, &msk->conn_list);
 	spin_unlock_bh(&msk->join_list_lock);
 }
@@ -1046,13 +1049,6 @@ out:
 			__mptcp_update_wmem(sk);
 			sk_mem_reclaim_partial(sk);
 		}
-
-		if (sk_stream_is_writeable(sk)) {
-			/* pairs with memory barrier in mptcp_poll */
-			smp_mb();
-			if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags))
-				sk_stream_write_space(sk);
-		}
 	}
 
 	if (snd_una == READ_ONCE(msk->snd_nxt)) {
@@ -1371,8 +1367,7 @@ struct subflow_send_info {
 	u64 ratio;
 };
 
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
-					   u32 *sndbuf)
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 {
 	struct subflow_send_info send_info[2];
 	struct mptcp_subflow_context *subflow;
@@ -1383,24 +1378,17 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 
 	sock_owned_by_me((struct sock *)msk);
 
-	*sndbuf = 0;
 	if (__mptcp_check_fallback(msk)) {
 		if (!msk->first)
 			return NULL;
-		*sndbuf = msk->first->sk_sndbuf;
 		return sk_stream_memory_free(msk->first) ? msk->first : NULL;
 	}
 
 	/* re-use last subflow, if the burst allow that */
 	if (msk->last_snd && msk->snd_burst > 0 &&
 	    sk_stream_memory_free(msk->last_snd) &&
-	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
-		mptcp_for_each_subflow(msk, subflow) {
-			ssk = mptcp_subflow_tcp_sock(subflow);
-			*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
-		}
+	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd)))
 		return msk->last_snd;
-	}
 
 	/* pick the subflow with the lower wmem/wspace ratio */
 	for (i = 0; i < 2; ++i) {
@@ -1413,8 +1401,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 			continue;
 
 		nr_active += !subflow->backup;
-		*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
-		if (!sk_stream_memory_free(subflow->tcp_sock))
+		if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
 			continue;
 
 		pace = READ_ONCE(ssk->sk_pacing_rate);
@@ -1440,9 +1427,10 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 	if (send_info[0].ssk) {
 		msk->last_snd = send_info[0].ssk;
 		msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
-				       sk_stream_wspace(msk->last_snd));
+				       tcp_sk(msk->last_snd)->snd_wnd);
 		return msk->last_snd;
 	}
+
 	return NULL;
 }
@@ -1463,7 +1451,6 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags)
 	};
 	struct mptcp_data_frag *dfrag;
 	int len, copied = 0;
-	u32 sndbuf;
 
 	while ((dfrag = mptcp_send_head(sk))) {
 		info.sent = dfrag->already_sent;
@@ -1474,12 +1461,7 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags)
 
 			prev_ssk = ssk;
 			__mptcp_flush_join_list(msk);
-			ssk = mptcp_subflow_get_send(msk, &sndbuf);
-
-			/* do auto tuning */
-			if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
-			    sndbuf > READ_ONCE(sk->sk_sndbuf))
-				WRITE_ONCE(sk->sk_sndbuf, sndbuf);
+			ssk = mptcp_subflow_get_send(msk);
 
 			/* try to keep the subflow socket lock across
 			 * consecutive xmit on the same socket
@@ -1536,7 +1518,9 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_sendmsg_info info;
 	struct mptcp_data_frag *dfrag;
+	struct sock *xmit_ssk;
 	int len, copied = 0;
+	bool first = true;
 
 	info.flags = 0;
 	while ((dfrag = mptcp_send_head(sk))) {
@@ -1546,10 +1530,17 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
 		while (len > 0) {
 			int ret = 0;
 
-			/* do auto tuning */
-			if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
-			    ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf))
-				WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
+			/* the caller already invoked the packet scheduler,
+			 * check for a different subflow usage only after
+			 * spooling the first chunk of data
+			 */
+			xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
+			if (!xmit_ssk)
+				goto out;
+			if (xmit_ssk != ssk) {
+				mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk));
+				goto out;
+			}
 
 			if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) {
 				__mptcp_update_wmem(sk);
@@ -1569,6 +1560,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
 			msk->tx_pending_data -= ret;
 			copied += ret;
 			len -= ret;
+			first = false;
 		}
 		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
 	}
@@ -1591,6 +1583,15 @@ out:
 	}
 }
 
+static void mptcp_set_nospace(struct sock *sk)
+{
+	/* enable autotune */
+	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+	/* will be cleared on avail space */
+	set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
+}
+
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1692,7 +1693,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			continue;
 
 wait_for_memory:
-		set_bit(MPTCP_NOSPACE, &msk->flags);
+		mptcp_set_nospace(sk);
 		mptcp_push_pending(sk, msg->msg_flags);
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret)
@@ -2120,12 +2121,9 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
  * so we need to use tcp_close() after detaching them from the mptcp
  * parent socket.
  */
-void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
-		       struct mptcp_subflow_context *subflow)
+static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+			      struct mptcp_subflow_context *subflow)
 {
-	bool dispose_socket = false;
-	struct socket *sock;
-
 	list_del(&subflow->node);
 
 	lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
@@ -2133,11 +2131,8 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 	/* if we are invoked by the msk cleanup code, the subflow is
 	 * already orphaned
 	 */
-	sock = ssk->sk_socket;
-	if (sock) {
-		dispose_socket = sock != sk->sk_socket;
+	if (ssk->sk_socket)
 		sock_orphan(ssk);
-	}
 
 	subflow->disposable = 1;
 
@@ -2155,59 +2150,40 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		__sock_put(ssk);
 	}
 	release_sock(ssk);
-	if (dispose_socket)
-		iput(SOCK_INODE(sock));
 
 	sock_put(ssk);
 }
 
-static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
+void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+		     struct mptcp_subflow_context *subflow)
 {
-	return 0;
+	if (sk->sk_state == TCP_ESTABLISHED)
+		mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL);
+	__mptcp_close_ssk(sk, ssk, subflow);
 }
 
-static void pm_work(struct mptcp_sock *msk)
+static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
 {
-	struct mptcp_pm_data *pm = &msk->pm;
-
-	spin_lock_bh(&msk->pm.lock);
-
-	pr_debug("msk=%p status=%x", msk, pm->status);
-	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
-		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
-		mptcp_pm_nl_add_addr_received(msk);
-	}
-	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
-		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
-		mptcp_pm_nl_add_addr_send_ack(msk);
-	}
-	if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
-		pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
-		mptcp_pm_nl_rm_addr_received(msk);
-	}
-	if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
-		pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
-		mptcp_pm_nl_fully_established(msk);
-	}
-	if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
-		pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
-		mptcp_pm_nl_subflow_established(msk);
-	}
-
-	spin_unlock_bh(&msk->pm.lock);
+	return 0;
 }
 
 static void __mptcp_close_subflow(struct mptcp_sock *msk)
 {
 	struct mptcp_subflow_context *subflow, *tmp;
 
+	might_sleep();
+
 	list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
 		if (inet_sk_state_load(ssk) != TCP_CLOSE)
 			continue;
 
-		__mptcp_close_ssk((struct sock *)msk, ssk, subflow);
+		/* 'subflow_data_ready' will re-sched once rx queue is empty */
+		if (!skb_queue_empty_lockless(&ssk->sk_receive_queue))
+			continue;
+
+		mptcp_close_ssk((struct sock *)msk, ssk, subflow);
 	}
 }
 
@@ -2279,11 +2255,8 @@ static void mptcp_worker(struct work_struct *work)
 
 	mptcp_check_fastclose(msk);
 
-	if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
-		__mptcp_close_subflow(msk);
-
 	if (msk->pm.status)
-		pm_work(msk);
+		mptcp_pm_nl_work(msk);
 
 	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
 		mptcp_check_for_eof(msk);
@@ -2303,6 +2276,9 @@ static void mptcp_worker(struct work_struct *work)
 		goto unlock;
 	}
 
+	if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
+		__mptcp_close_subflow(msk);
+
 	if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
 		goto unlock;
 
@@ -2544,6 +2520,14 @@ static void __mptcp_destroy_sock(struct sock *sk)
 
 	pr_debug("msk=%p", msk);
 
+	might_sleep();
+
+	/* dispose the ancillatory tcp socket, if any */
+	if (msk->subflow) {
+		iput(SOCK_INODE(msk->subflow));
+		msk->subflow = NULL;
+	}
+
 	/* be sure to always acquire the join list lock, to sync vs
 	 * mptcp_finish_join().
 	 */
@@ -2594,20 +2578,10 @@ cleanup:
 	inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
 	list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-		bool slow, dispose_socket;
-		struct socket *sock;
+		bool slow = lock_sock_fast(ssk);
 
-		slow = lock_sock_fast(ssk);
-		sock = ssk->sk_socket;
-		dispose_socket = sock && sock != sk->sk_socket;
 		sock_orphan(ssk);
 		unlock_sock_fast(ssk, slow);
-
-		/* for the outgoing subflows we additionally need to free
-		 * the associated socket
-		 */
-		if (dispose_socket)
-			iput(SOCK_INODE(sock));
 	}
 
 	sock_orphan(sk);
@@ -2622,6 +2596,10 @@ cleanup:
 	release_sock(sk);
 	if (do_cancel_work)
 		mptcp_cancel_work(sk);
+
+	if (mptcp_sk(sk)->token)
+		mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
+
 	sock_put(sk);
 }
 
@@ -2936,10 +2914,16 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
 	if (!mptcp_send_head(sk))
 		return;
 
-	if (!sock_owned_by_user(sk))
-		__mptcp_subflow_push_pending(sk, ssk);
-	else
+	if (!sock_owned_by_user(sk)) {
+		struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));
+
+		if (xmit_ssk == ssk)
+			__mptcp_subflow_push_pending(sk, ssk);
+		else if (xmit_ssk)
+			mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk));
+	} else {
 		set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+	}
 }
 
 #define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)
@@ -2989,6 +2973,20 @@ static void mptcp_release_cb(struct sock *sk)
 	}
 }
 
+void mptcp_subflow_process_delegated(struct sock *ssk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+	struct sock *sk = subflow->conn;
+
+	mptcp_data_lock(sk);
+	if (!sock_owned_by_user(sk))
+		__mptcp_subflow_push_pending(sk, ssk);
+	else
+		set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+	mptcp_data_unlock(sk);
+	mptcp_subflow_delegated_done(subflow);
+}
+
 static int mptcp_hash(struct sock *sk)
 {
 	/* should never be called,
@@ -3046,12 +3044,12 @@ void mptcp_finish_connect(struct sock *ssk)
 	WRITE_ONCE(msk->can_ack, 1);
 	WRITE_ONCE(msk->snd_una, msk->write_seq);
 
-	mptcp_pm_new_connection(msk, 0);
+	mptcp_pm_new_connection(msk, ssk, 0);
 
 	mptcp_rcv_space_init(msk, ssk);
 }
 
-static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
+void mptcp_sock_graft(struct sock *sk, struct socket *parent)
 {
 	write_lock_bh(&sk->sk_callback_lock);
 	rcu_assign_pointer(sk->sk_wq, &parent->wq);
@@ -3075,7 +3073,7 @@ bool mptcp_finish_join(struct sock *ssk)
 		return false;
 
 	if (!msk->pm.server_side)
-		return true;
+		goto out;
 
 	if (!mptcp_pm_allow_new_subflow(msk))
 		return false;
@@ -3102,6 +3100,8 @@ bool mptcp_finish_join(struct sock *ssk)
 	if (parent_sock && !ssk->sk_socket)
 		mptcp_sock_graft(ssk, parent_sock);
 	subflow->map_seq = READ_ONCE(msk->ack_seq);
+out:
+	mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
 	return true;
 }
 
@@ -3278,9 +3278,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 		struct mptcp_sock *msk = mptcp_sk(newsock->sk);
 		struct mptcp_subflow_context *subflow;
 		struct sock *newsk = newsock->sk;
-		bool slowpath;
 
-		slowpath = lock_sock_fast(newsk);
+		lock_sock(newsk);
 
 		/* PM/worker can now acquire the first subflow socket
 		 * lock without racing with listener queue cleanup,
@@ -3290,10 +3289,11 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 		list_add(&subflow->node, &msk->conn_list);
 		sock_hold(msk->first);
 		if (mptcp_is_fully_established(newsk))
-			mptcp_pm_fully_established(msk);
+			mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL);
 
 		mptcp_copy_inaddrs(newsk, msk->first);
 		mptcp_rcv_space_init(msk, msk->first);
+		mptcp_propagate_sndbuf(newsk, msk->first);
 
 		/* set ssk->sk_socket of accept()ed flows to mptcp socket.
 		 * This is needed so NOSPACE flag can be set from tcp stack.
@@ -3305,7 +3305,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 			if (!ssk->sk_socket)
 				mptcp_sock_graft(ssk, newsock);
 		}
-		unlock_sock_fast(newsk, slowpath);
+		release_sock(newsk);
 	}
 
 	if (inet_csk_listen_poll(ssock->sk))
@@ -3334,7 +3334,7 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
 	if (sk_stream_is_writeable(sk))
 		return EPOLLOUT | EPOLLWRNORM;
 
-	set_bit(MPTCP_NOSPACE, &msk->flags);
+	mptcp_set_nospace(sk);
 	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
 	if (sk_stream_is_writeable(sk))
 		return EPOLLOUT | EPOLLWRNORM;
@@ -3405,13 +3405,58 @@ static struct inet_protosw mptcp_protosw = {
 	.flags		= INET_PROTOSW_ICSK,
 };
 
+static int mptcp_napi_poll(struct napi_struct *napi, int budget)
+{
+	struct mptcp_delegated_action *delegated;
+	struct mptcp_subflow_context *subflow;
+	int work_done = 0;
+
+	delegated = container_of(napi, struct mptcp_delegated_action, napi);
+	while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) {
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+		bh_lock_sock_nested(ssk);
+		if (!sock_owned_by_user(ssk) &&
+		    mptcp_subflow_has_delegated_action(subflow))
+			mptcp_subflow_process_delegated(ssk);
+		/* ... elsewhere tcp_release_cb_override already processed
+		 * the action or will do at next release_sock().
+		 * In both case must dequeue the subflow here - on the same
+		 * CPU that scheduled it.
+		 */
+		bh_unlock_sock(ssk);
+		sock_put(ssk);
+
+		if (++work_done == budget)
+			return budget;
+	}
+
+	/* always provide a 0 'work_done' argument, so that napi_complete_done
+	 * will not try accessing the NULL napi->dev ptr
+	 */
+	napi_complete_done(napi, 0);
+	return work_done;
+}
+
 void __init mptcp_proto_init(void)
 {
+	struct mptcp_delegated_action *delegated;
+	int cpu;
+
 	mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
 
 	if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
 		panic("Failed to allocate MPTCP pcpu counter\n");
 
+	init_dummy_netdev(&mptcp_napi_dev);
+	for_each_possible_cpu(cpu) {
+		delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu);
+		INIT_LIST_HEAD(&delegated->head);
+		netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll,
+				  NAPI_POLL_WEIGHT);
+		napi_enable(&delegated->napi);
+	}
+
 	mptcp_subflow_init();
 	mptcp_pm_init();
 	mptcp_token_init();
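
The centerpiece of this diff is the delegated-action machinery: each CPU owns a mptcp_delegated_action (a list head plus a NAPI instance backed by the dummy mptcp_napi_dev), mptcp_subflow_delegate() enqueues a subflow on the scheduling CPU's list at most once, and mptcp_napi_poll() later drains that list under the usual NAPI budget so the transmit runs in softirq context instead of under the wrong socket lock. The sketch below models only that queueing discipline in self-contained user-space C; every name in it (struct delegated_action, subflow_delegate, napi_poll_model, BUDGET) is a hypothetical stand-in, and the real per-CPU storage, bh_lock_sock() handling, and napi_schedule() call are deliberately left out.

/* Minimal user-space sketch of the per-CPU "delegated action" pattern the
 * patch introduces. All names here are illustrative, not the kernel API.
 */
#include <stdbool.h>
#include <stdio.h>

#define BUDGET 4 /* stand-in for the NAPI poll weight/budget */

struct subflow {
	struct subflow *next;	/* link in the delegated list */
	bool scheduled;		/* mirrors the once-only delegated status bit */
	int id;
};

/* one instance per CPU in the kernel; a single global one here */
struct delegated_action {
	struct subflow *head, *tail;
};

static struct delegated_action da;

/* enqueue at most once, as mptcp_subflow_delegate() does via its status bit */
static void subflow_delegate(struct subflow *sf)
{
	if (sf->scheduled)
		return;
	sf->scheduled = true;
	sf->next = NULL;
	if (da.tail)
		da.tail->next = sf;
	else
		da.head = sf;
	da.tail = sf;
	/* the kernel would call napi_schedule() here */
}

/* drain with a budget, like mptcp_napi_poll(); returns the work done */
static int napi_poll_model(int budget)
{
	int work_done = 0;

	while (da.head && work_done < budget) {
		struct subflow *sf = da.head;

		da.head = sf->next;
		if (!da.head)
			da.tail = NULL;
		sf->scheduled = false;	/* allow re-delegation */
		printf("processing delegated xmit for subflow %d\n", sf->id);
		work_done++;
	}
	return work_done;
}

int main(void)
{
	struct subflow a = { .id = 1 }, b = { .id = 2 };

	subflow_delegate(&a);
	subflow_delegate(&b);
	subflow_delegate(&a);	/* no-op: already scheduled */
	printf("processed %d actions\n", napi_poll_model(BUDGET));
	return 0;
}

The scheduled flag is the invariant that matters: it makes delegation idempotent, mirroring the once-only status bit mptcp_subflow_delegate() tests on the subflow context (defined in protocol.h, outside this diff), so a subflow can be delegated from any context without being queued twice.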