diff options
author | Alexei Starovoitov <ast@kernel.org> | 2021-04-01 20:56:15 +0300 |
---|---|---|
committer | Alexei Starovoitov <ast@kernel.org> | 2021-04-01 20:56:15 +0300 |
commit | 89d69c5d0fbcabd8656459bc8b1a476d6f1efee4 (patch) | |
tree | bdcfa87c0cf209ce0de77a4b8cfaec4302e4f10f /net/ipv4 | |
parent | e27bfefb21f28d5295432f042b5d9d7871100c35 (diff) | |
parent | 8d7cb74f2ccb5486ab8c631a8fcdc7621bbbc42c (diff) | |
download | linux-89d69c5d0fbcabd8656459bc8b1a476d6f1efee4.tar.xz |
Merge branch 'sockmap: introduce BPF_SK_SKB_VERDICT and support UDP'
Cong Wang says:
====================
From: Cong Wang <cong.wang@bytedance.com>
We have thousands of services connected to a daemon on every host
via AF_UNIX dgram sockets, after they are moved into VM, we have to
add a proxy to forward these communications from VM to host, because
rewriting thousands of them is not practical. This proxy uses an
AF_UNIX socket connected to services and a UDP socket to connect to
the host. It is inefficient because data is copied between kernel
space and user space twice, and we can not use splice() which only
supports TCP. Therefore, we want to use sockmap to do the splicing
without going to user-space at all (after the initial setup).
Currently sockmap only fully supports TCP, UDP is partially supported
as it is only allowed to add into sockmap. This patchset, as the second
part of the original large patchset, extends sockmap with:
1) cross-protocol support with BPF_SK_SKB_VERDICT; 2) full UDP support.
On the high level, ->read_sock() is required for each protocol to support
sockmap redirection, and in order to do sock proto update, a new ops
->psock_update_sk_prot() is introduced, which is also required. And the
BPF ->recvmsg() is also needed to replace the original ->recvmsg() to
retrieve skmsg. To make life easier, we have to get rid of lock_sock()
in sk_psock_handle_skb(), otherwise we would have to implement
->sendmsg_locked() on top of ->sendmsg(), which is ugly.
Please see each patch for more details.
To see the big picture, the original patchset is available here:
https://github.com/congwang/linux/tree/sockmap
this patchset is also available:
https://github.com/congwang/linux/tree/sockmap2
---
v8: get rid of 'offset' in udp_read_sock()
add checks for skb_verdict/stream_verdict conflict
add two cleanup patches for sock_map_link()
add a new test case
v7: use work_mutex to protect psock->work
return err in udp_read_sock()
add patch 6/13
clean up test case
v6: get rid of sk_psock_zap_ingress()
add rcu work patch
v5: use INDIRECT_CALL_2() for function pointers
use ingress_lock to fix a race condition found by Jacub
rename two helper functions
v4: get rid of lock_sock() in sk_psock_handle_skb()
get rid of udp_sendmsg_locked()
remove an empty line
update cover letter
v3: export tcp/udp_update_proto()
rename sk->sk_prot->psock_update_sk_prot()
improve changelogs
v2: separate from the original large patchset
rebase to the latest bpf-next
split UDP test case
move inet_csk_has_ulp() check to tcp_bpf.c
clean up udp_read_sock()
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp_bpf.c | 130 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 3 | ||||
-rw-r--r-- | net/ipv4/udp.c | 32 | ||||
-rw-r--r-- | net/ipv4/udp_bpf.c | 79 |
5 files changed, 135 insertions, 110 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1355e6c0d567..f17870ee558b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1070,6 +1070,7 @@ const struct proto_ops inet_dgram_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, + .read_sock = udp_read_sock, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 17c322b875fd..3d622a0d0753 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -10,86 +10,6 @@ #include <net/inet_common.h> #include <net/tls.h> -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags) -{ - struct iov_iter *iter = &msg->msg_iter; - int peek = flags & MSG_PEEK; - struct sk_msg *msg_rx; - int i, copied = 0; - - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - - while (copied != len) { - struct scatterlist *sge; - - if (unlikely(!msg_rx)) - break; - - i = msg_rx->sg.start; - do { - struct page *page; - int copy; - - sge = sk_msg_elem(msg_rx, i); - copy = sge->length; - page = sg_page(sge); - if (copied + copy > len) - copy = len - copied; - copy = copy_page_to_iter(page, sge->offset, copy, iter); - if (!copy) - return copied ? copied : -EFAULT; - - copied += copy; - if (likely(!peek)) { - sge->offset += copy; - sge->length -= copy; - if (!msg_rx->skb) - sk_mem_uncharge(sk, copy); - msg_rx->sg.size -= copy; - - if (!sge->length) { - sk_msg_iter_var_next(i); - if (!msg_rx->skb) - put_page(page); - } - } else { - /* Lets not optimize peek case if copy_page_to_iter - * didn't copy the entire length lets just break. - */ - if (copy != sge->length) - return copied; - sk_msg_iter_var_next(i); - } - - if (copied == len) - break; - } while (i != msg_rx->sg.end); - - if (unlikely(peek)) { - if (msg_rx == list_last_entry(&psock->ingress_msg, - struct sk_msg, list)) - break; - msg_rx = list_next_entry(msg_rx, list); - continue; - } - - msg_rx->sg.start = i; - if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { - list_del(&msg_rx->list); - if (msg_rx->skb) - consume_skb(msg_rx->skb); - kfree(msg_rx); - } - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - } - - return copied; -} -EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); - static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes, int flags) { @@ -243,28 +163,6 @@ static bool tcp_bpf_stream_read(const struct sock *sk) return !empty; } -static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, - int flags, long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret = 0; - - if (sk->sk_shutdown & RCV_SHUTDOWN) - return 1; - - if (!timeo) - return ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} - static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -284,13 +182,13 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } lock_sock(sk); msg_bytes_ready: - copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { int data, err = 0; long timeo; timeo = sock_rcvtimeo(sk, nonblock); - data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); + data = sk_msg_wait_data(sk, psock, flags, timeo, &err); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; @@ -601,20 +499,38 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int tcp_bpf_update_proto(struct sock *sk, bool restore) { + struct sk_psock *psock = sk_psock(sk); int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + if (restore) { + if (inet_csk_has_ulp(sk)) { + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); + } else { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + } + return 0; + } + + if (inet_csk_has_ulp(sk)) + return -EINVAL; + if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) - return ERR_PTR(-EINVAL); + return -EINVAL; tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } - return &tcp_bpf_prots[family][config]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); + return 0; } +EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); /* If a child got cloned from a listening socket that had tcp_bpf * protocol callbacks installed, we need to restore the callbacks to diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index daad4f99db32..dfc6d1c0e710 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2806,6 +2806,9 @@ struct proto tcp_prot = { .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = tcp_bpf_update_proto, +#endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4a0478b17243..4d02f6839e38 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1782,6 +1782,35 @@ busy_check: } EXPORT_SYMBOL(__skb_recv_udp); +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + int copied = 0; + + while (1) { + struct sk_buff *skb; + int err, used; + + skb = skb_recv_udp(sk, 0, 1, &err); + if (!skb) + return err; + used = recv_actor(desc, skb, 0, skb->len); + if (used <= 0) { + if (!copied) + copied = used; + break; + } else if (used <= skb->len) { + copied += used; + } + + if (!desc->count) + break; + } + + return copied; +} +EXPORT_SYMBOL(udp_read_sock); + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -2849,6 +2878,9 @@ struct proto udp_prot = { .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = udp_bpf_update_proto, +#endif .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 7a94791efc1a..7d5c4ebf42fe 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -4,6 +4,68 @@ #include <linux/skmsg.h> #include <net/sock.h> #include <net/udp.h> +#include <net/inet_common.h> + +#include "udp_impl.h" + +static struct proto *udpv6_prot_saved __read_mostly; + +static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return udpv6_prot_saved->recvmsg(sk, msg, len, noblock, flags, + addr_len); +#endif + return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len); +} + +static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct sk_psock *psock; + int copied, ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); + if (sk_psock_queue_empty(psock)) { + ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + goto out; + } + +msg_bytes_ready: + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + if (!copied) { + int data, err = 0; + long timeo; + + timeo = sock_rcvtimeo(sk, nonblock); + data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + if (data) { + if (!sk_psock_queue_empty(psock)) + goto msg_bytes_ready; + ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + goto out; + } + if (err) { + ret = err; + goto out; + } + copied = -EAGAIN; + } + ret = copied; +out: + release_sock(sk); + sk_psock_put(sk, psock); + return ret; +} enum { UDP_BPF_IPV4, @@ -11,7 +73,6 @@ enum { UDP_BPF_NUM_PROTS, }; -static struct proto *udpv6_prot_saved __read_mostly; static DEFINE_SPINLOCK(udpv6_prot_lock); static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; @@ -20,6 +81,7 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) *prot = *base; prot->unhash = sock_map_unhash; prot->close = sock_map_close; + prot->recvmsg = udp_bpf_recvmsg; } static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) @@ -41,12 +103,23 @@ static int __init udp_bpf_v4_build_proto(void) } core_initcall(udp_bpf_v4_build_proto); -struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int udp_bpf_update_proto(struct sock *sk, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; + struct sk_psock *psock = sk_psock(sk); + + if (restore) { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + return 0; + } if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); - return &udp_bpf_prots[family]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); + return 0; } +EXPORT_SYMBOL_GPL(udp_bpf_update_proto); |