summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
authorAlexei Starovoitov <ast@kernel.org>2021-04-01 20:56:15 +0300
committerAlexei Starovoitov <ast@kernel.org>2021-04-01 20:56:15 +0300
commit89d69c5d0fbcabd8656459bc8b1a476d6f1efee4 (patch)
treebdcfa87c0cf209ce0de77a4b8cfaec4302e4f10f /net/ipv4
parente27bfefb21f28d5295432f042b5d9d7871100c35 (diff)
parent8d7cb74f2ccb5486ab8c631a8fcdc7621bbbc42c (diff)
downloadlinux-89d69c5d0fbcabd8656459bc8b1a476d6f1efee4.tar.xz
Merge branch 'sockmap: introduce BPF_SK_SKB_VERDICT and support UDP'
Cong Wang says: ==================== From: Cong Wang <cong.wang@bytedance.com> We have thousands of services connected to a daemon on every host via AF_UNIX dgram sockets, after they are moved into VM, we have to add a proxy to forward these communications from VM to host, because rewriting thousands of them is not practical. This proxy uses an AF_UNIX socket connected to services and a UDP socket to connect to the host. It is inefficient because data is copied between kernel space and user space twice, and we can not use splice() which only supports TCP. Therefore, we want to use sockmap to do the splicing without going to user-space at all (after the initial setup). Currently sockmap only fully supports TCP, UDP is partially supported as it is only allowed to add into sockmap. This patchset, as the second part of the original large patchset, extends sockmap with: 1) cross-protocol support with BPF_SK_SKB_VERDICT; 2) full UDP support. On the high level, ->read_sock() is required for each protocol to support sockmap redirection, and in order to do sock proto update, a new ops ->psock_update_sk_prot() is introduced, which is also required. And the BPF ->recvmsg() is also needed to replace the original ->recvmsg() to retrieve skmsg. To make life easier, we have to get rid of lock_sock() in sk_psock_handle_skb(), otherwise we would have to implement ->sendmsg_locked() on top of ->sendmsg(), which is ugly. Please see each patch for more details. To see the big picture, the original patchset is available here: https://github.com/congwang/linux/tree/sockmap this patchset is also available: https://github.com/congwang/linux/tree/sockmap2 --- v8: get rid of 'offset' in udp_read_sock() add checks for skb_verdict/stream_verdict conflict add two cleanup patches for sock_map_link() add a new test case v7: use work_mutex to protect psock->work return err in udp_read_sock() add patch 6/13 clean up test case v6: get rid of sk_psock_zap_ingress() add rcu work patch v5: use INDIRECT_CALL_2() for function pointers use ingress_lock to fix a race condition found by Jacub rename two helper functions v4: get rid of lock_sock() in sk_psock_handle_skb() get rid of udp_sendmsg_locked() remove an empty line update cover letter v3: export tcp/udp_update_proto() rename sk->sk_prot->psock_update_sk_prot() improve changelogs v2: separate from the original large patchset rebase to the latest bpf-next split UDP test case move inet_csk_has_ulp() check to tcp_bpf.c clean up udp_read_sock() ==================== Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c1
-rw-r--r--net/ipv4/tcp_bpf.c130
-rw-r--r--net/ipv4/tcp_ipv4.c3
-rw-r--r--net/ipv4/udp.c32
-rw-r--r--net/ipv4/udp_bpf.c79
5 files changed, 135 insertions, 110 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1355e6c0d567..f17870ee558b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1070,6 +1070,7 @@ const struct proto_ops inet_dgram_ops = {
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
+ .read_sock = udp_read_sock,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 17c322b875fd..3d622a0d0753 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -10,86 +10,6 @@
#include <net/inet_common.h>
#include <net/tls.h>
-int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
- struct msghdr *msg, int len, int flags)
-{
- struct iov_iter *iter = &msg->msg_iter;
- int peek = flags & MSG_PEEK;
- struct sk_msg *msg_rx;
- int i, copied = 0;
-
- msg_rx = list_first_entry_or_null(&psock->ingress_msg,
- struct sk_msg, list);
-
- while (copied != len) {
- struct scatterlist *sge;
-
- if (unlikely(!msg_rx))
- break;
-
- i = msg_rx->sg.start;
- do {
- struct page *page;
- int copy;
-
- sge = sk_msg_elem(msg_rx, i);
- copy = sge->length;
- page = sg_page(sge);
- if (copied + copy > len)
- copy = len - copied;
- copy = copy_page_to_iter(page, sge->offset, copy, iter);
- if (!copy)
- return copied ? copied : -EFAULT;
-
- copied += copy;
- if (likely(!peek)) {
- sge->offset += copy;
- sge->length -= copy;
- if (!msg_rx->skb)
- sk_mem_uncharge(sk, copy);
- msg_rx->sg.size -= copy;
-
- if (!sge->length) {
- sk_msg_iter_var_next(i);
- if (!msg_rx->skb)
- put_page(page);
- }
- } else {
- /* Lets not optimize peek case if copy_page_to_iter
- * didn't copy the entire length lets just break.
- */
- if (copy != sge->length)
- return copied;
- sk_msg_iter_var_next(i);
- }
-
- if (copied == len)
- break;
- } while (i != msg_rx->sg.end);
-
- if (unlikely(peek)) {
- if (msg_rx == list_last_entry(&psock->ingress_msg,
- struct sk_msg, list))
- break;
- msg_rx = list_next_entry(msg_rx, list);
- continue;
- }
-
- msg_rx->sg.start = i;
- if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
- list_del(&msg_rx->list);
- if (msg_rx->skb)
- consume_skb(msg_rx->skb);
- kfree(msg_rx);
- }
- msg_rx = list_first_entry_or_null(&psock->ingress_msg,
- struct sk_msg, list);
- }
-
- return copied;
-}
-EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
-
static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, u32 apply_bytes, int flags)
{
@@ -243,28 +163,6 @@ static bool tcp_bpf_stream_read(const struct sock *sk)
return !empty;
}
-static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
- int flags, long timeo, int *err)
-{
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
- int ret = 0;
-
- if (sk->sk_shutdown & RCV_SHUTDOWN)
- return 1;
-
- if (!timeo)
- return ret;
-
- add_wait_queue(sk_sleep(sk), &wait);
- sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- ret = sk_wait_event(sk, &timeo,
- !list_empty(&psock->ingress_msg) ||
- !skb_queue_empty(&sk->sk_receive_queue), &wait);
- sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- remove_wait_queue(sk_sleep(sk), &wait);
- return ret;
-}
-
static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len)
{
@@ -284,13 +182,13 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
}
lock_sock(sk);
msg_bytes_ready:
- copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
+ copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
if (!copied) {
int data, err = 0;
long timeo;
timeo = sock_rcvtimeo(sk, nonblock);
- data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
+ data = sk_msg_wait_data(sk, psock, flags, timeo, &err);
if (data) {
if (!sk_psock_queue_empty(psock))
goto msg_bytes_ready;
@@ -601,20 +499,38 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops)
ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
}
-struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
+int tcp_bpf_update_proto(struct sock *sk, bool restore)
{
+ struct sk_psock *psock = sk_psock(sk);
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
+ if (restore) {
+ if (inet_csk_has_ulp(sk)) {
+ tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space);
+ } else {
+ sk->sk_write_space = psock->saved_write_space;
+ /* Pairs with lockless read in sk_clone_lock() */
+ WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+ }
+ return 0;
+ }
+
+ if (inet_csk_has_ulp(sk))
+ return -EINVAL;
+
if (sk->sk_family == AF_INET6) {
if (tcp_bpf_assert_proto_ops(psock->sk_proto))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
tcp_bpf_check_v6_needs_rebuild(psock->sk_proto);
}
- return &tcp_bpf_prots[family][config];
+ /* Pairs with lockless read in sk_clone_lock() */
+ WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]);
+ return 0;
}
+EXPORT_SYMBOL_GPL(tcp_bpf_update_proto);
/* If a child got cloned from a listening socket that had tcp_bpf
* protocol callbacks installed, we need to restore the callbacks to
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index daad4f99db32..dfc6d1c0e710 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2806,6 +2806,9 @@ struct proto tcp_prot = {
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = tcp_bpf_update_proto,
+#endif
.enter_memory_pressure = tcp_enter_memory_pressure,
.leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4a0478b17243..4d02f6839e38 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1782,6 +1782,35 @@ busy_check:
}
EXPORT_SYMBOL(__skb_recv_udp);
+int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ int copied = 0;
+
+ while (1) {
+ struct sk_buff *skb;
+ int err, used;
+
+ skb = skb_recv_udp(sk, 0, 1, &err);
+ if (!skb)
+ return err;
+ used = recv_actor(desc, skb, 0, skb->len);
+ if (used <= 0) {
+ if (!copied)
+ copied = used;
+ break;
+ } else if (used <= skb->len) {
+ copied += used;
+ }
+
+ if (!desc->count)
+ break;
+ }
+
+ return copied;
+}
+EXPORT_SYMBOL(udp_read_sock);
+
/*
* This should be easy, if there is something there we
* return it, otherwise we block.
@@ -2849,6 +2878,9 @@ struct proto udp_prot = {
.unhash = udp_lib_unhash,
.rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = udp_bpf_update_proto,
+#endif
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 7a94791efc1a..7d5c4ebf42fe 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -4,6 +4,68 @@
#include <linux/skmsg.h>
#include <net/sock.h>
#include <net/udp.h>
+#include <net/inet_common.h>
+
+#include "udp_impl.h"
+
+static struct proto *udpv6_prot_saved __read_mostly;
+
+static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return udpv6_prot_saved->recvmsg(sk, msg, len, noblock, flags,
+ addr_len);
+#endif
+ return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len);
+}
+
+static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len)
+{
+ struct sk_psock *psock;
+ int copied, ret;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+
+ lock_sock(sk);
+ if (sk_psock_queue_empty(psock)) {
+ ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ goto out;
+ }
+
+msg_bytes_ready:
+ copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+ if (!copied) {
+ int data, err = 0;
+ long timeo;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+ data = sk_msg_wait_data(sk, psock, flags, timeo, &err);
+ if (data) {
+ if (!sk_psock_queue_empty(psock))
+ goto msg_bytes_ready;
+ ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ goto out;
+ }
+ if (err) {
+ ret = err;
+ goto out;
+ }
+ copied = -EAGAIN;
+ }
+ ret = copied;
+out:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return ret;
+}
enum {
UDP_BPF_IPV4,
@@ -11,7 +73,6 @@ enum {
UDP_BPF_NUM_PROTS,
};
-static struct proto *udpv6_prot_saved __read_mostly;
static DEFINE_SPINLOCK(udpv6_prot_lock);
static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
@@ -20,6 +81,7 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
*prot = *base;
prot->unhash = sock_map_unhash;
prot->close = sock_map_close;
+ prot->recvmsg = udp_bpf_recvmsg;
}
static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
@@ -41,12 +103,23 @@ static int __init udp_bpf_v4_build_proto(void)
}
core_initcall(udp_bpf_v4_build_proto);
-struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
+int udp_bpf_update_proto(struct sock *sk, bool restore)
{
int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6;
+ struct sk_psock *psock = sk_psock(sk);
+
+ if (restore) {
+ sk->sk_write_space = psock->saved_write_space;
+ /* Pairs with lockless read in sk_clone_lock() */
+ WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+ return 0;
+ }
if (sk->sk_family == AF_INET6)
udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
- return &udp_bpf_prots[family];
+ /* Pairs with lockless read in sk_clone_lock() */
+ WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]);
+ return 0;
}
+EXPORT_SYMBOL_GPL(udp_bpf_update_proto);