From 9c02bec95954252c3c01bfbb3f7560e0b95ca955 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 20 Jul 2023 17:30:11 +0200 Subject: bpf, net: Support SO_REUSEPORT sockets with bpf_sk_assign Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT sockets. This means we can't use the helper to steer traffic to Envoy, which configures SO_REUSEPORT on its sockets. In turn, we're blocked from removing TPROXY from our setup. The reason that bpf_sk_assign refuses such sockets is that the bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead, one of the reuseport sockets is selected by hash. This could cause dispatch to the "wrong" socket: sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup helpers unfortunately. In the tc context, L2 headers are at the start of the skb, while SK_REUSEPORT expects L3 headers instead. Instead, we execute the SK_REUSEPORT program when the assigned socket is pulled out of the skb, further up the stack. This creates some trickiness with regards to refcounting as bpf_sk_assign will put both refcounted and RCU freed sockets in skb->sk. reuseport sockets are RCU freed. We can infer that the sk_assigned socket is RCU freed if the reuseport lookup succeeds, but convincing yourself of this fact isn't straight forward. Therefore we defensively check refcounting on the sk_assign sock even though it's probably not required in practice. Fixes: 8e368dc72e86 ("bpf: Fix use of sk->sk_reuseport from sk_assign") Fixes: cf7fbe660f2d ("bpf: Add socket assign support") Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Cc: Joe Stringer Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/ Reviewed-by: Kuniyuki Iwashima Signed-off-by: Lorenz Bauer Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com Signed-off-by: Martin KaFai Lau --- include/net/inet6_hashtables.h | 56 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 5 deletions(-) (limited to 'include/net/inet6_hashtables.h') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index a6722d6ef80f..284b5ce7205d 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -103,6 +103,46 @@ static inline struct sock *__inet6_lookup(struct net *net, daddr, hnum, dif, sdif); } +static inline +struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff, + const struct in6_addr *saddr, const __be16 sport, + const struct in6_addr *daddr, const __be16 dport, + bool *refcounted, inet6_ehashfn_t *ehashfn) +{ + struct sock *sk, *reuse_sk; + bool prefetched; + + sk = skb_steal_sock(skb, refcounted, &prefetched); + if (!sk) + return NULL; + + if (!prefetched) + return sk; + + if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_LISTEN) + return sk; + } else if (sk->sk_protocol == IPPROTO_UDP) { + if (sk->sk_state != TCP_CLOSE) + return sk; + } else { + return sk; + } + + reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, ntohs(dport), + ehashfn); + if (!reuse_sk) + return sk; + + /* We've chosen a new reuseport sock which is never refcounted. This + * implies that sk also isn't refcounted. + */ + WARN_ON_ONCE(*refcounted); + + return reuse_sk; +} + static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, @@ -110,14 +150,20 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, int iif, int sdif, bool *refcounted) { - struct sock *sk = skb_steal_sock(skb, refcounted); - + struct net *net = dev_net(skb_dst(skb)->dev); + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct sock *sk; + + sk = inet6_steal_sock(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, dport, + refcounted, inet6_ehashfn); + if (IS_ERR(sk)) + return NULL; if (sk) return sk; - return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, - doff, &ipv6_hdr(skb)->saddr, sport, - &ipv6_hdr(skb)->daddr, ntohs(dport), + return __inet6_lookup(net, hashinfo, skb, + doff, &ip6h->saddr, sport, + &ip6h->daddr, ntohs(dport), iif, sdif, refcounted); } -- cgit v1.2.3