From a26527981af2988ae0f17f6d633848c019929e38 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 20 Mar 2020 10:34:25 +0800 Subject: bpf, tcp: Fix unused function warnings If BPF_STREAM_PARSER is not set, gcc warns: net/ipv4/tcp_bpf.c:483:12: warning: 'tcp_bpf_sendpage' defined but not used [-Wunused-function] net/ipv4/tcp_bpf.c:395:12: warning: 'tcp_bpf_sendmsg' defined but not used [-Wunused-function] net/ipv4/tcp_bpf.c:13:13: warning: 'tcp_bpf_stream_read' defined but not used [-Wunused-function] Moves the unused functions into the #ifdef CONFIG_BPF_STREAM_PARSER. Fixes: f747632b608f ("bpf: sockmap: Move generic sockmap hooks from BPF TCP") Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Daniel Borkmann Reviewed-by: Lorenz Bauer Reviewed-by: Jakub Sitnicki Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200320023426.60684-2-yuehaibing@huawei.com --- net/ipv4/tcp_bpf.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index fe7b4fbc31c1..37c91f25cae3 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -10,19 +10,6 @@ #include #include -static bool tcp_bpf_stream_read(const struct sock *sk) -{ - struct sk_psock *psock; - bool empty = true; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) - empty = list_empty(&psock->ingress_msg); - rcu_read_unlock(); - return !empty; -} - static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, int flags, long timeo, int *err) { @@ -298,6 +285,20 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, } EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); +#ifdef CONFIG_BPF_STREAM_PARSER +static bool tcp_bpf_stream_read(const struct sock *sk) +{ + struct sk_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) + empty = list_empty(&psock->ingress_msg); + rcu_read_unlock(); + return !empty; +} + static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { @@ -528,7 +529,6 @@ out_err: return copied ? copied : err; } -#ifdef CONFIG_BPF_STREAM_PARSER enum { TCP_BPF_IPV4, TCP_BPF_IPV6, -- cgit v1.2.3 From c0fd336ea4ca82377fbb0954db09d765eb794339 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 20 Mar 2020 10:34:26 +0800 Subject: bpf, tcp: Make tcp_bpf_recvmsg static After commit f747632b608f ("bpf: sockmap: Move generic sockmap hooks from BPF TCP"), tcp_bpf_recvmsg() is not used out of tcp_bpf.c, so make it static and remove it from tcp.h. Also move it to BPF_STREAM_PARSER #ifdef to fix unused function warnings. Fixes: f747632b608f ("bpf: sockmap: Move generic sockmap hooks from BPF TCP") Signed-off-by: YueHaibing Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200320023426.60684-3-yuehaibing@huawei.com --- include/net/tcp.h | 2 - net/ipv4/tcp_bpf.c | 124 ++++++++++++++++++++++++++--------------------------- 2 files changed, 62 insertions(+), 64 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 43fa07a36fa6..5fa9eacd965a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2207,8 +2207,6 @@ static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) #ifdef CONFIG_NET_SOCK_MSG int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); -int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len); int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 37c91f25cae3..5a05327f97c1 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -10,25 +10,6 @@ #include #include -static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, - int flags, long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret = 0; - - if (!timeo) - return ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} - int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags) { @@ -102,49 +83,6 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, } EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); -int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len) -{ - struct sk_psock *psock; - int copied, ret; - - psock = sk_psock_get(sk); - if (unlikely(!psock)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue) && - sk_psock_queue_empty(psock)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); -msg_bytes_ready: - copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); - if (!copied) { - int data, err = 0; - long timeo; - - timeo = sock_rcvtimeo(sk, nonblock); - data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); - if (data) { - if (!sk_psock_queue_empty(psock)) - goto msg_bytes_ready; - release_sock(sk); - sk_psock_put(sk, psock); - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - } - if (err) { - ret = err; - goto out; - } - copied = -EAGAIN; - } - ret = copied; -out: - release_sock(sk); - sk_psock_put(sk, psock); - return ret; -} - static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes, int flags) { @@ -299,6 +237,68 @@ static bool tcp_bpf_stream_read(const struct sock *sk) return !empty; } +static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, + int flags, long timeo, int *err) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + +static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct sk_psock *psock; + int copied, ret; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue) && + sk_psock_queue_empty(psock)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + lock_sock(sk); +msg_bytes_ready: + copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); + if (!copied) { + int data, err = 0; + long timeo; + + timeo = sock_rcvtimeo(sk, nonblock); + data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); + if (data) { + if (!sk_psock_queue_empty(psock)) + goto msg_bytes_ready; + release_sock(sk); + sk_psock_put(sk, psock); + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + } + if (err) { + ret = err; + goto out; + } + copied = -EAGAIN; + } + ret = copied; +out: + release_sock(sk); + sk_psock_put(sk, psock); + return ret; +} + static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { -- cgit v1.2.3 From ab14fd4ee82ead4b058034509971b8bd749862ff Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 20 Mar 2020 08:21:01 -0700 Subject: bpf: Add bpf_sk_storage support to bpf_tcp_ca This patch adds bpf_sk_storage_get() and bpf_sk_storage_delete() helper to the bpf_tcp_ca's struct_ops. That would allow bpf-tcp-cc to: 1) share sk private data with other bpf progs. 2) use bpf_sk_storage as a private storage for a bpf-tcp-cc if the existing icsk_ca_priv is not big enough. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200320152101.2169498-1-kafai@fb.com --- net/ipv4/bpf_tcp_ca.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'net') diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 574972bc7299..0fd8bfde2448 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -7,6 +7,7 @@ #include #include #include +#include static u32 optional_ops[] = { offsetof(struct tcp_congestion_ops, init), @@ -27,6 +28,27 @@ static u32 unsupported_ops[] = { static const struct btf_type *tcp_sock_type; static u32 tcp_sock_id, sock_id; +static int btf_sk_storage_get_ids[5]; +static struct bpf_func_proto btf_sk_storage_get_proto __read_mostly; + +static int btf_sk_storage_delete_ids[5]; +static struct bpf_func_proto btf_sk_storage_delete_proto __read_mostly; + +static void convert_sk_func_proto(struct bpf_func_proto *to, int *to_btf_ids, + const struct bpf_func_proto *from) +{ + int i; + + *to = *from; + to->btf_id = to_btf_ids; + for (i = 0; i < ARRAY_SIZE(to->arg_type); i++) { + if (to->arg_type[i] == ARG_PTR_TO_SOCKET) { + to->arg_type[i] = ARG_PTR_TO_BTF_ID; + to->btf_id[i] = tcp_sock_id; + } + } +} + static int bpf_tcp_ca_init(struct btf *btf) { s32 type_id; @@ -42,6 +64,13 @@ static int bpf_tcp_ca_init(struct btf *btf) tcp_sock_id = type_id; tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); + convert_sk_func_proto(&btf_sk_storage_get_proto, + btf_sk_storage_get_ids, + &bpf_sk_storage_get_proto); + convert_sk_func_proto(&btf_sk_storage_delete_proto, + btf_sk_storage_delete_ids, + &bpf_sk_storage_delete_proto); + return 0; } @@ -167,6 +196,10 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, switch (func_id) { case BPF_FUNC_tcp_send_ack: return &bpf_tcp_send_ack_proto; + case BPF_FUNC_sk_storage_get: + return &btf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &btf_sk_storage_delete_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 0e53d9e5e82056555020c47ee0c7a087147be084 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 27 Mar 2020 16:58:50 +0100 Subject: bpf: Enable retrieval of socket cookie for bind/post-bind hook We currently make heavy use of the socket cookie in BPF's connect(), sendmsg() and recvmsg() hooks for load-balancing decisions. However, it is currently not enabled/implemented in BPF {post-}bind hooks where it can later be used in combination for correlation in the tc egress path, for example. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/e9d71f310715332f12d238cc650c1edc5be55119.1585323121.git.daniel@iogearbox.net --- net/core/filter.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 96350a743539..0b6682517d45 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4117,6 +4117,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx) +{ + return sock_gen_cookie(ctx); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { + .func = bpf_get_socket_cookie_sock, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return sock_gen_cookie(ctx->sk); @@ -5954,6 +5966,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From fcf752ea8c9493acc33280e72112ea6108a0d104 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 27 Mar 2020 16:58:51 +0100 Subject: bpf: Enable perf event rb output for bpf cgroup progs Currently, connect(), sendmsg(), recvmsg() and bind-related hooks are all lacking perf event rb output in order to push notifications or monitoring events up to user space. Back in commit a5a3a828cd00 ("bpf: add perf event notificaton support for sock_ops"), I've worked with Sowmini to enable them for sock_ops where the context part is not used (as opposed to skbs for example where the packet data can be appended). Make the bpf_sockopt_event_output() helper generic and enable it for mentioned hooks. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/69c39daf87e076b31e52473c902e9bfd37559124.1585323121.git.daniel@iogearbox.net --- net/core/filter.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 0b6682517d45..6cb7e0e24473 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4159,8 +4159,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, u64, flags, void *, data, u64, size) +BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, + void *, data, u64, size) { if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; @@ -4168,8 +4168,8 @@ BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock, return bpf_event_output(map, flags, data, size, NULL, 0, NULL); } -static const struct bpf_func_proto bpf_sockopt_event_output_proto = { - .func = bpf_sockopt_event_output, +static const struct bpf_func_proto bpf_event_output_data_proto = { + .func = bpf_event_output_data, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -5968,6 +5968,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: return bpf_base_func_proto(func_id); } @@ -5994,6 +5996,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; @@ -6236,7 +6240,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: - return &bpf_sockopt_event_output_proto; + return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: -- cgit v1.2.3 From f318903c0bf42448b4c884732df2bbb0ef7a2284 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 27 Mar 2020 16:58:52 +0100 Subject: bpf: Add netns cookie and enable it for bpf cgroup hooks In Cilium we're mainly using BPF cgroup hooks today in order to implement kube-proxy free Kubernetes service translation for ClusterIP, NodePort (*), ExternalIP, and LoadBalancer as well as HostPort mapping [0] for all traffic between Cilium managed nodes. While this works in its current shape and avoids packet-level NAT for inter Cilium managed node traffic, there is one major limitation we're facing today, that is, lack of netns awareness. In Kubernetes, the concept of Pods (which hold one or multiple containers) has been built around network namespaces, so while we can use the global scope of attaching to root BPF cgroup hooks also to our advantage (e.g. for exposing NodePort ports on loopback addresses), we also have the need to differentiate between initial network namespaces and non-initial one. For example, ExternalIP services mandate that non-local service IPs are not to be translated from the host (initial) network namespace as one example. Right now, we have an ugly work-around in place where non-local service IPs for ExternalIP services are not xlated from connect() and friends BPF hooks but instead via less efficient packet-level NAT on the veth tc ingress hook for Pod traffic. On top of determining whether we're in initial or non-initial network namespace we also have a need for a socket-cookie like mechanism for network namespaces scope. Socket cookies have the nice property that they can be combined as part of the key structure e.g. for BPF LRU maps without having to worry that the cookie could be recycled. We are planning to use this for our sessionAffinity implementation for services. Therefore, add a new bpf_get_netns_cookie() helper which would resolve both use cases at once: bpf_get_netns_cookie(NULL) would provide the cookie for the initial network namespace while passing the context instead of NULL would provide the cookie from the application's network namespace. We're using a hole, so no size increase; the assignment happens only once. Therefore this allows for a comparison on initial namespace as well as regular cookie usage as we have today with socket cookies. We could later on enable this helper for other program types as well as we would see need. (*) Both externalTrafficPolicy={Local|Cluster} types [0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/c47d2346982693a9cf9da0e12690453aded4c788.1585323121.git.daniel@iogearbox.net --- include/linux/bpf.h | 1 + include/net/net_namespace.h | 10 ++++++++++ include/uapi/linux/bpf.h | 16 +++++++++++++++- kernel/bpf/verifier.c | 16 ++++++++++------ net/core/filter.c | 37 +++++++++++++++++++++++++++++++++++++ net/core/net_namespace.c | 15 +++++++++++++++ tools/include/uapi/linux/bpf.h | 16 +++++++++++++++- 7 files changed, 103 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bdb981c204fa..78046c570596 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -233,6 +233,7 @@ enum bpf_arg_type { ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ + ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 854d39ef1ca3..1c6edfdb9a2c 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -168,6 +168,9 @@ struct net { #ifdef CONFIG_XFRM struct netns_xfrm xfrm; #endif + + atomic64_t net_cookie; /* written once */ + #if IS_ENABLED(CONFIG_IP_VS) struct netns_ipvs *ipvs; #endif @@ -273,6 +276,8 @@ static inline int check_net(const struct net *net) void net_drop_ns(void *); +u64 net_gen_cookie(struct net *net); + #else static inline struct net *get_net(struct net *net) @@ -300,6 +305,11 @@ static inline int check_net(const struct net *net) return 1; } +static inline u64 net_gen_cookie(struct net *net) +{ + return 0; +} + #define net_drop_ns NULL #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5d01c5c7e598..bd81c4555206 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2950,6 +2950,19 @@ union bpf_attr { * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_netns_cookie(void *ctx) + * Description + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of bpf_get_socket_cookie() helper, but for network namespaces + * instead of sockets. + * Return + * A 8-byte long opaque number. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3073,7 +3086,8 @@ union bpf_attr { FN(jiffies64), \ FN(read_branch_records), \ FN(get_ns_current_pid_tgid), \ - FN(xdp_output), + FN(xdp_output), \ + FN(get_netns_cookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ea2a868324e..46ba86c540e2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3461,13 +3461,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = CONST_PTR_TO_MAP; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_CTX) { + } else if (arg_type == ARG_PTR_TO_CTX || + arg_type == ARG_PTR_TO_CTX_OR_NULL) { expected_type = PTR_TO_CTX; - if (type != expected_type) - goto err_type; - err = check_ctx_reg(env, reg, regno); - if (err < 0) - return err; + if (!(register_is_null(reg) && + arg_type == ARG_PTR_TO_CTX_OR_NULL)) { + if (type != expected_type) + goto err_type; + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; + } } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { expected_type = PTR_TO_SOCK_COMMON; /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ diff --git a/net/core/filter.c b/net/core/filter.c index 6cb7e0e24473..e249a499cbe5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4141,6 +4141,39 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static u64 __bpf_get_netns_cookie(struct sock *sk) +{ +#ifdef CONFIG_NET_NS + return net_gen_cookie(sk ? sk->sk_net.net : &init_net); +#else + return 0; +#endif +} + +BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) +{ + return __bpf_get_netns_cookie(ctx); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = { + .func = bpf_get_netns_cookie_sock, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { + .func = bpf_get_netns_cookie_sock_addr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); @@ -5968,6 +6001,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: @@ -5994,6 +6029,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_addr_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 757cc1d084e7..190ca66a383b 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(pernet_ops_rwsem); static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; +static atomic64_t cookie_gen; + +u64 net_gen_cookie(struct net *net) +{ + while (1) { + u64 res = atomic64_read(&net->net_cookie); + + if (res) + return res; + res = atomic64_inc_return(&cookie_gen); + atomic64_cmpxchg(&net->net_cookie, 0, res); + } +} + static struct net_generic *net_alloc_generic(void) { struct net_generic *ng; @@ -1087,6 +1101,7 @@ static int __init net_ns_init(void) panic("Could not allocate generic netns"); rcu_assign_pointer(init_net.gen, ng); + net_gen_cookie(&init_net); down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5d01c5c7e598..bd81c4555206 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2950,6 +2950,19 @@ union bpf_attr { * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_netns_cookie(void *ctx) + * Description + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of bpf_get_socket_cookie() helper, but for network namespaces + * instead of sockets. + * Return + * A 8-byte long opaque number. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3073,7 +3086,8 @@ union bpf_attr { FN(jiffies64), \ FN(read_branch_records), \ FN(get_ns_current_pid_tgid), \ - FN(xdp_output), + FN(xdp_output), \ + FN(get_netns_cookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 5a52ae4e32a61ad06ef67f0b3123adbdbac4fb83 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 27 Mar 2020 16:58:53 +0100 Subject: bpf: Allow to retrieve cgroup v1 classid from v2 hooks Today, Kubernetes is still operating on cgroups v1, however, it is possible to retrieve the task's classid based on 'current' out of connect(), sendmsg(), recvmsg() and bind-related hooks for orchestrators which attach to the root cgroup v2 hook in a mixed env like in case of Cilium, for example, in order to then correlate certain pod traffic and use it as part of the key for BPF map lookups. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/555e1c69db7376c0947007b4951c260e1074efc3.1585323121.git.daniel@iogearbox.net --- include/net/cls_cgroup.h | 7 ++++++- net/core/filter.c | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index 4295de3e6a4b..7e78e7d6f015 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -45,9 +45,14 @@ static inline void sock_update_classid(struct sock_cgroup_data *skcd) sock_cgroup_set_classid(skcd, classid); } +static inline u32 __task_get_classid(struct task_struct *task) +{ + return task_cls_state(task)->classid; +} + static inline u32 task_get_classid(const struct sk_buff *skb) { - u32 classid = task_cls_state(current)->classid; + u32 classid = __task_get_classid(current); /* Due to the nature of the classifier it is required to ignore all * packets originating from softirq context as accessing `current' diff --git a/net/core/filter.c b/net/core/filter.c index e249a499cbe5..3083c7746ee0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2642,6 +2642,19 @@ static const struct bpf_func_proto bpf_msg_pop_data_proto = { .arg4_type = ARG_ANYTHING, }; +#ifdef CONFIG_CGROUP_NET_CLASSID +BPF_CALL_0(bpf_get_cgroup_classid_curr) +{ + return __task_get_classid(current); +} + +static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { + .func = bpf_get_cgroup_classid_curr, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; +#endif + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -6005,6 +6018,10 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -6035,6 +6052,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; -- cgit v1.2.3 From 0f09abd105da6c37713d2b253730a86cb45e127a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 27 Mar 2020 16:58:54 +0100 Subject: bpf: Enable bpf cgroup hooks to retrieve cgroup v2 and ancestor id Enable the bpf_get_current_cgroup_id() helper for connect(), sendmsg(), recvmsg() and bind-related hooks in order to retrieve the cgroup v2 context which can then be used as part of the key for BPF map lookups, for example. Given these hooks operate in process context 'current' is always valid and pointing to the app that is performing mentioned syscalls if it's subject to a v2 cgroup. Also with same motivation of commit 7723628101aa ("bpf: Introduce bpf_skb_ancestor_cgroup_id helper") enable retrieval of ancestor from current so the cgroup id can be used for policy lookups which can then forbid connect() / bind(), for example. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/d2a7ef42530ad299e3cbb245e6c12374b72145ef.1585323121.git.daniel@iogearbox.net --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 21 ++++++++++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 18 ++++++++++++++++++ net/core/filter.c | 12 ++++++++++++ tools/include/uapi/linux/bpf.h | 21 ++++++++++++++++++++- 6 files changed, 72 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 78046c570596..372708eeaecd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1501,6 +1501,7 @@ extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bd81c4555206..222ba11966e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2963,6 +2963,24 @@ union bpf_attr { * instead of sockets. * Return * A 8-byte long opaque number. + * + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3087,7 +3105,8 @@ union bpf_attr { FN(read_branch_records), \ FN(get_ns_current_pid_tgid), \ FN(xdp_output), \ - FN(get_netns_cookie), + FN(get_netns_cookie), \ + FN(get_current_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 914f3463aa41..916f5132a984 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2156,6 +2156,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; +const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 01878db15eaf..bafc53ddd350 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -340,6 +340,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level) +{ + struct cgroup *cgrp = task_dfl_cgroup(current); + struct cgroup *ancestor; + + ancestor = cgroup_ancestor(cgrp, ancestor_level); + if (!ancestor) + return 0; + return cgroup_id(ancestor); +} + +const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { + .func = bpf_get_current_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + #ifdef CONFIG_CGROUP_BPF DECLARE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); diff --git a/net/core/filter.c b/net/core/filter.c index 3083c7746ee0..5cec3ac9e3dd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6018,6 +6018,12 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; @@ -6052,6 +6058,12 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bd81c4555206..222ba11966e3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2963,6 +2963,24 @@ union bpf_attr { * instead of sockets. * Return * A 8-byte long opaque number. + * + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3087,7 +3105,8 @@ union bpf_attr { FN(read_branch_records), \ FN(get_ns_current_pid_tgid), \ FN(xdp_output), \ - FN(get_netns_cookie), + FN(get_netns_cookie), \ + FN(get_current_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 834ebca8456c5f7a03d3351227b7c59318ccab62 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 27 Mar 2020 16:58:55 +0100 Subject: bpf: Enable retrival of pid/tgid/comm from bpf cgroup hooks We already have the bpf_get_current_uid_gid() helper enabled, and given we now have perf event RB output available for connect(), sendmsg(), recvmsg() and bind-related hooks, add a trivial change to enable bpf_get_current_pid_tgid() and bpf_get_current_comm() as well. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/18744744ed93c06343be8b41edcfd858706f39d7.1585323121.git.daniel@iogearbox.net --- net/core/filter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 5cec3ac9e3dd..bb4a196c8809 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6018,6 +6018,10 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; @@ -6058,6 +6062,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; -- cgit v1.2.3 From e9ff9d52540a53ce8c9eff5bf8b66467fe81eb2b Mon Sep 17 00:00:00 2001 From: Jean-Philippe Menil Date: Fri, 27 Mar 2020 21:47:13 +0100 Subject: bpf: Fix build warning regarding missing prototypes Fix build warnings when building net/bpf/test_run.o with W=1 due to missing prototype for bpf_fentry_test{1..6}. Instead of declaring prototypes, turn off warnings with __diag_{push,ignore,pop} as pointed out by Alexei. Signed-off-by: Jean-Philippe Menil Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200327204713.28050-1-jpmenil@gmail.com --- net/bpf/test_run.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 4c921f5154e0..29dbdd4c29f6 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -114,6 +114,9 @@ out: * architecture dependent calling conventions. 7+ can be supported in the * future. */ +__diag_push(); +__diag_ignore(GCC, 8, "-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); int noinline bpf_fentry_test1(int a) { return a + 1; @@ -149,6 +152,7 @@ int noinline bpf_modify_return_test(int a, int *b) *b += 1; return a + *b; } +__diag_pop(); ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); -- cgit v1.2.3 From 92234c8f15c8d96ad7e52afdc5994cba6be68eb9 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Wed, 25 Mar 2020 18:23:26 +0100 Subject: xdp: Support specifying expected existing program when attaching XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While it is currently possible for userspace to specify that an existing XDP program should not be replaced when attaching to an interface, there is no mechanism to safely replace a specific XDP program with another. This patch adds a new netlink attribute, IFLA_XDP_EXPECTED_FD, which can be set along with IFLA_XDP_FD. If set, the kernel will check that the program currently loaded on the interface matches the expected one, and fail the operation if it does not. This corresponds to a 'cmpxchg' memory operation. Setting the new attribute with a negative value means that no program is expected to be attached, which corresponds to setting the UPDATE_IF_NOEXIST flag. A new companion flag, XDP_FLAGS_REPLACE, is also added to explicitly request checking of the EXPECTED_FD attribute. This is needed for userspace to discover whether the kernel supports the new attribute. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/158515700640.92963.3551295145441017022.stgit@toke.dk --- include/linux/netdevice.h | 2 +- include/uapi/linux/if_link.h | 4 +++- net/core/dev.c | 26 +++++++++++++++++++++----- net/core/rtnetlink.c | 14 ++++++++++++++ 4 files changed, 39 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 654808bfad83..b503d468f0df 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3768,7 +3768,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, u32 flags); + int fd, int expected_fd, u32 flags); u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, enum bpf_netdev_command cmd); int xdp_umem_query(struct net_device *dev, u16 queue_id); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 61e0801c82df..c2f768c8d65b 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -972,11 +972,12 @@ enum { #define XDP_FLAGS_SKB_MODE (1U << 1) #define XDP_FLAGS_DRV_MODE (1U << 2) #define XDP_FLAGS_HW_MODE (1U << 3) +#define XDP_FLAGS_REPLACE (1U << 4) #define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ XDP_FLAGS_DRV_MODE | \ XDP_FLAGS_HW_MODE) #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ - XDP_FLAGS_MODES) + XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) /* These are stored into IFLA_XDP_ATTACHED on dump. */ enum { @@ -996,6 +997,7 @@ enum { IFLA_XDP_DRV_PROG_ID, IFLA_XDP_SKB_PROG_ID, IFLA_XDP_HW_PROG_ID, + IFLA_XDP_EXPECTED_FD, __IFLA_XDP_MAX, }; diff --git a/net/core/dev.c b/net/core/dev.c index d84541c24446..651a3c28d33a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8655,15 +8655,17 @@ static void dev_xdp_uninstall(struct net_device *dev) * @dev: device * @extack: netlink extended ack * @fd: new program fd or negative value to clear + * @expected_fd: old program fd that userspace expects to replace or clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, u32 flags) + int fd, int expected_fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; enum bpf_netdev_command query; + u32 prog_id, expected_id = 0; struct bpf_prog *prog = NULL; bpf_op_t bpf_op, bpf_chk; bool offload; @@ -8684,15 +8686,29 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, if (bpf_op == bpf_chk) bpf_chk = generic_xdp_install; - if (fd >= 0) { - u32 prog_id; + prog_id = __dev_xdp_query(dev, bpf_op, query); + if (flags & XDP_FLAGS_REPLACE) { + if (expected_fd >= 0) { + prog = bpf_prog_get_type_dev(expected_fd, + BPF_PROG_TYPE_XDP, + bpf_op == ops->ndo_bpf); + if (IS_ERR(prog)) + return PTR_ERR(prog); + expected_id = prog->aux->id; + bpf_prog_put(prog); + } + if (prog_id != expected_id) { + NL_SET_ERR_MSG(extack, "Active program does not match expected"); + return -EEXIST; + } + } + if (fd >= 0) { if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) { NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time"); return -EEXIST; } - prog_id = __dev_xdp_query(dev, bpf_op, query); if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) { NL_SET_ERR_MSG(extack, "XDP program already attached"); return -EBUSY; @@ -8715,7 +8731,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return 0; } } else { - if (!__dev_xdp_query(dev, bpf_op, query)) + if (!prog_id) return 0; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 14e6ea21c378..709ebbf8ab5b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1872,7 +1872,9 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { }; static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { + [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD }, [IFLA_XDP_FD] = { .type = NLA_S32 }, + [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, @@ -2799,8 +2801,20 @@ static int do_setlink(const struct sk_buff *skb, } if (xdp[IFLA_XDP_FD]) { + int expected_fd = -1; + + if (xdp_flags & XDP_FLAGS_REPLACE) { + if (!xdp[IFLA_XDP_EXPECTED_FD]) { + err = -EINVAL; + goto errout; + } + expected_fd = + nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]); + } + err = dev_change_xdp_fd(dev, extack, nla_get_s32(xdp[IFLA_XDP_FD]), + expected_fd, xdp_flags); if (err) goto errout; -- cgit v1.2.3 From cf7fbe660f2dbd738ab58aea8e9b0ca6ad232449 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Sun, 29 Mar 2020 15:53:38 -0700 Subject: bpf: Add socket assign support Add support for TPROXY via a new bpf helper, bpf_sk_assign(). This helper requires the BPF program to discover the socket via a call to bpf_sk*_lookup_*(), then pass this socket to the new helper. The helper takes its own reference to the socket in addition to any existing reference that may or may not currently be obtained for the duration of BPF processing. For the destination socket to receive the traffic, the traffic must be routed towards that socket via local route. The simplest example route is below, but in practice you may want to route traffic more narrowly (eg by CIDR): $ ip route add local default dev lo This patch avoids trying to introduce an extra bit into the skb->sk, as that would require more invasive changes to all code interacting with the socket to ensure that the bit is handled correctly, such as all error-handling cases along the path from the helper in BPF through to the orphan path in the input. Instead, we opt to use the destructor variable to switch on the prefetch of the socket. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200329225342.16317-2-joe@wand.net.nz --- include/net/sock.h | 11 +++++++++++ include/uapi/linux/bpf.h | 25 ++++++++++++++++++++++++- net/core/filter.c | 31 +++++++++++++++++++++++++++++++ net/core/sock.c | 11 +++++++++++ net/ipv4/ip_input.c | 3 ++- net/ipv6/ip6_input.c | 3 ++- net/sched/act_bpf.c | 3 +++ tools/include/uapi/linux/bpf.h | 25 ++++++++++++++++++++++++- 8 files changed, 108 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index b5cca7bae69b..dc398cee7873 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1659,6 +1659,7 @@ void sock_rfree(struct sk_buff *skb); void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); +void sock_pfree(struct sk_buff *skb); #else #define sock_edemux sock_efree #endif @@ -2526,6 +2527,16 @@ void sock_net_set(struct sock *sk, struct net *net) write_pnet(&sk->sk_net, net); } +static inline bool +skb_sk_is_prefetched(struct sk_buff *skb) +{ +#ifdef CONFIG_INET + return skb->destructor == sock_pfree; +#else + return false; +#endif /* CONFIG_INET */ +} + static inline struct sock *skb_steal_sock(struct sk_buff *skb) { if (skb->sk) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1fbc36f58d3..9f786a5a44ac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2983,6 +2983,28 @@ union bpf_attr { * **bpf_get_current_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * Description + * Assign the *sk* to the *skb*. When combined with appropriate + * routing configuration to receive the packet towards the socket, + * will cause *skb* to be delivered to the specified socket. + * Subsequent redirection of *skb* via **bpf_redirect**\ (), + * **bpf_clone_redirect**\ () or other methods outside of BPF may + * interfere with successful delivery to the socket. + * + * This operation is only valid from TC ingress path. + * + * The *flags* argument must be zero. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EINVAL** Unsupported flags specified. + * * **-ENOENT** Socket is unavailable for assignment. + * * **-ENETUNREACH** Socket is unreachable (wrong netns). + * * **-EOPNOTSUPP** Unsupported operation, for example a + * call from outside of TC ingress. + * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3108,7 +3130,8 @@ union bpf_attr { FN(get_ns_current_pid_tgid), \ FN(xdp_output), \ FN(get_netns_cookie), \ - FN(get_current_ancestor_cgroup_id), + FN(get_current_ancestor_cgroup_id), \ + FN(sk_assign), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index bb4a196c8809..ac5c1633f8d2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5918,6 +5918,35 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) +{ + if (flags != 0) + return -EINVAL; + if (!skb_at_tc_ingress(skb)) + return -EOPNOTSUPP; + if (unlikely(dev_net(skb->dev) != sock_net(sk))) + return -ENETUNREACH; + if (unlikely(sk->sk_reuseport)) + return -ESOCKTNOSUPPORT; + if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) + return -ENOENT; + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_pfree; + + return 0; +} + +static const struct bpf_func_proto bpf_sk_assign_proto = { + .func = bpf_sk_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_SOCK_COMMON, + .arg3_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -6249,6 +6278,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_ecn_set_ce_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_assign_proto; #endif default: return bpf_base_func_proto(func_id); diff --git a/net/core/sock.c b/net/core/sock.c index 0fc8937a7ff4..87e3a03c9056 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2071,6 +2071,17 @@ void sock_efree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_efree); +/* Buffer destructor for prefetch/receive path where reference count may + * not be held, e.g. for listen sockets. + */ +#ifdef CONFIG_INET +void sock_pfree(struct sk_buff *skb) +{ + sock_gen_put(skb->sk); +} +EXPORT_SYMBOL(sock_pfree); +#endif /* CONFIG_INET */ + kuid_t sock_i_uid(struct sock *sk) { kuid_t uid; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index aa438c6758a7..b0c244af1e4d 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -509,7 +509,8 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) IPCB(skb)->iif = skb->skb_iif; /* Must drop socket now because of tproxy. */ - skb_orphan(skb); + if (!skb_sk_is_prefetched(skb)) + skb_orphan(skb); return skb; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 7b089d0ac8cd..e96304d8a4a7 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -285,7 +285,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, rcu_read_unlock(); /* Must drop socket now because of tproxy. */ - skb_orphan(skb); + if (!skb_sk_is_prefetched(skb)) + skb_orphan(skb); return skb; err: diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 46f47e58b3be..54d5652cfe6c 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -53,6 +54,8 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); } + if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) + skb_orphan(skb); rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f1fbc36f58d3..9f786a5a44ac 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2983,6 +2983,28 @@ union bpf_attr { * **bpf_get_current_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * Description + * Assign the *sk* to the *skb*. When combined with appropriate + * routing configuration to receive the packet towards the socket, + * will cause *skb* to be delivered to the specified socket. + * Subsequent redirection of *skb* via **bpf_redirect**\ (), + * **bpf_clone_redirect**\ () or other methods outside of BPF may + * interfere with successful delivery to the socket. + * + * This operation is only valid from TC ingress path. + * + * The *flags* argument must be zero. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EINVAL** Unsupported flags specified. + * * **-ENOENT** Socket is unavailable for assignment. + * * **-ENETUNREACH** Socket is unreachable (wrong netns). + * * **-EOPNOTSUPP** Unsupported operation, for example a + * call from outside of TC ingress. + * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3108,7 +3130,8 @@ union bpf_attr { FN(get_ns_current_pid_tgid), \ FN(xdp_output), \ FN(get_netns_cookie), \ - FN(get_current_ancestor_cgroup_id), + FN(get_current_ancestor_cgroup_id), \ + FN(sk_assign), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 71489e21d720a09388b565d60ef87ae993c10528 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Sun, 29 Mar 2020 15:53:39 -0700 Subject: net: Track socket refcounts in skb_steal_sock() Refactor the UDP/TCP handlers slightly to allow skb_steal_sock() to make the determination of whether the socket is reference counted in the case where it is prefetched by earlier logic such as early_demux. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200329225342.16317-3-joe@wand.net.nz --- include/net/inet6_hashtables.h | 3 +-- include/net/inet_hashtables.h | 3 +-- include/net/sock.h | 10 +++++++++- net/ipv4/udp.c | 6 ++++-- net/ipv6/udp.c | 9 ++++++--- 5 files changed, 21 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index fe96bf247aac..81b965953036 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -85,9 +85,8 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, int iif, int sdif, bool *refcounted) { - struct sock *sk = skb_steal_sock(skb); + struct sock *sk = skb_steal_sock(skb, refcounted); - *refcounted = true; if (sk) return sk; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index d0019d3395cf..ad64ba6a057f 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -379,10 +379,9 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, const int sdif, bool *refcounted) { - struct sock *sk = skb_steal_sock(skb); + struct sock *sk = skb_steal_sock(skb, refcounted); const struct iphdr *iph = ip_hdr(skb); - *refcounted = true; if (sk) return sk; diff --git a/include/net/sock.h b/include/net/sock.h index dc398cee7873..f81d528845f6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2537,15 +2537,23 @@ skb_sk_is_prefetched(struct sk_buff *skb) #endif /* CONFIG_INET */ } -static inline struct sock *skb_steal_sock(struct sk_buff *skb) +/** + * skb_steal_sock + * @skb to steal the socket from + * @refcounted is set to true if the socket is reference-counted + */ +static inline struct sock * +skb_steal_sock(struct sk_buff *skb, bool *refcounted) { if (skb->sk) { struct sock *sk = skb->sk; + *refcounted = true; skb->destructor = NULL; skb->sk = NULL; return sk; } + *refcounted = false; return NULL; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2633fc231593..b4035021bbd3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2288,6 +2288,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, struct rtable *rt = skb_rtable(skb); __be32 saddr, daddr; struct net *net = dev_net(skb->dev); + bool refcounted; /* * Validate the packet. @@ -2313,7 +2314,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp4_csum_init(skb, uh, proto)) goto csum_error; - sk = skb_steal_sock(skb); + sk = skb_steal_sock(skb, &refcounted); if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; @@ -2322,7 +2323,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, udp_sk_rx_dst_set(sk, dst); ret = udp_unicast_rcv_skb(sk, skb, uh); - sock_put(sk); + if (refcounted) + sock_put(sk); return ret; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5dc439a391fe..7d4151747340 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -843,6 +843,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, struct net *net = dev_net(skb->dev); struct udphdr *uh; struct sock *sk; + bool refcounted; u32 ulen = 0; if (!pskb_may_pull(skb, sizeof(struct udphdr))) @@ -879,7 +880,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto csum_error; /* Check if the socket is already available, e.g. due to early demux */ - sk = skb_steal_sock(skb); + sk = skb_steal_sock(skb, &refcounted); if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; @@ -888,12 +889,14 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, udp6_sk_rx_dst_set(sk, dst); if (!uh->check && !udp_sk(sk)->no_check6_rx) { - sock_put(sk); + if (refcounted) + sock_put(sk); goto report_csum_error; } ret = udp6_unicast_rcv_skb(sk, skb, uh); - sock_put(sk); + if (refcounted) + sock_put(sk); return ret; } -- cgit v1.2.3 From 7ae215d23c12a939005f35d1848ca55b6109b9c0 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Sun, 29 Mar 2020 15:53:40 -0700 Subject: bpf: Don't refcount LISTEN sockets in sk_assign() Avoid taking a reference on listen sockets by checking the socket type in the sk_assign and in the corresponding skb_steal_sock() code in the the transport layer, and by ensuring that the prefetch free (sock_pfree) function uses the same logic to check whether the socket is refcounted. Suggested-by: Martin KaFai Lau Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200329225342.16317-4-joe@wand.net.nz --- include/net/sock.h | 25 +++++++++++++++++-------- net/core/filter.c | 6 +++--- net/core/sock.c | 3 ++- 3 files changed, 22 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index f81d528845f6..6d84784d33fa 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2537,6 +2537,21 @@ skb_sk_is_prefetched(struct sk_buff *skb) #endif /* CONFIG_INET */ } +/* This helper checks if a socket is a full socket, + * ie _not_ a timewait or request socket. + */ +static inline bool sk_fullsock(const struct sock *sk) +{ + return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); +} + +static inline bool +sk_is_refcounted(struct sock *sk) +{ + /* Only full sockets have sk->sk_flags. */ + return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); +} + /** * skb_steal_sock * @skb to steal the socket from @@ -2549,6 +2564,8 @@ skb_steal_sock(struct sk_buff *skb, bool *refcounted) struct sock *sk = skb->sk; *refcounted = true; + if (skb_sk_is_prefetched(skb)) + *refcounted = sk_is_refcounted(sk); skb->destructor = NULL; skb->sk = NULL; return sk; @@ -2557,14 +2574,6 @@ skb_steal_sock(struct sk_buff *skb, bool *refcounted) return NULL; } -/* This helper checks if a socket is a full socket, - * ie _not_ a timewait or request socket. - */ -static inline bool sk_fullsock(const struct sock *sk) -{ - return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); -} - /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. * Check decrypted mark in case skb_orphan() cleared socket. diff --git a/net/core/filter.c b/net/core/filter.c index ac5c1633f8d2..7628b947dbc3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5401,8 +5401,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { BPF_CALL_1(bpf_sk_release, struct sock *, sk) { - /* Only full sockets have sk->sk_flags. */ - if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE)) + if (sk_is_refcounted(sk)) sock_gen_put(sk); return 0; } @@ -5928,7 +5927,8 @@ BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) return -ENETUNREACH; if (unlikely(sk->sk_reuseport)) return -ESOCKTNOSUPPORT; - if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) + if (sk_is_refcounted(sk) && + unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) return -ENOENT; skb_orphan(skb); diff --git a/net/core/sock.c b/net/core/sock.c index 87e3a03c9056..da32d9b6d09f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2077,7 +2077,8 @@ EXPORT_SYMBOL(sock_efree); #ifdef CONFIG_INET void sock_pfree(struct sk_buff *skb) { - sock_gen_put(skb->sk); + if (sk_is_refcounted(skb->sk)) + sock_gen_put(skb->sk); } EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ -- cgit v1.2.3