From bc235cdb423a2daed6f337676006a66557429cd1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:15 -0800 Subject: bpf: Prevent deadlock from recursive bpf_task_storage_[get|delete] BPF helpers bpf_task_storage_[get|delete] could hold two locks: bpf_local_storage_map_bucket->lock and bpf_local_storage->lock. Calling these helpers from fentry/fexit programs on functions in bpf_*_storage.c may cause deadlock on either lock. Prevent such deadlock with a per cpu counter, bpf_task_storage_busy. We need this counter to be global, because the two locks here belong to two different objects: bpf_local_storage_map and bpf_local_storage. If we pick one of them as the owner of the counter, it is still possible to trigger deadlock on the other lock. For example, if bpf_local_storage_map owns the counters, it cannot prevent deadlock on bpf_local_storage->lock when two maps are used. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210225234319.336131-3-songliubraving@fb.com --- net/core/bpf_sk_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 4edd033e899c..cc3712ad8716 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -89,7 +89,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, NULL); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) -- cgit v1.2.3 From 887596095ec2a9ea39ffcf98f27bf2e77c5eb512 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:26 -0800 Subject: bpf: Clean up sockmap related Kconfigs As suggested by John, clean up sockmap related Kconfigs: Reduce the scope of CONFIG_BPF_STREAM_PARSER down to TCP stream parser, to 
reflect its name. Make the rest sockmap code simply depend on CONFIG_BPF_SYSCALL and CONFIG_INET, the latter is still needed at this point because of TCP/UDP proto update. And leave CONFIG_NET_SOCK_MSG untouched, as it is used by non-sockmap cases. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-2-xiyou.wangcong@gmail.com --- include/linux/bpf.h | 26 ++++----- include/linux/bpf_types.h | 6 +- include/linux/skmsg.h | 18 ++++++ include/net/tcp.h | 16 ++--- include/net/udp.h | 4 +- init/Kconfig | 1 + net/Kconfig | 6 +- net/core/Makefile | 6 +- net/core/skmsg.c | 145 +++++++++++++++++++++++++--------------------- net/core/sock_map.c | 2 + net/ipv4/Makefile | 2 +- net/ipv4/tcp_bpf.c | 4 +- 12 files changed, 133 insertions(+), 103 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ae2c35641619..2be47ada5f2d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1778,7 +1778,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) } #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ -#if defined(CONFIG_BPF_STREAM_PARSER) +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); @@ -1786,7 +1786,18 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); + +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); #else 
+static inline void bpf_sk_reuseport_detach(struct sock *sk) +{ +} + +#ifdef CONFIG_BPF_SYSCALL static inline int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which) @@ -1811,20 +1822,7 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void { return -EOPNOTSUPP; } -#endif /* CONFIG_BPF_STREAM_PARSER */ -#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) -void bpf_sk_reuseport_detach(struct sock *sk); -int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, - void *value); -int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags); -#else -static inline void bpf_sk_reuseport_detach(struct sock *sk) -{ -} - -#ifdef CONFIG_BPF_SYSCALL static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b9edee336d80..f883f01a5061 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -103,10 +103,6 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) -#if defined(CONFIG_BPF_STREAM_PARSER) -BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) -#endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) #endif @@ -116,6 +112,8 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) #endif #ifdef CONFIG_INET +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) #endif #endif diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 8edbbf5f2f93..db7a08be4725 100644 --- a/include/linux/skmsg.h +++ 
b/include/linux/skmsg.h @@ -71,7 +71,9 @@ struct sk_psock_link { }; struct sk_psock_parser { +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; +#endif bool enabled; void (*saved_data_ready)(struct sock *sk); }; @@ -305,9 +307,25 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err) struct sk_psock *sk_psock_init(struct sock *sk, int node); +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); +#else +static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + return -EOPNOTSUPP; +} + +static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ +} + +static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ +} +#endif + void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); diff --git a/include/net/tcp.h b/include/net/tcp.h index 963cd86d12dd..c00e125dcfb9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2222,25 +2222,27 @@ void tcp_update_ulp(struct sock *sk, struct proto *p, __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) +#ifdef CONFIG_NET_SOCK_MSG struct sk_msg; struct sk_psock; -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); -#else -static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) -{ -} -#endif /* CONFIG_BPF_STREAM_PARSER */ +#endif /* CONFIG_BPF_SYSCALL */ -#ifdef CONFIG_NET_SOCK_MSG int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int 
len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ +#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) +static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) +{ +} +#endif + #ifdef CONFIG_CGROUP_BPF static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, diff --git a/include/net/udp.h b/include/net/udp.h index a132a02b2f2c..d4d064c59232 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -515,9 +515,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, return segs; } -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); -#endif /* BPF_STREAM_PARSER */ +#endif #endif /* _UDP_H */ diff --git a/init/Kconfig b/init/Kconfig index 096e1af5c586..66cef5eac275 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1702,6 +1702,7 @@ config BPF_SYSCALL select BPF select IRQ_WORK select TASKS_TRACE_RCU + select NET_SOCK_MSG if INET default n help Enable the bpf() system call that allows to manipulate eBPF diff --git a/net/Kconfig b/net/Kconfig index 8cea808ad9e8..0ead7ec0d2bd 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -317,13 +317,9 @@ config BPF_STREAM_PARSER select STREAM_PARSER select NET_SOCK_MSG help - Enabling this allows a stream parser to be used with + Enabling this allows a TCP stream parser to be used with BPF_MAP_TYPE_SOCKMAP. - BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets. - It can be used to enforce socket policy, implement socket redirects, - etc. 
- config NET_FLOW_LIMIT bool depends on RPS diff --git a/net/core/Makefile b/net/core/Makefile index 3e2c378e5f31..0c2233c826fd 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,7 +16,6 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o -obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o @@ -28,10 +27,13 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o -obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o +obj-$(CONFIG_BPF_SYSCALL) += sock_map.o +endif obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 1261512d6807..e017744111e1 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -645,15 +645,15 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } +static void sk_psock_done_strp(struct sk_psock *psock); + static void sk_psock_destroy_deferred(struct work_struct *gc) { struct sk_psock *psock = container_of(gc, struct sk_psock, gc); /* No sk_callback_lock since already detached. 
*/ - /* Parser has been stopped */ - if (psock->progs.skb_parser) - strp_done(&psock->parser.strp); + sk_psock_done_strp(psock); cancel_work_sync(&psock->work); @@ -750,14 +750,6 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, return bpf_prog_run_pin_on_cpu(prog, skb); } -static struct sk_psock *sk_psock_from_strp(struct strparser *strp) -{ - struct sk_psock_parser *parser; - - parser = container_of(strp, struct sk_psock_parser, strp); - return container_of(parser, struct sk_psock, parser); -} - static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; @@ -866,6 +858,24 @@ out_free: } } +static void sk_psock_write_space(struct sock *sk) +{ + struct sk_psock *psock; + void (*write_space)(struct sock *sk) = NULL; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + } + rcu_read_unlock(); + if (write_space) + write_space(sk); +} + +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock; @@ -897,6 +907,14 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err) return err; } +static struct sk_psock *sk_psock_from_strp(struct strparser *strp) +{ + struct sk_psock_parser *parser; + + parser = container_of(strp, struct sk_psock_parser, strp); + return container_of(parser, struct sk_psock, parser); +} + static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock = sk_psock_from_strp(strp); @@ -933,6 +951,56 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_unlock(); } +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + static const struct strp_callbacks cb = { + .rcv_msg = sk_psock_strp_read, + .read_sock_done = sk_psock_strp_read_done, + .parse_msg = sk_psock_strp_parse, + }; + + 
psock->parser.enabled = false; + return strp_init(&psock->parser.strp, sk, &cb); +} + +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + strp_stop(&parser->strp); + parser->enabled = false; +} + +static void sk_psock_done_strp(struct sk_psock *psock) +{ + /* Parser has been stopped */ + if (psock->progs.skb_parser) + strp_done(&psock->parser.strp); +} +#else +static void sk_psock_done_strp(struct sk_psock *psock) +{ +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, unsigned int offset, size_t orig_len) { @@ -984,35 +1052,6 @@ static void sk_psock_verdict_data_ready(struct sock *sk) sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); } -static void sk_psock_write_space(struct sock *sk) -{ - struct sk_psock *psock; - void (*write_space)(struct sock *sk) = NULL; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) { - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) - schedule_work(&psock->work); - write_space = psock->saved_write_space; - } - rcu_read_unlock(); - if (write_space) - write_space(sk); -} - -int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) -{ - static const struct strp_callbacks cb = { - .rcv_msg = sk_psock_strp_read, - .read_sock_done = sk_psock_strp_read_done, - .parse_msg = sk_psock_strp_parse, - }; - - psock->parser.enabled = false; - return strp_init(&psock->parser.strp, sk, &cb); -} - void sk_psock_start_verdict(struct sock *sk, 
struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; @@ -1026,32 +1065,6 @@ void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) parser->enabled = true; } -void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) - return; - - parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_strp_data_ready; - sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; -} - -void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) - return; - - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - strp_stop(&parser->strp); - parser->enabled = false; -} - void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index d758fb83c884..ee3334dd3a38 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1461,9 +1461,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, case BPF_SK_MSG_VERDICT: pprog = &progs->msg_parser; break; +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: pprog = &progs->skb_parser; break; +#endif case BPF_SK_SKB_STREAM_VERDICT: pprog = &progs->skb_verdict; break; diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 5b77a46885b9..bbdd9c44f14e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -62,7 +62,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o -obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o +obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c 
index bc7d2a586e18..17c322b875fd 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -229,7 +229,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, } EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL static bool tcp_bpf_stream_read(const struct sock *sk) { struct sk_psock *psock; @@ -629,4 +629,4 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE]) newsk->sk_prot = sk->sk_prot_creator; } -#endif /* CONFIG_BPF_STREAM_PARSER */ +#endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From 5a685cd94b21a88efa6be77169eddef525368034 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:27 -0800 Subject: skmsg: Get rid of struct sk_psock_parser struct sk_psock_parser is embedded in sk_psock, it is unnecessary as skb verdict also uses ->saved_data_ready. We can simply fold these fields into sk_psock, and get rid of ->enabled. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-3-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 19 +++++++----------- net/core/skmsg.c | 53 ++++++++++++++++----------------------------------- net/core/sock_map.c | 8 ++++---- 3 files changed, 27 insertions(+), 53 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index db7a08be4725..22e26f82de33 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -70,14 +70,6 @@ struct sk_psock_link { void *link_raw; }; -struct sk_psock_parser { -#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) - struct strparser strp; -#endif - bool enabled; - void (*saved_data_ready)(struct sock *sk); -}; - struct sk_psock_work_state { struct sk_buff *skb; u32 len; @@ -92,7 +84,9 @@ struct sk_psock { u32 eval; struct sk_msg *cork; struct sk_psock_progs progs; - struct sk_psock_parser parser; +#if 
IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + struct strparser strp; +#endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; unsigned long state; @@ -102,6 +96,7 @@ struct sk_psock { void (*saved_unhash)(struct sock *sk); void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); + void (*saved_data_ready)(struct sock *sk); struct proto *sk_proto; struct sk_psock_work_state work_state; struct work_struct work; @@ -418,8 +413,8 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { - if (psock->parser.enabled) - psock->parser.saved_data_ready(sk); + if (psock->saved_data_ready) + psock->saved_data_ready(sk); else sk->sk_data_ready(sk); } @@ -458,6 +453,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) { if (!psock) return false; - return psock->parser.enabled; + return !!psock->saved_data_ready; } #endif /* _LINUX_SKMSG_H */ diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e017744111e1..d00c9a4b47e7 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -907,17 +907,9 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err) return err; } -static struct sk_psock *sk_psock_from_strp(struct strparser *strp) -{ - struct sk_psock_parser *parser; - - parser = container_of(strp, struct sk_psock_parser, strp); - return container_of(parser, struct sk_psock, parser); -} - static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { - struct sk_psock *psock = sk_psock_from_strp(strp); + struct sk_psock *psock = container_of(strp, struct sk_psock, strp); struct bpf_prog *prog; int ret = skb->len; @@ -941,10 +933,10 @@ static void sk_psock_strp_data_ready(struct sock *sk) psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { - psock->parser.saved_data_ready(sk); + psock->saved_data_ready(sk); } else { write_lock_bh(&sk->sk_callback_lock); - 
strp_data_ready(&psock->parser.strp); + strp_data_ready(&psock->strp); write_unlock_bh(&sk->sk_callback_lock); } } @@ -959,41 +951,34 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) .parse_msg = sk_psock_strp_parse, }; - psock->parser.enabled = false; - return strp_init(&psock->parser.strp, sk, &cb); + return strp_init(&psock->strp, sk, &cb); } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) + if (psock->saved_data_ready) return; - parser->saved_data_ready = sk->sk_data_ready; + psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; } void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) + if (!psock->saved_data_ready) return; - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - strp_stop(&parser->strp); - parser->enabled = false; + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; + strp_stop(&psock->strp); } static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ if (psock->progs.skb_parser) - strp_done(&psock->parser.strp); + strp_done(&psock->strp); } #else static void sk_psock_done_strp(struct sk_psock *psock) @@ -1054,25 +1039,19 @@ static void sk_psock_verdict_data_ready(struct sock *sk) void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) + if (psock->saved_data_ready) return; - parser->saved_data_ready = sk->sk_data_ready; + psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_verdict_data_ready; sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; } void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { - struct 
sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) + if (!psock->saved_data_ready) return; - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - parser->enabled = false; + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ee3334dd3a38..1a28a5c2c61e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk, struct bpf_map *map = link->map; struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - if (psock->parser.enabled && stab->progs.skb_parser) + if (psock->saved_data_ready && stab->progs.skb_parser) strp_stop = true; - if (psock->parser.enabled && stab->progs.skb_verdict) + if (psock->saved_data_ready && stab->progs.skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -283,14 +283,14 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_parser && skb_verdict && !psock->parser.enabled) { + if (skb_parser && skb_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) goto out_unlock_drop; psock_set_prog(&psock->progs.skb_verdict, skb_verdict); psock_set_prog(&psock->progs.skb_parser, skb_parser); sk_psock_start_strp(sk, psock); - } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { + } else if (!skb_parser && skb_verdict && !psock->saved_data_ready) { psock_set_prog(&psock->progs.skb_verdict, skb_verdict); sk_psock_start_verdict(sk,psock); } -- cgit v1.2.3 From 16137b09a66f2b75090f1e56a9ba0e27ef845ebc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:28 -0800 Subject: bpf: Compute data_end dynamically with JIT code Currently, we compute ->data_end with a compile-time constant offset of skb. 
But as Jakub pointed out, we can actually compute it in eBPF JIT code at run-time, so that we can completely get rid of ->data_end. This is similar to skb_shinfo(skb) computation in bpf_convert_shinfo_access(). Suggested-by: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-4-xiyou.wangcong@gmail.com --- include/net/tcp.h | 6 ------ net/core/filter.c | 48 ++++++++++++++++++++++++++++-------------------- net/core/skmsg.c | 1 - 3 files changed, 28 insertions(+), 27 deletions(-) (limited to 'net/core') diff --git a/include/net/tcp.h b/include/net/tcp.h index c00e125dcfb9..947ef5da6867 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -886,18 +886,12 @@ struct tcp_skb_cb { struct { __u32 flags; struct sock *sk_redir; - void *data_end; } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); -} - static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; diff --git a/net/core/filter.c b/net/core/filter.c index adfdad234674..13bcf248ee7b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1863,10 +1863,7 @@ static const struct bpf_func_proto bpf_sk_fullsock_proto = { static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { - int err = __bpf_try_make_writable(skb, write_len); - - bpf_compute_data_end_sk_skb(skb); - return err; + return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) @@ -3577,7 +3574,6 @@ BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOMEM; __skb_pull(skb, len_diff_abs); } - bpf_compute_data_end_sk_skb(skb); if (tls_sw_has_ctx_rx(skb->sk)) { struct strp_msg *rxm = strp_msg(skb); @@ 
-3742,10 +3738,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { - int ret = __bpf_skb_change_tail(skb, new_len, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_tail(skb, new_len, flags); } static const struct bpf_func_proto sk_skb_change_tail_proto = { @@ -3808,10 +3801,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { - int ret = __bpf_skb_change_head(skb, head_room, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_head(skb, head_room, flags); } static const struct bpf_func_proto sk_skb_change_head_proto = { @@ -9655,22 +9645,40 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +/* data_end = skb->data + skb_headlen() */ +static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb->data */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, data)); + /* AX = skb->len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, len)); + /* si->dst_reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + /* AX = skb->data_len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, data_len)); + /* si->dst_reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX); + + return insn; +} + static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; - int off; switch (si->off) 
{ case offsetof(struct __sk_buff, data_end): - off = si->off; - off -= offsetof(struct __sk_buff, data_end); - off += offsetof(struct sk_buff, cb); - off += offsetof(struct tcp_skb_cb, bpf.data_end); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, - si->src_reg, off); + insn = bpf_convert_data_end_access(si, insn); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d00c9a4b47e7..8822001ab3dc 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -746,7 +746,6 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, struct sk_buff *skb) { - bpf_compute_data_end_sk_skb(skb); return bpf_prog_run_pin_on_cpu(prog, skb); } -- cgit v1.2.3 From e3526bb92a2084cdaec6cb2855bcec98b280426c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:29 -0800 Subject: skmsg: Move sk_redir from TCP_SKB_CB to skb Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly does not work for any other non-TCP protocols. We can move them to skb ext, but it introduces a memory allocation on fast path. Fortunately, we only need a word-sized field to store all the information, because the flags actually only contain 1 bit, so they can just be packed into the lowest bit of the "pointer", which is stored as unsigned long. Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is no longer needed after ->sk_data_ready() so we can just drop it. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-5-xiyou.wangcong@gmail.com --- include/linux/skbuff.h | 3 +++ include/linux/skmsg.h | 38 ++++++++++++++++++++++++++++++++++++++ include/net/tcp.h | 19 ------------------- net/core/skmsg.c | 31 +++++++++++++++++++------------ net/core/sock_map.c | 8 ++------ 5 files changed, 62 insertions(+), 37 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6d0a33d1c0db..bd84f799c952 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -755,6 +755,9 @@ struct sk_buff { void (*destructor)(struct sk_buff *skb); }; struct list_head tcp_tsorted_anchor; +#ifdef CONFIG_NET_SOCK_MSG + unsigned long _sk_redir; +#endif }; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 22e26f82de33..e0de45527bb6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -455,4 +455,42 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return false; return !!psock->saved_data_ready; } + +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) + +/* We only have one bit so far. 
*/ +#define BPF_F_PTR_MASK ~(BPF_F_INGRESS) + +static inline bool skb_bpf_ingress(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return sk_redir & BPF_F_INGRESS; +} + +static inline void skb_bpf_set_ingress(struct sk_buff *skb) +{ + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, + bool ingress) +{ + skb->_sk_redir = (unsigned long)sk_redir; + if (ingress) + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return (struct sock *)(sk_redir & BPF_F_PTR_MASK); +} + +static inline void skb_bpf_redirect_clear(struct sk_buff *skb) +{ + skb->_sk_redir = 0; +} +#endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 947ef5da6867..075de26f449d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -883,30 +883,11 @@ struct tcp_skb_cb { struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ - struct { - __u32 flags; - struct sock *sk_redir; - } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; -} - -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.sk_redir; -} - -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; -} - extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8822001ab3dc..409258367bea 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -525,7 +525,8 @@ static void sk_psock_backlog(struct work_struct *work) len = skb->len; off = 0; start: - ingress = tcp_skb_bpf_ingress(skb); + ingress = skb_bpf_ingress(skb); + 
skb_bpf_redirect_clear(skb); do { ret = -EIO; if (likely(psock->sk->sk_socket)) @@ -631,7 +632,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock) static void sk_psock_zap_ingress(struct sk_psock *psock) { - __skb_queue_purge(&psock->ingress_skb); + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { + skb_bpf_redirect_clear(skb); + kfree_skb(skb); + } __sk_psock_purge_ingress_msg(psock); } @@ -754,7 +760,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) struct sk_psock *psock_other; struct sock *sk_other; - sk_other = tcp_skb_bpf_redirect_fetch(skb); + sk_other = skb_bpf_redirect_fetch(skb); /* This error is a buggy BPF program, it returned a redirect * return code, but then didn't set a redirect interface. */ @@ -804,9 +810,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) * TLS context. */ skb->sk = psock->sk; - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_tls_verdict_apply(skb, psock->sk, ret); @@ -818,7 +825,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { - struct tcp_skb_cb *tcp; struct sock *sk_other; int err = -EIO; @@ -830,8 +836,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, goto out_free; } - tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; + skb_bpf_set_ingress(skb); /* If the queue is empty then we can submit directly * into the msg queue. 
If its not empty we have to @@ -892,9 +897,10 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: @@ -1011,9 +1017,10 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 1a28a5c2c61e..dbfcd7006338 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -657,7 +657,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -667,8 +666,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } @@ -1250,7 +1248,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = { BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ 
-1260,8 +1257,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } -- cgit v1.2.3 From ae8b8332fbb512f53bf50ff6a7586dd0f90ed18a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:30 -0800 Subject: sock_map: Rename skb_parser and skb_verdict These two eBPF programs are tied to BPF_SK_SKB_STREAM_PARSER and BPF_SK_SKB_STREAM_VERDICT, rename them to reflect the fact they are only used for TCP. And save the name 'skb_verdict' for general use later. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-6-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 8 +-- net/core/skmsg.c | 14 ++--- net/core/sock_map.c | 60 +++++++++++----------- .../selftests/bpf/prog_tests/sockmap_listen.c | 8 +-- .../selftests/bpf/progs/test_sockmap_listen.c | 4 +- 5 files changed, 47 insertions(+), 47 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e0de45527bb6..d9f6ec4a9cf2 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -56,8 +56,8 @@ struct sk_msg { struct sk_psock_progs { struct bpf_prog *msg_parser; - struct bpf_prog *skb_parser; - struct bpf_prog *skb_verdict; + struct bpf_prog *stream_parser; + struct bpf_prog *stream_verdict; }; enum sk_psock_state_bits { @@ -443,8 +443,8 @@ static inline int psock_replace_prog(struct bpf_prog **pprog, static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); - psock_set_prog(&progs->skb_parser, NULL); - psock_set_prog(&progs->skb_verdict, NULL); + psock_set_prog(&progs->stream_parser, NULL); + psock_set_prog(&progs->stream_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock 
*psock, struct sk_buff *skb); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 409258367bea..35f9caa3b125 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -691,9 +691,9 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); rcu_assign_sk_user_data(sk, NULL); - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.skb_verdict) + else if (psock->progs.stream_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); @@ -803,7 +803,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) int ret = __SK_PASS; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { /* We skip full set_owner_r here because if we do a SK_PASS * or SK_DROP we can skip skb memory accounting and use the @@ -895,7 +895,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) goto out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); @@ -919,7 +919,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) int ret = skb->len; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_parser); + prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; ret = sk_psock_bpf_run(psock, prog, skb); @@ -982,7 +982,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) strp_done(&psock->strp); } #else @@ -1015,7 +1015,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, goto 
out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index dbfcd7006338..69785070f02d 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk, struct bpf_map *map = link->map; struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - if (psock->saved_data_ready && stab->progs.skb_parser) + if (psock->saved_data_ready && stab->progs.stream_parser) strp_stop = true; - if (psock->saved_data_ready && stab->progs.skb_verdict) + if (psock->saved_data_ready && stab->progs.stream_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -224,23 +224,23 @@ out: static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { - struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; + struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; struct sk_psock *psock; int ret; - skb_verdict = READ_ONCE(progs->skb_verdict); - if (skb_verdict) { - skb_verdict = bpf_prog_inc_not_zero(skb_verdict); - if (IS_ERR(skb_verdict)) - return PTR_ERR(skb_verdict); + stream_verdict = READ_ONCE(progs->stream_verdict); + if (stream_verdict) { + stream_verdict = bpf_prog_inc_not_zero(stream_verdict); + if (IS_ERR(stream_verdict)) + return PTR_ERR(stream_verdict); } - skb_parser = READ_ONCE(progs->skb_parser); - if (skb_parser) { - skb_parser = bpf_prog_inc_not_zero(skb_parser); - if (IS_ERR(skb_parser)) { - ret = PTR_ERR(skb_parser); - goto out_put_skb_verdict; + stream_parser = READ_ONCE(progs->stream_parser); + if (stream_parser) { + stream_parser = bpf_prog_inc_not_zero(stream_parser); + if (IS_ERR(stream_parser)) { + ret = PTR_ERR(stream_parser); + goto out_put_stream_verdict; } } @@ -249,7 +249,7 @@ static int sock_map_link(struct bpf_map *map, struct 
sk_psock_progs *progs, msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); - goto out_put_skb_parser; + goto out_put_stream_parser; } } @@ -261,8 +261,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_parser && READ_ONCE(psock->progs.skb_parser)) || - (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { + (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; @@ -283,15 +283,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_parser && skb_verdict && !psock->saved_data_ready) { + if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) goto out_unlock_drop; - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); - psock_set_prog(&psock->progs.skb_parser, skb_parser); + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); + psock_set_prog(&psock->progs.stream_parser, stream_parser); sk_psock_start_strp(sk, psock); - } else if (!skb_parser && skb_verdict && !psock->saved_data_ready) { - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); } write_unlock_bh(&sk->sk_callback_lock); @@ -303,12 +303,12 @@ out_drop: out_progs: if (msg_parser) bpf_prog_put(msg_parser); -out_put_skb_parser: - if (skb_parser) - bpf_prog_put(skb_parser); -out_put_skb_verdict: - if (skb_verdict) - bpf_prog_put(skb_verdict); +out_put_stream_parser: + if (stream_parser) + bpf_prog_put(stream_parser); +out_put_stream_verdict: + if (stream_verdict) + bpf_prog_put(stream_verdict); return 
ret; } @@ -1459,11 +1459,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - pprog = &progs->skb_parser; + pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: - pprog = &progs->skb_verdict; + pprog = &progs->stream_verdict; break; default: return -EOPNOTSUPP; diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index d7d65a700799..c26e6bf05e49 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1014,8 +1014,8 @@ static void test_skb_redir_to_connected(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; @@ -1125,8 +1125,8 @@ static void test_skb_redir_to_listening(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c index a3a366c57ce1..fa221141e9c1 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c @@ -31,13 +31,13 
@@ struct { static volatile bool test_sockmap; /* toggled by user-space */ SEC("sk_skb/stream_parser") -int prog_skb_parser(struct __sk_buff *skb) +int prog_stream_parser(struct __sk_buff *skb) { return skb->len; } SEC("sk_skb/stream_verdict") -int prog_skb_verdict(struct __sk_buff *skb) +int prog_stream_verdict(struct __sk_buff *skb) { unsigned int *count; __u32 zero = 0; -- cgit v1.2.3 From 4675e234b9e15159894b90ead9340e1dc202b670 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:31 -0800 Subject: sock_map: Make sock_map_prog_update() static It is only used within sock_map.c so can become static. Suggested-by: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-7-xiyou.wangcong@gmail.com --- include/linux/bpf.h | 9 --------- net/core/sock_map.c | 7 +++++-- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2be47ada5f2d..e1e4d2f60527 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1779,8 +1779,6 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); @@ -1798,13 +1796,6 @@ static inline void bpf_sk_reuseport_detach(struct sock *sk) } #ifdef CONFIG_BPF_SYSCALL -static inline int sock_map_prog_update(struct bpf_map *map, - struct bpf_prog *prog, - struct bpf_prog *old, u32 which) -{ - return -EOPNOTSUPP; -} - static inline int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog 
*prog) { diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 69785070f02d..dd53a7771d7e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,6 +24,9 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which); + static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; @@ -1444,8 +1447,8 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog **pprog; -- cgit v1.2.3 From cd81cefb1abc52bd164f4d9760cd22eadc0e4468 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:32 -0800 Subject: skmsg: Make __sk_psock_purge_ingress_msg() static It is only used within skmsg.c so can become static. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-8-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 -- net/core/skmsg.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d9f6ec4a9cf2..676d48e08159 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -340,8 +340,6 @@ static inline void sk_psock_free_link(struct sk_psock_link *link) struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); -void __sk_psock_purge_ingress_msg(struct sk_psock *psock); - static inline void sk_psock_cork_free(struct sk_psock *psock) { if (psock->cork) { diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 35f9caa3b125..46e29d2c0c48 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -619,7 +619,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) return link; } -void __sk_psock_purge_ingress_msg(struct sk_psock *psock) +static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) { struct sk_msg *msg, *tmp; -- cgit v1.2.3 From 533342322276b06b4db260c413ce907238851e9b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:33 -0800 Subject: skmsg: Get rid of sk_psock_bpf_run() It is now nearly identical to bpf_prog_run_pin_on_cpu() and it has an unused parameter 'psock', so we can just get rid of it and call bpf_prog_run_pin_on_cpu() directly. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-9-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 46e29d2c0c48..07f54015238a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -749,12 +749,6 @@ out: } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); -static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, - struct sk_buff *skb) -{ - return bpf_prog_run_pin_on_cpu(prog, skb); -} - static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; @@ -812,7 +806,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) skb->sk = psock->sk; skb_dst_drop(skb); skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } @@ -899,7 +893,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); @@ -922,7 +916,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); skb->sk = NULL; } rcu_read_unlock(); @@ -1019,7 +1013,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, 
skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); -- cgit v1.2.3 From 7c32e8f8bc33a5f4b113a630857e46634e3e143b Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 3 Mar 2021 10:18:13 +0000 Subject: bpf: Add PROG_TEST_RUN support for sk_lookup programs Allow passing sk_lookup programs to PROG_TEST_RUN. User space provides the full bpf_sk_lookup struct as context. Since the context includes a socket pointer that can't be exposed to user space we define that PROG_TEST_RUN returns the cookie of the selected socket or zero in place of the socket pointer. We don't support testing programs that select a reuseport socket, since this would mean running another (unrelated) BPF program from the sk_lookup test handler. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210303101816.36774-3-lmb@cloudflare.com --- include/linux/bpf.h | 10 ++++ include/uapi/linux/bpf.h | 5 +- net/bpf/test_run.c | 105 +++++++++++++++++++++++++++++++++++++++++ net/core/filter.c | 1 + tools/include/uapi/linux/bpf.h | 5 +- 5 files changed, 124 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4c730863fa77..c931bc97019d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1491,6 +1491,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); @@ -1692,6 +1695,13 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, return -ENOTSUPP; } +static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ +
return -ENOTSUPP; +} + static inline void bpf_map_put(struct bpf_map *map) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63a56ed6a785..7f530e349aff 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5953,7 +5953,10 @@ struct bpf_pidns_info { /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ struct bpf_sk_lookup { - __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index eb3c78cd4d7c..0abdd67f44b1 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,8 +10,10 @@ #include #include #include +#include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -781,3 +783,106 @@ out: kfree(data); return ret; } + +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + struct bpf_test_timer t = { NO_PREEMPT }; + struct bpf_prog_array *progs = NULL; + struct bpf_sk_lookup_kern ctx = {}; + u32 repeat = kattr->test.repeat; + struct bpf_sk_lookup *user_ctx; + u32 retval, duration; + int ret = -EINVAL; + + if (prog->type != BPF_PROG_TYPE_SK_LOOKUP) + return -EINVAL; + + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + + if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || + kattr->test.data_size_out) + return -EINVAL; + + if (!repeat) + repeat = 1; + + user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx)); + if (IS_ERR(user_ctx)) + return PTR_ERR(user_ctx); + + if (!user_ctx) + return -EINVAL; + + if (user_ctx->sk) + goto out; + + if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) + goto out; + + if (user_ctx->local_port 
> U16_MAX || user_ctx->remote_port > U16_MAX) { + ret = -ERANGE; + goto out; + } + + ctx.family = (u16)user_ctx->family; + ctx.protocol = (u16)user_ctx->protocol; + ctx.dport = (u16)user_ctx->local_port; + ctx.sport = (__force __be16)user_ctx->remote_port; + + switch (ctx.family) { + case AF_INET: + ctx.v4.daddr = (__force __be32)user_ctx->local_ip4; + ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4; + break; + +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6; + ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6; + break; +#endif + + default: + ret = -EAFNOSUPPORT; + goto out; + } + + progs = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!progs) { + ret = -ENOMEM; + goto out; + } + + progs->items[0].prog = prog; + + bpf_test_timer_enter(&t); + do { + ctx.selected_sk = NULL; + retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN); + } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + bpf_test_timer_leave(&t); + + if (ret < 0) + goto out; + + user_ctx->cookie = 0; + if (ctx.selected_sk) { + if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) { + ret = -EOPNOTSUPP; + goto out; + } + + user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); + } + + ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); + +out: + bpf_prog_array_free(progs); + kfree(user_ctx); + return ret; +} diff --git a/net/core/filter.c b/net/core/filter.c index 13bcf248ee7b..a526db494c62 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10457,6 +10457,7 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, } const struct bpf_prog_ops sk_lookup_prog_ops = { + .test_run = bpf_prog_test_run_sk_lookup, }; const struct bpf_verifier_ops sk_lookup_verifier_ops = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 63a56ed6a785..7f530e349aff 100644 --- 
a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5953,7 +5953,10 @@ struct bpf_pidns_info { /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ struct bpf_sk_lookup { - __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ -- cgit v1.2.3 From d01b59c9ae94560fbcceaafeef39784d72765033 Mon Sep 17 00:00:00 2001 From: Xuesen Huang Date: Thu, 4 Mar 2021 14:40:46 +0800 Subject: bpf: Add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packet encapsulation. But that is not appropriate when pushing an Ethernet header. Add an option to further specify the encap L2 type and set the inner_protocol to ETH_P_TEB. Suggested-by: Willem de Bruijn Signed-off-by: Xuesen Huang Signed-off-by: Zhiyong Cheng Signed-off-by: Li Wang Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Link: https://lore.kernel.org/bpf/20210304064046.6232-1-hxseverything@gmail.com --- include/uapi/linux/bpf.h | 5 +++++ net/core/filter.c | 11 ++++++++++- tools/include/uapi/linux/bpf.h | 5 +++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7f530e349aff..2d3036e292a9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2484,6 +2484,10 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * * A call to this helper is susceptible to change the underlying * packet buffer.
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -4916,6 +4920,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index a526db494c62..588b19ba0da8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3409,6 +3409,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_ADJ_ROOM_ENCAP_L2_MASK)) @@ -3445,6 +3446,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) return -EINVAL; + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && + inner_mac_len < ETH_HLEN) + return -EINVAL; + if (skb->encapsulation) return -EALREADY; @@ -3463,7 +3468,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; - skb_set_inner_protocol(skb, skb->protocol); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + else + skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_set_network_header(skb, mac_len); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7f530e349aff..2d3036e292a9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2484,6 +2484,10 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. 
+ * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -4916,6 +4920,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), }; enum { -- cgit v1.2.3 From e6a4750ffe9d701c4d55212b14b615e63571d235 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 8 Mar 2021 12:29:06 +0100 Subject: bpf, xdp: Make bpf_redirect_map() a map operation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the bpf_redirect_map() implementation dispatches to the correct map-lookup function via a switch-statement. To avoid the dispatching, this change adds bpf_redirect_map() as a map operation. Each map provides its own bpf_redirect_map() version, and the correct function is automatically selected by the BPF verifier. A nice side-effect of the code movement is that the map lookup functions are now local to the map implementation files, which removes one additional function call.
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210308112907.559576-2-bjorn.topel@gmail.com --- include/linux/bpf.h | 26 ++++++-------------------- include/linux/filter.h | 27 +++++++++++++++++++++++++++ include/net/xdp_sock.h | 19 ------------------- kernel/bpf/cpumap.c | 8 +++++++- kernel/bpf/devmap.c | 16 ++++++++++++++-- kernel/bpf/verifier.c | 13 +++++++++++-- net/core/filter.c | 39 +-------------------------------------- net/xdp/xskmap.c | 16 ++++++++++++++++ 8 files changed, 82 insertions(+), 82 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c931bc97019d..a25730eaa148 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -118,6 +118,9 @@ struct bpf_map_ops { void *owner, u32 size); struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); + /* Misc helpers.*/ + int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags); + /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure * an inner map can be inserted to an outer map. 
@@ -1450,9 +1453,9 @@ struct btf *bpf_get_btf_vmlinux(void); /* Map specifics */ struct xdp_buff; struct sk_buff; +struct bpf_dtab_netdev; +struct bpf_cpu_map_entry; -struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); -struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); void __dev_flush(void); int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -1462,7 +1465,6 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); bool dev_map_can_have_prog(struct bpf_map *map); -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -1593,17 +1595,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } -static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} - -static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} static inline bool dev_map_can_have_prog(struct bpf_map *map) { return false; @@ -1615,6 +1606,7 @@ static inline void __dev_flush(void) struct xdp_buff; struct bpf_dtab_netdev; +struct bpf_cpu_map_entry; static inline int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, @@ -1639,12 +1631,6 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, return 0; } -static inline -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) -{ - return NULL; -} - static inline void __cpu_map_flush(void) { } diff --git a/include/linux/filter.h b/include/linux/filter.h index 3b00fc906ccd..008691fd3b58 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1472,4 +1472,31 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* 
IS_ENABLED(CONFIG_IPV6) */ +static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags, + void *lookup_elem(struct bpf_map *map, u32 key)) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + /* Lower bits of the flags are used as return code on lookup failure */ + if (unlikely(flags > XDP_TX)) + return XDP_ABORTED; + + ri->tgt_value = lookup_elem(map, ifindex); + if (unlikely(!ri->tgt_value)) { + /* If the lookup fails we want to clear out the state in the + * redirect_info struct completely, so that if an eBPF program + * performs multiple lookups, the last one always takes + * precedence. + */ + WRITE_ONCE(ri->map, NULL); + return flags; + } + + ri->flags = flags; + ri->tgt_index = ifindex; + WRITE_ONCE(ri->map, map); + + return XDP_REDIRECT; +} + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index cc17bc957548..9c0722c6d7ac 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -80,19 +80,6 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs; - - if (key >= map->max_entries) - return NULL; - - xs = READ_ONCE(m->xsk_map[key]); - return xs; -} - #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) @@ -109,12 +96,6 @@ static inline void __xsk_map_flush(void) { } -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} - #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 5d1469de6921..7352d4160b7f 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -563,7 +563,7 @@ static void cpu_map_free(struct bpf_map *map) 
kfree(cmap); } -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; @@ -600,6 +600,11 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem); +} + static int cpu_map_btf_id; const struct bpf_map_ops cpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -612,6 +617,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_cpu_map", .map_btf_id = &cpu_map_btf_id, + .map_redirect = cpu_map_redirect, }; static void bq_flush_to_queue(struct xdp_bulk_queue *bq) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 85d9d1b72a33..f7f42448259f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -258,7 +258,7 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct hlist_head *head = dev_map_index_hash(dtab, key); @@ -392,7 +392,7 @@ void __dev_flush(void) * update happens in parallel here a dev_put wont happen until after reading the * ifindex. 
*/ -struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *obj; @@ -735,6 +735,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem); +} + +static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem); +} + static int dev_map_btf_id; const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -747,6 +757,7 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_dtab", .map_btf_id = &dev_map_btf_id, + .map_redirect = dev_map_redirect, }; static int dev_map_hash_map_btf_id; @@ -761,6 +772,7 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_dtab", .map_btf_id = &dev_map_hash_map_btf_id, + .map_redirect = dev_hash_map_redirect, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9fe90ce52a65..97eb0b2435b8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5582,7 +5582,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_push_elem && func_id != BPF_FUNC_map_pop_elem && func_id != BPF_FUNC_map_peek_elem && - func_id != BPF_FUNC_for_each_map_elem) + func_id != BPF_FUNC_for_each_map_elem && + func_id != BPF_FUNC_redirect_map) return 0; if (map == NULL) { @@ -12017,7 +12018,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->imm == BPF_FUNC_map_delete_elem || insn->imm == BPF_FUNC_map_push_elem || insn->imm 
== BPF_FUNC_map_pop_elem || - insn->imm == BPF_FUNC_map_peek_elem)) { + insn->imm == BPF_FUNC_map_peek_elem || + insn->imm == BPF_FUNC_redirect_map)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -12059,6 +12061,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_redirect, + (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL)); + patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: @@ -12085,6 +12090,10 @@ patch_map_ops_generic: insn->imm = BPF_CAST_CALL(ops->map_peek_elem) - __bpf_call_base; continue; + case BPF_FUNC_redirect_map: + insn->imm = BPF_CAST_CALL(ops->map_redirect) - + __bpf_call_base; + continue; } goto patch_call_imm; diff --git a/net/core/filter.c b/net/core/filter.c index 588b19ba0da8..183b0aa6b027 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3943,22 +3943,6 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); -static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) -{ - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - return __dev_map_lookup_elem(map, index); - case BPF_MAP_TYPE_DEVMAP_HASH: - return __dev_map_hash_lookup_elem(map, index); - case BPF_MAP_TYPE_CPUMAP: - return __cpu_map_lookup_elem(map, index); - case BPF_MAP_TYPE_XSKMAP: - return __xsk_map_lookup_elem(map, index); - default: - return NULL; - } -} - void bpf_clear_redirect_map(struct bpf_map *map) { struct bpf_redirect_info *ri; @@ -4112,28 +4096,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - - /* Lower bits of the flags are used as return code on lookup failure */ - if (unlikely(flags > XDP_TX)) - 
return XDP_ABORTED; - - ri->tgt_value = __xdp_map_lookup_elem(map, ifindex); - if (unlikely(!ri->tgt_value)) { - /* If the lookup fails we want to clear out the state in the - * redirect_info struct completely, so that if an eBPF program - * performs multiple lookups, the last one always takes - * precedence. - */ - WRITE_ONCE(ri->map, NULL); - return flags; - } - - ri->flags = flags; - ri->tgt_index = ifindex; - WRITE_ONCE(ri->map, map); - - return XDP_REDIRECT; + return map->ops->map_redirect(map, ifindex, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 113fd9017203..fbeb4870f798 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -125,6 +125,16 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } +static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + + if (key >= map->max_entries) + return NULL; + + return READ_ONCE(m->xsk_map[key]); +} + static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { WARN_ON_ONCE(!rcu_read_lock_held()); @@ -215,6 +225,11 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return 0; } +static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem); +} + void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry) { @@ -247,4 +262,5 @@ const struct bpf_map_ops xsk_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "xsk_map", .map_btf_id = &xsk_map_btf_id, + .map_redirect = xsk_map_redirect, }; -- cgit v1.2.3 From ee75aef23afe6e88497151c127c13ed69f41aaa2 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 8 Mar 2021 12:29:07 +0100 Subject: bpf, xdp: Restructure redirect actions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit The XDP_REDIRECT implementations for maps and non-maps are fairly similar, but obviously need to take different code paths depending on if the target is using a map or not. Today, the redirect targets for XDP either use a map, or are based on ifindex. Here, the map type and id are added to bpf_redirect_info, instead of the actual map. Map type, map item/ifindex, and the map_id (if any) are passed to xdp_do_redirect(). For ifindex-based redirect, used by the bpf_redirect() XDP BPF helper, a special map type/id are used. Map type of UNSPEC together with map id equal to INT_MAX has the special meaning of an ifindex based redirect. Note that valid map ids are 1 inclusive, INT_MAX exclusive ([1,INT_MAX[). In addition to making the code easier to follow, using explicit type and id in bpf_redirect_info has a slight positive performance impact by avoiding a pointer indirection for the map type lookup, and instead using the cacheline for bpf_redirect_info. Since the actual map is not passed via bpf_redirect_info anymore, the map lookup is only done in the BPF helper. This means that the bpf_clear_redirect_map() function can be removed. The actual map item is RCU protected. The bpf_redirect_info flags member is not used by XDP, and not read/written any more. The map member is only written to when required/used, and not unconditionally. 
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Maciej Fijalkowski Acked-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210308112907.559576-3-bjorn.topel@gmail.com --- include/linux/filter.h | 10 +-- include/trace/events/xdp.h | 62 ++++++++++------- kernel/bpf/cpumap.c | 1 - kernel/bpf/devmap.c | 1 - net/core/filter.c | 170 ++++++++++++++++++++------------------------- net/xdp/xskmap.c | 1 - 6 files changed, 116 insertions(+), 129 deletions(-) (limited to 'net/core') diff --git a/include/linux/filter.h b/include/linux/filter.h index 008691fd3b58..b2b85b2cad8e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -646,7 +646,8 @@ struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; - struct bpf_map *map; + u32 map_id; + enum bpf_map_type map_type; u32 kern_flags; struct bpf_nh_params nh; }; @@ -1488,13 +1489,14 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind * performs multiple lookups, the last one always takes * precedence. */ - WRITE_ONCE(ri->map, NULL); + ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; return flags; } - ri->flags = flags; ri->tgt_index = ifindex; - WRITE_ONCE(ri->map, map); + ri->map_id = map->id; + ri->map_type = map->map_type; return XDP_REDIRECT; } diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 76a97176ab81..fcad3645a70b 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -86,19 +86,15 @@ struct _bpf_dtab_netdev { }; #endif /* __DEVMAP_OBJ_TYPE */ -#define devmap_ifindex(tgt, map) \ - (((map->map_type == BPF_MAP_TYPE_DEVMAP || \ - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)) ? 
\ - ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex : 0) - DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), + enum bpf_map_type map_type, + u32 map_id, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index), TP_STRUCT__entry( __field(int, prog_id) @@ -111,14 +107,22 @@ DECLARE_EVENT_CLASS(xdp_redirect_template, ), TP_fast_assign( + u32 ifindex = 0, map_index = index; + + if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex; + } else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { + ifindex = index; + map_index = 0; + } + __entry->prog_id = xdp->aux->id; __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; - __entry->to_ifindex = map ? devmap_ifindex(tgt, map) : - index; - __entry->map_id = map ? map->id : 0; - __entry->map_index = map ? 
index : 0; + __entry->to_ifindex = ifindex; + __entry->map_id = map_id; + __entry->map_index = map_index; ), TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" @@ -133,45 +137,49 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); -#define _trace_xdp_redirect(dev, xdp, to) \ - trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to) +#define _trace_xdp_redirect(dev, xdp, to) \ + trace_xdp_redirect(dev, xdp, NULL, 0, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) -#define _trace_xdp_redirect_err(dev, xdp, to, err) \ - trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to) +#define _trace_xdp_redirect_err(dev, xdp, to, err) \ + trace_xdp_redirect_err(dev, xdp, NULL, err, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) -#define _trace_xdp_redirect_map(dev, xdp, to, map, index) \ - trace_xdp_redirect(dev, xdp, to, 0, map, index) +#define _trace_xdp_redirect_map(dev, xdp, to, map_type, map_id, index) \ + trace_xdp_redirect(dev, xdp, to, 0, map_type, map_id, index) -#define _trace_xdp_redirect_map_err(dev, xdp, to, map, index, err) \ - trace_xdp_redirect_err(dev, xdp, to, err, map, index) +#define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \ + trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index) /* not used anymore, but kept around so as not to break old programs */ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, 
TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); TRACE_EVENT(xdp_cpumap_kthread, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 7352d4160b7f..0cf2791d5099 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -543,7 +543,6 @@ static void cpu_map_free(struct bpf_map *map) * complete. */ - bpf_clear_redirect_map(map); synchronize_rcu(); /* For cpu_map the remote CPUs can still be using the entries diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f7f42448259f..7a5ad7331c3b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -197,7 +197,6 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); - bpf_clear_redirect_map(map); synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. 
*/ diff --git a/net/core/filter.c b/net/core/filter.c index 183b0aa6b027..b6732000d8a2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3918,23 +3918,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; -static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, - struct bpf_map *map, struct xdp_buff *xdp) -{ - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: - return dev_map_enqueue(fwd, xdp, dev_rx); - case BPF_MAP_TYPE_CPUMAP: - return cpu_map_enqueue(fwd, xdp, dev_rx); - case BPF_MAP_TYPE_XSKMAP: - return __xsk_map_redirect(fwd, xdp); - default: - return -EBADRQC; - } - return 0; -} - void xdp_do_flush(void) { __dev_flush(); @@ -3943,55 +3926,52 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); -void bpf_clear_redirect_map(struct bpf_map *map) -{ - struct bpf_redirect_info *ri; - int cpu; - - for_each_possible_cpu(cpu) { - ri = per_cpu_ptr(&bpf_redirect_info, cpu); - /* Avoid polluting remote cacheline due to writes if - * not needed. Once we pass this test, we need the - * cmpxchg() to make sure it hasn't been changed in - * the meantime by remote CPU. 
- */ - if (unlikely(READ_ONCE(ri->map) == map)) - cmpxchg(&ri->map, map, NULL); - } -} - int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->tgt_index; + enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; int err; - ri->tgt_index = 0; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; - if (unlikely(!map)) { - fwd = dev_get_by_index_rcu(dev_net(dev), index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + err = dev_map_enqueue(fwd, xdp, dev); + break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_enqueue(fwd, xdp, dev); + break; + case BPF_MAP_TYPE_XSKMAP: + err = __xsk_map_redirect(fwd, xdp); + break; + case BPF_MAP_TYPE_UNSPEC: + if (map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + break; + } + err = dev_xdp_enqueue(fwd, xdp, dev); + break; } - - err = dev_xdp_enqueue(fwd, xdp, dev); - } else { - err = __bpf_tx_xdp_map(dev, fwd, map, xdp); + fallthrough; + default: + err = -EBADRQC; } if (unlikely(err)) goto err; - _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: - _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_redirect); @@ -4000,41 +3980,36 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, - struct bpf_map *map) + void *fwd, + enum bpf_map_type 
map_type, u32 map_id) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - u32 index = ri->tgt_index; - void *fwd = ri->tgt_value; - int err = 0; - - ri->tgt_index = 0; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); - - if (map->map_type == BPF_MAP_TYPE_DEVMAP || - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - struct bpf_dtab_netdev *dst = fwd; + int err; - err = dev_map_generic_redirect(dst, skb, xdp_prog); + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + err = dev_map_generic_redirect(fwd, skb, xdp_prog); if (unlikely(err)) goto err; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - struct xdp_sock *xs = fwd; - - err = xsk_generic_rcv(xs, xdp); + break; + case BPF_MAP_TYPE_XSKMAP: + err = xsk_generic_rcv(fwd, xdp); if (err) goto err; consume_skb(skb); - } else { + break; + default: /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; goto err; } - _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: - _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } @@ -4042,31 +4017,34 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->tgt_index; - struct net_device *fwd; - int err = 0; - - if (map) - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, - map); - ri->tgt_index = 0; - fwd = dev_get_by_index_rcu(dev_net(dev), index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + int err; - err = xdp_ok_fwd_dev(fwd, skb->len); - if (unlikely(err)) - goto err; + 
ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; - skb->dev = fwd; - _trace_xdp_redirect(dev, xdp_prog, index); - generic_xdp_tx(skb, xdp_prog); - return 0; + if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) + goto err; + + skb->dev = fwd; + _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); + generic_xdp_tx(skb, xdp_prog); + return 0; + } + + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id); err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); + _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; } @@ -4077,10 +4055,12 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) if (unlikely(flags)) return XDP_ABORTED; - ri->flags = flags; + /* NB! Map type UNSPEC and map_id == INT_MAX (never generated + * by map_idr) is used for ifindex based XDP redirect. + */ ri->tgt_index = ifindex; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index fbeb4870f798..67b4ce504852 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -87,7 +87,6 @@ static void xsk_map_free(struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); - bpf_clear_redirect_map(map); synchronize_net(); bpf_map_area_free(m); } -- cgit v1.2.3 From b1866bfff9223e4d15727e05a865b744a163eff2 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 9 Mar 2021 23:42:43 -0600 Subject: net: core: Fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix a warning by explicitly adding a break statement instead of letting the code fall through to the next case. 
Link: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 6c5967e80132..2bfdd528c7c3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5265,6 +5265,7 @@ skip_classify: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; + break; case RX_HANDLER_PASS: break; default: -- cgit v1.2.3 From 1ddc3229ad3c40840c24a699ada5cfeb4319b578 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 10 Mar 2021 16:28:58 +0800 Subject: skbuff: remove some unnecessary operation in skb_segment_list() gro list uses skb_shinfo(skb)->frag_list to link two skb together, and NAPI_GRO_CB(p)->last->next is used when there are more skb, see skb_gro_receive_list(). gso expects that each segmented skb is linked together using skb->next, so only the first skb->next needs to be set to skb_shinfo(skb)->frag_list when doing gso list segment. It is the same reason that nskb->next does not need to be set to list_skb before goto the error handling, because nskb->next already points to list_skb. And nskb is also the last skb at the end of loop, so remove tail variable and use nskb instead. Signed-off-by: Yunsheng Lin Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c421c8f80925..e8320b5d651a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3732,13 +3732,13 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, unsigned int tnl_hlen = skb_tnl_header_len(skb); unsigned int delta_truesize = 0; unsigned int delta_len = 0; - struct sk_buff *tail = NULL; struct sk_buff *nskb, *tmp; int err; skb_push(skb, -skb_network_offset(skb) + offset); skb_shinfo(skb)->frag_list = NULL; + skb->next = list_skb; do { nskb = list_skb; @@ -3756,17 +3756,8 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, } } - if (!tail) - skb->next = nskb; - else - tail->next = nskb; - - if (unlikely(err)) { - nskb->next = list_skb; + if (unlikely(err)) goto err_linearize; - } - - tail = nskb; delta_len += nskb->len; delta_truesize += nskb->truesize; @@ -3793,7 +3784,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_gso_reset(skb); - skb->prev = tail; + skb->prev = nskb; if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) -- cgit v1.2.3 From 0ccf4d50d14f360dfae5b25b8ffcb27f98e591f0 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sat, 13 Mar 2021 20:30:05 +0000 Subject: gro: simplify gro_list_prepare() gro_list_prepare() always returns &napi->gro_hash[bucket].list, without any variations. Moreover, it uses 'napi' argument only to have access to this list, and calculates the bucket index for the second time (firstly it happens at the beginning of dev_gro_receive()) to do that. Given that dev_gro_receive() already has an index to the needed list, just pass it as the first argument to eliminate redundant calculations, and make gro_list_prepare() return void. Also, both arguments of gro_list_prepare() can be constified since this function can only modify the skbs from the bucket list. Signed-off-by: Alexander Lobakin Signed-off-by: David S. 
Miller --- net/core/dev.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 2bfdd528c7c3..1317e6b6758a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5858,15 +5858,13 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old) } EXPORT_SYMBOL(napi_gro_flush); -static struct list_head *gro_list_prepare(struct napi_struct *napi, - struct sk_buff *skb) +static void gro_list_prepare(const struct list_head *head, + const struct sk_buff *skb) { unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); - struct list_head *head; struct sk_buff *p; - head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list; list_for_each_entry(p, head, list) { unsigned long diffs; @@ -5892,8 +5890,6 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, maclen); NAPI_GRO_CB(p)->same_flow = !diffs; } - - return head; } static void skb_gro_reset_offset(struct sk_buff *skb) @@ -5957,10 +5953,10 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct list_head *gro_head = &napi->gro_hash[hash].list; struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *gro_head; struct sk_buff *pp = NULL; enum gro_result ret; int same_flow; @@ -5969,7 +5965,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (netif_elide_gro(skb->dev)) goto normal; - gro_head = gro_list_prepare(napi, skb); + gro_list_prepare(gro_head, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { -- cgit v1.2.3 From 9dc2c313378528afe1bddf12cad88dbfe0998820 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sat, 13 Mar 2021 20:30:10 +0000 Subject: gro: consistentify napi->gro_hash[x] access in 
dev_gro_receive() GRO bucket index doesn't change through the entire function. Store a pointer to the corresponding bucket instead of its member and use it consistently through the function. It is performance-safe since &gro_list->list == gro_list. Misc: remove superfluous braces around single-line branches. Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- net/core/dev.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1317e6b6758a..b635467087f3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5953,7 +5953,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - struct list_head *gro_head = &napi->gro_hash[hash].list; + struct gro_list *gro_list = &napi->gro_hash[hash]; struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; @@ -5965,7 +5965,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (netif_elide_gro(skb->dev)) goto normal; - gro_list_prepare(gro_head, skb); + gro_list_prepare(&gro_list->list, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -6001,7 +6001,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, ipv6_gro_receive, inet_gro_receive, - gro_head, skb); + &gro_list->list, skb); break; } rcu_read_unlock(); @@ -6020,7 +6020,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (pp) { skb_list_del_init(pp); napi_gro_complete(napi, pp); - napi->gro_hash[hash].count--; + gro_list->count--; } if (same_flow) @@ -6029,16 +6029,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (NAPI_GRO_CB(skb)->flush) 
goto normal; - if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { - gro_flush_oldest(napi, gro_head); - } else { - napi->gro_hash[hash].count++; - } + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) + gro_flush_oldest(napi, &gro_list->list); + else + gro_list->count++; + NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); - list_add(&skb->list, gro_head); + list_add(&skb->list, &gro_list->list); ret = GRO_HELD; pull: @@ -6046,7 +6046,7 @@ pull: if (grow > 0) gro_pull_from_frag0(skb, grow); ok: - if (napi->gro_hash[hash].count) { + if (gro_list->count) { if (!test_bit(hash, &napi->gro_bitmask)) __set_bit(hash, &napi->gro_bitmask); } else if (test_bit(hash, &napi->gro_bitmask)) { -- cgit v1.2.3 From d0eed5c325149002c364a1439ae1afe1992beae4 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sat, 13 Mar 2021 20:30:14 +0000 Subject: gro: give 'hash' variable in dev_gro_receive() a less confusing name 'hash' stores not the flow hash, but the index of the GRO bucket corresponding to it. Change its name to 'bucket' to avoid confusion while reading lines like '__set_bit(hash, &napi->gro_bitmask)'. Signed-off-by: Alexander Lobakin Signed-off-by: David S. 
Miller --- net/core/dev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b635467087f3..5a2847a19cf2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5952,8 +5952,8 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { - u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - struct gro_list *gro_list = &napi->gro_hash[hash]; + u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct gro_list *gro_list = &napi->gro_hash[bucket]; struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; @@ -6047,10 +6047,10 @@ pull: gro_pull_from_frag0(skb, grow); ok: if (gro_list->count) { - if (!test_bit(hash, &napi->gro_bitmask)) - __set_bit(hash, &napi->gro_bitmask); - } else if (test_bit(hash, &napi->gro_bitmask)) { - __clear_bit(hash, &napi->gro_bitmask); + if (!test_bit(bucket, &napi->gro_bitmask)) + __set_bit(bucket, &napi->gro_bitmask); + } else if (test_bit(bucket, &napi->gro_bitmask)) { + __clear_bit(bucket, &napi->gro_bitmask); } return ret; -- cgit v1.2.3 From f96533cded173b3b019001a505a746c3cd8fc323 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 14 Mar 2021 11:11:23 +0000 Subject: flow_dissector: constify raw input data argument Flow Dissector code never modifies the input buffer, neither skb nor raw data. Make 'data' argument const for all of the Flow dissector's functions. Signed-off-by: Alexander Lobakin Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 15 +++++++-------- include/net/flow_dissector.h | 2 +- net/core/flow_dissector.c | 41 ++++++++++++++++++++++------------------- 3 files changed, 30 insertions(+), 28 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d6ea3dc3eddb..46c61e127e9f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1292,10 +1292,10 @@ __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4) void __skb_get_hash(struct sk_buff *skb); u32 __skb_get_hash_symmetric(const struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); -u32 __skb_get_poff(const struct sk_buff *skb, void *data, +u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen); __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - void *data, int hlen_proto); + const void *data, int hlen_proto); static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) @@ -1314,9 +1314,8 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - void *data, __be16 proto, int nhoff, int hlen, - unsigned int flags); + void *target_container, const void *data, + __be16 proto, int nhoff, int hlen, unsigned int flags); static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -1338,9 +1337,9 @@ static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, static inline bool skb_flow_dissect_flow_keys_basic(const struct net *net, const struct sk_buff *skb, - struct flow_keys_basic *flow, void *data, - __be16 proto, int nhoff, int hlen, - unsigned int flags) + struct flow_keys_basic *flow, + const void *data, __be16 proto, + int nhoff, int hlen, unsigned int flags) { memset(flow, 0, sizeof(*flow)); 
return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow, diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index bf00e71816ed..ffd386ea0dbb 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -350,7 +350,7 @@ static inline bool flow_keys_have_l4(const struct flow_keys *keys) u32 flow_hash_from_keys(struct flow_keys *keys); void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, - void *data, int thoff, int hlen); + const void *data, int thoff, int hlen); static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2ef2224b3bff..2ed380d096ce 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -114,7 +114,7 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, * is the protocol port offset returned from proto_ports_offset */ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - void *data, int hlen) + const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); @@ -161,7 +161,7 @@ static bool icmp_has_id(u8 type) */ void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, - void *data, int thoff, int hlen) + const void *data, int thoff, int hlen) { struct icmphdr *ih, _ih; @@ -187,8 +187,8 @@ EXPORT_SYMBOL(skb_flow_get_icmp_tci); */ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - void *data, int thoff, int hlen) + void *target_container, const void *data, + int thoff, int hlen) { struct flow_dissector_key_icmp *key_icmp; @@ -409,8 +409,8 @@ EXPORT_SYMBOL(skb_flow_dissect_hash); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, int hlen, - 
int lse_index, bool *entropy_label) + void *target_container, const void *data, int nhoff, + int hlen, int lse_index, bool *entropy_label) { struct mpls_label *hdr, _hdr; u32 entry, label, bos; @@ -467,7 +467,8 @@ __skb_flow_dissect_mpls(const struct sk_buff *skb, static enum flow_dissect_ret __skb_flow_dissect_arp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, int hlen) + void *target_container, const void *data, + int nhoff, int hlen) { struct flow_dissector_key_arp *key_arp; struct { @@ -523,7 +524,7 @@ static enum flow_dissect_ret __skb_flow_dissect_gre(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, struct flow_dissector *flow_dissector, - void *target_container, void *data, + void *target_container, const void *data, __be16 *p_proto, int *p_nhoff, int *p_hlen, unsigned int flags) { @@ -663,8 +664,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, static enum flow_dissect_ret __skb_flow_dissect_batadv(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, - void *data, __be16 *p_proto, int *p_nhoff, int hlen, - unsigned int flags) + const void *data, __be16 *p_proto, int *p_nhoff, + int hlen, unsigned int flags) { struct { struct batadv_unicast_packet batadv_unicast; @@ -695,7 +696,8 @@ __skb_flow_dissect_batadv(const struct sk_buff *skb, static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int thoff, int hlen) + void *target_container, const void *data, + int thoff, int hlen) { struct flow_dissector_key_tcp *key_tcp; struct tcphdr *th, _th; @@ -719,8 +721,8 @@ __skb_flow_dissect_tcp(const struct sk_buff *skb, static void __skb_flow_dissect_ports(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, - u8 ip_proto, int hlen) + void *target_container, const void *data, + int nhoff, u8 ip_proto, int 
hlen) { enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; struct flow_dissector_key_ports *key_ports; @@ -744,7 +746,8 @@ __skb_flow_dissect_ports(const struct sk_buff *skb, static void __skb_flow_dissect_ipv4(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, const struct iphdr *iph) + void *target_container, const void *data, + const struct iphdr *iph) { struct flow_dissector_key_ip *key_ip; @@ -761,7 +764,8 @@ __skb_flow_dissect_ipv4(const struct sk_buff *skb, static void __skb_flow_dissect_ipv6(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, const struct ipv6hdr *iph) + void *target_container, const void *data, + const struct ipv6hdr *iph) { struct flow_dissector_key_ip *key_ip; @@ -908,9 +912,8 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - void *data, __be16 proto, int nhoff, int hlen, - unsigned int flags) + void *target_container, const void *data, + __be16 proto, int nhoff, int hlen, unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -1642,7 +1645,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, } EXPORT_SYMBOL(skb_get_hash_perturb); -u32 __skb_get_poff(const struct sk_buff *skb, void *data, +u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen) { u32 poff = keys->control.thoff; -- cgit v1.2.3 From 6503b9f29a47cdb4ebd6c36d8bbb018418415c2a Mon Sep 17 00:00:00 2001 From: Manu Bretelle Date: Wed, 10 Mar 2021 10:23:05 -0800 Subject: bpf: Add getter and setter for SO_REUSEPORT through bpf_{g,s}etsockopt Augment the current set of options that are accessible via bpf_{g,s}etsockopt to also support SO_REUSEPORT. 
Signed-off-by: Manu Bretelle Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210310182305.1910312-1-chantra@fb.com --- net/core/filter.c | 6 ++++++ tools/testing/selftests/bpf/progs/bind4_prog.c | 25 +++++++++++++++++++++++++ tools/testing/selftests/bpf/progs/bind6_prog.c | 25 +++++++++++++++++++++++++ 3 files changed, 56 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index b6732000d8a2..10dac9dd5086 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4729,6 +4729,9 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; + case SO_REUSEPORT: + sk->sk_reuseport = valbool; + break; default: ret = -EINVAL; } @@ -4898,6 +4901,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, case SO_BINDTOIFINDEX: *((int *)optval) = sk->sk_bound_dev_if; break; + case SO_REUSEPORT: + *((int *)optval) = sk->sk_reuseport; + break; default: goto err_clear; } diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index 115a3b0ad984..474c6a62078a 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -57,6 +57,27 @@ static __inline int bind_to_device(struct bpf_sock_addr *ctx) return 0; } +static __inline int bind_reuseport(struct bpf_sock_addr *ctx) +{ + int val = 1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || !val) + return 1; + val = 0; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || val) + return 1; + + return 0; +} + static __inline int misc_opts(struct bpf_sock_addr *ctx, int opt) { int old, tmp, new = 0xeb9f; @@ -127,6 
+148,10 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) if (misc_opts(ctx, SO_MARK) || misc_opts(ctx, SO_PRIORITY)) return 0; + /* Set reuseport and unset */ + if (bind_reuseport(ctx)) + return 0; + ctx->user_ip4 = bpf_htonl(SERV4_REWRITE_IP); ctx->user_port = bpf_htons(SERV4_REWRITE_PORT); diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index 4c0d348034b9..c19cfa869f30 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -63,6 +63,27 @@ static __inline int bind_to_device(struct bpf_sock_addr *ctx) return 0; } +static __inline int bind_reuseport(struct bpf_sock_addr *ctx) +{ + int val = 1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || !val) + return 1; + val = 0; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || val) + return 1; + + return 0; +} + static __inline int misc_opts(struct bpf_sock_addr *ctx, int opt) { int old, tmp, new = 0xeb9f; @@ -141,6 +162,10 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) if (misc_opts(ctx, SO_MARK) || misc_opts(ctx, SO_PRIORITY)) return 0; + /* Set reuseport and unset */ + if (bind_reuseport(ctx)) + return 0; + ctx->user_ip6[0] = bpf_htonl(SERV6_REWRITE_IP_0); ctx->user_ip6[1] = bpf_htonl(SERV6_REWRITE_IP_1); ctx->user_ip6[2] = bpf_htonl(SERV6_REWRITE_IP_2); -- cgit v1.2.3 From 8f64860f8b567cc4f8ac854a65cbf6337404c520 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sun, 14 Mar 2021 15:49:19 +0100 Subject: net: export dev_set_threaded symbol For wireless devices (e.g. mt76 driver) multiple net_devices belong to the same wireless phy and the napi object is registered in a dummy netdevice related to the wireless phy. 
Export dev_set_threaded in order to be reused in device drivers enabling threaded NAPI. Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5a2847a19cf2..6bc20eabd2b0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6766,6 +6766,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded) return err; } +EXPORT_SYMBOL(dev_set_threaded); void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) -- cgit v1.2.3 From ea4fe7e842f6c7f972d795a8efc167c4bb33b62f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:40 +0100 Subject: net-sysfs: convert xps_cpus_show to bitmap_zalloc Use bitmap_zalloc instead of zalloc_cpumask_var in xps_cpus_show to align with xps_rxqs_show. This will improve maintenance and allow us to factorize the two functions. The function should behave the same. Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 307628fdf380..3a083c0c9dd3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1367,8 +1367,7 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, int cpu, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; - cpumask_var_t mask; - unsigned long index; + unsigned long *mask, index; if (!netif_is_multiqueue(dev)) return -ENOENT; @@ -1396,7 +1395,8 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } } - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL); + if (!mask) { ret = -ENOMEM; goto err_rtnl_unlock; } @@ -1414,7 +1414,7 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, for (i = map->len; i--;) { if (map->queues[i] == index) { - cpumask_set_cpu(cpu, mask); + set_bit(cpu, mask); break; } } @@ -1424,8 +1424,8 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, rtnl_unlock(); - len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); - free_cpumask_var(mask); + len = bitmap_print_to_pagebuf(false, buf, mask, nr_cpu_ids); + bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; err_rtnl_unlock: -- cgit v1.2.3 From d9a063d207f0e538b0f5aa8b04a6c14f88906a6d Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:41 +0100 Subject: net-sysfs: store the return of get_netdev_queue_index in an unsigned int In net-sysfs, get_netdev_queue_index returns an unsigned int. Some of its callers use an unsigned long to store the returned value. Update the code to be consistent, this should only be cosmetic. Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 3a083c0c9dd3..5dc4223f6b68 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1367,7 +1367,8 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, int cpu, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; - unsigned long *mask, index; + unsigned long *mask; + unsigned int index; if (!netif_is_multiqueue(dev)) return -ENOENT; @@ -1437,7 +1438,7 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; - unsigned long index; + unsigned int index; cpumask_var_t mask; int err; @@ -1479,7 +1480,8 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; - unsigned long *mask, index; + unsigned long *mask; + unsigned int index; index = get_netdev_queue_index(queue); @@ -1541,7 +1543,8 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, { struct net_device *dev = queue->dev; struct net *net = dev_net(dev); - unsigned long *mask, index; + unsigned long *mask; + unsigned int index; int err; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) -- cgit v1.2.3 From 73f5e52b15e3aa4ef641264228cd9069b1948149 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:42 +0100 Subject: net-sysfs: make xps_cpus_show and xps_rxqs_show consistent Make the implementations of xps_cpus_show and xps_rxqs_show converge, as the two share the same logic but diverged over time. This should not modify their behaviour but will help future changes and improve maintenance. Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 5dc4223f6b68..5f76183ad5bc 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1364,7 +1364,7 @@ static const struct attribute_group dql_group = { static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) { - int cpu, len, ret, num_tc = 1, tc = 0; + int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; unsigned long *mask; @@ -1404,23 +1404,26 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_cpus_map); - if (dev_maps) { - for_each_possible_cpu(cpu) { - int i, tci = cpu * num_tc + tc; - struct xps_map *map; - - map = rcu_dereference(dev_maps->attr_map[tci]); - if (!map) - continue; - - for (i = map->len; i--;) { - if (map->queues[i] == index) { - set_bit(cpu, mask); - break; - } + if (!dev_maps) + goto out_no_maps; + + for (j = -1; j = netif_attrmask_next(j, NULL, nr_cpu_ids), + j < nr_cpu_ids;) { + int i, tci = j * num_tc + tc; + struct xps_map *map; + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (!map) + continue; + + for (i = map->len; i--;) { + if (map->queues[i] == index) { + set_bit(j, mask); + break; } } } +out_no_maps: rcu_read_unlock(); rtnl_unlock(); -- cgit v1.2.3 From 255c04a87f4381849fce9ed81e5efabf78a71a30 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:43 +0100 Subject: net: embed num_tc in the xps maps The xps cpus/rxqs map is accessed using dev->num_tc, which is used when allocating the map. But later updates of dev->num_tc can lead to having a mismatch between the maps and how they're accessed. In such cases the map values do not make any sense and out of bound accesses can occur (that can be easily seen using KASAN). 
This patch aims at fixing this by embedding num_tc into the maps, using the value at the time the map is created. This brings two improvements: - The maps can be accessed using the embedded num_tc, so we know for sure we won't have out of bound accesses. - Checks can be made before accessing the maps so we know the values retrieved will make sense. We also update __netif_set_xps_queue to conditionally copy old maps from dev_maps in the new one only if the number of traffic classes from both maps match. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++++ net/core/dev.c | 63 ++++++++++++++++++++++++++++++----------------- net/core/net-sysfs.c | 45 +++++++++++++-------------------- 3 files changed, 64 insertions(+), 50 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 97254c089eb2..c38534c55ea1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -771,9 +771,15 @@ struct xps_map { /* * This structure holds all XPS maps for device. Maps are indexed by CPU. + * + * We keep track of the number of traffic classes used when the struct is + * allocated, in num_tc. This will be used to navigate the maps, to ensure we're + * not crossing its upper bound, as the original dev->num_tc can be updated in + * the meantime. */ struct xps_dev_maps { struct rcu_head rcu; + s16 num_tc; struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */ }; diff --git a/net/core/dev.c b/net/core/dev.c index 6bc20eabd2b0..4e29d1994fdd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2491,7 +2491,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev, struct xps_dev_maps *dev_maps, int cpu, u16 offset, u16 count) { - int num_tc = dev->num_tc ? 
: 1; + int num_tc = dev_maps->num_tc; bool active = false; int tci; @@ -2634,10 +2634,10 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, { const unsigned long *online_mask = NULL, *possible_mask = NULL; struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; - bool active = false; unsigned int nr_ids; if (dev->num_tc) { @@ -2672,19 +2672,29 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (maps_sz < L1_CACHE_BYTES) maps_sz = L1_CACHE_BYTES; + /* The old dev_maps could be larger or smaller than the one we're + * setting up now, as dev->num_tc could have been updated in between. We + * could try to be smart, but let's be safe instead and only copy + * foreign traffic classes if the two map sizes match. + */ + if (dev_maps && dev_maps->num_tc == num_tc) + copy = true; + /* allocate memory for queue storage */ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), j < nr_ids;) { - if (!new_dev_maps) - new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { - mutex_unlock(&xps_map_mutex); - return -ENOMEM; + new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + if (!new_dev_maps) { + mutex_unlock(&xps_map_mutex); + return -ENOMEM; + } + + new_dev_maps->num_tc = num_tc; } tci = j * num_tc + tc; - map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) : - NULL; + map = copy ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; map = expand_xps_map(map, j, index, is_rxqs_map); if (!map) @@ -2706,7 +2716,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { /* copy maps belonging to foreign traffic classes */ - for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) { + for (i = tc, tci = j * num_tc; copy && i--; tci++) { /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); @@ -2736,14 +2746,14 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, numa_node_id = -1; } #endif - } else if (dev_maps) { + } else if (copy) { /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* copy maps belonging to foreign traffic classes */ - for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { + for (i = num_tc - tc, tci++; copy && --i; tci++) { /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); @@ -2761,11 +2771,18 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { - for (i = num_tc, tci = j * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); - if (map && map != new_map) - kfree_rcu(map, rcu); + if (!map) + continue; + + if (copy) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + if (map == new_map) + continue; + } + + kfree_rcu(map, rcu); } } @@ -2789,12 +2806,12 @@ out_no_new_maps: /* removes tx-queue from unused CPUs/rx-queues */ for (j = -1; j = 
netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { - for (i = tc, tci = j * num_tc; i--; tci++) + for (i = tc, tci = j * dev_maps->num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); if (!netif_attr_test_mask(j, mask, nr_ids) || !netif_attr_test_online(j, online_mask, nr_ids)) active |= remove_xps_queue(dev_maps, tci, index); - for (i = num_tc - tc, tci++; --i; tci++) + for (i = dev_maps->num_tc - tc, tci++; --i; tci++) active |= remove_xps_queue(dev_maps, tci, index); } @@ -2812,7 +2829,7 @@ error: j < nr_ids;) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); - map = dev_maps ? + map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) @@ -3944,13 +3961,15 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) { + int tc = netdev_get_prio_tc_map(dev, skb->priority); struct xps_map *map; int queue_index = -1; - if (dev->num_tc) { - tci *= dev->num_tc; - tci += netdev_get_prio_tc_map(dev, skb->priority); - } + if (tc >= dev_maps->num_tc) + return queue_index; + + tci *= dev_maps->num_tc; + tci += tc; map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 5f76183ad5bc..1364d0f39cb0 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1364,9 +1364,9 @@ static const struct attribute_group dql_group = { static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) { - int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; + int j, len, ret, tc = 0; unsigned long *mask; unsigned int index; @@ -1378,22 +1378,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, if (!rtnl_trylock()) return restart_syscall(); - if (dev->num_tc) { - /* Do not allow XPS on subordinate 
device directly */ - num_tc = dev->num_tc; - if (num_tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; - } - - /* If queue belongs to subordinate dev use its map */ - dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; - tc = netdev_txq_to_tc(dev, index); - if (tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; - } + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; } mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL); @@ -1404,12 +1395,12 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_cpus_map); - if (!dev_maps) + if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; for (j = -1; j = netif_attrmask_next(j, NULL, nr_cpu_ids), j < nr_cpu_ids;) { - int i, tci = j * num_tc + tc; + int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; map = rcu_dereference(dev_maps->attr_map[tci]); @@ -1480,9 +1471,9 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { - int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; + int j, len, ret, tc = 0; unsigned long *mask; unsigned int index; @@ -1491,14 +1482,12 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) if (!rtnl_trylock()) return restart_syscall(); - if (dev->num_tc) { - num_tc = dev->num_tc; - tc = netdev_txq_to_tc(dev, index); - if (tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; - } + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; } + mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); if (!mask) { ret = -ENOMEM; @@ -1507,12 +1496,12 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_rxqs_map); - if (!dev_maps) + if 
(!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues), j < dev->num_rx_queues;) { - int i, tci = j * num_tc + tc; + int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; map = rcu_dereference(dev_maps->attr_map[tci]); -- cgit v1.2.3 From 5478fcd0f48322e04ae6c173ad3a1959e066dc83 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:44 +0100 Subject: net: embed nr_ids in the xps maps Embed nr_ids (the number of CPUs for the xps cpus map, and the number of rxqs for the xps rxqs map) in dev_maps. That will help avoid accessing out-of-bounds memory if those values change after dev_maps was allocated. Suggested-by: Alexander Duyck Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++++ net/core/dev.c | 45 +++++++++++++++++++++------------------------ net/core/net-sysfs.c | 38 ++++++++++++++++++++++---------------- 3 files changed, 47 insertions(+), 40 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c38534c55ea1..09e73f5a8c78 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -772,6 +772,9 @@ struct xps_map { /* * This structure holds all XPS maps for device. Maps are indexed by CPU. * + * We keep track of the number of cpus/rxqs used when the struct is allocated, + * in nr_ids. This will help not accessing out-of-bound memory. + * * We keep track of the number of traffic classes used when the struct is * allocated, in num_tc. 
This will be used to navigate the maps, to ensure we're * not crossing its upper bound, as the original dev->num_tc can be updated in @@ -779,6 +782,7 @@ struct xps_map { */ struct xps_dev_maps { struct rcu_head rcu; + unsigned int nr_ids; s16 num_tc; struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */ }; diff --git a/net/core/dev.c b/net/core/dev.c index 4e29d1994fdd..7530c95970a0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2524,14 +2524,14 @@ static void reset_xps_maps(struct net_device *dev, } static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, - struct xps_dev_maps *dev_maps, unsigned int nr_ids, - u16 offset, u16 count, bool is_rxqs_map) + struct xps_dev_maps *dev_maps, u16 offset, u16 count, + bool is_rxqs_map) { + unsigned int nr_ids = dev_maps->nr_ids; bool active = false; int i, j; - for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), - j < nr_ids;) + for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), j < nr_ids;) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) @@ -2551,7 +2551,6 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, { const unsigned long *possible_mask = NULL; struct xps_dev_maps *dev_maps; - unsigned int nr_ids; if (!static_key_false(&xps_needed)) return; @@ -2561,11 +2560,9 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, if (static_key_false(&xps_rxqs_needed)) { dev_maps = xmap_dereference(dev->xps_rxqs_map); - if (dev_maps) { - nr_ids = dev->num_rx_queues; - clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, - offset, count, true); - } + if (dev_maps) + clean_xps_maps(dev, possible_mask, dev_maps, offset, + count, true); } dev_maps = xmap_dereference(dev->xps_cpus_map); @@ -2574,9 +2571,7 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, if (num_possible_cpus() > 1) possible_mask = cpumask_bits(cpu_possible_mask); - nr_ids = nr_cpu_ids; - clean_xps_maps(dev, possible_mask, 
dev_maps, nr_ids, offset, count, - false); + clean_xps_maps(dev, possible_mask, dev_maps, offset, count, false); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2673,11 +2668,12 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, maps_sz = L1_CACHE_BYTES; /* The old dev_maps could be larger or smaller than the one we're - * setting up now, as dev->num_tc could have been updated in between. We - * could try to be smart, but let's be safe instead and only copy - * foreign traffic classes if the two map sizes match. + * setting up now, as dev->num_tc or nr_ids could have been updated in + * between. We could try to be smart, but let's be safe instead and only + * copy foreign traffic classes if the two map sizes match. */ - if (dev_maps && dev_maps->num_tc == num_tc) + if (dev_maps && + dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) copy = true; /* allocate memory for queue storage */ @@ -2690,6 +2686,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, return -ENOMEM; } + new_dev_maps->nr_ids = nr_ids; new_dev_maps->num_tc = num_tc; } @@ -2770,7 +2767,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, goto out_no_old_maps; for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { + j < dev_maps->nr_ids;) { for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) @@ -2804,12 +2801,12 @@ out_no_new_maps: goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { + for (j = -1; j = netif_attrmask_next(j, possible_mask, dev_maps->nr_ids), + j < dev_maps->nr_ids;) { for (i = tc, tci = j * dev_maps->num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); - if (!netif_attr_test_mask(j, mask, nr_ids) || - !netif_attr_test_online(j, online_mask, nr_ids)) + if (!netif_attr_test_mask(j, mask, 
dev_maps->nr_ids) || + !netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) active |= remove_xps_queue(dev_maps, tci, index); for (i = dev_maps->num_tc - tc, tci++; --i; tci++) active |= remove_xps_queue(dev_maps, tci, index); @@ -3965,7 +3962,7 @@ static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_map *map; int queue_index = -1; - if (tc >= dev_maps->num_tc) + if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) return queue_index; tci *= dev_maps->num_tc; @@ -4004,7 +4001,7 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, if (dev_maps) { int tci = sk_rx_queue_get(sk); - if (tci >= 0 && tci < dev->num_rx_queues) + if (tci >= 0) queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1364d0f39cb0..bb08bdc88fa9 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1366,9 +1366,9 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, { struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; + unsigned int index, nr_ids; int j, len, ret, tc = 0; unsigned long *mask; - unsigned int index; if (!netif_is_multiqueue(dev)) return -ENOENT; @@ -1387,19 +1387,20 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, goto err_rtnl_unlock; } - mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL); + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_cpus_map); + nr_ids = dev_maps ? 
dev_maps->nr_ids : nr_cpu_ids; + + mask = bitmap_zalloc(nr_ids, GFP_KERNEL); if (!mask) { ret = -ENOMEM; - goto err_rtnl_unlock; + goto err_rcu_unlock; } - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_cpus_map); if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; - for (j = -1; j = netif_attrmask_next(j, NULL, nr_cpu_ids), - j < nr_cpu_ids;) { + for (j = -1; j = netif_attrmask_next(j, NULL, nr_ids), j < nr_ids;) { int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; @@ -1419,10 +1420,12 @@ out_no_maps: rtnl_unlock(); - len = bitmap_print_to_pagebuf(false, buf, mask, nr_cpu_ids); + len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; +err_rcu_unlock: + rcu_read_unlock(); err_rtnl_unlock: rtnl_unlock(); return ret; @@ -1473,9 +1476,9 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; + unsigned int index, nr_ids; int j, len, ret, tc = 0; unsigned long *mask; - unsigned int index; index = get_netdev_queue_index(queue); @@ -1488,19 +1491,20 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) goto err_rtnl_unlock; } - mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_rxqs_map); + nr_ids = dev_maps ? 
dev_maps->nr_ids : dev->num_rx_queues; + + mask = bitmap_zalloc(nr_ids, GFP_KERNEL); if (!mask) { ret = -ENOMEM; - goto err_rtnl_unlock; + goto err_rcu_unlock; } - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_rxqs_map); if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; - for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues), - j < dev->num_rx_queues;) { + for (j = -1; j = netif_attrmask_next(j, NULL, nr_ids), j < nr_ids;) { int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; @@ -1520,11 +1524,13 @@ out_no_maps: rtnl_unlock(); - len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); + len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; +err_rcu_unlock: + rcu_read_unlock(); err_rtnl_unlock: rtnl_unlock(); return ret; -- cgit v1.2.3 From 6f36158e058409ec5ceb4290541e77ae2648fc86 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:45 +0100 Subject: net: remove the xps possible_mask Remove the xps possible_mask. It was an optimization but we can just loop from 0 to nr_ids now that it is embedded in the xps dev_maps. That simplifies the code a bit. Suggested-by: Alexander Duyck Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- net/core/dev.c | 40 +++++++++++++--------------------------- net/core/net-sysfs.c | 4 ++-- 2 files changed, 15 insertions(+), 29 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 7530c95970a0..3ed8cb3a4061 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2523,33 +2523,28 @@ static void reset_xps_maps(struct net_device *dev, kfree_rcu(dev_maps, rcu); } -static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, +static void clean_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, u16 offset, u16 count, bool is_rxqs_map) { - unsigned int nr_ids = dev_maps->nr_ids; bool active = false; int i, j; - for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), j < nr_ids;) - active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, - count); + for (j = 0; j < dev_maps->nr_ids; j++) + active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) reset_xps_maps(dev, dev_maps, is_rxqs_map); if (!is_rxqs_map) { - for (i = offset + (count - 1); count--; i--) { + for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write( - netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); - } + netdev_get_tx_queue(dev, i), NUMA_NO_NODE); } } static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { - const unsigned long *possible_mask = NULL; struct xps_dev_maps *dev_maps; if (!static_key_false(&xps_needed)) @@ -2561,17 +2556,14 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, if (static_key_false(&xps_rxqs_needed)) { dev_maps = xmap_dereference(dev->xps_rxqs_map); if (dev_maps) - clean_xps_maps(dev, possible_mask, dev_maps, offset, - count, true); + clean_xps_maps(dev, dev_maps, offset, count, true); } dev_maps = xmap_dereference(dev->xps_cpus_map); if (!dev_maps) goto out_no_maps; - if (num_possible_cpus() > 1) - possible_mask = cpumask_bits(cpu_possible_mask); - clean_xps_maps(dev, possible_mask, dev_maps, offset, count, false); + 
clean_xps_maps(dev, dev_maps, offset, count, false); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2627,8 +2619,8 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, bool is_rxqs_map) { - const unsigned long *online_mask = NULL, *possible_mask = NULL; struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + const unsigned long *online_mask = NULL; bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; @@ -2656,10 +2648,8 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); - if (num_possible_cpus() > 1) { + if (num_possible_cpus() > 1) online_mask = cpumask_bits(cpu_online_mask); - possible_mask = cpumask_bits(cpu_possible_mask); - } dev_maps = xmap_dereference(dev->xps_cpus_map); nr_ids = nr_cpu_ids; } @@ -2710,8 +2700,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { + for (j = 0; j < nr_ids; j++) { /* copy maps belonging to foreign traffic classes */ for (i = tc, tci = j * num_tc; copy && i--; tci++) { /* fill in the new device map from the old device map */ @@ -2766,8 +2755,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!dev_maps) goto out_no_old_maps; - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < dev_maps->nr_ids;) { + for (j = 0; j < dev_maps->nr_ids; j++) { for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) @@ -2801,8 +2789,7 @@ out_no_new_maps: goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ - for (j = -1; j = netif_attrmask_next(j, possible_mask, dev_maps->nr_ids), - j < 
dev_maps->nr_ids;) { + for (j = 0; j < dev_maps->nr_ids; j++) { for (i = tc, tci = j * dev_maps->num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); if (!netif_attr_test_mask(j, mask, dev_maps->nr_ids) || @@ -2822,8 +2809,7 @@ out_no_maps: return 0; error: /* remove any maps that we added */ - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { + for (j = 0; j < nr_ids; j++) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); map = copy ? diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bb08bdc88fa9..c762c435ff76 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1400,7 +1400,7 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; - for (j = -1; j = netif_attrmask_next(j, NULL, nr_ids), j < nr_ids;) { + for (j = 0; j < nr_ids; j++) { int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; @@ -1504,7 +1504,7 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; - for (j = -1; j = netif_attrmask_next(j, NULL, nr_ids), j < nr_ids;) { + for (j = 0; j < nr_ids; j++) { int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; -- cgit v1.2.3 From 044ab86d431b59b88966457dbb62679f274ec442 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:46 +0100 Subject: net: move the xps maps to an array Move the xps maps (xps_cpus_map and xps_rxqs_map) to an array in net_device. That will simplify a lot the code removing the need for lots of if/else conditionals as the correct map will be available using its offset in the array. This should not modify the xps maps behaviour in any way. Suggested-by: Alexander Duyck Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 2 +- include/linux/netdevice.h | 17 +++++++---- net/core/dev.c | 73 ++++++++++++++++++++--------------------------- net/core/net-sysfs.c | 6 ++-- 4 files changed, 46 insertions(+), 52 deletions(-) (limited to 'net/core') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 77ba8e2fc11c..584a9bd59dda 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2015,7 +2015,7 @@ static void virtnet_set_affinity(struct virtnet_info *vi) } virtqueue_set_affinity(vi->rq[i].vq, mask); virtqueue_set_affinity(vi->sq[i].vq, mask); - __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, false); + __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS); cpumask_clear(mask); } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 09e73f5a8c78..4940509999be 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -754,6 +754,13 @@ struct rx_queue_attribute { const char *buf, size_t len); }; +/* XPS map type and offset of the xps map within net_device->xps_maps[]. */ +enum xps_map_type { + XPS_CPUS = 0, + XPS_RXQS, + XPS_MAPS_MAX, +}; + #ifdef CONFIG_XPS /* * This structure holds an XPS map which can be of variable length. 
The @@ -1773,8 +1780,7 @@ enum netdev_ml_priv_type { * @tx_queue_len: Max frames per queue allowed * @tx_global_lock: XXX: need comments on this one * @xdp_bulkq: XDP device bulk queue - * @xps_cpus_map: all CPUs map for XPS device - * @xps_rxqs_map: all RXQs map for XPS device + * @xps_maps: all CPUs/RXQs maps for XPS device * * @xps_maps: XXX: need comments on this one * @miniq_egress: clsact qdisc specific data for @@ -2070,8 +2076,7 @@ struct net_device { struct xdp_dev_bulk_queue __percpu *xdp_bulkq; #ifdef CONFIG_XPS - struct xps_dev_maps __rcu *xps_cpus_map; - struct xps_dev_maps __rcu *xps_rxqs_map; + struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX]; #endif #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc __rcu *miniq_egress; @@ -3701,7 +3706,7 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index); int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, - u16 index, bool is_rxqs_map); + u16 index, enum xps_map_type type); /** * netif_attr_test_mask - Test a CPU or Rx queue set in a mask @@ -3796,7 +3801,7 @@ static inline int netif_set_xps_queue(struct net_device *dev, static inline int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, - u16 index, bool is_rxqs_map) + u16 index, enum xps_map_type type) { return 0; } diff --git a/net/core/dev.c b/net/core/dev.c index 3ed8cb3a4061..af57e32bb543 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2511,31 +2511,34 @@ static bool remove_xps_queue_cpu(struct net_device *dev, static void reset_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, - bool is_rxqs_map) + enum xps_map_type type) { - if (is_rxqs_map) { - static_key_slow_dec_cpuslocked(&xps_rxqs_needed); - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - } else { - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - } static_key_slow_dec_cpuslocked(&xps_needed); + if (type == XPS_RXQS) + 
static_key_slow_dec_cpuslocked(&xps_rxqs_needed); + + RCU_INIT_POINTER(dev->xps_maps[type], NULL); + kfree_rcu(dev_maps, rcu); } -static void clean_xps_maps(struct net_device *dev, - struct xps_dev_maps *dev_maps, u16 offset, u16 count, - bool is_rxqs_map) +static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, + u16 offset, u16 count) { + struct xps_dev_maps *dev_maps; bool active = false; int i, j; + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (!dev_maps) + return; + for (j = 0; j < dev_maps->nr_ids; j++) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) - reset_xps_maps(dev, dev_maps, is_rxqs_map); + reset_xps_maps(dev, dev_maps, type); - if (!is_rxqs_map) { + if (type == XPS_CPUS) { for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write( netdev_get_tx_queue(dev, i), NUMA_NO_NODE); @@ -2545,27 +2548,17 @@ static void clean_xps_maps(struct net_device *dev, static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { - struct xps_dev_maps *dev_maps; - if (!static_key_false(&xps_needed)) return; cpus_read_lock(); mutex_lock(&xps_map_mutex); - if (static_key_false(&xps_rxqs_needed)) { - dev_maps = xmap_dereference(dev->xps_rxqs_map); - if (dev_maps) - clean_xps_maps(dev, dev_maps, offset, count, true); - } - - dev_maps = xmap_dereference(dev->xps_cpus_map); - if (!dev_maps) - goto out_no_maps; + if (static_key_false(&xps_rxqs_needed)) + clean_xps_maps(dev, XPS_RXQS, offset, count); - clean_xps_maps(dev, dev_maps, offset, count, false); + clean_xps_maps(dev, XPS_CPUS, offset, count); -out_no_maps: mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } @@ -2617,7 +2610,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, - u16 index, bool is_rxqs_map) + u16 index, enum xps_map_type type) { struct xps_dev_maps *dev_maps, 
*new_dev_maps = NULL; const unsigned long *online_mask = NULL; @@ -2642,15 +2635,15 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, } mutex_lock(&xps_map_mutex); - if (is_rxqs_map) { + + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (type == XPS_RXQS) { maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); - dev_maps = xmap_dereference(dev->xps_rxqs_map); nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); if (num_possible_cpus() > 1) online_mask = cpumask_bits(cpu_online_mask); - dev_maps = xmap_dereference(dev->xps_cpus_map); nr_ids = nr_cpu_ids; } @@ -2683,7 +2676,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, tci = j * num_tc + tc; map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; - map = expand_xps_map(map, j, index, is_rxqs_map); + map = expand_xps_map(map, j, index, type == XPS_RXQS); if (!map) goto error; @@ -2696,7 +2689,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!dev_maps) { /* Increment static keys at most once per type */ static_key_slow_inc_cpuslocked(&xps_needed); - if (is_rxqs_map) + if (type == XPS_RXQS) static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } @@ -2725,7 +2718,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA - if (!is_rxqs_map) { + if (type == XPS_CPUS) { if (numa_node_id == -2) numa_node_id = cpu_to_node(j); else if (numa_node_id != cpu_to_node(j)) @@ -2746,10 +2739,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, } } - if (is_rxqs_map) - rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps); - else - rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps); + rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); /* Cleanup old maps */ if (!dev_maps) @@ -2778,12 +2768,11 @@ out_no_old_maps: active = true; out_no_new_maps: - if (!is_rxqs_map) { 
+ if (type == XPS_CPUS) /* update Tx queue numa node */ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); - } if (!dev_maps) goto out_no_maps; @@ -2801,7 +2790,7 @@ out_no_new_maps: /* free map if not active */ if (!active) - reset_xps_maps(dev, dev_maps, is_rxqs_map); + reset_xps_maps(dev, dev_maps, type); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2833,7 +2822,7 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, int ret; cpus_read_lock(); - ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); + ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); cpus_read_unlock(); return ret; @@ -3983,7 +3972,7 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; - dev_maps = rcu_dereference(sb_dev->xps_rxqs_map); + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); if (dev_maps) { int tci = sk_rx_queue_get(sk); @@ -3994,7 +3983,7 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, get_cpus_map: if (queue_index < 0) { - dev_maps = rcu_dereference(sb_dev->xps_cpus_map); + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c762c435ff76..ca1f3b63cfad 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1388,7 +1388,7 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_cpus_map); + dev_maps = rcu_dereference(dev->xps_maps[XPS_CPUS]); nr_ids = dev_maps ? 
dev_maps->nr_ids : nr_cpu_ids; mask = bitmap_zalloc(nr_ids, GFP_KERNEL); @@ -1492,7 +1492,7 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) } rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_rxqs_map); + dev_maps = rcu_dereference(dev->xps_maps[XPS_RXQS]); nr_ids = dev_maps ? dev_maps->nr_ids : dev->num_rx_queues; mask = bitmap_zalloc(nr_ids, GFP_KERNEL); @@ -1566,7 +1566,7 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, } cpus_read_lock(); - err = __netif_set_xps_queue(dev, mask, index, true); + err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS); cpus_read_unlock(); rtnl_unlock(); -- cgit v1.2.3 From 402fbb992e13fc57e917ac7c0a07a8a3e2385858 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:47 +0100 Subject: net: add an helper to copy xps maps to the new dev_maps This patch adds a helper, xps_copy_dev_maps, to copy maps from dev_maps to new_dev_maps at a given index. The logic should be the same, with improved code readability and maintainability. Signed-off-by: Antoine Tenart Signed-off-by: David S.
Miller --- net/core/dev.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index af57e32bb543..00f6b41e11d8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2608,6 +2608,25 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, return new_map; } +/* Copy xps maps at a given index */ +static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, + struct xps_dev_maps *new_dev_maps, int index, + int tc, bool skip_tc) +{ + int i, tci = index * dev_maps->num_tc; + struct xps_map *map; + + /* copy maps belonging to foreign traffic classes */ + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && skip_tc) + continue; + + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); + } +} + /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) @@ -2694,23 +2713,16 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, } for (j = 0; j < nr_ids; j++) { - /* copy maps belonging to foreign traffic classes */ - for (i = tc, tci = j * num_tc; copy && i--; tci++) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->attr_map[tci]); - RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); - } + bool skip_tc = false; - /* We need to explicitly update tci as prevous loop - * could break out early if dev_maps is NULL. 
- */ tci = j * num_tc + tc; - if (netif_attr_test_mask(j, mask, nr_ids) && netif_attr_test_online(j, online_mask, nr_ids)) { /* add tx-queue to CPU/rx-queue maps */ int pos = 0; + skip_tc = true; + map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; @@ -2725,18 +2737,11 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, numa_node_id = -1; } #endif - } else if (copy) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->attr_map[tci]); - RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } - /* copy maps belonging to foreign traffic classes */ - for (i = num_tc - tc, tci++; copy && --i; tci++) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->attr_map[tci]); - RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); - } + if (copy) + xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, + skip_tc); } rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); -- cgit v1.2.3 From 132f743b01b85b8fae7e1f298bfd81a66b9389a8 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:48 +0100 Subject: net: improve queue removal readability in __netif_set_xps_queue Improve the readability of the loop removing tx-queue from unused CPUs/rx-queues in __netif_set_xps_queue. The change should only be cosmetic. Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- net/core/dev.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 00f6b41e11d8..c8ce2dfcc97d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2784,13 +2784,16 @@ out_no_new_maps: /* removes tx-queue from unused CPUs/rx-queues */ for (j = 0; j < dev_maps->nr_ids; j++) { - for (i = tc, tci = j * dev_maps->num_tc; i--; tci++) - active |= remove_xps_queue(dev_maps, tci, index); - if (!netif_attr_test_mask(j, mask, dev_maps->nr_ids) || - !netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) - active |= remove_xps_queue(dev_maps, tci, index); - for (i = dev_maps->num_tc - tc, tci++; --i; tci++) + tci = j * dev_maps->num_tc; + + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && + netif_attr_test_mask(j, mask, dev_maps->nr_ids) && + netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) + continue; + active |= remove_xps_queue(dev_maps, tci, index); + } } /* free map if not active */ -- cgit v1.2.3 From d7be87a687cc261d663dcf97c01056f71398f9f9 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:49 +0100 Subject: net-sysfs: move the rtnl unlock up in the xps show helpers Now that nr_ids and num_tc are stored in the xps dev_maps, which are RCU protected, we do not have the need to protect the maps in the rtnl lock. Move the rtnl unlock up so we reduce the rtnl locking section. We also increase the reference count on the subordinate device if any, as we don't want this device to be freed while we use it (now that the rtnl lock isn't protecting it in the whole function). Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ca1f3b63cfad..094fea082649 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1383,10 +1383,14 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, tc = netdev_txq_to_tc(dev, index); if (tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; + rtnl_unlock(); + return -EINVAL; } + /* Make sure the subordinate device can't be freed */ + get_device(&dev->dev); + rtnl_unlock(); + rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps[XPS_CPUS]); nr_ids = dev_maps ? dev_maps->nr_ids : nr_cpu_ids; @@ -1417,8 +1421,7 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } out_no_maps: rcu_read_unlock(); - - rtnl_unlock(); + put_device(&dev->dev); len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); @@ -1426,8 +1429,7 @@ out_no_maps: err_rcu_unlock: rcu_read_unlock(); -err_rtnl_unlock: - rtnl_unlock(); + put_device(&dev->dev); return ret; } @@ -1486,10 +1488,9 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) return restart_syscall(); tc = netdev_txq_to_tc(dev, index); - if (tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; - } + rtnl_unlock(); + if (tc < 0) + return -EINVAL; rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps[XPS_RXQS]); @@ -1522,8 +1523,6 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) out_no_maps: rcu_read_unlock(); - rtnl_unlock(); - len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); @@ -1531,8 +1530,6 @@ out_no_maps: err_rcu_unlock: rcu_read_unlock(); -err_rtnl_unlock: - rtnl_unlock(); return ret; } -- cgit v1.2.3 From 2db6cdaebac83c13acb165594b09282fa03cec89 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:50 +0100 Subject: net-sysfs: move the xps cpus/rxqs retrieval in a common function Most of the 
xps_cpus_show and xps_rxqs_show functions share the same logic. Having it in two different functions does not help maintenance. This patch moves their common logic into a new function, xps_queue_show, to improve this. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 125 ++++++++++++++++++++------------------------------- 1 file changed, 48 insertions(+), 77 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 094fea082649..562a42fcd437 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1361,44 +1361,27 @@ static const struct attribute_group dql_group = { #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS -static ssize_t xps_cpus_show(struct netdev_queue *queue, - char *buf) +static ssize_t xps_queue_show(struct net_device *dev, unsigned int index, + int tc, char *buf, enum xps_map_type type) { - struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; - unsigned int index, nr_ids; - int j, len, ret, tc = 0; unsigned long *mask; - - if (!netif_is_multiqueue(dev)) - return -ENOENT; - - index = get_netdev_queue_index(queue); - - if (!rtnl_trylock()) - return restart_syscall(); - - /* If queue belongs to subordinate dev use its map */ - dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; - - tc = netdev_txq_to_tc(dev, index); - if (tc < 0) { - rtnl_unlock(); - return -EINVAL; - } - - /* Make sure the subordinate device can't be freed */ - get_device(&dev->dev); - rtnl_unlock(); + unsigned int nr_ids; + int j, len; rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps[XPS_CPUS]); - nr_ids = dev_maps ? dev_maps->nr_ids : nr_cpu_ids; + dev_maps = rcu_dereference(dev->xps_maps[type]); + + /* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0 + * when dev_maps hasn't been allocated yet, to be backward compatible. + */ + nr_ids = dev_maps ? dev_maps->nr_ids : + (type == XPS_CPUS ? 
nr_cpu_ids : dev->num_rx_queues); mask = bitmap_zalloc(nr_ids, GFP_KERNEL); if (!mask) { - ret = -ENOMEM; - goto err_rcu_unlock; + rcu_read_unlock(); + return -ENOMEM; } if (!dev_maps || tc >= dev_maps->num_tc) @@ -1421,16 +1404,44 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } out_no_maps: rcu_read_unlock(); - put_device(&dev->dev); len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); + return len < PAGE_SIZE ? len : -EINVAL; +} + +static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) +{ + struct net_device *dev = queue->dev; + unsigned int index; + int len, tc; + + if (!netif_is_multiqueue(dev)) + return -ENOENT; + + index = get_netdev_queue_index(queue); + + if (!rtnl_trylock()) + return restart_syscall(); + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) { + rtnl_unlock(); + return -EINVAL; + } + + /* Make sure the subordinate device can't be freed */ + get_device(&dev->dev); + rtnl_unlock(); + + len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); -err_rcu_unlock: - rcu_read_unlock(); put_device(&dev->dev); - return ret; + return len; } static ssize_t xps_cpus_store(struct netdev_queue *queue, @@ -1477,10 +1488,8 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; - struct xps_dev_maps *dev_maps; - unsigned int index, nr_ids; - int j, len, ret, tc = 0; - unsigned long *mask; + unsigned int index; + int tc; index = get_netdev_queue_index(queue); @@ -1492,45 +1501,7 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) if (tc < 0) return -EINVAL; - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps[XPS_RXQS]); - nr_ids = dev_maps ? 
dev_maps->nr_ids : dev->num_rx_queues; - - mask = bitmap_zalloc(nr_ids, GFP_KERNEL); - if (!mask) { - ret = -ENOMEM; - goto err_rcu_unlock; - } - - if (!dev_maps || tc >= dev_maps->num_tc) - goto out_no_maps; - - for (j = 0; j < nr_ids; j++) { - int i, tci = j * dev_maps->num_tc + tc; - struct xps_map *map; - - map = rcu_dereference(dev_maps->attr_map[tci]); - if (!map) - continue; - - for (i = map->len; i--;) { - if (map->queues[i] == index) { - set_bit(j, mask); - break; - } - } - } -out_no_maps: - rcu_read_unlock(); - - len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); - bitmap_free(mask); - - return len < PAGE_SIZE ? len : -EINVAL; - -err_rcu_unlock: - rcu_read_unlock(); - return ret; + return xps_queue_show(dev, index, tc, buf, XPS_RXQS); } static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, -- cgit v1.2.3 From 2d05bf015308275f7c67a780f70026077285cfc0 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:51 +0100 Subject: net: fix use after free in xps When setting up a new dev_maps in __netif_set_xps_queue, we remove and free maps from unused CPUs/rx-queues near the end of the function, by calling remove_xps_queue. However it's possible those maps are also part of the old not-freed-yet dev_maps, which might be used concurrently. When that happens, a map can be freed while its corresponding entry in the old dev_maps table isn't NULLed, leading to: "BUG: KASAN: use-after-free" in different places. This fixes the map freeing logic for unused CPUs/rx-queues, to also NULL the map entries from the old dev_maps table. Signed-off-by: Antoine Tenart Signed-off-by: David S.
Miller --- net/core/dev.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c8ce2dfcc97d..d5f6ba209f1e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2460,7 +2460,7 @@ static DEFINE_MUTEX(xps_map_mutex); rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) static bool remove_xps_queue(struct xps_dev_maps *dev_maps, - int tci, u16 index) + struct xps_dev_maps *old_maps, int tci, u16 index) { struct xps_map *map = NULL; int pos; @@ -2479,6 +2479,8 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, break; } + if (old_maps) + RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; @@ -2499,7 +2501,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev, int i, j; for (i = count, j = offset; i--; j++) { - if (!remove_xps_queue(dev_maps, tci, j)) + if (!remove_xps_queue(dev_maps, NULL, tci, j)) break; } @@ -2631,7 +2633,7 @@ static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) { - struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; const unsigned long *online_mask = NULL; bool active = false, copy = false; int i, j, tci, numa_node_id = -2; @@ -2766,7 +2768,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, } } - kfree_rcu(dev_maps, rcu); + old_dev_maps = dev_maps; out_no_old_maps: dev_maps = new_dev_maps; @@ -2792,10 +2794,15 @@ out_no_new_maps: netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) continue; - active |= remove_xps_queue(dev_maps, tci, index); + active |= remove_xps_queue(dev_maps, + copy ? 
old_dev_maps : NULL, + tci, index); } } + if (old_dev_maps) + kfree_rcu(old_dev_maps, rcu); + /* free map if not active */ if (!active) reset_xps_maps(dev, dev_maps, type); -- cgit v1.2.3 From 75b2758abc355c410dd335d45b2d40f920e27cde Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:52 +0100 Subject: net: NULL the old xps map entries when freeing them In __netif_set_xps_queue, old map entries from the old dev_maps are freed but their corresponding entries in the old dev_maps aren't NULLed. Fix this. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d5f6ba209f1e..4961fc2e9b19 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2764,6 +2764,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, continue; } + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); } } -- cgit v1.2.3 From a835f9034efbb699f307575bead2607c2fbc93ac Mon Sep 17 00:00:00 2001 From: Xiong Zhenwu Date: Thu, 18 Mar 2021 04:52:13 -0700 Subject: /net/core/: fix misspellings using codespell tool A typo was found by the codespell tool on line 1734 of drop_monitor.c: $ codespell ./net/core/ ./net/core/drop_monitor.c:1734: guarnateed ==> guaranteed Fix a typo found by codespell. Signed-off-by: Xiong Zhenwu Signed-off-by: David S.
Miller --- net/core/drop_monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 571f191c06d9..1eb02c2236f2 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1731,7 +1731,7 @@ static void exit_net_drop_monitor(void) /* * Because of the module_get/put we do in the trace state change path - * we are guarnateed not to have any current users when we get here + * we are guaranteed not to have any current users when we get here */ for_each_possible_cpu(cpu) { -- cgit v1.2.3 From 919067cc845f323a80b6fe987b64238bd82d309e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Mar 2021 10:39:33 -0700 Subject: net: add CONFIG_PCPU_DEV_REFCNT I was working on a syzbot issue, claiming one device could not be dismantled because its refcount was -1 unregister_netdevice: waiting for sit0 to become free. Usage count = -1 It would be nice if syzbot could trigger a warning at the time this reference count became negative. This patch adds CONFIG_PCPU_DEV_REFCNT options which defaults to per cpu variables (as before this patch) on SMP builds. v2: free_dev label in alloc_netdev_mqs() is moved to avoid a compiler warning (-Wunused-label), as reported by kernel test robot Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 13 +++++++++++++ net/Kconfig | 8 ++++++++ net/core/dev.c | 10 ++++++++++ 3 files changed, 31 insertions(+) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4940509999be..8f003955c485 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2092,7 +2092,12 @@ struct net_device { u32 proto_down_reason; struct list_head todo_list; + +#ifdef CONFIG_PCPU_DEV_REFCNT int __percpu *pcpu_refcnt; +#else + refcount_t dev_refcnt; +#endif struct list_head link_watch_list; @@ -4044,7 +4049,11 @@ void netdev_run_todo(void); */ static inline void dev_put(struct net_device *dev) { +#ifdef CONFIG_PCPU_DEV_REFCNT this_cpu_dec(*dev->pcpu_refcnt); +#else + refcount_dec(&dev->dev_refcnt); +#endif } /** @@ -4055,7 +4064,11 @@ static inline void dev_put(struct net_device *dev) */ static inline void dev_hold(struct net_device *dev) { +#ifdef CONFIG_PCPU_DEV_REFCNT this_cpu_inc(*dev->pcpu_refcnt); +#else + refcount_inc(&dev->dev_refcnt); +#endif } /* Carrier loss detection, dial on demand. The functions netif_carrier_on diff --git a/net/Kconfig b/net/Kconfig index 0ead7ec0d2bd..9c456acc379e 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -245,6 +245,14 @@ source "net/l3mdev/Kconfig" source "net/qrtr/Kconfig" source "net/ncsi/Kconfig" +config PCPU_DEV_REFCNT + bool "Use percpu variables to maintain network device refcount" + depends on SMP + default y + help + network device refcount are using per cpu variables if this option is set. + This can be forced to N to detect underflows (with a performance drop). 
+ config RPS bool depends on SMP && SYSFS diff --git a/net/core/dev.c b/net/core/dev.c index 4961fc2e9b19..be941ed754ac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10312,11 +10312,15 @@ EXPORT_SYMBOL(register_netdev); int netdev_refcnt_read(const struct net_device *dev) { +#ifdef CONFIG_PCPU_DEV_REFCNT int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); return refcnt; +#else + return refcount_read(&dev->dev_refcnt); +#endif } EXPORT_SYMBOL(netdev_refcnt_read); @@ -10674,9 +10678,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; +#ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; +#endif if (dev_addr_init(dev)) goto free_pcpu; @@ -10740,8 +10746,10 @@ free_all: return NULL; free_pcpu: +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); free_dev: +#endif netdev_freemem(dev); return NULL; } @@ -10783,8 +10791,10 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; +#endif free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; -- cgit v1.2.3 From 5da9ace3405f40d8d93c1b519696f47bc4402318 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 22 Mar 2021 13:30:19 +0200 Subject: net: make xps_needed and xps_rxqs_needed static Since their introduction in commit 04157469b7b8 ("net: Use static_key for XPS maps"), xps_needed and xps_rxqs_needed were never used outside net/core/dev.c, so I don't really understand why they were exported as symbols in the first place. This is needed in order to silence a "make W=1" warning about these static keys not being declared as static variables, but not having a previous declaration in a header file nonetheless. 
Cc: Amritha Nambiar Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index be941ed754ac..ffab3928eeeb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2451,10 +2451,8 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS -struct static_key xps_needed __read_mostly; -EXPORT_SYMBOL(xps_needed); -struct static_key xps_rxqs_needed __read_mostly; -EXPORT_SYMBOL(xps_rxqs_needed); +static struct static_key xps_needed __read_mostly; +static struct static_key xps_rxqs_needed __read_mostly; static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) -- cgit v1.2.3 From 744b8376632208137fe4acc9967b93e2970732a3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 22 Mar 2021 13:31:48 +0200 Subject: net: move the ptype_all and ptype_base declarations to include/linux/netdevice.h ptype_all and ptype_base are declared in net/core/dev.c as non-static, because they are used by net-procfs.c too. However, a "make W=1" build complains that there was no previous declaration of ptype_all and ptype_base in a header file, so this way of declaring things constitutes a violation of coding style. Let's move the extern declarations of ptype_all and ptype_base to the linux/netdevice.h file, which is included by net-procfs.c too. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 3 +++ net/core/net-procfs.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 33b8ea08996e..e4a503288d9b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5336,6 +5336,9 @@ do { \ #define PTYPE_HASH_SIZE (16) #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) +extern struct list_head ptype_all __read_mostly; +extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; + extern struct net_device *blackhole_netdev; #endif /* _LINUX_NETDEVICE_H */ diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index c714e6a9dad4..d8b9dbabd4a4 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -10,9 +10,6 @@ #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) -extern struct list_head ptype_all __read_mostly; -extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; - static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); -- cgit v1.2.3 From 7f08ec6e04269ce53b664761c9108b44ed2f54ab Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 22 Mar 2021 16:43:29 +0100 Subject: net-sysfs: remove possible sleep from an RCU read-side critical section xps_queue_show is mostly made of an RCU read-side critical section and calls bitmap_zalloc with GFP_KERNEL in the middle of it. That is not allowed as this call may sleep and such behaviours aren't allowed in RCU read-side critical sections. Fix this by using GFP_NOWAIT instead. Fixes: 5478fcd0f483 ("net: embed nr_ids in the xps maps") Reported-by: kernel test robot Suggested-by: Matthew Wilcox Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 562a42fcd437..f6197774048b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1378,7 +1378,7 @@ static ssize_t xps_queue_show(struct net_device *dev, unsigned int index, nr_ids = dev_maps ? dev_maps->nr_ids : (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues); - mask = bitmap_zalloc(nr_ids, GFP_KERNEL); + mask = bitmap_zalloc(nr_ids, GFP_NOWAIT); if (!mask) { rcu_read_unlock(); return -ENOMEM; -- cgit v1.2.3 From add2d73631070c951b0de81a01d1463a15cfbd47 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Mar 2021 11:21:45 -0700 Subject: net: set initial device refcount to 1 When adding CONFIG_PCPU_DEV_REFCNT, I forgot that the initial net device refcount was 0. When CONFIG_PCPU_DEV_REFCNT is not set, this means the first dev_hold() triggers an illegal refcount operation (addition on 0) refcount_t: addition on 0; use-after-free. WARNING: CPU: 0 PID: 1 at lib/refcount.c:25 refcount_warn_saturate+0x128/0x1a4 Fix is to change initial (and final) refcount to be 1. Also add a missing kerneldoc piece, as reported by Stephen Rothwell. Fixes: 919067cc845f ("net: add CONFIG_PCPU_DEV_REFCNT") Signed-off-by: Eric Dumazet Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e4a503288d9b..7005ad80e8d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1792,6 +1792,7 @@ enum netdev_ml_priv_type { * * @proto_down_reason: reason a netdev interface is held down * @pcpu_refcnt: Number of references to this device + * @dev_refcnt: Number of references to this device * @todo_list: Delayed register/unregister * @link_watch_list: XXX: need comments on this one * diff --git a/net/core/dev.c b/net/core/dev.c index ffab3928eeeb..c9a496f5e687 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10346,7 +10346,7 @@ static void netdev_wait_allrefs(struct net_device *dev) rebroadcast_time = warning_time = jiffies; refcnt = netdev_refcnt_read(dev); - while (refcnt != 0) { + while (refcnt != 1) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); @@ -10383,7 +10383,7 @@ static void netdev_wait_allrefs(struct net_device *dev) refcnt = netdev_refcnt_read(dev); - if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) { + if (refcnt != 1 && time_after(jiffies, warning_time + 10 * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. 
Usage count = %d\n", dev->name, refcnt); warning_time = jiffies; @@ -10459,7 +10459,7 @@ void netdev_run_todo(void) netdev_wait_allrefs(dev); /* paranoia */ - BUG_ON(netdev_refcnt_read(dev)); + BUG_ON(netdev_refcnt_read(dev) != 1); BUG_ON(!list_empty(&dev->ptype_all)); BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); @@ -10680,6 +10680,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; + dev_hold(dev); +#else + refcount_set(&dev->dev_refcnt, 1); #endif if (dev_addr_init(dev)) -- cgit v1.2.3 From 5aa3afe107d9099fc0dea2acf82c3e3c8f0f20e2 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 23 Mar 2021 07:49:23 +0100 Subject: net: make unregister netdev warning timeout configurable netdev_wait_allrefs() issues a warning if refcount does not drop to 0 after 10 seconds. While 10 second wait generally should not happen under normal workload in normal environment, it seems to fire falsely very often during fuzzing and/or in qemu emulation (~10x slower). At least it's not possible to understand if it's really a false positive or not. Automated testing generally bumps all timeouts to very high values to avoid flake failures. Add net.core.netdev_unregister_timeout_secs sysctl to make the timeout configurable for automated testing systems. Lowering the timeout may also be useful for e.g. manual bisection. The default value matches the current behavior. Signed-off-by: Dmitry Vyukov Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=211877 Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. 
Miller --- Documentation/admin-guide/sysctl/net.rst | 11 +++++++++++ include/linux/netdevice.h | 1 + net/core/dev.c | 6 +++++- net/core/sysctl_net_core.c | 10 ++++++++++ 4 files changed, 27 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index f2ab8a5b6a4b..2090bfc69aa5 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -311,6 +311,17 @@ permit to distribute the load on several cpus. If set to 1 (default), timestamps are sampled as soon as possible, before queueing. +netdev_unregister_timeout_secs +------------------------------ + +Unregister network device timeout in seconds. +This option controls the timeout (in seconds) used to issue a warning while +waiting for a network device refcount to drop to 0 during device +unregistration. A lower value may be useful during bisection to detect +a leaked reference faster. A larger value may be useful to prevent false +warnings on slow/loaded systems. +Default value is 10, minimum 0, maximum 3600. 
+ optmem_max ---------- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7005ad80e8d1..5fa66db0cb5d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4661,6 +4661,7 @@ void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int netdev_max_backlog; extern int netdev_tstamp_prequeue; +extern int netdev_unregister_timeout_secs; extern int weight_p; extern int dev_weight_rx_bias; extern int dev_weight_tx_bias; diff --git a/net/core/dev.c b/net/core/dev.c index c9a496f5e687..515309573cb8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10322,6 +10322,8 @@ int netdev_refcnt_read(const struct net_device *dev) } EXPORT_SYMBOL(netdev_refcnt_read); +int netdev_unregister_timeout_secs __read_mostly = 10; + #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** @@ -10383,7 +10385,9 @@ static void netdev_wait_allrefs(struct net_device *dev) refcnt = netdev_refcnt_read(dev); - if (refcnt != 1 && time_after(jiffies, warning_time + 10 * HZ)) { + if (refcnt && + time_after(jiffies, warning_time + + netdev_unregister_timeout_secs * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. 
Usage count = %d\n", dev->name, refcnt); warning_time = jiffies; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 4567de519603..d84c8a1b280e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,6 +24,7 @@ static int two = 2; static int three = 3; +static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -570,6 +571,15 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, + { + .procname = "netdev_unregister_timeout_secs", + .data = &netdev_unregister_timeout_secs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &int_3600, + }, { } }; -- cgit v1.2.3 From ddb94eafab8b597b05904c8277194ea2d6357fa9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Mar 2021 02:30:32 +0100 Subject: net: resolve forwarding path from virtual netdevice and HW destination address This patch adds dev_fill_forward_path() which resolves the path to reach the real netdevice from the IP forwarding side. This function takes as input the netdevice and the destination hardware address and it walks down the devices calling .ndo_fill_forward_path() for each device until the real device is found. For instance, assuming the following topology: IP forwarding / \ br0 eth0 / \ eth1 eth2 . . . ethX ab:cd:ef:ab:cd:ef where eth1 and eth2 are bridge ports and eth0 provides WAN connectivity. ethX is the interface in another box which is connected to the eth1 bridge port. For packets going through IP forwarding to br0 whose destination MAC address is ab:cd:ef:ab:cd:ef, dev_fill_forward_path() provides the following path: br0 -> eth1 .ndo_fill_forward_path for br0 looks up at the FDB for the bridge port from the destination MAC address to get the bridge port eth1. 
This information allows to create a fast path that bypasses the classic bridge and IP forwarding paths, so packets go directly from the bridge port eth1 to eth0 (wan interface) and vice versa. fast path .------------------------. / \ | IP forwarding | | / \ \/ | br0 eth0 . / \ -> eth1 eth2 . . . ethX ab:cd:ef:ab:cd:ef Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netdevice.h | 27 +++++++++++++++++++++++++++ net/core/dev.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5fa66db0cb5d..03cff88c7292 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -848,6 +848,27 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); +enum net_device_path_type { + DEV_PATH_ETHERNET = 0, +}; + +struct net_device_path { + enum net_device_path_type type; + const struct net_device *dev; +}; + +#define NET_DEVICE_PATH_STACK_MAX 5 + +struct net_device_path_stack { + int num_paths; + struct net_device_path path[NET_DEVICE_PATH_STACK_MAX]; +}; + +struct net_device_path_ctx { + const struct net_device *dev; + const u8 *daddr; +}; + enum tc_setup_type { TC_SETUP_QDISC_MQPRIO, TC_SETUP_CLSU32, @@ -1282,6 +1303,8 @@ struct netdev_net_notifier { * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); * If a device is paired with a peer device, return the peer instance. * The caller must be under RCU read context. 
+ * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); + * Get the forwarding path to reach the real device from the HW destination address */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1488,6 +1511,8 @@ struct net_device_ops { int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); + int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, + struct net_device_path *path); }; /** @@ -2870,6 +2895,8 @@ void dev_remove_offload(struct packet_offload *po); int dev_get_iflink(const struct net_device *dev); int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, + struct net_device_path_stack *stack); struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags, unsigned short mask); struct net_device *dev_get_by_name(struct net *net, const char *name); diff --git a/net/core/dev.c b/net/core/dev.c index 515309573cb8..4bb6dcdbed8b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -848,6 +848,52 @@ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); +static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) +{ + int k = stack->num_paths++; + + if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) + return NULL; + + return &stack->path[k]; +} + +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, + struct net_device_path_stack *stack) +{ + const struct net_device *last_dev; + struct net_device_path_ctx ctx = { + .dev = dev, + .daddr = daddr, + }; + struct net_device_path *path; + int ret = 0; + + stack->num_paths = 0; + while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { + last_dev = ctx.dev; + path = dev_fwd_path(stack); + if (!path) + return -1; + + memset(path, 0, sizeof(struct 
net_device_path)); + ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); + if (ret < 0) + return -1; + + if (WARN_ON_ONCE(last_dev == ctx.dev)) + return -1; + } + path = dev_fwd_path(stack); + if (!path) + return -1; + path->type = DEV_PATH_ETHERNET; + path->dev = ctx.dev; + + return ret; +} +EXPORT_SYMBOL_GPL(dev_fill_forward_path); + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace -- cgit v1.2.3 From 897b9fae7a8ac1de2372a15234c944831c83ec26 Mon Sep 17 00:00:00 2001 From: Lu Wei Date: Thu, 25 Mar 2021 14:38:22 +0800 Subject: net: core: Fix a typo in dev_addr_lists.c Modify "funciton" to "function" in net/core/dev_addr_lists.c. Reported-by: Hulk Robot Signed-off-by: Lu Wei Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index fa1c37ec40c9..1e5bde241185 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -228,7 +228,7 @@ EXPORT_SYMBOL(__hw_addr_unsync); * @sync: function to call if address should be added * @unsync: function to call if address should be removed * - * This funciton is intended to be called from the ndo_set_rx_mode + * This function is intended to be called from the ndo_set_rx_mode * function of devices that require explicit address add/remove * notifications. The unsync function may be NULL in which case * the addresses requiring removal will simply be removed without -- cgit v1.2.3 From 6c996e19949b34d7edebed4f6b0511145c036404 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 25 Mar 2021 15:52:45 +0100 Subject: net: change netdev_unregister_timeout_secs min value to 1 netdev_unregister_timeout_secs=0 can lead to printing the "waiting for dev to become free" message every jiffy. This is too frequent and unnecessary. Set the min value to 1 second. 
Also fix the merge issue introduced by "net: make unregister netdev warning timeout configurable": it changed "refcnt != 1" to "refcnt". Signed-off-by: Dmitry Vyukov Suggested-by: Eric Dumazet Fixes: 5aa3afe107d9 ("net: make unregister netdev warning timeout configurable") Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/admin-guide/sysctl/net.rst | 2 +- net/core/dev.c | 2 +- net/core/sysctl_net_core.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 2090bfc69aa5..c941b214e0b7 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -320,7 +320,7 @@ waiting for a network device refcount to drop to 0 during device unregistration. A lower value may be useful during bisection to detect a leaked reference faster. A larger value may be useful to prevent false warnings on slow/loaded systems. -Default value is 10, minimum 0, maximum 3600. +Default value is 10, minimum 1, maximum 3600. optmem_max ---------- diff --git a/net/core/dev.c b/net/core/dev.c index 48b529d59157..b4c67a5be606 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10460,7 +10460,7 @@ static void netdev_wait_allrefs(struct net_device *dev) refcnt = netdev_refcnt_read(dev); - if (refcnt && + if (refcnt != 1 && time_after(jiffies, warning_time + netdev_unregister_timeout_secs * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. 
Usage count = %d\n", diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d84c8a1b280e..c8496c1142c9 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -577,7 +577,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = SYSCTL_ONE, .extra2 = &int_3600, }, { } -- cgit v1.2.3 From 7bd1590d4eba1583f6ee85e8cfe556505f761e19 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:52:52 -0700 Subject: bpf: selftests: Add kfunc_call test This patch adds a few kernel function bpf_kfunc_call_test*() for the selftest's test_run purpose. They will be allowed for tc_cls prog. The selftest calling the kernel function bpf_kfunc_call_test*() is also added in this patch. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210325015252.1551395-1-kafai@fb.com --- include/linux/bpf.h | 6 +++ net/bpf/test_run.c | 28 ++++++++++ net/core/filter.c | 1 + .../testing/selftests/bpf/prog_tests/kfunc_call.c | 59 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/kfunc_call_test.c | 47 +++++++++++++++++ .../selftests/bpf/progs/kfunc_call_test_subprog.c | 42 +++++++++++++++ 6 files changed, 183 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/kfunc_call.c create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test.c create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b5b7967e3ff3..9fdd839b418c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1532,6 +1532,7 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +bool bpf_prog_test_check_kfunc_call(u32 kfunc_id); bool btf_ctx_access(int off, int 
size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); @@ -1731,6 +1732,11 @@ static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, return -ENOTSUPP; } +static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id) +{ + return false; +} + static inline void bpf_map_put(struct bpf_map *map) { } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 4aabf71cd95d..a5d72c48fb66 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -2,6 +2,7 @@ /* Copyright (c) 2017 Facebook */ #include <linux/bpf.h> +#include <linux/btf_ids.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/etherdevice.h> @@ -213,10 +214,37 @@ int noinline bpf_modify_return_test(int a, int *b) *b += 1; return a + *b; } + +u64 noinline bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d) +{ + return a + b + c + d; +} + +int noinline bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b) +{ + return a + b; +} + +struct sock * noinline bpf_kfunc_call_test3(struct sock *sk) +{ + return sk; +} + __diag_pop(); ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); +BTF_SET_START(test_sk_kfunc_ids) +BTF_ID(func, bpf_kfunc_call_test1) +BTF_ID(func, bpf_kfunc_call_test2) +BTF_ID(func, bpf_kfunc_call_test3) +BTF_SET_END(test_sk_kfunc_ids) + +bool bpf_prog_test_check_kfunc_call(u32 kfunc_id) +{ + return btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id); +} + static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) { diff --git a/net/core/filter.c b/net/core/filter.c index 17dc159ec40c..cae56d08a670 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9813,6 +9813,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, + .check_kfunc_call = bpf_prog_test_check_kfunc_call, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c new file mode 100644 index 000000000000..7fc0951ee75f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <test_progs.h> +#include <network_helpers.h> +#include "kfunc_call_test.skel.h" +#include "kfunc_call_test_subprog.skel.h" + +static void test_main(void) +{ + struct kfunc_call_test *skel; + int prog_fd, retval, err; + + skel = kfunc_call_test__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + return; + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test1)"); + ASSERT_EQ(retval, 12, "test1-retval"); + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test2); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test2)"); + ASSERT_EQ(retval, 3, "test2-retval"); + + kfunc_call_test__destroy(skel); +} + +static void test_subprog(void) +{ + struct kfunc_call_test_subprog *skel; + int prog_fd, retval, err; + + skel = kfunc_call_test_subprog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + return; + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test1)"); + ASSERT_EQ(retval, 10, "test1-retval"); + ASSERT_NEQ(skel->data->active_res, -1, "active_res"); + ASSERT_EQ(skel->data->sk_state, BPF_TCP_CLOSE, "sk_state"); + + kfunc_call_test_subprog__destroy(skel); +} + +void test_kfunc_call(void) +{ + if (test__start_subtest("main")) + test_main(); + + if (test__start_subtest("subprog")) + test_subprog(); +} diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c new file mode 100644 index
000000000000..470f8723e463 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_tcp_helpers.h" + +extern int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym; +extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, + __u32 c, __u64 d) __ksym; + +SEC("classifier") +int kfunc_call_test2(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + return bpf_kfunc_call_test2((struct sock *)sk, 1, 2); +} + +SEC("classifier") +int kfunc_call_test1(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + __u64 a = 1ULL << 32; + __u32 ret; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + a = bpf_kfunc_call_test1((struct sock *)sk, 1, a | 2, 3, a | 4); + ret = a >> 32; /* ret should be 2 */ + ret += (__u32)a; /* ret should be 12 */ + + return ret; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c new file mode 100644 index 000000000000..b2dcb7d9cb03 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_tcp_helpers.h" + +extern const int bpf_prog_active __ksym; +extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, + __u32 c, __u64 d) __ksym; +extern struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym; +int active_res = -1; +int sk_state = -1; + +int __noinline f1(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + int *active; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active,
bpf_get_smp_processor_id()); + if (active) + active_res = *active; + + sk_state = bpf_kfunc_call_test3((struct sock *)sk)->__sk_common.skc_state; + + return (__u32)bpf_kfunc_call_test1((struct sock *)sk, 1, 2, 3, 4); +} + +SEC("classifier") +int kfunc_call_test1(struct __sk_buff *skb) +{ + return f1(skb); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From af825087433fb94a431edf387c1265463fce3bd1 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Sat, 27 Mar 2021 16:15:50 +0800 Subject: net: core: Correct function name dev_uc_flush() in the kerneldoc Fix the following W=1 kernel build warning(s): net/core/dev_addr_lists.c:732: warning: expecting prototype for dev_uc_flush(). Prototype was for dev_uc_init() instead Reported-by: Hulk Robot Signed-off-by: Xiongfeng Wang Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 1e5bde241185..45ae6eeb2964 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -723,7 +723,7 @@ void dev_uc_flush(struct net_device *dev) EXPORT_SYMBOL(dev_uc_flush); /** - * dev_uc_flush - Init unicast address list + * dev_uc_init - Init unicast address list * @dev: device * * Init unicast address list. -- cgit v1.2.3 From bb2882bc6c54672b4c57a2108a18ec3acc7c878c Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Sat, 27 Mar 2021 16:15:51 +0800 Subject: net: core: Correct function name netevent_unregister_notifier() in the kerneldoc Fix the following W=1 kernel build warning(s): net/core/netevent.c:45: warning: expecting prototype for netevent_unregister_notifier(). Prototype was for unregister_netevent_notifier() instead Reported-by: Hulk Robot Signed-off-by: Xiongfeng Wang Signed-off-by: David S. 
Miller --- net/core/netevent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netevent.c b/net/core/netevent.c index d76ed7739c70..5bb615e963cc 100644 --- a/net/core/netevent.c +++ b/net/core/netevent.c @@ -32,7 +32,7 @@ int register_netevent_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(register_netevent_notifier); /** - * netevent_unregister_notifier - unregister a netevent notifier block + * unregister_netevent_notifier - unregister a netevent notifier block * @nb: notifier * * Unregister a notifier previously registered by -- cgit v1.2.3 From 37f0e514db660f03f8982b8f4fbbd4b2740abe7d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:22 -0700 Subject: skmsg: Lock ingress_skb when purging Currently we purge the ingress_skb queue only when psock refcnt goes down to 0, so locking the queue is not necessary, but in order to be called during ->close, we have to lock it here. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-2-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 07f54015238a..bebf84ed4e30 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -634,7 +634,7 @@ static void sk_psock_zap_ingress(struct sk_psock *psock) { struct sk_buff *skb; - while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { + while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { skb_bpf_redirect_clear(skb); kfree_skb(skb); } -- cgit v1.2.3 From b01fd6e802b6d0a635176f943315670b679d8d7b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:23 -0700 Subject: skmsg: Introduce a spinlock to protect ingress_msg Currently we rely on lock_sock to protect ingress_msg, it is too big for this, we can actually just use a spinlock to protect this list 
like protecting other skb queues. __tcp_bpf_recvmsg() is still special because of peeking, it still has to use lock_sock. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-3-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/skmsg.c | 3 +++ net/ipv4/tcp_bpf.c | 18 ++++++------------ 3 files changed, 55 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 6c09d94be2e9..f2d45a73b2b2 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -89,6 +89,7 @@ struct sk_psock { #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; + spinlock_t ingress_lock; unsigned long state; struct list_head link; spinlock_t link_lock; @@ -284,7 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk) static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { + spin_lock_bh(&psock->ingress_lock); list_add_tail(&msg->list, &psock->ingress_msg); + spin_unlock_bh(&psock->ingress_lock); +} + +static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) +{ + struct sk_msg *msg; + + spin_lock_bh(&psock->ingress_lock); + msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + if (msg) + list_del(&msg->list); + spin_unlock_bh(&psock->ingress_lock); + return msg; +} + +static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) +{ + struct sk_msg *msg; + + spin_lock_bh(&psock->ingress_lock); + msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + spin_unlock_bh(&psock->ingress_lock); + return msg; +} + +static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock, + struct sk_msg *msg) +{ + struct sk_msg *ret; + + spin_lock_bh(&psock->ingress_lock); + if (list_is_last(&msg->list, &psock->ingress_msg)) + ret = NULL; + 
else + ret = list_next_entry(msg, list); + spin_unlock_bh(&psock->ingress_lock); + return ret; } static inline bool sk_psock_queue_empty(const struct sk_psock *psock) @@ -292,6 +331,13 @@ static inline bool sk_psock_queue_empty(const struct sk_psock *psock) return psock ? list_empty(&psock->ingress_msg) : true; } +static inline void kfree_sk_msg(struct sk_msg *msg) +{ + if (msg->skb) + consume_skb(msg->skb); + kfree(msg); +} + static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index bebf84ed4e30..305dddc51857 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -592,6 +592,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) INIT_WORK(&psock->work, sk_psock_backlog); INIT_LIST_HEAD(&psock->ingress_msg); + spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); @@ -638,7 +639,9 @@ static void sk_psock_zap_ingress(struct sk_psock *psock) skb_bpf_redirect_clear(skb); kfree_skb(skb); } + spin_lock_bh(&psock->ingress_lock); __sk_psock_purge_ingress_msg(psock); + spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_link_destroy(struct sk_psock *psock) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 17c322b875fd..ae980716d896 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -18,9 +18,7 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg_rx; int i, copied = 0; - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - + msg_rx = sk_psock_peek_msg(psock); while (copied != len) { struct scatterlist *sge; @@ -68,22 +66,18 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, } while (i != msg_rx->sg.end); if (unlikely(peek)) { - if (msg_rx == list_last_entry(&psock->ingress_msg, - struct sk_msg, list)) + msg_rx = sk_psock_next_msg(psock, msg_rx); + if (!msg_rx) break; - msg_rx = 
list_next_entry(msg_rx, list); continue; } msg_rx->sg.start = i; if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { - list_del(&msg_rx->list); - if (msg_rx->skb) - consume_skb(msg_rx->skb); - kfree(msg_rx); + msg_rx = sk_psock_dequeue_msg(psock); + kfree_sk_msg(msg_rx); } - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); + msg_rx = sk_psock_peek_msg(psock); } return copied; -- cgit v1.2.3 From 0739cd28f2645e814586c7536ba5da9825cb8029 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:24 -0700 Subject: net: Introduce skb_send_sock() for sock_map We only have skb_send_sock_locked() which requires callers to use lock_sock(). Introduce a variant skb_send_sock() which locks on its own, callers do not need to lock it any more. This will save us from adding a ->sendmsg_locked for each protocol. To reuse the code, pass function pointers to __skb_send_sock() and build skb_send_sock() and skb_send_sock_locked() on top. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210331023237.41094-4-xiyou.wangcong@gmail.com --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 55 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 49 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c8def85fcc22..dbf820a50a39 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3626,6 +3626,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, diff --git a/net/core/skbuff.c 
b/net/core/skbuff.c index e8320b5d651a..3ad9e8425ab2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2500,9 +2500,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, } EXPORT_SYMBOL_GPL(skb_splice_bits); -/* Send skb data on a socket. Socket must be locked. */ -int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, - int len) +static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size) +{ + struct socket *sock = sk->sk_socket; + + if (!sock) + return -EINVAL; + return kernel_sendmsg(sock, msg, vec, num, size); +} + +static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + struct socket *sock = sk->sk_socket; + + if (!sock) + return -EINVAL; + return kernel_sendpage(sock, page, offset, size, flags); +} + +typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size); +typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, + size_t size, int flags); +static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, + int len, sendmsg_func sendmsg, sendpage_func sendpage) { unsigned int orig_len = len; struct sk_buff *head = skb; @@ -2522,7 +2545,8 @@ do_frag_list: memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT; - ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen); + ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, + sendmsg_unlocked, sk, &msg, &kv, 1, slen); if (ret <= 0) goto error; @@ -2553,9 +2577,11 @@ do_frag_list: slen = min_t(size_t, len, skb_frag_size(frag) - offset); while (slen) { - ret = kernel_sendpage_locked(sk, skb_frag_page(frag), - skb_frag_off(frag) + offset, - slen, MSG_DONTWAIT); + ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, + sendpage_unlocked, sk, + skb_frag_page(frag), + skb_frag_off(frag) + offset, + slen, MSG_DONTWAIT); if (ret <= 0) goto error; @@ -2587,8 +2613,23 @@ out: 
error: return orig_len == len ? ret : orig_len - len; } + +/* Send skb data on a socket. Socket must be locked. */ +int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, + int len) +{ + return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, + kernel_sendpage_locked); +} EXPORT_SYMBOL_GPL(skb_send_sock_locked); +/* Send skb data on a socket. Socket must be unlocked. */ +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, + sendpage_unlocked); +} + /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer -- cgit v1.2.3 From 799aa7f98d53e0f541fa6b4dc9aa47b4ff2178e3 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:25 -0700 Subject: skmsg: Avoid lock_sock() in sk_psock_backlog() We do not have to lock the sock to avoid losing sk_socket, instead we can purge all the ingress queues when we close the socket. Sending or receiving packets after orphaning socket makes no sense. We do purge these queues when psock refcnt reaches zero but here we want to purge them explicitly in sock_map_close(). There are also some nasty race conditions on testing bit SK_PSOCK_TX_ENABLED and queuing/canceling the psock work, we can expand psock->ingress_lock a bit to protect them too. As noticed by John, we still have to lock the psock->work, because the same work item could be running concurrently on different CPU's. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-5-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 ++ net/core/skmsg.c | 50 ++++++++++++++++++++++++++++++++++---------------- net/core/sock_map.c | 1 + 3 files changed, 37 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index f2d45a73b2b2..7382c4b518d7 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -99,6 +99,7 @@ struct sk_psock { void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); struct proto *sk_proto; + struct mutex work_mutex; struct sk_psock_work_state work_state; struct work_struct work; union { @@ -347,6 +348,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err) } struct sk_psock *sk_psock_init(struct sock *sk, int node); +void sk_psock_stop(struct sk_psock *psock, bool wait); #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 305dddc51857..9c25020086a9 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -497,7 +497,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; - return skb_send_sock_locked(psock->sk, skb, off, len); + return skb_send_sock(psock->sk, skb, off, len); } return sk_psock_skb_ingress(psock, skb); } @@ -511,8 +511,7 @@ static void sk_psock_backlog(struct work_struct *work) u32 len, off; int ret; - /* Lock sock to avoid losing sk_socket during loop. 
*/ - lock_sock(psock->sk); + mutex_lock(&psock->work_mutex); if (state->skb) { skb = state->skb; len = state->len; @@ -529,7 +528,7 @@ start: skb_bpf_redirect_clear(skb); do { ret = -EIO; - if (likely(psock->sk->sk_socket)) + if (!sock_flag(psock->sk, SOCK_DEAD)) ret = sk_psock_handle_skb(psock, skb, off, len, ingress); if (ret <= 0) { @@ -553,7 +552,7 @@ start: kfree_skb(skb); } end: - release_sock(psock->sk); + mutex_unlock(&psock->work_mutex); } struct sk_psock *sk_psock_init(struct sock *sk, int node) @@ -591,6 +590,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) spin_lock_init(&psock->link_lock); INIT_WORK(&psock->work, sk_psock_backlog); + mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); @@ -631,7 +631,7 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) } } -static void sk_psock_zap_ingress(struct sk_psock *psock) +static void __sk_psock_zap_ingress(struct sk_psock *psock) { struct sk_buff *skb; @@ -639,9 +639,7 @@ static void sk_psock_zap_ingress(struct sk_psock *psock) skb_bpf_redirect_clear(skb); kfree_skb(skb); } - spin_lock_bh(&psock->ingress_lock); __sk_psock_purge_ingress_msg(psock); - spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_link_destroy(struct sk_psock *psock) @@ -654,6 +652,18 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } +void sk_psock_stop(struct sk_psock *psock, bool wait) +{ + spin_lock_bh(&psock->ingress_lock); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + sk_psock_cork_free(psock); + __sk_psock_zap_ingress(psock); + spin_unlock_bh(&psock->ingress_lock); + + if (wait) + cancel_work_sync(&psock->work); +} + static void sk_psock_done_strp(struct sk_psock *psock); static void sk_psock_destroy_deferred(struct work_struct *gc) @@ -665,12 +675,12 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) sk_psock_done_strp(psock); cancel_work_sync(&psock->work); + 
mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); sk_psock_link_destroy(psock); sk_psock_cork_free(psock); - sk_psock_zap_ingress(psock); if (psock->sk_redir) sock_put(psock->sk_redir); @@ -688,8 +698,7 @@ static void sk_psock_destroy(struct rcu_head *rcu) void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { - sk_psock_cork_free(psock); - sk_psock_zap_ingress(psock); + sk_psock_stop(psock, false); write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); @@ -699,7 +708,6 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) else if (psock->progs.stream_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); call_rcu(&psock->rcu, sk_psock_destroy); } @@ -770,14 +778,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) * error that caused the pipe to break. We can't send a packet on * a socket that is in this state so we drop the skb. */ - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { + kfree_skb(skb); + return; + } + spin_lock_bh(&psock_other->ingress_lock); + if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + spin_unlock_bh(&psock_other->ingress_lock); kfree_skb(skb); return; } skb_queue_tail(&psock_other->ingress_skb, skb); schedule_work(&psock_other->work); + spin_unlock_bh(&psock_other->ingress_lock); } static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) @@ -845,8 +859,12 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, err = sk_psock_skb_ingress_self(psock, skb); } if (err < 0) { - skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); + spin_lock_bh(&psock->ingress_lock); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + } + 
spin_unlock_bh(&psock->ingress_lock); } break; case __SK_REDIRECT: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index dd53a7771d7e..e564fdeaada1 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1540,6 +1540,7 @@ void sock_map_close(struct sock *sk, long timeout) saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); + sk_psock_stop(psock, true); release_sock(sk); saved_close(sk, timeout); } -- cgit v1.2.3 From 7786dfc41a74e0567557b5c4a28fc8482f5f5691 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:26 -0700 Subject: skmsg: Use rcu work for destroying psock The RCU callback sk_psock_destroy() only queues work psock->gc, so we can just switch to rcu work to simplify the code. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-6-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 5 +---- net/core/skmsg.c | 17 +++++------------ 2 files changed, 6 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 7382c4b518d7..e7aba150539d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -102,10 +102,7 @@ struct sk_psock { struct mutex work_mutex; struct sk_psock_work_state work_state; struct work_struct work; - union { - struct rcu_head rcu; - struct work_struct gc; - }; + struct rcu_work rwork; }; int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9c25020086a9..d43d43905d2c 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -666,10 +666,10 @@ void sk_psock_stop(struct sk_psock *psock, bool wait) static void sk_psock_done_strp(struct sk_psock *psock); -static void sk_psock_destroy_deferred(struct work_struct *gc) +static void sk_psock_destroy(struct work_struct *work) { - struct sk_psock *psock = container_of(gc, struct sk_psock, gc); - + struct sk_psock *psock = 
container_of(to_rcu_work(work), + struct sk_psock, rwork); /* No sk_callback_lock since already detached. */ sk_psock_done_strp(psock); @@ -688,14 +688,6 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) kfree(psock); } -static void sk_psock_destroy(struct rcu_head *rcu) -{ - struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); - - INIT_WORK(&psock->gc, sk_psock_destroy_deferred); - schedule_work(&psock->gc); -} - void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { sk_psock_stop(psock, false); @@ -709,7 +701,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - call_rcu(&psock->rcu, sk_psock_destroy); + INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); + queue_rcu_work(system_wq, &psock->rwork); } EXPORT_SYMBOL_GPL(sk_psock_drop); -- cgit v1.2.3 From 190179f65ba8bc18dc1d38435b7932505ca5544f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:27 -0700 Subject: skmsg: Use GFP_KERNEL in sk_psock_create_ingress_msg() This function is only called in process context. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-7-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d43d43905d2c..656eceab73bc 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -410,7 +410,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, if (!sk_rmem_schedule(sk, skb, skb->truesize)) return NULL; - msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); + msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL); if (unlikely(!msg)) return NULL; -- cgit v1.2.3 From 2004fdbd8a2b56757691717639f86d0eea3ab5b4 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:28 -0700 Subject: sock_map: Simplify sock_map_link() a bit sock_map_link() passes down map progs, but it is confusing to see both map progs and psock progs. Make the map progs more obvious by retrieving it directly with sock_map_progs() inside sock_map_link(). Now it is aligned with sock_map_link_no_progs() too. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-8-xiyou.wangcong@gmail.com --- net/core/sock_map.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index e564fdeaada1..d06face0f16c 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -26,6 +26,7 @@ struct bpf_stab { static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which); +static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { @@ -224,10 +225,10 @@ out: return psock; } -static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, - struct sock *sk) +static int sock_map_link(struct bpf_map *map, struct sock *sk) { struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; + struct sk_psock_progs *progs = sock_map_progs(map); struct sk_psock *psock; int ret; @@ -492,7 +493,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, * and sk_write_space callbacks overridden. */ if (sock_map_redirect_allowed(sk)) - ret = sock_map_link(map, &stab->progs, sk); + ret = sock_map_link(map, sk); else ret = sock_map_link_no_progs(map, sk); if (ret < 0) @@ -1004,7 +1005,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, * and sk_write_space callbacks overridden. */ if (sock_map_redirect_allowed(sk)) - ret = sock_map_link(map, &htab->progs, sk); + ret = sock_map_link(map, sk); else ret = sock_map_link_no_progs(map, sk); if (ret < 0) -- cgit v1.2.3 From b017055255d620b365299c3824610e0098414664 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:29 -0700 Subject: sock_map: Kill sock_map_link_no_progs() Now we can fold sock_map_link_no_progs() into sock_map_link() and get rid of sock_map_link_no_progs(). 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-9-xiyou.wangcong@gmail.com --- net/core/sock_map.c | 55 +++++++++++++++-------------------------------------- 1 file changed, 15 insertions(+), 40 deletions(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index d06face0f16c..42d797291d34 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -225,13 +225,24 @@ out: return psock; } +static bool sock_map_redirect_allowed(const struct sock *sk); + static int sock_map_link(struct bpf_map *map, struct sock *sk) { - struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog *stream_verdict = NULL; + struct bpf_prog *stream_parser = NULL; + struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; + /* Only sockets we can redirect into/from in BPF need to hold + * refs to parser/verdict progs and have their sk_data_ready + * and sk_write_space callbacks overridden. 
+ */ + if (!sock_map_redirect_allowed(sk)) + goto no_progs; + stream_verdict = READ_ONCE(progs->stream_verdict); if (stream_verdict) { stream_verdict = bpf_prog_inc_not_zero(stream_verdict); @@ -257,6 +268,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } } +no_progs: psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); @@ -316,27 +328,6 @@ out_put_stream_verdict: return ret; } -static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk) -{ - struct sk_psock *psock; - int ret; - - psock = sock_map_psock_get_checked(sk); - if (IS_ERR(psock)) - return PTR_ERR(psock); - - if (!psock) { - psock = sk_psock_init(sk, map->numa_node); - if (IS_ERR(psock)) - return PTR_ERR(psock); - } - - ret = sock_map_init_proto(sk, psock); - if (ret < 0) - sk_psock_put(sk, psock); - return ret; -} - static void sock_map_free(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); @@ -467,8 +458,6 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) return 0; } -static bool sock_map_redirect_allowed(const struct sock *sk); - static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { @@ -488,14 +477,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, if (!link) return -ENOMEM; - /* Only sockets we can redirect into/from in BPF need to hold - * refs to parser/verdict progs and have their sk_data_ready - * and sk_write_space callbacks overridden. - */ - if (sock_map_redirect_allowed(sk)) - ret = sock_map_link(map, sk); - else - ret = sock_map_link_no_progs(map, sk); + ret = sock_map_link(map, sk); if (ret < 0) goto out_free; @@ -1000,14 +982,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, if (!link) return -ENOMEM; - /* Only sockets we can redirect into/from in BPF need to hold - * refs to parser/verdict progs and have their sk_data_ready - * and sk_write_space callbacks overridden. 
- */ - if (sock_map_redirect_allowed(sk)) - ret = sock_map_link(map, sk); - else - ret = sock_map_link_no_progs(map, sk); + ret = sock_map_link(map, sk); if (ret < 0) goto out_free; -- cgit v1.2.3 From a7ba4558e69a3c2ae4ca521f015832ef44799538 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:30 -0700 Subject: sock_map: Introduce BPF_SK_SKB_VERDICT Reusing BPF_SK_SKB_STREAM_VERDICT is possible but its name is confusing and more importantly we still want to distinguish them from user-space. So we can just reuse the stream verdict code but introduce a new type of eBPF program, skb_verdict. Users are not allowed to attach stream_verdict and skb_verdict programs to the same map. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-10-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 1 + net/core/skmsg.c | 4 +++- net/core/sock_map.c | 28 ++++++++++++++++++++++++++++ tools/bpf/bpftool/common.c | 1 + tools/bpf/bpftool/prog.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 8 files changed, 38 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e7aba150539d..c83dbc2d81d9 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -58,6 +58,7 @@ struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; + struct bpf_prog *skb_verdict; }; enum sk_psock_state_bits { @@ -487,6 +488,7 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs) psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); + psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 
598716742593..49371eba98ba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9603de81811a..6428634da57e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2948,6 +2948,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_SK_MSG; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: + case BPF_SK_SKB_VERDICT: return BPF_PROG_TYPE_SK_SKB; case BPF_LIRC_MODE2: return BPF_PROG_TYPE_LIRC_MODE2; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 656eceab73bc..a045812d7c78 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -697,7 +697,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) rcu_assign_sk_user_data(sk, NULL); if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.stream_verdict) + else if (psock->progs.stream_verdict || psock->progs.skb_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); @@ -1024,6 +1024,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, } skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.stream_verdict); + if (!prog) + prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 42d797291d34..c2a0411e08a8 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -156,6 +156,8 @@ static void sock_map_del_link(struct sock *sk, strp_stop = true; if (psock->saved_data_ready && stab->progs.stream_verdict) verdict_stop = true; + if (psock->saved_data_ready && stab->progs.skb_verdict) + verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } @@ -232,6 +234,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) struct 
sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog *stream_verdict = NULL; struct bpf_prog *stream_parser = NULL; + struct bpf_prog *skb_verdict = NULL; struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; @@ -268,6 +271,15 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } } + skb_verdict = READ_ONCE(progs->skb_verdict); + if (skb_verdict) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) { + ret = PTR_ERR(skb_verdict); + goto out_put_msg_parser; + } + } + no_progs: psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { @@ -278,6 +290,9 @@ no_progs: if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || + (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || + (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; @@ -309,6 +324,9 @@ no_progs: } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); + } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; @@ -317,6 +335,9 @@ out_unlock_drop: out_drop: sk_psock_put(sk, psock); out_progs: + if (skb_verdict) + bpf_prog_put(skb_verdict); +out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: @@ -1442,8 +1463,15 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, break; #endif case BPF_SK_SKB_STREAM_VERDICT: + if (progs->skb_verdict) + return -EBUSY; pprog = &progs->stream_verdict; break; + case BPF_SK_SKB_VERDICT: + if (progs->stream_verdict) + return -EBUSY; + pprog = 
&progs->skb_verdict; + break; default: return -EOPNOTSUPP; } diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 65303664417e..1828bba19020 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -57,6 +57,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", [BPF_LIRC_MODE2] = "lirc_mode2", [BPF_FLOW_DISSECTOR] = "flow_dissector", diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index f2b915b20546..3f067d2d7584 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -76,6 +76,7 @@ enum dump_mode { static const char * const attach_type_strings[] = { [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", + [BPF_SK_SKB_VERDICT] = "skb_verdict", [BPF_SK_MSG_VERDICT] = "msg_verdict", [BPF_FLOW_DISSECTOR] = "flow_dissector", [__MAX_BPF_ATTACH_TYPE] = NULL, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ab9f2233607c..69902603012c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 8a59f9d1e3d4340659fdfee8879dc09a6f2546e1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:31 -0700 Subject: sock: Introduce sk->sk_prot->psock_update_sk_prot() Currently sockmap calls into each protocol to update the struct proto and replace it. This certainly won't work when the protocol is implemented as a module, for example, AF_UNIX. Introduce a new ops sk->sk_prot->psock_update_sk_prot(), so each protocol can implement its own way to replace the struct proto. This also helps get rid of symbol dependencies on CONFIG_INET. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-11-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 18 +++--------------- include/net/sock.h | 3 +++ include/net/tcp.h | 1 + include/net/udp.h | 1 + net/core/skmsg.c | 5 ----- net/core/sock_map.c | 24 ++++-------------------- net/ipv4/tcp_bpf.c | 24 +++++++++++++++++++++--- net/ipv4/tcp_ipv4.c | 3 +++ net/ipv4/udp.c | 3 +++ net/ipv4/udp_bpf.c | 15 +++++++++++++-- net/ipv6/tcp_ipv6.c | 3 +++ net/ipv6/udp.c | 3 +++ 12 files changed, 58 insertions(+), 45 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c83dbc2d81d9..5e800ddc2dc6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -99,6 +99,7 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); + int (*psock_update_sk_prot)(struct sock *sk, bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; @@ -395,25 +396,12 @@ static inline void sk_psock_cork_free(struct sk_psock *psock) } } -static inline void sk_psock_update_proto(struct sock *sk, - struct sk_psock *psock, - struct proto *ops) -{ - /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, ops); -} - static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { sk->sk_prot->unhash = psock->saved_unhash; - if (inet_csk_has_ulp(sk)) { - tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); - } else { - sk->sk_write_space = psock->saved_write_space; - /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, psock->sk_proto); - } + if (psock->psock_update_sk_prot) + psock->psock_update_sk_prot(sk, true); } static inline void sk_psock_set_state(struct sk_psock *psock, diff --git a/include/net/sock.h b/include/net/sock.h index 0b6266fd6bf6..8b4155e756c2 100644 --- 
a/include/net/sock.h +++ b/include/net/sock.h @@ -1184,6 +1184,9 @@ struct proto { void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); +#ifdef CONFIG_BPF_SYSCALL + int (*psock_update_sk_prot)(struct sock *sk, bool restore); +#endif /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS diff --git a/include/net/tcp.h b/include/net/tcp.h index 075de26f449d..2efa4e5ea23d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2203,6 +2203,7 @@ struct sk_psock; #ifdef CONFIG_BPF_SYSCALL struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +int tcp_bpf_update_proto(struct sock *sk, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ diff --git a/include/net/udp.h b/include/net/udp.h index d4d064c59232..df7cc1edc200 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -518,6 +518,7 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, #ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +int udp_bpf_update_proto(struct sock *sk, bool restore); #endif #endif /* _UDP_H */ diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a045812d7c78..9fc83f7cc1a0 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -562,11 +562,6 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) write_lock_bh(&sk->sk_callback_lock); - if (inet_csk_has_ulp(sk)) { - psock = ERR_PTR(-EINVAL); - goto out; - } - if (sk->sk_user_data) { psock = ERR_PTR(-EBUSY); goto out; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index c2a0411e08a8..2915c7c8778b 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -185,26 +185,10 @@ static void sock_map_unref(struct sock *sk, void *link_raw) static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { - struct proto *prot; - - switch (sk->sk_type) { - case SOCK_STREAM: - prot = 
tcp_bpf_get_proto(sk, psock); - break; - - case SOCK_DGRAM: - prot = udp_bpf_get_proto(sk, psock); - break; - - default: + if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; - } - - if (IS_ERR(prot)) - return PTR_ERR(prot); - - sk_psock_update_proto(sk, psock, prot); - return 0; + psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; + return sk->sk_prot->psock_update_sk_prot(sk, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) @@ -556,7 +540,7 @@ static bool sock_map_redirect_allowed(const struct sock *sk) static bool sock_map_sk_is_suitable(const struct sock *sk) { - return sk_is_tcp(sk) || sk_is_udp(sk); + return !!sk->sk_prot->psock_update_sk_prot; } static bool sock_map_sk_state_allowed(const struct sock *sk) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ae980716d896..ac8cfbaeacd2 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -595,20 +595,38 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int tcp_bpf_update_proto(struct sock *sk, bool restore) { + struct sk_psock *psock = sk_psock(sk); int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? 
TCP_BPF_TX : TCP_BPF_BASE; + if (restore) { + if (inet_csk_has_ulp(sk)) { + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); + } else { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + } + return 0; + } + + if (inet_csk_has_ulp(sk)) + return -EINVAL; + if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) - return ERR_PTR(-EINVAL); + return -EINVAL; tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } - return &tcp_bpf_prots[family][config]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); + return 0; } +EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); /* If a child got cloned from a listening socket that had tcp_bpf * protocol callbacks installed, we need to restore the callbacks to diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index daad4f99db32..dfc6d1c0e710 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2806,6 +2806,9 @@ struct proto tcp_prot = { .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = tcp_bpf_update_proto, +#endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4a0478b17243..38952aaee3a1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2849,6 +2849,9 @@ struct proto udp_prot = { .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = udp_bpf_update_proto, +#endif .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 7a94791efc1a..6001f93cd3a0 100644 --- a/net/ipv4/udp_bpf.c +++ 
b/net/ipv4/udp_bpf.c @@ -41,12 +41,23 @@ static int __init udp_bpf_v4_build_proto(void) } core_initcall(udp_bpf_v4_build_proto); -struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int udp_bpf_update_proto(struct sock *sk, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; + struct sk_psock *psock = sk_psock(sk); + + if (restore) { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + return 0; + } if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); - return &udp_bpf_prots[family]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); + return 0; } +EXPORT_SYMBOL_GPL(udp_bpf_update_proto); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d0f007741e8e..bff22d6ef516 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2139,6 +2139,9 @@ struct proto tcpv6_prot = { .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = tcp_bpf_update_proto, +#endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d25e5a9252fd..ef2c75bb4771 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1713,6 +1713,9 @@ struct proto udpv6_prot = { .unhash = udp_lib_unhash, .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = udp_bpf_update_proto, +#endif .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), -- cgit v1.2.3 From 2bc793e3272a13e337416c057cb81c5396ad91d1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:33 -0700 Subject: skmsg: Extract __tcp_bpf_recvmsg() and 
tcp_bpf_wait_data() Although these two functions are only used by TCP, they are not specific to TCP at all, both operate on skmsg and ingress_msg, so fit in net/core/skmsg.c very well. And we will need them for non-TCP, so rename and move them to skmsg.c and export them to modules. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-13-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 4 ++ include/net/tcp.h | 2 - net/core/skmsg.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_bpf.c | 100 +------------------------------------------------- net/tls/tls_sw.c | 4 +- 5 files changed, 106 insertions(+), 102 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 5e800ddc2dc6..f78e90a04a69 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -125,6 +125,10 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); +int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, + long timeo, int *err); +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 2efa4e5ea23d..31b1696c62ba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2209,8 +2209,6 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9fc83f7cc1a0..92a83c02562a 100644 
--- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -399,6 +399,104 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); +int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, + long timeo, int *err) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_wait_data); + +/* Receive sk_msg from psock->ingress_msg to @msg. */ +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags) +{ + struct iov_iter *iter = &msg->msg_iter; + int peek = flags & MSG_PEEK; + struct sk_msg *msg_rx; + int i, copied = 0; + + msg_rx = sk_psock_peek_msg(psock); + while (copied != len) { + struct scatterlist *sge; + + if (unlikely(!msg_rx)) + break; + + i = msg_rx->sg.start; + do { + struct page *page; + int copy; + + sge = sk_msg_elem(msg_rx, i); + copy = sge->length; + page = sg_page(sge); + if (copied + copy > len) + copy = len - copied; + copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (!copy) + return copied ? copied : -EFAULT; + + copied += copy; + if (likely(!peek)) { + sge->offset += copy; + sge->length -= copy; + if (!msg_rx->skb) + sk_mem_uncharge(sk, copy); + msg_rx->sg.size -= copy; + + if (!sge->length) { + sk_msg_iter_var_next(i); + if (!msg_rx->skb) + put_page(page); + } + } else { + /* Lets not optimize peek case if copy_page_to_iter + * didn't copy the entire length lets just break. 
+ */ + if (copy != sge->length) + return copied; + sk_msg_iter_var_next(i); + } + + if (copied == len) + break; + } while (i != msg_rx->sg.end); + + if (unlikely(peek)) { + msg_rx = sk_psock_next_msg(psock, msg_rx); + if (!msg_rx) + break; + continue; + } + + msg_rx->sg.start = i; + if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { + msg_rx = sk_psock_dequeue_msg(psock); + kfree_sk_msg(msg_rx); + } + msg_rx = sk_psock_peek_msg(psock); + } + + return copied; +} +EXPORT_SYMBOL_GPL(sk_msg_recvmsg); + static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, struct sk_buff *skb) { diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ac8cfbaeacd2..3d622a0d0753 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -10,80 +10,6 @@ #include #include -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags) -{ - struct iov_iter *iter = &msg->msg_iter; - int peek = flags & MSG_PEEK; - struct sk_msg *msg_rx; - int i, copied = 0; - - msg_rx = sk_psock_peek_msg(psock); - while (copied != len) { - struct scatterlist *sge; - - if (unlikely(!msg_rx)) - break; - - i = msg_rx->sg.start; - do { - struct page *page; - int copy; - - sge = sk_msg_elem(msg_rx, i); - copy = sge->length; - page = sg_page(sge); - if (copied + copy > len) - copy = len - copied; - copy = copy_page_to_iter(page, sge->offset, copy, iter); - if (!copy) - return copied ? copied : -EFAULT; - - copied += copy; - if (likely(!peek)) { - sge->offset += copy; - sge->length -= copy; - if (!msg_rx->skb) - sk_mem_uncharge(sk, copy); - msg_rx->sg.size -= copy; - - if (!sge->length) { - sk_msg_iter_var_next(i); - if (!msg_rx->skb) - put_page(page); - } - } else { - /* Lets not optimize peek case if copy_page_to_iter - * didn't copy the entire length lets just break. 
- */ - if (copy != sge->length) - return copied; - sk_msg_iter_var_next(i); - } - - if (copied == len) - break; - } while (i != msg_rx->sg.end); - - if (unlikely(peek)) { - msg_rx = sk_psock_next_msg(psock, msg_rx); - if (!msg_rx) - break; - continue; - } - - msg_rx->sg.start = i; - if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { - msg_rx = sk_psock_dequeue_msg(psock); - kfree_sk_msg(msg_rx); - } - msg_rx = sk_psock_peek_msg(psock); - } - - return copied; -} -EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); - static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes, int flags) { @@ -237,28 +163,6 @@ static bool tcp_bpf_stream_read(const struct sock *sk) return !empty; } -static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, - int flags, long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret = 0; - - if (sk->sk_shutdown & RCV_SHUTDOWN) - return 1; - - if (!timeo) - return ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} - static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -278,13 +182,13 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } lock_sock(sk); msg_bytes_ready: - copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { int data, err = 0; long timeo; timeo = sock_rcvtimeo(sk, nonblock); - data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); + data = sk_msg_wait_data(sk, psock, flags, timeo, &err); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 01d933ae5f16..1dcb34dfd56b 
100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1789,8 +1789,8 @@ int tls_sw_recvmsg(struct sock *sk, skb = tls_wait_data(sk, psock, flags, timeo, &err); if (!skb) { if (psock) { - int ret = __tcp_bpf_recvmsg(sk, psock, - msg, len, flags); + int ret = sk_msg_recvmsg(sk, psock, msg, len, + flags); if (ret > 0) { decrypted += ret; -- cgit v1.2.3 From 122e6c79efe1c25816118aca9cfabe54e99c2432 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:35 -0700 Subject: sock_map: Update sock type checks for UDP Now UDP supports sockmap and redirection, we can safely update the sock type checks for it accordingly. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-15-xiyou.wangcong@gmail.com --- net/core/sock_map.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 2915c7c8778b..3d190d22b0d8 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -535,7 +535,10 @@ static bool sk_is_udp(const struct sock *sk) static bool sock_map_redirect_allowed(const struct sock *sk) { - return sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN; + if (sk_is_tcp(sk)) + return sk->sk_state != TCP_LISTEN; + else + return sk->sk_state == TCP_ESTABLISHED; } static bool sock_map_sk_is_suitable(const struct sock *sk) -- cgit v1.2.3 From eeb85a14ee3494febb85ccfbee0772eda0823b13 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 5 Apr 2021 00:12:23 -0700 Subject: net: Allow to specify ifindex when device is moved to another namespace Currently, we can specify ifindex on link creation. This change allows to specify ifindex when a device is moved to another network namespace. Even now, a device ifindex can be changed if there is another device with the same ifindex in the target namespace. So this change doesn't introduce completely new behavior, it adds more control to the process. 
CRIU users want to restore containers with pre-created network devices. A user will provide network devices and instructions where they have to be restored, then CRIU will restore network namespaces and move devices into them. The problem is that devices have to be restored with the same indexes that they have before C/R. Cc: Alexander Mikhalitsyn Suggested-by: Christian Brauner Signed-off-by: Andrei Vagin Reviewed-by: Christian Brauner Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 2 +- include/linux/netdevice.h | 3 ++- net/core/dev.c | 24 +++++++++++++++++------- net/core/rtnetlink.c | 19 +++++++++++++++---- net/ieee802154/core.c | 4 ++-- net/wireless/core.c | 4 ++-- 6 files changed, 39 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 7349a70af083..8c0c70e1da77 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2354,7 +2354,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev) */ if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) { ret = dev_change_net_namespace(vf_netdev, - dev_net(ndev), "eth%d"); + dev_net(ndev), "eth%d", 0); if (ret) netdev_err(vf_netdev, "could not move to same namespace as %s: %d\n", diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f57b70fc251f..b482236c0e99 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4026,7 +4026,8 @@ void __dev_notify_flags(struct net_device *, unsigned int old_flags, int dev_change_name(struct net_device *, const char *); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); -int dev_change_net_namespace(struct net_device *, struct net *, const char *); +int dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex); int __dev_set_mtu(struct net_device *, int); int dev_validate_mtu(struct net_device 
*dev, int mtu, struct netlink_ext_ack *extack); diff --git a/net/core/dev.c b/net/core/dev.c index b4c67a5be606..9d1a8fac793f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11067,6 +11067,8 @@ EXPORT_SYMBOL(unregister_netdev); * @net: network namespace * @pat: If not NULL name pattern to try if the current device name * is already taken in the destination network namespace. + * @new_ifindex: If not zero, specifies device index in the target + * namespace. * * This function shuts down a device interface and moves it * to a new network namespace. On success 0 is returned, on @@ -11075,10 +11077,11 @@ EXPORT_SYMBOL(unregister_netdev); * Callers must hold the rtnl semaphore. */ -int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +int dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex) { struct net *net_old = dev_net(dev); - int err, new_nsid, new_ifindex; + int err, new_nsid; ASSERT_RTNL(); @@ -11109,6 +11112,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char goto out; } + /* Check that new_ifindex isn't used yet. */ + err = -EBUSY; + if (new_ifindex && __dev_get_by_index(net, new_ifindex)) + goto out; + /* * And now a mini version of register_netdevice unregister_netdevice. 
*/ @@ -11136,10 +11144,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) - new_ifindex = dev_new_index(net); - else - new_ifindex = dev->ifindex; + if (!new_ifindex) { + if (__dev_get_by_index(net, dev->ifindex)) + new_ifindex = dev_new_index(net); + else + new_ifindex = dev->ifindex; + } rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); @@ -11448,7 +11458,7 @@ static void __net_exit default_device_exit(struct net *net) snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (__dev_get_by_name(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); - err = dev_change_net_namespace(dev, &init_net, fb_name); + err = dev_change_net_namespace(dev, &init_net, fb_name, 0); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", __func__, dev->name, err); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1bdcb33fb561..d51252afde0a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2266,6 +2266,9 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return -EINVAL; } + if (tb[IFLA_NEW_IFINDEX] && nla_get_s32(tb[IFLA_NEW_IFINDEX]) <= 0) + return -EINVAL; + if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem, err; @@ -2603,14 +2606,22 @@ static int do_setlink(const struct sk_buff *skb, return err; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { - struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev), - tb, CAP_NET_ADMIN); + struct net *net; + int new_ifindex; + + net = rtnl_link_get_net_capable(skb, dev_net(dev), + tb, CAP_NET_ADMIN); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; } - err = dev_change_net_namespace(dev, net, ifname); + if (tb[IFLA_NEW_IFINDEX]) + new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); + else + new_ifindex = 0; + + err = 
dev_change_net_namespace(dev, net, ifname, new_ifindex); put_net(net); if (err) goto errout; @@ -3452,7 +3463,7 @@ replay: if (err < 0) goto out_unregister; if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); + err = dev_change_net_namespace(dev, dest_net, ifname, 0); if (err < 0) goto out_unregister; } diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index de259b5170ab..ec3068937fc3 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -205,7 +205,7 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, if (!wpan_dev->netdev) continue; wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; - err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); + err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d", 0); if (err) break; wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; @@ -222,7 +222,7 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, continue; wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; err = dev_change_net_namespace(wpan_dev->netdev, net, - "wpan%d"); + "wpan%d", 0); WARN_ON(err); wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; } diff --git a/net/wireless/core.c b/net/wireless/core.c index a2785379df6e..fabb677b7d58 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -165,7 +165,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, if (!wdev->netdev) continue; wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; - err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); + err = dev_change_net_namespace(wdev->netdev, net, "wlan%d", 0); if (err) break; wdev->netdev->features |= NETIF_F_NETNS_LOCAL; @@ -182,7 +182,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, continue; wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; err = dev_change_net_namespace(wdev->netdev, net, - "wlan%d"); + "wlan%d", 0); WARN_ON(err); wdev->netdev->features |= NETIF_F_NETNS_LOCAL; } -- cgit v1.2.3 From 
7e4a51319d3a71ac8002c96f817bcbeb36789b07 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 6 Apr 2021 23:40:03 -0700 Subject: net: introduce nla_policy for IFLA_NEW_IFINDEX In this case, we don't need to check that new_ifindex is positive in validate_linkmsg. Fixes: eeb85a14ee34 ("net: Allow to specify ifindex when device is moved to another namespace") Suggested-by: Jakub Kicinski Signed-off-by: Andrei Vagin Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d51252afde0a..9108a7e6c0c0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1877,6 +1877,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, + [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2266,9 +2267,6 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return -EINVAL; } - if (tb[IFLA_NEW_IFINDEX] && nla_get_s32(tb[IFLA_NEW_IFINDEX]) <= 0) - return -EINVAL; - if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem, err; -- cgit v1.2.3 From 0854fa82c96ca37a35e954b7079c0bfd795affb1 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 6 Apr 2021 23:40:51 -0700 Subject: net: remove the new_ifindex argument from dev_change_net_namespace There is only one place where we want to specify new_ifindex. In all other cases, callers pass 0 as new_ifindex. It looks reasonable to add a low-level function with new_ifindex and to convert dev_change_net_namespace to a static inline wrapper. Fixes: eeb85a14ee34 ("net: Allow to specify ifindex when device is moved to another namespace") Suggested-by: Jakub Kicinski Signed-off-by: Andrei Vagin Acked-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/hyperv/netvsc_drv.c | 2 +- include/linux/netdevice.h | 8 +++++++- net/core/dev.c | 10 +++++----- net/core/rtnetlink.c | 4 ++-- net/ieee802154/core.c | 4 ++-- net/wireless/core.c | 4 ++-- 6 files changed, 19 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 8c0c70e1da77..7349a70af083 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2354,7 +2354,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev) */ if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) { ret = dev_change_net_namespace(vf_netdev, - dev_net(ndev), "eth%d", 0); + dev_net(ndev), "eth%d"); if (ret) netdev_err(vf_netdev, "could not move to same namespace as %s: %d\n", diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b482236c0e99..5cbc950b34df 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4026,8 +4026,14 @@ void __dev_notify_flags(struct net_device *, unsigned int old_flags, int dev_change_name(struct net_device *, const char *); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); +int __dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex); +static inline int dev_change_net_namespace(struct net_device *dev, struct net *net, - const char *pat, int new_ifindex); + const char *pat) +{ + return __dev_change_net_namespace(dev, net, pat, 0); +} int __dev_set_mtu(struct net_device *, int); int dev_validate_mtu(struct net_device *dev, int mtu, struct netlink_ext_ack *extack); diff --git a/net/core/dev.c b/net/core/dev.c index 9d1a8fac793f..33ff4a944109 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11062,7 +11062,7 @@ void unregister_netdev(struct net_device *dev) EXPORT_SYMBOL(unregister_netdev); /** - * dev_change_net_namespace - move device to different nethost namespace + * 
__dev_change_net_namespace - move device to different nethost namespace * @dev: device * @net: network namespace * @pat: If not NULL name pattern to try if the current device name @@ -11077,8 +11077,8 @@ EXPORT_SYMBOL(unregister_netdev); * Callers must hold the rtnl semaphore. */ -int dev_change_net_namespace(struct net_device *dev, struct net *net, - const char *pat, int new_ifindex) +int __dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex) { struct net *net_old = dev_net(dev); int err, new_nsid; @@ -11202,7 +11202,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, out: return err; } -EXPORT_SYMBOL_GPL(dev_change_net_namespace); +EXPORT_SYMBOL_GPL(__dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { @@ -11458,7 +11458,7 @@ static void __net_exit default_device_exit(struct net *net) snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (__dev_get_by_name(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); - err = dev_change_net_namespace(dev, &init_net, fb_name, 0); + err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", __func__, dev->name, err); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9108a7e6c0c0..9f1f55785a6f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2619,7 +2619,7 @@ static int do_setlink(const struct sk_buff *skb, else new_ifindex = 0; - err = dev_change_net_namespace(dev, net, ifname, new_ifindex); + err = __dev_change_net_namespace(dev, net, ifname, new_ifindex); put_net(net); if (err) goto errout; @@ -3461,7 +3461,7 @@ replay: if (err < 0) goto out_unregister; if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname, 0); + err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) goto out_unregister; } diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index ec3068937fc3..de259b5170ab 100644 --- 
a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -205,7 +205,7 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, if (!wpan_dev->netdev) continue; wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; - err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d", 0); + err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); if (err) break; wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; @@ -222,7 +222,7 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, continue; wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; err = dev_change_net_namespace(wpan_dev->netdev, net, - "wpan%d", 0); + "wpan%d"); WARN_ON(err); wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; } diff --git a/net/wireless/core.c b/net/wireless/core.c index fabb677b7d58..a2785379df6e 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -165,7 +165,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, if (!wdev->netdev) continue; wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; - err = dev_change_net_namespace(wdev->netdev, net, "wlan%d", 0); + err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); if (err) break; wdev->netdev->features |= NETIF_F_NETNS_LOCAL; @@ -182,7 +182,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, continue; wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; err = dev_change_net_namespace(wdev->netdev, net, - "wlan%d", 0); + "wlan%d"); WARN_ON(err); wdev->netdev->features |= NETIF_F_NETNS_LOCAL; } -- cgit v1.2.3 From 51e0158a54321a48d260e95998393934bb0de52c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 6 Apr 2021 20:21:11 -0700 Subject: skmsg: Pass psock pointer to ->psock_update_sk_prot() Using sk_psock() to retrieve psock pointer from sock requires RCU read lock, but we already get psock pointer before calling ->psock_update_sk_prot() in both cases, so we can just pass it without bothering sk_psock(). 
Fixes: 8a59f9d1e3d4 ("sock: Introduce sk->sk_prot->psock_update_sk_prot()") Reported-by: syzbot+320a3bc8d80f478c37e4@syzkaller.appspotmail.com Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Tested-by: syzbot+320a3bc8d80f478c37e4@syzkaller.appspotmail.com Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210407032111.33398-1-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 5 +++-- include/net/sock.h | 5 ++++- include/net/tcp.h | 2 +- include/net/udp.h | 2 +- net/core/sock_map.c | 2 +- net/ipv4/tcp_bpf.c | 3 +-- net/ipv4/udp_bpf.c | 3 +-- 7 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index f78e90a04a69..e2fb0a5a101e 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -99,7 +99,8 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); - int (*psock_update_sk_prot)(struct sock *sk, bool restore); + int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, + bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; @@ -405,7 +406,7 @@ static inline void sk_psock_restore_proto(struct sock *sk, { sk->sk_prot->unhash = psock->saved_unhash; if (psock->psock_update_sk_prot) - psock->psock_update_sk_prot(sk, true); + psock->psock_update_sk_prot(sk, psock, true); } static inline void sk_psock_set_state(struct sk_psock *psock, diff --git a/include/net/sock.h b/include/net/sock.h index 8b4155e756c2..c4bbdcd83f4d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1114,6 +1114,7 @@ struct inet_hashinfo; struct raw_hashinfo; struct smc_hashinfo; struct module; +struct sk_psock; /* * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes @@ -1185,7 +1186,9 @@ struct proto { void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, 
unsigned short snum); #ifdef CONFIG_BPF_SYSCALL - int (*psock_update_sk_prot)(struct sock *sk, bool restore); + int (*psock_update_sk_prot)(struct sock *sk, + struct sk_psock *psock, + bool restore); #endif /* Keeping track of sockets in use */ diff --git a/include/net/tcp.h b/include/net/tcp.h index eaea43afcc97..d05193cb0d99 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2215,7 +2215,7 @@ struct sk_psock; #ifdef CONFIG_BPF_SYSCALL struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); -int tcp_bpf_update_proto(struct sock *sk, bool restore); +int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ diff --git a/include/net/udp.h b/include/net/udp.h index f55aaeef7e91..360df454356c 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -543,7 +543,7 @@ static inline void udp_post_segment_fix_csum(struct sk_buff *skb) #ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); -int udp_bpf_update_proto(struct sock *sk, bool restore); +int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); #endif #endif /* _UDP_H */ diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 3d190d22b0d8..f473c51cbc4b 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -188,7 +188,7 @@ static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; - return sk->sk_prot->psock_update_sk_prot(sk, false); + return sk->sk_prot->psock_update_sk_prot(sk, psock, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 3d622a0d0753..4930bc8ab47e 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -499,9 +499,8 @@ static int 
tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -int tcp_bpf_update_proto(struct sock *sk, bool restore) +int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { - struct sk_psock *psock = sk_psock(sk); int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 4a7e38c5d842..954c4591a6fd 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -103,10 +103,9 @@ static int __init udp_bpf_v4_build_proto(void) } core_initcall(udp_bpf_v4_build_proto); -int udp_bpf_update_proto(struct sock *sk, bool restore) +int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; - struct sk_psock *psock = sk_psock(sk); if (restore) { sk->sk_write_space = psock->saved_write_space; -- cgit v1.2.3 From aadb2bb83ff789de63b48b4edeab7329423a50d3 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 7 Apr 2021 20:05:56 -0700 Subject: sock_map: Fix a potential use-after-free in sock_map_close() The last refcnt of the psock can be gone right after sock_map_remove_links(), so sk_psock_stop() could trigger a UAF. The reason why I placed sk_psock_stop() there is to avoid RCU read critical section, and more importantly, some callee of sock_map_remove_links() is supposed to be called with RCU read lock, we can not simply get rid of RCU read lock here. Therefore, the only choice we have is to grab an additional refcnt with sk_psock_get() and put it back after sk_psock_stop(). 
Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()") Reported-by: syzbot+7b6548ae483d6f4c64ae@syzkaller.appspotmail.com Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210408030556.45134-1-xiyou.wangcong@gmail.com --- net/core/sock_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f473c51cbc4b..6f1b82b8ad49 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1521,7 +1521,7 @@ void sock_map_close(struct sock *sk, long timeout) lock_sock(sk); rcu_read_lock(); - psock = sk_psock(sk); + psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); release_sock(sk); @@ -1532,6 +1532,7 @@ void sock_map_close(struct sock *sk, long timeout) sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock, true); + sk_psock_put(sk, psock); release_sock(sk); saved_close(sk, timeout); } -- cgit v1.2.3 From 17c3df7078e3742bd9e907f3006a9e3469383007 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 14 Apr 2021 12:48:48 +0200 Subject: skbuff: revert "skbuff: remove some unnecessary operation in skb_segment_list()" the commit 1ddc3229ad3c ("skbuff: remove some unnecessary operation in skb_segment_list()") introduces an issue very similar to the one already fixed by commit 53475c5dd856 ("net: fix use-after-free when UDP GRO with shared fraglist"). If the GSO skb goes through skb_clone() and pskb_expand_head() before entering skb_segment_list(), the latter will unshare the frag_list skbs and will release the old list. With the reverted commit in place, when skb_segment_list() completes, skb->next points to the just released list, and later on the kernel will hit UaF. 
Note that since commit e0e3070a9bc9 ("udp: properly complete L4 GRO over UDP tunnel packet") the critical scenario can be reproduced also receiving UDP over vxlan traffic with: NIC (NETIF_F_GRO_FRAGLIST enabled) -> vxlan -> UDP sink Attaching a packet socket to the NIC will cause skb_clone() and the tunnel decapsulation will call pskb_expand_head(). Fixes: 1ddc3229ad3c ("skbuff: remove some unnecessary operation in skb_segment_list()") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/core/skbuff.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3ad9e8425ab2..14010c0eec48 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3773,13 +3773,13 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, unsigned int tnl_hlen = skb_tnl_header_len(skb); unsigned int delta_truesize = 0; unsigned int delta_len = 0; + struct sk_buff *tail = NULL; struct sk_buff *nskb, *tmp; int err; skb_push(skb, -skb_network_offset(skb) + offset); skb_shinfo(skb)->frag_list = NULL; - skb->next = list_skb; do { nskb = list_skb; @@ -3797,8 +3797,17 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, } } - if (unlikely(err)) + if (!tail) + skb->next = nskb; + else + tail->next = nskb; + + if (unlikely(err)) { + nskb->next = list_skb; goto err_linearize; + } + + tail = nskb; delta_len += nskb->len; delta_truesize += nskb->truesize; @@ -3825,7 +3834,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_gso_reset(skb); - skb->prev = nskb; + skb->prev = tail; if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) -- cgit v1.2.3 From 38ebcf5096a86762b82262e96b2c8b170fe79040 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Apr 2021 10:37:53 -0700 Subject: scm: optimize put_cmsg() Calling two copy_to_user() for very small regions has very high overhead. Switch to inlined unsafe_put_user() to save one stac/clac sequence, and avoid copy_to_user(). 
Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/core/scm.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/scm.c b/net/core/scm.c index 8156d4fb8a39..bd96c922041d 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -228,14 +228,16 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) if (msg->msg_control_is_user) { struct cmsghdr __user *cm = msg->msg_control_user; - struct cmsghdr cmhdr; - - cmhdr.cmsg_level = level; - cmhdr.cmsg_type = type; - cmhdr.cmsg_len = cmlen; - if (copy_to_user(cm, &cmhdr, sizeof cmhdr) || - copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) - return -EFAULT; + + if (!user_write_access_begin(cm, cmlen)) + goto efault; + + unsafe_put_user(len, &cm->cmsg_len, efault_end); + unsafe_put_user(level, &cm->cmsg_level, efault_end); + unsafe_put_user(type, &cm->cmsg_type, efault_end); + unsafe_copy_to_user(CMSG_USER_DATA(cm), data, + cmlen - sizeof(*cm), efault_end); + user_write_access_end(); } else { struct cmsghdr *cm = msg->msg_control; @@ -249,6 +251,11 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_control += cmlen; msg->msg_controllen -= cmlen; return 0; + +efault_end: + user_write_access_end(); +efault: + return -EFAULT; } EXPORT_SYMBOL(put_cmsg); -- cgit v1.2.3 From e7ad33fa7bc5f788cdb14eea68c65c4da0f06edf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 16 Apr 2021 11:35:38 -0700 Subject: scm: fix a typo in put_cmsg() We need to store cmlen instead of len in cm->cmsg_len. Fixes: 38ebcf5096a8 ("scm: optimize put_cmsg()") Signed-off-by: Eric Dumazet Reported-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- net/core/scm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/scm.c b/net/core/scm.c index bd96c922041d..ae3085d9aae8 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -232,7 +232,7 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) if (!user_write_access_begin(cm, cmlen)) goto efault; - unsafe_put_user(len, &cm->cmsg_len, efault_end); + unsafe_put_user(cmlen, &cm->cmsg_len, efault_end); unsafe_put_user(level, &cm->cmsg_level, efault_end); unsafe_put_user(type, &cm->cmsg_type, efault_end); unsafe_copy_to_user(CMSG_USER_DATA(cm), data, -- cgit v1.2.3 From 1e3d976dbb23b3fce544752b434bdc32ce64aabc Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 16 Apr 2021 14:31:51 -0500 Subject: flow_dissector: Fix out-of-bounds warning in __skb_flow_bpf_to_target() Fix the following out-of-bounds warning: net/core/flow_dissector.c:835:3: warning: 'memcpy' offset [33, 48] from the object at 'flow_keys' is out of the bounds of referenced subobject 'ipv6_src' with type '__u32[4]' {aka 'unsigned int[4]'} at offset 16 [-Warray-bounds] The problem is that the original code is trying to copy data into a couple of struct members adjacent to each other in a single call to memcpy(). So, the compiler legitimately complains about it. As these are just a couple of members, fix this by copying each one of them in separate calls to memcpy(). This helps with the ongoing efforts to globally enable -Warray-bounds and get us closer to being able to tighten the FORTIFY_SOURCE routines on memcpy(). Link: https://github.com/KSPP/linux/issues/109 Reported-by: kernel test robot Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. 
Miller --- net/core/flow_dissector.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5985029e43d4..3ed7c98a98e1 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -832,8 +832,10 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); - memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src, - sizeof(key_addrs->v6addrs)); + memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src, + sizeof(key_addrs->v6addrs.src)); + memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst, + sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } -- cgit v1.2.3 From 7ad18ff6449cbd6beb26b53128ddf56d2685aa93 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Mon, 19 Apr 2021 12:53:06 +0000 Subject: gro: fix napi_gro_frags() Fast GRO breakage due to IP alignment check Commit 38ec4944b593 ("gro: ensure frag0 meets IP header alignment") did the right thing, but missed the fact that napi_gro_frags() logic calls for skb_gro_reset_offset() *before* pulling Ethernet header to the skb linear space. That said, the introduced check for frag0 address being aligned to 4 always fails for it as Ethernet header is obviously 14 bytes long, and in case with NET_IP_ALIGN its start is not aligned to 4. Fix this by adding @nhoff argument to skb_gro_reset_offset() which tells if an IP header is placed right at the start of frag0 or not. This restores Fast GRO for napi_gro_frags() that became very slow after the mentioned commit, and preserves the introduced check to avoid silent unaligned accesses. From v1 [0]: - inline tiny skb_gro_reset_offset() to let the code be optimized more effectively (esp. for the !NET_IP_ALIGN case) (Eric); - pull in Reviewed-by from Eric. 
[0] https://lore.kernel.org/netdev/20210418114200.5839-1-alobakin@pm.me Fixes: 38ec4944b593 ("gro: ensure frag0 meets IP header alignment") Reviewed-by: Eric Dumazet Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1f79b9aa9a3f..15fe36332fb8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5914,7 +5914,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, return head; } -static void skb_gro_reset_offset(struct sk_buff *skb) +static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) { const struct skb_shared_info *pinfo = skb_shinfo(skb); const skb_frag_t *frag0 = &pinfo->frags[0]; @@ -5925,7 +5925,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb) if (!skb_headlen(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) && - (!NET_IP_ALIGN || !(skb_frag_off(frag0) & 3))) { + (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, skb_frag_size(frag0), @@ -6143,7 +6143,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); - skb_gro_reset_offset(skb); + skb_gro_reset_offset(skb, 0); ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); @@ -6232,7 +6232,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) napi->skb = NULL; skb_reset_mac_header(skb); - skb_gro_reset_offset(skb); + skb_gro_reset_offset(skb, hlen); if (unlikely(skb_gro_header_hard(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); -- cgit v1.2.3 From 3e1e58d64c3d0a6789f9d865936c4ce46b20f3f5 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 19 Apr 2021 15:01:03 +0200 Subject: net: add generic selftest support Port some parts of 
the stmmac selftest and reuse it as basic generic selftest library. This patch was tested with following combinations: - iMX6DL FEC -> AT8035 - iMX6DL FEC -> SJA1105Q switch -> KSZ8081 - iMX6DL FEC -> SJA1105Q switch -> KSZ9031 - AR9331 ag71xx -> AR9331 PHY - AR9331 ag71xx -> AR9331 switch -> AR9331 PHY Signed-off-by: Oleksij Rempel Signed-off-by: David S. Miller --- include/net/selftests.h | 12 ++ net/Kconfig | 4 + net/core/Makefile | 1 + net/core/selftests.c | 400 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 417 insertions(+) create mode 100644 include/net/selftests.h create mode 100644 net/core/selftests.c (limited to 'net/core') diff --git a/include/net/selftests.h b/include/net/selftests.h new file mode 100644 index 000000000000..9993b9498cf3 --- /dev/null +++ b/include/net/selftests.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_SELFTESTS +#define _NET_SELFTESTS + +#include + +void net_selftest(struct net_device *ndev, struct ethtool_test *etest, + u64 *buf); +int net_selftest_get_count(void); +void net_selftest_get_strings(u8 *data); + +#endif /* _NET_SELFTESTS */ diff --git a/net/Kconfig b/net/Kconfig index 9c456acc379e..8d955195c069 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -429,6 +429,10 @@ config GRO_CELLS config SOCK_VALIDATE_XMIT bool +config NET_SELFTESTS + def_tristate PHYLIB + depends on PHYLIB + config NET_SOCK_MSG bool default n diff --git a/net/core/Makefile b/net/core/Makefile index 0c2233c826fd..1a6168d8f23b 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o endif diff --git a/net/core/selftests.c b/net/core/selftests.c new file mode 100644 index 000000000000..ba7b0171974c --- /dev/null +++ b/net/core/selftests.c @@ 
-0,0 +1,400 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Synopsys, Inc. and/or its affiliates. + * stmmac Selftests Support + * + * Author: Jose Abreu + * + * Ported from stmmac by: + * Copyright (C) 2021 Oleksij Rempel + */ + +#include +#include +#include +#include + +struct net_packet_attrs { + unsigned char *src; + unsigned char *dst; + u32 ip_src; + u32 ip_dst; + bool tcp; + u16 sport; + u16 dport; + int timeout; + int size; + int max_size; + u8 id; + u16 queue_mapping; +}; + +struct net_test_priv { + struct net_packet_attrs *packet; + struct packet_type pt; + struct completion comp; + int double_vlan; + int vlan_id; + int ok; +}; + +struct netsfhdr { + __be32 version; + __be64 magic; + u8 id; +} __packed; + +static u8 net_test_next_id; + +#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct netsfhdr)) +#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL +#define NET_LB_TIMEOUT msecs_to_jiffies(200) + +static struct sk_buff *net_test_get_skb(struct net_device *ndev, + struct net_packet_attrs *attr) +{ + struct sk_buff *skb = NULL; + struct udphdr *uhdr = NULL; + struct tcphdr *thdr = NULL; + struct netsfhdr *shdr; + struct ethhdr *ehdr; + struct iphdr *ihdr; + int iplen, size; + + size = attr->size + NET_TEST_PKT_SIZE; + + if (attr->tcp) + size += sizeof(struct tcphdr); + else + size += sizeof(struct udphdr); + + if (attr->max_size && attr->max_size > size) + size = attr->max_size; + + skb = netdev_alloc_skb(ndev, size); + if (!skb) + return NULL; + + prefetchw(skb->data); + + ehdr = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + + skb_set_network_header(skb, skb->len); + ihdr = skb_put(skb, sizeof(*ihdr)); + + skb_set_transport_header(skb, skb->len); + if (attr->tcp) + thdr = skb_put(skb, sizeof(*thdr)); + else + uhdr = skb_put(skb, sizeof(*uhdr)); + + eth_zero_addr(ehdr->h_dest); + + if (attr->src) + ether_addr_copy(ehdr->h_source, attr->src); + if (attr->dst) + ether_addr_copy(ehdr->h_dest, 
attr->dst); + + ehdr->h_proto = htons(ETH_P_IP); + + if (attr->tcp) { + thdr->source = htons(attr->sport); + thdr->dest = htons(attr->dport); + thdr->doff = sizeof(struct tcphdr) / 4; + thdr->check = 0; + } else { + uhdr->source = htons(attr->sport); + uhdr->dest = htons(attr->dport); + uhdr->len = htons(sizeof(*shdr) + sizeof(*uhdr) + attr->size); + if (attr->max_size) + uhdr->len = htons(attr->max_size - + (sizeof(*ihdr) + sizeof(*ehdr))); + uhdr->check = 0; + } + + ihdr->ihl = 5; + ihdr->ttl = 32; + ihdr->version = 4; + if (attr->tcp) + ihdr->protocol = IPPROTO_TCP; + else + ihdr->protocol = IPPROTO_UDP; + iplen = sizeof(*ihdr) + sizeof(*shdr) + attr->size; + if (attr->tcp) + iplen += sizeof(*thdr); + else + iplen += sizeof(*uhdr); + + if (attr->max_size) + iplen = attr->max_size - sizeof(*ehdr); + + ihdr->tot_len = htons(iplen); + ihdr->frag_off = 0; + ihdr->saddr = htonl(attr->ip_src); + ihdr->daddr = htonl(attr->ip_dst); + ihdr->tos = 0; + ihdr->id = 0; + ip_send_check(ihdr); + + shdr = skb_put(skb, sizeof(*shdr)); + shdr->version = 0; + shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC); + attr->id = net_test_next_id; + shdr->id = net_test_next_id++; + + if (attr->size) + skb_put(skb, attr->size); + if (attr->max_size && attr->max_size > skb->len) + skb_put(skb, attr->max_size - skb->len); + + skb->csum = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + if (attr->tcp) { + thdr->check = ~tcp_v4_check(skb->len, ihdr->saddr, + ihdr->daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + } else { + udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr); + } + + skb->protocol = htons(ETH_P_IP); + skb->pkt_type = PACKET_HOST; + skb->dev = ndev; + + return skb; +} + +static int net_test_loopback_validate(struct sk_buff *skb, + struct net_device *ndev, + struct packet_type *pt, + struct net_device *orig_ndev) +{ + struct net_test_priv *tpriv = pt->af_packet_priv; + unsigned char *src = tpriv->packet->src; + unsigned char 
*dst = tpriv->packet->dst; + struct netsfhdr *shdr; + struct ethhdr *ehdr; + struct udphdr *uhdr; + struct tcphdr *thdr; + struct iphdr *ihdr; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + goto out; + + if (skb_linearize(skb)) + goto out; + if (skb_headlen(skb) < (NET_TEST_PKT_SIZE - ETH_HLEN)) + goto out; + + ehdr = (struct ethhdr *)skb_mac_header(skb); + if (dst) { + if (!ether_addr_equal_unaligned(ehdr->h_dest, dst)) + goto out; + } + + if (src) { + if (!ether_addr_equal_unaligned(ehdr->h_source, src)) + goto out; + } + + ihdr = ip_hdr(skb); + if (tpriv->double_vlan) + ihdr = (struct iphdr *)(skb_network_header(skb) + 4); + + if (tpriv->packet->tcp) { + if (ihdr->protocol != IPPROTO_TCP) + goto out; + + thdr = (struct tcphdr *)((u8 *)ihdr + 4 * ihdr->ihl); + if (thdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct netsfhdr *)((u8 *)thdr + sizeof(*thdr)); + } else { + if (ihdr->protocol != IPPROTO_UDP) + goto out; + + uhdr = (struct udphdr *)((u8 *)ihdr + 4 * ihdr->ihl); + if (uhdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct netsfhdr *)((u8 *)uhdr + sizeof(*uhdr)); + } + + if (shdr->magic != cpu_to_be64(NET_TEST_PKT_MAGIC)) + goto out; + if (tpriv->packet->id != shdr->id) + goto out; + + tpriv->ok = true; + complete(&tpriv->comp); +out: + kfree_skb(skb); + return 0; +} + +static int __net_test_loopback(struct net_device *ndev, + struct net_packet_attrs *attr) +{ + struct net_test_priv *tpriv; + struct sk_buff *skb = NULL; + int ret = 0; + + tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL); + if (!tpriv) + return -ENOMEM; + + tpriv->ok = false; + init_completion(&tpriv->comp); + + tpriv->pt.type = htons(ETH_P_IP); + tpriv->pt.func = net_test_loopback_validate; + tpriv->pt.dev = ndev; + tpriv->pt.af_packet_priv = tpriv; + tpriv->packet = attr; + dev_add_pack(&tpriv->pt); + + skb = net_test_get_skb(ndev, attr); + if (!skb) { + ret = -ENOMEM; + goto cleanup; + } + + ret = dev_direct_xmit(skb, attr->queue_mapping); + if 
(ret < 0) { + goto cleanup; + } else if (ret > 0) { + ret = -ENETUNREACH; + goto cleanup; + } + + if (!attr->timeout) + attr->timeout = NET_LB_TIMEOUT; + + wait_for_completion_timeout(&tpriv->comp, attr->timeout); + ret = tpriv->ok ? 0 : -ETIMEDOUT; + +cleanup: + dev_remove_pack(&tpriv->pt); + kfree(tpriv); + return ret; +} + +static int net_test_netif_carrier(struct net_device *ndev) +{ + return netif_carrier_ok(ndev) ? 0 : -ENOLINK; +} + +static int net_test_phy_phydev(struct net_device *ndev) +{ + return ndev->phydev ? 0 : -EOPNOTSUPP; +} + +static int net_test_phy_loopback_enable(struct net_device *ndev) +{ + if (!ndev->phydev) + return -EOPNOTSUPP; + + return phy_loopback(ndev->phydev, true); +} + +static int net_test_phy_loopback_disable(struct net_device *ndev) +{ + if (!ndev->phydev) + return -EOPNOTSUPP; + + return phy_loopback(ndev->phydev, false); +} + +static int net_test_phy_loopback_udp(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + return __net_test_loopback(ndev, &attr); +} + +static int net_test_phy_loopback_tcp(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.tcp = true; + return __net_test_loopback(ndev, &attr); +} + +static const struct net_test { + char name[ETH_GSTRING_LEN]; + int (*fn)(struct net_device *ndev); +} net_selftests[] = { + { + .name = "Carrier ", + .fn = net_test_netif_carrier, + }, { + .name = "PHY dev is present ", + .fn = net_test_phy_phydev, + }, { + /* This test should be done before all PHY loopback test */ + .name = "PHY internal loopback, enable ", + .fn = net_test_phy_loopback_enable, + }, { + .name = "PHY internal loopback, UDP ", + .fn = net_test_phy_loopback_udp, + }, { + .name = "PHY internal loopback, TCP ", + .fn = net_test_phy_loopback_tcp, + }, { + /* This test should be done after all PHY loopback test */ + .name = "PHY internal loopback, disable", + .fn = net_test_phy_loopback_disable, + }, +}; + +void 
net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) +{ + int count = net_selftest_get_count(); + int i; + + memset(buf, 0, sizeof(*buf) * count); + net_test_next_id = 0; + + if (etest->flags != ETH_TEST_FL_OFFLINE) { + netdev_err(ndev, "Only offline tests are supported\n"); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + + + for (i = 0; i < count; i++) { + buf[i] = net_selftests[i].fn(ndev); + if (buf[i] && (buf[i] != -EOPNOTSUPP)) + etest->flags |= ETH_TEST_FL_FAILED; + } +} +EXPORT_SYMBOL_GPL(net_selftest); + +int net_selftest_get_count(void) +{ + return ARRAY_SIZE(net_selftests); +} +EXPORT_SYMBOL_GPL(net_selftest_get_count); + +void net_selftest_get_strings(u8 *data) +{ + u8 *p = data; + int i; + + for (i = 0; i < net_selftest_get_count(); i++) { + snprintf(p, ETH_GSTRING_LEN, "%2d. %s", i + 1, + net_selftests[i].name); + p += ETH_GSTRING_LEN; + } +} +EXPORT_SYMBOL_GPL(net_selftest_get_strings); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Oleksij Rempel "); -- cgit v1.2.3 From eefb45eef5c4c425e87667af8f5e904fbdd47abf Mon Sep 17 00:00:00 2001 From: Chinmay Agarwal Date: Thu, 22 Apr 2021 01:12:22 +0530 Subject: neighbour: Prevent Race condition in neighbour subsystem Following Race Condition was detected: : Executing: __netif_receive_skb() ->__netif_receive_skb_core() -> arp_rcv() -> arp_process(). arp_process() calls __neigh_lookup() which takes a reference on neighbour entry 'n'. Moves further along, arp_process() and calls neigh_update()-> __neigh_update(). Neighbour entry is unlocked just before a call to neigh_update_gc_list. This unlocking paves way for another thread that may take a reference on the same and mark it dead and remove it from gc_list. - neigh_flush_dev() is under execution and calls neigh_mark_dead(n) marking the neighbour entry 'n' as dead. Also n will be removed from gc_list.
Moves further along neigh_flush_dev() and calls neigh_cleanup_and_release(n), but since reference count increased in t1, 'n' couldn't be destroyed. - Code hits neigh_update_gc_list, with neighbour entry set as dead. - arp_process() finally calls neigh_release(n), destroying the neighbour entry and we have a destroyed entry still part of gc_list. Fixes: eb4e8fac00d1("neighbour: Prevent a dead entry from updating gc_list") Signed-off-by: Chinmay Agarwal Signed-off-by: David S. Miller --- net/core/neighbour.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8379719d1dce..98f20efbfadf 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -131,6 +131,9 @@ static void neigh_update_gc_list(struct neighbour *n) write_lock_bh(&n->tbl->lock); write_lock(&n->lock); + if (n->dead) + goto out; + /* remove from the gc list if new state is permanent or if neighbor * is externally learned; otherwise entry should be on the gc list */ @@ -147,6 +150,7 @@ static void neigh_update_gc_list(struct neighbour *n) atomic_inc(&n->tbl->gc_entries); } +out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } -- cgit v1.2.3 From 22b6034323fd736f260e00b9ea85c634abeb3446 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Mon, 19 Apr 2021 16:15:59 +0200 Subject: net, xdp: Update pkt_type if generic XDP changes unicast MAC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a generic XDP program changes the destination MAC address from/to multicast/broadcast, the skb->pkt_type is updated to properly handle the packet when passed up the stack. When changing the MAC from/to the NICs MAC, PACKET_HOST/OTHERHOST is not updated, though, making the behavior different from that of native XDP. Remember the PACKET_HOST/OTHERHOST state before calling the program in generic XDP, and update pkt_type accordingly if the destination MAC address has changed.
As eth_type_trans() assumes a default pkt_type of PACKET_HOST, restore that before calling it. The use case for this is when a XDP program wants to push received packets up the stack by rewriting the MAC to the NICs MAC, for example by cluster nodes sharing MAC addresses. Fixes: 297249569932 ("net: fix generic XDP to handle if eth header was mangled") Signed-off-by: Martin Willi Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210419141559.8611-1-martin@strongswan.org --- net/core/dev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b4c67a5be606..6a1ef7a15bed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4723,10 +4723,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; u32 metalen, act = XDP_DROP; + bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; - bool orig_bcast; int off; /* Reinjected packets coming from act_mirred or similar should @@ -4773,6 +4773,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; + orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; @@ -4800,8 +4801,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || + (orig_host != ether_addr_equal_64bits(eth->h_dest, + skb->dev->dev_addr)) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); } -- cgit v1.2.3 From ed744d819379ddeec5744b0bfc7eb6d0a8ac4e46 Mon Sep 17 00:00:00 
2001 From: Tonghao Zhang Date: Thu, 22 Apr 2021 21:41:51 +0800 Subject: net: sock: remove the unnecessary check in proto_register tw_prot_cleanup will check the twsk_prot. Fixes: 0f5907af3913 ("net: Fix potential memory leak in proto_register()") Cc: Miaohe Lin Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 5ec90f99e102..c761c4a0b66b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3531,7 +3531,7 @@ int proto_register(struct proto *prot, int alloc_slab) return ret; out_free_timewait_sock_slab: - if (alloc_slab && prot->twsk_prot) + if (alloc_slab) tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: if (alloc_slab) { -- cgit v1.2.3 From a1ab3e4554b5342b34845df452601ebd5a310d0a Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 10 Mar 2021 15:35:03 +0200 Subject: devlink: Extend SF port attributes to have external attribute Extended SF port attributes to have optional external flag similar to PCI PF and VF port attributes. External attribute is required to generate unique phys_port_name when PF number and SF number are overlapping between two controllers similar to SR-IOV VFs. When a SF is for external controller an example view of external SF port and config sequence.
On eswitch system: $ devlink dev eswitch set pci/0033:01:00.0 mode switchdev $ devlink port show pci/0033:01:00.0/196607: type eth netdev enP51p1s0f0np0 flavour physical port 0 splittable false pci/0033:01:00.0/131072: type eth netdev eth0 flavour pcipf controller 1 pfnum 0 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port add pci/0033:01:00.0 flavour pcisf pfnum 0 sfnum 77 controller 1 pci/0033:01:00.0/163840: type eth netdev eth1 flavour pcisf controller 1 pfnum 0 sfnum 77 splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached phys_port_name construction: $ cat /sys/class/net/eth1/phys_port_name c1pf0sf77 Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c | 2 +- include/net/devlink.h | 5 ++++- net/core/devlink.c | 11 ++++++++++- 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c index 8e825ef35cb7..183f782b940f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -141,7 +141,7 @@ int mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, struct devlink_p mlx5_esw_get_port_parent_id(dev, &ppid); memcpy(dl_port->attrs.switch_id.id, &ppid.id[0], ppid.id_len); dl_port->attrs.switch_id.id_len = ppid.id_len; - devlink_port_attrs_pci_sf_set(dl_port, 0, pfnum, sfnum); + devlink_port_attrs_pci_sf_set(dl_port, 0, pfnum, sfnum, false); devlink = priv_to_devlink(dev); dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, vport_num); err = devlink_port_register(devlink, dl_port, dl_port_index); diff --git a/include/net/devlink.h b/include/net/devlink.h index 853420db5d32..7c984cadfec4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ 
-98,11 +98,13 @@ struct devlink_port_pci_vf_attrs { * @controller: Associated controller number * @sf: Associated PCI SF for of the PCI PF for this port. * @pf: Associated PCI PF number for this port. + * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_sf_attrs { u32 controller; u32 sf; u16 pf; + u8 external:1; }; /** @@ -1508,7 +1510,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external); void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, - u32 controller, u16 pf, u32 sf); + u32 controller, u16 pf, u32 sf, + bool external); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/net/core/devlink.c b/net/core/devlink.c index 737b61c2976e..4eb969518ee0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8599,9 +8599,10 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @sf: associated SF of a PF for the devlink port instance + * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, - u16 pf, u32 sf) + u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -8615,6 +8616,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; + attrs->pci_sf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); @@ -8667,6 +8669,13 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, attrs->pci_vf.pf, attrs->pci_vf.vf); break; 
case DEVLINK_PORT_FLAVOUR_PCI_SF: + if (attrs->pci_sf.external) { + n = snprintf(name, len, "c%u", attrs->pci_sf.controller); + if (n >= len) + return -EINVAL; + len -= n; + name += n; + } n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; -- cgit v1.2.3 From 4a52dd8fefb45626dace70a63c0738dbd83b7edb Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 28 Apr 2021 15:09:46 +0200 Subject: net: selftest: fix build issue if INET is disabled In case ethernet driver is enabled and INET is disabled, selftest will fail to build. Reported-by: Randy Dunlap Fixes: 3e1e58d64c3d ("net: add generic selftest support") Signed-off-by: Oleksij Rempel Acked-by: Randy Dunlap # build-tested Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20210428130947.29649-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/atheros/Kconfig | 2 +- drivers/net/ethernet/freescale/Kconfig | 2 +- include/net/selftests.h | 19 +++++++++++++++++++ net/Kconfig | 2 +- net/core/Makefile | 2 +- net/dsa/Kconfig | 2 +- 6 files changed, 24 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/atheros/Kconfig b/drivers/net/ethernet/atheros/Kconfig index 6842b74b0696..482c58c4c584 100644 --- a/drivers/net/ethernet/atheros/Kconfig +++ b/drivers/net/ethernet/atheros/Kconfig @@ -20,8 +20,8 @@ if NET_VENDOR_ATHEROS config AG71XX tristate "Atheros AR7XXX/AR9XXX built-in ethernet mac support" depends on ATH79 - select NET_SELFTESTS select PHYLINK + imply NET_SELFTESTS help If you wish to compile a kernel for AR7XXX/91XXX and enable ethernet support, then you should always answer Y to this. 
diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig index 3d937b4650b2..2d1abdd58fab 100644 --- a/drivers/net/ethernet/freescale/Kconfig +++ b/drivers/net/ethernet/freescale/Kconfig @@ -26,8 +26,8 @@ config FEC ARCH_MXC || SOC_IMX28 || COMPILE_TEST) default ARCH_MXC || SOC_IMX28 if ARM select CRC32 - select NET_SELFTESTS select PHYLIB + imply NET_SELFTESTS imply PTP_1588_CLOCK help Say Y here if you want to use the built-in 10/100 Fast ethernet diff --git a/include/net/selftests.h b/include/net/selftests.h index 9993b9498cf3..e65e8d230d33 100644 --- a/include/net/selftests.h +++ b/include/net/selftests.h @@ -4,9 +4,28 @@ #include <linux/ethtool.h> +#if IS_ENABLED(CONFIG_NET_SELFTESTS) + void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf); int net_selftest_get_count(void); void net_selftest_get_strings(u8 *data); +#else + +static inline void net_selftest(struct net_device *ndev, struct ethtool_test *etest, + u64 *buf) +{ +} + +static inline int net_selftest_get_count(void) +{ + return 0; +} + +static inline void net_selftest_get_strings(u8 *data) +{ +} + +#endif #endif /* _NET_SELFTESTS */ diff --git a/net/Kconfig b/net/Kconfig index 8d955195c069..f5ee7c65e6b4 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -431,7 +431,7 @@ config SOCK_VALIDATE_XMIT config NET_SELFTESTS def_tristate PHYLIB - depends on PHYLIB + depends on PHYLIB && INET config NET_SOCK_MSG bool diff --git a/net/core/Makefile b/net/core/Makefile index 1a6168d8f23b..f7f16650fe9e 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o @@ -33,7 +34,6 @@
obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o ifeq ($(CONFIG_INET),y) -obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o endif diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 5baba7021427..00bb89b2d86f 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -9,7 +9,7 @@ menuconfig NET_DSA select NET_SWITCHDEV select PHYLINK select NET_DEVLINK - select NET_SELFTESTS + imply NET_SELFTESTS help Say Y if you want to enable support for the hardware switches supported by the Distributed Switch Architecture. -- cgit v1.2.3