From 3024d95a4c521c278a7504ee9e80c57c3a9750e0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Wed, 17 Aug 2022 23:32:09 +0200
Subject: bpf: Partially revert flexible-array member replacement

Partially revert 94dfc73e7cf4 ("treewide: uapi: Replace zero-length arrays
with flexible-array members") given it breaks BPF UAPI. For example, BPF CI
run reveals build breakage under LLVM:

  [...]
    CLNG-BPF [test_maps] map_ptr_kern.o
    CLNG-BPF [test_maps] btf__core_reloc_arrays___diff_arr_val_sz.o
    CLNG-BPF [test_maps] test_bpf_cookie.o
  progs/map_ptr_kern.c:314:26: error: field 'trie_key' with variable sized type
  'struct bpf_lpm_trie_key' not at the end of a struct or class is a GNU
  extension [-Werror,-Wgnu-variable-sized-type-not-at-end]
          struct bpf_lpm_trie_key trie_key;
                                  ^
    CLNG-BPF [test_maps] btf__core_reloc_type_based___diff.o
  1 error generated.
  make: *** [Makefile:521: /tmp/runner/work/bpf/bpf/tools/testing/selftests/bpf/map_ptr_kern.o] Error 1
  make: *** Waiting for unfinished jobs....
  [...]

Typical usage of the bpf_lpm_trie_key is that the struct gets embedded into
a user defined key for the LPM BPF map, from the selftest example:

  struct bpf_lpm_trie_key {     <-- UAPI exported struct
        __u32 prefixlen;
        __u8  data[];
  };

  struct lpm_key {              <-- BPF program defined struct
        struct bpf_lpm_trie_key trie_key;
        __u32 data;
  };

Undo this for BPF until a different solution can be found. It's the only
flexible-array member case in the UAPI header.

This was discovered in BPF CI after Dave reported that the
include/uapi/linux/bpf.h header was out of sync with
tools/include/uapi/linux/bpf.h after 94dfc73e7cf4. And the subsequent sync
attempt failed CI.

Fixes: 94dfc73e7cf4 ("treewide: uapi: Replace zero-length arrays with flexible-array members")
Reported-by: Dave Marchevsky
Signed-off-by: Daniel Borkmann
Cc: Gustavo A. R. Silva
Link: https://lore.kernel.org/bpf/22aebc88-da67-f086-e620-dd4a16e2bc69@iogearbox.net
---
 include/uapi/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7bf9ba1329be..59a217ca2dfd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -79,7 +79,7 @@ struct bpf_insn {
 /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */
 struct bpf_lpm_trie_key {
 	__u32	prefixlen;	/* up to 32 for AF_INET, 128 for AF_INET6 */
-	__u8	data[];	/* Arbitrary size */
+	__u8	data[0];	/* Arbitrary size */
 };
 
 struct bpf_cgroup_storage_key {
--
cgit v1.2.3
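For context, the embedding pattern the commit message describes can be sketched as a small BPF program. This is an illustrative sketch only: the map, section name, addresses and value type are assumptions, not taken from the patch or the selftests.

/* Illustrative sketch only: shows why BPF programs embed the UAPI struct
 * inside their own key. Map and section names are made up for this example.
 */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

struct lpm_key {
	struct bpf_lpm_trie_key trie_key;	/* UAPI header struct, embedded */
	__u32 data;				/* IPv4 address being matched */
};

struct {
	__uint(type, BPF_MAP_TYPE_LPM_TRIE);
	__uint(map_flags, BPF_F_NO_PREALLOC);	/* required for LPM tries */
	__uint(max_entries, 128);
	__type(key, struct lpm_key);
	__type(value, __u32);
} lpm_map SEC(".maps");

SEC("tc")
int lpm_lookup_example(struct __sk_buff *skb)
{
	struct lpm_key key = {
		.trie_key.prefixlen = 32,	/* full /32 match */
		.data = bpf_htonl(0xc0a80001),	/* 192.168.0.1, arbitrary */
	};
	__u32 *val = bpf_map_lookup_elem(&lpm_map, &key);

	if (val)
		bpf_printk("LPM match, value %u", *val);
	return TC_ACT_OK;
}

char LICENSE[] SEC("license") = "GPL";

With data[0] the UAPI struct keeps a fixed size of four bytes, so prefixlen and the address end up contiguous, which is the key layout BPF_MAP_TYPE_LPM_TRIE expects; with a flexible-array member, clang rejects the embedding under -Werror as shown in the CI log above.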
From 7ec9fce4b31604f8415136a4c07f7dc8ad431aec Mon Sep 17 00:00:00 2001
From: Eyal Birger
Date: Thu, 18 Aug 2022 10:41:18 +0300
Subject: ip_tunnel: Respect tunnel key's "flow_flags" in IP tunnels

Commit 451ef36bd229 ("ip_tunnels: Add new flow flags field to ip_tunnel_key")
added a "flow_flags" member to struct ip_tunnel_key, which was later used by
the commit in the Fixes tag to avoid dropping packets with sources that
aren't locally configured when set in bpf_set_tunnel_key().

VXLAN and GENEVE were made to respect this flag, but IP tunnels like IPIP
and GRE were not.

This commit fixes this omission by making ip_tunnel_init_flow() receive the
flow flags from the tunnel key in the relevant collect_md paths.

Fixes: b8fff748521c ("bpf: Set flow flag to allow any source IP in bpf_tunnel_key")
Signed-off-by: Eyal Birger
Signed-off-by: Daniel Borkmann
Reviewed-by: Paul Chaignon
Link: https://lore.kernel.org/bpf/20220818074118.726639-1-eyal.birger@gmail.com
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 3 ++-
 include/net/ip_tunnels.h                            | 4 +++-
 net/ipv4/ip_gre.c                                   | 2 +-
 net/ipv4/ip_tunnel.c                                | 7 ++++---
 4 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index 39904dacf4f0..b3472fb94617 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -423,7 +423,8 @@ mlxsw_sp_span_gretap4_route(const struct net_device *to_dev,
 
 	parms = mlxsw_sp_ipip_netdev_parms4(to_dev);
 	ip_tunnel_init_flow(&fl4, parms.iph.protocol, *daddrp, *saddrp,
-			    0, 0, dev_net(to_dev), parms.link, tun->fwmark, 0);
+			    0, 0, dev_net(to_dev), parms.link, tun->fwmark, 0,
+			    0);
 
 	rt = ip_route_output_key(tun->net, &fl4);
 	if (IS_ERR(rt))
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 63fac94f9ace..ced80e2f8b58 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -246,7 +246,8 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4,
 				       __be32 daddr, __be32 saddr,
 				       __be32 key, __u8 tos,
 				       struct net *net, int oif,
-				       __u32 mark, __u32 tun_inner_hash)
+				       __u32 mark, __u32 tun_inner_hash,
+				       __u8 flow_flags)
 {
 	memset(fl4, 0, sizeof(*fl4));
 
@@ -263,6 +264,7 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4,
 	fl4->fl4_gre_key = key;
 	fl4->flowi4_mark = mark;
 	fl4->flowi4_multipath_hash = tun_inner_hash;
+	fl4->flowi4_flags = flow_flags;
 }
 
 int ip_tunnel_init(struct net_device *dev);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 5c58e21f724e..f866d6282b2b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -609,7 +609,7 @@ static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 	ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
 			    tunnel_id_to_key32(key->tun_id),
 			    key->tos & ~INET_ECN_MASK, dev_net(dev), 0,
-			    skb->mark, skb_get_hash(skb));
+			    skb->mark, skb_get_hash(skb), key->flow_flags);
 	rt = ip_route_output_key(dev_net(dev), &fl4);
 	if (IS_ERR(rt))
 		return PTR_ERR(rt);
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index e65e948cab9f..019f3b0839c5 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -295,7 +295,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
 				    iph->saddr, tunnel->parms.o_key,
 				    RT_TOS(iph->tos), dev_net(dev),
-				    tunnel->parms.link, tunnel->fwmark, 0);
+				    tunnel->parms.link, tunnel->fwmark, 0, 0);
 		rt = ip_route_output_key(tunnel->net, &fl4);
 
 		if (!IS_ERR(rt)) {
@@ -570,7 +570,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	}
 	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
 			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
-			    dev_net(dev), 0, skb->mark, skb_get_hash(skb));
+			    dev_net(dev), 0, skb->mark, skb_get_hash(skb),
+			    key->flow_flags);
 	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
 		goto tx_error;
 
@@ -729,7 +730,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
 			    tunnel->parms.o_key, RT_TOS(tos),
 			    dev_net(dev), tunnel->parms.link,
-			    tunnel->fwmark, skb_get_hash(skb));
+			    tunnel->fwmark, skb_get_hash(skb), 0);
 
 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
 		goto tx_error;
--
cgit v1.2.3
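The user-visible effect is easiest to see from the BPF side. Per the commit in the Fixes tag, bpf_skb_set_tunnel_key() sets FLOWI_FLAG_ANYSRC in the tunnel key's flow_flags; with this patch the IPIP/GRE collect_md transmit paths carry that flag into the route lookup. The sketch below is not from the patch: the addresses, section name and the assumption of a GRE device created in collect_md ("external") mode are purely illustrative.

/* Illustrative sketch: a tc program on a collect_md GRE device sets the
 * tunnel key, including a local (source) address that need not be
 * configured on the host. Addresses below are arbitrary examples.
 */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int gre_set_key_example(struct __sk_buff *skb)
{
	struct bpf_tunnel_key key = {};

	key.remote_ipv4 = 0x0a000002;	/* 10.0.0.2 (host-order value, as in the BPF selftests) */
	key.local_ipv4  = 0x0a000001;	/* 10.0.0.1, may be a non-local source */
	key.tunnel_ttl  = 64;

	/* The helper sets FLOWI_FLAG_ANYSRC in the key's flow_flags; with this
	 * patch, gre_fill_metadata_dst()/ip_md_tunnel_xmit() pass that flag on
	 * to the FIB lookup instead of dropping it.
	 */
	if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0) < 0)
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}

char LICENSE[] SEC("license") = "GPL";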
From fce1c23f629173e0db78b79a74f2052044a00e65 Mon Sep 17 00:00:00 2001
From: Alvaro Karsz
Date: Tue, 23 Aug 2022 10:39:47 +0300
Subject: net: virtio_net: fix notification coalescing comments

Fix wording in comments for the notifications coalescing feature.

Signed-off-by: Alvaro Karsz
Acked-by: Michael S. Tsirkin
Link: https://lore.kernel.org/r/20220823073947.14774-1-alvaro.karsz@solid-run.com
Signed-off-by: Jakub Kicinski
---
 include/uapi/linux/virtio_net.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 29ced55514d4..6cb842ea8979 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -56,7 +56,7 @@
 #define VIRTIO_NET_F_MQ	22	/* Device supports Receive Flow
 					 * Steering */
 #define VIRTIO_NET_F_CTRL_MAC_ADDR 23	/* Set MAC address */
-#define VIRTIO_NET_F_NOTF_COAL	53	/* Guest can handle notifications coalescing */
+#define VIRTIO_NET_F_NOTF_COAL	53	/* Device supports notifications coalescing */
 #define VIRTIO_NET_F_HASH_REPORT  57	/* Supports hash report */
 #define VIRTIO_NET_F_RSS	  60	/* Supports RSS RX steering */
 #define VIRTIO_NET_F_RSC_EXT	  61	/* extended coalescing info */
@@ -364,24 +364,24 @@ struct virtio_net_hash_config {
  */
 #define VIRTIO_NET_CTRL_NOTF_COAL		6
 /*
- * Set the tx-usecs/tx-max-packets patameters.
- * tx-usecs - Maximum number of usecs to delay a TX notification.
- * tx-max-packets - Maximum number of packets to send before a TX notification.
+ * Set the tx-usecs/tx-max-packets parameters.
  */
 struct virtio_net_ctrl_coal_tx {
+	/* Maximum number of packets to send before a TX notification */
 	__le32 tx_max_packets;
+	/* Maximum number of usecs to delay a TX notification */
 	__le32 tx_usecs;
 };
 
 #define VIRTIO_NET_CTRL_NOTF_COAL_TX_SET		0
 
 /*
- * Set the rx-usecs/rx-max-packets patameters.
- * rx-usecs - Maximum number of usecs to delay a RX notification.
- * rx-max-frames - Maximum number of packets to receive before a RX notification.
+ * Set the rx-usecs/rx-max-packets parameters.
  */
 struct virtio_net_ctrl_coal_rx {
+	/* Maximum number of packets to receive before a RX notification */
 	__le32 rx_max_packets;
+	/* Maximum number of usecs to delay a RX notification */
 	__le32 rx_usecs;
 };
 
--
cgit v1.2.3
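As a reading aid for the clarified per-field comments, a minimal sketch of how the two control structures might be filled. The values are arbitrary, and the submission over the control virtqueue (class VIRTIO_NET_CTRL_NOTF_COAL with the VIRTIO_NET_CTRL_NOTF_COAL_TX_SET/RX_SET commands, which is what virtio_net's ethtool -C coalescing support programs when the device offers VIRTIO_NET_F_NOTF_COAL) is deliberately left out.

/* Sketch only: arbitrary coalescing values, no control-virtqueue plumbing. */
#include <linux/virtio_net.h>
#include <asm/byteorder.h>

static void fill_coal_cmds(struct virtio_net_ctrl_coal_tx *tx,
			   struct virtio_net_ctrl_coal_rx *rx)
{
	/* notify after at most 32 sent packets, or delay the TX
	 * notification by at most 50 usecs
	 */
	tx->tx_max_packets = __cpu_to_le32(32);
	tx->tx_usecs = __cpu_to_le32(50);

	/* notify after at most 64 received packets, or delay the RX
	 * notification by at most 100 usecs
	 */
	rx->rx_max_packets = __cpu_to_le32(64);
	rx->rx_usecs = __cpu_to_le32(100);
}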
From 79e3602caa6f9d59c4f66a268407080496dae408 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 30 Aug 2022 11:56:56 -0700
Subject: tcp: make global challenge ack rate limitation per net-ns and default disabled

Because per-host rate limiting has been proven problematic (side-channel
attacks can be based on it), per-host rate limiting of challenge acks
ideally should be per netns and turned off by default.

This is a long overdue follow-up to the following commits:

083ae308280d ("tcp: enable per-socket rate limiting of all 'challenge acks'")
f2b2c582e824 ("tcp: mitigate ACK loops for connections as tcp_sock")
75ff39ccc1bd ("tcp: make challenge acks less predictable")

Signed-off-by: Eric Dumazet
Cc: Jason Baron
Acked-by: Neal Cardwell
Signed-off-by: Jakub Kicinski
---
 Documentation/networking/ip-sysctl.rst |  5 ++++-
 include/net/netns/ipv4.h               |  2 ++
 net/ipv4/tcp_input.c                   | 21 +++++++++++----------
 net/ipv4/tcp_ipv4.c                    |  6 ++++--
 4 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 56cd4ea059b2..a759872a2883 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1035,7 +1035,10 @@ tcp_limit_output_bytes - INTEGER
 tcp_challenge_ack_limit - INTEGER
 	Limits number of Challenge ACK sent per second, as recommended
 	in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
-	Default: 1000
+	Note that this per netns rate limit can allow some side channel
+	attacks and probably should not be enabled.
+	TCP stack implements per TCP socket limits anyway.
+	Default: INT_MAX (unlimited)
 
 UDP variables
 =============
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index c7320ef356d9..6320a76cefdc 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -179,6 +179,8 @@ struct netns_ipv4 {
 	unsigned int sysctl_tcp_fastopen_blackhole_timeout;
 	atomic_t tfo_active_disable_times;
 	unsigned long tfo_active_disable_stamp;
+	u32 tcp_challenge_timestamp;
+	u32 tcp_challenge_count;
 
 	int sysctl_udp_wmem_min;
 	int sysctl_udp_rmem_min;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c184e15397a2..b85a9f755da4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3614,12 +3614,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
 /* RFC 5961 7 [ACK Throttling] */
 static void tcp_send_challenge_ack(struct sock *sk)
 {
-	/* unprotected vars, we dont care of overwrites */
-	static u32 challenge_timestamp;
-	static unsigned int challenge_count;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
-	u32 count, now;
+	u32 count, now, ack_limit;
 
 	/* First check our per-socket dupack rate limit. */
 	if (__tcp_oow_rate_limited(net,
@@ -3627,18 +3624,22 @@ static void tcp_send_challenge_ack(struct sock *sk)
 				 &tp->last_oow_ack_time))
 		return;
 
+	ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
+	if (ack_limit == INT_MAX)
+		goto send_ack;
+
 	/* Then check host-wide RFC 5961 rate limit. */
 	now = jiffies / HZ;
-	if (now != READ_ONCE(challenge_timestamp)) {
-		u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
+	if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) {
 		u32 half = (ack_limit + 1) >> 1;
 
-		WRITE_ONCE(challenge_timestamp, now);
-		WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
+		WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now);
+		WRITE_ONCE(net->ipv4.tcp_challenge_count, half + prandom_u32_max(ack_limit));
 	}
-	count = READ_ONCE(challenge_count);
+	count = READ_ONCE(net->ipv4.tcp_challenge_count);
 	if (count > 0) {
-		WRITE_ONCE(challenge_count, count - 1);
+		WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1);
+send_ack:
 		NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
 		tcp_send_ack(sk);
 	}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0c83780dc9bf..5b019ba2b9d2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3139,8 +3139,10 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
 	/* Default TSQ limit of 16 TSO segments */
 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
-	/* rfc5961 challenge ack rate limiting */
-	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
+
+	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
+	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
+
 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
--
cgit v1.2.3
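For administrators who still want a netns-wide limit, the sysctl can simply be set back to a finite value; a minimal userspace sketch (1000 is merely the former default, any value other than INT_MAX re-enables the limit):

/* Sketch: re-enable the per-netns RFC 5961 challenge-ACK rate limit, which
 * this patch leaves disabled (INT_MAX) by default. Needs root.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/net/ipv4/tcp_challenge_ack_limit";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "1000\n");	/* former kernel default */
	fclose(f);
	return 0;
}

Either way, the per-socket limit enforced by __tcp_oow_rate_limited() in the hunk above still applies.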