From e8d283b6cf0e83d5fcb5345e037956eb3e9b2483 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 5 Jan 2023 14:15:37 -0800 Subject: net: ipv6: rpl_iptunnel: Replace 0-length arrays with flexible arrays Zero-length arrays are deprecated[1]. Replace struct ipv6_rpl_sr_hdr's "segments" union of 0-length arrays with flexible arrays. Detected with GCC 13, using -fstrict-flex-arrays=3: In function 'rpl_validate_srh', inlined from 'rpl_build_state' at ../net/ipv6/rpl_iptunnel.c:96:7: ../net/ipv6/rpl_iptunnel.c:60:28: warning: array subscript is outside array bounds of 'struct in6_addr[0]' [-Warray-bounds=] 60 | if (ipv6_addr_type(&srh->rpl_segaddr[srh->segments_left - 1]) & | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from ../include/net/rpl.h:12, from ../net/ipv6/rpl_iptunnel.c:13: ../include/uapi/linux/rpl.h: In function 'rpl_build_state': ../include/uapi/linux/rpl.h:40:33: note: while referencing 'addr' 40 | struct in6_addr addr[0]; | ^~~~ [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays Cc: Hideaki YOSHIFUJI Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230105221533.never.711-kees@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv6/rpl_iptunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index ff691d9f4a04..b1c028df686e 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -13,7 +13,7 @@ #include struct rpl_iptunnel_encap { - struct ipv6_rpl_sr_hdr srh[0]; + DECLARE_FLEX_ARRAY(struct ipv6_rpl_sr_hdr, srh); }; struct rpl_lwt { -- cgit v1.2.3 From af6d10345ca76670c1b7c37799f0d5576ccef277 Mon Sep 17 00:00:00 2001 From: Jon Maxwell Date: Thu, 12 Jan 2023 12:25:32 +1100 Subject: ipv6: remove max_size check inline with ipv4 In ip6_dst_gc() replace: if (entries > gc_thresh) With: if (entries > ops->gc_thresh) Sending Ipv6 packets in a loop via a raw socket triggers an issue where a route is cloned by ip6_rt_cache_alloc() for each packet sent. This quickly consumes the Ipv6 max_size threshold which defaults to 4096 resulting in these warnings: [1] 99.187805] dst_alloc: 7728 callbacks suppressed [2] Route cache is full: consider increasing sysctl net.ipv6.route.max_size. . . [300] Route cache is full: consider increasing sysctl net.ipv6.route.max_size. When this happens the packet is dropped and sendto() gets a network is unreachable error: remaining pkt 200557 errno 101 remaining pkt 196462 errno 101 . . remaining pkt 126821 errno 101 Implement David Aherns suggestion to remove max_size check seeing that Ipv6 has a GC to manage memory usage. Ipv4 already does not check max_size. Here are some memory comparisons for Ipv4 vs Ipv6 with the patch: Test by running 5 instances of a program that sends UDP packets to a raw socket 5000000 times. Compare Ipv4 and Ipv6 performance with a similar program. Ipv4: Before test: MemFree: 29427108 kB Slab: 237612 kB ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 2881 3990 192 42 2 : tunables 0 0 0 During test: MemFree: 29417608 kB Slab: 247712 kB ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 44394 44394 192 42 2 : tunables 0 0 0 After test: MemFree: 29422308 kB Slab: 238104 kB ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 Ipv6 with patch: Errno 101 errors are not observed anymore with the patch. Before test: MemFree: 29422308 kB Slab: 238104 kB ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 During Test: MemFree: 29431516 kB Slab: 240940 kB ip6_dst_cache 11980 12064 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 After Test: MemFree: 29441816 kB Slab: 238132 kB ip6_dst_cache 1902 2432 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 Tested-by: Andrea Mayer Signed-off-by: Jon Maxwell Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230112012532.311021-1-jmaxwell37@gmail.com Signed-off-by: Jakub Kicinski --- include/net/dst_ops.h | 2 +- net/core/dst.c | 8 ++------ net/ipv6/route.c | 13 +++++-------- 3 files changed, 8 insertions(+), 15 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 88ff7bb2bb9b..632086b2f644 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -16,7 +16,7 @@ struct dst_ops { unsigned short family; unsigned int gc_thresh; - int (*gc)(struct dst_ops *ops); + void (*gc)(struct dst_ops *ops); struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); unsigned int (*default_advmss)(const struct dst_entry *); unsigned int (*mtu)(const struct dst_entry *); diff --git a/net/core/dst.c b/net/core/dst.c index 6d2dd03dafa8..31c08a3386d3 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -82,12 +82,8 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, if (ops->gc && !(flags & DST_NOCOUNT) && - dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) { - pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); - return NULL; - } - } + dst_entries_get_fast(ops) > ops->gc_thresh) + ops->gc(ops); dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e74e0361fd92..b643dda68d31 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -91,7 +91,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); -static int ip6_dst_gc(struct dst_ops *ops); +static void ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); @@ -3284,11 +3284,10 @@ out: return dst; } -static int ip6_dst_gc(struct dst_ops *ops) +static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; - int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; @@ -3296,11 +3295,10 @@ static int ip6_dst_gc(struct dst_ops *ops) int entries; entries = dst_entries_get_fast(ops); - if (entries > rt_max_size) + if (entries > ops->gc_thresh) entries = dst_entries_get_slow(ops); - if (time_after(rt_last_gc + rt_min_interval, jiffies) && - entries <= rt_max_size) + if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out; fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); @@ -3310,7 +3308,6 @@ static int ip6_dst_gc(struct dst_ops *ops) out: val = atomic_read(&net->ipv6.ip6_rt_gc_expire); atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); - return entries > rt_max_size; } static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, @@ -6512,7 +6509,7 @@ static int __net_init ip6_route_net_init(struct net *net) #endif net->ipv6.sysctl.flush_delay = 0; - net->ipv6.sysctl.ip6_rt_max_size = 4096; + net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; -- cgit v1.2.3 From 9259f6b573cf17c00f50c4b626983a5347b1abe9 Mon Sep 17 00:00:00 2001 From: Tanmay Bhushan <007047221b@gmail.com> Date: Mon, 16 Jan 2023 15:55:00 +0100 Subject: ipv6: Remove extra counter pull before gc Per cpu entries are no longer used in consideration for doing gc or not. Remove the extra per cpu entries pull to directly check for time and perform gc. Signed-off-by: Tanmay Bhushan <007047221b@gmail.com> Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b643dda68d31..76889ceeead9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3294,10 +3294,6 @@ static void ip6_dst_gc(struct dst_ops *ops) unsigned int val; int entries; - entries = dst_entries_get_fast(ops); - if (entries > ops->gc_thresh) - entries = dst_entries_get_slow(ops); - if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out; -- cgit v1.2.3 From 90317bcdbd337b9e88f253650f6ab9dfe667be64 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 23 Jan 2023 18:47:09 +0100 Subject: ipv6: Make ip6_route_output_flags_noref() static. This function is only used in net/ipv6/route.c and has no reason to be visible outside of it. Signed-off-by: Guillaume Nault Reviewed-by: David Ahern Link: https://lore.kernel.org/r/50706db7f675e40b3594d62011d9363dce32b92e.1674495822.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/ip6_route.h | 4 ---- net/ipv6/route.c | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 035d61d50a98..81ee387a1fc4 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -84,10 +84,6 @@ struct dst_entry *ip6_route_input_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags); -struct dst_entry *ip6_route_output_flags_noref(struct net *net, - const struct sock *sk, - struct flowi6 *fl6, int flags); - struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 76889ceeead9..c180c2ef17c5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2593,9 +2593,10 @@ INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } -struct dst_entry *ip6_route_output_flags_noref(struct net *net, - const struct sock *sk, - struct flowi6 *fl6, int flags) +static struct dst_entry *ip6_route_output_flags_noref(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + int flags) { bool any_src; @@ -2624,7 +2625,6 @@ struct dst_entry *ip6_route_output_flags_noref(struct net *net, return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } -EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, -- cgit v1.2.3 From d0941130c93515411c8d66fc22bdae407b509a6d Mon Sep 17 00:00:00 2001 From: Jamie Bainbridge Date: Wed, 25 Jan 2023 11:16:52 +1100 Subject: icmp: Add counters for rate limits There are multiple ICMP rate limiting mechanisms: * Global limits: net.ipv4.icmp_msgs_burst/icmp_msgs_per_sec * v4 per-host limits: net.ipv4.icmp_ratelimit/ratemask * v6 per-host limits: net.ipv6.icmp_ratelimit/ratemask However, when ICMP output is limited, there is no way to tell which limit has been hit or even if the limits are responsible for the lack of ICMP output. Add counters for each of the cases above. As we are within local_bh_disable(), use the __INC stats variant. Example output: # nstat -sz "*RateLimit*" IcmpOutRateLimitGlobal 134 0.0 IcmpOutRateLimitHost 770 0.0 Icmp6OutRateLimitHost 84 0.0 Signed-off-by: Jamie Bainbridge Suggested-by: Abhishek Rawal Link: https://lore.kernel.org/r/273b32241e6b7fdc5c609e6f5ebc68caf3994342.1674605770.git.jamie.bainbridge@gmail.com Signed-off-by: Paolo Abeni --- include/uapi/linux/snmp.h | 3 +++ net/ipv4/icmp.c | 3 +++ net/ipv4/proc.c | 8 +++++--- net/ipv6/icmp.c | 4 ++++ net/ipv6/proc.c | 1 + 5 files changed, 16 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 6600cb0164c2..26f33a4c253d 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -95,6 +95,8 @@ enum ICMP_MIB_OUTADDRMASKS, /* OutAddrMasks */ ICMP_MIB_OUTADDRMASKREPS, /* OutAddrMaskReps */ ICMP_MIB_CSUMERRORS, /* InCsumErrors */ + ICMP_MIB_RATELIMITGLOBAL, /* OutRateLimitGlobal */ + ICMP_MIB_RATELIMITHOST, /* OutRateLimitHost */ __ICMP_MIB_MAX }; @@ -112,6 +114,7 @@ enum ICMP6_MIB_OUTMSGS, /* OutMsgs */ ICMP6_MIB_OUTERRORS, /* OutErrors */ ICMP6_MIB_CSUMERRORS, /* InCsumErrors */ + ICMP6_MIB_RATELIMITHOST, /* OutRateLimitHost */ __ICMP6_MIB_MAX }; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 46aa2d65e40a..8cebb476b3ab 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -296,6 +296,7 @@ static bool icmpv4_global_allow(struct net *net, int type, int code) if (icmp_global_allow()) return true; + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -325,6 +326,8 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, if (peer) inet_putpeer(peer); out: + if (!rc) + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); return rc; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f88daace9de3..eaf1d3113b62 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -353,7 +353,7 @@ static void icmp_put(struct seq_file *seq) seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); - seq_puts(seq, " OutMsgs OutErrors"); + seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", @@ -363,9 +363,11 @@ static void icmp_put(struct seq_file *seq) for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); - seq_printf(seq, " %lu %lu", + seq_printf(seq, " %lu %lu %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST)); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 9d92d51c4757..79c769c0d113 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -183,6 +183,7 @@ static bool icmpv6_global_allow(struct net *net, int type) if (icmp_global_allow()) return true; + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -224,6 +225,9 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (peer) inet_putpeer(peer); } + if (!res) + __ICMP6_INC_STATS(net, ip6_dst_idev(dst), + ICMP6_MIB_RATELIMITHOST); dst_release(dst); return res; } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index d6306aa46bb1..e20b3705c2d2 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -94,6 +94,7 @@ static const struct snmp_mib snmp6_icmp6_list[] = { SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), + SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), SNMP_MIB_SENTINEL }; -- cgit v1.2.3 From bc61761394ce0f0cc35c6fc60426f08d83d0d488 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 31 Jan 2023 14:34:56 +0800 Subject: ipv6: ICMPV6: Use swap() instead of open coding it Swap is a function interface that provides exchange function. To avoid code duplication, we can use swap function. ./net/ipv6/icmp.c:344:25-26: WARNING opportunity for swap(). Reported-by: Abaci Robot Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=3896 Signed-off-by: Jiapeng Chong Reviewed-by: Simon Horman Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230131063456.76302-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/ipv6/icmp.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 79c769c0d113..c9346515e24d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -332,7 +332,6 @@ static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt { struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hao *hao; - struct in6_addr tmp; int off; if (opt->dsthao) { @@ -340,9 +339,7 @@ static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt if (likely(off >= 0)) { hao = (struct ipv6_destopt_hao *) (skb_network_header(skb) + off); - tmp = iph->saddr; - iph->saddr = hao->addr; - hao->addr = tmp; + swap(iph->saddr, hao->addr); } } } -- cgit v1.2.3 From 2798e36dc233a409a5d3f26f73029596dc504020 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Feb 2023 17:43:45 +0000 Subject: tcp: add TCP_MINTTL drop reason In the unlikely case incoming packets are dropped because of IP_MINTTL / IPV6_MINHOPCOUNT constraints... Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230201174345.2708943-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 6 ++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 3 ++- 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 70539288f995..94bc3d5d8803 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -71,6 +71,7 @@ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ FN(FRAG_TOO_FAR) \ + FN(TCP_MINTTL) \ FNe(MAX) /** @@ -312,6 +313,11 @@ enum skb_drop_reason { * (/proc/sys/net/ipv4/ipfrag_max_dist) */ SKB_DROP_REASON_FRAG_TOO_FAR, + /** + * @SKB_DROP_REASON_TCP_MINTTL: ipv4 ttl or ipv6 hoplimit below + * the threshold (IP_MINTTL or IPV6_MINHOPCOUNT). + */ + SKB_DROP_REASON_TCP_MINTTL, /** * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be * used as a real 'reason' diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8320d0ecb13a..ea370afa70ed 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2102,6 +2102,7 @@ process: /* min_ttl can be changed concurrently from do_ip_setsockopt() */ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); + drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 11b736a76bd7..543ee2167720 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1708,8 +1708,9 @@ process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ - if (hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) { + if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); + drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } -- cgit v1.2.3 From 8d8ebd77f5ede7ff9e3072653221706655924191 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Feb 2023 09:40:58 +0000 Subject: ipv6: raw: add drop reasons Use existing helpers and drop reason codes for RAW input path. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv6/raw.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ada087b50541..2e1c8060b51a 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -355,17 +355,19 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) { + enum skb_drop_reason reason; + if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } /* Charge it to the socket. */ skb_dst_drop(skb); - if (sock_queue_rcv_skb(sk, skb) < 0) { - kfree_skb(skb); + if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { + kfree_skb_reason(skb, reason); return NET_RX_DROP; } @@ -386,7 +388,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -410,7 +412,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (inet->hdrincl) { if (skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } } -- cgit v1.2.3 From 6579f5bacc2c4cbc5ef6abb45352416939d1f844 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Feb 2023 09:41:00 +0000 Subject: raw: use net_hash_mix() in hash function Some applications seem to rely on RAW sockets. If they use private netns, we can avoid piling all RAW sockets bound to a given protocol into a single bucket. Also place (struct raw_hashinfo).lock into its own cache line to limit false sharing. Alternative would be to have per-netns hashtables, but this seems too expensive for most netns where RAW sockets are not used. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/raw.h | 13 +++++++++++-- net/ipv4/raw.c | 13 +++++++------ net/ipv6/raw.c | 4 ++-- 3 files changed, 20 insertions(+), 10 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/raw.h b/include/net/raw.h index 5e665934ebc7..2c004c20ed99 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -15,6 +15,8 @@ #include #include +#include +#include #include extern struct proto raw_prot; @@ -29,13 +31,20 @@ int raw_local_deliver(struct sk_buff *, int); int raw_rcv(struct sock *, struct sk_buff *); -#define RAW_HTABLE_SIZE MAX_INET_PROTOS +#define RAW_HTABLE_LOG 8 +#define RAW_HTABLE_SIZE (1U << RAW_HTABLE_LOG) struct raw_hashinfo { spinlock_t lock; - struct hlist_nulls_head ht[RAW_HTABLE_SIZE]; + + struct hlist_nulls_head ht[RAW_HTABLE_SIZE] ____cacheline_aligned; }; +static inline u32 raw_hashfunc(const struct net *net, u32 proto) +{ + return hash_32(net_hash_mix(net) ^ proto, RAW_HTABLE_LOG); +} + static inline void raw_hashinfo_init(struct raw_hashinfo *hashinfo) { int i; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 9865d15a08df..94df935ee0c5 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -93,7 +93,7 @@ int raw_hash_sk(struct sock *sk) struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_nulls_head *hlist; - hlist = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)]; + hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)]; spin_lock(&h->lock); __sk_nulls_add_node_rcu(sk, hlist); @@ -160,9 +160,9 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ -static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) +static int raw_v4_input(struct net *net, struct sk_buff *skb, + const struct iphdr *iph, int hash) { - struct net *net = dev_net(skb->dev); struct hlist_nulls_head *hlist; struct hlist_nulls_node *hnode; int sdif = inet_sdif(skb); @@ -193,9 +193,10 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) int raw_local_deliver(struct sk_buff *skb, int protocol) { - int hash = protocol & (RAW_HTABLE_SIZE - 1); + struct net *net = dev_net(skb->dev); - return raw_v4_input(skb, ip_hdr(skb), hash); + return raw_v4_input(net, skb, ip_hdr(skb), + raw_hashfunc(net, protocol)); } static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) @@ -271,7 +272,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) struct sock *sk; int hash; - hash = protocol & (RAW_HTABLE_SIZE - 1); + hash = raw_hashfunc(net, protocol); hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 2e1c8060b51a..bac9ba747bde 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -152,7 +152,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) saddr = &ipv6_hdr(skb)->saddr; daddr = saddr + 1; - hash = nexthdr & (RAW_HTABLE_SIZE - 1); + hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_nulls_for_each(sk, hnode, hlist) { @@ -338,7 +338,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, struct sock *sk; int hash; - hash = nexthdr & (RAW_HTABLE_SIZE - 1); + hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_nulls_for_each(sk, hnode, hlist) { -- cgit v1.2.3 From 30c89bad3ea2ef7a2d4686f9c3cc08420fe627bc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Feb 2023 18:47:07 +0000 Subject: ipv6: icmp6: add drop reason support to icmpv6_notify() Accurately reports what happened in icmpv6_notify() when handling a packet. This makes use of the new IPV6_BAD_EXTHDR drop reason. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 3 ++- net/ipv6/icmp.c | 25 +++++++++++++++++-------- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 03f3af02a9a6..7332296eca44 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -436,7 +436,8 @@ static inline void fl6_sock_release(struct ip6_flowlabel *fl) atomic_dec(&fl->users); } -void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); +enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, + u8 code, __be32 info); void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index c9346515e24d..40bb5dedac09 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -813,16 +813,19 @@ out_bh_enable: local_bh_enable(); } -void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) +enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, + u8 code, __be32 info) { struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net(skb->dev); const struct inet6_protocol *ipprot; + enum skb_drop_reason reason; int inner_offset; __be16 frag_off; u8 nexthdr; - struct net *net = dev_net(skb->dev); - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + reason = pskb_may_pull_reason(skb, sizeof(struct ipv6hdr)); + if (reason != SKB_NOT_DROPPED_YET) goto out; seg6_icmp_srh(skb, opt); @@ -832,14 +835,17 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) /* now skip over extension headers */ inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); - if (inner_offset < 0) + if (inner_offset < 0) { + SKB_DR_SET(reason, IPV6_BAD_EXTHDR); goto out; + } } else { inner_offset = sizeof(struct ipv6hdr); } /* Checkin header including 8 bytes of inner protocol header. */ - if (!pskb_may_pull(skb, inner_offset+8)) + reason = pskb_may_pull_reason(skb, inner_offset + 8); + if (reason != SKB_NOT_DROPPED_YET) goto out; /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. @@ -854,10 +860,11 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) ipprot->err_handler(skb, opt, type, code, inner_offset, info); raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info); - return; + return SKB_CONSUMED; out: __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return reason; } /* @@ -953,7 +960,8 @@ static int icmpv6_rcv(struct sk_buff *skb) case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: - icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + reason = icmpv6_notify(skb, type, hdr->icmp6_code, + hdr->icmp6_mtu); break; case NDISC_ROUTER_SOLICITATION: @@ -995,7 +1003,8 @@ static int icmpv6_rcv(struct sk_buff *skb) * must pass to upper level */ - icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + reason = icmpv6_notify(skb, type, hdr->icmp6_code, + hdr->icmp6_mtu); } /* until the v6 path can be better sorted assume failure and -- cgit v1.2.3 From 545dbcd124b02c9dc93c8a5894c71d682effc3e6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Feb 2023 18:47:08 +0000 Subject: ipv6: icmp6: add drop reason support to ndisc_rcv() Creates three new drop reasons: SKB_DROP_REASON_IPV6_NDISC_FRAG: invalid frag (suppress_frag_ndisc). SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT: invalid hop limit. SKB_DROP_REASON_IPV6_NDISC_BAD_CODE: invalid NDISC icmp6 code. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 9 +++++++++ include/net/ndisc.h | 2 +- net/ipv6/icmp.c | 2 +- net/ipv6/ndisc.c | 13 +++++++------ 4 files changed, 18 insertions(+), 8 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 6c41e535175c..ef3f65d135d3 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -73,6 +73,9 @@ FN(FRAG_TOO_FAR) \ FN(TCP_MINTTL) \ FN(IPV6_BAD_EXTHDR) \ + FN(IPV6_NDISC_FRAG) \ + FN(IPV6_NDISC_HOP_LIMIT) \ + FN(IPV6_NDISC_BAD_CODE) \ FNe(MAX) /** @@ -321,6 +324,12 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_MINTTL, /** @SKB_DROP_REASON_IPV6_BAD_EXTHDR: Bad IPv6 extension header. */ SKB_DROP_REASON_IPV6_BAD_EXTHDR, + /** @SKB_DROP_REASON_IPV6_NDISC_FRAG: invalid frag (suppress_frag_ndisc). */ + SKB_DROP_REASON_IPV6_NDISC_FRAG, + /** @SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT: invalid hop limit. */ + SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT, + /** @SKB_DROP_REASON_IPV6_NDISC_BAD_CODE: invalid NDISC icmp6 code. */ + SKB_DROP_REASON_IPV6_NDISC_BAD_CODE, /** * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be * used as a real 'reason' diff --git a/include/net/ndisc.h b/include/net/ndisc.h index da7eec8669ec..07e5168cdaf9 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -445,7 +445,7 @@ int ndisc_late_init(void); void ndisc_late_cleanup(void); void ndisc_cleanup(void); -int ndisc_rcv(struct sk_buff *skb); +enum skb_drop_reason ndisc_rcv(struct sk_buff *skb); struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *saddr, u64 nonce); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 40bb5dedac09..f32bc98155bf 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -969,7 +969,7 @@ static int icmpv6_rcv(struct sk_buff *skb) case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: - ndisc_rcv(skb); + reason = ndisc_rcv(skb); break; case ICMPV6_MGM_QUERY: diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3a553494ff16..9548b5a44714 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1804,15 +1804,16 @@ static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) return false; } -int ndisc_rcv(struct sk_buff *skb) +enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; + SKB_DR(reason); if (ndisc_suppress_frag_ndisc(skb)) - return 0; + return SKB_DROP_REASON_IPV6_NDISC_FRAG; if (skb_linearize(skb)) - return 0; + return SKB_DROP_REASON_NOMEM; msg = (struct nd_msg *)skb_transport_header(skb); @@ -1821,13 +1822,13 @@ int ndisc_rcv(struct sk_buff *skb) if (ipv6_hdr(skb)->hop_limit != 255) { ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", ipv6_hdr(skb)->hop_limit); - return 0; + return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT; } if (msg->icmph.icmp6_code != 0) { ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", msg->icmph.icmp6_code); - return 0; + return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE; } switch (msg->icmph.icmp6_type) { @@ -1853,7 +1854,7 @@ int ndisc_rcv(struct sk_buff *skb) break; } - return 0; + return reason; } static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) -- cgit v1.2.3 From fe33311c3e371855c4f4c0ab8a5fce5b9a9fdafd Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Tue, 14 Feb 2023 12:14:10 +0800 Subject: net: no longer support SOCK_REFCNT_DEBUG feature Commit e48c414ee61f ("[INET]: Generalise the TCP sock ID lookup routines") commented out the definition of SOCK_REFCNT_DEBUG in 2005 and later another commit 463c84b97f24 ("[NET]: Introduce inet_connection_sock") removed it. Since we could track all of them through bpf and kprobe related tools and the feature could print loads of information which might not be that helpful even under a little bit pressure, the whole feature which has been inactive for many years is no longer supported. Link: https://lore.kernel.org/lkml/20230211065153.54116-1-kerneljasonxing@gmail.com/ Suggested-by: Kuniyuki Iwashima Signed-off-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Acked-by: Wenjia Zhang Reviewed-by: Eric Dumazet Acked-by: Matthieu Baerts Signed-off-by: David S. Miller --- include/net/sock.h | 28 ---------------------------- net/core/sock.c | 13 ------------- net/ipv4/af_inet.c | 3 --- net/ipv4/inet_connection_sock.c | 2 -- net/ipv4/inet_timewait_sock.c | 3 --- net/ipv6/af_inet6.c | 10 ---------- net/ipv6/ipv6_sockglue.c | 12 ------------ net/mptcp/protocol.c | 1 - net/packet/af_packet.c | 4 ---- net/sctp/ipv6.c | 2 -- net/sctp/protocol.c | 2 -- net/smc/af_smc.c | 3 --- net/xdp/xsk.c | 4 ---- 13 files changed, 87 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/sock.h b/include/net/sock.h index 937e842dc930..2cb258fde072 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1349,9 +1349,6 @@ struct proto { char name[32]; struct list_head node; -#ifdef SOCK_REFCNT_DEBUG - atomic_t socks; -#endif int (*diag_destroy)(struct sock *sk, int err); } __randomize_layout; @@ -1359,31 +1356,6 @@ int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); int sock_load_diag_module(int family, int protocol); -#ifdef SOCK_REFCNT_DEBUG -static inline void sk_refcnt_debug_inc(struct sock *sk) -{ - atomic_inc(&sk->sk_prot->socks); -} - -static inline void sk_refcnt_debug_dec(struct sock *sk) -{ - atomic_dec(&sk->sk_prot->socks); - printk(KERN_DEBUG "%s socket %p released, %d are still alive\n", - sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks)); -} - -static inline void sk_refcnt_debug_release(const struct sock *sk) -{ - if (refcount_read(&sk->sk_refcnt) != 1) - printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n", - sk->sk_prot->name, sk, refcount_read(&sk->sk_refcnt)); -} -#else /* SOCK_REFCNT_DEBUG */ -#define sk_refcnt_debug_inc(sk) do { } while (0) -#define sk_refcnt_debug_dec(sk) do { } while (0) -#define sk_refcnt_debug_release(sk) do { } while (0) -#endif /* SOCK_REFCNT_DEBUG */ - INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); static inline int sk_forward_alloc_get(const struct sock *sk) diff --git a/net/core/sock.c b/net/core/sock.c index afbb02984d5f..341c565dbc26 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2340,17 +2340,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) smp_wmb(); refcount_set(&newsk->sk_refcnt, 2); - /* Increment the counter in the same struct proto as the master - * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that - * is the same as sk->sk_prot->socks, as this field was copied - * with memcpy). - * - * This _changes_ the previous behaviour, where - * tcp_create_openreq_child always was incrementing the - * equivalent to tcp_prot->socks (inet_sock_nr), so this have - * to be taken into account in all callers. -acme - */ - sk_refcnt_debug_inc(newsk); sk_set_socket(newsk, NULL); sk_tx_queue_clear(newsk); RCU_INIT_POINTER(newsk->sk_wq, NULL); @@ -3710,8 +3699,6 @@ void sk_common_release(struct sock *sk) xfrm_sk_free_policy(sk); - sk_refcnt_debug_release(sk); - sock_put(sk); } EXPORT_SYMBOL(sk_common_release); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 2c778b013cb0..8db6747f892f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -156,7 +156,6 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); - sk_refcnt_debug_dec(sk); } EXPORT_SYMBOL(inet_sock_destruct); @@ -357,8 +356,6 @@ lookup_protocol: inet->mc_list = NULL; inet->rcv_tos = 0; - sk_refcnt_debug_inc(sk); - if (inet->inet_num) { /* It assumes that any protocol which allows * the user to assign a number at socket diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7d206a10ad14..eedcf4146d29 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1199,8 +1199,6 @@ void inet_csk_destroy_sock(struct sock *sk) xfrm_sk_free_policy(sk); - sk_refcnt_debug_release(sk); - this_cpu_dec(*sk->sk_prot->orphan_count); sock_put(sk); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index beed32fff484..40052414c7c7 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -77,9 +77,6 @@ void inet_twsk_free(struct inet_timewait_sock *tw) { struct module *owner = tw->tw_prot->owner; twsk_destructor((struct sock *)tw); -#ifdef SOCK_REFCNT_DEBUG - pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw); -#endif kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 847934763868..38689bedfce7 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -239,16 +239,6 @@ lookup_protocol: inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; - /* - * Increment only the relevant sk_prot->socks debug field, this changes - * the previous behaviour of incrementing both the equivalent to - * answer->prot->socks (inet6_sock_nr) and inet_sock_nr. - * - * This allows better debug granularity as we'll know exactly how many - * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6 - * transport protocol socks. -acme - */ - sk_refcnt_debug_inc(sk); if (inet->inet_num) { /* It assumes that any protocol which allows diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 9ce51680290b..2917dd8d198c 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -464,13 +464,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); - /* - * Sock is moving from IPv6 to IPv4 (sk_prot), so - * remove it from the refcnt debug socks count in the - * original family... - */ - sk_refcnt_debug_dec(sk); - if (sk->sk_protocol == IPPROTO_TCP) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -507,11 +500,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, inet6_cleanup_sock(sk); - /* - * ... and add it to the refcnt debug socks count - * in the new family. -acme - */ - sk_refcnt_debug_inc(sk); module_put(THIS_MODULE); retv = 0; break; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c9817aa0f413..3ad9c46202fc 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2875,7 +2875,6 @@ static void __mptcp_destroy_sock(struct sock *sk) sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); - sk_refcnt_debug_release(sk); sock_put(sk); } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8ffb19c643ab..d4e76e2ae153 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1335,8 +1335,6 @@ static void packet_sock_destruct(struct sock *sk) pr_err("Attempt to release alive packet socket: %p\n", sk); return; } - - sk_refcnt_debug_dec(sk); } static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) @@ -3174,7 +3172,6 @@ static int packet_release(struct socket *sock) skb_queue_purge(&sk->sk_receive_queue); packet_free_pending(po); - sk_refcnt_debug_release(sk); sock_put(sk); return 0; @@ -3364,7 +3361,6 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, packet_cached_dev_reset(po); sk->sk_destruct = packet_sock_destruct; - sk_refcnt_debug_inc(sk); /* * Attach a protocol block diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 097bd60ce964..62b436a2c8fe 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -807,8 +807,6 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - sk_refcnt_debug_inc(newsk); - if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); newsk = NULL; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 909a89a1cff4..c365df24ad33 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -601,8 +601,6 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, newinet->inet_daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr; - sk_refcnt_debug_inc(newsk); - if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); newsk = NULL; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b163266e581a..d7a7420e81ec 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -360,8 +360,6 @@ static void smc_destruct(struct sock *sk) return; if (!sock_flag(sk, SOCK_DEAD)) return; - - sk_refcnt_debug_dec(sk); } static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, @@ -390,7 +388,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, spin_lock_init(&smc->accept_q_lock); spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); - sk_refcnt_debug_inc(sk); mutex_init(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9f0561b67c12..a245c1b4a21b 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -845,7 +845,6 @@ static int xsk_release(struct socket *sock) sock_orphan(sk); sock->sk = NULL; - sk_refcnt_debug_release(sk); sock_put(sk); return 0; @@ -1396,8 +1395,6 @@ static void xsk_destruct(struct sock *sk) if (!xp_put_pool(xs->pool)) xdp_put_umem(xs->umem, !xs->pool); - - sk_refcnt_debug_dec(sk); } static int xsk_create(struct net *net, struct socket *sock, int protocol, @@ -1427,7 +1424,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, sk->sk_family = PF_XDP; sk->sk_destruct = xsk_destruct; - sk_refcnt_debug_inc(sk); sock_set_flag(sk, SOCK_RCU_FREE); -- cgit v1.2.3 From 525c65ff5696f7ccd8335edde1d9964420707910 Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Wed, 15 Feb 2023 14:46:57 +0100 Subject: seg6: factor out End lookup nexthop processing to a dedicated function The End nexthop lookup/input operations are moved into a new helper function named input_action_end_finish(). This avoids duplicating the code needed to compute the nexthop in the different flavors of the End behavior. Signed-off-by: Andrea Mayer Reviewed-by: David Ahern Signed-off-by: Paolo Abeni --- net/ipv6/seg6_local.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 487f8e98deaa..765e89a24bc2 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -364,6 +364,14 @@ static void seg6_next_csid_advance_arg(struct in6_addr *addr, memset(&addr->s6_addr[16 - fnc_octects], 0x00, fnc_octects); } +static int input_action_end_finish(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + seg6_lookup_nexthop(skb, NULL, 0); + + return dst_input(skb); +} + static int input_action_end_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) { @@ -375,9 +383,7 @@ static int input_action_end_core(struct sk_buff *skb, advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - seg6_lookup_nexthop(skb, NULL, 0); - - return dst_input(skb); + return input_action_end_finish(skb, slwt); drop: kfree_skb(skb); @@ -395,9 +401,7 @@ static int end_next_csid_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) /* update DA */ seg6_next_csid_advance_arg(daddr, finfo); - seg6_lookup_nexthop(skb, NULL, 0); - - return dst_input(skb); + return input_action_end_finish(skb, slwt); } static bool seg6_next_csid_enabled(__u32 fops) -- cgit v1.2.3 From bdf3c0b9c10b44c9355dd68f359bad25489445b6 Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Wed, 15 Feb 2023 14:46:58 +0100 Subject: seg6: add PSP flavor support for SRv6 End behavior The "flavors" framework defined in RFC8986 [1] represents additional operations that can modify or extend a subset of existing behaviors such as SRv6 End, End.X and End.T. We report these flavors hereafter: - Penultimate Segment Pop (PSP); - Ultimate Segment Pop (USP); - Ultimate Segment Decapsulation (USD). Depending on how the Segment Routing Header (SRH) has to be handled, an SRv6 End* behavior can support these flavors either individually or in combinations. In this patch, we only consider the PSP flavor for the SRv6 End behavior. A PSP enabled SRv6 End behavior is used by the Source/Ingress SR node (i.e., the one applying the SRv6 Policy) when it needs to instruct the penultimate SR Endpoint node listed in the SID List (carried by the SRH) to remove the SRH from the IPv6 header. Specifically, a PSP enabled SRv6 End behavior processes the SRH by: i) decreasing the Segment Left (SL) from 1 to 0; ii) copying the Last Segment IDentifier (SID) into the IPv6 Destination Address (DA); iii) removing (i.e., popping) the outer SRH from the extension headers following the IPv6 header. It is important to note that PSP operation (steps i, ii, iii) takes place only at a penultimate SR Segment Endpoint node (i.e., when the SL=1) and does not happen at non-penultimate Endpoint nodes. Indeed, when a SID of PSP flavor is processed at a non-penultimate SR Segment Endpoint node, the PSP operation is not performed because it would not be possible to decrease the SL from 1 to 0. SL=2 SL=1 SL=0 | | | For example, given the SRv6 policy (SID List := < X, Y, Z >): - a PSP enabled SRv6 End behavior bound to SID "Y" will apply the PSP operation as Segment Left (SL) is 1, corresponding to the Penultimate Segment of the SID List; - a PSP enabled SRv6 End behavior bound to SID "X" will *NOT* apply the PSP operation as the Segment Left is 2. This behavior instance will apply the "standard" End packet processing, ignoring the configured PSP flavor at all. [1] - RFC8986: https://datatracker.ietf.org/doc/html/rfc8986 Signed-off-by: Andrea Mayer Reviewed-by: David Ahern Signed-off-by: Paolo Abeni --- net/ipv6/seg6_local.c | 336 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 333 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 765e89a24bc2..dd433cc265c8 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -109,8 +109,15 @@ struct bpf_lwt_prog { #define next_csid_chk_lcnode_fn_bits(flen) \ next_csid_chk_lcblock_bits(flen) +#define SEG6_F_LOCAL_FLV_OP(flvname) BIT(SEG6_LOCAL_FLV_OP_##flvname) +#define SEG6_F_LOCAL_FLV_PSP SEG6_F_LOCAL_FLV_OP(PSP) + +/* Supported RFC8986 Flavor operations are reported in this bitmask */ +#define SEG6_LOCAL_FLV8986_SUPP_OPS SEG6_F_LOCAL_FLV_PSP + /* Supported Flavor operations are reported in this bitmask */ -#define SEG6_LOCAL_FLV_SUPP_OPS (BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID)) +#define SEG6_LOCAL_FLV_SUPP_OPS (SEG6_F_LOCAL_FLV_OP(NEXT_CSID) | \ + SEG6_LOCAL_FLV8986_SUPP_OPS) struct seg6_flavors_info { /* Flavor operations */ @@ -409,15 +416,331 @@ static bool seg6_next_csid_enabled(__u32 fops) return fops & BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID); } +/* We describe the packet state in relation to the absence/presence of the SRH + * and the Segment Left (SL) field. + * For our purposes, it is not necessary to record the exact value of the SL + * when the SID List consists of two or more segments. + */ +enum seg6_local_pktinfo { + /* the order really matters! */ + SEG6_LOCAL_PKTINFO_NOHDR = 0, + SEG6_LOCAL_PKTINFO_SL_ZERO, + SEG6_LOCAL_PKTINFO_SL_ONE, + SEG6_LOCAL_PKTINFO_SL_MORE, + __SEG6_LOCAL_PKTINFO_MAX, +}; + +#define SEG6_LOCAL_PKTINFO_MAX (__SEG6_LOCAL_PKTINFO_MAX - 1) + +static enum seg6_local_pktinfo seg6_get_srh_pktinfo(struct ipv6_sr_hdr *srh) +{ + __u8 sgl; + + if (!srh) + return SEG6_LOCAL_PKTINFO_NOHDR; + + sgl = srh->segments_left; + if (sgl < 2) + return SEG6_LOCAL_PKTINFO_SL_ZERO + sgl; + + return SEG6_LOCAL_PKTINFO_SL_MORE; +} + +enum seg6_local_flv_action { + SEG6_LOCAL_FLV_ACT_UNSPEC = 0, + SEG6_LOCAL_FLV_ACT_END, + SEG6_LOCAL_FLV_ACT_PSP, + SEG6_LOCAL_FLV_ACT_USP, + SEG6_LOCAL_FLV_ACT_USD, + __SEG6_LOCAL_FLV_ACT_MAX +}; + +#define SEG6_LOCAL_FLV_ACT_MAX (__SEG6_LOCAL_FLV_ACT_MAX - 1) + +/* The action table for RFC8986 flavors (see the flv8986_act_tbl below) + * contains the actions (i.e. processing operations) to be applied on packets + * when flavors are configured for an End* behavior. + * By combining the pkinfo data and from the flavors mask, the macro + * computes the index used to access the elements (actions) stored in the + * action table. The index is structured as follows: + * + * index + * _______________/\________________ + * / \ + * +----------------+----------------+ + * | pf | afm | + * +----------------+----------------+ + * ph-1 ... p1 p0 fk-1 ... f1 f0 + * MSB LSB + * + * where: + * - 'afm' (adjusted flavor mask) is the mask containing a combination of the + * RFC8986 flavors currently supported. 'afm' corresponds to the @fm + * argument of the macro whose value is righ-shifted by 1 bit. By doing so, + * we discard the SEG6_LOCAL_FLV_OP_UNSPEC flag (bit 0 in @fm) which is + * never used here; + * - 'pf' encodes the packet info (pktinfo) regarding the presence/absence of + * the SRH, SL = 0, etc. 'pf' is set with the value of @pf provided as + * argument to the macro. + */ +#define flv8986_act_tbl_idx(pf, fm) \ + ((((pf) << bits_per(SEG6_LOCAL_FLV8986_SUPP_OPS)) | \ + ((fm) & SEG6_LOCAL_FLV8986_SUPP_OPS)) >> SEG6_LOCAL_FLV_OP_PSP) + +/* We compute the size of the action table by considering the RFC8986 flavors + * actually supported by the kernel. In this way, the size is automatically + * adjusted when new flavors are supported. + */ +#define FLV8986_ACT_TBL_SIZE \ + roundup_pow_of_two(flv8986_act_tbl_idx(SEG6_LOCAL_PKTINFO_MAX, \ + SEG6_LOCAL_FLV8986_SUPP_OPS)) + +/* tbl_cfg(act, pf, fm) macro is used to easily configure the action + * table; it accepts 3 arguments: + * i) @act, the suffix from SEG6_LOCAL_FLV_ACT_{act} representing + * the action that should be applied on the packet; + * ii) @pf, the suffix from SEG6_LOCAL_PKTINFO_{pf} reporting the packet + * info about the lack/presence of SRH, SRH with SL = 0, etc; + * iii) @fm, the mask of flavors. + */ +#define tbl_cfg(act, pf, fm) \ + [flv8986_act_tbl_idx(SEG6_LOCAL_PKTINFO_##pf, \ + (fm))] = SEG6_LOCAL_FLV_ACT_##act + +/* shorthand for improving readability */ +#define F_PSP SEG6_F_LOCAL_FLV_PSP + +/* The table contains, for each combination of the pktinfo data and + * flavors, the action that should be taken on a packet (e.g. + * "standard" Endpoint processing, Penultimate Segment Pop, etc). + * + * By default, table entries not explicitly configured are initialized with the + * SEG6_LOCAL_FLV_ACT_UNSPEC action, which generally has the effect of + * discarding the processed packet. + */ +static const u8 flv8986_act_tbl[FLV8986_ACT_TBL_SIZE] = { + /* PSP variant for packet where SRH with SL = 1 */ + tbl_cfg(PSP, SL_ONE, F_PSP), + /* End for packet where the SRH with SL > 1*/ + tbl_cfg(END, SL_MORE, F_PSP), +}; + +#undef F_PSP +#undef tbl_cfg + +/* For each flavor defined in RFC8986 (or a combination of them) an action is + * performed on the packet. The specific action depends on: + * - info extracted from the packet (i.e. pktinfo data) regarding the + * lack/presence of the SRH, and if the SRH is available, on the value of + * Segment Left field; + * - the mask of flavors configured for the specific SRv6 End* behavior. + * + * The function combines both the pkinfo and the flavors mask to evaluate the + * corresponding action to be taken on the packet. + */ +static enum seg6_local_flv_action +seg6_local_flv8986_act_lookup(enum seg6_local_pktinfo pinfo, __u32 flvmask) +{ + unsigned long index; + + /* check if the provided mask of flavors is supported */ + if (unlikely(flvmask & ~SEG6_LOCAL_FLV8986_SUPP_OPS)) + return SEG6_LOCAL_FLV_ACT_UNSPEC; + + index = flv8986_act_tbl_idx(pinfo, flvmask); + if (unlikely(index >= FLV8986_ACT_TBL_SIZE)) + return SEG6_LOCAL_FLV_ACT_UNSPEC; + + return flv8986_act_tbl[index]; +} + +/* skb->data must be aligned with skb->network_header */ +static bool seg6_pop_srh(struct sk_buff *skb, int srhoff) +{ + struct ipv6_sr_hdr *srh; + struct ipv6hdr *iph; + __u8 srh_nexthdr; + int thoff = -1; + int srhlen; + int nhlen; + + if (unlikely(srhoff < sizeof(*iph) || + !pskb_may_pull(skb, srhoff + sizeof(*srh)))) + return false; + + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srhlen = ipv6_optlen(srh); + + /* we are about to mangle the pkt, let's check if we can write on it */ + if (unlikely(skb_ensure_writable(skb, srhoff + srhlen))) + return false; + + /* skb_ensure_writable() may change skb pointers; evaluate srh again */ + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_nexthdr = srh->nexthdr; + + if (unlikely(!skb_transport_header_was_set(skb))) + goto pull; + + nhlen = skb_network_header_len(skb); + /* we have to deal with the transport header: it could be set before + * the SRH, after the SRH, or within it (which is considered wrong, + * however). + */ + if (likely(nhlen <= srhoff)) + thoff = nhlen; + else if (nhlen >= srhoff + srhlen) + /* transport_header is set after the SRH */ + thoff = nhlen - srhlen; + else + /* transport_header falls inside the SRH; hence, we can't + * restore the transport_header pointer properly after + * SRH removing operation. + */ + return false; +pull: + /* we need to pop the SRH: + * 1) first of all, we pull out everything from IPv6 header up to SRH + * (included) evaluating also the rcsum; + * 2) we overwrite (and then remove) the SRH by properly moving the + * IPv6 along with any extension header that precedes the SRH; + * 3) At the end, we push back the pulled headers (except for SRH, + * obviously). + */ + skb_pull_rcsum(skb, srhoff + srhlen); + memmove(skb_network_header(skb) + srhlen, skb_network_header(skb), + srhoff); + skb_push(skb, srhoff); + + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + if (likely(thoff >= 0)) + skb_set_transport_header(skb, thoff); + + iph = ipv6_hdr(skb); + if (iph->nexthdr == NEXTHDR_ROUTING) { + iph->nexthdr = srh_nexthdr; + } else { + /* we must look for the extension header (EXTH, for short) that + * immediately precedes the SRH we have just removed. + * Then, we update the value of the EXTH nexthdr with the one + * contained in the SRH nexthdr. + */ + unsigned int off = sizeof(*iph); + struct ipv6_opt_hdr *hp, _hdr; + __u8 nexthdr = iph->nexthdr; + + for (;;) { + if (unlikely(!ipv6_ext_hdr(nexthdr) || + nexthdr == NEXTHDR_NONE)) + return false; + + hp = skb_header_pointer(skb, off, sizeof(_hdr), &_hdr); + if (unlikely(!hp)) + return false; + + if (hp->nexthdr == NEXTHDR_ROUTING) { + hp->nexthdr = srh_nexthdr; + break; + } + + switch (nexthdr) { + case NEXTHDR_FRAGMENT: + fallthrough; + case NEXTHDR_AUTH: + /* we expect SRH before FRAG and AUTH */ + return false; + default: + off += ipv6_optlen(hp); + break; + } + + nexthdr = hp->nexthdr; + } + } + + iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + skb_postpush_rcsum(skb, iph, srhoff); + + return true; +} + +/* process the packet on the basis of the RFC8986 flavors set for the given + * SRv6 End behavior instance. + */ +static int end_flv8986_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + const struct seg6_flavors_info *finfo = &slwt->flv_info; + enum seg6_local_flv_action action; + enum seg6_local_pktinfo pinfo; + struct ipv6_sr_hdr *srh; + __u32 flvmask; + int srhoff; + + srh = seg6_get_srh(skb, 0); + srhoff = srh ? ((unsigned char *)srh - skb->data) : 0; + pinfo = seg6_get_srh_pktinfo(srh); +#ifdef CONFIG_IPV6_SEG6_HMAC + if (srh && !seg6_hmac_validate_skb(skb)) + goto drop; +#endif + flvmask = finfo->flv_ops; + if (unlikely(flvmask & ~SEG6_LOCAL_FLV8986_SUPP_OPS)) { + pr_warn_once("seg6local: invalid RFC8986 flavors\n"); + goto drop; + } + + /* retrieve the action triggered by the combination of pktinfo data and + * the flavors mask. + */ + action = seg6_local_flv8986_act_lookup(pinfo, flvmask); + switch (action) { + case SEG6_LOCAL_FLV_ACT_END: + /* process the packet as the "standard" End behavior */ + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + break; + case SEG6_LOCAL_FLV_ACT_PSP: + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + + if (unlikely(!seg6_pop_srh(skb, srhoff))) + goto drop; + break; + case SEG6_LOCAL_FLV_ACT_UNSPEC: + fallthrough; + default: + /* by default, we drop the packet since we could not find a + * suitable action. + */ + goto drop; + } + + return input_action_end_finish(skb, slwt); + +drop: + kfree_skb(skb); + return -EINVAL; +} + /* regular endpoint function */ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) { const struct seg6_flavors_info *finfo = &slwt->flv_info; + __u32 fops = finfo->flv_ops; - if (seg6_next_csid_enabled(finfo->flv_ops)) + if (!fops) + return input_action_end_core(skb, slwt); + + /* check for the presence of NEXT-C-SID since it applies first */ + if (seg6_next_csid_enabled(fops)) return end_next_csid_core(skb, slwt); - return input_action_end_core(skb, slwt); + /* the specific processing function to be performed on the packet + * depends on the combination of flavors defined in RFC8986 and some + * information extracted from the packet, e.g. presence/absence of SRH, + * Segment Left = 0, etc. + */ + return end_flv8986_core(skb, slwt); } /* regular endpoint, and forward to specified nexthop */ @@ -2304,6 +2627,13 @@ int __init seg6_local_init(void) BUILD_BUG_ON(next_csid_chk_lcblock_bits(SEG6_LOCAL_LCBLOCK_DBITS)); BUILD_BUG_ON(next_csid_chk_lcnode_fn_bits(SEG6_LOCAL_LCNODE_FN_DBITS)); + /* To be memory efficient, we use 'u8' to represent the different + * actions related to RFC8986 flavors. If the kernel build stops here, + * it means that it is not possible to correctly encode these actions + * with the data type chosen for the action table. + */ + BUILD_BUG_ON(SEG6_LOCAL_FLV_ACT_MAX > (typeof(flv8986_act_tbl[0]))~0U); + return lwtunnel_encap_add_ops(&seg6_local_ops, LWTUNNEL_ENCAP_SEG6_LOCAL); } -- cgit v1.2.3 From 2954fe60e33da0f4de4d81a4c95c7dddb517d00c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 1 Feb 2023 14:45:22 +0100 Subject: netfilter: let reset rules clean out conntrack entries iptables/nftables support responding to tcp packets with tcp resets. The generated tcp reset packet passes through both output and postrouting netfilter hooks, but conntrack will never see them because the generated skb has its ->nfct pointer copied over from the packet that triggered the reset rule. If the reset rule is used for established connections, this may result in the conntrack entry to be around for a very long time (default timeout is 5 days). One way to avoid this would be to not copy the nf_conn pointer so that the rest packet passes through conntrack too. Problem is that output rules might not have the same conntrack zone setup as the prerouting ones, so its possible that the reset skb won't find the correct entry. Generating a template entry for the skb seems error prone as well. Add an explicit "closing" function that switches a confirmed conntrack entry to closed state and wire this up for tcp. If the entry isn't confirmed, no action is needed because the conntrack entry will never be committed to the table. Reported-by: Russel King Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 3 +++ include/net/netfilter/nf_conntrack.h | 8 ++++++++ net/ipv4/netfilter/nf_reject_ipv4.c | 1 + net/ipv6/netfilter/nf_reject_ipv6.c | 1 + net/netfilter/core.c | 16 ++++++++++++++++ net/netfilter/nf_conntrack_core.c | 12 ++++++++++++ net/netfilter/nf_conntrack_proto_tcp.c | 35 ++++++++++++++++++++++++++++++++++ 7 files changed, 76 insertions(+) (limited to 'net/ipv6') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index d8817d381c14..6863e271a9de 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -437,11 +437,13 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) #include void nf_ct_attach(struct sk_buff *, const struct sk_buff *); +void nf_ct_set_closing(struct nf_conntrack *nfct); struct nf_conntrack_tuple; bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb); #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} +static inline void nf_ct_set_closing(struct nf_conntrack *nfct) {} struct nf_conntrack_tuple; static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) @@ -459,6 +461,7 @@ struct nf_ct_hook { bool (*get_tuple_skb)(struct nf_conntrack_tuple *, const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); + void (*set_closing)(struct nf_conntrack *nfct); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 6a2019aaa464..3dbf947285be 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -125,6 +125,12 @@ struct nf_conn { union nf_conntrack_proto proto; }; +static inline struct nf_conn * +nf_ct_to_nf_conn(const struct nf_conntrack *nfct) +{ + return container_of(nfct, struct nf_conn, ct_general); +} + static inline struct nf_conn * nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash) { @@ -175,6 +181,8 @@ nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) void nf_ct_destroy(struct nf_conntrack *nfct); +void nf_conntrack_tcp_set_closing(struct nf_conn *ct); + /* decrement reference count on a conntrack */ static inline void nf_ct_put(struct nf_conn *ct) { diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index d640adcaf1b1..f33aeab9424f 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -280,6 +280,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, goto free_nskb; nf_ct_attach(nskb, oldskb); + nf_ct_set_closing(skb_nfct(oldskb)); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip_local_out for bridged traffic, the MAC source on diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index f61d4f18e1cf..58ccdb08c0fd 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -345,6 +345,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen); nf_ct_attach(nskb, oldskb); + nf_ct_set_closing(skb_nfct(oldskb)); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip6_local_out for bridged traffic, the MAC source on diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 5a6705a0e4ec..b2fdbbed2b4b 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -702,6 +702,22 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) } EXPORT_SYMBOL(nf_conntrack_destroy); +void nf_ct_set_closing(struct nf_conntrack *nfct) +{ + const struct nf_ct_hook *ct_hook; + + if (!nfct) + return; + + rcu_read_lock(); + ct_hook = rcu_dereference(nf_ct_hook); + if (ct_hook) + ct_hook->set_closing(nfct); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_ct_set_closing); + bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c00858344f02..430bb52b6454 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2747,11 +2747,23 @@ err_cachep: return ret; } +static void nf_conntrack_set_closing(struct nf_conntrack *nfct) +{ + struct nf_conn *ct = nf_ct_to_nf_conn(nfct); + + switch (nf_ct_protonum(ct)) { + case IPPROTO_TCP: + nf_conntrack_tcp_set_closing(ct); + break; + } +} + static const struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, .destroy = nf_ct_destroy, .get_tuple_skb = nf_conntrack_get_tuple_skb, .attach = nf_conntrack_attach, + .set_closing = nf_conntrack_set_closing, }; void nf_conntrack_init_end(void) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 16ee5ebe1ce1..4018acb1d674 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -911,6 +911,41 @@ static bool tcp_can_early_drop(const struct nf_conn *ct) return false; } +void nf_conntrack_tcp_set_closing(struct nf_conn *ct) +{ + enum tcp_conntrack old_state; + const unsigned int *timeouts; + u32 timeout; + + if (!nf_ct_is_confirmed(ct)) + return; + + spin_lock_bh(&ct->lock); + old_state = ct->proto.tcp.state; + ct->proto.tcp.state = TCP_CONNTRACK_CLOSE; + + if (old_state == TCP_CONNTRACK_CLOSE || + test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + spin_unlock_bh(&ct->lock); + return; + } + + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) { + const struct nf_tcp_net *tn; + + tn = nf_tcp_pernet(nf_ct_net(ct)); + timeouts = tn->timeouts; + } + + timeout = timeouts[TCP_CONNTRACK_CLOSE]; + WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp); + + spin_unlock_bh(&ct->lock); + + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); +} + static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state) { state->td_end = 0; -- cgit v1.2.3 From 7c9c8913f4523cda30a710736844d74bfee060a4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Feb 2023 16:28:35 +0000 Subject: ipv6: icmp6: add drop reason support to ndisc_recv_ns() Change ndisc_recv_ns() to return a drop reason. For the moment, return PKT_TOO_SMALL, NOT_SPECIFIED or SKB_CONSUMED. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 9548b5a44714..039722f83c66 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -783,7 +783,7 @@ void ndisc_update(const struct net_device *dev, struct neighbour *neigh, ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts); } -static void ndisc_recv_ns(struct sk_buff *skb) +static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; @@ -797,18 +797,17 @@ static void ndisc_recv_ns(struct sk_buff *skb) struct inet6_dev *idev = NULL; struct neighbour *neigh; int dad = ipv6_addr_any(saddr); - bool inc; int is_router = -1; + SKB_DR(reason); u64 nonce = 0; + bool inc; - if (skb->len < sizeof(struct nd_msg)) { - ND_PRINTK(2, warn, "NS: packet too short\n"); - return; - } + if (skb->len < sizeof(struct nd_msg)) + return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NS: multicast target address\n"); - return; + return reason; } /* @@ -817,12 +816,12 @@ static void ndisc_recv_ns(struct sk_buff *skb) */ if (dad && !ipv6_addr_is_solict_mult(daddr)) { ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); - return; + return reason; } if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND options\n"); - return; + return reason; } if (ndopts.nd_opts_src_lladdr) { @@ -830,7 +829,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) if (!lladdr) { ND_PRINTK(2, warn, "NS: invalid link-layer address length\n"); - return; + return reason; } /* RFC2461 7.1.1: @@ -841,7 +840,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) if (dad) { ND_PRINTK(2, warn, "NS: bad DAD packet (link-layer address option)\n"); - return; + return reason; } } if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1) @@ -869,7 +868,7 @@ have_ifp: * so fail our DAD process */ addrconf_dad_failure(skb, ifp); - return; + return reason; } else { /* * This is not a dad solicitation. @@ -901,7 +900,7 @@ have_ifp: idev = in6_dev_get(dev); if (!idev) { /* XXX: count this drop? */ - return; + return reason; } if (ipv6_chk_acast_addr(net, dev, &msg->target) || @@ -958,6 +957,7 @@ have_ifp: true, (ifp != NULL && inc), inc); if (neigh) neigh_release(neigh); + reason = SKB_CONSUMED; } out: @@ -965,6 +965,7 @@ out: in6_ifa_put(ifp); else in6_dev_put(idev); + return reason; } static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr) @@ -1781,8 +1782,9 @@ release: static void pndisc_redo(struct sk_buff *skb) { - ndisc_recv_ns(skb); - kfree_skb(skb); + enum skb_drop_reason reason = ndisc_recv_ns(skb); + + kfree_skb_reason(skb, reason); } static int ndisc_is_multicast(const void *pkey) @@ -1834,7 +1836,7 @@ enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); - ndisc_recv_ns(skb); + reason = ndisc_recv_ns(skb); break; case NDISC_NEIGHBOUR_ADVERTISEMENT: -- cgit v1.2.3 From 3009f9ae21ec539985977dcaa0cebb628afeb181 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Feb 2023 16:28:36 +0000 Subject: ipv6: icmp6: add drop reason support to ndisc_recv_na() Change ndisc_recv_na() to return a drop reason. For the moment, return PKT_TOO_SMALL, NOT_SPECIFIED or SKB_CONSUMED. More reasons are added later. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 039722f83c66..9354cb3669c8 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -987,7 +987,7 @@ static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr) } } -static void ndisc_recv_na(struct sk_buff *skb) +static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; @@ -1000,22 +1000,21 @@ static void ndisc_recv_na(struct sk_buff *skb) struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; + SKB_DR(reason); u8 new_state; - if (skb->len < sizeof(struct nd_msg)) { - ND_PRINTK(2, warn, "NA: packet too short\n"); - return; - } + if (skb->len < sizeof(struct nd_msg)) + return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NA: target address is multicast\n"); - return; + return reason; } if (ipv6_addr_is_multicast(daddr) && msg->icmph.icmp6_solicited) { ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n"); - return; + return reason; } /* For some 802.11 wireless deployments (and possibly other networks), @@ -1025,18 +1024,18 @@ static void ndisc_recv_na(struct sk_buff *skb) */ if (!msg->icmph.icmp6_solicited && idev && idev->cnf.drop_unsolicited_na) - return; + return reason; if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND option\n"); - return; + return reason; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); if (!lladdr) { ND_PRINTK(2, warn, "NA: invalid link-layer address length\n"); - return; + return reason; } } ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); @@ -1044,7 +1043,7 @@ static void ndisc_recv_na(struct sk_buff *skb) if (skb->pkt_type != PACKET_LOOPBACK && (ifp->flags & IFA_F_TENTATIVE)) { addrconf_dad_failure(skb, ifp); - return; + return reason; } /* What should we make now? The advertisement is invalid, but ndisc specs say nothing @@ -1060,7 +1059,7 @@ static void ndisc_recv_na(struct sk_buff *skb) "NA: %pM advertised our address %pI6c on %s!\n", eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name); in6_ifa_put(ifp); - return; + return reason; } neigh = neigh_lookup(&nd_tbl, &msg->target, dev); @@ -1121,10 +1120,11 @@ static void ndisc_recv_na(struct sk_buff *skb) */ rt6_clean_tohost(dev_net(dev), saddr); } - + reason = SKB_CONSUMED; out: neigh_release(neigh); } + return reason; } static void ndisc_recv_rs(struct sk_buff *skb) @@ -1840,7 +1840,7 @@ enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) break; case NDISC_NEIGHBOUR_ADVERTISEMENT: - ndisc_recv_na(skb); + reason = ndisc_recv_na(skb); break; case NDISC_ROUTER_SOLICITATION: -- cgit v1.2.3 From 243e37c642ac1d52858bc5f3559e380babf21169 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Feb 2023 16:28:37 +0000 Subject: ipv6: icmp6: add drop reason support to ndisc_recv_rs() Change ndisc_recv_rs() to return a drop reason. For the moment, return PKT_TOO_SMALL, NOT_SPECIFIED or SKB_CONSUMED. More reasons are added later. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 9354cb3669c8..514eb8cc7879 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1127,7 +1127,7 @@ out: return reason; } -static void ndisc_recv_rs(struct sk_buff *skb) +static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb) { struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb); unsigned long ndoptlen = skb->len - sizeof(*rs_msg); @@ -1136,14 +1136,15 @@ static void ndisc_recv_rs(struct sk_buff *skb) const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; struct ndisc_options ndopts; u8 *lladdr = NULL; + SKB_DR(reason); if (skb->len < sizeof(*rs_msg)) - return; + return SKB_DROP_REASON_PKT_TOO_SMALL; idev = __in6_dev_get(skb->dev); if (!idev) { ND_PRINTK(1, err, "RS: can't find in6 device\n"); - return; + return reason; } /* Don't accept RS if we're not in router mode */ @@ -1178,9 +1179,10 @@ static void ndisc_recv_rs(struct sk_buff *skb) NEIGH_UPDATE_F_OVERRIDE_ISROUTER, NDISC_ROUTER_SOLICITATION, &ndopts); neigh_release(neigh); + reason = SKB_CONSUMED; } out: - return; + return reason; } static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) @@ -1844,7 +1846,7 @@ enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) break; case NDISC_ROUTER_SOLICITATION: - ndisc_recv_rs(skb); + reason = ndisc_recv_rs(skb); break; case NDISC_ROUTER_ADVERTISEMENT: -- cgit v1.2.3 From 2f326d9d9ff46fb2e45fb3b6ae77eff04332dde6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Feb 2023 16:28:38 +0000 Subject: ipv6: icmp6: add drop reason support to ndisc_router_discovery() Change ndisc_router_discovery() to return a drop reason. For the moment, return PKT_TOO_SMALL, NOT_SPECIFIED and SKB_CONSUMED. More reasons are added later. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 514eb8cc7879..7c8ba308ea49 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1231,20 +1231,21 @@ errout: rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err); } -static void ndisc_router_discovery(struct sk_buff *skb) +static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) { struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); + bool send_ifinfo_notify = false; struct neighbour *neigh = NULL; - struct inet6_dev *in6_dev; + struct ndisc_options ndopts; struct fib6_info *rt = NULL; + struct inet6_dev *in6_dev; u32 defrtr_usr_metric; + unsigned int pref = 0; + __u32 old_if_flags; struct net *net; + SKB_DR(reason); int lifetime; - struct ndisc_options ndopts; int optlen; - unsigned int pref = 0; - __u32 old_if_flags; - bool send_ifinfo_notify = false; __u8 *opt = (__u8 *)(ra_msg + 1); @@ -1256,17 +1257,15 @@ static void ndisc_router_discovery(struct sk_buff *skb) __func__, skb->dev->name); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "RA: source address is not link-local\n"); - return; - } - if (optlen < 0) { - ND_PRINTK(2, warn, "RA: packet too short\n"); - return; + return reason; } + if (optlen < 0) + return SKB_DROP_REASON_PKT_TOO_SMALL; #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) { ND_PRINTK(2, warn, "RA: from host or unauthorized router\n"); - return; + return reason; } #endif @@ -1278,12 +1277,12 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (!in6_dev) { ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", skb->dev->name); - return; + return reason; } if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) { ND_PRINTK(2, warn, "RA: invalid ND options\n"); - return; + return reason; } if (!ipv6_accept_ra(in6_dev)) { @@ -1365,7 +1364,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) "RA: %s got default router without neighbour\n", __func__); fib6_info_release(rt); - return; + return reason; } } /* Set default route metric as specified by user */ @@ -1390,7 +1389,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) ND_PRINTK(0, err, "RA: %s failed to add default route\n", __func__); - return; + return reason; } neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, @@ -1401,7 +1400,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) "RA: %s got default router without neighbour\n", __func__); fib6_info_release(rt); - return; + return reason; } neigh->flags |= NTF_ROUTER; } else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) { @@ -1488,6 +1487,7 @@ skip_linkparms: NEIGH_UPDATE_F_OVERRIDE_ISROUTER| NEIGH_UPDATE_F_ISROUTER, NDISC_ROUTER_ADVERTISEMENT, &ndopts); + reason = SKB_CONSUMED; } if (!ipv6_accept_ra(in6_dev)) { @@ -1598,6 +1598,7 @@ out: fib6_info_release(rt); if (neigh) neigh_release(neigh); + return reason; } static void ndisc_redirect_rcv(struct sk_buff *skb) @@ -1850,7 +1851,7 @@ enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) break; case NDISC_ROUTER_ADVERTISEMENT: - ndisc_router_discovery(skb); + reason = ndisc_router_discovery(skb); break; case NDISC_REDIRECT: -- cgit v1.2.3 From ec993edf05ca4eee5878edf51fdb5ffa2b1decc3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Feb 2023 16:28:39 +0000 Subject: ipv6: icmp6: add drop reason support to ndisc_redirect_rcv() Change ndisc_redirect_rcv() to return a drop reason. For the moment, return PKT_TOO_SMALL, NOT_SPECIFIED and values from icmpv6_notify(). More reasons are added later. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7c8ba308ea49..e9776aa6f167 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1601,13 +1601,14 @@ out: return reason; } -static void ndisc_redirect_rcv(struct sk_buff *skb) +static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb) { - u8 *hdr; - struct ndisc_options ndopts; struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb); u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct rd_msg, opt)); + struct ndisc_options ndopts; + SKB_DR(reason); + u8 *hdr; #ifdef CONFIG_IPV6_NDISC_NODETYPE switch (skb->ndisc_nodetype) { @@ -1615,31 +1616,31 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) case NDISC_NODETYPE_NODEFAULT: ND_PRINTK(2, warn, "Redirect: from host or unauthorized router\n"); - return; + return reason; } #endif if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "Redirect: source address is not link-local\n"); - return; + return reason; } if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts)) - return; + return reason; if (!ndopts.nd_opts_rh) { ip6_redirect_no_header(skb, dev_net(skb->dev), skb->dev->ifindex); - return; + return reason; } hdr = (u8 *)ndopts.nd_opts_rh; hdr += 8; if (!pskb_pull(skb, hdr - skb_transport_header(skb))) - return; + return SKB_DROP_REASON_PKT_TOO_SMALL; - icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); + return icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); } static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, @@ -1855,7 +1856,7 @@ enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) break; case NDISC_REDIRECT: - ndisc_redirect_rcv(skb); + reason = ndisc_redirect_rcv(skb); break; } -- cgit v1.2.3 From 784d4477f07b930df73bc77e842e03f1dacb83aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Feb 2023 16:28:40 +0000 Subject: ipv6: icmp6: add SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS This is a generic drop reason for any error detected in ndisc_parse_options(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dropreason.h | 3 +++ net/ipv6/ndisc.c | 27 ++++++++++----------------- 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index ef3f65d135d3..239a5c0ea83e 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -76,6 +76,7 @@ FN(IPV6_NDISC_FRAG) \ FN(IPV6_NDISC_HOP_LIMIT) \ FN(IPV6_NDISC_BAD_CODE) \ + FN(IPV6_NDISC_BAD_OPTIONS) \ FNe(MAX) /** @@ -330,6 +331,8 @@ enum skb_drop_reason { SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT, /** @SKB_DROP_REASON_IPV6_NDISC_BAD_CODE: invalid NDISC icmp6 code. */ SKB_DROP_REASON_IPV6_NDISC_BAD_CODE, + /** @SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS: invalid NDISC options. */ + SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS, /** * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be * used as a real 'reason' diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index e9776aa6f167..b47e845d66eb 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -819,10 +819,8 @@ static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) return reason; } - if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { - ND_PRINTK(2, warn, "NS: invalid ND options\n"); - return reason; - } + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); @@ -1026,10 +1024,9 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) idev->cnf.drop_unsolicited_na) return reason; - if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { - ND_PRINTK(2, warn, "NS: invalid ND option\n"); - return reason; - } + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; + if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); if (!lladdr) { @@ -1159,10 +1156,8 @@ static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb) goto out; /* Parse ND options */ - if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) { - ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n"); - goto out; - } + if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, @@ -1280,10 +1275,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) return reason; } - if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) { - ND_PRINTK(2, warn, "RA: invalid ND options\n"); - return reason; - } + if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ipv6_accept_ra(in6_dev)) { ND_PRINTK(2, info, @@ -1627,7 +1620,7 @@ static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb) } if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts)) - return reason; + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ndopts.nd_opts_rh) { ip6_redirect_no_header(skb, dev_net(skb->dev), -- cgit v1.2.3 From c34b8bb11ebc135e970653bd6fc8e3f863fb6a81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Feb 2023 16:28:41 +0000 Subject: ipv6: icmp6: add SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST Hosts can often receive neighbour discovery messages that are not for them. Use a dedicated drop reason to make clear the packet is dropped for this normal case. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dropreason.h | 5 +++++ net/ipv6/ndisc.c | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 239a5c0ea83e..c0a3ea806cd5 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -77,6 +77,7 @@ FN(IPV6_NDISC_HOP_LIMIT) \ FN(IPV6_NDISC_BAD_CODE) \ FN(IPV6_NDISC_BAD_OPTIONS) \ + FN(IPV6_NDISC_NS_OTHERHOST) \ FNe(MAX) /** @@ -333,6 +334,10 @@ enum skb_drop_reason { SKB_DROP_REASON_IPV6_NDISC_BAD_CODE, /** @SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS: invalid NDISC options. */ SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS, + /** @SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST: NEIGHBOUR SOLICITATION + * for another host. + */ + SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST, /** * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be * used as a real 'reason' diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b47e845d66eb..c4be62c99f73 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -921,8 +921,10 @@ have_ifp: pneigh_enqueue(&nd_tbl, idev->nd_parms, n); goto out; } - } else + } else { + SKB_DR_SET(reason, IPV6_NDISC_NS_OTHERHOST); goto out; + } } if (is_router < 0) -- cgit v1.2.3 From ac03694bc009703022cf45a9c90675d5505c584c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Feb 2023 16:28:42 +0000 Subject: ipv6: icmp6: add drop reason support to icmpv6_echo_reply() Change icmpv6_echo_reply() to return a drop reason. For the moment, return NOT_SPECIFIED or SKB_CONSUMED. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index f32bc98155bf..1f53f2a74480 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -705,7 +705,7 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, } EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); -static void icmpv6_echo_reply(struct sk_buff *skb) +static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sock *sk; @@ -719,18 +719,19 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct dst_entry *dst; struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); + SKB_DR(reason); bool acast; u8 type; if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) && net->ipv6.sysctl.icmpv6_echo_ignore_multicast) - return; + return reason; saddr = &ipv6_hdr(skb)->daddr; acast = ipv6_anycast_destination(skb_dst(skb), saddr); if (acast && net->ipv6.sysctl.icmpv6_echo_ignore_anycast) - return; + return reason; if (!ipv6_unicast_destination(skb) && !(net->ipv6.sysctl.anycast_src_echo_reply && acast)) @@ -804,6 +805,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) } else { icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); + reason = SKB_CONSUMED; } out_dst_release: dst_release(dst); @@ -811,6 +813,7 @@ out: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); + return reason; } enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, @@ -929,12 +932,12 @@ static int icmpv6_rcv(struct sk_buff *skb) switch (type) { case ICMPV6_ECHO_REQUEST: if (!net->ipv6.sysctl.icmpv6_echo_ignore_all) - icmpv6_echo_reply(skb); + reason = icmpv6_echo_reply(skb); break; case ICMPV6_EXT_ECHO_REQUEST: if (!net->ipv6.sysctl.icmpv6_echo_ignore_all && READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe)) - icmpv6_echo_reply(skb); + reason = icmpv6_echo_reply(skb); break; case ICMPV6_ECHO_REPLY: -- cgit v1.2.3