From f66ee0410b1c3481ee75e5db9b34547b4d582465 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 11 Feb 2020 23:20:43 +0100 Subject: netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports In the case of huge hash:* types of sets, due to the single spinlock of a set the processing of the whole set under spinlock protection could take too long. There were four places where the whole hash table of the set was processed from bucket to bucket under holding the spinlock: - During resizing a set, the original set was locked to exclude kernel side add/del element operations (userspace add/del is excluded by the nfnetlink mutex). The original set is actually just read during the resize, so the spinlocking is replaced with rcu locking of regions. However, thus there can be parallel kernel side add/del of entries. In order not to loose those operations a backlog is added and replayed after the successful resize. - Garbage collection of timed out entries was also protected by the spinlock. In order not to lock too long, region locking is introduced and a single region is processed in one gc go. Also, the simple timer based gc running is replaced with a workqueue based solution. The internal book-keeping (number of elements, size of extensions) is moved to region level due to the region locking. - Adding elements: when the max number of the elements is reached, the gc was called to evict the timed out entries. The new approach is that the gc is called just for the matching region, assuming that if the region (proportionally) seems to be full, then the whole set does. We could scan the other regions to check every entry under rcu locking, but for huge sets it'd mean a slowdown at adding elements. - Listing the set header data: when the set was defined with timeout support, the garbage collector was called to clean up timed out entries to get the correct element numbers and set size values. Now the set is scanned to check non-timed out entries, without actually calling the gc for the whole set. Thanks to Florian Westphal for helping me to solve the SOFTIRQ-safe -> SOFTIRQ-unsafe lock order issues during working on the patch. Reported-by: syzbot+4b0e9d4ff3cf117837e5@syzkaller.appspotmail.com Reported-by: syzbot+c27b8d5010f45c666ed1@syzkaller.appspotmail.com Reported-by: syzbot+68a806795ac89df3aa1c@syzkaller.appspotmail.com Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7") Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 908d38dbcb91..5448c8b443db 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -121,6 +121,7 @@ struct ip_set_ext { u32 timeout; u8 packets_op; u8 bytes_op; + bool target; }; struct ip_set; @@ -187,6 +188,14 @@ struct ip_set_type_variant { /* Return true if "b" set is the same as "a" * according to the create set parameters */ bool (*same_set)(const struct ip_set *a, const struct ip_set *b); + /* Region-locking is used */ + bool region_lock; +}; + +struct ip_set_region { + spinlock_t lock; /* Region lock */ + size_t ext_size; /* Size of the dynamic extensions */ + u32 elements; /* Number of elements vs timeout */ }; /* The core set type structure */ @@ -501,7 +510,7 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, } #define IP_SET_INIT_KEXT(skb, opt, set) \ - { .bytes = (skb)->len, .packets = 1, \ + { .bytes = (skb)->len, .packets = 1, .target = true,\ .timeout = ip_set_adt_opt_timeout(opt, set) } #define IP_SET_INIT_UEXT(set) \ -- cgit v1.2.3 From a8e41f6033a0c5633d55d6e35993c9e2005d872f Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 25 Feb 2020 18:05:35 +0800 Subject: icmp: allow icmpv6_ndo_send to work with CONFIG_IPV6=n The icmpv6_send function has long had a static inline implementation with an empty body for CONFIG_IPV6=n, so that code calling it doesn't need to be ifdef'd. The new icmpv6_ndo_send function, which is intended for drivers as a drop-in replacement with an identical function signature, should follow the same pattern. Without this patch, drivers that used to work with CONFIG_IPV6=n now result in a linker error. Cc: Chen Zhou Reported-by: Hulk Robot Fixes: 0b41713b6066 ("icmp: introduce helper for nat'd source address in network device context") Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- include/linux/icmpv6.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index 93338fd54af8..33d379602314 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -22,19 +22,23 @@ extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn); int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, unsigned int data_len); +#if IS_ENABLED(CONFIG_NF_NAT) +void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); +#else +#define icmpv6_ndo_send icmpv6_send +#endif + #else static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { - } -#endif -#if IS_ENABLED(CONFIG_NF_NAT) -void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); -#else -#define icmpv6_ndo_send icmpv6_send +static inline void icmpv6_ndo_send(struct sk_buff *skb, + u8 type, u8 code, __u32 info) +{ +} #endif extern int icmpv6_init(void); -- cgit v1.2.3