From cc5453a5b7e90c39f713091a7ebc53c1f87d1700 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Aug 2020 16:15:58 +0200 Subject: netfilter: conntrack: allow sctp hearbeat after connection re-use If an sctp connection gets re-used, heartbeats are flagged as invalid because their vtag doesn't match. Handle this in a similar way as TCP conntrack when it suspects that the endpoints and conntrack are out-of-sync. When a HEARTBEAT request fails its vtag validation, flag this in the conntrack state and accept the packet. When a HEARTBEAT_ACK is received with an invalid vtag in the reverse direction after we allowed such a HEARTBEAT through, assume we are out-of-sync and re-set the vtag info. v2: remove left-over snippet from an older incarnation that moved new_state/old_state assignments, thats not needed so keep that as-is. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_sctp.c | 39 +++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 4f897b14b606..810cca24b399 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -62,6 +62,8 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, }; +#define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 + #define sNO SCTP_CONNTRACK_NONE #define sCL SCTP_CONNTRACK_CLOSED #define sCW SCTP_CONNTRACK_COOKIE_WAIT @@ -369,6 +371,7 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, u_int32_t offset, count; unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; + bool ignore = false; if (sctp_error(skb, dataoff, state)) return -NF_ACCEPT; @@ -427,15 +430,39 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, /* Sec 8.5.1 (D) */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; - } else if (sch->type == SCTP_CID_HEARTBEAT || - sch->type == SCTP_CID_HEARTBEAT_ACK) { + } else if (sch->type == SCTP_CID_HEARTBEAT) { + if (ct->proto.sctp.vtag[dir] == 0) { + pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); + ct->proto.sctp.vtag[dir] = sh->vtag; + } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { + if (test_bit(SCTP_CID_DATA, map) || ignore) + goto out_unlock; + + ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED; + ct->proto.sctp.last_dir = dir; + ignore = true; + continue; + } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { + ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; + } + } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { - pr_debug("Verification tag check failed\n"); - goto out_unlock; + if (test_bit(SCTP_CID_DATA, map) || ignore) + goto out_unlock; + + if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 || + ct->proto.sctp.last_dir == dir) + goto out_unlock; + + ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; + ct->proto.sctp.vtag[dir] = sh->vtag; + ct->proto.sctp.vtag[!dir] = 0; + } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { + ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } @@ -470,6 +497,10 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, } spin_unlock_bh(&ct->lock); + /* allow but do not refresh timeout */ + if (ignore) + return NF_ACCEPT; + timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; -- cgit v1.2.3 From 226a88de473e475cb9f993682a1c7d0c2b451ad8 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 19 Aug 2020 23:59:14 +0200 Subject: netfilter: nft_set_rbtree: Handle outcomes of tree rotations in overlap detection Checks for partial overlaps on insertion assume that end elements are always descendant nodes of their corresponding start, because they are inserted later. However, this is not the case if a previous delete operation caused a tree rotation as part of rebalancing. Taking the issue reported by Andreas Fischer as an example, if we omit delete operations, the existing procedure works because, equivalently, we are inserting a start item with value 40 in the this region of the red-black tree with single-sized intervals: overlap flag 10 (start) / \ false 20 (start) / \ false 30 (start) / \ false 60 (start) / \ false 50 (end) / \ false 20 (end) / \ false 40 (start) if we now delete interval 30 - 30, the tree can be rearranged in a way similar to this (note the rotation involving 50 - 50): overlap flag 10 (start) / \ false 20 (start) / \ false 25 (start) / \ false 70 (start) / \ false 50 (end) / \ true (from rule a1.) 50 (start) / \ true 40 (start) and we traverse interval 50 - 50 from the opposite direction compared to what was expected. To deal with those cases, add a start-before-start rule, b4., that covers traversal of existing intervals from the right. We now need to restrict start-after-end rule b3. to cases where there are no occurring nodes between existing start and end elements, because addition of rule b4. isn't sufficient to ensure that the pre-existing end element we encounter while descending the tree corresponds to a start element of an interval that we already traversed entirely. Different types of overlap detection on trees with rotations resulting from re-balancing will be covered by nft test case sets/0044interval_overlap_1. Reported-by: Andreas Fischer Bugzilla: https://bugzilla.netfilter.org/show_bug.cgi?id=1449 Cc: # 5.6.x Fixes: 7c84d41416d8 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion") Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 4b2834fd17b2..27668f4e44ea 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -238,21 +238,27 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, * * b1. _ _ __>| !_ _ __| (insert end before existing start) * b2. _ _ ___| !_ _ _>| (insert end after existing start) - * b3. _ _ ___! >|_ _ __| (insert start after existing end) + * b3. _ _ ___! >|_ _ __| (insert start after existing end, as a leaf) + * '--' no nodes falling in this range + * b4. >|_ _ ! (insert start before existing start) * * Case a3. resolves to b3.: * - if the inserted start element is the leftmost, because the '0' * element in the tree serves as end element - * - otherwise, if an existing end is found. Note that end elements are - * always inserted after corresponding start elements. + * - otherwise, if an existing end is found immediately to the left. If + * there are existing nodes in between, we need to further descend the + * tree before we can conclude the new start isn't causing an overlap + * + * or to b4., which, preceded by a3., means we already traversed one or + * more existing intervals entirely, from the right. * * For a new, rightmost pair of elements, we'll hit cases b3. and b2., * in that order. * * The flag is also cleared in two special cases: * - * b4. |__ _ _!|<_ _ _ (insert start right before existing end) - * b5. |__ _ >|!__ _ _ (insert end right after existing start) + * b5. |__ _ _!|<_ _ _ (insert start right before existing end) + * b6. |__ _ >|!__ _ _ (insert end right after existing start) * * which always happen as last step and imply that no further * overlapping is possible. @@ -272,7 +278,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, if (nft_rbtree_interval_start(new)) { if (nft_rbtree_interval_end(rbe) && nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) + !nft_set_elem_expired(&rbe->ext) && !*p) overlap = false; } else { overlap = nft_rbtree_interval_end(rbe) && @@ -288,10 +294,9 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, nft_set_elem_active(&rbe->ext, genmask) && !nft_set_elem_expired(&rbe->ext); - } else if (nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, genmask) && + } else if (nft_set_elem_active(&rbe->ext, genmask) && !nft_set_elem_expired(&rbe->ext)) { - overlap = true; + overlap = nft_rbtree_interval_end(rbe); } } else { if (nft_rbtree_interval_end(rbe) && -- cgit v1.2.3 From 0726763043dc10dd4c12481f050b1a5ef8f15410 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 19 Aug 2020 23:59:15 +0200 Subject: netfilter: nft_set_rbtree: Detect partial overlap with start endpoint match Getting creative with nft and omitting the interval_overlap() check from the set_overlap() function, without omitting set_overlap() altogether, led to the observation of a partial overlap that wasn't detected, and would actually result in replacement of the end element of an existing interval. This is due to the fact that we'll return -EEXIST on a matching, pre-existing start element, instead of -ENOTEMPTY, and the error is cleared by API if NLM_F_EXCL is not given. At this point, we can insert a matching start, and duplicate the end element as long as we don't end up into other intervals. For instance, inserting interval 0 - 2 with an existing 0 - 3 interval would result in a single 0 - 2 interval, and a dangling '3' end element. This is because nft will proceed after inserting the '0' start element as no error is reported, and no further conflicting intervals are detected on insertion of the end element. This needs a different approach as it's a local condition that can be detected by looking for duplicate ends coming from left and right, separately. Track those and directly report -ENOTEMPTY on duplicated end elements for a matching start. Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 27668f4e44ea..217ab3644c25 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -218,11 +218,11 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_set_ext **ext) { + bool overlap = false, dup_end_left = false, dup_end_right = false; struct nft_rbtree *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); struct nft_rbtree_elem *rbe; struct rb_node *parent, **p; - bool overlap = false; int d; /* Detect overlaps as we descend the tree. Set the flag in these cases: @@ -262,6 +262,20 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, * * which always happen as last step and imply that no further * overlapping is possible. + * + * Another special case comes from the fact that start elements matching + * an already existing start element are allowed: insertion is not + * performed but we return -EEXIST in that case, and the error will be + * cleared by the caller if NLM_F_EXCL is not present in the request. + * This way, request for insertion of an exact overlap isn't reported as + * error to userspace if not desired. + * + * However, if the existing start matches a pre-existing start, but the + * end element doesn't match the corresponding pre-existing end element, + * we need to report a partial overlap. This is a local condition that + * can be noticed without need for a tracking flag, by checking for a + * local duplicated end for a corresponding start, from left and right, + * separately. */ parent = NULL; @@ -281,19 +295,35 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, !nft_set_elem_expired(&rbe->ext) && !*p) overlap = false; } else { + if (dup_end_left && !*p) + return -ENOTEMPTY; + overlap = nft_rbtree_interval_end(rbe) && nft_set_elem_active(&rbe->ext, genmask) && !nft_set_elem_expired(&rbe->ext); + + if (overlap) { + dup_end_right = true; + continue; + } } } else if (d > 0) { p = &parent->rb_right; if (nft_rbtree_interval_end(new)) { + if (dup_end_right && !*p) + return -ENOTEMPTY; + overlap = nft_rbtree_interval_end(rbe) && nft_set_elem_active(&rbe->ext, genmask) && !nft_set_elem_expired(&rbe->ext); + + if (overlap) { + dup_end_left = true; + continue; + } } else if (nft_set_elem_active(&rbe->ext, genmask) && !nft_set_elem_expired(&rbe->ext)) { overlap = nft_rbtree_interval_end(rbe); @@ -321,6 +351,8 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, p = &parent->rb_left; } } + + dup_end_left = dup_end_right = false; } if (overlap) -- cgit v1.2.3 From 6f03bf43ee05b31d3822def2a80f11b3591c55b3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 20 Aug 2020 14:12:54 +0200 Subject: netfilter: nf_tables: add NFTA_SET_USERDATA if not null Kernel sends an empty NFTA_SET_USERDATA attribute with no value if userspace adds a set with no NFTA_SET_USERDATA attribute. Fixes: e6d8ecac9e68 ("netfilter: nf_tables: Add new attributes into nft_set to store user data.") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fd814e514f94..71e501c5ad21 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3770,7 +3770,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } - if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) + if (set->udata && + nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_SET_DESC); -- cgit v1.2.3 From 1e105e6afa6c3d32bfb52c00ffa393894a525c27 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Aug 2020 21:05:50 +0200 Subject: netfilter: nf_tables: fix destination register zeroing Following bug was reported via irc: nft list ruleset set knock_candidates_ipv4 { type ipv4_addr . inet_service size 65535 elements = { 127.0.0.1 . 123, 127.0.0.1 . 123 } } .. udp dport 123 add @knock_candidates_ipv4 { ip saddr . 123 } udp dport 123 add @knock_candidates_ipv4 { ip saddr . udp dport } It should not have been possible to add a duplicate set entry. After some debugging it turned out that the problem is the immediate value (123) in the second-to-last rule. Concatenations use 32bit registers, i.e. the elements are 8 bytes each, not 6 and it turns out the kernel inserted inet firewall @knock_candidates_ipv4 element 0100007f ffff7b00 : 0 [end] element 0100007f 00007b00 : 0 [end] Note the non-zero upper bits of the first element. It turns out that nft_immediate doesn't zero the destination register, but this is needed when the length isn't a multiple of 4. Furthermore, the zeroing in nft_payload is broken. We can't use [len / 4] = 0 -- if len is a multiple of 4, index is off by one. Skip zeroing in this case and use a conditional instead of (len -1) / 4. Fixes: 49499c3e6e18 ("netfilter: nf_tables: switch registers to 32 bit addressing") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_payload.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net/netfilter') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index bf9491b77d16..224d194ad29d 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -143,6 +143,8 @@ static inline u64 nft_reg_load64(const u32 *sreg) static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { + if (len % NFT_REG32_SIZE) + dst[len / NFT_REG32_SIZE] = 0; memcpy(dst, src, len); } diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index ed7cb9f747f6..7a2e59638499 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -87,7 +87,9 @@ void nft_payload_eval(const struct nft_expr *expr, u32 *dest = ®s->data[priv->dreg]; int offset; - dest[priv->len / NFT_REG32_SIZE] = 0; + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; + switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb)) -- cgit v1.2.3 From 4b7ddc58e61ada82be1f47f82d5139f0ab2ba478 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 22 Aug 2020 18:07:27 -0700 Subject: netfilter: delete repeated words Drop duplicated words in net/netfilter/ and net/ipv4/netfilter/. Signed-off-by: Randy Dunlap Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_pptp.c | 2 +- net/netfilter/nf_conntrack_pptp.c | 2 +- net/netfilter/nf_conntrack_proto_tcp.c | 2 +- net/netfilter/xt_recent.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/netfilter') diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 7afde8828b4c..3f248a19faa3 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -3,7 +3,7 @@ * nf_nat_pptp.c * * NAT support for PPTP (Point to Point Tunneling Protocol). - * PPTP is a a protocol for creating virtual private networks. + * PPTP is a protocol for creating virtual private networks. * It is a specification defined by Microsoft and some vendors * working with Microsoft. PPTP is built on top of a modified * version of the Internet Generic Routing Encapsulation Protocol. diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 1f44d523b512..5105d4250012 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Connection tracking support for PPTP (Point to Point Tunneling Protocol). - * PPTP is a a protocol for creating virtual private networks. + * PPTP is a protocol for creating virtual private networks. * It is a specification defined by Microsoft and some vendors * working with Microsoft. PPTP is built on top of a modified * version of the Internet Generic Routing Encapsulation Protocol. diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 6892e497781c..e8c86ee4c1c4 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1152,7 +1152,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, && (old_state == TCP_CONNTRACK_SYN_RECV || old_state == TCP_CONNTRACK_ESTABLISHED) && new_state == TCP_CONNTRACK_ESTABLISHED) { - /* Set ASSURED if we see see valid ack in ESTABLISHED + /* Set ASSURED if we see valid ack in ESTABLISHED after SYN_RECV or a valid answer for a picked up connection. */ set_bit(IPS_ASSURED_BIT, &ct->status); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 19bef176145e..606411869698 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -640,7 +640,7 @@ static void __net_exit recent_proc_net_exit(struct net *net) struct recent_table *t; /* recent_net_exit() is called before recent_mt_destroy(). Make sure - * that the parent xt_recent proc entry is is empty before trying to + * that the parent xt_recent proc entry is empty before trying to * remove it. */ spin_lock_bh(&recent_lock); -- cgit v1.2.3 From ee921183557af39c1a0475f982d43b0fcac25e2e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 23 Aug 2020 13:55:36 +0200 Subject: netfilter: nfnetlink: nfnetlink_unicast() reports EAGAIN instead of ENOBUFS Frontend callback reports EAGAIN to nfnetlink to retry a command, this is used to signal that module autoloading is required. Unfortunately, nlmsg_unicast() reports EAGAIN in case the receiver socket buffer gets full, so it enters a busy-loop. This patch updates nfnetlink_unicast() to turn EAGAIN into ENOBUFS and to use nlmsg_unicast(). Remove the flags field in nfnetlink_unicast() since this is always MSG_DONTWAIT in the existing code which is exactly what nlmsg_unicast() passes to netlink_unicast() as parameter. Fixes: 96518518cc41 ("netfilter: add nftables") Reported-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 3 +- net/netfilter/nf_tables_api.c | 61 ++++++++++++++++++------------------- net/netfilter/nfnetlink.c | 11 +++++-- net/netfilter/nfnetlink_log.c | 3 +- net/netfilter/nfnetlink_queue.c | 2 +- 5 files changed, 40 insertions(+), 40 deletions(-) (limited to 'net/netfilter') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 851425c3178f..89016d08f6a2 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -43,8 +43,7 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, - int flags); +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid); static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 71e501c5ad21..b7dc1cbf40ea 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -815,11 +815,11 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, family, table); if (err < 0) - goto err; + goto err_fill_table_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_table_info: kfree_skb(skb2); return err; } @@ -1563,11 +1563,11 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, family, table, chain); if (err < 0) - goto err; + goto err_fill_chain_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_chain_info: kfree_skb(skb2); return err; } @@ -3008,11 +3008,11 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule, NULL); if (err < 0) - goto err; + goto err_fill_rule_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_rule_info: kfree_skb(skb2); return err; } @@ -3968,11 +3968,11 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0); if (err < 0) - goto err; + goto err_fill_set_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_set_info: kfree_skb(skb2); return err; } @@ -4860,24 +4860,18 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb == NULL) - goto err1; + return err; err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, NFT_MSG_NEWSETELEM, 0, set, &elem); if (err < 0) - goto err2; + goto err_fill_setelem; - err = nfnetlink_unicast(skb, ctx->net, ctx->portid, MSG_DONTWAIT); - /* This avoids a loop in nfnetlink. */ - if (err < 0) - goto err1; + return nfnetlink_unicast(skb, ctx->net, ctx->portid); - return 0; -err2: +err_fill_setelem: kfree_skb(skb); -err1: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return err; } /* called with rcu_read_lock held */ @@ -6182,10 +6176,11 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, family, table, obj, reset); if (err < 0) - goto err; + goto err_fill_obj_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + +err_fill_obj_info: kfree_skb(skb2); return err; } @@ -7045,10 +7040,11 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, NFT_MSG_NEWFLOWTABLE, 0, family, flowtable, &flowtable->hook_list); if (err < 0) - goto err; + goto err_fill_flowtable_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + +err_fill_flowtable_info: kfree_skb(skb2); return err; } @@ -7234,10 +7230,11 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk, err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq); if (err < 0) - goto err; + goto err_fill_gen_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + +err_fill_gen_info: kfree_skb(skb2); return err; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 5f24edf95830..3a2e64e13b22 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -149,10 +149,15 @@ int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) } EXPORT_SYMBOL_GPL(nfnetlink_set_err); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, - int flags) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid) { - return netlink_unicast(net->nfnl, skb, portid, flags); + int err; + + err = nlmsg_unicast(net->nfnl, skb, portid); + if (err == -EAGAIN) + err = -ENOBUFS; + + return err; } EXPORT_SYMBOL_GPL(nfnetlink_unicast); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index f02992419850..b35e8d9a5b37 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -356,8 +356,7 @@ __nfulnl_send(struct nfulnl_instance *inst) goto out; } } - nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid, - MSG_DONTWAIT); + nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid); out: inst->qlen = 0; inst->skb = NULL; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index dadfc06245a3..d1d8bca03b4f 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -681,7 +681,7 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, *packet_id_ptr = htonl(entry->id); /* nfnetlink_unicast will either free the nskb or add it to a socket */ - err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); + err = nfnetlink_unicast(nskb, net, queue->peer_portid); if (err < 0) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; -- cgit v1.2.3 From c46172147ebbeb70094db48d76ab7945d96c638b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Aug 2020 00:07:18 +0200 Subject: netfilter: conntrack: do not auto-delete clash entries on reply Its possible that we have more than one packet with the same ct tuple simultaneously, e.g. when an application emits n packets on same UDP socket from multiple threads. NAT rules might be applied to those packets. With the right set of rules, n packets will be mapped to m destinations, where at least two packets end up with the same destination. When this happens, the existing clash resolution may merge the skb that is processed after the first has been received with the identical tuple already in hash table. However, its possible that this identical tuple is a NAT_CLASH tuple. In that case the second skb will be sent, but no reply can be received since the reply that is processed first removes the NAT_CLASH tuple. Do not auto-delete, this gives a 1 second window for replies to be passed back to originator. Packets that are coming later (udp stream case) will not be affected: they match the original ct entry, not a NAT_CLASH one. Also prevent NAT_CLASH entries from getting offloaded. Fixes: 6a757c07e51f ("netfilter: conntrack: allow insertion of clashing entries") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_udp.c | 26 ++++++++++---------------- net/netfilter/nft_flow_offload.c | 2 +- 2 files changed, 11 insertions(+), 17 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 760ca2422816..af402f458ee0 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -81,18 +81,6 @@ static bool udp_error(struct sk_buff *skb, return false; } -static void nf_conntrack_udp_refresh_unreplied(struct nf_conn *ct, - struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - u32 extra_jiffies) -{ - if (unlikely(ctinfo == IP_CT_ESTABLISHED_REPLY && - ct->status & IPS_NAT_CLASH)) - nf_ct_kill(ct); - else - nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies); -} - /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, @@ -124,12 +112,15 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, nf_ct_refresh_acct(ct, ctinfo, skb, extra); + /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ + if (unlikely((ct->status & IPS_NAT_CLASH))) + return NF_ACCEPT; + /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, - timeouts[UDP_CT_UNREPLIED]); + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } @@ -206,12 +197,15 @@ int nf_conntrack_udplite_packet(struct nf_conn *ct, if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_REPLIED]); + + if (unlikely((ct->status & IPS_NAT_CLASH))) + return NF_ACCEPT; + /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, - timeouts[UDP_CT_UNREPLIED]); + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 3b9b97aa4b32..3a6c84fb2c90 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -102,7 +102,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, } if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || - ct->status & IPS_SEQ_ADJUST) + ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) goto out; if (!nf_ct_is_confirmed(ct)) -- cgit v1.2.3