From 95224166a9032ff5d08fca633d37113078ce7d01 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 13 Jan 2020 09:32:46 +0100 Subject: vti[6]: fix packet tx through bpf_redirect() With an ebpf program that redirects packets through a vti[6] interface, the packets are dropped because no dst is attached. This could also be reproduced with an AF_PACKET socket, with the following python script (vti1 is an ip_vti interface): import socket send_s = socket.socket(socket.AF_PACKET, socket.SOCK_RAW, 0) # scapy # p = IP(src='10.100.0.2', dst='10.200.0.1')/ICMP(type='echo-request') # raw(p) req = b'E\x00\x00\x1c\x00\x01\x00\x00@\x01e\xb2\nd\x00\x02\n\xc8\x00\x01\x08\x00\xf7\xff\x00\x00\x00\x00' send_s.sendto(req, ('vti1', 0x800, 0, 0)) Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 13 +++++++++++-- net/ipv6/ip6_vti.c | 13 +++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index e90b600c7a25..37cddd18f282 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -187,8 +187,17 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, int mtu; if (!dst) { - dev->stats.tx_carrier_errors++; - goto tx_error_icmp; + struct rtable *rt; + + fl->u.ip4.flowi4_oif = dev->ifindex; + fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + dst = &rt->dst; + skb_dst_set(skb, dst); } dst_hold(dst); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 6f08b760c2a7..524006aa0d78 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -449,8 +449,17 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) int err = -1; int mtu; - if (!dst) - goto tx_err_link_failure; + if (!dst) { + fl->u.ip6.flowi6_oif = dev->ifindex; + fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); + if (dst->error) { + dst_release(dst); + dst = NULL; + goto tx_err_link_failure; + } + skb_dst_set(skb, dst); + } dst_hold(dst); dst = xfrm_lookup(t->net, dst, fl, NULL, 0); -- cgit v1.2.3 From f042365dbffea98fb8148c98c700402e8d099f02 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 13 Jan 2020 09:32:47 +0100 Subject: xfrm interface: fix packet tx through bpf_redirect() With an ebpf program that redirects packets through a xfrm interface, packets are dropped because no dst is attached to skb. This could also be reproduced with an AF_PACKET socket, with the following python script (xfrm1 is a xfrm interface): import socket send_s = socket.socket(socket.AF_PACKET, socket.SOCK_RAW, 0) # scapy # p = IP(src='10.100.0.2', dst='10.200.0.1')/ICMP(type='echo-request') # raw(p) req = b'E\x00\x00\x1c\x00\x01\x00\x00@\x01e\xb2\nd\x00\x02\n\xc8\x00\x01\x08\x00\xf7\xff\x00\x00\x00\x00' send_s.sendto(req, ('xfrm1', 0x800, 0, 0)) It was also not possible to send an ip packet through an AF_PACKET socket because a LL header was expected. Let's remove those LL header constraints. Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 7ac1542feaf8..00393179f185 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -268,9 +268,6 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) int err = -1; int mtu; - if (!dst) - goto tx_err_link_failure; - dst_hold(dst); dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id); if (IS_ERR(dst)) { @@ -343,6 +340,7 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net_device_stats *stats = &xi->dev->stats; + struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; @@ -352,10 +350,33 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) case htons(ETH_P_IPV6): xfrm_decode_session(skb, &fl, AF_INET6); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + if (!dst) { + fl.u.ip6.flowi6_oif = dev->ifindex; + fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); + if (dst->error) { + dst_release(dst); + stats->tx_carrier_errors++; + goto tx_err; + } + skb_dst_set(skb, dst); + } break; case htons(ETH_P_IP): xfrm_decode_session(skb, &fl, AF_INET); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + if (!dst) { + struct rtable *rt; + + fl.u.ip4.flowi4_oif = dev->ifindex; + fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); + if (IS_ERR(rt)) { + stats->tx_carrier_errors++; + goto tx_err; + } + skb_dst_set(skb, &rt->dst); + } break; default: goto tx_err; @@ -563,12 +584,9 @@ static void xfrmi_dev_setup(struct net_device *dev) { dev->netdev_ops = &xfrmi_netdev_ops; dev->type = ARPHRD_NONE; - dev->hard_header_len = ETH_HLEN; - dev->min_header_len = ETH_HLEN; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; - dev->max_mtu = ETH_DATA_LEN; - dev->addr_len = ETH_ALEN; + dev->max_mtu = IP_MAX_MTU; dev->flags = IFF_NOARP; dev->needs_free_netdev = true; dev->priv_destructor = xfrmi_dev_free; -- cgit v1.2.3 From 8aaea2b0428b6aad7c7e22d3fddc31a78bb1d724 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Mon, 13 Jan 2020 09:00:36 +0000 Subject: xfrm: interface: do not confirm neighbor when do pmtu update When do IPv6 tunnel PMTU update and calls __ip6_rt_update_pmtu() in the end, we should not call dst_confirm_neigh() as there is no two-way communication. Signed-off-by: Xu Wang Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 00393179f185..dc651a628dcf 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -294,7 +294,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) mtu = dst_mtu(dst); if (!skb->ignore_df && skb->len > mtu) { - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) -- cgit v1.2.3 From 4e4362d2bf2a49ff44dbbc9585207977ca3d71d0 Mon Sep 17 00:00:00 2001 From: Ulrich Weber Date: Wed, 15 Jan 2020 12:11:29 +0100 Subject: xfrm: support output_mark for offload ESP packets Commit 9b42c1f179a6 ("xfrm: Extend the output_mark") added output_mark support but missed ESP offload support. xfrm_smark_get() is not called within xfrm_input() for packets coming from esp4_gro_receive() or esp6_gro_receive(). Therefore call xfrm_smark_get() directly within these functions. Fixes: 9b42c1f179a6 ("xfrm: Extend the output_mark to support input direction and masking.") Signed-off-by: Ulrich Weber Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 2 ++ net/ipv6/esp6_offload.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 0e4a7cf6bc87..e2e219c7854a 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -57,6 +57,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, if (!x) goto out_reset; + skb->mark = xfrm_smark_get(skb->mark, x); + sp->xvec[sp->len++] = x; sp->olen++; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index e31626ffccd1..fd535053245b 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -79,6 +79,8 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, if (!x) goto out_reset; + skb->mark = xfrm_smark_get(skb->mark, x); + sp->xvec[sp->len++] = x; sp->olen++; -- cgit v1.2.3 From 7eaecf7963c1c8f62d62c6a8e7c439b0e7f2d365 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 18 Jan 2020 11:27:25 +0100 Subject: netfilter: nft_osf: add missing check for DREG attribute syzbot reports just another NULL deref crash because of missing test for presence of the attribute. Reported-by: syzbot+cf23983d697c26c34f60@syzkaller.appspotmail.com Fixes: b96af92d6eaf9fadd ("netfilter: nf_tables: implement Passive OS fingerprint module in nft_osf") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_osf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index f54d6ae15bb1..b42247aa48a9 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -61,6 +61,9 @@ static int nft_osf_init(const struct nft_ctx *ctx, int err; u8 ttl; + if (!tb[NFTA_OSF_DREG]) + return -EINVAL; + if (tb[NFTA_OSF_TTL]) { ttl = nla_get_u8(tb[NFTA_OSF_TTL]); if (ttl > 2) -- cgit v1.2.3 From 690afc165bb314354667f67157c1a1aea7dc797a Mon Sep 17 00:00:00 2001 From: Niko Kortstrom Date: Thu, 16 Jan 2020 11:43:27 +0200 Subject: net: ip6_gre: fix moving ip6gre between namespaces Support for moving IPv4 GRE tunnels between namespaces was added in commit b57708add314 ("gre: add x-netns support"). The respective change for IPv6 tunnels, commit 22f08069e8b4 ("ip6gre: add x-netns support") did not drop NETIF_F_NETNS_LOCAL flag so moving them from one netns to another is still denied in IPv6 case. Drop NETIF_F_NETNS_LOCAL flag from ip6gre tunnels to allow moving ip6gre tunnel endpoints between network namespaces. Signed-off-by: Niko Kortstrom Acked-by: Nicolas Dichtel Acked-by: William Tu Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index ee968d980746..55bfc5149d0c 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1466,7 +1466,6 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) dev->mtu -= 8; if (tunnel->parms.collect_md) { - dev->features |= NETIF_F_NETNS_LOCAL; netif_keep_dst(dev); } ip6gre_tnl_init_features(dev); @@ -1894,7 +1893,6 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; - dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); @@ -2197,7 +2195,6 @@ static void ip6erspan_tap_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; - dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); -- cgit v1.2.3 From 80892772c4edac88c538165d26a0105f19b61c1c Mon Sep 17 00:00:00 2001 From: "xiaofeng.yan" Date: Mon, 20 Jan 2020 14:26:39 +0800 Subject: hsr: Fix a compilation error A compliation error happen when building branch 5.5-rc7 In file included from net/hsr/hsr_main.c:12:0: net/hsr/hsr_main.h:194:20: error: two or more data types in declaration specifiers static inline void void hsr_debugfs_rename(struct net_device *dev) So Removed one void. Fixes: 4c2d5e33dcd3 ("hsr: rename debugfs file when interface name is changed") Signed-off-by: xiaofeng.yan Acked-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index d40de84a637f..754d84b217f0 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -191,7 +191,7 @@ void hsr_debugfs_term(struct hsr_priv *priv); void hsr_debugfs_create_root(void); void hsr_debugfs_remove_root(void); #else -static inline void void hsr_debugfs_rename(struct net_device *dev) +static inline void hsr_debugfs_rename(struct net_device *dev) { } static inline void hsr_debugfs_init(struct hsr_priv *priv, -- cgit v1.2.3 From 32c72165dbd0e246e69d16a3ad348a4851afd415 Mon Sep 17 00:00:00 2001 From: Kadlecsik József Date: Sun, 19 Jan 2020 22:06:49 +0100 Subject: netfilter: ipset: use bitmap infrastructure completely The bitmap allocation did not use full unsigned long sizes when calculating the required size and that was triggered by KASAN as slab-out-of-bounds read in several places. The patch fixes all of them. Reported-by: syzbot+fabca5cbf5e54f3fe2de@syzkaller.appspotmail.com Reported-by: syzbot+827ced406c9a1d9570ed@syzkaller.appspotmail.com Reported-by: syzbot+190d63957b22ef673ea5@syzkaller.appspotmail.com Reported-by: syzbot+dfccdb2bdb4a12ad425e@syzkaller.appspotmail.com Reported-by: syzbot+df0d0f5895ef1f41a65b@syzkaller.appspotmail.com Reported-by: syzbot+b08bd19bb37513357fd4@syzkaller.appspotmail.com Reported-by: syzbot+53cdd0ec0bbabd53370a@syzkaller.appspotmail.com Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 7 ------- net/netfilter/ipset/ip_set_bitmap_gen.h | 2 +- net/netfilter/ipset/ip_set_bitmap_ip.c | 6 +++--- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 6 +++--- net/netfilter/ipset/ip_set_bitmap_port.c | 6 +++--- 5 files changed, 10 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 4d8b1eaf7708..908d38dbcb91 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -426,13 +426,6 @@ ip6addrptr(const struct sk_buff *skb, bool src, struct in6_addr *addr) sizeof(*addr)); } -/* Calculate the bytes required to store the inclusive range of a-b */ -static inline int -bitmap_bytes(u32 a, u32 b) -{ - return 4 * ((((b - a + 8) / 8) + 3) / 4); -} - /* How often should the gc be run by default */ #define IPSET_GC_TIME (3 * 60) diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 077a2cb65fcb..26ab0e9612d8 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -75,7 +75,7 @@ mtype_flush(struct ip_set *set) if (set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); - memset(map->members, 0, map->memsize); + bitmap_zero(map->members, map->elements); set->elements = 0; set->ext_size = 0; } diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index abe8f77d7d23..0a2196f59106 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -37,7 +37,7 @@ MODULE_ALIAS("ip_set_bitmap:ip"); /* Type structure */ struct bitmap_ip { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -220,7 +220,7 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, u32 first_ip, u32 last_ip, u32 elements, u32 hosts, u8 netmask) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; @@ -322,7 +322,7 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - map->memsize = bitmap_bytes(0, elements - 1); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ip; if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index b618713297da..739e343efaf6 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -42,7 +42,7 @@ enum { /* Type structure */ struct bitmap_ipmac { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -299,7 +299,7 @@ static bool init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, u32 first_ip, u32 last_ip, u32 elements) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; @@ -360,7 +360,7 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - map->memsize = bitmap_bytes(0, elements - 1); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ipmac; if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 23d6095cb196..b49978dd810d 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -30,7 +30,7 @@ MODULE_ALIAS("ip_set_bitmap:port"); /* Type structure */ struct bitmap_port { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -231,7 +231,7 @@ static bool init_map_port(struct ip_set *set, struct bitmap_port *map, u16 first_port, u16 last_port) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(map->elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_port = first_port; @@ -271,7 +271,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], return -ENOMEM; map->elements = elements; - map->memsize = bitmap_bytes(0, map->elements); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_port; if (!init_map_port(set, map, first_port, last_port)) { kfree(map); -- cgit v1.2.3 From 62ebaeaedee7591c257543d040677a60e35c7aec Mon Sep 17 00:00:00 2001 From: Yuki Taguchi Date: Mon, 20 Jan 2020 13:48:37 +0900 Subject: ipv6: sr: remove SKB_GSO_IPXIP6 on End.D* actions After LRO/GRO is applied, SRv6 encapsulated packets have SKB_GSO_IPXIP6 feature flag, and this flag must be removed right after decapulation procedure. Currently, SKB_GSO_IPXIP6 flag is not removed on End.D* actions, which creates inconsistent packet state, that is, a normal TCP/IP packets have the SKB_GSO_IPXIP6 flag. This behavior can cause unexpected fallback to GSO on routing to netdevices that do not support SKB_GSO_IPXIP6. For example, on inter-VRF forwarding, decapsulated packets separated into small packets by GSO because VRF devices do not support TSO for packets with SKB_GSO_IPXIP6 flag, and this degrades forwarding performance. This patch removes encapsulation related GSO flags from the skb right after the End.D* action is applied. Fixes: d7a669dd2f8b ("ipv6: sr: add helper functions for seg6local") Signed-off-by: Yuki Taguchi Signed-off-by: David S. Miller --- net/ipv6/seg6_local.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 85a5447a3e8d..7cbc19731997 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef CONFIG_IPV6_SEG6_HMAC #include #endif @@ -135,7 +136,8 @@ static bool decap_and_validate(struct sk_buff *skb, int proto) skb_reset_network_header(skb); skb_reset_transport_header(skb); - skb->encapsulation = 0; + if (iptunnel_pull_offloads(skb)) + return false; return true; } -- cgit v1.2.3 From cb626bf566eb4433318d35681286c494f04fedcc Mon Sep 17 00:00:00 2001 From: Jouni Hogander Date: Mon, 20 Jan 2020 09:51:03 +0200 Subject: net-sysfs: Fix reference count leak Netdev_register_kobject is calling device_initialize. In case of error reference taken by device_initialize is not given up. Drivers are supposed to call free_netdev in case of error. In non-error case the last reference is given up there and device release sequence is triggered. In error case this reference is kept and the release sequence is never started. Fix this by setting reg_state as NETREG_UNREGISTERED if registering fails. This is the rootcause for couple of memory leaks reported by Syzkaller: BUG: memory leak unreferenced object 0xffff8880675ca008 (size 256): comm "netdev_register", pid 281, jiffies 4294696663 (age 6.808s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000058ca4711>] kmem_cache_alloc_trace+0x167/0x280 [<000000002340019b>] device_add+0x882/0x1750 [<000000001d588c3a>] netdev_register_kobject+0x128/0x380 [<0000000011ef5535>] register_netdevice+0xa1b/0xf00 [<000000007fcf1c99>] __tun_chr_ioctl+0x20d5/0x3dd0 [<000000006a5b7b2b>] tun_chr_ioctl+0x2f/0x40 [<00000000f30f834a>] do_vfs_ioctl+0x1c7/0x1510 [<00000000fba062ea>] ksys_ioctl+0x99/0xb0 [<00000000b1c1b8d2>] __x64_sys_ioctl+0x78/0xb0 [<00000000984cabb9>] do_syscall_64+0x16f/0x580 [<000000000bde033d>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<00000000e6ca2d9f>] 0xffffffffffffffff BUG: memory leak unreferenced object 0xffff8880668ba588 (size 8): comm "kobject_set_nam", pid 286, jiffies 4294725297 (age 9.871s) hex dump (first 8 bytes): 6e 72 30 00 cc be df 2b nr0....+ backtrace: [<00000000a322332a>] __kmalloc_track_caller+0x16e/0x290 [<00000000236fd26b>] kstrdup+0x3e/0x70 [<00000000dd4a2815>] kstrdup_const+0x3e/0x50 [<0000000049a377fc>] kvasprintf_const+0x10e/0x160 [<00000000627fc711>] kobject_set_name_vargs+0x5b/0x140 [<0000000019eeab06>] dev_set_name+0xc0/0xf0 [<0000000069cb12bc>] netdev_register_kobject+0xc8/0x320 [<00000000f2e83732>] register_netdevice+0xa1b/0xf00 [<000000009e1f57cc>] __tun_chr_ioctl+0x20d5/0x3dd0 [<000000009c560784>] tun_chr_ioctl+0x2f/0x40 [<000000000d759e02>] do_vfs_ioctl+0x1c7/0x1510 [<00000000351d7c31>] ksys_ioctl+0x99/0xb0 [<000000008390040a>] __x64_sys_ioctl+0x78/0xb0 [<0000000052d196b7>] do_syscall_64+0x16f/0x580 [<0000000019af9236>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<00000000bc384531>] 0xffffffffffffffff v3 -> v4: Set reg_state to NETREG_UNREGISTERED if registering fails v2 -> v3: * Replaced BUG_ON with WARN_ON in free_netdev and netdev_release v1 -> v2: * Relying on driver calling free_netdev rather than calling put_device directly in error path Reported-by: syzbot+ad8ca40ecd77896d51e2@syzkaller.appspotmail.com Cc: David Miller Cc: Greg Kroah-Hartman Cc: Lukas Bulwahn Signed-off-by: Jouni Hogander Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 7e885d069707..e82e9b82dfd9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9302,8 +9302,10 @@ int register_netdevice(struct net_device *dev) goto err_uninit; ret = netdev_register_kobject(dev); - if (ret) + if (ret) { + dev->reg_state = NETREG_UNREGISTERED; goto err_uninit; + } dev->reg_state = NETREG_REGISTERED; __netdev_update_features(dev); -- cgit v1.2.3 From 5b2f1f3070b6447b76174ea8bfb7390dc6253ebd Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 20 Jan 2020 18:04:56 +0800 Subject: tcp_bbr: improve arithmetic division in bbr_update_bw() do_div() does a 64-by-32 division. Use div64_long() instead of it if the divisor is long, to avoid truncation to 32-bit. And as a nice side effect also cleans up the function a bit. Signed-off-by: Wen Yang Cc: Eric Dumazet Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index a6545ef0d27b..6c4d79baff26 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -779,8 +779,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) * bandwidth sample. Delivered is in packets and interval_us in uS and * ratio will be <<1 for most connections. So delivered is first scaled. */ - bw = (u64)rs->delivered * BW_UNIT; - do_div(bw, rs->interval_us); + bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); /* If this sample is application-limited, it is likely to have a very * low delivered count that represents application behavior rather than -- cgit v1.2.3 From bfe02b9f9476bc8784ebb0f78fa244ad15bb15ea Mon Sep 17 00:00:00 2001 From: Theodore Dubois Date: Mon, 20 Jan 2020 14:10:53 -0800 Subject: tcp: remove redundant assigment to snd_cwnd Not sure how this got in here. git blame says the second assignment was added in 3a9a57f6, but that commit also removed the first assignment. Signed-off-by: Theodore Dubois Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d885ba868822..04273d6aa36b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2614,7 +2614,6 @@ int tcp_disconnect(struct sock *sk, int flags) WRITE_ONCE(tp->write_seq, seq); icsk->icsk_backoff = 0; - tp->snd_cwnd = 2; icsk->icsk_probes_out = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; -- cgit v1.2.3 From d0f418516022c32ecceaf4275423e5bd3f8743a9 Mon Sep 17 00:00:00 2001 From: William Dauchy Date: Tue, 21 Jan 2020 15:26:24 +0100 Subject: net, ip_tunnel: fix namespaces move in the same manner as commit 690afc165bb3 ("net: ip6_gre: fix moving ip6gre between namespaces"), fix namespace moving as it was broken since commit 2e15ea390e6f ("ip_gre: Add support to collect tunnel metadata."). Indeed, the ip6_gre commit removed the local flag for collect_md condition, so there is no reason to keep it for ip_gre/ip_tunnel. this patch will fix both ip_tunnel and ip_gre modules. Fixes: 2e15ea390e6f ("ip_gre: Add support to collect tunnel metadata.") Signed-off-by: William Dauchy Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 0fe2a5d3e258..74e1d964a615 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1236,10 +1236,8 @@ int ip_tunnel_init(struct net_device *dev) iph->version = 4; iph->ihl = 5; - if (tunnel->collect_md) { - dev->features |= NETIF_F_NETNS_LOCAL; + if (tunnel->collect_md) netif_keep_dst(dev); - } return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); -- cgit v1.2.3 From 58c8db929db1c1d785a6f5d8f8692e5dbcc35e84 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Tue, 21 Jan 2020 13:31:47 +0100 Subject: net, sk_msg: Don't check if sock is locked when tearing down psock As John Fastabend reports [0], psock state tear-down can happen on receive path *after* unlocking the socket, if the only other psock user, that is sockmap or sockhash, releases its psock reference before tcp_bpf_recvmsg does so: tcp_bpf_recvmsg() psock = sk_psock_get(sk) <- refcnt 2 lock_sock(sk); ... sock_map_free() <- refcnt 1 release_sock(sk) sk_psock_put() <- refcnt 0 Remove the lockdep check for socket lock in psock tear-down that got introduced in 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down"). [0] https://lore.kernel.org/netdev/5e25dc995d7d_74082aaee6e465b441@john-XPS-13-9370.notmuch/ Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down") Reported-by: syzbot+d73682fcf7fee6982fe3@syzkaller.appspotmail.com Suggested-by: John Fastabend Signed-off-by: Jakub Sitnicki Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/skmsg.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 3866d7e20c07..ded2d5227678 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -594,8 +594,6 @@ EXPORT_SYMBOL_GPL(sk_psock_destroy); void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { - sock_owned_by_me(sk); - sk_psock_cork_free(psock); sk_psock_zap_ingress(psock); -- cgit v1.2.3 From c80794323e82ac6ab45052ebba5757ce47b4b588 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 21 Jan 2020 15:09:40 +0000 Subject: net: Fix packet reordering caused by GRO and listified RX cooperation Commit 323ebb61e32b ("net: use listified RX for handling GRO_NORMAL skbs") introduces batching of GRO_NORMAL packets in napi_frags_finish, and commit 6570bc79c0df ("net: core: use listified Rx for GRO_NORMAL in napi_gro_receive()") adds the same to napi_skb_finish. However, dev_gro_receive (that is called just before napi_{frags,skb}_finish) can also pass skbs to the networking stack: e.g., when the GRO session is flushed, napi_gro_complete is called, which passes pp directly to netif_receive_skb_internal, skipping napi->rx_list. It means that the packet stored in pp will be handled by the stack earlier than the packets that arrived before, but are still waiting in napi->rx_list. It leads to TCP reorderings that can be observed in the TCPOFOQueue counter in netstat. This commit fixes the reordering issue by making napi_gro_complete also use napi->rx_list, so that all packets going through GRO will keep their order. In order to keep napi_gro_flush working properly, gro_normal_list calls are moved after the flush to clear napi->rx_list. iwlwifi calls napi_gro_flush directly and does the same thing that is done by gro_normal_list, so the same change is applied there: napi_gro_flush is moved to be before the flush of napi->rx_list. A few other drivers also use napi_gro_flush (brocade/bna/bnad.c, cortina/gemini.c, hisilicon/hns3/hns3_enet.c). The first two also use napi_complete_done afterwards, which performs the gro_normal_list flush, so they are fine. The latter calls napi_gro_receive right after napi_gro_flush, so it can end up with non-empty napi->rx_list anyway. Fixes: 323ebb61e32b ("net: use listified RX for handling GRO_NORMAL skbs") Signed-off-by: Maxim Mikityanskiy Cc: Alexander Lobakin Cc: Edward Cree Acked-by: Alexander Lobakin Acked-by: Saeed Mahameed Acked-by: Edward Cree Signed-off-by: David S. Miller --- drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 4 +- net/core/dev.c | 64 ++++++++++++++-------------- 2 files changed, 35 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c index 452da44a21e0..f0b8ff67a1bc 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c @@ -1529,13 +1529,13 @@ out: napi = &rxq->napi; if (napi->poll) { + napi_gro_flush(napi, false); + if (napi->rx_count) { netif_receive_skb_list(&napi->rx_list); INIT_LIST_HEAD(&napi->rx_list); napi->rx_count = 0; } - - napi_gro_flush(napi, false); } iwl_pcie_rxq_restock(trans, rxq); diff --git a/net/core/dev.c b/net/core/dev.c index e82e9b82dfd9..cca03914108a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5491,9 +5491,29 @@ static void flush_all_backlogs(void) put_online_cpus(); } +/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ +static void gro_normal_list(struct napi_struct *napi) +{ + if (!napi->rx_count) + return; + netif_receive_skb_list_internal(&napi->rx_list); + INIT_LIST_HEAD(&napi->rx_list); + napi->rx_count = 0; +} + +/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, + * pass the whole batch up to the stack. + */ +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) +{ + list_add_tail(&skb->list, &napi->rx_list); + if (++napi->rx_count >= gro_normal_batch) + gro_normal_list(napi); +} + INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); -static int napi_gro_complete(struct sk_buff *skb) +static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { struct packet_offload *ptype; __be16 type = skb->protocol; @@ -5526,7 +5546,8 @@ static int napi_gro_complete(struct sk_buff *skb) } out: - return netif_receive_skb_internal(skb); + gro_normal_one(napi, skb); + return NET_RX_SUCCESS; } static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, @@ -5539,7 +5560,7 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; skb_list_del_init(skb); - napi_gro_complete(skb); + napi_gro_complete(napi, skb); napi->gro_hash[index].count--; } @@ -5641,7 +5662,7 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) } } -static void gro_flush_oldest(struct list_head *head) +static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) { struct sk_buff *oldest; @@ -5657,7 +5678,7 @@ static void gro_flush_oldest(struct list_head *head) * SKB to the chain. */ skb_list_del_init(oldest); - napi_gro_complete(oldest); + napi_gro_complete(napi, oldest); } INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, @@ -5733,7 +5754,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (pp) { skb_list_del_init(pp); - napi_gro_complete(pp); + napi_gro_complete(napi, pp); napi->gro_hash[hash].count--; } @@ -5744,7 +5765,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff goto normal; if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { - gro_flush_oldest(gro_head); + gro_flush_oldest(napi, gro_head); } else { napi->gro_hash[hash].count++; } @@ -5802,26 +5823,6 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); -/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ -static void gro_normal_list(struct napi_struct *napi) -{ - if (!napi->rx_count) - return; - netif_receive_skb_list_internal(&napi->rx_list); - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; -} - -/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, - * pass the whole batch up to the stack. - */ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) -{ - list_add_tail(&skb->list, &napi->rx_list); - if (++napi->rx_count >= gro_normal_batch) - gro_normal_list(napi); -} - static void napi_skb_free_stolen_head(struct sk_buff *skb) { skb_dst_drop(skb); @@ -6200,8 +6201,6 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - gro_normal_list(n); - if (n->gro_bitmask) { unsigned long timeout = 0; @@ -6217,6 +6216,9 @@ bool napi_complete_done(struct napi_struct *n, int work_done) hrtimer_start(&n->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); } + + gro_normal_list(n); + if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ local_irq_save(flags); @@ -6548,8 +6550,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } - gro_normal_list(n); - if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. @@ -6557,6 +6557,8 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) napi_gro_flush(n, HZ >= 1000); } + gro_normal_list(n); + /* Some drivers may have called napi_schedule * prior to exhausting their budget. */ -- cgit v1.2.3 From d39ca2590d10712f412add7a88e1dd467a7246f4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 21 Jan 2020 16:50:49 +0100 Subject: Revert "udp: do rmem bulk free even if the rx sk queue is empty" This reverts commit 0d4a6608f68c7532dcbfec2ea1150c9761767d03. Williem reported that after commit 0d4a6608f68c ("udp: do rmem bulk free even if the rx sk queue is empty") the memory allocated by an almost idle system with many UDP sockets can grow a lot. For stable kernel keep the solution as simple as possible and revert the offending commit. Reported-by: Willem de Bruijn Diagnosed-by: Eric Dumazet Fixes: 0d4a6608f68c ("udp: do rmem bulk free even if the rx sk queue is empty") Signed-off-by: Paolo Abeni Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 93a355b6b092..030d43c7c957 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1368,7 +1368,8 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; - if (size < (sk->sk_rcvbuf >> 2)) + if (size < (sk->sk_rcvbuf >> 2) && + !skb_queue_empty(&up->reader_queue)) return; } else { size += up->forward_deficit; -- cgit v1.2.3 From 36d79af7fb59d6d9106feb9c1855eb93d6d53fe6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Jan 2020 11:02:20 -0800 Subject: net_sched: use validated TCA_KIND attribute in tc_new_tfilter() sysbot found another issue in tc_new_tfilter(). We probably should use @name which contains the sanitized version of TCA_KIND. BUG: KMSAN: uninit-value in string_nocheck lib/vsprintf.c:608 [inline] BUG: KMSAN: uninit-value in string+0x522/0x690 lib/vsprintf.c:689 CPU: 1 PID: 10753 Comm: syz-executor.1 Not tainted 5.5.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x220 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:118 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215 string_nocheck lib/vsprintf.c:608 [inline] string+0x522/0x690 lib/vsprintf.c:689 vsnprintf+0x207d/0x31b0 lib/vsprintf.c:2574 __request_module+0x2ad/0x11c0 kernel/kmod.c:143 tcf_proto_lookup_ops+0x241/0x720 net/sched/cls_api.c:139 tcf_proto_create net/sched/cls_api.c:262 [inline] tc_new_tfilter+0x2a4e/0x5010 net/sched/cls_api.c:2058 rtnetlink_rcv_msg+0xcb7/0x1570 net/core/rtnetlink.c:5415 netlink_rcv_skb+0x451/0x650 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:5442 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0xf9e/0x1100 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x1248/0x14d0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:639 [inline] sock_sendmsg net/socket.c:659 [inline] ____sys_sendmsg+0x12b6/0x1350 net/socket.c:2330 ___sys_sendmsg net/socket.c:2384 [inline] __sys_sendmsg+0x451/0x5f0 net/socket.c:2417 __do_sys_sendmsg net/socket.c:2426 [inline] __se_sys_sendmsg+0x97/0xb0 net/socket.c:2424 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2424 do_syscall_64+0xb8/0x160 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45b349 Code: ad b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f88b3948c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f88b39496d4 RCX: 000000000045b349 RDX: 0000000000000000 RSI: 00000000200001c0 RDI: 0000000000000003 RBP: 000000000075bfc8 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 000000000000099f R14: 00000000004cb163 R15: 000000000075bfd4 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:144 [inline] kmsan_internal_poison_shadow+0x66/0xd0 mm/kmsan/kmsan.c:127 kmsan_slab_alloc+0x8a/0xe0 mm/kmsan/kmsan_hooks.c:82 slab_alloc_node mm/slub.c:2774 [inline] __kmalloc_node_track_caller+0xb40/0x1200 mm/slub.c:4382 __kmalloc_reserve net/core/skbuff.c:141 [inline] __alloc_skb+0x2fd/0xac0 net/core/skbuff.c:209 alloc_skb include/linux/skbuff.h:1049 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1174 [inline] netlink_sendmsg+0x7d3/0x14d0 net/netlink/af_netlink.c:1892 sock_sendmsg_nosec net/socket.c:639 [inline] sock_sendmsg net/socket.c:659 [inline] ____sys_sendmsg+0x12b6/0x1350 net/socket.c:2330 ___sys_sendmsg net/socket.c:2384 [inline] __sys_sendmsg+0x451/0x5f0 net/socket.c:2417 __do_sys_sendmsg net/socket.c:2426 [inline] __se_sys_sendmsg+0x97/0xb0 net/socket.c:2424 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2424 do_syscall_64+0xb8/0x160 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 6f96c3c6904c ("net_sched: fix backward compatibility for TCA_KIND") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Cong Wang Cc: Marcelo Ricardo Leitner Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 76e0d122616a..c2cdd0fc2e70 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2055,9 +2055,8 @@ replay: &chain_info)); mutex_unlock(&chain->filter_chain_lock); - tp_new = tcf_proto_create(nla_data(tca[TCA_KIND]), - protocol, prio, chain, rtnl_held, - extack); + tp_new = tcf_proto_create(name, protocol, prio, chain, + rtnl_held, extack); if (IS_ERR(tp_new)) { err = PTR_ERR(tp_new); goto errout_tp; -- cgit v1.2.3 From 5311a69aaca30fa849c3cc46fb25f75727fb72d0 Mon Sep 17 00:00:00 2001 From: William Dauchy Date: Tue, 21 Jan 2020 21:49:54 +0100 Subject: net, ip6_tunnel: fix namespaces move in the same manner as commit d0f418516022 ("net, ip_tunnel: fix namespaces move"), fix namespace moving as it was broken since commit 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnel"), but for ipv6 this time; there is no reason to keep it for ip6_tunnel. Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnel") Signed-off-by: William Dauchy Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 2f376dbc37d5..b5dd20c4599b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1877,10 +1877,8 @@ static int ip6_tnl_dev_init(struct net_device *dev) if (err) return err; ip6_tnl_link_config(t); - if (t->parms.collect_md) { - dev->features |= NETIF_F_NETNS_LOCAL; + if (t->parms.collect_md) netif_keep_dst(dev); - } return 0; } -- cgit v1.2.3 From d836f5c69d87473ff65c06a6123e5b2cf5e56f5b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Jan 2020 22:47:29 -0800 Subject: net: rtnetlink: validate IFLA_MTU attribute in rtnl_create_link() rtnl_create_link() needs to apply dev->min_mtu and dev->max_mtu checks that we apply in do_setlink() Otherwise malicious users can crash the kernel, for example after an integer overflow : BUG: KASAN: use-after-free in memset include/linux/string.h:365 [inline] BUG: KASAN: use-after-free in __alloc_skb+0x37b/0x5e0 net/core/skbuff.c:238 Write of size 32 at addr ffff88819f20b9c0 by task swapper/0/0 CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.5.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x197/0x210 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374 __kasan_report.cold+0x1b/0x41 mm/kasan/report.c:506 kasan_report+0x12/0x20 mm/kasan/common.c:639 check_memory_region_inline mm/kasan/generic.c:185 [inline] check_memory_region+0x134/0x1a0 mm/kasan/generic.c:192 memset+0x24/0x40 mm/kasan/common.c:108 memset include/linux/string.h:365 [inline] __alloc_skb+0x37b/0x5e0 net/core/skbuff.c:238 alloc_skb include/linux/skbuff.h:1049 [inline] alloc_skb_with_frags+0x93/0x590 net/core/skbuff.c:5664 sock_alloc_send_pskb+0x7ad/0x920 net/core/sock.c:2242 sock_alloc_send_skb+0x32/0x40 net/core/sock.c:2259 mld_newpack+0x1d7/0x7f0 net/ipv6/mcast.c:1609 add_grhead.isra.0+0x299/0x370 net/ipv6/mcast.c:1713 add_grec+0x7db/0x10b0 net/ipv6/mcast.c:1844 mld_send_cr net/ipv6/mcast.c:1970 [inline] mld_ifc_timer_expire+0x3d3/0x950 net/ipv6/mcast.c:2477 call_timer_fn+0x1ac/0x780 kernel/time/timer.c:1404 expire_timers kernel/time/timer.c:1449 [inline] __run_timers kernel/time/timer.c:1773 [inline] __run_timers kernel/time/timer.c:1740 [inline] run_timer_softirq+0x6c3/0x1790 kernel/time/timer.c:1786 __do_softirq+0x262/0x98c kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0x19b/0x1e0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:536 [inline] smp_apic_timer_interrupt+0x1a3/0x610 arch/x86/kernel/apic/apic.c:1137 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:829 RIP: 0010:native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:61 Code: 98 6b ea f9 eb 8a cc cc cc cc cc cc e9 07 00 00 00 0f 00 2d 44 1c 60 00 f4 c3 66 90 e9 07 00 00 00 0f 00 2d 34 1c 60 00 fb f4 cc 55 48 89 e5 41 57 41 56 41 55 41 54 53 e8 4e 5d 9a f9 e8 79 RSP: 0018:ffffffff89807ce8 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13 RAX: 1ffffffff13266ae RBX: ffffffff8987a1c0 RCX: 0000000000000000 RDX: dffffc0000000000 RSI: 0000000000000006 RDI: ffffffff8987aa54 RBP: ffffffff89807d18 R08: ffffffff8987a1c0 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: dffffc0000000000 R13: ffffffff8a799980 R14: 0000000000000000 R15: 0000000000000000 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:690 default_idle_call+0x84/0xb0 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x3c8/0x6e0 kernel/sched/idle.c:269 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:361 rest_init+0x23b/0x371 init/main.c:451 arch_call_rest_init+0xe/0x1b start_kernel+0x904/0x943 init/main.c:784 x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:490 x86_64_start_kernel+0x77/0x7b arch/x86/kernel/head64.c:471 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:242 The buggy address belongs to the page: page:ffffea00067c82c0 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 raw: 057ffe0000000000 ffffea00067c82c8 ffffea00067c82c8 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88819f20b880: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88819f20b900: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff88819f20b980: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff88819f20ba00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88819f20ba80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 29 +++++++++++++++++++---------- net/core/rtnetlink.c | 13 +++++++++++-- 3 files changed, 32 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ae5e260911e2..cac56fb59af8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3698,6 +3698,8 @@ int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int dev_change_net_namespace(struct net_device *, struct net *, const char *); int __dev_set_mtu(struct net_device *, int); +int dev_validate_mtu(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); int dev_set_mtu_ext(struct net_device *dev, int mtu, struct netlink_ext_ack *extack); int dev_set_mtu(struct net_device *, int); diff --git a/net/core/dev.c b/net/core/dev.c index cca03914108a..81befd0c2510 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8196,6 +8196,22 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu) } EXPORT_SYMBOL(__dev_set_mtu); +int dev_validate_mtu(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) +{ + /* MTU must be positive, and in range */ + if (new_mtu < 0 || new_mtu < dev->min_mtu) { + NL_SET_ERR_MSG(extack, "mtu less than device minimum"); + return -EINVAL; + } + + if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { + NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); + return -EINVAL; + } + return 0; +} + /** * dev_set_mtu_ext - Change maximum transfer unit * @dev: device @@ -8212,16 +8228,9 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, if (new_mtu == dev->mtu) return 0; - /* MTU must be positive, and in range */ - if (new_mtu < 0 || new_mtu < dev->min_mtu) { - NL_SET_ERR_MSG(extack, "mtu less than device minimum"); - return -EINVAL; - } - - if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { - NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); - return -EINVAL; - } + err = dev_validate_mtu(dev, new_mtu, extack); + if (err) + return err; if (!netif_device_present(dev)) return -ENODEV; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 02916f43bf63..d9001b5c48eb 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3048,8 +3048,17 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, dev->rtnl_link_ops = ops; dev->rtnl_link_state = RTNL_LINK_INITIALIZING; - if (tb[IFLA_MTU]) - dev->mtu = nla_get_u32(tb[IFLA_MTU]); + if (tb[IFLA_MTU]) { + u32 mtu = nla_get_u32(tb[IFLA_MTU]); + int err; + + err = dev_validate_mtu(dev, mtu, extack); + if (err) { + free_netdev(dev); + return ERR_PTR(err); + } + dev->mtu = mtu; + } if (tb[IFLA_ADDRESS]) { memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), nla_len(tb[IFLA_ADDRESS])); -- cgit v1.2.3 From bb48eb9b12a95db9d679025927269d4adda6dbd1 Mon Sep 17 00:00:00 2001 From: Kristian Evensen Date: Thu, 23 Jan 2020 13:20:18 +0100 Subject: fou: Fix IPv6 netlink policy When submitting v2 of "fou: Support binding FoU socket" (1713cb37bf67), I accidentally sent the wrong version of the patch and one fix was missing. In the initial version of the patch, as well as the version 2 that I submitted, I incorrectly used ".type" for the two V6-attributes. The correct is to use ".len". Reported-by: Dmitry Vyukov Fixes: 1713cb37bf67 ("fou: Support binding FoU socket") Signed-off-by: Kristian Evensen Signed-off-by: David S. Miller --- net/ipv4/fou.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 30fa771d382a..dcc79ff54b41 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -662,8 +662,8 @@ static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, - [FOU_ATTR_LOCAL_V6] = { .type = sizeof(struct in6_addr), }, - [FOU_ATTR_PEER_V6] = { .type = sizeof(struct in6_addr), }, + [FOU_ATTR_LOCAL_V6] = { .len = sizeof(struct in6_addr), }, + [FOU_ATTR_PEER_V6] = { .len = sizeof(struct in6_addr), }, [FOU_ATTR_PEER_PORT] = { .type = NLA_U16, }, [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, }; -- cgit v1.2.3 From 61678d28d4a45ef376f5d02a839cc37509ae9281 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 22 Jan 2020 15:42:02 -0800 Subject: net_sched: fix datalen for ematch syzbot reported an out-of-bound access in em_nbyte. As initially analyzed by Eric, this is because em_nbyte sets its own em->datalen in em_nbyte_change() other than the one specified by user, but this value gets overwritten later by its caller tcf_em_validate(). We should leave em->datalen untouched to respect their choices. I audit all the in-tree ematch users, all of those implement ->change() set em->datalen, so we can just avoid setting it twice in this case. Reported-and-tested-by: syzbot+5af9a90dad568aa9f611@syzkaller.appspotmail.com Reported-by: syzbot+2f07903a5b05e7f36410@syzkaller.appspotmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: Eric Dumazet Signed-off-by: Cong Wang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/ematch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 8f2ad706784d..d0140a92694a 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -263,12 +263,12 @@ static int tcf_em_validate(struct tcf_proto *tp, } em->data = (unsigned long) v; } + em->datalen = data_len; } } em->matchid = em_hdr->matchid; em->flags = em_hdr->flags; - em->datalen = data_len; em->net = net; err = 0; -- cgit v1.2.3 From 971485a0d681954677ec6a7ed990359aceabc06e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 23 Jan 2020 00:43:28 +0000 Subject: ipvs: fix spelling mistake "to" -> "too" There is a spelling mistake in a IP_VS_ERR_RL message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 8dc892a9dc91..605e0f68f8bd 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1239,7 +1239,7 @@ static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, p = msg_end; if (p + sizeof(s->v4) > buffer+buflen) { - IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n"); + IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n"); return; } s = (union ip_vs_sync_conn *)p; -- cgit v1.2.3 From 43d88774d138496c88920e2bb012f5fd098ee6ad Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 23 Jan 2020 00:45:55 +0000 Subject: caif_usb: fix spelling mistake "to" -> "too" There is a spelling mistake in a pr_warn message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/caif/caif_usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 76bd67891fb3..a0116b9503d9 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -62,7 +62,7 @@ static int cfusbl_transmit(struct cflayer *layr, struct cfpkt *pkt) hpad = (info->hdr_len + CFUSB_PAD_DESCR_SZ) & (CFUSB_ALIGNMENT - 1); if (skb_headroom(skb) < ETH_HLEN + CFUSB_PAD_DESCR_SZ + hpad) { - pr_warn("Headroom to small\n"); + pr_warn("Headroom too small\n"); kfree_skb(skb); return -EIO; } -- cgit v1.2.3 From 4d299f183314aa79131abdfa2c528f8807010ea8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 23 Jan 2020 09:27:30 +0000 Subject: net/rose: fix spelling mistake "to" -> "too" There is a spelling mistake in a printk message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/rose/af_rose.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 46b8ff24020d..1e8eeb044b07 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1475,7 +1475,7 @@ static int __init rose_proto_init(void) int rc; if (rose_ndevs > 0x7FFFFFFF/sizeof(struct net_device *)) { - printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter to large\n"); + printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter too large\n"); rc = -EINVAL; goto out; } -- cgit v1.2.3 From 2bec445f9bf35e52e395b971df48d3e1e5dc704a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Jan 2020 21:03:00 -0800 Subject: tcp: do not leave dangling pointers in tp->highest_sack Latest commit 853697504de0 ("tcp: Fix highest_sack and highest_sack_seq") apparently allowed syzbot to trigger various crashes in TCP stack [1] I believe this commit only made things easier for syzbot to find its way into triggering use-after-frees. But really the bugs could lead to bad TCP behavior or even plain crashes even for non malicious peers. I have audited all calls to tcp_rtx_queue_unlink() and tcp_rtx_queue_unlink_and_free() and made sure tp->highest_sack would be updated if we are removing from rtx queue the skb that tp->highest_sack points to. These updates were missing in three locations : 1) tcp_clean_rtx_queue() [This one seems quite serious, I have no idea why this was not caught earlier] 2) tcp_rtx_queue_purge() [Probably not a big deal for normal operations] 3) tcp_send_synack() [Probably not a big deal for normal operations] [1] BUG: KASAN: use-after-free in tcp_highest_sack_seq include/net/tcp.h:1864 [inline] BUG: KASAN: use-after-free in tcp_highest_sack_seq include/net/tcp.h:1856 [inline] BUG: KASAN: use-after-free in tcp_check_sack_reordering+0x33c/0x3a0 net/ipv4/tcp_input.c:891 Read of size 4 at addr ffff8880a488d068 by task ksoftirqd/1/16 CPU: 1 PID: 16 Comm: ksoftirqd/1 Not tainted 5.5.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x197/0x210 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374 __kasan_report.cold+0x1b/0x41 mm/kasan/report.c:506 kasan_report+0x12/0x20 mm/kasan/common.c:639 __asan_report_load4_noabort+0x14/0x20 mm/kasan/generic_report.c:134 tcp_highest_sack_seq include/net/tcp.h:1864 [inline] tcp_highest_sack_seq include/net/tcp.h:1856 [inline] tcp_check_sack_reordering+0x33c/0x3a0 net/ipv4/tcp_input.c:891 tcp_try_undo_partial net/ipv4/tcp_input.c:2730 [inline] tcp_fastretrans_alert+0xf74/0x23f0 net/ipv4/tcp_input.c:2847 tcp_ack+0x2577/0x5bf0 net/ipv4/tcp_input.c:3710 tcp_rcv_established+0x6dd/0x1e90 net/ipv4/tcp_input.c:5706 tcp_v4_do_rcv+0x619/0x8d0 net/ipv4/tcp_ipv4.c:1619 tcp_v4_rcv+0x307f/0x3b40 net/ipv4/tcp_ipv4.c:2001 ip_protocol_deliver_rcu+0x5a/0x880 net/ipv4/ip_input.c:204 ip_local_deliver_finish+0x23b/0x380 net/ipv4/ip_input.c:231 NF_HOOK include/linux/netfilter.h:307 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] ip_local_deliver+0x1e9/0x520 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:442 [inline] ip_rcv_finish+0x1db/0x2f0 net/ipv4/ip_input.c:428 NF_HOOK include/linux/netfilter.h:307 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] ip_rcv+0xe8/0x3f0 net/ipv4/ip_input.c:538 __netif_receive_skb_one_core+0x113/0x1a0 net/core/dev.c:5148 __netif_receive_skb+0x2c/0x1d0 net/core/dev.c:5262 process_backlog+0x206/0x750 net/core/dev.c:6093 napi_poll net/core/dev.c:6530 [inline] net_rx_action+0x508/0x1120 net/core/dev.c:6598 __do_softirq+0x262/0x98c kernel/softirq.c:292 run_ksoftirqd kernel/softirq.c:603 [inline] run_ksoftirqd+0x8e/0x110 kernel/softirq.c:595 smpboot_thread_fn+0x6a3/0xa40 kernel/smpboot.c:165 kthread+0x361/0x430 kernel/kthread.c:255 ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352 Allocated by task 10091: save_stack+0x23/0x90 mm/kasan/common.c:72 set_track mm/kasan/common.c:80 [inline] __kasan_kmalloc mm/kasan/common.c:513 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:486 kasan_slab_alloc+0xf/0x20 mm/kasan/common.c:521 slab_post_alloc_hook mm/slab.h:584 [inline] slab_alloc_node mm/slab.c:3263 [inline] kmem_cache_alloc_node+0x138/0x740 mm/slab.c:3575 __alloc_skb+0xd5/0x5e0 net/core/skbuff.c:198 alloc_skb_fclone include/linux/skbuff.h:1099 [inline] sk_stream_alloc_skb net/ipv4/tcp.c:875 [inline] sk_stream_alloc_skb+0x113/0xc90 net/ipv4/tcp.c:852 tcp_sendmsg_locked+0xcf9/0x3470 net/ipv4/tcp.c:1282 tcp_sendmsg+0x30/0x50 net/ipv4/tcp.c:1432 inet_sendmsg+0x9e/0xe0 net/ipv4/af_inet.c:807 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:672 __sys_sendto+0x262/0x380 net/socket.c:1998 __do_sys_sendto net/socket.c:2010 [inline] __se_sys_sendto net/socket.c:2006 [inline] __x64_sys_sendto+0xe1/0x1a0 net/socket.c:2006 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 10095: save_stack+0x23/0x90 mm/kasan/common.c:72 set_track mm/kasan/common.c:80 [inline] kasan_set_free_info mm/kasan/common.c:335 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:474 kasan_slab_free+0xe/0x10 mm/kasan/common.c:483 __cache_free mm/slab.c:3426 [inline] kmem_cache_free+0x86/0x320 mm/slab.c:3694 kfree_skbmem+0x178/0x1c0 net/core/skbuff.c:645 __kfree_skb+0x1e/0x30 net/core/skbuff.c:681 sk_eat_skb include/net/sock.h:2453 [inline] tcp_recvmsg+0x1252/0x2930 net/ipv4/tcp.c:2166 inet_recvmsg+0x136/0x610 net/ipv4/af_inet.c:838 sock_recvmsg_nosec net/socket.c:886 [inline] sock_recvmsg net/socket.c:904 [inline] sock_recvmsg+0xce/0x110 net/socket.c:900 __sys_recvfrom+0x1ff/0x350 net/socket.c:2055 __do_sys_recvfrom net/socket.c:2073 [inline] __se_sys_recvfrom net/socket.c:2069 [inline] __x64_sys_recvfrom+0xe1/0x1a0 net/socket.c:2069 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff8880a488d040 which belongs to the cache skbuff_fclone_cache of size 456 The buggy address is located 40 bytes inside of 456-byte region [ffff8880a488d040, ffff8880a488d208) The buggy address belongs to the page: page:ffffea0002922340 refcount:1 mapcount:0 mapping:ffff88821b057000 index:0x0 raw: 00fffe0000000200 ffffea00022a5788 ffffea0002624a48 ffff88821b057000 raw: 0000000000000000 ffff8880a488d040 0000000100000006 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8880a488cf00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8880a488cf80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff8880a488d000: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb ^ ffff8880a488d080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880a488d100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 853697504de0 ("tcp: Fix highest_sack and highest_sack_seq") Fixes: 50895b9de1d3 ("tcp: highest_sack fix") Fixes: 737ff314563c ("tcp: use sequence distance to detect reordering") Signed-off-by: Eric Dumazet Cc: Cambda Zhu Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_output.c | 1 + 3 files changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 04273d6aa36b..a7d766e6390e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2524,6 +2524,7 @@ static void tcp_rtx_queue_purge(struct sock *sk) { struct rb_node *p = rb_first(&sk->tcp_rtx_queue); + tcp_sk(sk)->highest_sack = NULL; while (p) { struct sk_buff *skb = rb_to_skb(p); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5347ab2c9c58..2a976f57f7e7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3164,6 +3164,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->retransmit_skb_hint = NULL; if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; + tcp_highest_sack_replace(sk, skb, next); tcp_rtx_queue_unlink_and_free(skb, sk); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 58c92a7d671c..b62b59b18db9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3232,6 +3232,7 @@ int tcp_send_synack(struct sock *sk) if (!nskb) return -ENOMEM; INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); + tcp_highest_sack_replace(sk, skb, nskb); tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); -- cgit v1.2.3 From 8bf7092021f283944f0c5f4c364853201c45c611 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 23 Jan 2020 10:11:20 +0300 Subject: vcc_seq_next should increase position index if seq_file .next fuction does not change position index, read after some lseek can generate unexpected output. https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/atm/proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/atm/proc.c b/net/atm/proc.c index d79221fd4dae..c31896707313 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -134,8 +134,7 @@ static void vcc_seq_stop(struct seq_file *seq, void *v) static void *vcc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { v = vcc_walk(seq, 1); - if (v) - (*pos)++; + (*pos)++; return v; } -- cgit v1.2.3 From 1e3f9f073c47bee7c23e77316b07bc12338c5bba Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 23 Jan 2020 10:11:28 +0300 Subject: neigh_stat_seq_next() should increase position index if seq_file .next fuction does not change position index, read after some lseek can generate unexpected output. https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/core/neighbour.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 920784a9b7ff..789a73aa7bd8 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3290,6 +3290,7 @@ static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } + (*pos)++; return NULL; } -- cgit v1.2.3 From a3ea86739f1bc7e121d921842f0f4a8ab1af94d9 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 23 Jan 2020 10:11:35 +0300 Subject: rt_cpu_seq_next should increase position index if seq_file .next fuction does not change position index, read after some lseek can generate unexpected output. https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/ipv4/route.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 87e979f2b74a..e356ea779227 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -271,6 +271,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } + (*pos)++; return NULL; } -- cgit v1.2.3 From 4fc427e0515811250647d44de38d87d7b0e0790f Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 23 Jan 2020 10:12:06 +0300 Subject: ipv6_route_seq_next should increase position index if seq_file .next fuction does not change position index, read after some lseek can generate unexpected output. https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 7bae6a91b487..cfae0a1529a1 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2495,14 +2495,13 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; + ++(*pos); if (!v) goto iter_table; n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next); - if (n) { - ++*pos; + if (n) return n; - } iter_table: ipv6_route_check_sernum(iter); @@ -2510,8 +2509,6 @@ iter_table: r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { - if (v) - ++*pos; return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); -- cgit v1.2.3 From ab658b9fa7a2c467f79eac8b53ea308b8f98113d Mon Sep 17 00:00:00 2001 From: Jiri Wiesner Date: Sat, 18 Jan 2020 13:10:50 +0100 Subject: netfilter: conntrack: sctp: use distinct states for new SCTP connections The netlink notifications triggered by the INIT and INIT_ACK chunks for a tracked SCTP association do not include protocol information for the corresponding connection - SCTP state and verification tags for the original and reply direction are missing. Since the connection tracking implementation allows user space programs to receive notifications about a connection and then create a new connection based on the values received in a notification, it makes sense that INIT and INIT_ACK notifications should contain the SCTP state and verification tags available at the time when a notification is sent. The missing verification tags cause a newly created netfilter connection to fail to verify the tags of SCTP packets when this connection has been created from the values previously received in an INIT or INIT_ACK notification. A PROTOINFO event is cached in sctp_packet() when the state of a connection changes. The CLOSED and COOKIE_WAIT state will be used for connections that have seen an INIT and INIT_ACK chunk, respectively. The distinct states will cause a connection state change in sctp_packet(). Signed-off-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_sctp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 0399ae8f1188..4f897b14b606 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -114,7 +114,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, +/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, @@ -130,7 +130,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { /* REPLY */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ /* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, +/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, /* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, @@ -316,7 +316,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } - ct->proto.sctp.state = new_state; + ct->proto.sctp.state = SCTP_CONNTRACK_NONE; } return true; -- cgit v1.2.3 From c83de17dd6308fb74696923e5245de0e3c427206 Mon Sep 17 00:00:00 2001 From: wenxu Date: Sun, 19 Jan 2020 13:18:30 +0800 Subject: netfilter: nf_tables_offload: fix check the chain offload flag In the nft_indr_block_cb the chain should check the flag with NFT_CHAIN_HW_OFFLOAD. Fixes: 9a32669fecfb ("netfilter: nf_tables_offload: support indr block call") Signed-off-by: wenxu Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index a9ea29afb09f..2bb28483af22 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -564,7 +564,7 @@ static void nft_indr_block_cb(struct net_device *dev, mutex_lock(&net->nft.commit_mutex); chain = __nft_offload_get_chain(dev); - if (chain) { + if (chain && chain->flags & NFT_CHAIN_HW_OFFLOAD) { struct nft_base_chain *basechain; basechain = nft_base_chain(chain); -- cgit v1.2.3 From 826035498ec14b77b62a44f0cb6b94d45530db6f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 21 Jan 2020 16:07:00 +0100 Subject: netfilter: nf_tables: add __nft_chain_type_get() This new helper function validates that unknown family and chain type coming from userspace do not trigger an out-of-bound array access. Bail out in case __nft_chain_type_get() returns NULL from nft_chain_parse_hook(). Fixes: 9370761c56b6 ("netfilter: nf_tables: convert built-in tables/chains to chain types") Reported-by: syzbot+156a04714799b1d480bc@syzkaller.appspotmail.com Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 65f51a2e9c2a..4aa01c1253b1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -552,15 +552,28 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table) static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; +static const struct nft_chain_type * +__nft_chain_type_get(u8 family, enum nft_chain_types type) +{ + if (family >= NFPROTO_NUMPROTO || + type >= NFT_CHAIN_T_MAX) + return NULL; + + return chain_type[family][type]; +} + static const struct nft_chain_type * __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) { + const struct nft_chain_type *type; int i; for (i = 0; i < NFT_CHAIN_T_MAX; i++) { - if (chain_type[family][i] != NULL && - !nla_strcmp(nla, chain_type[family][i]->name)) - return chain_type[family][i]; + type = __nft_chain_type_get(family, i); + if (!type) + continue; + if (!nla_strcmp(nla, type->name)) + return type; } return NULL; } @@ -1162,11 +1175,8 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) void nft_register_chain_type(const struct nft_chain_type *ctype) { - if (WARN_ON(ctype->family >= NFPROTO_NUMPROTO)) - return; - nfnl_lock(NFNL_SUBSYS_NFTABLES); - if (WARN_ON(chain_type[ctype->family][ctype->type] != NULL)) { + if (WARN_ON(__nft_chain_type_get(ctype->family, ctype->type))) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); return; } @@ -1768,7 +1778,10 @@ static int nft_chain_parse_hook(struct net *net, hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); - type = chain_type[family][NFT_CHAIN_T_DEFAULT]; + type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT); + if (!type) + return -EOPNOTSUPP; + if (nla[NFTA_CHAIN_TYPE]) { type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], family, autoload); -- cgit v1.2.3 From eb014de4fd418de1a277913cba244e47274fe392 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 21 Jan 2020 16:48:03 +0100 Subject: netfilter: nf_tables: autoload modules from the abort path This patch introduces a list of pending module requests. This new module list is composed of nft_module_request objects that contain the module name and one status field that tells if the module has been already loaded (the 'done' field). In the first pass, from the preparation phase, the netlink command finds that a module is missing on this list. Then, a module request is allocated and added to this list and nft_request_module() returns -EAGAIN. This triggers the abort path with the autoload parameter set on from nfnetlink, request_module() is called and the module request enters the 'done' state. Since the mutex is released when loading modules from the abort phase, the module list is zapped so this is iteration occurs over a local list. Therefore, the request_module() calls happen when object lists are in consistent state (after fulling aborting the transaction) and the commit list is empty. On the second pass, the netlink command will find that it already tried to load the module, so it does not request it again and nft_request_module() returns 0. Then, there is a look up to find the object that the command was missing. If the module was successfully loaded, the command proceeds normally since it finds the missing object in place, otherwise -ENOENT is reported to userspace. This patch also updates nfnetlink to include the reason to enter the abort phase, which is required for this new autoload module rationale. Fixes: ec7470b834fe ("netfilter: nf_tables: store transaction list locally while requesting module") Reported-by: syzbot+29125d208b3dae9a7019@syzkaller.appspotmail.com Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 2 +- include/net/netns/nftables.h | 1 + net/netfilter/nf_tables_api.c | 126 ++++++++++++++++++++++++------------ net/netfilter/nfnetlink.c | 6 +- 4 files changed, 91 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index cf09ab37b45b..851425c3178f 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -31,7 +31,7 @@ struct nfnetlink_subsystem { const struct nfnl_callback *cb; /* callback for individual types */ struct module *owner; int (*commit)(struct net *net, struct sk_buff *skb); - int (*abort)(struct net *net, struct sk_buff *skb); + int (*abort)(struct net *net, struct sk_buff *skb, bool autoload); void (*cleanup)(struct net *net); bool (*valid_genid)(struct net *net, u32 genid); }; diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 286fd960896f..a1a8d45adb42 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -7,6 +7,7 @@ struct netns_nftables { struct list_head tables; struct list_head commit_list; + struct list_head module_list; struct mutex commit_mutex; unsigned int base_seq; u8 gencursor; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4aa01c1253b1..7e63b481cc86 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -578,35 +578,45 @@ __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) return NULL; } -/* - * Loading a module requires dropping mutex that guards the transaction. - * A different client might race to start a new transaction meanwhile. Zap the - * list of pending transaction and then restore it once the mutex is grabbed - * again. Users of this function return EAGAIN which implicitly triggers the - * transaction abort path to clean up the list of pending transactions. - */ +struct nft_module_request { + struct list_head list; + char module[MODULE_NAME_LEN]; + bool done; +}; + #ifdef CONFIG_MODULES -static void nft_request_module(struct net *net, const char *fmt, ...) +static int nft_request_module(struct net *net, const char *fmt, ...) { char module_name[MODULE_NAME_LEN]; - LIST_HEAD(commit_list); + struct nft_module_request *req; va_list args; int ret; - list_splice_init(&net->nft.commit_list, &commit_list); - va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); if (ret >= MODULE_NAME_LEN) - return; + return 0; - mutex_unlock(&net->nft.commit_mutex); - request_module("%s", module_name); - mutex_lock(&net->nft.commit_mutex); + list_for_each_entry(req, &net->nft.module_list, list) { + if (!strcmp(req->module, module_name)) { + if (req->done) + return 0; - WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); - list_splice(&commit_list, &net->nft.commit_list); + /* A request to load this module already exists. */ + return -EAGAIN; + } + } + + req = kmalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->done = false; + strlcpy(req->module, module_name, MODULE_NAME_LEN); + list_add_tail(&req->list, &net->nft.module_list); + + return -EAGAIN; } #endif @@ -630,10 +640,9 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (autoload) { - nft_request_module(net, "nft-chain-%u-%.*s", family, - nla_len(nla), (const char *)nla_data(nla)); - type = __nf_tables_chain_type_lookup(nla, family); - if (type != NULL) + if (nft_request_module(net, "nft-chain-%u-%.*s", family, + nla_len(nla), + (const char *)nla_data(nla)) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -2341,9 +2350,8 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, static int nft_expr_type_request_module(struct net *net, u8 family, struct nlattr *nla) { - nft_request_module(net, "nft-expr-%u-%.*s", family, - nla_len(nla), (char *)nla_data(nla)); - if (__nft_expr_type_get(family, nla)) + if (nft_request_module(net, "nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)) == -EAGAIN) return -EAGAIN; return 0; @@ -2369,9 +2377,9 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (nft_expr_type_request_module(net, family, nla) == -EAGAIN) return ERR_PTR(-EAGAIN); - nft_request_module(net, "nft-expr-%.*s", - nla_len(nla), (char *)nla_data(nla)); - if (__nft_expr_type_get(family, nla)) + if (nft_request_module(net, "nft-expr-%.*s", + nla_len(nla), + (char *)nla_data(nla)) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -2462,9 +2470,10 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, err = PTR_ERR(ops); #ifdef CONFIG_MODULES if (err == -EAGAIN) - nft_expr_type_request_module(ctx->net, - ctx->family, - tb[NFTA_EXPR_NAME]); + if (nft_expr_type_request_module(ctx->net, + ctx->family, + tb[NFTA_EXPR_NAME]) != -EAGAIN) + err = -ENOENT; #endif goto err1; } @@ -3301,8 +3310,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (list_empty(&nf_tables_set_types)) { - nft_request_module(ctx->net, "nft-set"); - if (!list_empty(&nf_tables_set_types)) + if (nft_request_module(ctx->net, "nft-set") == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -5428,8 +5436,7 @@ nft_obj_type_get(struct net *net, u32 objtype) lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nft-obj-%u", objtype); - if (__nft_obj_type_get(objtype)) + if (nft_request_module(net, "nft-obj-%u", objtype) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -6002,8 +6009,7 @@ nft_flowtable_type_get(struct net *net, u8 family) lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nf-flowtable-%u", family); - if (__nft_flowtable_type_get(family)) + if (nft_request_module(net, "nf-flowtable-%u", family) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -7005,6 +7011,18 @@ static void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nf_tables_module_autoload_cleanup(struct net *net) +{ + struct nft_module_request *req, *next; + + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); + list_for_each_entry_safe(req, next, &net->nft.module_list, list) { + WARN_ON_ONCE(!req->done); + list_del(&req->list); + kfree(req); + } +} + static void nf_tables_commit_release(struct net *net) { struct nft_trans *trans; @@ -7017,6 +7035,7 @@ static void nf_tables_commit_release(struct net *net) * to prevent expensive synchronize_rcu() in commit phase. */ if (list_empty(&net->nft.commit_list)) { + nf_tables_module_autoload_cleanup(net); mutex_unlock(&net->nft.commit_mutex); return; } @@ -7031,6 +7050,7 @@ static void nf_tables_commit_release(struct net *net) list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list); spin_unlock(&nf_tables_destroy_list_lock); + nf_tables_module_autoload_cleanup(net); mutex_unlock(&net->nft.commit_mutex); schedule_work(&trans_destroy_work); @@ -7222,6 +7242,26 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return 0; } +static void nf_tables_module_autoload(struct net *net) +{ + struct nft_module_request *req, *next; + LIST_HEAD(module_list); + + list_splice_init(&net->nft.module_list, &module_list); + mutex_unlock(&net->nft.commit_mutex); + list_for_each_entry_safe(req, next, &module_list, list) { + if (req->done) { + list_del(&req->list); + kfree(req); + } else { + request_module("%s", req->module); + req->done = true; + } + } + mutex_lock(&net->nft.commit_mutex); + list_splice(&module_list, &net->nft.module_list); +} + static void nf_tables_abort_release(struct nft_trans *trans) { switch (trans->msg_type) { @@ -7251,7 +7291,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } -static int __nf_tables_abort(struct net *net) +static int __nf_tables_abort(struct net *net, bool autoload) { struct nft_trans *trans, *next; struct nft_trans_elem *te; @@ -7373,6 +7413,11 @@ static int __nf_tables_abort(struct net *net) nf_tables_abort_release(trans); } + if (autoload) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + return 0; } @@ -7381,9 +7426,9 @@ static void nf_tables_cleanup(struct net *net) nft_validate_state_update(net, NFT_VALIDATE_SKIP); } -static int nf_tables_abort(struct net *net, struct sk_buff *skb) +static int nf_tables_abort(struct net *net, struct sk_buff *skb, bool autoload) { - int ret = __nf_tables_abort(net); + int ret = __nf_tables_abort(net, autoload); mutex_unlock(&net->nft.commit_mutex); @@ -7978,6 +8023,7 @@ static int __net_init nf_tables_init_net(struct net *net) { INIT_LIST_HEAD(&net->nft.tables); INIT_LIST_HEAD(&net->nft.commit_list); + INIT_LIST_HEAD(&net->nft.module_list); mutex_init(&net->nft.commit_mutex); net->nft.base_seq = 1; net->nft.validate_state = NFT_VALIDATE_SKIP; @@ -7989,7 +8035,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) { mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) - __nf_tables_abort(net); + __nf_tables_abort(net, false); __nft_release_tables(net); mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4abbb452cf6c..99127e2d95a8 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -476,7 +476,7 @@ ack: } done: if (status & NFNL_BATCH_REPLAY) { - ss->abort(net, oskb); + ss->abort(net, oskb, true); nfnl_err_reset(&err_list); kfree_skb(skb); module_put(ss->owner); @@ -487,11 +487,11 @@ done: status |= NFNL_BATCH_REPLAY; goto done; } else if (err) { - ss->abort(net, oskb); + ss->abort(net, oskb, false); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); } } else { - ss->abort(net, oskb); + ss->abort(net, oskb, false); } if (ss->cleanup) ss->cleanup(net); -- cgit v1.2.3 From 189c9b1e94539b11c80636bc13e9cf47529e7bba Mon Sep 17 00:00:00 2001 From: Praveen Chaudhary Date: Thu, 23 Jan 2020 12:33:28 -0800 Subject: net: Fix skb->csum update in inet_proto_csum_replace16(). skb->csum is updated incorrectly, when manipulation for NF_NAT_MANIP_SRC\DST is done on IPV6 packet. Fix: There is no need to update skb->csum in inet_proto_csum_replace16(), because update in two fields a.) IPv6 src/dst address and b.) L4 header checksum cancels each other for skb->csum calculation. Whereas inet_proto_csum_replace4 function needs to update skb->csum, because update in 3 fields a.) IPv4 src/dst address, b.) IPv4 Header checksum and c.) L4 header checksum results in same diff as L4 Header checksum for skb->csum calculation. [ pablo@netfilter.org: a few comestic documentation edits ] Signed-off-by: Praveen Chaudhary Signed-off-by: Zhenggen Xu Signed-off-by: Andy Stracner Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/core/utils.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/utils.c b/net/core/utils.c index 6b6e51db9f3b..1f31a39236d5 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -438,6 +438,23 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, } EXPORT_SYMBOL(inet_proto_csum_replace4); +/** + * inet_proto_csum_replace16 - update layer 4 header checksum field + * @sum: Layer 4 header checksum field + * @skb: sk_buff for the packet + * @from: old IPv6 address + * @to: new IPv6 address + * @pseudohdr: True if layer 4 header checksum includes pseudoheader + * + * Update layer 4 header as per the update in IPv6 src/dst address. + * + * There is no need to update skb->csum in this function, because update in two + * fields a.) IPv6 src/dst address and b.) L4 header checksum cancels each other + * for skb->csum calculation. Whereas inet_proto_csum_replace4 function needs to + * update skb->csum, because update in 3 fields a.) IPv4 src/dst address, + * b.) IPv4 Header checksum and c.) L4 header checksum results in same diff as + * L4 Header checksum for skb->csum calculation. + */ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr) @@ -449,9 +466,6 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_PARTIAL) { *sum = csum_fold(csum_partial(diff, sizeof(diff), ~csum_unfold(*sum))); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_partial(diff, sizeof(diff), - ~skb->csum); } else if (pseudohdr) *sum = ~csum_fold(csum_partial(diff, sizeof(diff), csum_unfold(*sum))); -- cgit v1.2.3