From 8f4d19aacb64f2b3d65c8cf7974c3d153224b5f2 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Wed, 30 May 2018 10:29:31 +0800 Subject: netfilter: xt_CT: Reject the non-null terminated string from user space The helper and timeout strings are from user-space, we need to make sure they are null terminated. If not, evil user could make kernel read the unexpected memory, even print it when fail to find by the following codes. pr_info_ratelimited("No such helper \"%s\"\n", helper_name); Signed-off-by: Gao Feng Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_CT.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 8790190c6feb..03b9a50ec93b 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -245,12 +245,22 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, } if (info->helper[0]) { + if (strnlen(info->helper, sizeof(info->helper)) == sizeof(info->helper)) { + ret = -ENAMETOOLONG; + goto err3; + } + ret = xt_ct_set_helper(ct, info->helper, par); if (ret < 0) goto err3; } if (info->timeout[0]) { + if (strnlen(info->timeout, sizeof(info->timeout)) == sizeof(info->timeout)) { + ret = -ENAMETOOLONG; + goto err4; + } + ret = xt_ct_set_timeout(ct, par, info->timeout); if (ret < 0) goto err4; -- cgit v1.2.3 From 9c7f96fd77b0dbe1fe7ed1f9c462c45dc48a1076 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Thu, 31 May 2018 19:53:33 +0300 Subject: netfilter: nf_tables: check msg_type before nft_trans_set(trans) The patch moves the "trans->msg_type == NFT_MSG_NEWSET" check before using nft_trans_set(trans). Otherwise we can get out of bounds read. For example, KASAN reported the one when running 0001_cache_handling_0 nft test. In this case "trans->msg_type" was NFT_MSG_NEWTABLE: [75517.177808] BUG: KASAN: slab-out-of-bounds in nft_set_lookup_global+0x22f/0x270 [nf_tables] [75517.279094] Read of size 8 at addr ffff881bdb643fc8 by task nft/7356 ... [75517.375605] CPU: 26 PID: 7356 Comm: nft Tainted: G E 4.17.0-rc7.1.x86_64 #1 [75517.489587] Hardware name: Oracle Corporation SUN SERVER X4-2 [75517.618129] Call Trace: [75517.648821] dump_stack+0xd1/0x13b [75517.691040] ? show_regs_print_info+0x5/0x5 [75517.742519] ? kmsg_dump_rewind_nolock+0xf5/0xf5 [75517.799300] ? lock_acquire+0x143/0x310 [75517.846738] print_address_description+0x85/0x3a0 [75517.904547] kasan_report+0x18d/0x4b0 [75517.949892] ? nft_set_lookup_global+0x22f/0x270 [nf_tables] [75518.019153] ? nft_set_lookup_global+0x22f/0x270 [nf_tables] [75518.088420] ? nft_set_lookup_global+0x22f/0x270 [nf_tables] [75518.157689] nft_set_lookup_global+0x22f/0x270 [nf_tables] [75518.224869] nf_tables_newsetelem+0x1a5/0x5d0 [nf_tables] [75518.291024] ? nft_add_set_elem+0x2280/0x2280 [nf_tables] [75518.357154] ? nla_parse+0x1a5/0x300 [75518.401455] ? kasan_kmalloc+0xa6/0xd0 [75518.447842] nfnetlink_rcv+0xc43/0x1bdf [nfnetlink] [75518.507743] ? nfnetlink_rcv+0x7a5/0x1bdf [nfnetlink] [75518.569745] ? nfnl_err_reset+0x3c0/0x3c0 [nfnetlink] [75518.631711] ? lock_acquire+0x143/0x310 [75518.679133] ? netlink_deliver_tap+0x9b/0x1070 [75518.733840] ? kasan_unpoison_shadow+0x31/0x40 [75518.788542] netlink_unicast+0x45d/0x680 [75518.837111] ? __isolate_free_page+0x890/0x890 [75518.891913] ? netlink_attachskb+0x6b0/0x6b0 [75518.944542] netlink_sendmsg+0x6fa/0xd30 [75518.993107] ? netlink_unicast+0x680/0x680 [75519.043758] ? netlink_unicast+0x680/0x680 [75519.094402] sock_sendmsg+0xd9/0x160 [75519.138810] ___sys_sendmsg+0x64d/0x980 [75519.186234] ? copy_msghdr_from_user+0x350/0x350 [75519.243118] ? lock_downgrade+0x650/0x650 [75519.292738] ? do_raw_spin_unlock+0x5d/0x250 [75519.345456] ? _raw_spin_unlock+0x24/0x30 [75519.395065] ? __handle_mm_fault+0xbde/0x3410 [75519.448830] ? sock_setsockopt+0x3d2/0x1940 [75519.500516] ? __lock_acquire.isra.25+0xdc/0x19d0 [75519.558448] ? lock_downgrade+0x650/0x650 [75519.608057] ? __audit_syscall_entry+0x317/0x720 [75519.664960] ? __fget_light+0x58/0x250 [75519.711325] ? __sys_sendmsg+0xde/0x170 [75519.758850] __sys_sendmsg+0xde/0x170 [75519.804193] ? __ia32_sys_shutdown+0x90/0x90 [75519.856725] ? syscall_trace_enter+0x897/0x10e0 [75519.912354] ? trace_event_raw_event_sys_enter+0x920/0x920 [75519.979432] ? __audit_syscall_entry+0x720/0x720 [75520.036118] do_syscall_64+0xa3/0x3d0 [75520.081248] ? prepare_exit_to_usermode+0x47/0x1d0 [75520.139904] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [75520.201680] RIP: 0033:0x7fc153320ba0 [75520.245772] RSP: 002b:00007ffe294c3638 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [75520.337708] RAX: ffffffffffffffda RBX: 00007ffe294c4820 RCX: 00007fc153320ba0 [75520.424547] RDX: 0000000000000000 RSI: 00007ffe294c46b0 RDI: 0000000000000003 [75520.511386] RBP: 00007ffe294c47b0 R08: 0000000000000004 R09: 0000000002114090 [75520.598225] R10: 00007ffe294c30a0 R11: 0000000000000246 R12: 00007ffe294c3660 [75520.684961] R13: 0000000000000001 R14: 00007ffe294c3650 R15: 0000000000000001 [75520.790946] Allocated by task 7356: [75520.833994] kasan_kmalloc+0xa6/0xd0 [75520.878088] __kmalloc+0x189/0x450 [75520.920107] nft_trans_alloc_gfp+0x20/0x190 [nf_tables] [75520.983961] nf_tables_newtable+0xcd0/0x1bd0 [nf_tables] [75521.048857] nfnetlink_rcv+0xc43/0x1bdf [nfnetlink] [75521.108655] netlink_unicast+0x45d/0x680 [75521.157013] netlink_sendmsg+0x6fa/0xd30 [75521.205271] sock_sendmsg+0xd9/0x160 [75521.249365] ___sys_sendmsg+0x64d/0x980 [75521.296686] __sys_sendmsg+0xde/0x170 [75521.341822] do_syscall_64+0xa3/0x3d0 [75521.386957] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [75521.467867] Freed by task 23454: [75521.507804] __kasan_slab_free+0x132/0x180 [75521.558137] kfree+0x14d/0x4d0 [75521.596005] free_rt_sched_group+0x153/0x280 [75521.648410] sched_autogroup_create_attach+0x19a/0x520 [75521.711330] ksys_setsid+0x2ba/0x400 [75521.755529] __ia32_sys_setsid+0xa/0x10 [75521.802850] do_syscall_64+0xa3/0x3d0 [75521.848090] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [75521.929000] The buggy address belongs to the object at ffff881bdb643f80 which belongs to the cache kmalloc-96 of size 96 [75522.079797] The buggy address is located 72 bytes inside of 96-byte region [ffff881bdb643f80, ffff881bdb643fe0) [75522.221234] The buggy address belongs to the page: [75522.280100] page:ffffea006f6d90c0 count:1 mapcount:0 mapping:0000000000000000 index:0x0 [75522.377443] flags: 0x2fffff80000100(slab) [75522.426956] raw: 002fffff80000100 0000000000000000 0000000000000000 0000000180200020 [75522.521275] raw: ffffea006e6fafc0 0000000c0000000c ffff881bf180f400 0000000000000000 [75522.615601] page dumped because: kasan: bad access detected Fixes: 37a9cc525525 ("netfilter: nf_tables: add generation mask to sets") Signed-off-by: Alexey Kodanev Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 501e48a7965b..8d8dfe417014 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2728,12 +2728,13 @@ static struct nft_set *nf_tables_set_lookup_byid(const struct net *net, u32 id = ntohl(nla_get_be32(nla)); list_for_each_entry(trans, &net->nft.commit_list, list) { - struct nft_set *set = nft_trans_set(trans); + if (trans->msg_type == NFT_MSG_NEWSET) { + struct nft_set *set = nft_trans_set(trans); - if (trans->msg_type == NFT_MSG_NEWSET && - id == nft_trans_set_id(trans) && - nft_active_genmask(set, genmask)) - return set; + if (id == nft_trans_set_id(trans) && + nft_active_genmask(set, genmask)) + return set; + } } return ERR_PTR(-ENOENT); } -- cgit v1.2.3 From 31875d4970baa02e08b719fdfea6f43e9e2f7e77 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 24 May 2018 23:40:12 +0300 Subject: ipvs: register conntrack hooks for ftp ip_vs_ftp requires conntrack modules for mangling of FTP command responses in passive mode. Make sure the conntrack hooks are registered when real servers use NAT method in FTP virtual service. The hooks will be registered while the service is present. Fixes: 0c66dc1ea3f0 ("netfilter: conntrack: register hooks in netns when needed by ruleset") Signed-off-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 30 ++++++++++++++++++++++++++++++ net/netfilter/ipvs/ip_vs_ctl.c | 4 ++++ 2 files changed, 34 insertions(+) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index eb0bec043c96..ae72d9057eda 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -643,6 +643,7 @@ struct ip_vs_service { /* alternate persistence engine */ struct ip_vs_pe __rcu *pe; + int conntrack_afmask; struct rcu_head rcu_head; }; @@ -1620,6 +1621,35 @@ static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp, return false; } +static inline int ip_vs_register_conntrack(struct ip_vs_service *svc) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + int afmask = (svc->af == AF_INET6) ? 2 : 1; + int ret = 0; + + if (!(svc->conntrack_afmask & afmask)) { + ret = nf_ct_netns_get(svc->ipvs->net, svc->af); + if (ret >= 0) + svc->conntrack_afmask |= afmask; + } + return ret; +#else + return 0; +#endif +} + +static inline void ip_vs_unregister_conntrack(struct ip_vs_service *svc) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + int afmask = (svc->af == AF_INET6) ? 2 : 1; + + if (svc->conntrack_afmask & afmask) { + nf_ct_netns_put(svc->ipvs->net, svc->af); + svc->conntrack_afmask &= ~afmask; + } +#endif +} + static inline int ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) { diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 3ecca0616d8c..ee0ab278f1f1 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -835,6 +835,9 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, * For now only for NAT! */ ip_vs_rs_hash(ipvs, dest); + /* FTP-NAT requires conntrack for mangling */ + if (svc->port == FTPPORT) + ip_vs_register_conntrack(svc); } atomic_set(&dest->conn_flags, conn_flags); @@ -1458,6 +1461,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) */ static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) { + ip_vs_unregister_conntrack(svc); /* Hold svc to avoid double release from dest_trash */ atomic_inc(&svc->refcnt); /* -- cgit v1.2.3 From 0cafa3926f0d8d72a2a814843f4db8cfef66d4ce Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 1 Jun 2018 19:12:28 +0900 Subject: netfilter: nft_reject_bridge: fix skb allocation size in nft_reject_br_send_v6_unreach In order to allocate icmpv6 skb, sizeof(struct ipv6hdr) should be used. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nft_reject_bridge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index eaf05de37f75..6de981270566 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -261,7 +261,7 @@ static void nft_reject_br_send_v6_unreach(struct net *net, if (!reject6_br_csum_ok(oldskb, hook)) return; - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmp6hdr) + + nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) + LL_MAX_HEADER + len, GFP_ATOMIC); if (!nskb) return; -- cgit v1.2.3 From 6fcc02e3c2bddeaf628fde3c6a5ab3216d45691a Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 2 Jun 2018 21:52:15 +0300 Subject: ipvs: fix check on xmit to non-local addresses There is mistake in the rt_mode_allow_non_local assignment. It should be used to check if sending to non-local addresses is allowed, now it checks if local addresses are allowed. As local addresses are allowed for most of the cases, the only places that are affected are for traffic to transparent cache servers: - bypass connections when cache server is not available - related ICMP in FORWARD hook when sent to cache server Fixes: 4a4739d56b00 ("ipvs: Pull out crosses_local_route_boundary logic") Signed-off-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 4527921b1c3a..8f7fff774283 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -168,7 +168,7 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, bool new_rt_is_local) { bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL); - bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL); + bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL); bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR); bool source_is_loopback; bool old_rt_is_local; -- cgit v1.2.3 From 9e8c8dabb78e886ace989729e763d28c76f5169e Mon Sep 17 00:00:00 2001 From: Alin Nastac Date: Wed, 30 May 2018 15:19:36 +0200 Subject: netfilter: ebtables: fix compat entry padding On arm64, ebt_entry_{match,watcher,target} structs are 40 bytes long while on 32-bit arm these structs have a size of 36 bytes. COMPAT_XT_ALIGN() macro cannot be used here to determine the necessary padding for the CONFIG_COMPAT because it imposes an 8-byte boundary alignment, condition that is not found in 32-bit ebtables application. Signed-off-by: Alin Nastac Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 6ba639f6c51d..5f459c8b7937 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1610,16 +1610,16 @@ struct compat_ebt_entry_mwt { compat_uptr_t ptr; } u; compat_uint_t match_size; - compat_uint_t data[0]; + compat_uint_t data[0] __attribute__ ((aligned (__alignof__(struct compat_ebt_replace)))); }; /* account for possible padding between match_size and ->data */ static int ebt_compat_entry_padsize(void) { - BUILD_BUG_ON(XT_ALIGN(sizeof(struct ebt_entry_match)) < - COMPAT_XT_ALIGN(sizeof(struct compat_ebt_entry_mwt))); - return (int) XT_ALIGN(sizeof(struct ebt_entry_match)) - - COMPAT_XT_ALIGN(sizeof(struct compat_ebt_entry_mwt)); + BUILD_BUG_ON(sizeof(struct ebt_entry_match) < + sizeof(struct compat_ebt_entry_mwt)); + return (int) sizeof(struct ebt_entry_match) - + sizeof(struct compat_ebt_entry_mwt); } static int ebt_compat_match_offset(const struct xt_match *match, -- cgit v1.2.3 From 9dcceb1378b6d66633f613805b2d5a22af4d5383 Mon Sep 17 00:00:00 2001 From: Serhey Popovych Date: Tue, 5 Jun 2018 11:46:13 +0200 Subject: netfilter: xt_set: Check hook mask correctly Inserting rule before one with SET target we get error with warning in dmesg(1) output: # iptables -A FORWARD -t mangle -j SET --map-set test src --map-prio # iptables -I FORWARD 1 -t mangle -j ACCEPT iptables: Invalid argument. Run `dmesg' for more information. # dmesg |tail -n1 [268578.026643] mapping of prio or/and queue is allowed only from \ OUTPUT/FORWARD/POSTROUTING chains Rather than checking for supported hook bits for SET target check for unsupported one as done in all rest of matches and targets. Signed-off-by: Serhey Popovych Signed-off-by: Jozsef Kadlecsik --- net/netfilter/xt_set.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 6f4c5217d835..07af7dbf7a30 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -470,7 +470,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) } if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && - !(par->hook_mask & (1 << NF_INET_FORWARD | + (par->hook_mask & ~(1 << NF_INET_FORWARD | 1 << NF_INET_LOCAL_OUT | 1 << NF_INET_POST_ROUTING))) { pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); -- cgit v1.2.3 From 30a2e107108c66cbcb7776b58cbcd7db223a1cc9 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 5 Jun 2018 11:53:35 +0200 Subject: netfilter: ipset: Limit max timeout value Due to the negative value condition in msecs_to_jiffies(), the real max possible timeout value must be set to (UINT_MAX >> 1)/MSEC_PER_SEC. Neutron Soutmun proposed the proper fix, but an insufficient one was applied, see https://patchwork.ozlabs.org/patch/400405/. Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set_timeout.h | 10 ++++++---- net/netfilter/xt_set.c | 8 ++++---- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h index 7ad8ddf9ca8a..8ce271e187b6 100644 --- a/include/linux/netfilter/ipset/ip_set_timeout.h +++ b/include/linux/netfilter/ipset/ip_set_timeout.h @@ -23,6 +23,9 @@ /* Set is defined with timeout support: timeout value may be 0 */ #define IPSET_NO_TIMEOUT UINT_MAX +/* Max timeout value, see msecs_to_jiffies() in jiffies.h */ +#define IPSET_MAX_TIMEOUT (UINT_MAX >> 1)/MSEC_PER_SEC + #define ip_set_adt_opt_timeout(opt, set) \ ((opt)->ext.timeout != IPSET_NO_TIMEOUT ? (opt)->ext.timeout : (set)->timeout) @@ -32,11 +35,10 @@ ip_set_timeout_uget(struct nlattr *tb) unsigned int timeout = ip_set_get_h32(tb); /* Normalize to fit into jiffies */ - if (timeout > UINT_MAX/MSEC_PER_SEC) - timeout = UINT_MAX/MSEC_PER_SEC; + if (timeout > IPSET_MAX_TIMEOUT) + timeout = IPSET_MAX_TIMEOUT; - /* Userspace supplied TIMEOUT parameter: adjust crazy size */ - return timeout == IPSET_NO_TIMEOUT ? IPSET_NO_TIMEOUT - 1 : timeout; + return timeout; } static inline bool diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 07af7dbf7a30..bf2890b13212 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -372,8 +372,8 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && - add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC) - add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC; + add_opt.ext.timeout > IPSET_MAX_TIMEOUT) + add_opt.ext.timeout = IPSET_MAX_TIMEOUT; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) @@ -407,8 +407,8 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && - add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC) - add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC; + add_opt.ext.timeout > IPSET_MAX_TIMEOUT) + add_opt.ext.timeout = IPSET_MAX_TIMEOUT; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) -- cgit v1.2.3 From cbdebe481a14b42c45aa9f4ceb5ff19b55de2c57 Mon Sep 17 00:00:00 2001 From: Florent Fourcot Date: Mon, 4 Jun 2018 16:51:19 +0200 Subject: netfilter: ipset: forbid family for hash:mac sets Userspace `ipset` command forbids family option for hash:mac type: ipset create test hash:mac family inet4 ipset v6.30: Unknown argument: `family' However, this check is not done in kernel itself. When someone use external netlink applications (pyroute2 python library for example), one can create hash:mac with invalid family and inconsistant results from userspace (`ipset` command cannot read set content anymore). This patch enforce the logic in kernel, and forbids insertion of hash:mac with a family set. Since IP_SET_PROTO_UNDEF is defined only for hash:mac, this patch has no impact on other hash:* sets Signed-off-by: Florent Fourcot Signed-off-by: Victorien Molle Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_gen.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index bbad940c0137..8a33dac4e805 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1234,7 +1234,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, pr_debug("Create set %s with family %s\n", set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); -#ifndef IP_SET_PROTO_UNDEF +#ifdef IP_SET_PROTO_UNDEF + if (set->family != NFPROTO_UNSPEC) + return -IPSET_ERR_INVALID_FAMILY; +#else if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) return -IPSET_ERR_INVALID_FAMILY; #endif -- cgit v1.2.3 From 11ff7288beb2b7da889a014aff0a7b80bf8efcf3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Jun 2018 12:14:56 +0200 Subject: netfilter: ebtables: reject non-bridge targets the ebtables evaluation loop expects targets to return positive values (jumps), or negative values (absolute verdicts). This is completely different from what xtables does. In xtables, targets are expected to return the standard netfilter verdicts, i.e. NF_DROP, NF_ACCEPT, etc. ebtables will consider these as jumps. Therefore reject any target found due to unspec fallback. v2: also reject watchers. ebtables ignores their return value, so a target that assumes skb ownership (and returns NF_STOLEN) causes use-after-free. The only watchers in the 'ebtables' front-end are log and nflog; both have AF_BRIDGE specific wrappers on kernel side. Reported-by: syzbot+2b43f681169a2a0d306a@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 5f459c8b7937..08a65e4a77d0 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -396,6 +396,12 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par, watcher = xt_request_find_target(NFPROTO_BRIDGE, w->u.name, 0); if (IS_ERR(watcher)) return PTR_ERR(watcher); + + if (watcher->family != NFPROTO_BRIDGE) { + module_put(watcher->me); + return -ENOENT; + } + w->u.watcher = watcher; par->target = watcher; @@ -715,6 +721,13 @@ ebt_check_entry(struct ebt_entry *e, struct net *net, goto cleanup_watchers; } + /* Reject UNSPEC, xtables verdicts/return values are incompatible */ + if (target->family != NFPROTO_BRIDGE) { + module_put(target->me); + ret = -ENOENT; + goto cleanup_watchers; + } + t->u.target = target; if (t->u.target == &ebt_standard_target) { if (gap < sizeof(struct ebt_standard_target)) { -- cgit v1.2.3 From 82e20b44477ffe90a5866caa209ecc9df818c6a1 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 7 Jun 2018 02:05:12 +0900 Subject: netfilter: nft_set_rbtree: fix parameter of __nft_rbtree_lookup() The parameter this doesn't have a flags value. so that it can't be used by nft_rbtree_interval_end(). test commands: %nft add table ip filter %nft add set ip filter s { type ipv4_addr \; flags interval \; } %nft add element ip filter s {0-1} %nft add element ip filter s {2-10} %nft add chain ip filter input { type filter hook input priority 0\; } %nft add rule ip filter input ip saddr @s Splat looks like: [ 246.752502] BUG: KASAN: slab-out-of-bounds in __nft_rbtree_lookup+0x677/0x6a0 [nft_set_rbtree] [ 246.752502] Read of size 1 at addr ffff88010d9efa47 by task http/1092 [ 246.752502] CPU: 1 PID: 1092 Comm: http Not tainted 4.17.0-rc6+ #185 [ 246.752502] Call Trace: [ 246.752502] [ 246.752502] dump_stack+0x74/0xbb [ 246.752502] ? __nft_rbtree_lookup+0x677/0x6a0 [nft_set_rbtree] [ 246.752502] print_address_description+0xc7/0x290 [ 246.752502] ? __nft_rbtree_lookup+0x677/0x6a0 [nft_set_rbtree] [ 246.752502] kasan_report+0x22c/0x350 [ 246.752502] __nft_rbtree_lookup+0x677/0x6a0 [nft_set_rbtree] [ 246.752502] nft_rbtree_lookup+0xc9/0x2d2 [nft_set_rbtree] [ 246.752502] ? sched_clock_cpu+0x144/0x180 [ 246.752502] nft_lookup_eval+0x149/0x3a0 [nf_tables] [ 246.752502] ? __lock_acquire+0xcea/0x4ed0 [ 246.752502] ? nft_lookup_init+0x6b0/0x6b0 [nf_tables] [ 246.752502] nft_do_chain+0x263/0xf50 [nf_tables] [ 246.752502] ? __nft_trace_packet+0x1a0/0x1a0 [nf_tables] [ 246.752502] ? sched_clock_cpu+0x144/0x180 [ ... ] Fixes: f9121355eb6f ("netfilter: nft_set_rbtree: incorrect assumption on lower interval lookups") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index e6f08bc5f359..26fa93b23805 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -65,7 +65,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set parent = rcu_dereference_raw(parent->rb_left); if (interval && nft_rbtree_equal(set, this, interval) && - nft_rbtree_interval_end(this) && + nft_rbtree_interval_end(rbe) && !nft_rbtree_interval_end(interval)) continue; interval = rbe; -- cgit v1.2.3 From c568503ef02030f169c9e19204def610a3510918 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 7 Jun 2018 21:34:43 +0200 Subject: netfilter: x_tables: initialise match/target check parameter struct syzbot reports following splat: BUG: KMSAN: uninit-value in ebt_stp_mt_check+0x24b/0x450 net/bridge/netfilter/ebt_stp.c:162 ebt_stp_mt_check+0x24b/0x450 net/bridge/netfilter/ebt_stp.c:162 xt_check_match+0x1438/0x1650 net/netfilter/x_tables.c:506 ebt_check_match net/bridge/netfilter/ebtables.c:372 [inline] ebt_check_entry net/bridge/netfilter/ebtables.c:702 [inline] The uninitialised access is xt_mtchk_param->nft_compat ... which should be set to 0. Fix it by zeroing the struct beforehand, same for tgchk. ip(6)tables targetinfo uses c99-style initialiser, so no change needed there. Reported-by: syzbot+da4494182233c23a5fcf@syzkaller.appspotmail.com Fixes: 55917a21d0cc0 ("netfilter: x_tables: add context to know if extension runs from nft_compat") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 2 ++ net/ipv4/netfilter/ip_tables.c | 1 + net/ipv6/netfilter/ip6_tables.c | 1 + 3 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 08a65e4a77d0..ead123dab05e 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -700,6 +700,8 @@ ebt_check_entry(struct ebt_entry *e, struct net *net, } i = 0; + memset(&mtpar, 0, sizeof(mtpar)); + memset(&tgpar, 0, sizeof(tgpar)); mtpar.net = tgpar.net = net; mtpar.table = tgpar.table = name; mtpar.entryinfo = tgpar.entryinfo = e; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index e85f35b89c49..f6130704f052 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -531,6 +531,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, return -ENOMEM; j = 0; + memset(&mtpar, 0, sizeof(mtpar)); mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ip; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 97f79dc943d7..685c2168f524 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -551,6 +551,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, return -ENOMEM; j = 0; + memset(&mtpar, 0, sizeof(mtpar)); mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ipv6; -- cgit v1.2.3 From 0975764684487bf3f7a47eef009e750ea41bd514 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 11 Jun 2018 02:02:54 +0300 Subject: ipv6: allow PMTU exceptions to local routes IPVS setups with local client and remote tunnel server need to create exception for the local virtual IP. What we do is to change PMTU from 64KB (on "lo") to 1460 in the common case. Suggested-by: Martin KaFai Lau Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception") Fixes: 7343ff31ebf0 ("ipv6: Don't create clones of host routes.") Signed-off-by: Julian Anastasov Acked-by: David Ahern Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fb956989adaf..86a0e4333d42 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2307,9 +2307,6 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, const struct in6_addr *daddr, *saddr; struct rt6_info *rt6 = (struct rt6_info *)dst; - if (rt6->rt6i_flags & RTF_LOCAL) - return; - if (dst_metric_locked(dst, RTAX_MTU)) return; -- cgit v1.2.3 From 349b71d6f427ff8211adf50839dbbff3f27c1805 Mon Sep 17 00:00:00 2001 From: Zhouyang Jia Date: Mon, 11 Jun 2018 13:26:35 +0800 Subject: net: dsa: add error handling for pskb_trim_rcsum When pskb_trim_rcsum fails, the lack of error-handling code may cause unexpected results. This patch adds error-handling code after calling pskb_trim_rcsum. Signed-off-by: Zhouyang Jia Signed-off-by: David S. Miller --- net/dsa/tag_trailer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 7d20e1f3de28..56197f0d9608 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -75,7 +75,8 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb->dev) return NULL; - pskb_trim_rcsum(skb, skb->len - 4); + if (pskb_trim_rcsum(skb, skb->len - 4)) + return NULL; return skb; } -- cgit v1.2.3 From a343993c518ce252b62ec00ac06bccfb1d17129d Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 11 Jun 2018 13:57:12 +0200 Subject: xsk: silence warning on memory allocation failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzkaller reported a warning from xdp_umem_pin_pages(): WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996 ... __do_kmalloc mm/slab.c:3713 [inline] __kmalloc+0x25/0x760 mm/slab.c:3727 kmalloc_array include/linux/slab.h:634 [inline] kcalloc include/linux/slab.h:645 [inline] xdp_umem_pin_pages net/xdp/xdp_umem.c:205 [inline] xdp_umem_reg net/xdp/xdp_umem.c:318 [inline] xdp_umem_create+0x5c9/0x10f0 net/xdp/xdp_umem.c:349 xsk_setsockopt+0x443/0x550 net/xdp/xsk.c:531 __sys_setsockopt+0x1bd/0x390 net/socket.c:1935 __do_sys_setsockopt net/socket.c:1946 [inline] __se_sys_setsockopt net/socket.c:1943 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1943 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe This is a warning about attempting to allocate more than KMALLOC_MAX_SIZE memory. The request originates from userspace, and if the request is too big, the kernel is free to deny its allocation. In this patch, the failed allocation attempt is silenced with __GFP_NOWARN. Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt") Reported-by: syzbot+4abadc5d69117b346506@syzkaller.appspotmail.com Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index b9ef487c4618..f47abb46c587 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -204,7 +204,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) long npgs; int err; - umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL); + umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), + GFP_KERNEL | __GFP_NOWARN); if (!umem->pgs) return -ENOMEM; -- cgit v1.2.3 From f6fadff33e8b09373eedf99822b89d9dd84545b8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Jun 2018 23:22:04 +0200 Subject: tls: fix NULL pointer dereference on poll While hacking on kTLS, I ran into the following panic from an unprivileged netserver / netperf TCP session: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 PGD 800000037f378067 P4D 800000037f378067 PUD 3c0e61067 PMD 0 Oops: 0010 [#1] SMP KASAN PTI CPU: 1 PID: 2289 Comm: netserver Not tainted 4.17.0+ #139 Hardware name: LENOVO 20FBCTO1WW/20FBCTO1WW, BIOS N1FET47W (1.21 ) 11/28/2016 RIP: 0010: (null) Code: Bad RIP value. RSP: 0018:ffff88036abcf740 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff88036f5f6800 RCX: 1ffff1006debed26 RDX: ffff88036abcf920 RSI: ffff8803cb1a4f00 RDI: ffff8803c258c280 RBP: ffff8803c258c280 R08: ffff8803c258c280 R09: ffffed006f559d48 R10: ffff88037aacea43 R11: ffffed006f559d49 R12: ffff8803c258c280 R13: ffff8803cb1a4f20 R14: 00000000000000db R15: ffffffffc168a350 FS: 00007f7e631f4700(0000) GS:ffff8803d1c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 00000003ccf64005 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? tls_sw_poll+0xa4/0x160 [tls] ? sock_poll+0x20a/0x680 ? do_select+0x77b/0x11a0 ? poll_schedule_timeout.constprop.12+0x130/0x130 ? pick_link+0xb00/0xb00 ? read_word_at_a_time+0x13/0x20 ? vfs_poll+0x270/0x270 ? deref_stack_reg+0xad/0xe0 ? __read_once_size_nocheck.constprop.6+0x10/0x10 [...] Debugging further, it turns out that calling into ctx->sk_poll() is invalid since sk_poll itself is NULL which was saved from the original TCP socket in order for tls_sw_poll() to invoke it. Looks like the recent conversion from poll to poll_mask callback started in 152524231023 ("net: add support for ->poll_mask in proto_ops") missed to eventually convert kTLS, too: TCP's ->poll was converted over to the ->poll_mask in commit 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask") and therefore kTLS wrongly saved the ->poll old one which is now NULL. Convert kTLS over to use ->poll_mask instead. Also instead of POLLIN | POLLRDNORM use the proper EPOLLIN | EPOLLRDNORM bits as the case in tcp_poll_mask() as well that is mangled here. Fixes: 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask") Signed-off-by: Daniel Borkmann Cc: Christoph Hellwig Cc: Dave Watson Tested-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 6 ++---- net/tls/tls_main.c | 2 +- net/tls/tls_sw.c | 19 +++++++++---------- 3 files changed, 12 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 70c273777fe9..7f84ea3e217c 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -109,8 +109,7 @@ struct tls_sw_context_rx { struct strparser strp; void (*saved_data_ready)(struct sock *sk); - unsigned int (*sk_poll)(struct file *file, struct socket *sock, - struct poll_table_struct *wait); + __poll_t (*sk_poll_mask)(struct socket *sock, __poll_t events); struct sk_buff *recv_pkt; u8 control; bool decrypted; @@ -225,8 +224,7 @@ void tls_sw_free_resources_tx(struct sock *sk); void tls_sw_free_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); -unsigned int tls_sw_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait); +__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 301f22430469..a127d61e8af9 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -712,7 +712,7 @@ static int __init tls_register(void) build_protos(tls_prots[TLSV4], &tcp_prot); tls_sw_proto_ops = inet_stream_ops; - tls_sw_proto_ops.poll = tls_sw_poll; + tls_sw_proto_ops.poll_mask = tls_sw_poll_mask; tls_sw_proto_ops.splice_read = tls_sw_splice_read; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 8ca57d01b18f..34895b7c132d 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -915,23 +915,22 @@ splice_read_end: return copied ? : err; } -unsigned int tls_sw_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait) +__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events) { - unsigned int ret; struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + __poll_t mask; - /* Grab POLLOUT and POLLHUP from the underlying socket */ - ret = ctx->sk_poll(file, sock, wait); + /* Grab EPOLLOUT and EPOLLHUP from the underlying socket */ + mask = ctx->sk_poll_mask(sock, events); - /* Clear POLLIN bits, and set based on recv_pkt */ - ret &= ~(POLLIN | POLLRDNORM); + /* Clear EPOLLIN bits, and set based on recv_pkt */ + mask &= ~(EPOLLIN | EPOLLRDNORM); if (ctx->recv_pkt) - ret |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; - return ret; + return mask; } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) @@ -1188,7 +1187,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); - sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; + sw_ctx_rx->sk_poll_mask = sk->sk_socket->ops->poll_mask; strp_check_rcv(&sw_ctx_rx->strp); } -- cgit v1.2.3 From 3f2d67b6bd24c615cfe3a6d793613bb2838dd9b9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 11 Jun 2018 07:12:12 -0700 Subject: net/ipv6: Ensure cfg is properly initialized in ipv6_create_tempaddr Valdis reported a BUG in ipv6_add_addr: [ 1820.832682] BUG: unable to handle kernel NULL pointer dereference at 0000000000000209 [ 1820.832728] RIP: 0010:ipv6_add_addr+0x280/0xd10 [ 1820.832732] Code: 49 8b 1f 0f 84 6a 0a 00 00 48 85 db 0f 84 4e 0a 00 00 48 8b 03 48 8b 53 08 49 89 45 00 49 8b 47 10 49 89 55 08 48 85 c0 74 15 <48> 8b 50 08 48 8b 00 49 89 95 b8 01 00 00 49 89 85 b0 01 00 00 4c [ 1820.832847] RSP: 0018:ffffaa07c2fd7880 EFLAGS: 00010202 [ 1820.832853] RAX: 0000000000000201 RBX: ffffaa07c2fd79b0 RCX: 0000000000000000 [ 1820.832858] RDX: a4cfbfba2cbfa64c RSI: 0000000000000000 RDI: ffffffff8a8e9fa0 [ 1820.832862] RBP: ffffaa07c2fd7920 R08: 000000000000017a R09: ffffffff8a555300 [ 1820.832866] R10: 0000000000000000 R11: 0000000000000000 R12: ffff888d18e71c00 [ 1820.832871] R13: ffff888d0a9b1200 R14: 0000000000000000 R15: ffffaa07c2fd7980 [ 1820.832876] FS: 00007faa51bdb800(0000) GS:ffff888d1d400000(0000) knlGS:0000000000000000 [ 1820.832880] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1820.832885] CR2: 0000000000000209 CR3: 000000021e8f8001 CR4: 00000000001606e0 [ 1820.832888] Call Trace: [ 1820.832898] ? __local_bh_enable_ip+0x119/0x260 [ 1820.832904] ? ipv6_create_tempaddr+0x259/0x5a0 [ 1820.832912] ? __local_bh_enable_ip+0x139/0x260 [ 1820.832921] ipv6_create_tempaddr+0x2da/0x5a0 [ 1820.832926] ? ipv6_create_tempaddr+0x2da/0x5a0 [ 1820.832941] manage_tempaddrs+0x1a5/0x240 [ 1820.832951] inet6_addr_del+0x20b/0x3b0 [ 1820.832959] ? nla_parse+0xce/0x1e0 [ 1820.832968] inet6_rtm_deladdr+0xd9/0x210 [ 1820.832981] rtnetlink_rcv_msg+0x1d4/0x5f0 Looking at the code I found 1 element (peer_pfx) of the newly introduced ifa6_config struct that is not initialized. Use a memset rather than hard coding an init for each struct element. Reported-by: Valdis Kletnieks Fixes: e6464b8c63619 ("net/ipv6: Convert ipv6_add_addr to struct ifa6_config") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 89019bf59f46..c134286d6a41 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1324,6 +1324,7 @@ retry: } } + memset(&cfg, 0, sizeof(cfg)); cfg.valid_lft = min_t(__u32, ifp->valid_lft, idev->cnf.temp_valid_lft + age); cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor; @@ -1357,7 +1358,6 @@ retry: cfg.pfx = &addr; cfg.scope = ipv6_addr_scope(cfg.pfx); - cfg.rt_priority = 0; ift = ipv6_add_addr(idev, &cfg, block, NULL); if (IS_ERR(ift)) { -- cgit v1.2.3 From 6892286e9c09925780fe2cb6db3585b56b71fe8e Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 11 Jun 2018 18:00:13 -0700 Subject: tcp: Do not reload skb pointer after skb_gro_receive(). This is not necessary. skb_gro_receive() will never change what 'head' points to. In it's original implementation (see commit 71d93b39e52e ("net: Add skb_gro_receive")), it did: ==================== + *head = nskb; + nskb->next = p->next; + p->next = NULL; ==================== This sequence was removed in commit 58025e46ea2d ("net: gro: remove obsolete code from skb_gro_receive()") Signed-off-by: David S. Miller Signed-off-by: Eric Dumazet --- net/ipv4/tcp_offload.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 4d58e2ce0b5b..8cc7c3487330 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -268,8 +268,6 @@ found: goto out_check_final; } - p = *head; - th2 = tcp_hdr(p); tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); out_check_final: -- cgit v1.2.3 From 3fb61eca185cc65a1be23d9a5a11347eef79f597 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 7 Jun 2018 17:56:08 +0200 Subject: netfilter: nft_socket: fix module autoload Add alias definition for module autoload when adding socket rules. Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_socket.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index f28a0b944087..74e1b3bd6954 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -142,3 +142,4 @@ module_exit(nft_socket_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Máté Eckl"); MODULE_DESCRIPTION("nf_tables socket match module"); +MODULE_ALIAS_NFT_EXPR("socket"); -- cgit v1.2.3 From 215a31f19dedd4e92a67cf5a9717ee898d012b3a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 11 Jun 2018 17:18:29 +0200 Subject: netfilter: nft_dynset: do not reject set updates with NFT_SET_EVAL NFT_SET_EVAL is signalling the kernel that this sets can be updated from the evaluation path, even if there are no expressions attached to the element. Otherwise, set updates with no expressions fail. Update description to describe the right semantics. Fixes: 22fe54d5fefc ("netfilter: nf_tables: add support for dynamic set updates") Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 +- net/netfilter/nft_dynset.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c9bf74b94f37..89438e68dc03 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -266,7 +266,7 @@ enum nft_rule_compat_attributes { * @NFT_SET_INTERVAL: set contains intervals * @NFT_SET_MAP: set is used as a dictionary * @NFT_SET_TIMEOUT: set uses timeouts - * @NFT_SET_EVAL: set contains expressions for evaluation + * @NFT_SET_EVAL: set can be updated from the evaluation path * @NFT_SET_OBJECT: set contains stateful objects */ enum nft_set_flags { diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 4d49529cff61..27d7e4598ab6 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -203,9 +203,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, goto err1; set->ops->gc_init(set); } - - } else if (set->flags & NFT_SET_EVAL) - return -EINVAL; + } nft_set_ext_prepare(&priv->tmpl); nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen); -- cgit v1.2.3 From 71ad00c50d77e507138c792a9646b53c16f22e11 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Jun 2018 13:20:35 +0200 Subject: netfilter: nf_tables: fix module unload race We must first remove the nfnetlink protocol handler when nf_tables module is unloaded -- we don't want userspace to submit new change requests once we've started to tear down nft state. Furthermore, nfnetlink must not call any subsystem function after call_batch returned -EAGAIN. EAGAIN means the subsys mutex was dropped, so its unlikely but possible that nf_tables subsystem was removed due to 'rmmod nf_tables' on another cpu. Therefore, we must abort batch completely and not move on to next part of the batch. Last, we can't invoke ->abort unless we've checked that the subsystem is still registered. Change netns exit path of nf_tables to make sure any incompleted transaction gets removed on exit. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 12 +++++++++--- net/netfilter/nfnetlink.c | 10 +++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7979095b69b0..ae312b31db28 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6439,7 +6439,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } -static int nf_tables_abort(struct net *net, struct sk_buff *skb) +static int __nf_tables_abort(struct net *net) { struct nft_trans *trans, *next; struct nft_trans_elem *te; @@ -6555,6 +6555,11 @@ static void nf_tables_cleanup(struct net *net) nft_validate_state_update(net, NFT_VALIDATE_SKIP); } +static int nf_tables_abort(struct net *net, struct sk_buff *skb) +{ + return __nf_tables_abort(net); +} + static bool nf_tables_valid_genid(struct net *net, u32 genid) { return net->nft.base_seq == genid; @@ -7149,9 +7154,10 @@ static int __net_init nf_tables_init_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { + if (!list_empty(&net->nft.commit_list)) + __nf_tables_abort(net); __nft_release_tables(net); WARN_ON_ONCE(!list_empty(&net->nft.tables)); - WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); } static struct pernet_operations nf_tables_net_ops = { @@ -7193,9 +7199,9 @@ err1: static void __exit nf_tables_module_exit(void) { - unregister_pernet_subsys(&nf_tables_net_ops); nfnetlink_subsys_unregister(&nf_tables_subsys); unregister_netdevice_notifier(&nf_tables_flowtable_notifier); + unregister_pernet_subsys(&nf_tables_net_ops); rcu_barrier(); nf_tables_core_module_exit(); kfree(info); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4d0da7042aff..e1b6be29848d 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -429,7 +429,7 @@ replay: */ if (err == -EAGAIN) { status |= NFNL_BATCH_REPLAY; - goto next; + goto done; } } ack: @@ -456,7 +456,7 @@ ack: if (err) status |= NFNL_BATCH_FAILURE; } -next: + msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; @@ -464,7 +464,11 @@ next: } done: if (status & NFNL_BATCH_REPLAY) { - ss->abort(net, oskb); + const struct nfnetlink_subsystem *ss2; + + ss2 = nfnl_dereference_protected(subsys_id); + if (ss2 == ss) + ss->abort(net, oskb); nfnl_err_reset(&err_list); nfnl_unlock(subsys_id); kfree_skb(skb); -- cgit v1.2.3 From 0a2cf5ee432c2e8718af3553a56a3760d767b736 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Jun 2018 13:20:36 +0200 Subject: netfilter: nf_tables: close race between netns exit and rmmod If net namespace is exiting while nf_tables module is being removed we can oops: BUG: unable to handle kernel NULL pointer dereference at 0000000000000040 IP: nf_tables_flowtable_event+0x43/0xf0 [nf_tables] PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI Modules linked in: nf_tables(-) nfnetlink [..] unregister_netdevice_notifier+0xdd/0x130 nf_tables_module_exit+0x24/0x3a [nf_tables] SyS_delete_module+0x1c5/0x240 do_syscall_64+0x74/0x190 Avoid this by attempting to take reference on the net namespace from the notifiers. If it fails the namespace is exiting already, and nft core is taking care of cleanup work. We also need to make sure the netdev hook type gets removed before netns ops removal, else notifier might be invoked with device event for a netns where net->nft was never initialised (because pernet ops was removed beforehand). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 13 ++++++++++--- net/netfilter/nft_chain_filter.c | 5 +++++ 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ae312b31db28..d23a5c269c44 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5837,18 +5837,23 @@ static int nf_tables_flowtable_event(struct notifier_block *this, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_flowtable *flowtable; struct nft_table *table; + struct net *net; if (event != NETDEV_UNREGISTER) return 0; + net = maybe_get_net(dev_net(dev)); + if (!net) + return 0; + nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_for_each_entry(table, &dev_net(dev)->nft.tables, list) { + list_for_each_entry(table, &net->nft.tables, list) { list_for_each_entry(flowtable, &table->flowtables, list) { nft_flowtable_event(event, dev, flowtable); } } nfnl_unlock(NFNL_SUBSYS_NFTABLES); - + put_net(net); return NOTIFY_DONE; } @@ -7154,9 +7159,11 @@ static int __net_init nf_tables_init_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { + nfnl_lock(NFNL_SUBSYS_NFTABLES); if (!list_empty(&net->nft.commit_list)) __nf_tables_abort(net); __nft_release_tables(net); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); WARN_ON_ONCE(!list_empty(&net->nft.tables)); } @@ -7201,11 +7208,11 @@ static void __exit nf_tables_module_exit(void) { nfnetlink_subsys_unregister(&nf_tables_subsys); unregister_netdevice_notifier(&nf_tables_flowtable_notifier); + nft_chain_filter_fini(); unregister_pernet_subsys(&nf_tables_net_ops); rcu_barrier(); nf_tables_core_module_exit(); kfree(info); - nft_chain_filter_fini(); } module_init(nf_tables_module_init); diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 84c902477a91..d21834bed805 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -318,6 +318,10 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; + ctx.net = maybe_get_net(ctx.net); + if (!ctx.net) + return NOTIFY_DONE; + nfnl_lock(NFNL_SUBSYS_NFTABLES); list_for_each_entry(table, &ctx.net->nft.tables, list) { if (table->family != NFPROTO_NETDEV) @@ -334,6 +338,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, } } nfnl_unlock(NFNL_SUBSYS_NFTABLES); + put_net(ctx.net); return NOTIFY_DONE; } -- cgit v1.2.3 From adc972c5b88829d38ede08b1069718661c7330ae Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 11 Jun 2018 22:16:33 +0900 Subject: netfilter: nf_tables: use WARN_ON_ONCE instead of BUG_ON in nft_do_chain() When depth of chain is bigger than NFT_JUMP_STACK_SIZE, the nft_do_chain crashes. But there is no need to crash hard here. Suggested-by: Florian Westphal Signed-off-by: Taehee Yoo Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index deff10adef9c..8de912ca53d3 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -183,7 +183,8 @@ next_rule: switch (regs.verdict.code) { case NFT_JUMP: - BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE); + if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE)) + return NF_DROP; jumpstack[stackptr].chain = chain; jumpstack[stackptr].rules = rules + 1; stackptr++; -- cgit v1.2.3 From c05a45c0865d986a8aea373cd5297dbfded6882e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Jun 2018 22:22:19 +0200 Subject: netfilter: ctnetlink: avoid null pointer dereference Dan Carpenter points out that deref occurs after NULL check, we should re-fetch the pointer and check that instead. Fixes: 2c205dd3981f7 ("netfilter: add struct nf_nat_hook and use it") Reported-by: Dan Carpenter Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 39327a42879f..20a2e37c76d1 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1446,7 +1446,8 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, } nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); - if (nat_hook->parse_nat_setup) + nat_hook = rcu_dereference(nf_nat_hook); + if (nat_hook) return -EAGAIN; #endif return -EOPNOTSUPP; -- cgit v1.2.3 From fc6ddbecce440df74fb4491c17c372b52cf5be83 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 12 Jun 2018 18:36:19 +0200 Subject: netfilter: xt_connmark: fix list corruption on rmmod This needs to use xt_unregister_targets, else new revision is left on the list which then causes list to point to a target struct that has been free'd. Fixes: 472a73e00757 ("netfilter: xt_conntrack: Support bit-shifting for CONNMARK & MARK targets.") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_connmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 94df000abb92..29c38aa7f726 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -211,7 +211,7 @@ static int __init connmark_mt_init(void) static void __exit connmark_mt_exit(void) { xt_unregister_match(&connmark_mt_reg); - xt_unregister_target(connmark_tg_reg); + xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg)); } module_init(connmark_mt_init); -- cgit v1.2.3 From 21ba8847f857028dc83a0f341e16ecc616e34740 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Tue, 12 Jun 2018 10:51:34 -0700 Subject: netfilter: nf_conncount: Fix garbage collection with zones MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, we use check_hlist() for garbage colleciton. However, we use the ‘zone’ from the counted entry to query the existence of existing entries in the hlist. This could be wrong when they are in different zones, and this patch fixes this issue. Fixes: e59ea3df3fc2 ("netfilter: xt_connlimit: honor conntrack zone if available") Signed-off-by: Yi-Hung Wei Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 3 ++- net/netfilter/nf_conncount.c | 13 +++++++++---- net/netfilter/nft_connlimit.c | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 1910b6572430..3a188a0923a3 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -20,7 +20,8 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, bool *addit); bool nf_conncount_add(struct hlist_head *head, - const struct nf_conntrack_tuple *tuple); + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone); void nf_conncount_cache_free(struct hlist_head *hhead); diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 3b5059a8dcdd..d8383609fe28 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -46,6 +46,7 @@ struct nf_conncount_tuple { struct hlist_node node; struct nf_conntrack_tuple tuple; + struct nf_conntrack_zone zone; }; struct nf_conncount_rb { @@ -80,7 +81,8 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) } bool nf_conncount_add(struct hlist_head *head, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { struct nf_conncount_tuple *conn; @@ -88,6 +90,7 @@ bool nf_conncount_add(struct hlist_head *head, if (conn == NULL) return false; conn->tuple = *tuple; + conn->zone = *zone; hlist_add_head(&conn->node, head); return true; } @@ -108,7 +111,7 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, /* check the saved connections */ hlist_for_each_entry_safe(conn, n, head, node) { - found = nf_conntrack_find_get(net, zone, &conn->tuple); + found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); if (found == NULL) { hlist_del(&conn->node); kmem_cache_free(conncount_conn_cachep, conn); @@ -117,7 +120,8 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, found_ct = nf_ct_tuplehash_to_ctrack(found); - if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple)) { + if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) && + nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * Just to be sure we have it only once in the list. * We should not see tuples twice unless someone hooks @@ -196,7 +200,7 @@ count_tree(struct net *net, struct rb_root *root, if (!addit) return count; - if (!nf_conncount_add(&rbconn->hhead, tuple)) + if (!nf_conncount_add(&rbconn->hhead, tuple, zone)) return 0; /* hotdrop */ return count + 1; @@ -238,6 +242,7 @@ count_tree(struct net *net, struct rb_root *root, } conn->tuple = *tuple; + conn->zone = *zone; memcpy(rbconn->key, key, sizeof(u32) * keylen); INIT_HLIST_HEAD(&rbconn->hhead); diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 50c068d660e5..a832c59f0a9c 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -52,7 +52,7 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, if (!addit) goto out; - if (!nf_conncount_add(&priv->hhead, tuple_ptr)) { + if (!nf_conncount_add(&priv->hhead, tuple_ptr, zone)) { regs->verdict.code = NF_DROP; spin_unlock_bh(&priv->lock); return; -- cgit v1.2.3 From cdb8744d80352b55c622d049a6c91f449cd291f8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 12 Jun 2018 10:05:55 -0700 Subject: Revert "net: do not allow changing SO_REUSEADDR/SO_REUSEPORT on bound sockets" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revert the patch mentioned in the subject because it breaks at least the Avahi mDNS daemon. That patch namely causes the Ubuntu 18.04 Avahi daemon to fail to start: Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: Successfully called chroot(). Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: Successfully dropped remaining capabilities. Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: No service file found in /etc/avahi/services. Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: SO_REUSEADDR failed: Structure needs cleaning Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: SO_REUSEADDR failed: Structure needs cleaning Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: Failed to create server: No suitable network protocol available Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: avahi-daemon 0.7 exiting. Jun 12 09:49:24 ubuntu-vm systemd[1]: avahi-daemon.service: Main process exited, code=exited, status=255/n/a Jun 12 09:49:24 ubuntu-vm systemd[1]: avahi-daemon.service: Failed with result 'exit-code'. Jun 12 09:49:24 ubuntu-vm systemd[1]: Failed to start Avahi mDNS/DNS-SD Stack. Fixes: f396922d862a ("net: do not allow changing SO_REUSEADDR/SO_REUSEPORT on bound sockets") Cc: Maciej Żenczykowski Cc: Eric Dumazet Signed-off-by: Bart Van Assche Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index f333d75ef1a9..bcc41829a16d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -728,22 +728,9 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR: - val = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); - if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) && - inet_sk(sk)->inet_num && - (sk->sk_reuse != val)) { - ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN; - break; - } - sk->sk_reuse = val; + sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: - if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) && - inet_sk(sk)->inet_num && - (sk->sk_reuseport != valbool)) { - ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN; - break; - } sk->sk_reuseport = valbool; break; case SO_TYPE: -- cgit v1.2.3 From c0129a0614428e5e4350fa963eecd1fbe19e57e9 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 11 Jun 2018 14:07:14 -0700 Subject: smc: convert to ->poll_mask smc->clcsock is an internal TCP socket, after TCP socket converts to ->poll_mask, ->poll doesn't exist any more. So just convert smc socket to ->poll_mask too. Fixes: 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask") Reported-by: syzbot+f5066e369b2d5fff630f@syzkaller.appspotmail.com Cc: Christoph Hellwig Cc: Ursula Braun Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/smc/af_smc.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 973b4471b532..da7f02edcd37 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1273,8 +1273,7 @@ static __poll_t smc_accept_poll(struct sock *parent) return mask; } -static __poll_t smc_poll(struct file *file, struct socket *sock, - poll_table *wait) +static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; __poll_t mask = 0; @@ -1290,7 +1289,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { /* delegate to CLC child sock */ release_sock(sk); - mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); + mask = smc->clcsock->ops->poll_mask(smc->clcsock, events); lock_sock(sk); sk->sk_err = smc->clcsock->sk->sk_err; if (sk->sk_err) { @@ -1308,11 +1307,6 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, } } } else { - if (sk->sk_state != SMC_CLOSED) { - release_sock(sk); - sock_poll_wait(file, sk_sleep(sk), wait); - lock_sock(sk); - } if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || @@ -1625,7 +1619,7 @@ static const struct proto_ops smc_sock_ops = { .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, - .poll = smc_poll, + .poll_mask = smc_poll_mask, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, -- cgit v1.2.3 From 995191220056300c51ed870a5d5321f91f3eef89 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 14 Jun 2018 07:37:02 +0800 Subject: sctp: define sctp_packet_gso_append to build GSO frames Now sctp GSO uses skb_gro_receive() to append the data into head skb frag_list. However it actually only needs very few code from skb_gro_receive(). Besides, NAPI_GRO_CB has to be set while most of its members are not needed here. This patch is to add sctp_packet_gso_append() to build GSO frames instead of skb_gro_receive(), and it would avoid many unnecessary checks and make the code clearer. Note that sctp will use page frags instead of frag_list to build GSO frames in another patch. But it may take time, as sctp's GSO frames may have different size. skb_segment() can only split it into the frags with the same size, which would break the border of sctp chunks. Signed-off-by: Xin Long Reviewed-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 5 +++++ net/sctp/output.c | 28 ++++++++++++++++++---------- 2 files changed, 23 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index ebf809eed33a..dbe1b911a24d 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1133,6 +1133,11 @@ struct sctp_input_cb { }; #define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) +struct sctp_output_cb { + struct sk_buff *last; +}; +#define SCTP_OUTPUT_CB(__skb) ((struct sctp_output_cb *)&((__skb)->cb[0])) + static inline const struct sk_buff *sctp_gso_headskb(const struct sk_buff *skb) { const struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; diff --git a/net/sctp/output.c b/net/sctp/output.c index e672dee302c7..7f849b01ec8e 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -409,6 +409,21 @@ static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk) refcount_inc(&sk->sk_wmem_alloc); } +static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb) +{ + if (SCTP_OUTPUT_CB(head)->last == head) + skb_shinfo(head)->frag_list = skb; + else + SCTP_OUTPUT_CB(head)->last->next = skb; + SCTP_OUTPUT_CB(head)->last = skb; + + head->truesize += skb->truesize; + head->data_len += skb->len; + head->len += skb->len; + + __skb_header_release(skb); +} + static int sctp_packet_pack(struct sctp_packet *packet, struct sk_buff *head, int gso, gfp_t gfp) { @@ -422,7 +437,7 @@ static int sctp_packet_pack(struct sctp_packet *packet, if (gso) { skb_shinfo(head)->gso_type = sk->sk_gso_type; - NAPI_GRO_CB(head)->last = head; + SCTP_OUTPUT_CB(head)->last = head; } else { nskb = head; pkt_size = packet->size; @@ -503,15 +518,8 @@ merge: &packet->chunk_list); } - if (gso) { - if (skb_gro_receive(&head, nskb)) { - kfree_skb(nskb); - return 0; - } - if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >= - sk->sk_gso_max_segs)) - return 0; - } + if (gso) + sctp_packet_gso_append(head, nskb); pkt_count++; } while (!list_empty(&packet->chunk_list)); -- cgit v1.2.3 From 4fd44a98ffe0d048246efef67ed640fdf2098a62 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 12 Jun 2018 23:09:37 +0000 Subject: tcp: verify the checksum of the first data segment in a new connection commit 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") introduced an optimization for the handling of child sockets created for a new TCP connection. But this optimization passes any data associated with the last ACK of the connection handshake up the stack without verifying its checksum, because it calls tcp_child_process(), which in turn calls tcp_rcv_state_process() directly. These lower-level processing functions do not do any checksum verification. Insert a tcp_checksum_complete call in the TCP_NEW_SYN_RECEIVE path to fix this. Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") Signed-off-by: Frank van der Linden Signed-off-by: Eric Dumazet Tested-by: Balbir Singh Reviewed-by: Balbir Singh Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++++ net/ipv6/tcp_ipv6.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fed3f1c66167..bea17f1e8302 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1730,6 +1730,10 @@ process: reqsk_put(req); goto discard_it; } + if (tcp_checksum_complete(skb)) { + reqsk_put(req); + goto csum_error; + } if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b620d9b72e59..7efa9fd7e109 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1479,6 +1479,10 @@ process: reqsk_put(req); goto discard_it; } + if (tcp_checksum_complete(skb)) { + reqsk_put(req); + goto csum_error; + } if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; -- cgit v1.2.3 From 90904ff5f958a215cc3d26f957a46e80fa178470 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 13 Jun 2018 15:09:18 +0200 Subject: l2tp: fix pseudo-wire type for sessions created by pppol2tp_connect() Define cfg.pw_type so that the new session is created with its .pwtype field properly set (L2TP_PWTYPE_PPP). Not setting the pseudo-wire type had several annoying effects: * Invalid value returned in the L2TP_ATTR_PW_TYPE attribute when dumping sessions with the netlink API. * Impossibility to delete the session using the netlink API (because l2tp_nl_cmd_session_delete() gets the deletion callback function from an array indexed by the session's pseudo-wire type). Also, there are several cases where we should check a session's pseudo-wire type. For example, pppol2tp_connect() should refuse to connect a session that is not PPPoL2TP, but that requires the session's .pwtype field to be properly set. Fixes: f7faffa3ff8e ("l2tp: Add L2TPv3 protocol support") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index b56cb1df4fc0..270a0a999eaf 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -751,6 +751,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, /* Default MTU must allow space for UDP/L2TP/PPP headers */ cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD; cfg.mru = cfg.mtu; + cfg.pw_type = L2TP_PWTYPE_PPP; session = l2tp_session_create(sizeof(struct pppol2tp_session), tunnel, session_id, -- cgit v1.2.3 From 7ac6ab1f8a38ba7f8d97f95475bb6a2575db4658 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 13 Jun 2018 15:09:19 +0200 Subject: l2tp: only accept PPP sessions in pppol2tp_connect() l2tp_session_priv() returns a struct pppol2tp_session pointer only for PPPoL2TP sessions. In particular, if the session is an L2TP_PWTYPE_ETH pseudo-wire, l2tp_session_priv() returns a pointer to an l2tp_eth_sess structure, which is much smaller than struct pppol2tp_session. This leads to invalid memory dereference when trying to lock ps->sk_lock. Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 270a0a999eaf..8b3b6947a07d 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -734,6 +734,12 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, session = l2tp_session_get(sock_net(sk), tunnel, session_id); if (session) { drop_refcnt = true; + + if (session->pwtype != L2TP_PWTYPE_PPP) { + error = -EPROTOTYPE; + goto end; + } + ps = l2tp_session_priv(session); /* Using a pre-existing session is fine as long as it hasn't -- cgit v1.2.3 From 3e1bc8bf974e2d4e7beb842a4c801c2542eff3bd Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 13 Jun 2018 15:09:20 +0200 Subject: l2tp: prevent pppol2tp_connect() from creating kernel sockets If 'fd' is negative, l2tp_tunnel_create() creates a tunnel socket using the configuration passed in 'tcfg'. Currently, pppol2tp_connect() sets the relevant fields to zero, tricking l2tp_tunnel_create() into setting up an unusable kernel socket. We can't set 'tcfg' with the required fields because there's no way to get them from the current connect() parameters. So let's restrict kernel sockets creation to the netlink API, which is the original use case. Fixes: 789a4a2c61d8 ("l2tp: Add support for static unmanaged L2TPv3 tunnels") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 8b3b6947a07d..1b24f76ae210 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -701,6 +701,15 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, .encap = L2TP_ENCAPTYPE_UDP, .debug = 0, }; + + /* Prevent l2tp_tunnel_register() from trying to set up + * a kernel socket. + */ + if (fd < 0) { + error = -EBADF; + goto end; + } + error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel); if (error < 0) goto end; -- cgit v1.2.3 From bda06be2158c7aa7e41b15500c4d3840369c19a6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 13 Jun 2018 15:09:21 +0200 Subject: l2tp: clean up stale tunnel or session in pppol2tp_connect's error path pppol2tp_connect() may create a tunnel or a session. Remove them in case of error. Fixes: fd558d186df2 ("l2tp: Split pppol2tp patch into separate l2tp and ppp parts") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 1b24f76ae210..f429fed06a1e 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -612,6 +612,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, u32 session_id, peer_session_id; bool drop_refcnt = false; bool drop_tunnel = false; + bool new_session = false; + bool new_tunnel = false; int ver = 2; int fd; @@ -722,6 +724,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, goto end; } drop_tunnel = true; + new_tunnel = true; } } else { /* Error if we can't find the tunnel */ @@ -788,6 +791,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, goto end; } drop_refcnt = true; + new_session = true; } /* Special case: if source & dest session_id == 0x0000, this @@ -834,6 +838,12 @@ out_no_ppp: session->name); end: + if (error) { + if (new_session) + l2tp_session_delete(session); + if (new_tunnel) + l2tp_tunnel_delete(tunnel); + } if (drop_refcnt) l2tp_session_dec_refcount(session); if (drop_tunnel) -- cgit v1.2.3 From f1693c63ab133d16994cc50f773982b5905af264 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Thu, 14 Jun 2018 11:52:34 -0700 Subject: rds: avoid unenecessary cong_update in loop transport Loop transport which is self loopback, remote port congestion update isn't relevant. Infact the xmit path already ignores it. Receive path needs to do the same. Reported-by: syzbot+4c20b3866171ce8441d2@syzkaller.appspotmail.com Reviewed-by: Sowmini Varadhan Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/loop.c | 1 + net/rds/rds.h | 5 +++++ net/rds/recv.c | 5 +++++ 3 files changed, 11 insertions(+) (limited to 'net') diff --git a/net/rds/loop.c b/net/rds/loop.c index f2bf78de5688..dac6218a460e 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -193,4 +193,5 @@ struct rds_transport rds_loop_transport = { .inc_copy_to_user = rds_message_inc_copy_to_user, .inc_free = rds_loop_inc_free, .t_name = "loopback", + .t_type = RDS_TRANS_LOOP, }; diff --git a/net/rds/rds.h b/net/rds/rds.h index b04c333d9d1c..f2272fb8cd45 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -479,6 +479,11 @@ struct rds_notifier { int n_status; }; +/* Available as part of RDS core, so doesn't need to participate + * in get_preferred transport etc + */ +#define RDS_TRANS_LOOP 3 + /** * struct rds_transport - transport specific behavioural hooks * diff --git a/net/rds/recv.c b/net/rds/recv.c index dc67458b52f0..192ac6f78ded 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -103,6 +103,11 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, rds_stats_add(s_recv_bytes_added_to_socket, delta); else rds_stats_add(s_recv_bytes_removed_from_socket, -delta); + + /* loop transport doesn't send/recv congestion updates */ + if (rs->rs_transport->t_type == RDS_TRANS_LOOP) + return; + now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " -- cgit v1.2.3 From 3c12d0486856b9eb89c2a9ac336713cba90813e3 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 6 Jun 2018 10:53:55 +0200 Subject: cfg80211: initialize sinfo in cfg80211_get_station Most of the implementations behind cfg80211_get_station will not initialize sinfo to zero before manipulating it. For example, the member "filled", which indicates the filled in parts of this struct, is often only modified by enabling certain bits in the bitfield while keeping the remaining bits in their original state. A caller without a preinitialized sinfo.filled can then no longer decide which parts of sinfo were filled in by cfg80211_get_station (or actually the underlying implementations). cfg80211_get_station must therefore take care that sinfo is initialized to zero. Otherwise, the caller may tries to read information which was not filled in and which must therefore also be considered uninitialized. In batadv_v_elp_get_throughput's case, an invalid "random" expected throughput may be stored for this neighbor and thus the B.A.T.M.A.N V algorithm may switch to non-optimal neighbors for certain destinations. Fixes: 7406353d43c8 ("cfg80211: implement cfg80211_get_station cfg80211 API") Reported-by: Thomas Lauer Reported-by: Marcel Schmidt Cc: b.a.t.m.a.n@lists.open-mesh.org Signed-off-by: Sven Eckelmann Signed-off-by: Johannes Berg --- net/wireless/util.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index b5bb1c309914..3c654cd7ba56 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1746,6 +1746,8 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, if (!rdev->ops->get_station) return -EOPNOTSUPP; + memset(sinfo, 0, sizeof(*sinfo)); + return rdev_get_station(rdev, dev, mac_addr, sinfo); } EXPORT_SYMBOL(cfg80211_get_station); -- cgit v1.2.3 From dc8b274f0952f604d72b10698cde6887321a669f Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 25 May 2018 14:29:21 +0200 Subject: mac80211: Move up init of TXQs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On init, ieee80211_if_add() dumps the interface. Since that now includes a dump of the TXQ state, we need to initialise that before the dump happens. So move up the TXQ initialisation to to before the call to ieee80211_if_add(). Fixes: 52539ca89f36 ("cfg80211: Expose TXQ stats and parameters to userspace") Reported-by: Niklas Cassel Signed-off-by: Toke Høiland-Jørgensen Tested-by: Niklas Cassel Signed-off-by: Johannes Berg --- net/mac80211/main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 4d2e797e3f16..722f3d9fb416 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1098,6 +1098,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) ieee80211_led_init(local); + result = ieee80211_txq_setup_flows(local); + if (result) + goto fail_flows; + rtnl_lock(); result = ieee80211_init_rate_ctrl_alg(local, @@ -1120,10 +1124,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) rtnl_unlock(); - result = ieee80211_txq_setup_flows(local); - if (result) - goto fail_flows; - #ifdef CONFIG_INET local->ifa_notifier.notifier_call = ieee80211_ifa_changed; result = register_inetaddr_notifier(&local->ifa_notifier); @@ -1149,8 +1149,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) #if defined(CONFIG_INET) || defined(CONFIG_IPV6) fail_ifa: #endif - ieee80211_txq_teardown_flows(local); - fail_flows: rtnl_lock(); rate_control_deinitialize(local); ieee80211_remove_interfaces(local); @@ -1158,6 +1156,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) rtnl_unlock(); ieee80211_led_exit(local); ieee80211_wep_free(local); + ieee80211_txq_teardown_flows(local); + fail_flows: destroy_workqueue(local->workqueue); fail_workqueue: wiphy_unregister(local->hw.wiphy); -- cgit v1.2.3 From bf2b61a6838f19cbc33f6732715012c483fa3795 Mon Sep 17 00:00:00 2001 From: Dedy Lansky Date: Fri, 15 Jun 2018 13:05:01 +0200 Subject: cfg80211: fix rcu in cfg80211_unregister_wdev Callers of cfg80211_unregister_wdev can free the wdev object immediately after this function returns. This may crash the kernel because this wdev object is still in use by other threads. Add synchronize_rcu() after list_del_rcu to make sure wdev object can be safely freed. Signed-off-by: Dedy Lansky Signed-off-by: Johannes Berg --- net/wireless/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 5fe35aafdd9c..48e8097339ab 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1012,6 +1012,7 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev) nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); list_del_rcu(&wdev->list); + synchronize_rcu(); rdev->devlist_generation++; switch (wdev->iftype) { -- cgit v1.2.3 From 6eba08c3626bca42b3cb0c9d43ac37ab11b4be1a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 15 Jun 2018 16:23:35 +0300 Subject: ipv6: Only emit append events for appended routes Current code will emit an append event in the FIB notification chain for any route added with NLM_F_APPEND set, even if the route was not appended to any existing route. This is inconsistent with IPv4 where such an event is only emitted when the new route is appended after an existing one. Align IPv6 behavior with IPv4, thereby allowing listeners to more easily handle these events. Fixes: f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route") Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 7aa4c41a3bd9..39d1d487eca2 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -934,6 +934,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); + enum fib_event_type event = FIB_EVENT_ENTRY_ADD; struct fib6_info *iter = NULL, *match = NULL; struct fib6_info __rcu **ins; int replace = (info->nlh && @@ -1013,6 +1014,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, "Can not append to a REJECT route"); return -EINVAL; } + event = FIB_EVENT_ENTRY_APPEND; rt->fib6_nsiblings = match->fib6_nsiblings; list_add_tail(&rt->fib6_siblings, &match->fib6_siblings); match->fib6_nsiblings++; @@ -1034,15 +1036,12 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, * insert node */ if (!replace) { - enum fib_event_type event; - if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; - event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD; err = call_fib6_entry_notifiers(info->nl_net, event, rt, extack); if (err) -- cgit v1.2.3 From de9bada5d389903f4faf33980e6a95a2911c7e6d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 15 Jun 2018 15:39:17 +0200 Subject: l2tp: reject creation of non-PPP sessions on L2TPv2 tunnels The /proc/net/pppol2tp handlers (pppol2tp_seq_*()) iterate over all L2TPv2 tunnels, and rightfully expect that only PPP sessions can be found there. However, l2tp_netlink accepts creating Ethernet sessions regardless of the underlying tunnel version. This confuses pppol2tp_seq_session_show(), which expects that l2tp_session_priv() returns a pppol2tp_session structure. When the session is an Ethernet pseudo-wire, a struct l2tp_eth_sess is returned instead. This leads to invalid memory access when pppol2tp_session_get_sock() later tries to dereference ps->sk. Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_netlink.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 6616c9fd292f..5b9900889e31 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -553,6 +553,12 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf goto out_tunnel; } + /* L2TPv2 only accepts PPP pseudo-wires */ + if (tunnel->version == 2 && cfg.pw_type != L2TP_PWTYPE_PPP) { + ret = -EPROTONOSUPPORT; + goto out_tunnel; + } + if (tunnel->version > 2) { if (info->attrs[L2TP_ATTR_DATA_SEQ]) cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]); -- cgit v1.2.3 From ecd012e45ab5fd76ed57546865897ce35920f56b Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 15 Jun 2018 15:39:19 +0200 Subject: l2tp: filter out non-PPP sessions in pppol2tp_tunnel_ioctl() pppol2tp_tunnel_ioctl() can act on an L2TPv3 tunnel, in which case 'session' may be an Ethernet pseudo-wire. However, pppol2tp_session_ioctl() expects a PPP pseudo-wire, as it assumes l2tp_session_priv() points to a pppol2tp_session structure. For an Ethernet pseudo-wire l2tp_session_priv() points to an l2tp_eth_sess structure instead, making pppol2tp_session_ioctl() access invalid memory. Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index f429fed06a1e..55188382845c 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1201,7 +1201,7 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel, l2tp_session_get(sock_net(sk), tunnel, stats.session_id); - if (session) { + if (session && session->pwtype == L2TP_PWTYPE_PPP) { err = pppol2tp_session_ioctl(session, cmd, arg); l2tp_session_dec_refcount(session); -- cgit v1.2.3 From a447da7d00410278c90d3576782a43f8b675d7be Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 Jun 2018 03:07:45 +0200 Subject: tls: fix use-after-free in tls_push_record syzkaller managed to trigger a use-after-free in tls like the following: BUG: KASAN: use-after-free in tls_push_record.constprop.15+0x6a2/0x810 [tls] Write of size 1 at addr ffff88037aa08000 by task a.out/2317 CPU: 3 PID: 2317 Comm: a.out Not tainted 4.17.0+ #144 Hardware name: LENOVO 20FBCTO1WW/20FBCTO1WW, BIOS N1FET47W (1.21 ) 11/28/2016 Call Trace: dump_stack+0x71/0xab print_address_description+0x6a/0x280 kasan_report+0x258/0x380 ? tls_push_record.constprop.15+0x6a2/0x810 [tls] tls_push_record.constprop.15+0x6a2/0x810 [tls] tls_sw_push_pending_record+0x2e/0x40 [tls] tls_sk_proto_close+0x3fe/0x710 [tls] ? tcp_check_oom+0x4c0/0x4c0 ? tls_write_space+0x260/0x260 [tls] ? kmem_cache_free+0x88/0x1f0 inet_release+0xd6/0x1b0 __sock_release+0xc0/0x240 sock_close+0x11/0x20 __fput+0x22d/0x660 task_work_run+0x114/0x1a0 do_exit+0x71a/0x2780 ? mm_update_next_owner+0x650/0x650 ? handle_mm_fault+0x2f5/0x5f0 ? __do_page_fault+0x44f/0xa50 ? mm_fault_error+0x2d0/0x2d0 do_group_exit+0xde/0x300 __x64_sys_exit_group+0x3a/0x50 do_syscall_64+0x9a/0x300 ? page_fault+0x8/0x30 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This happened through fault injection where aead_req allocation in tls_do_encryption() eventually failed and we returned -ENOMEM from the function. Turns out that the use-after-free is triggered from tls_sw_sendmsg() in the second tls_push_record(). The error then triggers a jump to waiting for memory in sk_stream_wait_memory() resp. returning immediately in case of MSG_DONTWAIT. What follows is the trim_both_sgl(sk, orig_size), which drops elements from the sg list added via tls_sw_sendmsg(). Now the use-after-free gets triggered when the socket is being closed, where tls_sk_proto_close() callback is invoked. The tls_complete_pending_work() will figure that there's a pending closed tls record to be flushed and thus calls into the tls_push_pending_closed_record() from there. ctx->push_pending_record() is called from the latter, which is the tls_sw_push_pending_record() from sw path. This again calls into tls_push_record(). And here the tls_fill_prepend() will panic since the buffer address has been freed earlier via trim_both_sgl(). One way to fix it is to move the aead request allocation out of tls_do_encryption() early into tls_push_record(). This means we don't prep the tls header and advance state to the TLS_PENDING_CLOSED_RECORD before allocation which could potentially fail happened. That fixes the issue on my side. Fixes: 3c4d7559159b ("tls: kernel TLS support") Reported-by: syzbot+5c74af81c547738e1684@syzkaller.appspotmail.com Reported-by: syzbot+709f2810a6a05f11d4d3@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Acked-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 34895b7c132d..2945a3bd538c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -191,18 +191,12 @@ static void tls_free_both_sg(struct sock *sk) } static int tls_do_encryption(struct tls_context *tls_ctx, - struct tls_sw_context_tx *ctx, size_t data_len, - gfp_t flags) + struct tls_sw_context_tx *ctx, + struct aead_request *aead_req, + size_t data_len) { - unsigned int req_size = sizeof(struct aead_request) + - crypto_aead_reqsize(ctx->aead_send); - struct aead_request *aead_req; int rc; - aead_req = kzalloc(req_size, flags); - if (!aead_req) - return -ENOMEM; - ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size; ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size; @@ -219,7 +213,6 @@ static int tls_do_encryption(struct tls_context *tls_ctx, ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; - kfree(aead_req); return rc; } @@ -228,8 +221,14 @@ static int tls_push_record(struct sock *sk, int flags, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct aead_request *req; int rc; + req = kzalloc(sizeof(struct aead_request) + + crypto_aead_reqsize(ctx->aead_send), sk->sk_allocation); + if (!req) + return -ENOMEM; + sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); @@ -245,15 +244,14 @@ static int tls_push_record(struct sock *sk, int flags, tls_ctx->pending_open_record_frags = 0; set_bit(TLS_PENDING_CLOSED_RECORD, &tls_ctx->flags); - rc = tls_do_encryption(tls_ctx, ctx, ctx->sg_plaintext_size, - sk->sk_allocation); + rc = tls_do_encryption(tls_ctx, ctx, req, ctx->sg_plaintext_size); if (rc < 0) { /* If we are called from write_space and * we fail, we need to set this SOCK_NOSPACE * to trigger another write_space in the future. */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - return rc; + goto out_req; } free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, @@ -268,6 +266,8 @@ static int tls_push_record(struct sock *sk, int flags, tls_err_abort(sk, EBADMSG); tls_advance_record_sn(sk, &tls_ctx->tx); +out_req: + kfree(req); return rc; } -- cgit v1.2.3 From 06030dbaf3b6c5801dcdb7fe4fbab3b91c8da84a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 Jun 2018 03:07:46 +0200 Subject: tls: fix waitall behavior in tls_sw_recvmsg Current behavior in tls_sw_recvmsg() is to wait for incoming tls messages and copy up to exactly len bytes of data that the user provided. This is problematic in the sense that i) if no packet is currently queued in strparser we keep waiting until one has been processed and pushed into tls receive layer for tls_wait_data() to wake up and push the decrypted bits to user space. Given after tls decryption, we're back at streaming data, use sock_rcvlowat() hint from tcp socket instead. Retain current behavior with MSG_WAITALL flag and otherwise use the hint target for breaking the loop and returning to application. This is done if currently no ctx->recv_pkt is ready, otherwise continue to process it from our strparser backlog. Fixes: c46234ebb4d1 ("tls: RX path for ktls") Signed-off-by: Daniel Borkmann Acked-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 2945a3bd538c..f127fac88acf 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -754,7 +754,7 @@ int tls_sw_recvmsg(struct sock *sk, struct sk_buff *skb; ssize_t copied = 0; bool cmsg = false; - int err = 0; + int target, err = 0; long timeo; flags |= nonblock; @@ -764,6 +764,7 @@ int tls_sw_recvmsg(struct sock *sk, lock_sock(sk); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { bool zc = false; @@ -856,6 +857,9 @@ fallback_to_reg_recv: goto recv_end; } } + /* If we have a new message from strparser, continue now. */ + if (copied >= target && !ctx->recv_pkt) + break; } while (len); recv_end: -- cgit v1.2.3 From f6a6f203d507aae3a06a8de79c6f0ecc4658b81c Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 12 Jun 2018 21:26:10 -0700 Subject: neighbour: skip NTF_EXT_LEARNED entries during forced gc Commit 9ce33e46531d ("neighbour: support for NTF_EXT_LEARNED flag") added support for NTF_EXT_LEARNED for neighbour entries. NTF_EXT_LEARNED entries are neigh entries managed by control plane (eg: Ethernet VPN implementation in FRR routing suite). Periodic gc already excludes these entries. This patch extends it to forced gc which the earlier patch missed. Fixes: 9ce33e46531d ("neighbour: support for NTF_EXT_LEARNED flag") Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/neighbour.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a7a9c3d738ba..8e3fda9e725c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -119,13 +119,14 @@ unsigned long neigh_rand_reach_time(unsigned long base) EXPORT_SYMBOL(neigh_rand_reach_time); -static bool neigh_del(struct neighbour *n, __u8 state, +static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, struct neighbour __rcu **np, struct neigh_table *tbl) { bool retval = false; write_lock(&n->lock); - if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state)) { + if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state) && + !(n->flags & flags)) { struct neighbour *neigh; neigh = rcu_dereference_protected(n->next, @@ -157,7 +158,7 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) while ((n = rcu_dereference_protected(*np, lockdep_is_held(&tbl->lock)))) { if (n == ndel) - return neigh_del(n, 0, np, tbl); + return neigh_del(n, 0, 0, np, tbl); np = &n->next; } return false; @@ -185,7 +186,8 @@ static int neigh_forced_gc(struct neigh_table *tbl) * - nobody refers to it. * - it is not permanent */ - if (neigh_del(n, NUD_PERMANENT, np, tbl)) { + if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np, + tbl)) { shrunk = 1; continue; } -- cgit v1.2.3