From 4da402597c2b75c4b636ab2416baf921b363b325 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Sun, 30 Sep 2018 15:06:06 +0800 Subject: xfrm: fix gro_cells leak when remove virtual xfrm interfaces The device gro_cells has been initialized, it should be freed, otherwise it will be leaked Fixes: f203b76d78092faf2 ("xfrm: Add virtual xfrm interfaces") Signed-off-by: Zhang Yu Signed-off-by: Li RongQing Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 31acc6f33d98..6f05e831a73e 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -116,6 +116,9 @@ static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) static void xfrmi_dev_free(struct net_device *dev) { + struct xfrm_if *xi = netdev_priv(dev); + + gro_cells_destroy(&xi->gro_cells); free_percpu(dev->tstats); } -- cgit v1.2.3 From cee271678d0e3177a25d0fcb2fa5e051d48e4262 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 8 Oct 2018 19:40:16 +0200 Subject: xsk: do not call synchronize_net() under RCU read lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XSKMAP update and delete functions called synchronize_net(), which can sleep. It is not allowed to sleep during an RCU read section. Instead we need to make sure that the sock sk_destruct (xsk_destruct) function is asynchronously called after an RCU grace period. Setting the SOCK_RCU_FREE flag for XDP sockets takes care of this. Fixes: fbfc504a24f5 ("bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP") Reported-by: Eric Dumazet Signed-off-by: Björn Töpel Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 10 ++-------- net/xdp/xsk.c | 2 ++ 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 9f8463afda9c..47147c9e184d 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -192,11 +192,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, sock_hold(sock->sk); old_xs = xchg(&m->xsk_map[i], xs); - if (old_xs) { - /* Make sure we've flushed everything. */ - synchronize_net(); + if (old_xs) sock_put((struct sock *)old_xs); - } sockfd_put(sock); return 0; @@ -212,11 +209,8 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; old_xs = xchg(&m->xsk_map[k], NULL); - if (old_xs) { - /* Make sure we've flushed everything. */ - synchronize_net(); + if (old_xs) sock_put((struct sock *)old_xs); - } return 0; } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4e937cd7c17d..661504042d30 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -744,6 +744,8 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, sk->sk_destruct = xsk_destruct; sk_refcnt_debug_inc(sk); + sock_set_flag(sk, SOCK_RCU_FREE); + xs = xdp_sk(sk); mutex_init(&xs->mutex); spin_lock_init(&xs->tx_completion_lock); -- cgit v1.2.3 From 9f7e43da6ae4862b48bac233838ba808c1167a0d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Oct 2018 09:59:36 -0700 Subject: net/xfrm: fix out-of-bounds packet access BUG: KASAN: slab-out-of-bounds in _decode_session6+0x1331/0x14e0 net/ipv6/xfrm6_policy.c:161 Read of size 1 at addr ffff8801d882eec7 by task syz-executor1/6667 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412 __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430 _decode_session6+0x1331/0x14e0 net/ipv6/xfrm6_policy.c:161 __xfrm_decode_session+0x71/0x140 net/xfrm/xfrm_policy.c:2299 xfrm_decode_session include/net/xfrm.h:1232 [inline] vti6_tnl_xmit+0x3c3/0x1bc1 net/ipv6/ip6_vti.c:542 __netdev_start_xmit include/linux/netdevice.h:4313 [inline] netdev_start_xmit include/linux/netdevice.h:4322 [inline] xmit_one net/core/dev.c:3217 [inline] dev_hard_start_xmit+0x272/0xc10 net/core/dev.c:3233 __dev_queue_xmit+0x2ab2/0x3870 net/core/dev.c:3803 dev_queue_xmit+0x17/0x20 net/core/dev.c:3836 Reported-by: syzbot+acffccec848dc13fe459@syzkaller.appspotmail.com Reported-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_policy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index ef3defaf43b9..d35bcf92969c 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -146,8 +146,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) fl6->daddr = reverse ? hdr->saddr : hdr->daddr; fl6->saddr = reverse ? hdr->daddr : hdr->saddr; - while (nh + offset + 1 < skb->data || - pskb_may_pull(skb, nh + offset + 1 - skb->data)) { + while (nh + offset + sizeof(*exthdr) < skb->data || + pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) { nh = skb_network_header(skb); exthdr = (struct ipv6_opt_hdr *)(nh + offset); -- cgit v1.2.3 From 9dffff200fd178f11dd50eb1fd8ccd0650c9284e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Oct 2018 18:02:21 +0200 Subject: xfrm: policy: use hlist rcu variants on insert bydst table/list lookups use rcu, so insertions must use rcu versions. Fixes: a7c44247f704e ("xfrm: policy: make xfrm_policy_lookup_bytype lockless") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f094d4b3520d..119a427d9b2b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -632,9 +632,9 @@ static void xfrm_hash_rebuild(struct work_struct *work) break; } if (newpos) - hlist_add_behind(&policy->bydst, newpos); + hlist_add_behind_rcu(&policy->bydst, newpos); else - hlist_add_head(&policy->bydst, chain); + hlist_add_head_rcu(&policy->bydst, chain); } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); @@ -774,9 +774,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) break; } if (newpos) - hlist_add_behind(&policy->bydst, newpos); + hlist_add_behind_rcu(&policy->bydst, newpos); else - hlist_add_head(&policy->bydst, chain); + hlist_add_head_rcu(&policy->bydst, chain); __xfrm_policy_link(policy, dir); /* After previous checking, family can either be AF_INET or AF_INET6 */ -- cgit v1.2.3 From 2bb3207dbbd4d30e96dd0e1c8e013104193bd59c Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Tue, 9 Oct 2018 08:15:38 -0500 Subject: ethtool: fix a missing-check bug In ethtool_get_rxnfc(), the eth command 'cmd' is compared against 'ETHTOOL_GRXFH' to see whether it is necessary to adjust the variable 'info_size'. Then the whole structure of 'info' is copied from the user-space buffer 'useraddr' with 'info_size' bytes. In the following execution, 'info' may be copied again from the buffer 'useraddr' depending on the 'cmd' and the 'info.flow_type'. However, after these two copies, there is no check between 'cmd' and 'info.cmd'. In fact, 'cmd' is also copied from the buffer 'useraddr' in dev_ethtool(), which is the caller function of ethtool_get_rxnfc(). Given that 'useraddr' is in the user space, a malicious user can race to change the eth command in the buffer between these copies. By doing so, the attacker can supply inconsistent data and cause undefined behavior because in the following execution 'info' will be passed to ops->get_rxnfc(). This patch adds a necessary check on 'info.cmd' and 'cmd' to confirm that they are still same after the two copies in ethtool_get_rxnfc(). Otherwise, an error code EINVAL will be returned. Signed-off-by: Wenwen Wang Signed-off-by: David S. Miller --- net/core/ethtool.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 0762aaf8e964..192f2f76b7bd 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1015,6 +1015,9 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, return -EINVAL; } + if (info.cmd != cmd) + return -EINVAL; + if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) -- cgit v1.2.3 From 58f5bbe331c566f49c9559568f982202a278aa78 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Mon, 8 Oct 2018 10:49:35 -0500 Subject: ethtool: fix a privilege escalation bug In dev_ethtool(), the eth command 'ethcmd' is firstly copied from the use-space buffer 'useraddr' and checked to see whether it is ETHTOOL_PERQUEUE. If yes, the sub-command 'sub_cmd' is further copied from the user space. Otherwise, 'sub_cmd' is the same as 'ethcmd'. Next, according to 'sub_cmd', a permission check is enforced through the function ns_capable(). For example, the permission check is required if 'sub_cmd' is ETHTOOL_SCOALESCE, but it is not necessary if 'sub_cmd' is ETHTOOL_GCOALESCE, as suggested in the comment "Allow some commands to be done by anyone". The following execution invokes different handlers according to 'ethcmd'. Specifically, if 'ethcmd' is ETHTOOL_PERQUEUE, ethtool_set_per_queue() is called. In ethtool_set_per_queue(), the kernel object 'per_queue_opt' is copied again from the user-space buffer 'useraddr' and 'per_queue_opt.sub_command' is used to determine which operation should be performed. Given that the buffer 'useraddr' is in the user space, a malicious user can race to change the sub-command between the two copies. In particular, the attacker can supply ETHTOOL_PERQUEUE and ETHTOOL_GCOALESCE to bypass the permission check in dev_ethtool(). Then before ethtool_set_per_queue() is called, the attacker changes ETHTOOL_GCOALESCE to ETHTOOL_SCOALESCE. In this way, the attacker can bypass the permission check and execute ETHTOOL_SCOALESCE. This patch enforces a check in ethtool_set_per_queue() after the second copy from 'useraddr'. If the sub-command is different from the one obtained in the first copy in dev_ethtool(), an error code EINVAL will be returned. Fixes: f38d138a7da6 ("net/ethtool: support set coalesce per queue") Signed-off-by: Wenwen Wang Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/core/ethtool.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 192f2f76b7bd..aeabc4831fca 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -2472,13 +2472,17 @@ roll_back: return ret; } -static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr) +static int ethtool_set_per_queue(struct net_device *dev, + void __user *useraddr, u32 sub_cmd) { struct ethtool_per_queue_op per_queue_opt; if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt))) return -EFAULT; + if (per_queue_opt.sub_command != sub_cmd) + return -EINVAL; + switch (per_queue_opt.sub_command) { case ETHTOOL_GCOALESCE: return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); @@ -2849,7 +2853,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) rc = ethtool_get_phy_stats(dev, useraddr); break; case ETHTOOL_PERQUEUE: - rc = ethtool_set_per_queue(dev, useraddr); + rc = ethtool_set_per_queue(dev, useraddr, sub_cmd); break; case ETHTOOL_GLINKSETTINGS: rc = ethtool_get_link_ksettings(dev, useraddr); -- cgit v1.2.3 From e331473fee3d500bb0d2582a1fe598df3326d8cd Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 10 Oct 2018 22:00:58 +0200 Subject: net/sched: cls_api: add missing validation of netlink attributes Similarly to what has been done in 8b4c3cdd9dd8 ("net: sched: Add policy validation for tc attributes"), fix classifier code to add validation of TCA_CHAIN and TCA_KIND netlink attributes. tested with: # ./tdc.py -c filter v2: Let sch_api and cls_api share nla_policy they have in common, thanks to David Ahern. v3: Avoid EXPORT_SYMBOL(), as validation of those attributes is not done by TC modules, thanks to Cong Wang. While at it, restore the 'Delete / get qdisc' comment to its orginal position, just above tc_get_qdisc() function prototype. Fixes: 5bc1701881e39 ("net: sched: introduce multichain support for filters") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/cls_api.c | 13 ++++++++----- net/sched/sch_api.c | 8 ++++---- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0a75cb2e5e7b..70f144ac5e1d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -31,6 +31,8 @@ #include #include +extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; + /* The list of all installed classifier types */ static LIST_HEAD(tcf_proto_base); @@ -1211,7 +1213,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, replay: tp_created = 0; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; @@ -1360,7 +1362,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; @@ -1475,7 +1477,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *fh = NULL; int err; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; @@ -1838,7 +1840,7 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, return -EPERM; replay: - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; @@ -1949,7 +1951,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; - err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + NULL); if (err) return err; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 85e73f48e48f..6684641ea344 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1307,10 +1307,6 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) return 0; } -/* - * Delete/get qdisc. - */ - const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { [TCA_KIND] = { .type = NLA_STRING }, [TCA_OPTIONS] = { .type = NLA_NESTED }, @@ -1323,6 +1319,10 @@ const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { [TCA_EGRESS_BLOCK] = { .type = NLA_U32 }, }; +/* + * Delete/get qdisc. + */ + static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { -- cgit v1.2.3 From 5a8e7aea953bdb6d4da13aff6f1e7f9c62023499 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 11 Oct 2018 11:15:13 -0700 Subject: llc: set SOCK_RCU_FREE in llc_sap_add_socket() WHen an llc sock is added into the sk_laddr_hash of an llc_sap, it is not marked with SOCK_RCU_FREE. This causes that the sock could be freed while it is still being read by __llc_lookup_established() with RCU read lock. sock is refcounted, but with RCU read lock, nothing prevents the readers getting a zero refcnt. Fix it by setting SOCK_RCU_FREE in llc_sap_add_socket(). Reported-by: syzbot+11e05f04c15e03be5254@syzkaller.appspotmail.com Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/llc/llc_conn.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index c0ac522b48a1..4ff89cb7c86f 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -734,6 +734,7 @@ void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk) llc_sk(sk)->sap = sap; spin_lock_bh(&sap->sk_lock); + sock_set_flag(sk, SOCK_RCU_FREE); sap->sk_count++; sk_nulls_add_node_rcu(sk, laddr_hb); hlist_add_head(&llc->dev_hash_node, dev_hb); -- cgit v1.2.3 From 4af00f4cc1ba34da4654ac31830843cae871642d Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 11 Oct 2018 22:02:29 +0200 Subject: tipc: initialize broadcast link stale counter correctly In the commit referred to below we added link tolerance as an additional criteria for declaring broadcast transmission "stale" and resetting the unicast links to the affected node. Unfortunately, this 'improvement' introduced two bugs, which each and one alone cause only limited problems, but combined lead to seemingly stochastic unicast link resets, depending on the amount of broadcast traffic transmitted. The first issue, a missing initialization of the 'tolerance' field of the receiver broadcast link, was recently fixed by commit 047491ea334a ("tipc: set link tolerance correctly in broadcast link"). Ths second issue, where we omit to reset the 'stale_cnt' field of the same link after a 'stale' period is over, leads to this counter accumulating over time, and in the absence of the 'tolerance' criteria leads to the above described symptoms. This commit adds the missing initialization. Fixes: a4dc70d46cf1 ("tipc: extend link reset criteria for stale packet retransmission") Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/link.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index f6552e4f4b43..201c3b5bc96b 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1041,6 +1041,7 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, if (r->last_retransm != buf_seqno(skb)) { r->last_retransm = buf_seqno(skb); r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance); + r->stale_cnt = 0; } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) { link_retransmit_failure(l, skb); if (link_is_bc_sndlink(l)) -- cgit v1.2.3 From d7b4c24f45d2efe51b8f213da4593fefd49240ba Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 11 Oct 2018 22:32:31 +0100 Subject: rxrpc: Fix an uninitialised variable Fix an uninitialised variable introduced by the last patch. This can cause a crash when a new call comes in to a local service, such as when an AFS fileserver calls back to the local cache manager. Fixes: c1e15b4944c9 ("rxrpc: Fix the packet reception routine") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/call_accept.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 652e314de38e..8079aacaecac 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -337,7 +337,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_connection *conn; - struct rxrpc_peer *peer; + struct rxrpc_peer *peer = NULL; struct rxrpc_call *call; _enter(""); -- cgit v1.2.3 From d6672a5a97918f92bf2f3a2591f25d02bb0897a4 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 11 Oct 2018 22:32:39 +0100 Subject: rxrpc: use correct kvec num when sending BUSY response packet Fixes gcc '-Wunused-but-set-variable' warning: net/rxrpc/output.c: In function 'rxrpc_reject_packets': net/rxrpc/output.c:527:11: warning: variable 'ioc' set but not used [-Wunused-but-set-variable] 'ioc' is the correct kvec num when sending a BUSY (or an ABORT) response packet. Fixes: ece64fec164f ("rxrpc: Emit BUSY packets when supposed to rather than ABORTs") Signed-off-by: YueHaibing Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index e8fb8922bca8..a141ee3ab812 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -572,7 +572,8 @@ void rxrpc_reject_packets(struct rxrpc_local *local) whdr.flags ^= RXRPC_CLIENT_INITIATED; whdr.flags &= RXRPC_CLIENT_INITIATED; - ret = kernel_sendmsg(local->socket, &msg, iov, 2, size); + ret = kernel_sendmsg(local->socket, &msg, + iov, ioc, size); if (ret < 0) trace_rxrpc_tx_fail(local->debug_id, 0, ret, rxrpc_tx_point_reject); -- cgit v1.2.3 From f547fac624be53ad8b07e9ebca7654a7827ba61b Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 12 Oct 2018 16:22:47 +0200 Subject: ipv6: rate-limit probes for neighbourless routes When commit 270972554c91 ("[IPV6]: ROUTE: Add Router Reachability Probing (RFC4191).") introduced router probing, the rt6_probe() function required that a neighbour entry existed. This neighbour entry is used to record the timestamp of the last probe via the ->updated field. Later, commit 2152caea7196 ("ipv6: Do not depend on rt->n in rt6_probe().") removed the requirement for a neighbour entry. Neighbourless routes skip the interval check and are not rate-limited. This patch adds rate-limiting for neighbourless routes, by recording the timestamp of the last probe in the fib6_info itself. Fixes: 2152caea7196 ("ipv6: Do not depend on rt->n in rt6_probe().") Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 4 ++++ net/ipv6/route.c | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 3d4930528db0..2d31e22babd8 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -159,6 +159,10 @@ struct fib6_info { struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; +#ifdef CONFIG_IPV6_ROUTER_PREF + unsigned long last_probe; +#endif + u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a366c05a239d..abcb5ae77319 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -520,10 +520,11 @@ static void rt6_probe_deferred(struct work_struct *w) static void rt6_probe(struct fib6_info *rt) { - struct __rt6_probe_work *work; + struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; struct neighbour *neigh; struct net_device *dev; + struct inet6_dev *idev; /* * Okay, this does not seem to be appropriate @@ -539,15 +540,12 @@ static void rt6_probe(struct fib6_info *rt) nh_gw = &rt->fib6_nh.nh_gw; dev = rt->fib6_nh.nh_dev; rcu_read_lock_bh(); + idev = __in6_dev_get(dev); neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { - struct inet6_dev *idev; - if (neigh->nud_state & NUD_VALID) goto out; - idev = __in6_dev_get(dev); - work = NULL; write_lock(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, @@ -557,11 +555,13 @@ static void rt6_probe(struct fib6_info *rt) __neigh_set_probe_once(neigh); } write_unlock(&neigh->lock); - } else { + } else if (time_after(jiffies, rt->last_probe + + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } if (work) { + rt->last_probe = jiffies; INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; dev_hold(dev); -- cgit v1.2.3 From 7ec8dc96e1cb45693f28f1287802ef6f2888dae0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 12 Oct 2018 16:38:36 +0100 Subject: rxrpc: Fix incorrect conditional on IPV6 The udpv6_encap_enable() function is part of the ipv6 code, and if that is configured as a loadable module and rxrpc is built in then a build failure will occur because the conditional check is wrong: net/rxrpc/local_object.o: In function `rxrpc_lookup_local': local_object.c:(.text+0x2688): undefined reference to `udpv6_encap_enable' Use the correct config symbol (CONFIG_AF_RXRPC_IPV6) in the conditional check rather than CONFIG_IPV6 as that will do the right thing. Fixes: 5271953cad31 ("rxrpc: Use the UDP encap_rcv hook") Reported-by: kbuild-all@01.org Reported-by: Arnd Bergmann Signed-off-by: David Howells Reviewed-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/rxrpc/local_object.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index cad0691c2bb4..0906e51d3cfb 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -139,7 +139,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) udp_sk(usk)->gro_complete = NULL; udp_encap_enable(); -#if IS_ENABLED(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_AF_RXRPC_IPV6) if (local->srx.transport.family == AF_INET6) udpv6_encap_enable(); #endif -- cgit v1.2.3 From d3092b2efca1cd1d492d0b08499a2066c5ca8cec Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Fri, 12 Oct 2018 22:46:55 +0200 Subject: tipc: fix unsafe rcu locking when accessing publication list The binding table's 'cluster_scope' list is rcu protected to handle races between threads changing the list and those traversing the list at the same moment. We have now found that the function named_distribute() uses the regular list_for_each() macro to traverse the said list. Likewise, the function tipc_named_withdraw() is removing items from the same list using the regular list_del() call. When these two functions execute in parallel we see occasional crashes. This commit fixes this by adding the missing _rcu() suffixes. Signed-off-by: Tung Nguyen Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_distr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 51b4b96f89db..3cfeb9df64b0 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -115,7 +115,7 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) struct sk_buff *buf; struct distr_item *item; - list_del(&publ->binding_node); + list_del_rcu(&publ->binding_node); if (publ->scope == TIPC_NODE_SCOPE) return NULL; @@ -147,7 +147,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; - list_for_each_entry(publ, pls, binding_node) { + list_for_each_entry_rcu(publ, pls, binding_node) { /* Prepare next buffer: */ if (!skb) { skb = named_prepare_buf(net, PUBLICATION, msg_rem, -- cgit v1.2.3 From dc012f3628eaecfb5ba68404a5c30ef501daf63d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Oct 2018 18:58:53 -0700 Subject: ipv6: mcast: fix a use-after-free in inet6_mc_check syzbot found a use-after-free in inet6_mc_check [1] The problem here is that inet6_mc_check() uses rcu and read_lock(&iml->sflock) So the fact that ip6_mc_leave_src() is called under RTNL and the socket lock does not help us, we need to acquire iml->sflock in write mode. In the future, we should convert all this stuff to RCU. [1] BUG: KASAN: use-after-free in ipv6_addr_equal include/net/ipv6.h:521 [inline] BUG: KASAN: use-after-free in inet6_mc_check+0xae7/0xb40 net/ipv6/mcast.c:649 Read of size 8 at addr ffff8801ce7f2510 by task syz-executor0/22432 CPU: 1 PID: 22432 Comm: syz-executor0 Not tainted 4.19.0-rc7+ #280 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c4/0x2b4 lib/dump_stack.c:113 print_address_description.cold.8+0x9/0x1ff mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433 ipv6_addr_equal include/net/ipv6.h:521 [inline] inet6_mc_check+0xae7/0xb40 net/ipv6/mcast.c:649 __raw_v6_lookup+0x320/0x3f0 net/ipv6/raw.c:98 ipv6_raw_deliver net/ipv6/raw.c:183 [inline] raw6_local_deliver+0x3d3/0xcb0 net/ipv6/raw.c:240 ip6_input_finish+0x467/0x1aa0 net/ipv6/ip6_input.c:345 NF_HOOK include/linux/netfilter.h:289 [inline] ip6_input+0xe9/0x600 net/ipv6/ip6_input.c:426 ip6_mc_input+0x48a/0xd20 net/ipv6/ip6_input.c:503 dst_input include/net/dst.h:450 [inline] ip6_rcv_finish+0x17a/0x330 net/ipv6/ip6_input.c:76 NF_HOOK include/linux/netfilter.h:289 [inline] ipv6_rcv+0x120/0x640 net/ipv6/ip6_input.c:271 __netif_receive_skb_one_core+0x14d/0x200 net/core/dev.c:4913 __netif_receive_skb+0x2c/0x1e0 net/core/dev.c:5023 netif_receive_skb_internal+0x12c/0x620 net/core/dev.c:5126 napi_frags_finish net/core/dev.c:5664 [inline] napi_gro_frags+0x75a/0xc90 net/core/dev.c:5737 tun_get_user+0x3189/0x4250 drivers/net/tun.c:1923 tun_chr_write_iter+0xb9/0x154 drivers/net/tun.c:1968 call_write_iter include/linux/fs.h:1808 [inline] do_iter_readv_writev+0x8b0/0xa80 fs/read_write.c:680 do_iter_write+0x185/0x5f0 fs/read_write.c:959 vfs_writev+0x1f1/0x360 fs/read_write.c:1004 do_writev+0x11a/0x310 fs/read_write.c:1039 __do_sys_writev fs/read_write.c:1112 [inline] __se_sys_writev fs/read_write.c:1109 [inline] __x64_sys_writev+0x75/0xb0 fs/read_write.c:1109 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457421 Code: 75 14 b8 14 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83 34 b5 fb ff c3 48 83 ec 08 e8 1a 2d 00 00 48 89 04 24 b8 14 00 00 00 0f 05 <48> 8b 3c 24 48 89 c2 e8 63 2d 00 00 48 89 d0 48 83 c4 08 48 3d 01 RSP: 002b:00007f2d30ecaba0 EFLAGS: 00000293 ORIG_RAX: 0000000000000014 RAX: ffffffffffffffda RBX: 000000000000003e RCX: 0000000000457421 RDX: 0000000000000001 RSI: 00007f2d30ecabf0 RDI: 00000000000000f0 RBP: 0000000020000500 R08: 00000000000000f0 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000293 R12: 00007f2d30ecb6d4 R13: 00000000004c4890 R14: 00000000004d7b90 R15: 00000000ffffffff Allocated by task 22437: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553 __do_kmalloc mm/slab.c:3718 [inline] __kmalloc+0x14e/0x760 mm/slab.c:3727 kmalloc include/linux/slab.h:518 [inline] sock_kmalloc+0x15a/0x1f0 net/core/sock.c:1983 ip6_mc_source+0x14dd/0x1960 net/ipv6/mcast.c:427 do_ipv6_setsockopt.isra.9+0x3afb/0x45d0 net/ipv6/ipv6_sockglue.c:743 ipv6_setsockopt+0xbd/0x170 net/ipv6/ipv6_sockglue.c:933 rawv6_setsockopt+0x59/0x140 net/ipv6/raw.c:1069 sock_common_setsockopt+0x9a/0xe0 net/core/sock.c:3038 __sys_setsockopt+0x1ba/0x3c0 net/socket.c:1902 __do_sys_setsockopt net/socket.c:1913 [inline] __se_sys_setsockopt net/socket.c:1910 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1910 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 22430: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/kasan.c:521 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528 __cache_free mm/slab.c:3498 [inline] kfree+0xcf/0x230 mm/slab.c:3813 __sock_kfree_s net/core/sock.c:2004 [inline] sock_kfree_s+0x29/0x60 net/core/sock.c:2010 ip6_mc_leave_src+0x11a/0x1d0 net/ipv6/mcast.c:2448 __ipv6_sock_mc_close+0x20b/0x4e0 net/ipv6/mcast.c:310 ipv6_sock_mc_close+0x158/0x1d0 net/ipv6/mcast.c:328 inet6_release+0x40/0x70 net/ipv6/af_inet6.c:452 __sock_release+0xd7/0x250 net/socket.c:579 sock_close+0x19/0x20 net/socket.c:1141 __fput+0x385/0xa30 fs/file_table.c:278 ____fput+0x15/0x20 fs/file_table.c:309 task_work_run+0x1e8/0x2a0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:193 [inline] exit_to_usermode_loop+0x318/0x380 arch/x86/entry/common.c:166 prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline] syscall_return_slowpath arch/x86/entry/common.c:268 [inline] do_syscall_64+0x6be/0x820 arch/x86/entry/common.c:293 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff8801ce7f2500 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 16 bytes inside of 192-byte region [ffff8801ce7f2500, ffff8801ce7f25c0) The buggy address belongs to the page: page:ffffea000739fc80 count:1 mapcount:0 mapping:ffff8801da800040 index:0x0 flags: 0x2fffc0000000100(slab) raw: 02fffc0000000100 ffffea0006f6e548 ffffea000737b948 ffff8801da800040 raw: 0000000000000000 ffff8801ce7f2000 0000000100000010 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801ce7f2400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8801ce7f2480: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc >ffff8801ce7f2500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8801ce7f2580: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff8801ce7f2600: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 4ae54aaca373..dbab62e3f0d7 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2436,17 +2436,17 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, { int err; - /* callers have the socket lock and rtnl lock - * so no other readers or writers of iml or its sflist - */ + write_lock_bh(&iml->sflock); if (!iml->sflist) { /* any-source empty exclude case */ - return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); + err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); + } else { + err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, + iml->sflist->sl_count, iml->sflist->sl_addr, 0); + sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max)); + iml->sflist = NULL; } - err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, - iml->sflist->sl_count, iml->sflist->sl_addr, 0); - sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max)); - iml->sflist = NULL; + write_unlock_bh(&iml->sflock); return err; } -- cgit v1.2.3 From d805397c3822d57ca3884d4bea37b2291fc40992 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Oct 2018 19:58:29 +0800 Subject: sctp: use the pmtu from the icmp packet to update transport pathmtu Other than asoc pmtu sync from all transports, sctp_assoc_sync_pmtu is also processing transport pmtu_pending by icmp packets. But it's meaningless to use sctp_dst_mtu(t->dst) as new pmtu for a transport. The right pmtu value should come from the icmp packet, and it would be saved into transport->mtu_info in this patch and used later when the pmtu sync happens in sctp_sendmsg_to_asoc or sctp_packet_config. Besides, without this patch, as pmtu can only be updated correctly when receiving a icmp packet and no place is holding sock lock, it will take long time if the sock is busy with sending packets. Note that it doesn't process transport->mtu_info in .release_cb(), as there is no enough information for pmtu update, like for which asoc or transport. It is not worth traversing all asocs to check pmtu_pending. So unlike tcp, sctp does this in tx path, for which mtu_info needs to be atomic_t. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 ++ net/sctp/associola.c | 3 ++- net/sctp/input.c | 1 + net/sctp/output.c | 6 ++++++ 4 files changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 28a7c8e44636..a11f93790476 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -876,6 +876,8 @@ struct sctp_transport { unsigned long sackdelay; __u32 sackfreq; + atomic_t mtu_info; + /* When was the last time that we heard from this transport? We use * this to pick new active and retran paths. */ diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 297d9cf960b9..a827a1f562bf 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1450,7 +1450,8 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) /* Get the lowest pmtu of all the transports. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (t->pmtu_pending && t->dst) { - sctp_transport_update_pmtu(t, sctp_dst_mtu(t->dst)); + sctp_transport_update_pmtu(t, + atomic_read(&t->mtu_info)); t->pmtu_pending = 0; } if (!pmtu || (t->pathmtu < pmtu)) diff --git a/net/sctp/input.c b/net/sctp/input.c index 9bbc5f92c941..5c36a99882ed 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -395,6 +395,7 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, return; if (sock_owned_by_user(sk)) { + atomic_set(&t->mtu_info, pmtu); asoc->pmtu_pending = 1; t->pmtu_pending = 1; return; diff --git a/net/sctp/output.c b/net/sctp/output.c index 7f849b01ec8e..67939ad99c01 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -120,6 +120,12 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, sctp_assoc_sync_pmtu(asoc); } + if (asoc->pmtu_pending) { + if (asoc->param_flags & SPP_PMTUD_ENABLE) + sctp_assoc_sync_pmtu(asoc); + asoc->pmtu_pending = 0; + } + /* If there a is a prepend chunk stick it on the list before * any other chunks get appended. */ -- cgit v1.2.3 From 1890fea7936ad9be0b7caf6a94146b0d905c4b60 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 15 Oct 2018 22:37:21 +0100 Subject: rxrpc: Fix a missing rxrpc_put_peer() in the error_report handler Fix a missing call to rxrpc_put_peer() on the main path through the rxrpc_error_report() function. This manifests itself as a ref leak whenever an ICMP packet or other error comes in. In commit f334430316e7, the hand-off of the ref to a work item was removed and was not replaced with a put. Fixes: f334430316e7 ("rxrpc: Fix error distribution") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/peer_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 05b51bdbdd41..bd2fa3b7caa7 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -195,6 +195,7 @@ void rxrpc_error_report(struct sock *sk) rxrpc_store_error(peer, serr); rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_put_peer(peer); _leave(""); } -- cgit v1.2.3 From 0ac1077e3a549bf8d35971613e2be05bdbb41a00 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 16 Oct 2018 15:52:02 +0800 Subject: sctp: get pr_assoc and pr_stream all status with SCTP_PR_SCTP_ALL instead According to rfc7496 section 4.3 or 4.4: sprstat_policy: This parameter indicates for which PR-SCTP policy the user wants the information. It is an error to use SCTP_PR_SCTP_NONE in sprstat_policy. If SCTP_PR_SCTP_ALL is used, the counters provided are aggregated over all supported policies. We change to dump pr_assoc and pr_stream all status by SCTP_PR_SCTP_ALL instead, and return error for SCTP_PR_SCTP_NONE, as it also said "It is an error to use SCTP_PR_SCTP_NONE in sprstat_policy. " Fixes: 826d253d57b1 ("sctp: add SCTP_PR_ASSOC_STATUS on sctp sockopt") Fixes: d229d48d183f ("sctp: add SCTP_PR_STREAM_STATUS sockopt for prsctp") Reported-by: Ying Xu Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 1 + net/sctp/socket.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index b479db5c71d9..34dd3d497f2c 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -301,6 +301,7 @@ enum sctp_sinfo_flags { SCTP_SACK_IMMEDIATELY = (1 << 3), /* SACK should be sent without delay. */ /* 2 bits here have been used by SCTP_PR_SCTP_MASK */ SCTP_SENDALL = (1 << 6), + SCTP_PR_SCTP_ALL = (1 << 7), SCTP_NOTIFICATION = MSG_NOTIFICATION, /* Next message is not user msg but notification. */ SCTP_EOF = MSG_FIN, /* Initiate graceful shutdown process. */ }; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index f73e9d38d5ba..e25a20fc629a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7100,14 +7100,14 @@ static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len, } policy = params.sprstat_policy; - if (policy & ~SCTP_PR_SCTP_MASK) + if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); if (!asoc) goto out; - if (policy == SCTP_PR_SCTP_NONE) { + if (policy & SCTP_PR_SCTP_ALL) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { @@ -7159,7 +7159,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, } policy = params.sprstat_policy; - if (policy & ~SCTP_PR_SCTP_MASK) + if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); @@ -7175,7 +7175,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, goto out; } - if (policy == SCTP_PR_SCTP_NONE) { + if (policy == SCTP_PR_SCTP_ALL) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { -- cgit v1.2.3 From 84258438e8ce12d6888b68a1238bba9cb25307e2 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 17 Oct 2018 00:35:10 +0900 Subject: net: bpfilter: use get_pid_task instead of pid_task pid_task() dereferences rcu protected tasks array. But there is no rcu_read_lock() in shutdown_umh() routine so that rcu_read_lock() is needed. get_pid_task() is wrapper function of pid_task. it holds rcu_read_lock() then calls pid_task(). if task isn't NULL, it increases reference count of task. test commands: %modprobe bpfilter %modprobe -rv bpfilter splat looks like: [15102.030932] ============================= [15102.030957] WARNING: suspicious RCU usage [15102.030985] 4.19.0-rc7+ #21 Not tainted [15102.031010] ----------------------------- [15102.031038] kernel/pid.c:330 suspicious rcu_dereference_check() usage! [15102.031063] other info that might help us debug this: [15102.031332] rcu_scheduler_active = 2, debug_locks = 1 [15102.031363] 1 lock held by modprobe/1570: [15102.031389] #0: 00000000580ef2b0 (bpfilter_lock){+.+.}, at: stop_umh+0x13/0x52 [bpfilter] [15102.031552] stack backtrace: [15102.031583] CPU: 1 PID: 1570 Comm: modprobe Not tainted 4.19.0-rc7+ #21 [15102.031607] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [15102.031628] Call Trace: [15102.031676] dump_stack+0xc9/0x16b [15102.031723] ? show_regs_print_info+0x5/0x5 [15102.031801] ? lockdep_rcu_suspicious+0x117/0x160 [15102.031855] pid_task+0x134/0x160 [15102.031900] ? find_vpid+0xf0/0xf0 [15102.032017] shutdown_umh.constprop.1+0x1e/0x53 [bpfilter] [15102.032055] stop_umh+0x46/0x52 [bpfilter] [15102.032092] __x64_sys_delete_module+0x47e/0x570 [ ... ] Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Signed-off-by: Taehee Yoo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/bpfilter/bpfilter_kern.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index b64e1649993b..94e88f510c5b 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -23,9 +23,11 @@ static void shutdown_umh(struct umh_info *info) if (!info->pid) return; - tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID); - if (tsk) + tsk = get_pid_task(find_vpid(info->pid), PIDTYPE_PID); + if (tsk) { force_sig(SIGKILL, tsk); + put_task_struct(tsk); + } fput(info->pipe_to_umh); fput(info->pipe_from_umh); info->pid = 0; -- cgit v1.2.3 From b336decab22158937975293aea79396525f92bb3 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 16 Oct 2018 15:18:17 -0300 Subject: sctp: fix race on sctp_id2asoc syzbot reported an use-after-free involving sctp_id2asoc. Dmitry Vyukov helped to root cause it and it is because of reading the asoc after it was freed: CPU 1 CPU 2 (working on socket 1) (working on socket 2) sctp_association_destroy sctp_id2asoc spin lock grab the asoc from idr spin unlock spin lock remove asoc from idr spin unlock free(asoc) if asoc->base.sk != sk ... [*] This can only be hit if trying to fetch asocs from different sockets. As we have a single IDR for all asocs, in all SCTP sockets, their id is unique on the system. An application can try to send stuff on an id that matches on another socket, and the if in [*] will protect from such usage. But it didn't consider that as that asoc may belong to another socket, it may be freed in parallel (read: under another socket lock). We fix it by moving the checks in [*] into the protected region. This fixes it because the asoc cannot be freed while the lock is held. Reported-by: syzbot+c7dd55d7aec49d48e49a@syzkaller.appspotmail.com Acked-by: Dmitry Vyukov Signed-off-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e25a20fc629a..bba877a0205b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -271,11 +271,10 @@ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id) spin_lock_bh(&sctp_assocs_id_lock); asoc = (struct sctp_association *)idr_find(&sctp_assocs_id, (int)id); + if (asoc && (asoc->base.sk != sk || asoc->base.dead)) + asoc = NULL; spin_unlock_bh(&sctp_assocs_id_lock); - if (!asoc || (asoc->base.sk != sk) || asoc->base.dead) - return NULL; - return asoc; } -- cgit v1.2.3 From c863850ce22e1b0bb365d49cadf51f4765153ae4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 17 Oct 2018 03:06:12 +0800 Subject: sctp: not free the new asoc when sctp_wait_for_connect returns err When sctp_wait_for_connect is called to wait for connect ready for sp->strm_interleave in sctp_sendmsg_to_asoc, a panic could be triggered if cpu is scheduled out and the new asoc is freed elsewhere, as it will return err and later the asoc gets freed again in sctp_sendmsg. [ 285.840764] list_del corruption, ffff9f0f7b284078->next is LIST_POISON1 (dead000000000100) [ 285.843590] WARNING: CPU: 1 PID: 8861 at lib/list_debug.c:47 __list_del_entry_valid+0x50/0xa0 [ 285.846193] Kernel panic - not syncing: panic_on_warn set ... [ 285.846193] [ 285.848206] CPU: 1 PID: 8861 Comm: sctp_ndata Kdump: loaded Not tainted 4.19.0-rc7.label #584 [ 285.850559] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 285.852164] Call Trace: ... [ 285.872210] ? __list_del_entry_valid+0x50/0xa0 [ 285.872894] sctp_association_free+0x42/0x2d0 [sctp] [ 285.873612] sctp_sendmsg+0x5a4/0x6b0 [sctp] [ 285.874236] sock_sendmsg+0x30/0x40 [ 285.874741] ___sys_sendmsg+0x27a/0x290 [ 285.875304] ? __switch_to_asm+0x34/0x70 [ 285.875872] ? __switch_to_asm+0x40/0x70 [ 285.876438] ? ptep_set_access_flags+0x2a/0x30 [ 285.877083] ? do_wp_page+0x151/0x540 [ 285.877614] __sys_sendmsg+0x58/0xa0 [ 285.878138] do_syscall_64+0x55/0x180 [ 285.878669] entry_SYSCALL_64_after_hwframe+0x44/0xa9 This is a similar issue with the one fixed in Commit ca3af4dd28cf ("sctp: do not free asoc when it is already dead in sctp_sendmsg"). But this one can't be fixed by returning -ESRCH for the dead asoc in sctp_wait_for_connect, as it will break sctp_connect's return value to users. This patch is to simply set err to -ESRCH before it returns to sctp_sendmsg when any err is returned by sctp_wait_for_connect for sp->strm_interleave, so that no asoc would be freed due to this. When users see this error, they will know the packet hasn't been sent. And it also makes sense to not free asoc because waiting connect fails, like the second call for sctp_wait_for_connect in sctp_sendmsg_to_asoc. Fixes: 668c9beb9020 ("sctp: implement assign_number for sctp_stream_interleave") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index bba877a0205b..c1c1bda334a4 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1945,8 +1945,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, if (sp->strm_interleave) { timeo = sock_sndtimeo(sk, 0); err = sctp_wait_for_connect(asoc, &timeo); - if (err) + if (err) { + err = -ESRCH; goto err; + } } else { wait_connect = true; } -- cgit v1.2.3 From 84dad55951b0d009372ec21760b650634246e144 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 17 Oct 2018 11:44:04 +0200 Subject: udp6: fix encap return code for resubmitting The commit eb63f2964dbe ("udp6: add missing checks on edumux packet processing") used the same return code convention of the ipv4 counterpart, but ipv6 uses the opposite one: positive values means resubmit. This change addresses the issue, using positive return value for resubmitting. Also update the related comment, which was broken, too. Fixes: eb63f2964dbe ("udp6: add missing checks on edumux packet processing") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv6/udp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 28c4aa5078fc..b36694b6716e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -766,11 +766,9 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, ret = udpv6_queue_rcv_skb(sk, skb); - /* a return value > 0 means to resubmit the input, but - * it wants the return to be -protocol, or 0 - */ + /* a return value > 0 means to resubmit the input */ if (ret > 0) - return -ret; + return ret; return 0; } -- cgit v1.2.3 From eddf016b910486d2123675a6b5fd7d64f77cdca8 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 17 Oct 2018 22:34:34 +0300 Subject: net: ipmr: fix unresolved entry dumps If the skb space ends in an unresolved entry while dumping we'll miss some unresolved entries. The reason is due to zeroing the entry counter between dumping resolved and unresolved mfc entries. We should just keep counting until the whole table is dumped and zero when we move to the next as we have a separate table counter. Reported-by: Colin Ian King Fixes: 8fb472c09b9d ("ipmr: improve hash scalability") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr_base.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 1ad9aa62a97b..eab8cd5ec2f5 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -296,8 +296,6 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, next_entry: e++; } - e = 0; - s_e = 0; spin_lock_bh(lock); list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { -- cgit v1.2.3 From 3c53ed8fef6881a864f0ee8240ed2793ef73ad0d Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 18 Oct 2018 10:34:26 +0200 Subject: net: sched: Fix for duplicate class dump When dumping classes by parent, kernel would return classes twice: | # tc qdisc add dev lo root prio | # tc class show dev lo | class prio 8001:1 parent 8001: | class prio 8001:2 parent 8001: | class prio 8001:3 parent 8001: | # tc class show dev lo parent 8001: | class prio 8001:1 parent 8001: | class prio 8001:2 parent 8001: | class prio 8001:3 parent 8001: | class prio 8001:1 parent 8001: | class prio 8001:2 parent 8001: | class prio 8001:3 parent 8001: This comes from qdisc_match_from_root() potentially returning the root qdisc itself if its handle matched. Though in that case, root's classes were already dumped a few lines above. Fixes: cb395b2010879 ("net: sched: optimize class dumps") Signed-off-by: Phil Sutter Reviewed-by: Jiri Pirko Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 6684641ea344..3dc0acf54245 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -2059,7 +2059,8 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, if (tcm->tcm_parent) { q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent)); - if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) + if (q && q != root && + tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) return -1; return 0; } -- cgit v1.2.3 From b6168562c8ce2bd5a30e213021650422e08764dc Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Thu, 18 Oct 2018 09:36:46 -0500 Subject: net: socket: fix a missing-check bug In ethtool_ioctl(), the ioctl command 'ethcmd' is checked through a switch statement to see whether it is necessary to pre-process the ethtool structure, because, as mentioned in the comment, the structure ethtool_rxnfc is defined with padding. If yes, a user-space buffer 'rxnfc' is allocated through compat_alloc_user_space(). One thing to note here is that, if 'ethcmd' is ETHTOOL_GRXCLSRLALL, the size of the buffer 'rxnfc' is partially determined by 'rule_cnt', which is actually acquired from the user-space buffer 'compat_rxnfc', i.e., 'compat_rxnfc->rule_cnt', through get_user(). After 'rxnfc' is allocated, the data in the original user-space buffer 'compat_rxnfc' is then copied to 'rxnfc' through copy_in_user(), including the 'rule_cnt' field. However, after this copy, no check is re-enforced on 'rxnfc->rule_cnt'. So it is possible that a malicious user race to change the value in the 'compat_rxnfc->rule_cnt' between these two copies. Through this way, the attacker can bypass the previous check on 'rule_cnt' and inject malicious data. This can cause undefined behavior of the kernel and introduce potential security risk. This patch avoids the above issue via copying the value acquired by get_user() to 'rxnfc->rule_cn', if 'ethcmd' is ETHTOOL_GRXCLSRLALL. Signed-off-by: Wenwen Wang Signed-off-by: David S. Miller --- net/socket.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 01f3f8f32d6f..390a8ecef4bf 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2875,9 +2875,14 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) copy_in_user(&rxnfc->fs.ring_cookie, &compat_rxnfc->fs.ring_cookie, (void __user *)(&rxnfc->fs.location + 1) - - (void __user *)&rxnfc->fs.ring_cookie) || - copy_in_user(&rxnfc->rule_cnt, &compat_rxnfc->rule_cnt, - sizeof(rxnfc->rule_cnt))) + (void __user *)&rxnfc->fs.ring_cookie)) + return -EFAULT; + if (ethcmd == ETHTOOL_GRXCLSRLALL) { + if (put_user(rule_cnt, &rxnfc->rule_cnt)) + return -EFAULT; + } else if (copy_in_user(&rxnfc->rule_cnt, + &compat_rxnfc->rule_cnt, + sizeof(rxnfc->rule_cnt))) return -EFAULT; } -- cgit v1.2.3 From b06f9d9f1a907dd03f203e2ce9e27e318c22ba01 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 18 Oct 2018 17:38:29 +0200 Subject: tipc: fix info leak from kernel tipc_event We initialize a struct tipc_event allocated on the kernel stack to zero to avert info leak to user space. Reported-by: syzbot+057458894bc8cada4dee@syzkaller.appspotmail.com Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index e82f13cb2dc5..06fee142f09f 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -666,6 +666,7 @@ static void tipc_group_create_event(struct tipc_group *grp, struct sk_buff *skb; struct tipc_msg *hdr; + memset(&evt, 0, sizeof(evt)); evt.event = event; evt.found_lower = m->instance; evt.found_upper = m->instance; -- cgit v1.2.3 From d4d576f5ab7edcb757bb33e6a5600666a0b1232d Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 18 Oct 2018 21:25:07 +0200 Subject: ip6_tunnel: Fix encapsulation layout Commit 058214a4d1df ("ip6_tun: Add infrastructure for doing encapsulation") added the ip6_tnl_encap() call in ip6_tnl_xmit(), before the call to ipv6_push_frag_opts() to append the IPv6 Tunnel Encapsulation Limit option (option 4, RFC 2473, par. 5.1) to the outer IPv6 header. As long as the option didn't actually end up in generated packets, this wasn't an issue. Then commit 89a23c8b528b ("ip6_tunnel: Fix missing tunnel encapsulation limit option") fixed sending of this option, and the resulting layout, e.g. for FoU, is: .-------------------.------------.----------.-------------------.----- - - | Outer IPv6 Header | UDP header | Option 4 | Inner IPv6 Header | Payload '-------------------'------------'----------'-------------------'----- - - Needless to say, FoU and GUE (at least) won't work over IPv6. The option is appended by default, and I couldn't find a way to disable it with the current iproute2. Turn this into a more reasonable: .-------------------.----------.------------.-------------------.----- - - | Outer IPv6 Header | Option 4 | UDP header | Inner IPv6 Header | Payload '-------------------'----------'------------'-------------------'----- - - With this, and with 84dad55951b0 ("udp6: fix encap return code for resubmitting"), FoU and GUE work again over IPv6. Fixes: 058214a4d1df ("ip6_tun: Add infrastructure for doing encapsulation") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a0b6932c3afd..a9d06d4dd057 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1184,11 +1184,6 @@ route_lookup: } skb_dst_set(skb, dst); - if (encap_limit >= 0) { - init_tel_txopt(&opt, encap_limit); - ipv6_push_frag_opts(skb, &opt.ops, &proto); - } - if (hop_limit == 0) { if (skb->protocol == htons(ETH_P_IP)) hop_limit = ip_hdr(skb)->ttl; @@ -1210,6 +1205,11 @@ route_lookup: if (err) return err; + if (encap_limit >= 0) { + init_tel_txopt(&opt, encap_limit); + ipv6_push_frag_opts(skb, &opt.ops, &proto); + } + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); -- cgit v1.2.3 From 48995423143a097527802e28d7add20e5a27677a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 19 Oct 2018 10:45:08 -0700 Subject: Revert "bond: take rcu lock in netpoll_send_skb_on_dev" This reverts commit 6fe9487892b32cb1c8b8b0d552ed7222a527fe30. It is causing more serious regressions than the RCU warning it is fixing. Signed-off-by: David S. Miller --- net/core/netpoll.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index de1d1ba92f2d..3ae899805f8b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -312,7 +312,6 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; - rcu_read_lock_bh(); lockdep_assert_irqs_disabled(); npinfo = rcu_dereference_bh(np->dev->npinfo); @@ -357,7 +356,6 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } - rcu_read_unlock_bh(); } EXPORT_SYMBOL(netpoll_send_skb_on_dev); -- cgit v1.2.3