From 09db51241118aeb06e1c8cd393b45879ce099b36 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Mon, 28 Jan 2019 09:35:35 +0100 Subject: esp: Skip TX bytes accounting when sending from a request socket On ESP output, sk_wmem_alloc is incremented for the added padding if a socket is associated to the skb. When replying with TCP SYNACKs over IPsec, the associated sk is a casted request socket, only. Increasing sk_wmem_alloc on a request socket results in a write at an arbitrary struct offset. In the best case, this produces the following WARNING: WARNING: CPU: 1 PID: 0 at lib/refcount.c:102 esp_output_head+0x2e4/0x308 [esp4] refcount_t: addition on 0; use-after-free. CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.0.0-rc3 #2 Hardware name: Marvell Armada 380/385 (Device Tree) [...] [] (esp_output_head [esp4]) from [] (esp_output+0xb8/0x180 [esp4]) [] (esp_output [esp4]) from [] (xfrm_output_resume+0x558/0x664) [] (xfrm_output_resume) from [] (xfrm4_output+0x44/0xc4) [] (xfrm4_output) from [] (tcp_v4_send_synack+0xa8/0xe8) [] (tcp_v4_send_synack) from [] (tcp_conn_request+0x7f4/0x948) [] (tcp_conn_request) from [] (tcp_rcv_state_process+0x2a0/0xe64) [] (tcp_rcv_state_process) from [] (tcp_v4_do_rcv+0xf0/0x1f4) [] (tcp_v4_do_rcv) from [] (tcp_v4_rcv+0xdb8/0xe20) [] (tcp_v4_rcv) from [] (ip_protocol_deliver_rcu+0x2c/0x2dc) [] (ip_protocol_deliver_rcu) from [] (ip_local_deliver_finish+0x48/0x54) [] (ip_local_deliver_finish) from [] (ip_local_deliver+0x54/0xec) [] (ip_local_deliver) from [] (ip_rcv+0x48/0xb8) [] (ip_rcv) from [] (__netif_receive_skb_one_core+0x50/0x6c) [...] The issue triggers only when not using TCP syncookies, as for syncookies no socket is associated. Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Signed-off-by: Martin Willi Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 5459f41fc26f..10e809b296ec 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -328,7 +328,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * skb->len += tailen; skb->data_len += tailen; skb->truesize += tailen; - if (sk) + if (sk && sk_fullsock(sk)) refcount_add(tailen, &sk->sk_wmem_alloc); goto out; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 5afe9f83374d..239d4a65ad6e 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -296,7 +296,7 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info skb->len += tailen; skb->data_len += tailen; skb->truesize += tailen; - if (sk) + if (sk && sk_fullsock(sk)) refcount_add(tailen, &sk->sk_wmem_alloc); goto out; -- cgit v1.2.3 From f75a2804da391571563c4b6b29e7797787332673 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 31 Jan 2019 13:05:49 -0800 Subject: xfrm: destroy xfrm_state synchronously on net exit path xfrm_state_put() moves struct xfrm_state to the GC list and schedules the GC work to clean it up. On net exit call path, xfrm_state_flush() is called to clean up and xfrm_flush_gc() is called to wait for the GC work to complete before exit. However, this doesn't work because one of the ->destructor(), ipcomp_destroy(), schedules the same GC work again inside the GC work. It is hard to wait for such a nested async callback. This is also why syzbot still reports the following warning: WARNING: CPU: 1 PID: 33 at net/ipv6/xfrm6_tunnel.c:351 xfrm6_tunnel_net_exit+0x2cb/0x500 net/ipv6/xfrm6_tunnel.c:351 ... ops_exit_list.isra.0+0xb0/0x160 net/core/net_namespace.c:153 cleanup_net+0x51d/0xb10 net/core/net_namespace.c:551 process_one_work+0xd0c/0x1ce0 kernel/workqueue.c:2153 worker_thread+0x143/0x14a0 kernel/workqueue.c:2296 kthread+0x357/0x430 kernel/kthread.c:246 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352 In fact, it is perfectly fine to bypass GC and destroy xfrm_state synchronously on net exit call path, because it is in process context and doesn't need a work struct to do any blocking work. This patch introduces xfrm_state_put_sync() which simply bypasses GC, and lets its callers to decide whether to use this synchronous version. On net exit path, xfrm_state_fini() and xfrm6_tunnel_net_exit() use it. And, as ipcomp_destroy() itself is blocking, it can use xfrm_state_put_sync() directly too. Also rename xfrm_state_gc_destroy() to ___xfrm_state_destroy() to reflect this change. Fixes: b48c05ab5d32 ("xfrm: Fix warning in xfrm6_tunnel_net_exit.") Reported-and-tested-by: syzbot+e9aebef558e3ed673934@syzkaller.appspotmail.com Cc: Steffen Klassert Signed-off-by: Cong Wang Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 12 +++++++++--- net/ipv6/xfrm6_tunnel.c | 2 +- net/key/af_key.c | 2 +- net/xfrm/xfrm_state.c | 30 +++++++++++++++++++----------- net/xfrm/xfrm_user.c | 2 +- 5 files changed, 31 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 7298a53b9702..85386becbaea 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -853,7 +853,7 @@ static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols) xfrm_pol_put(pols[i]); } -void __xfrm_state_destroy(struct xfrm_state *); +void __xfrm_state_destroy(struct xfrm_state *, bool); static inline void __xfrm_state_put(struct xfrm_state *x) { @@ -863,7 +863,13 @@ static inline void __xfrm_state_put(struct xfrm_state *x) static inline void xfrm_state_put(struct xfrm_state *x) { if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x); + __xfrm_state_destroy(x, false); +} + +static inline void xfrm_state_put_sync(struct xfrm_state *x) +{ + if (refcount_dec_and_test(&x->refcnt)) + __xfrm_state_destroy(x, true); } static inline void xfrm_state_hold(struct xfrm_state *x) @@ -1590,7 +1596,7 @@ struct xfrmk_spdinfo { struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); int xfrm_state_delete(struct xfrm_state *x); -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid); +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index f5b4febeaa25..bc65db782bfb 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -344,8 +344,8 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; - xfrm_state_flush(net, IPSEC_PROTO_ANY, false); xfrm_flush_gc(); + xfrm_state_flush(net, IPSEC_PROTO_ANY, false, true); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); diff --git a/net/key/af_key.c b/net/key/af_key.c index 655c787f9d54..637030f43b67 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1783,7 +1783,7 @@ static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_m if (proto == 0) return -EINVAL; - err = xfrm_state_flush(net, proto, true); + err = xfrm_state_flush(net, proto, true, false); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - go quietly */ diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 23c92891758a..1bb971f46fc6 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -432,7 +432,7 @@ void xfrm_state_free(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_free); -static void xfrm_state_gc_destroy(struct xfrm_state *x) +static void ___xfrm_state_destroy(struct xfrm_state *x) { tasklet_hrtimer_cancel(&x->mtimer); del_timer_sync(&x->rtimer); @@ -474,7 +474,7 @@ static void xfrm_state_gc_task(struct work_struct *work) synchronize_rcu(); hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) - xfrm_state_gc_destroy(x); + ___xfrm_state_destroy(x); } static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) @@ -598,14 +598,19 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) } EXPORT_SYMBOL(xfrm_state_alloc); -void __xfrm_state_destroy(struct xfrm_state *x) +void __xfrm_state_destroy(struct xfrm_state *x, bool sync) { WARN_ON(x->km.state != XFRM_STATE_DEAD); - spin_lock_bh(&xfrm_state_gc_lock); - hlist_add_head(&x->gclist, &xfrm_state_gc_list); - spin_unlock_bh(&xfrm_state_gc_lock); - schedule_work(&xfrm_state_gc_work); + if (sync) { + synchronize_rcu(); + ___xfrm_state_destroy(x); + } else { + spin_lock_bh(&xfrm_state_gc_lock); + hlist_add_head(&x->gclist, &xfrm_state_gc_list); + spin_unlock_bh(&xfrm_state_gc_lock); + schedule_work(&xfrm_state_gc_work); + } } EXPORT_SYMBOL(__xfrm_state_destroy); @@ -708,7 +713,7 @@ xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool } #endif -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync) { int i, err = 0, cnt = 0; @@ -730,7 +735,10 @@ restart: err = xfrm_state_delete(x); xfrm_audit_state_delete(x, err ? 0 : 1, task_valid); - xfrm_state_put(x); + if (sync) + xfrm_state_put_sync(x); + else + xfrm_state_put(x); if (!err) cnt++; @@ -2215,7 +2223,7 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) if (atomic_read(&t->tunnel_users) == 2) xfrm_state_delete(t); atomic_dec(&t->tunnel_users); - xfrm_state_put(t); + xfrm_state_put_sync(t); x->tunnel = NULL; } } @@ -2375,8 +2383,8 @@ void xfrm_state_fini(struct net *net) unsigned int sz; flush_work(&net->xfrm.state_hash_work); - xfrm_state_flush(net, IPSEC_PROTO_ANY, false); flush_work(&xfrm_state_gc_work); + xfrm_state_flush(net, IPSEC_PROTO_ANY, false, true); WARN_ON(!list_empty(&net->xfrm.state_all)); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c6d26afcf89d..a131f9ff979e 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1932,7 +1932,7 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_usersa_flush *p = nlmsg_data(nlh); int err; - err = xfrm_state_flush(net, p->proto, true); + err = xfrm_state_flush(net, p->proto, true, false); if (err) { if (err == -ESRCH) /* empty table */ return 0; -- cgit v1.2.3 From fc2d5cfdcfe2ab76b263d91429caa22451123085 Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Thu, 7 Feb 2019 13:33:21 -0700 Subject: af_key: unconditionally clone on broadcast Attempting to avoid cloning the skb when broadcasting by inflating the refcount with sock_hold/sock_put while under RCU lock is dangerous and violates RCU principles. It leads to subtle race conditions when attempting to free the SKB, as we may reference sockets that have already been freed by the stack. Unable to handle kernel paging request at virtual address 6b6b6b6b6b6c4b [006b6b6b6b6b6c4b] address between user and kernel address ranges Internal error: Oops: 96000004 [#1] PREEMPT SMP task: fffffff78f65b380 task.stack: ffffff8049a88000 pc : sock_rfree+0x38/0x6c lr : skb_release_head_state+0x6c/0xcc Process repro (pid: 7117, stack limit = 0xffffff8049a88000) Call trace: sock_rfree+0x38/0x6c skb_release_head_state+0x6c/0xcc skb_release_all+0x1c/0x38 __kfree_skb+0x1c/0x30 kfree_skb+0xd0/0xf4 pfkey_broadcast+0x14c/0x18c pfkey_sendmsg+0x1d8/0x408 sock_sendmsg+0x44/0x60 ___sys_sendmsg+0x1d0/0x2a8 __sys_sendmsg+0x64/0xb4 SyS_sendmsg+0x34/0x4c el0_svc_naked+0x34/0x38 Kernel panic - not syncing: Fatal exception Suggested-by: Eric Dumazet Signed-off-by: Sean Tranchetti Signed-off-by: Steffen Klassert --- net/key/af_key.c | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index 637030f43b67..5651c29cb5bd 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -196,30 +196,22 @@ static int pfkey_release(struct socket *sock) return 0; } -static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2, - gfp_t allocation, struct sock *sk) +static int pfkey_broadcast_one(struct sk_buff *skb, gfp_t allocation, + struct sock *sk) { int err = -ENOBUFS; - sock_hold(sk); - if (*skb2 == NULL) { - if (refcount_read(&skb->users) != 1) { - *skb2 = skb_clone(skb, allocation); - } else { - *skb2 = skb; - refcount_inc(&skb->users); - } - } - if (*skb2 != NULL) { - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { - skb_set_owner_r(*skb2, sk); - skb_queue_tail(&sk->sk_receive_queue, *skb2); - sk->sk_data_ready(sk); - *skb2 = NULL; - err = 0; - } + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) + return err; + + skb = skb_clone(skb, allocation); + + if (skb) { + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk); + err = 0; } - sock_put(sk); return err; } @@ -234,7 +226,6 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; - struct sk_buff *skb2 = NULL; int err = -ESRCH; /* XXX Do we need something like netlink_overrun? I think @@ -253,7 +244,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, * socket. */ if (pfk->promisc) - pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk); + pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* the exact target will be processed later */ if (sk == one_sk) @@ -268,7 +259,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, continue; } - err2 = pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk); + err2 = pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* Error is cleared after successful sending to at least one * registered KM */ @@ -278,9 +269,8 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, rcu_read_unlock(); if (one_sk != NULL) - err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk); + err = pfkey_broadcast_one(skb, allocation, one_sk); - kfree_skb(skb2); kfree_skb(skb); return err; } -- cgit v1.2.3 From 660899ddf06ae8bb5bbbd0a19418b739375430c5 Mon Sep 17 00:00:00 2001 From: Tobias Brunner Date: Mon, 18 Feb 2019 10:49:39 +0100 Subject: xfrm: Fix inbound traffic via XFRM interfaces across network namespaces After moving an XFRM interface to another namespace it stays associated with the original namespace (net in `struct xfrm_if` and the list keyed with `xfrmi_net_id`), allowing processes in the new namespace to use SAs/policies that were created in the original namespace. For instance, this allows a keying daemon in one namespace to establish IPsec SAs for other namespaces without processes there having access to the keys or IKE credentials. This worked fine for outbound traffic, however, for inbound traffic the lookup for the interfaces and the policies used the incorrect namespace (the one the XFRM interface was moved to). Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces") Signed-off-by: Tobias Brunner Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 4 ++-- net/xfrm/xfrm_policy.c | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 6be8c7df15bb..dbb3c1945b5c 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -76,10 +76,10 @@ static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb) int ifindex; struct xfrm_if *xi; - if (!skb->dev) + if (!secpath_exists(skb) || !skb->dev) return NULL; - xfrmn = net_generic(dev_net(skb->dev), xfrmi_net_id); + xfrmn = net_generic(xs_net(xfrm_input_state(skb)), xfrmi_net_id); ifindex = skb->dev->ifindex; for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ba0a4048c846..8d1a898d0ba5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3314,8 +3314,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, if (ifcb) { xi = ifcb->decode_session(skb); - if (xi) + if (xi) { if_id = xi->p.if_id; + net = xi->net; + } } rcu_read_unlock(); -- cgit v1.2.3 From 0fd3fd0a9bb0b02b6435bb7070e9f7b82a23f068 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 5 Feb 2019 20:30:27 +0100 Subject: libceph: handle an empty authorize reply The authorize reply can be empty, for example when the ticket used to build the authorizer is too old and TAG_BADAUTHORIZER is returned from the service. Calling ->verify_authorizer_reply() results in an attempt to decrypt and validate (somewhat) random data in au->buf (most likely the signature block from calc_signature()), which fails and ends up in con_fault_finish() with !con->auth_retry. The ticket isn't invalidated and the connection is retried again and again until a new ticket is obtained from the monitor: libceph: osd2 192.168.122.1:6809 bad authorize reply libceph: osd2 192.168.122.1:6809 bad authorize reply libceph: osd2 192.168.122.1:6809 bad authorize reply libceph: osd2 192.168.122.1:6809 bad authorize reply Let TAG_BADAUTHORIZER handler kick in and increment con->auth_retry. Cc: stable@vger.kernel.org Fixes: 5c056fdc5b47 ("libceph: verify authorize reply on connect") Link: https://tracker.ceph.com/issues/20164 Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/messenger.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3661cdd927f1..7e71b0df1fbc 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2058,6 +2058,8 @@ static int process_connect(struct ceph_connection *con) dout("process_connect on %p tag %d\n", con, (int)con->in_tag); if (con->auth) { + int len = le32_to_cpu(con->in_reply.authorizer_len); + /* * Any connection that defines ->get_authorizer() * should also define ->add_authorizer_challenge() and @@ -2067,8 +2069,7 @@ static int process_connect(struct ceph_connection *con) */ if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { ret = con->ops->add_authorizer_challenge( - con, con->auth->authorizer_reply_buf, - le32_to_cpu(con->in_reply.authorizer_len)); + con, con->auth->authorizer_reply_buf, len); if (ret < 0) return ret; @@ -2078,10 +2079,12 @@ static int process_connect(struct ceph_connection *con) return 0; } - ret = con->ops->verify_authorizer_reply(con); - if (ret < 0) { - con->error_msg = "bad authorize reply"; - return ret; + if (len) { + ret = con->ops->verify_authorizer_reply(con); + if (ret < 0) { + con->error_msg = "bad authorize reply"; + return ret; + } } } -- cgit v1.2.3 From df1a2cb7c74b3d3abc8d8c2d690f82c8ebc3490a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 12 Feb 2019 15:42:38 -0800 Subject: bpf/test_run: fix unkillable BPF_PROG_TEST_RUN Syzbot found out that running BPF_PROG_TEST_RUN with repeat=0xffffffff makes process unkillable. The problem is that when CONFIG_PREEMPT is enabled, we never see need_resched() return true. This is due to the fact that preempt_enable() (which we do in bpf_test_run_one on each iteration) now handles resched if it's needed. Let's disable preemption for the whole run, not per test. In this case we can properly see whether resched is needed. Let's also properly return -EINTR to the userspace in case of a signal interrupt. See recent discussion: http://lore.kernel.org/netdev/CAH3MdRWHr4N8jei8jxDppXjmw-Nw=puNDLbu1dQOFQHxfU2onA@mail.gmail.com I'll follow up with the same fix bpf_prog_test_run_flow_dissector in bpf-next. Reported-by: syzbot Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- net/bpf/test_run.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index fa2644d276ef..e31e1b20f7f4 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -13,27 +13,13 @@ #include #include -static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) -{ - u32 ret; - - preempt_disable(); - rcu_read_lock(); - bpf_cgroup_storage_set(storage); - ret = BPF_PROG_RUN(prog, ctx); - rcu_read_unlock(); - preempt_enable(); - - return ret; -} - -static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *ret, - u32 *time) +static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, + u32 *retval, u32 *time) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 }; enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; + int ret = 0; u32 i; for_each_cgroup_storage_type(stype) { @@ -48,25 +34,42 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *ret, if (!repeat) repeat = 1; + + rcu_read_lock(); + preempt_disable(); time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - *ret = bpf_test_run_one(prog, ctx, storage); + bpf_cgroup_storage_set(storage); + *retval = BPF_PROG_RUN(prog, ctx); + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + if (need_resched()) { - if (signal_pending(current)) - break; time_spent += ktime_get_ns() - time_start; + preempt_enable(); + rcu_read_unlock(); + cond_resched(); + + rcu_read_lock(); + preempt_disable(); time_start = ktime_get_ns(); } } time_spent += ktime_get_ns() - time_start; + preempt_enable(); + rcu_read_unlock(); + do_div(time_spent, repeat); *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); - return 0; + return ret; } static int bpf_test_finish(const union bpf_attr *kattr, -- cgit v1.2.3 From 9c2054a5cf415a9dc32c91ffde78399955deb571 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 20 Feb 2019 10:32:52 +0000 Subject: net: dsa: fix unintended change of bridge interface STP state When a DSA port is added to a bridge and brought up, the resulting STP state programmed into the hardware depends on the order that these operations are performed. However, the Linux bridge code believes that the port is in disabled mode. If the DSA port is first added to a bridge and then brought up, it will be in blocking mode. If it is brought up and then added to the bridge, it will be in disabled mode. This difference is caused by DSA always setting the STP mode in dsa_port_enable() whether or not this port is part of a bridge. Since bridge always sets the STP state when the port is added, brought up or taken down, it is unnecessary for us to manipulate the STP state. Apparently, this code was copied from Rocker, and the very next day a similar fix for Rocker was merged but was not propagated to DSA. See e47172ab7e41 ("rocker: put port in FORWADING state after leaving bridge") Fixes: b73adef67765 ("net: dsa: integrate with SWITCHDEV for HW bridging") Signed-off-by: Russell King Reviewed-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/port.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dsa/port.c b/net/dsa/port.c index 2d7e01b23572..2a2a878b5ce3 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -69,7 +69,6 @@ static void dsa_port_set_state_now(struct dsa_port *dp, u8 state) int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy) { - u8 stp_state = dp->bridge_dev ? BR_STATE_BLOCKING : BR_STATE_FORWARDING; struct dsa_switch *ds = dp->ds; int port = dp->index; int err; @@ -80,7 +79,8 @@ int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy) return err; } - dsa_port_set_state_now(dp, stp_state); + if (!dp->bridge_dev) + dsa_port_set_state_now(dp, BR_STATE_FORWARDING); return 0; } @@ -90,7 +90,8 @@ void dsa_port_disable(struct dsa_port *dp, struct phy_device *phy) struct dsa_switch *ds = dp->ds; int port = dp->index; - dsa_port_set_state_now(dp, BR_STATE_DISABLED); + if (!dp->bridge_dev) + dsa_port_set_state_now(dp, BR_STATE_DISABLED); if (ds->ops->port_disable) ds->ops->port_disable(ds, port, phy); -- cgit v1.2.3 From ae3b564179bfd06f32d051b9e5d72ce4b2a07c37 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Feb 2019 20:09:35 +0000 Subject: missing barriers in some of unix_sock ->addr and ->path accesses Several u->addr and u->path users are not holding any locks in common with unix_bind(). unix_state_lock() is useless for those purposes. u->addr is assign-once and *(u->addr) is fully set up by the time we set u->addr (all under unix_table_lock). u->path is also set in the same critical area, also before setting u->addr, and any unix_sock with ->path filled will have non-NULL ->addr. So setting ->addr with smp_store_release() is all we need for those "lockless" users - just have them fetch ->addr with smp_load_acquire() and don't even bother looking at ->path if they see NULL ->addr. Users of ->addr and ->path fall into several classes now: 1) ones that do smp_load_acquire(u->addr) and access *(u->addr) and u->path only if smp_load_acquire() has returned non-NULL. 2) places holding unix_table_lock. These are guaranteed that *(u->addr) is seen fully initialized. If unix_sock is in one of the "bound" chains, so's ->path. 3) unix_sock_destructor() using ->addr is safe. All places that set u->addr are guaranteed to have seen all stores *(u->addr) while holding a reference to u and unix_sock_destructor() is called when (atomic) refcount hits zero. 4) unix_release_sock() using ->path is safe. unix_bind() is serialized wrt unix_release() (normally - by struct file refcount), and for the instances that had ->path set by unix_bind() unix_release_sock() comes from unix_release(), so they are fine. Instances that had it set in unix_stream_connect() either end up attached to a socket (in unix_accept()), in which case the call chain to unix_release_sock() and serialization are the same as in the previous case, or they never get accept'ed and unix_release_sock() is called when the listener is shut down and its queue gets purged. In that case the listener's queue lock provides the barriers needed - unix_stream_connect() shoves our unix_sock into listener's queue under that lock right after having set ->path and eventual unix_release_sock() caller picks them from that queue under the same lock right before calling unix_release_sock(). 5) unix_find_other() use of ->path is pointless, but safe - it happens with successful lookup by (abstract) name, so ->path.dentry is guaranteed to be NULL there. earlier-variant-reviewed-by: "Paul E. McKenney" Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/unix/af_unix.c | 57 ++++++++++++++++++++++++++++++---------------------- net/unix/diag.c | 3 ++- security/lsm_audit.c | 10 +++++---- 3 files changed, 41 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 74d1eed7cbd4..a95d479caeea 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -890,7 +890,7 @@ retry: addr->hash ^= sk->sk_type; __unix_remove_socket(sk); - u->addr = addr; + smp_store_release(&u->addr, addr); __unix_insert_socket(&unix_socket_table[addr->hash], sk); spin_unlock(&unix_table_lock); err = 0; @@ -1060,7 +1060,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = 0; __unix_remove_socket(sk); - u->addr = addr; + smp_store_release(&u->addr, addr); __unix_insert_socket(list, sk); out_unlock: @@ -1331,15 +1331,29 @@ restart: RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); - /* copy address information from listening to new sock*/ - if (otheru->addr) { - refcount_inc(&otheru->addr->refcnt); - newu->addr = otheru->addr; - } + /* copy address information from listening to new sock + * + * The contents of *(otheru->addr) and otheru->path + * are seen fully set up here, since we have found + * otheru in hash under unix_table_lock. Insertion + * into the hash chain we'd found it in had been done + * in an earlier critical area protected by unix_table_lock, + * the same one where we'd set *(otheru->addr) contents, + * as well as otheru->path and otheru->addr itself. + * + * Using smp_store_release() here to set newu->addr + * is enough to make those stores, as well as stores + * to newu->path visible to anyone who gets newu->addr + * by smp_load_acquire(). IOW, the same warranties + * as for unix_sock instances bound in unix_bind() or + * in unix_autobind(). + */ if (otheru->path.dentry) { path_get(&otheru->path); newu->path = otheru->path; } + refcount_inc(&otheru->addr->refcnt); + smp_store_release(&newu->addr, otheru->addr); /* Set credentials */ copy_peercred(sk, other); @@ -1453,7 +1467,7 @@ out: static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; - struct unix_sock *u; + struct unix_address *addr; DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); int err = 0; @@ -1468,19 +1482,15 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) sock_hold(sk); } - u = unix_sk(sk); - unix_state_lock(sk); - if (!u->addr) { + addr = smp_load_acquire(&unix_sk(sk)->addr); + if (!addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; err = sizeof(short); } else { - struct unix_address *addr = u->addr; - err = addr->len; memcpy(sunaddr, addr->name, addr->len); } - unix_state_unlock(sk); sock_put(sk); out: return err; @@ -2073,11 +2083,11 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, static void unix_copy_addr(struct msghdr *msg, struct sock *sk) { - struct unix_sock *u = unix_sk(sk); + struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); - if (u->addr) { - msg->msg_namelen = u->addr->len; - memcpy(msg->msg_name, u->addr->name, u->addr->len); + if (addr) { + msg->msg_namelen = addr->len; + memcpy(msg->msg_name, addr->name, addr->len); } } @@ -2581,15 +2591,14 @@ static int unix_open_file(struct sock *sk) if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; - unix_state_lock(sk); + if (!smp_load_acquire(&unix_sk(sk)->addr)) + return -ENOENT; + path = unix_sk(sk)->path; - if (!path.dentry) { - unix_state_unlock(sk); + if (!path.dentry) return -ENOENT; - } path_get(&path); - unix_state_unlock(sk); fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) @@ -2830,7 +2839,7 @@ static int unix_seq_show(struct seq_file *seq, void *v) (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), sock_i_ino(s)); - if (u->addr) { + if (u->addr) { // under unix_table_lock here int i, len; seq_putc(seq, ' '); diff --git a/net/unix/diag.c b/net/unix/diag.c index 384c84e83462..3183d9b8ab33 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -10,7 +10,8 @@ static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { - struct unix_address *addr = unix_sk(sk)->addr; + /* might or might not have unix_table_lock */ + struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); if (!addr) return 0; diff --git a/security/lsm_audit.c b/security/lsm_audit.c index f84001019356..33028c098ef3 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -321,6 +321,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, if (a->u.net->sk) { struct sock *sk = a->u.net->sk; struct unix_sock *u; + struct unix_address *addr; int len = 0; char *p = NULL; @@ -351,14 +352,15 @@ static void dump_common_audit_data(struct audit_buffer *ab, #endif case AF_UNIX: u = unix_sk(sk); + addr = smp_load_acquire(&u->addr); + if (!addr) + break; if (u->path.dentry) { audit_log_d_path(ab, " path=", &u->path); break; } - if (!u->addr) - break; - len = u->addr->len-sizeof(short); - p = &u->addr->name->sun_path[0]; + len = addr->len-sizeof(short); + p = &addr->name->sun_path[0]; audit_log_format(ab, " path="); if (*p) audit_log_untrustedstring(ab, p); -- cgit v1.2.3 From 11fe9262ed226c127f67ca4bd85977b22589b68a Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 21 Feb 2019 13:07:38 +0100 Subject: Revert "xsk: simplify AF_XDP socket teardown" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit e2ce3674883ecba2605370404208c9d4a07ae1c3. It turns out that the sock destructor xsk_destruct was needed after all. The cleanup simplification broke the skb transmit cleanup path, due to that the umem was prematurely destroyed. The umem cannot be destroyed until all outstanding skbs are freed, which means that we cannot remove the umem until the sk_destruct has been called. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 45f3b528dc09..85e4fe4f18cc 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -366,7 +366,6 @@ static int xsk_release(struct socket *sock) xskq_destroy(xs->rx); xskq_destroy(xs->tx); - xdp_put_umem(xs->umem); sock_orphan(sk); sock->sk = NULL; @@ -718,6 +717,18 @@ static const struct proto_ops xsk_proto_ops = { .sendpage = sock_no_sendpage, }; +static void xsk_destruct(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + + if (!sock_flag(sk, SOCK_DEAD)) + return; + + xdp_put_umem(xs->umem); + + sk_refcnt_debug_dec(sk); +} + static int xsk_create(struct net *net, struct socket *sock, int protocol, int kern) { @@ -744,6 +755,9 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, sk->sk_family = PF_XDP; + sk->sk_destruct = xsk_destruct; + sk_refcnt_debug_inc(sk); + sock_set_flag(sk, SOCK_RCU_FREE); xs = xdp_sk(sk); -- cgit v1.2.3 From 193f3685d0546b0cea20c99894aadb70098e47bf Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 21 Feb 2019 11:19:41 +0100 Subject: ipv6: route: enforce RCU protection in rt6_update_exception_stamp_rt() We must access rt6_info->from under RCU read lock: move the dereference under such lock, with proper annotation. v1 -> v2: - avoid using multiple, racy, fetch operations for rt->from Fixes: a68886a69180 ("net/ipv6: Make from in rt6_info rcu protected") Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 964491cf3672..2b1ed8c6fcab 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1599,15 +1599,15 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; - struct fib6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - - if (!from || - !(rt->rt6i_flags & RTF_CACHE)) - return; + struct fib6_info *from; rcu_read_lock(); + from = rcu_dereference(rt->from); + if (!from || !(rt->rt6i_flags & RTF_CACHE)) + goto unlock; + bucket = rcu_dereference(from->rt6i_exception_bucket); #ifdef CONFIG_IPV6_SUBTREES @@ -1626,6 +1626,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) if (rt6_ex) rt6_ex->stamp = jiffies; +unlock: rcu_read_unlock(); } -- cgit v1.2.3 From bf1dc8bad1d42287164d216d8efb51c5cd381b18 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 21 Feb 2019 11:19:42 +0100 Subject: ipv6: route: enforce RCU protection in ip6_route_check_nh_onlink() We need a RCU critical section around rt6_info->from deference, and proper annotation. Fixes: 4ed591c8ab44 ("net/ipv6: Allow onlink routes to have a device mismatch if it is the default route") Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2b1ed8c6fcab..74b9b6fd4168 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2743,20 +2743,24 @@ static int ip6_route_check_nh_onlink(struct net *net, u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; const struct in6_addr *gw_addr = &cfg->fc_gateway; u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; + struct fib6_info *from; struct rt6_info *grt; int err; err = 0; grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); if (grt) { + rcu_read_lock(); + from = rcu_dereference(grt->from); if (!grt->dst.error && /* ignore match if it is the default route */ - grt->from && !ipv6_addr_any(&grt->from->fib6_dst.addr) && + from && !ipv6_addr_any(&from->fib6_dst.addr) && (grt->rt6i_flags & flags || dev != grt->dst.dev)) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway or device mismatch"); err = -EINVAL; } + rcu_read_unlock(); ip6_rt_put(grt); } -- cgit v1.2.3 From d7cf4a3bf3a83c977a29055e1c4ffada7697b31f Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Feb 2019 12:56:54 +0100 Subject: net/smc: fix smc_poll in SMC_INIT state smc_poll() returns with mask bit EPOLLPRI if the connection urg_state is SMC_URG_VALID. Since SMC_URG_VALID is zero, smc_poll signals EPOLLPRI errorneously if called in state SMC_INIT before the connection is created, for instance in a non-blocking connect scenario. This patch switches to non-zero values for the urg states. Reviewed-by: Karsten Graul Fixes: de8474eb9d50 ("net/smc: urgent data support") Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/smc/smc.h b/net/smc/smc.h index 5721416d0605..adbdf195eb08 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -113,9 +113,9 @@ struct smc_host_cdc_msg { /* Connection Data Control message */ } __aligned(8); enum smc_urg_state { - SMC_URG_VALID, /* data present */ - SMC_URG_NOTYET, /* data pending */ - SMC_URG_READ /* data was already read */ + SMC_URG_VALID = 1, /* data present */ + SMC_URG_NOTYET = 2, /* data pending */ + SMC_URG_READ = 3, /* data was already read */ }; struct smc_connection { -- cgit v1.2.3 From 223b7329ec6a0dae1b7f7db7b770e93f4a069ef9 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Tue, 19 Feb 2019 11:20:47 +0700 Subject: tipc: improve function tipc_wait_for_cond() Commit 844cf763fba6 ("tipc: make macro tipc_wait_for_cond() smp safe") replaced finish_wait() with remove_wait_queue() but still used prepare_to_wait(). This causes unnecessary conditional checking before adding to wait queue in prepare_to_wait(). This commit replaces prepare_to_wait() with add_wait_queue() as the pair function with remove_wait_queue(). Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tung Nguyen Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 1217c90a363b..81b87916a0eb 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -388,7 +388,7 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout) rc_ = tipc_sk_sock_err((sock_), timeo_); \ if (rc_) \ break; \ - prepare_to_wait(sk_sleep(sk_), &wait_, TASK_INTERRUPTIBLE); \ + add_wait_queue(sk_sleep(sk_), &wait_); \ release_sock(sk_); \ *(timeo_) = wait_woken(&wait_, TASK_INTERRUPTIBLE, *(timeo_)); \ sched_annotate_sleep(); \ -- cgit v1.2.3 From 48766a583c7961af080de2df692f476624a9a21a Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Tue, 19 Feb 2019 11:20:48 +0700 Subject: tipc: improve function tipc_wait_for_rcvmsg() This commit replaces schedule_timeout() with wait_woken() in function tipc_wait_for_rcvmsg(). wait_woken() uses memory barriers in its implementation to avoid potential race condition when putting a process into sleeping state and then waking it up. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tung Nguyen Signed-off-by: David S. Miller --- net/tipc/socket.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 81b87916a0eb..684f2125fc6b 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1677,7 +1677,7 @@ static void tipc_sk_send_ack(struct tipc_sock *tsk) static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) { struct sock *sk = sock->sk; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); long timeo = *timeop; int err = sock_error(sk); @@ -1685,15 +1685,17 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) return err; for (;;) { - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (timeo && skb_queue_empty(&sk->sk_receive_queue)) { if (sk->sk_shutdown & RCV_SHUTDOWN) { err = -ENOTCONN; break; } + add_wait_queue(sk_sleep(sk), &wait); release_sock(sk); - timeo = schedule_timeout(timeo); + timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); + sched_annotate_sleep(); lock_sock(sk); + remove_wait_queue(sk_sleep(sk), &wait); } err = 0; if (!skb_queue_empty(&sk->sk_receive_queue)) @@ -1709,7 +1711,6 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) if (err) break; } - finish_wait(sk_sleep(sk), &wait); *timeop = timeo; return err; } -- cgit v1.2.3 From 2bdf700e538828d6456150b9319e5f689b062d54 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 19 Feb 2019 17:42:05 +0100 Subject: net: ip_gre: do not report erspan_ver for gre or gretap Report erspan version field to userspace in ipgre_fill_info just for erspan tunnels. The issue can be triggered with the following reproducer: $ip link add name gre1 type gre local 192.168.0.1 remote 192.168.1.1 $ip link set dev gre1 up $ip -d link sh gre1 13: gre1@NONE: mtu 1476 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000 link/gre 192.168.0.1 peer 192.168.1.1 promiscuity 0 minmtu 0 maxmtu 0 gre remote 192.168.1.1 local 192.168.0.1 ttl inherit erspan_ver 0 addrgenmode eui64 numtxqueues 1 numrxqueues 1 Fixes: f551c91de262 ("net: erspan: introduce erspan v2 for ip_gre") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 3978f807fa8b..6ae89f2b541b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1457,9 +1457,23 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_parm *p = &t->parms; __be16 o_flags = p->o_flags; - if ((t->erspan_ver == 1 || t->erspan_ver == 2) && - !t->collect_md) - o_flags |= TUNNEL_KEY; + if (t->erspan_ver == 1 || t->erspan_ver == 2) { + if (!t->collect_md) + o_flags |= TUNNEL_KEY; + + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) + goto nla_put_failure; + + if (t->erspan_ver == 1) { + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) + goto nla_put_failure; + } else { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) + goto nla_put_failure; + } + } if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, @@ -1495,19 +1509,6 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) - goto nla_put_failure; - - if (t->erspan_ver == 1) { - if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) - goto nla_put_failure; - } else if (t->erspan_ver == 2) { - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) - goto nla_put_failure; - if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) - goto nla_put_failure; - } - return 0; nla_put_failure: -- cgit v1.2.3 From 103d0244d29fcaf38f1339d4538919bbbc051490 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 19 Feb 2019 17:42:06 +0100 Subject: net: ip6_gre: do not report erspan_ver for ip6gre or ip6gretap Report erspan version field to userspace in ip6gre_fill_info just for erspan_v6 tunnels. Moreover report IFLA_GRE_ERSPAN_INDEX only for erspan version 1. The issue can be triggered with the following reproducer: $ip link add name gre6 type ip6gre local 2001::1 remote 2002::2 $ip link set gre6 up $ip -d link sh gre6 14: grep6@NONE: mtu 1448 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000 link/gre6 2001::1 peer 2002::2 promiscuity 0 minmtu 0 maxmtu 0 ip6gre remote 2002::2 local 2001::1 hoplimit 64 encaplimit 4 tclass 0x00 flowlabel 0x00000 erspan_index 0 erspan_ver 0 addrgenmode eui64 Fixes: 94d7d8f29287 ("ip6_gre: add erspan v2 support") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 43890898b0b5..0fdd0109d131 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -2104,9 +2104,23 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct __ip6_tnl_parm *p = &t->parms; __be16 o_flags = p->o_flags; - if ((p->erspan_ver == 1 || p->erspan_ver == 2) && - !p->collect_md) - o_flags |= TUNNEL_KEY; + if (p->erspan_ver == 1 || p->erspan_ver == 2) { + if (!p->collect_md) + o_flags |= TUNNEL_KEY; + + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) + goto nla_put_failure; + + if (p->erspan_ver == 1) { + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) + goto nla_put_failure; + } else { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid)) + goto nla_put_failure; + } + } if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, @@ -2121,8 +2135,7 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) || - nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark) || - nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) + nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, @@ -2140,19 +2153,6 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) - goto nla_put_failure; - - if (p->erspan_ver == 1) { - if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) - goto nla_put_failure; - } else if (p->erspan_ver == 2) { - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir)) - goto nla_put_failure; - if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid)) - goto nla_put_failure; - } - return 0; nla_put_failure: -- cgit v1.2.3 From 6321aa197547da397753757bd84c6ce64b3e3d89 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 Feb 2019 22:53:50 +0100 Subject: phonet: fix building with clang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit clang warns about overflowing the data[] member in the struct pnpipehdr: net/phonet/pep.c:295:8: warning: array index 4 is past the end of the array (which contains 1 element) [-Warray-bounds] if (hdr->data[4] == PEP_IND_READY) ^ ~ include/net/phonet/pep.h:66:3: note: array 'data' declared here u8 data[1]; Using a flexible array member at the end of the struct avoids the warning, but since we cannot have a flexible array member inside of the union, each index now has to be moved back by one, which makes it a little uglier. Signed-off-by: Arnd Bergmann Acked-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- include/net/phonet/pep.h | 5 +++-- net/phonet/pep.c | 32 ++++++++++++++++---------------- 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/phonet/pep.h b/include/net/phonet/pep.h index b669fe6dbc3b..98f31c7ea23d 100644 --- a/include/net/phonet/pep.h +++ b/include/net/phonet/pep.h @@ -63,10 +63,11 @@ struct pnpipehdr { u8 state_after_reset; /* reset request */ u8 error_code; /* any response */ u8 pep_type; /* status indication */ - u8 data[1]; + u8 data0; /* anything else */ }; + u8 data[]; }; -#define other_pep_type data[1] +#define other_pep_type data[0] static inline struct pnpipehdr *pnp_hdr(struct sk_buff *skb) { diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 9fc76b19cd3c..db3473540303 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -132,7 +132,7 @@ static int pep_indicate(struct sock *sk, u8 id, u8 code, ph->utid = 0; ph->message_id = id; ph->pipe_handle = pn->pipe_handle; - ph->data[0] = code; + ph->error_code = code; return pn_skb_send(sk, skb, NULL); } @@ -153,7 +153,7 @@ static int pipe_handler_request(struct sock *sk, u8 id, u8 code, ph->utid = id; /* whatever */ ph->message_id = id; ph->pipe_handle = pn->pipe_handle; - ph->data[0] = code; + ph->error_code = code; return pn_skb_send(sk, skb, NULL); } @@ -208,7 +208,7 @@ static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code, struct pnpipehdr *ph; struct sockaddr_pn dst; u8 data[4] = { - oph->data[0], /* PEP type */ + oph->pep_type, /* PEP type */ code, /* error code, at an unusual offset */ PAD, PAD, }; @@ -221,7 +221,7 @@ static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code, ph->utid = oph->utid; ph->message_id = PNS_PEP_CTRL_RESP; ph->pipe_handle = oph->pipe_handle; - ph->data[0] = oph->data[1]; /* CTRL id */ + ph->data0 = oph->data[0]; /* CTRL id */ pn_skb_get_src_sockaddr(oskb, &dst); return pn_skb_send(sk, skb, &dst); @@ -272,17 +272,17 @@ static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb) return -EINVAL; hdr = pnp_hdr(skb); - if (hdr->data[0] != PN_PEP_TYPE_COMMON) { + if (hdr->pep_type != PN_PEP_TYPE_COMMON) { net_dbg_ratelimited("Phonet unknown PEP type: %u\n", - (unsigned int)hdr->data[0]); + (unsigned int)hdr->pep_type); return -EOPNOTSUPP; } - switch (hdr->data[1]) { + switch (hdr->data[0]) { case PN_PEP_IND_FLOW_CONTROL: switch (pn->tx_fc) { case PN_LEGACY_FLOW_CONTROL: - switch (hdr->data[4]) { + switch (hdr->data[3]) { case PEP_IND_BUSY: atomic_set(&pn->tx_credits, 0); break; @@ -292,7 +292,7 @@ static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb) } break; case PN_ONE_CREDIT_FLOW_CONTROL: - if (hdr->data[4] == PEP_IND_READY) + if (hdr->data[3] == PEP_IND_READY) atomic_set(&pn->tx_credits, wake = 1); break; } @@ -301,12 +301,12 @@ static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb) case PN_PEP_IND_ID_MCFC_GRANT_CREDITS: if (pn->tx_fc != PN_MULTI_CREDIT_FLOW_CONTROL) break; - atomic_add(wake = hdr->data[4], &pn->tx_credits); + atomic_add(wake = hdr->data[3], &pn->tx_credits); break; default: net_dbg_ratelimited("Phonet unknown PEP indication: %u\n", - (unsigned int)hdr->data[1]); + (unsigned int)hdr->data[0]); return -EOPNOTSUPP; } if (wake) @@ -318,7 +318,7 @@ static int pipe_rcv_created(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *hdr = pnp_hdr(skb); - u8 n_sb = hdr->data[0]; + u8 n_sb = hdr->data0; pn->rx_fc = pn->tx_fc = PN_LEGACY_FLOW_CONTROL; __skb_pull(skb, sizeof(*hdr)); @@ -506,7 +506,7 @@ static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb) return -ECONNREFUSED; /* Parse sub-blocks */ - n_sb = hdr->data[4]; + n_sb = hdr->data[3]; while (n_sb > 0) { u8 type, buf[6], len = sizeof(buf); const u8 *data = pep_get_sb(skb, &type, &len, buf); @@ -739,7 +739,7 @@ static int pipe_do_remove(struct sock *sk) ph->utid = 0; ph->message_id = PNS_PIPE_REMOVE_REQ; ph->pipe_handle = pn->pipe_handle; - ph->data[0] = PAD; + ph->data0 = PAD; return pn_skb_send(sk, skb, NULL); } @@ -817,7 +817,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp, peer_type = hdr->other_pep_type << 8; /* Parse sub-blocks (options) */ - n_sb = hdr->data[4]; + n_sb = hdr->data[3]; while (n_sb > 0) { u8 type, buf[1], len = sizeof(buf); const u8 *data = pep_get_sb(skb, &type, &len, buf); @@ -1109,7 +1109,7 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) ph->utid = 0; if (pn->aligned) { ph->message_id = PNS_PIPE_ALIGNED_DATA; - ph->data[0] = 0; /* padding */ + ph->data0 = 0; /* padding */ } else ph->message_id = PNS_PIPE_DATA; ph->pipe_handle = pn->pipe_handle; -- cgit v1.2.3 From 5c14a4d05f68415af9e41a4e667d1748d41d1baf Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 21 Feb 2019 18:29:36 +0100 Subject: mac80211: Change default tx_sk_pacing_shift to 7 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we did the original tests for the optimal value of sk_pacing_shift, we came up with 6 ms of buffering as the default. Sadly, 6 is not a power of two, so when picking the shift value I erred on the size of less buffering and picked 4 ms instead of 8. This was probably wrong; those 2 ms of extra buffering makes a larger difference than I thought. So, change the default pacing shift to 7, which corresponds to 8 ms of buffering. The point of diminishing returns really kicks in after 8 ms, and so having this as a default should cut down on the need for extensive per-device testing and overrides needed in the drivers. Cc: stable@vger.kernel.org Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- net/mac80211/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 87a729926734..977dea436ee8 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -615,13 +615,13 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, * We need a bit of data queued to build aggregates properly, so * instruct the TCP stack to allow more than a single ms of data * to be queued in the stack. The value is a bit-shift of 1 - * second, so 8 is ~4ms of queued data. Only affects local TCP + * second, so 7 is ~8ms of queued data. Only affects local TCP * sockets. * This is the default, anyhow - drivers may need to override it * for local reasons (longer buffers, longer completion time, or * similar). */ - local->hw.tx_sk_pacing_shift = 8; + local->hw.tx_sk_pacing_shift = 7; /* set up some defaults */ local->hw.queues = 1; -- cgit v1.2.3 From 51d0af222f6fa43134c6187ab4f374630f6e0d96 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 22 Feb 2019 13:21:15 +0100 Subject: mac80211: allocate tailroom for forwarded mesh packets Forwarded packets enter the tx path through ieee80211_add_pending_skb, which skips the ieee80211_skb_resize call. Fixes WARN_ON in ccmp_encrypt_skb and resulting packet loss. Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index bb4d71efb6fb..c2a6da5d80da 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2644,6 +2644,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) struct ieee80211_sub_if_data *sdata = rx->sdata; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u16 ac, q, hdrlen; + int tailroom = 0; hdr = (struct ieee80211_hdr *) skb->data; hdrlen = ieee80211_hdrlen(hdr->frame_control); @@ -2732,8 +2733,12 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) if (!ifmsh->mshcfg.dot11MeshForwarding) goto out; + if (sdata->crypto_tx_tailroom_needed_cnt) + tailroom = IEEE80211_ENCRYPT_TAILROOM; + fwd_skb = skb_copy_expand(skb, local->tx_headroom + - sdata->encrypt_headroom, 0, GFP_ATOMIC); + sdata->encrypt_headroom, + tailroom, GFP_ATOMIC); if (!fwd_skb) goto out; -- cgit v1.2.3 From d1f20c03f48102e52eb98b8651d129b83134cae4 Mon Sep 17 00:00:00 2001 From: Maciej Kwiecien Date: Fri, 22 Feb 2019 09:45:26 +0100 Subject: sctp: don't compare hb_timer expire date before starting it hb_timer might not start at all for a particular transport because its start is conditional. In a result a node is not sending heartbeats. Function sctp_transport_reset_hb_timer has two roles: - initial start of hb_timer for a given transport, - update expire date of hb_timer for a given transport. The function is optimized to update timer's expire only if it is before a new calculated one but this comparison is invalid for a timer which has not yet started. Such a timer has expire == 0 and if a new expire value is bigger than (MAX_JIFFIES / 2 + 2) then "time_before" macro will fail and timer will not start resulting in no heartbeat packets send by the node. This was found when association was initialized within first 5 mins after system boot due to jiffies init value which is near to MAX_JIFFIES. Test kernel version: 4.9.154 (ARCH=arm) hb_timer.expire = 0; //initialized, not started timer new_expire = MAX_JIFFIES / 2 + 2; //or more time_before(hb_timer.expire, new_expire) == false Fixes: ba6f5e33bdbb ("sctp: avoid refreshing heartbeat timer too often") Reported-by: Marcin Stojek Tested-by: Marcin Stojek Signed-off-by: Maciej Kwiecien Reviewed-by: Alexander Sverdlin Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/transport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 033696e6f74f..ad158d311ffa 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -207,7 +207,8 @@ void sctp_transport_reset_hb_timer(struct sctp_transport *transport) /* When a data chunk is sent, reset the heartbeat interval. */ expires = jiffies + sctp_transport_timeout(transport); - if (time_before(transport->hb_timer.expires, expires) && + if ((time_before(transport->hb_timer.expires, expires) || + !timer_pending(&transport->hb_timer)) && !mod_timer(&transport->hb_timer, expires + prandom_u32_max(transport->rto))) sctp_transport_hold(transport); -- cgit v1.2.3 From efcc9bcaf77c07df01371a7c34e50424c291f3ac Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 20 Feb 2019 09:23:03 +0100 Subject: net: ip6_gre: fix possible NULL pointer dereference in ip6erspan_set_version Fix a possible NULL pointer dereference in ip6erspan_set_version checking nlattr data pointer kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 7549 Comm: syz-executor432 Not tainted 5.0.0-rc6-next-20190218 #37 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:ip6erspan_set_version+0x5c/0x350 net/ipv6/ip6_gre.c:1726 Code: 07 38 d0 7f 08 84 c0 0f 85 9f 02 00 00 49 8d bc 24 b0 00 00 00 c6 43 54 01 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 9a 02 00 00 4d 8b ac 24 b0 00 00 00 4d 85 ed 0f RSP: 0018:ffff888089ed7168 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: ffff8880869d6e58 RCX: 0000000000000000 RDX: 0000000000000016 RSI: ffffffff862736b4 RDI: 00000000000000b0 RBP: ffff888089ed7180 R08: 1ffff11010d3adcb R09: ffff8880869d6e58 R10: ffffed1010d3add5 R11: ffff8880869d6eaf R12: 0000000000000000 R13: ffffffff8931f8c0 R14: ffffffff862825d0 R15: ffff8880869d6e58 FS: 0000000000b3d880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000184 CR3: 0000000092cc5000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ip6erspan_newlink+0x66/0x7b0 net/ipv6/ip6_gre.c:2210 __rtnl_newlink+0x107b/0x16c0 net/core/rtnetlink.c:3176 rtnl_newlink+0x69/0xa0 net/core/rtnetlink.c:3234 rtnetlink_rcv_msg+0x465/0xb00 net/core/rtnetlink.c:5192 netlink_rcv_skb+0x17a/0x460 net/netlink/af_netlink.c:2485 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5210 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline] netlink_unicast+0x536/0x720 net/netlink/af_netlink.c:1336 netlink_sendmsg+0x8ae/0xd70 net/netlink/af_netlink.c:1925 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg+0xdd/0x130 net/socket.c:631 ___sys_sendmsg+0x806/0x930 net/socket.c:2136 __sys_sendmsg+0x105/0x1d0 net/socket.c:2174 __do_sys_sendmsg net/socket.c:2183 [inline] __se_sys_sendmsg net/socket.c:2181 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2181 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x440159 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fffa69156e8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440159 RDX: 0000000000000000 RSI: 0000000020001340 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 0000000000000001 R09: 00000000004002c8 R10: 0000000000000011 R11: 0000000000000246 R12: 00000000004019e0 R13: 0000000000401a70 R14: 0000000000000000 R15: 0000000000000000 Modules linked in: ---[ end trace 09f8a7d13b4faaa1 ]--- RIP: 0010:ip6erspan_set_version+0x5c/0x350 net/ipv6/ip6_gre.c:1726 Code: 07 38 d0 7f 08 84 c0 0f 85 9f 02 00 00 49 8d bc 24 b0 00 00 00 c6 43 54 01 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 9a 02 00 00 4d 8b ac 24 b0 00 00 00 4d 85 ed 0f RSP: 0018:ffff888089ed7168 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: ffff8880869d6e58 RCX: 0000000000000000 RDX: 0000000000000016 RSI: ffffffff862736b4 RDI: 00000000000000b0 RBP: ffff888089ed7180 R08: 1ffff11010d3adcb R09: ffff8880869d6e58 R10: ffffed1010d3add5 R11: ffff8880869d6eaf R12: 0000000000000000 R13: ffffffff8931f8c0 R14: ffffffff862825d0 R15: ffff8880869d6e58 FS: 0000000000b3d880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000184 CR3: 0000000092cc5000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 4974d5f678ab ("net: ip6_gre: initialize erspan_ver just for erspan tunnels") Reported-and-tested-by: syzbot+30191cf1057abd3064af@syzkaller.appspotmail.com Signed-off-by: Lorenzo Bianconi Reviewed-by: Greg Rose Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 0fdd0109d131..26f25b6e2833 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1722,6 +1722,9 @@ static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[], static void ip6erspan_set_version(struct nlattr *data[], struct __ip6_tnl_parm *parms) { + if (!data) + return; + parms->erspan_ver = 1; if (data[IFLA_GRE_ERSPAN_VER]) parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); -- cgit v1.2.3 From f5b51fe804ec2a6edce0f8f6b11ea57283f5857b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Feb 2019 18:18:12 +0100 Subject: ipv6: route: purge exception on removal When a netdevice is unregistered, we flush the relevant exception via rt6_sync_down_dev() -> fib6_ifdown() -> fib6_del() -> fib6_del_route(). Finally, we end-up calling rt6_remove_exception(), where we release the relevant dst, while we keep the references to the related fib6_info and dev. Such references should be released later when the dst will be destroyed. There are a number of caches that can keep the exception around for an unlimited amount of time - namely dst_cache, possibly even socket cache. As a result device registration may hang, as demonstrated by this script: ip netns add cl ip netns add rt ip netns add srv ip netns exec rt sysctl -w net.ipv6.conf.all.forwarding=1 ip link add name cl_veth type veth peer name cl_rt_veth ip link set dev cl_veth netns cl ip -n cl link set dev cl_veth up ip -n cl addr add dev cl_veth 2001::2/64 ip -n cl route add default via 2001::1 ip -n cl link add tunv6 type ip6tnl mode ip6ip6 local 2001::2 remote 2002::1 hoplimit 64 dev cl_veth ip -n cl link set tunv6 up ip -n cl addr add 2013::2/64 dev tunv6 ip link set dev cl_rt_veth netns rt ip -n rt link set dev cl_rt_veth up ip -n rt addr add dev cl_rt_veth 2001::1/64 ip link add name rt_srv_veth type veth peer name srv_veth ip link set dev srv_veth netns srv ip -n srv link set dev srv_veth up ip -n srv addr add dev srv_veth 2002::1/64 ip -n srv route add default via 2002::2 ip -n srv link add tunv6 type ip6tnl mode ip6ip6 local 2002::1 remote 2001::2 hoplimit 64 dev srv_veth ip -n srv link set tunv6 up ip -n srv addr add 2013::1/64 dev tunv6 ip link set dev rt_srv_veth netns rt ip -n rt link set dev rt_srv_veth up ip -n rt addr add dev rt_srv_veth 2002::2/64 ip netns exec srv netserver & sleep 0.1 ip netns exec cl ping6 -c 4 2013::1 ip netns exec cl netperf -H 2013::1 -t TCP_STREAM -l 3 & sleep 1 ip -n rt link set dev rt_srv_veth mtu 1400 wait %2 ip -n cl link del cl_veth This commit addresses the issue purging all the references held by the exception at time, as we currently do for e.g. ipv6 pcpu dst entries. v1 -> v2: - re-order the code to avoid accessing dst and net after dst_dev_put() Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 74b9b6fd4168..047c47224dba 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1274,18 +1274,29 @@ static DEFINE_SPINLOCK(rt6_exception_lock); static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { + struct fib6_info *from; struct net *net; if (!bucket || !rt6_ex) return; net = dev_net(rt6_ex->rt6i->dst.dev); + net->ipv6.rt6_stats->fib_rt_cache--; + + /* purge completely the exception to allow releasing the held resources: + * some [sk] cache may keep the dst around for unlimited time + */ + from = rcu_dereference_protected(rt6_ex->rt6i->from, + lockdep_is_held(&rt6_exception_lock)); + rcu_assign_pointer(rt6_ex->rt6i->from, NULL); + fib6_info_release(from); + dst_dev_put(&rt6_ex->rt6i->dst); + hlist_del_rcu(&rt6_ex->hlist); dst_release(&rt6_ex->rt6i->dst); kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; - net->ipv6.rt6_stats->fib_rt_cache--; } /* Remove oldest rt6_ex in bucket and free the memory -- cgit v1.2.3 From 52baf9878b65872a7fc735d7fae3350ea9f30646 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Feb 2019 22:34:54 +0100 Subject: net: socket: add check for negative optlen in compat setsockopt __sys_setsockopt() already checks for `optlen < 0`. Add an equivalent check to the compat path for robustness. This has to be `> INT_MAX` instead of `< 0` because the signedness of `optlen` is different here. Signed-off-by: Jann Horn Signed-off-by: David S. Miller --- net/compat.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index 959d1c51826d..3d348198004f 100644 --- a/net/compat.c +++ b/net/compat.c @@ -388,8 +388,12 @@ static int __compat_sys_setsockopt(int fd, int level, int optname, char __user *optval, unsigned int optlen) { int err; - struct socket *sock = sockfd_lookup(fd, &err); + struct socket *sock; + + if (optlen > INT_MAX) + return -EINVAL; + sock = sockfd_lookup(fd, &err); if (sock) { err = security_socket_setsockopt(sock, level, optname); if (err) { -- cgit v1.2.3 From 97f0082a0592212fc15d4680f5a4d80f79a1687c Mon Sep 17 00:00:00 2001 From: Kalash Nainwal Date: Wed, 20 Feb 2019 16:23:04 -0800 Subject: net: Set rtm_table to RT_TABLE_COMPAT for ipv6 for tables > 255 Set rtm_table to RT_TABLE_COMPAT for ipv6 for tables > 255 to keep legacy software happy. This is similar to what was done for ipv4 in commit 709772e6e065 ("net: Fix routing tables with id > 255 for legacy software"). Signed-off-by: Kalash Nainwal Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 047c47224dba..ce15dc4ccbfa 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4665,7 +4665,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, table = rt->fib6_table->tb6_id; else table = RT6_TABLE_UNSPEC; - rtm->rtm_table = table; + rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table)) goto nla_put_failure; -- cgit v1.2.3 From 543fc3fb41834a7f2e4cfa1dcf8aa9c472a52e9a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 21 Feb 2019 17:43:57 +0100 Subject: udpv6: add the required annotation to mib type In commit 029a37434880 ("udp6: cleanup stats accounting in recvmsg()") I forgot to add the percpu annotation for the mib pointer. Add it, and make sparse happy. Fixes: 029a37434880 ("udp6: cleanup stats accounting in recvmsg()") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv6/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2596ffdeebea..e6c52c27f354 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -288,8 +288,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int peeked, peeking, off; int err; int is_udplite = IS_UDPLITE(sk); + struct udp_mib __percpu *mib; bool checksum_valid = false; - struct udp_mib *mib; int is_udp4; if (flags & MSG_ERRQUEUE) -- cgit v1.2.3 From 5de362df44d71fc8f6b153ae4eaa2a1284c84490 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 21 Feb 2019 17:43:58 +0100 Subject: fou6: fix proto error handler argument type Last argument of gue6_err_proto_handler() has a wrong type annotation, fix it and make sparse happy again. Fixes: b8a51b38e4d4 ("fou, fou6: ICMP error handlers for FoU and GUE") Signed-off-by: Paolo Abeni Acked-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/fou6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c index b858bd5280bf..867474abe269 100644 --- a/net/ipv6/fou6.c +++ b/net/ipv6/fou6.c @@ -72,7 +72,7 @@ static int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, static int gue6_err_proto_handler(int proto, struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, u32 info) + u8 type, u8 code, int offset, __be32 info) { const struct inet6_protocol *ipprot; -- cgit v1.2.3 From 424a7cd078401591fc45587ffb2c012d7f402fb7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 21 Feb 2019 17:43:59 +0100 Subject: udpv6: fix possible user after free in error handler Before derefencing the encap pointer, commit e7cc082455cb ("udp: Support for error handlers of tunnels with arbitrary destination port") checks for a NULL value, but the two fetch operation can race with removal. Fix the above using a single access. Also fix a couple of type annotations, to make sparse happy. Fixes: e7cc082455cb ("udp: Support for error handlers of tunnels with arbitrary destination port") Signed-off-by: Paolo Abeni Acked-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/udp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e6c52c27f354..b444483cdb2b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -420,17 +420,19 @@ EXPORT_SYMBOL(udpv6_encap_enable); */ static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, u32 info) + u8 type, u8 code, int offset, __be32 info) { int i; for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, u32 info); + u8 type, u8 code, int offset, __be32 info); + const struct ip6_tnl_encap_ops *encap; - if (!ip6tun_encaps[i]) + encap = rcu_dereference(ip6tun_encaps[i]); + if (!encap) continue; - handler = rcu_dereference(ip6tun_encaps[i]->err_handler); + handler = encap->err_handler; if (handler && !handler(skb, opt, type, code, offset, info)) return 0; } -- cgit v1.2.3 From 92b95364235b6441a36861ff0ca4541a13351d60 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 21 Feb 2019 17:44:00 +0100 Subject: udp: fix possible user after free in error handler Similar to the previous commit, this addresses the same issue for ipv4: use a single fetch operation and use the correct rcu annotation. Fixes: e7cc082455cb ("udp: Support for error handlers of tunnels with arbitrary destination port") Signed-off-by: Paolo Abeni Acked-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv4/udp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5c3cd5d84a6f..372fdc5381a9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -562,10 +562,12 @@ static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info) for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { int (*handler)(struct sk_buff *skb, u32 info); + const struct ip_tunnel_encap_ops *encap; - if (!iptun_encaps[i]) + encap = rcu_dereference(iptun_encaps[i]); + if (!encap) continue; - handler = rcu_dereference(iptun_encaps[i]->err_handler); + handler = encap->err_handler; if (handler && !handler(skb, info)) return 0; } -- cgit v1.2.3 From 278e2148c07559dd4ad8602f22366d61eb2ee7b7 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 22 Feb 2019 21:22:32 +0800 Subject: Revert "bridge: do not add port to router list when receives query with source 0.0.0.0" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 5a2de63fd1a5 ("bridge: do not add port to router list when receives query with source 0.0.0.0") and commit 0fe5119e267f ("net: bridge: remove ipv6 zero address check in mcast queries") The reason is RFC 4541 is not a standard but suggestive. Currently we will elect 0.0.0.0 as Querier if there is no ip address configured on bridge. If we do not add the port which recives query with source 0.0.0.0 to router list, the IGMP reports will not be about to forward to Querier, IGMP data will also not be able to forward to dest. As Nikolay suggested, revert this change first and add a boolopt api to disable none-zero election in future if needed. Reported-by: Linus Lüssing Reported-by: Sebastian Gottschall Fixes: 5a2de63fd1a5 ("bridge: do not add port to router list when receives query with source 0.0.0.0") Fixes: 0fe5119e267f ("net: bridge: remove ipv6 zero address check in mcast queries") Signed-off-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 3aeff0895669..ac92b2eb32b1 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1204,14 +1204,7 @@ static void br_multicast_query_received(struct net_bridge *br, return; br_multicast_update_query_timer(br, query, max_delay); - - /* Based on RFC4541, section 2.1.1 IGMP Forwarding Rules, - * the arrival port for IGMP Queries where the source address - * is 0.0.0.0 should not be added to router port list. - */ - if ((saddr->proto == htons(ETH_P_IP) && saddr->u.ip4) || - saddr->proto == htons(ETH_P_IPV6)) - br_multicast_mark_router(br, port); + br_multicast_mark_router(br, port); } static void br_ip4_multicast_query(struct net_bridge *br, -- cgit v1.2.3 From 797a22bd5298c2674d927893f46cadf619dad11d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 23 Feb 2019 13:24:59 -0800 Subject: net/x25: fix a race in x25_bind() syzbot was able to trigger another soft lockup [1] I first thought it was the O(N^2) issue I mentioned in my prior fix (f657d22ee1f "net/x25: do not hold the cpu too long in x25_new_lci()"), but I eventually found that x25_bind() was not checking SOCK_ZAPPED state under socket lock protection. This means that multiple threads can end up calling x25_insert_socket() for the same socket, and corrupt x25_list [1] watchdog: BUG: soft lockup - CPU#0 stuck for 123s! [syz-executor.2:10492] Modules linked in: irq event stamp: 27515 hardirqs last enabled at (27514): [] trace_hardirqs_on_thunk+0x1a/0x1c hardirqs last disabled at (27515): [] trace_hardirqs_off_thunk+0x1a/0x1c softirqs last enabled at (32): [] x25_get_neigh+0xa3/0xd0 net/x25/x25_link.c:336 softirqs last disabled at (34): [] x25_find_socket+0x23/0x140 net/x25/af_x25.c:341 CPU: 0 PID: 10492 Comm: syz-executor.2 Not tainted 5.0.0-rc7+ #88 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__sanitizer_cov_trace_pc+0x4/0x50 kernel/kcov.c:97 Code: f4 ff ff ff e8 11 9f ea ff 48 c7 05 12 fb e5 08 00 00 00 00 e9 c8 e9 ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 55 48 89 e5 <48> 8b 75 08 65 48 8b 04 25 40 ee 01 00 65 8b 15 38 0c 92 7e 81 e2 RSP: 0018:ffff88806e94fc48 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13 RAX: 1ffff1100d84dac5 RBX: 0000000000000001 RCX: ffffc90006197000 RDX: 0000000000040000 RSI: ffffffff86324bf3 RDI: ffff88806c26d628 RBP: ffff88806e94fc48 R08: ffff88806c1c6500 R09: fffffbfff1282561 R10: fffffbfff1282560 R11: ffffffff89412b03 R12: ffff88806c26d628 R13: ffff888090455200 R14: dffffc0000000000 R15: 0000000000000000 FS: 00007f3a107e4700(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f3a107e3db8 CR3: 00000000a5544000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __x25_find_socket net/x25/af_x25.c:327 [inline] x25_find_socket+0x7d/0x140 net/x25/af_x25.c:342 x25_new_lci net/x25/af_x25.c:355 [inline] x25_connect+0x380/0xde0 net/x25/af_x25.c:784 __sys_connect+0x266/0x330 net/socket.c:1662 __do_sys_connect net/socket.c:1673 [inline] __se_sys_connect net/socket.c:1670 [inline] __x64_sys_connect+0x73/0xb0 net/socket.c:1670 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457e29 Code: ad b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f3a107e3c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000457e29 RDX: 0000000000000012 RSI: 0000000020000200 RDI: 0000000000000005 RBP: 000000000073c040 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f3a107e46d4 R13: 00000000004be362 R14: 00000000004ceb98 R15: 00000000ffffffff Sending NMI from CPU 0 to CPUs 1: NMI backtrace for cpu 1 CPU: 1 PID: 10493 Comm: syz-executor.3 Not tainted 5.0.0-rc7+ #88 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__read_once_size include/linux/compiler.h:193 [inline] RIP: 0010:queued_write_lock_slowpath+0x143/0x290 kernel/locking/qrwlock.c:86 Code: 4c 8d 2c 01 41 83 c7 03 41 0f b6 45 00 41 38 c7 7c 08 84 c0 0f 85 0c 01 00 00 8b 03 3d 00 01 00 00 74 1a f3 90 41 0f b6 55 00 <41> 38 d7 7c eb 84 d2 74 e7 48 89 df e8 cc aa 4e 00 eb dd be 04 00 RSP: 0018:ffff888085c47bd8 EFLAGS: 00000206 RAX: 0000000000000300 RBX: ffffffff89412b00 RCX: 1ffffffff1282560 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffffffff89412b00 RBP: ffff888085c47c70 R08: 1ffffffff1282560 R09: fffffbfff1282561 R10: fffffbfff1282560 R11: ffffffff89412b03 R12: 00000000000000ff R13: fffffbfff1282560 R14: 1ffff11010b88f7d R15: 0000000000000003 FS: 00007fdd04086700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fdd04064db8 CR3: 0000000090be0000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: queued_write_lock include/asm-generic/qrwlock.h:104 [inline] do_raw_write_lock+0x1d6/0x290 kernel/locking/spinlock_debug.c:203 __raw_write_lock_bh include/linux/rwlock_api_smp.h:204 [inline] _raw_write_lock_bh+0x3b/0x50 kernel/locking/spinlock.c:312 x25_insert_socket+0x21/0xe0 net/x25/af_x25.c:267 x25_bind+0x273/0x340 net/x25/af_x25.c:703 __sys_bind+0x23f/0x290 net/socket.c:1481 __do_sys_bind net/socket.c:1492 [inline] __se_sys_bind net/socket.c:1490 [inline] __x64_sys_bind+0x73/0xb0 net/socket.c:1490 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457e29 Fixes: 90c27297a9bf ("X.25 remove bkl in bind") Signed-off-by: Eric Dumazet Cc: andrew hendry Signed-off-by: David S. Miller --- net/x25/af_x25.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index ec3a828672ef..eff31348e20b 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -679,8 +679,7 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; int len, i, rc = 0; - if (!sock_flag(sk, SOCK_ZAPPED) || - addr_len != sizeof(struct sockaddr_x25) || + if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25) { rc = -EINVAL; goto out; @@ -699,9 +698,13 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } lock_sock(sk); - x25_sk(sk)->source_addr = addr->sx25_addr; - x25_insert_socket(sk); - sock_reset_flag(sk, SOCK_ZAPPED); + if (sock_flag(sk, SOCK_ZAPPED)) { + x25_sk(sk)->source_addr = addr->sx25_addr; + x25_insert_socket(sk); + sock_reset_flag(sk, SOCK_ZAPPED); + } else { + rc = -EINVAL; + } release_sock(sk); SOCK_DEBUG(sk, "x25_bind: socket is bound\n"); out: -- cgit v1.2.3 From bf50b606cfd85ac8d3d0adb711f3e22204059848 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 23 Feb 2019 15:51:51 -0800 Subject: tcp: repaired skbs must init their tso_segs syzbot reported a WARN_ON(!tcp_skb_pcount(skb)) in tcp_send_loss_probe() [1] This was caused by TCP_REPAIR sent skbs that inadvertenly were missing a call to tcp_init_tso_segs() [1] WARNING: CPU: 1 PID: 0 at net/ipv4/tcp_output.c:2534 tcp_send_loss_probe+0x771/0x8a0 net/ipv4/tcp_output.c:2534 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.0.0-rc7+ #77 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 panic+0x2cb/0x65c kernel/panic.c:214 __warn.cold+0x20/0x45 kernel/panic.c:571 report_bug+0x263/0x2b0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:178 [inline] fixup_bug arch/x86/kernel/traps.c:173 [inline] do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271 do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:290 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:973 RIP: 0010:tcp_send_loss_probe+0x771/0x8a0 net/ipv4/tcp_output.c:2534 Code: 88 fc ff ff 4c 89 ef e8 ed 75 c8 fb e9 c8 fc ff ff e8 43 76 c8 fb e9 63 fd ff ff e8 d9 75 c8 fb e9 94 f9 ff ff e8 bf 03 91 fb <0f> 0b e9 7d fa ff ff e8 b3 03 91 fb 0f b6 1d 37 43 7a 03 31 ff 89 RSP: 0018:ffff8880ae907c60 EFLAGS: 00010206 RAX: ffff8880a989c340 RBX: 0000000000000000 RCX: ffffffff85dedbdb RDX: 0000000000000100 RSI: ffffffff85dee0b1 RDI: 0000000000000005 RBP: ffff8880ae907c90 R08: ffff8880a989c340 R09: ffffed10147d1ae1 R10: ffffed10147d1ae0 R11: ffff8880a3e8d703 R12: ffff888091b90040 R13: ffff8880a3e8d540 R14: 0000000000008000 R15: ffff888091b90860 tcp_write_timer_handler+0x5c0/0x8a0 net/ipv4/tcp_timer.c:583 tcp_write_timer+0x10e/0x1d0 net/ipv4/tcp_timer.c:607 call_timer_fn+0x190/0x720 kernel/time/timer.c:1325 expire_timers kernel/time/timer.c:1362 [inline] __run_timers kernel/time/timer.c:1681 [inline] __run_timers kernel/time/timer.c:1649 [inline] run_timer_softirq+0x652/0x1700 kernel/time/timer.c:1694 __do_softirq+0x266/0x95a kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0x180/0x1d0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:536 [inline] smp_apic_timer_interrupt+0x14a/0x570 arch/x86/kernel/apic/apic.c:1062 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:807 RIP: 0010:native_safe_halt+0x2/0x10 arch/x86/include/asm/irqflags.h:58 Code: ff ff ff 48 89 c7 48 89 45 d8 e8 59 0c a1 fa 48 8b 45 d8 e9 ce fe ff ff 48 89 df e8 48 0c a1 fa eb 82 90 90 90 90 90 90 fb f4 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 f4 c3 90 90 90 90 90 90 RSP: 0018:ffff8880a98afd78 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13 RAX: 1ffffffff1125061 RBX: ffff8880a989c340 RCX: 0000000000000000 RDX: dffffc0000000000 RSI: 0000000000000001 RDI: ffff8880a989cbbc RBP: ffff8880a98afda8 R08: ffff8880a989c340 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001 R13: ffffffff889282f8 R14: 0000000000000001 R15: 0000000000000000 arch_cpu_idle+0x10/0x20 arch/x86/kernel/process.c:555 default_idle_call+0x36/0x90 kernel/sched/idle.c:93 cpuidle_idle_call kernel/sched/idle.c:153 [inline] do_idle+0x386/0x570 kernel/sched/idle.c:262 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:353 start_secondary+0x404/0x5c0 arch/x86/kernel/smpboot.c:271 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:243 Kernel Offset: disabled Rebooting in 86400 seconds.. Fixes: 79861919b889 ("tcp: fix TCP_REPAIR xmit queue setup") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Andrey Vagin Cc: Soheil Hassas Yeganeh Cc: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 730bc44dbad9..ccc78f3a4b60 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2347,6 +2347,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ } -- cgit v1.2.3