From 8de8b71b787f38983d414d2dba169a3bfefa668a Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 6 Apr 2022 17:58:04 +0200 Subject: xsk: Fix l2fwd for copy mode + busy poll combo While checking AF_XDP copy mode combined with busy poll, strange results were observed. rxdrop and txonly scenarios worked fine, but l2fwd broke immediately. After a deeper look, it turned out that for l2fwd, Tx side was exiting early due to xsk_no_wakeup() returning true and in the end xsk_generic_xmit() was never called. Note that AF_XDP Tx in copy mode is syscall steered, so the current behavior is broken. Txonly scenario only worked due to the fact that sk_mark_napi_id_once_xdp() was never called - since Rx side is not in the picture for this case and mentioned function is called in xsk_rcv_check(), sk::sk_napi_id was never set, which in turn meant that xsk_no_wakeup() was returning false (see the sk->sk_napi_id >= MIN_NAPI_ID check in there). To fix this, prefer busy poll in xsk_sendmsg() only when zero copy is enabled on a given AF_XDP socket. By doing so, busy poll in copy mode would not exit early on Tx side and eventually xsk_generic_xmit() will be called. Fixes: a0731952d9cd ("xsk: Add busy-poll support for {recv,send}msg()") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220406155804.434493-1-maciej.fijalkowski@intel.com --- net/xdp/xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 2c34caee0fd1..7d3a00cb24ec 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -639,7 +639,7 @@ static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len if (sk_can_busy_loop(sk)) sk_busy_loop(sk, 1); /* only support non-blocking sockets */ - if (xsk_no_wakeup(sk)) + if (xs->zc && xsk_no_wakeup(sk)) return 0; pool = xs->pool; -- cgit v1.2.3 From 425d239379db03d514cb1c476bfe7c320bb89dfc Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Sat, 9 Apr 2022 23:30:53 +0200 Subject: bpf: Fix release of page_pool in BPF_PROG_RUN in test runner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The live packet mode in BPF_PROG_RUN allocates a page_pool instance for each test run instance and uses it for the packet data. On setup it creates the page_pool, and calls xdp_reg_mem_model() to allow pages to be returned properly from the XDP data path. However, xdp_reg_mem_model() also raises the reference count of the page_pool itself, so the single page_pool_destroy() count on teardown was not enough to actually release the pool. To fix this, add an additional xdp_unreg_mem_model() call on teardown. Fixes: b530e9e1063e ("bpf: Add "live packet" mode for XDP in BPF_PROG_RUN") Reported-by: Freysteinn Alfredsson Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20220409213053.3117305-1-toke@redhat.com --- net/bpf/test_run.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index e7b9c2636d10..af709c182674 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -108,6 +108,7 @@ struct xdp_test_data { struct page_pool *pp; struct xdp_frame **frames; struct sk_buff **skbs; + struct xdp_mem_info mem; u32 batch_size; u32 frame_cnt; }; @@ -147,7 +148,6 @@ static void xdp_test_run_init_page(struct page *page, void *arg) static int xdp_test_run_setup(struct xdp_test_data *xdp, struct xdp_buff *orig_ctx) { - struct xdp_mem_info mem = {}; struct page_pool *pp; int err = -ENOMEM; struct page_pool_params pp_params = { @@ -174,7 +174,7 @@ static int xdp_test_run_setup(struct xdp_test_data *xdp, struct xdp_buff *orig_c } /* will copy 'mem.id' into pp->xdp_mem_id */ - err = xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pp); + err = xdp_reg_mem_model(&xdp->mem, MEM_TYPE_PAGE_POOL, pp); if (err) goto err_mmodel; @@ -202,6 +202,7 @@ err_skbs: static void xdp_test_run_teardown(struct xdp_test_data *xdp) { + xdp_unreg_mem_model(&xdp->mem); page_pool_destroy(xdp->pp); kfree(xdp->frames); kfree(xdp->skbs); -- cgit v1.2.3 From eba1a872cb73314280d5448d934935b23e30b7ca Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 12 Apr 2022 19:05:45 +0800 Subject: ipvs: correctly print the memory size of ip_vs_conn_tab The memory size of ip_vs_conn_tab changed after we use hlist instead of list. Fixes: 731109e78415 ("ipvs: use hlist instead of list") Signed-off-by: Pengcheng Yang Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 2c467c422dc6..fb67f1ca2495 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1495,7 +1495,7 @@ int __init ip_vs_conn_init(void) pr_info("Connection hash table configured " "(size=%d, memory=%ldKbytes)\n", ip_vs_conn_tab_size, - (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); + (long)(ip_vs_conn_tab_size*sizeof(*ip_vs_conn_tab))/1024); IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n", sizeof(struct ip_vs_conn)); -- cgit v1.2.3 From fc06b2867f4cea543505acfb194c2be4ebf0c7d3 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Wed, 20 Apr 2022 19:04:08 +0800 Subject: net: dsa: Add missing of_node_put() in dsa_port_link_register_of The device_node pointer is returned by of_parse_phandle() with refcount incremented. We should use of_node_put() on it when done. of_node_put() will check for NULL value. Fixes: a20f997010c4 ("net: dsa: Don't instantiate phylink for CPU/DSA ports unless needed") Signed-off-by: Miaoqian Lin Signed-off-by: David S. Miller --- net/dsa/port.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/dsa/port.c b/net/dsa/port.c index 32d472a82241..cdc56ba11f52 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1620,8 +1620,10 @@ int dsa_port_link_register_of(struct dsa_port *dp) if (ds->ops->phylink_mac_link_down) ds->ops->phylink_mac_link_down(ds, port, MLO_AN_FIXED, PHY_INTERFACE_MODE_NA); + of_node_put(phy_np); return dsa_port_phylink_register(dp); } + of_node_put(phy_np); return 0; } -- cgit v1.2.3 From babc3dc9524f0bcb5a0ec61f3c3639b11508fad6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 18 Apr 2022 12:21:05 +0200 Subject: netfilter: nft_set_rbtree: overlap detection with element re-addition after deletion This patch fixes spurious EEXIST errors. Extend d2df92e98a34 ("netfilter: nft_set_rbtree: handle element re-addition after deletion") to deal with elements with same end flags in the same transation. Reset the overlap flag as described by 7c84d41416d8 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion"). Fixes: 7c84d41416d8 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion") Fixes: d2df92e98a34 ("netfilter: nft_set_rbtree: handle element re-addition after deletion") Signed-off-by: Pablo Neira Ayuso Reviewed-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index d600a566da32..7325bee7d144 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -349,7 +349,11 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, *ext = &rbe->ext; return -EEXIST; } else { - p = &parent->rb_left; + overlap = false; + if (nft_rbtree_interval_end(rbe)) + p = &parent->rb_left; + else + p = &parent->rb_right; } } -- cgit v1.2.3 From b02d196c44ead1a5949729be9ff08fe781c3e48a Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Wed, 20 Apr 2022 19:52:19 +0300 Subject: bpf, lwt: Fix crash when using bpf_skb_set_tunnel_key() from bpf_xmit lwt hook xmit_check_hhlen() observes the dst for getting the device hard header length to make sure a modified packet can fit. When a helper which changes the dst - such as bpf_skb_set_tunnel_key() - is called as part of the xmit program the accessed dst is no longer valid. This leads to the following splat: BUG: kernel NULL pointer dereference, address: 00000000000000de #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 798 Comm: ping Not tainted 5.18.0-rc2+ #103 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 RIP: 0010:bpf_xmit+0xfb/0x17f Code: c6 c0 4d cd 8e 48 c7 c7 7d 33 f0 8e e8 42 09 fb ff 48 8b 45 58 48 8b 95 c8 00 00 00 48 2b 95 c0 00 00 00 48 83 e0 fe 48 8b 00 <0f> b7 80 de 00 00 00 39 c2 73 22 29 d0 b9 20 0a 00 00 31 d2 48 89 RSP: 0018:ffffb148c0bc7b98 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000240008 RCX: 0000000000000000 RDX: 0000000000000010 RSI: 00000000ffffffea RDI: 00000000ffffffff RBP: ffff922a828a4e00 R08: ffffffff8f1350e8 R09: 00000000ffffdfff R10: ffffffff8f055100 R11: ffffffff8f105100 R12: 0000000000000000 R13: ffff922a828a4e00 R14: 0000000000000040 R15: 0000000000000000 FS: 00007f414e8f0080(0000) GS:ffff922afdc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000de CR3: 0000000002d80006 CR4: 0000000000370ef0 Call Trace: lwtunnel_xmit.cold+0x71/0xc8 ip_finish_output2+0x279/0x520 ? __ip_finish_output.part.0+0x21/0x130 Fix by fetching the device hard header length before running the BPF code. Fixes: 3a0af8fd61f9 ("bpf: BPF for lightweight tunnel infrastructure") Signed-off-by: Eyal Birger Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220420165219.1755407-1-eyal.birger@gmail.com --- net/core/lwt_bpf.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 349480ef68a5..8b6b5e72b217 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -159,10 +159,8 @@ static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) return dst->lwtstate->orig_output(net, sk, skb); } -static int xmit_check_hhlen(struct sk_buff *skb) +static int xmit_check_hhlen(struct sk_buff *skb, int hh_len) { - int hh_len = skb_dst(skb)->dev->hard_header_len; - if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); @@ -274,6 +272,7 @@ static int bpf_xmit(struct sk_buff *skb) bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->xmit.prog) { + int hh_len = dst->dev->hard_header_len; __be16 proto = skb->protocol; int ret; @@ -291,7 +290,7 @@ static int bpf_xmit(struct sk_buff *skb) /* If the header was expanded, headroom might be too * small for L2 header to come, expand as needed. */ - ret = xmit_check_hhlen(skb); + ret = xmit_check_hhlen(skb, hh_len); if (unlikely(ret)) return ret; -- cgit v1.2.3 From 5b0b9e4c2c895227c8852488b3f09839233bba54 Mon Sep 17 00:00:00 2001 From: Francesco Ruggeri Date: Wed, 20 Apr 2022 17:50:26 -0700 Subject: tcp: md5: incorrect tcp_header_len for incoming connections In tcp_create_openreq_child we adjust tcp_header_len for md5 using the remote address in newsk. But that address is still 0 in newsk at this point, and it is only set later by the callers (tcp_v[46]_syn_recv_sock). Use the address from the request socket instead. Fixes: cfb6eeb4c860 ("[TCP]: MD5 Signature Option (RFC2385) support.") Signed-off-by: Francesco Ruggeri Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220421005026.686A45EC01F2@us226.sjc.aristanetworks.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_minisocks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6366df7aaf2a..6854bb1fb32b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -531,7 +531,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ - if (newtp->af_specific->md5_lookup(sk, newsk)) + if (treq->af_specific->req_md5_lookup(sk, req_to_sk(req))) newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) -- cgit v1.2.3 From 7f40ea2145d926510b27b785562d2c92df1b0d91 Mon Sep 17 00:00:00 2001 From: Clément Léger Date: Thu, 21 Apr 2022 12:12:47 +0200 Subject: net: bridge: switchdev: check br_vlan_group() return value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit br_vlan_group() can return NULL and thus return value must be checked to avoid dereferencing a NULL pointer. Fixes: 6284c723d9b9 ("net: bridge: mst: Notify switchdev drivers of VLAN MSTI migrations") Signed-off-by: Clément Léger Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20220421101247.121896-1-clement.leger@bootlin.com Signed-off-by: Jakub Kicinski --- net/bridge/br_switchdev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 8cc44c367231..18affda2b522 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -353,6 +353,8 @@ static int br_switchdev_vlan_attr_replay(struct net_device *br_dev, attr.orig_dev = br_dev; vg = br_vlan_group(br); + if (!vg) + return 0; list_for_each_entry(v, &vg->vlan_list, vlist) { if (v->msti) { -- cgit v1.2.3 From b253a0680ceadc5d7b4acca7aa2d870326cad8ad Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Wed, 20 Apr 2022 10:34:41 +0800 Subject: tcp: ensure to use the most recently sent skb when filling the rate sample If an ACK (s)acks multiple skbs, we favor the information from the most recently sent skb by choosing the skb with the highest prior_delivered count. But in the interval between receiving ACKs, we send multiple skbs with the same prior_delivered, because the tp->delivered only changes when we receive an ACK. We used RACK's solution, copying tcp_rack_sent_after() as tcp_skb_sent_after() helper to determine "which packet was sent last?". Later, we will use tcp_skb_sent_after() instead in RACK. Fixes: b9f64820fb22 ("tcp: track data delivery rate for a TCP connection") Signed-off-by: Pengcheng Yang Cc: Paolo Abeni Acked-by: Neal Cardwell Tested-by: Neal Cardwell Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/1650422081-22153-1-git-send-email-yangpc@wangsu.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 6 ++++++ net/ipv4/tcp_rate.c | 11 ++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 70ca4a5e330a..be712fb9ddd7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1042,6 +1042,7 @@ struct rate_sample { int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ u32 prior_in_flight; /* in flight before this ACK */ + u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ @@ -1164,6 +1165,11 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); +static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) +{ + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); +} + /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index fbab921670cc..9a8e014d9b5b 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -74,27 +74,32 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) * * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is * called multiple times. We favor the information from the most recently - * sent skb, i.e., the skb with the highest prior_delivered count. + * sent skb, i.e., the skb with the most recently sent time and the highest + * sequence. */ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + u64 tx_tstamp; if (!scb->tx.delivered_mstamp) return; + tx_tstamp = tcp_skb_timestamp_us(skb); if (!rs->prior_delivered || - after(scb->tx.delivered, rs->prior_delivered)) { + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { rs->prior_delivered_ce = scb->tx.delivered_ce; rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; + rs->last_end_seq = scb->end_seq; /* Record send time of most recently ACKed packet: */ - tp->first_tx_mstamp = tcp_skb_timestamp_us(skb); + tp->first_tx_mstamp = tx_tstamp; /* Find the duration of the "send phase" of this window: */ rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, scb->tx.first_tx_mstamp); -- cgit v1.2.3 From 165e3e17fe8fe6a8aab319bc6e631a2e23b9a857 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 20 Apr 2022 16:52:41 -0400 Subject: sctp: check asoc strreset_chunk in sctp_generate_reconf_event A null pointer reference issue can be triggered when the response of a stream reconf request arrives after the timer is triggered, such as: send Incoming SSN Reset Request ---> CPU0: reconf timer is triggered, go to the handler code before hold sk lock <--- reply with Outgoing SSN Reset Request CPU1: process Outgoing SSN Reset Request, and set asoc->strreset_chunk to NULL CPU0: continue the handler code, hold sk lock, and try to hold asoc->strreset_chunk, crash! In Ying Xu's testing, the call trace is: [ ] BUG: kernel NULL pointer dereference, address: 0000000000000010 [ ] RIP: 0010:sctp_chunk_hold+0xe/0x40 [sctp] [ ] Call Trace: [ ] [ ] sctp_sf_send_reconf+0x2c/0x100 [sctp] [ ] sctp_do_sm+0xa4/0x220 [sctp] [ ] sctp_generate_reconf_event+0xbd/0xe0 [sctp] [ ] call_timer_fn+0x26/0x130 This patch is to fix it by returning from the timer handler if asoc strreset_chunk is already set to NULL. Fixes: 7b9438de0cd4 ("sctp: add stream reconf timer") Reported-by: Ying Xu Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_sideeffect.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index b3815b568e8e..463c4a58d2c3 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -458,6 +458,10 @@ void sctp_generate_reconf_event(struct timer_list *t) goto out_unlock; } + /* This happens when the response arrives after the timer is triggered. */ + if (!asoc->strreset_chunk) + goto out_unlock; + error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_RECONF), asoc->state, asoc->ep, asoc, -- cgit v1.2.3 From b9b1e0da5800a41a537f3bd1c294e492dad5cc9e Mon Sep 17 00:00:00 2001 From: Rongguang Wei Date: Wed, 20 Apr 2022 10:38:04 +0800 Subject: netfilter: flowtable: Remove the empty file CONFIG_NF_FLOW_TABLE_IPV4 is already removed and the real user is also removed(nf_flow_table_ipv4.c is empty). Fixes: c42ba4290b2147aa ("netfilter: flowtable: remove ipv4/ipv6 modules") Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_flow_table_ipv4.c | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 net/ipv4/netfilter/nf_flow_table_ipv4.c (limited to 'net') diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c deleted file mode 100644 index e69de29bb2d1..000000000000 -- cgit v1.2.3 From 8ddffdb9442a9d60b4a6e679ac48d7d21403a674 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 19 Apr 2022 15:47:00 +0200 Subject: netfilter: Update ip6_route_me_harder to consider L3 domain The commit referenced below fixed packet re-routing if Netfilter mangles a routing key property of a packet and the packet is routed in a VRF L3 domain. The fix, however, addressed IPv4 re-routing, only. This commit applies the same behavior for IPv6. While at it, untangle the nested ternary operator to make the code more readable. Fixes: 6d8b49c3a3a3 ("netfilter: Update ip_route_me_harder to consider L3 domain") Cc: stable@vger.kernel.org Signed-off-by: Martin Willi Reviewed-by: David Ahern Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 1da332450d98..8ce60ab89015 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -24,14 +24,13 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff { const struct ipv6hdr *iph = ipv6_hdr(skb); struct sock *sk = sk_to_full_sk(sk_partial); + struct net_device *dev = skb_dst(skb)->dev; struct flow_keys flkeys; unsigned int hh_len; struct dst_entry *dst; int strict = (ipv6_addr_type(&iph->daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); struct flowi6 fl6 = { - .flowi6_oif = sk && sk->sk_bound_dev_if ? sk->sk_bound_dev_if : - strict ? skb_dst(skb)->dev->ifindex : 0, .flowi6_mark = skb->mark, .flowi6_uid = sock_net_uid(net, sk), .daddr = iph->daddr, @@ -39,6 +38,13 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff }; int err; + if (sk && sk->sk_bound_dev_if) + fl6.flowi6_oif = sk->sk_bound_dev_if; + else if (strict) + fl6.flowi6_oif = dev->ifindex; + else + fl6.flowi6_oif = l3mdev_master_ifindex(dev); + fib6_rules_early_flow_dissect(net, skb, &fl6, &flkeys); dst = ip6_route_output(net, sk, &fl6); err = dst->error; -- cgit v1.2.3 From 4e2e65e2e56c6ceb4ea1719360080c0af083229e Mon Sep 17 00:00:00 2001 From: liuyacan Date: Thu, 21 Apr 2022 17:40:27 +0800 Subject: net/smc: sync err code when tcp connection was refused In the current implementation, when TCP initiates a connection to an unavailable [ip,port], ECONNREFUSED will be stored in the TCP socket, but SMC will not. However, some apps (like curl) use getsockopt(,,SO_ERROR,,) to get the error information, which makes them miss the error message and behave strangely. Fixes: 50717a37db03 ("net/smc: nonblocking connect rework") Signed-off-by: liuyacan Reviewed-by: Tony Lu Acked-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index fc7b6eb22143..bbb1a4ce5050 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1475,6 +1475,8 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_state = SMC_CLOSED; if (rc == -EPIPE || rc == -EAGAIN) smc->sk.sk_err = EPIPE; + else if (rc == -ECONNREFUSED) + smc->sk.sk_err = ECONNREFUSED; else if (signal_pending(current)) smc->sk.sk_err = -sock_intr_errno(timeo); sock_put(&smc->sk); /* passive closing */ -- cgit v1.2.3 From ff827beb706ed719c766acf36449801ded0c17fc Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 21 Apr 2022 15:07:57 -0700 Subject: ip_gre: Make o_seqno start from 0 in native mode For GRE and GRETAP devices, currently o_seqno starts from 1 in native mode. According to RFC 2890 2.2., "The first datagram is sent with a sequence number of 0." Fix it. It is worth mentioning that o_seqno already starts from 0 in collect_md mode, see gre_fb_xmit(), where tunnel->o_seqno is passed to gre_build_header() before getting incremented. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Peilin Ye Acked-by: William Tu Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 365caebf51ab..21a8943f6fa4 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -459,14 +459,12 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); - - if (tunnel->parms.o_flags & TUNNEL_SEQ) - tunnel->o_seqno++; + __be16 flags = tunnel->parms.o_flags; /* Push GRE header. */ gre_build_header(skb, tunnel->tun_hlen, - tunnel->parms.o_flags, proto, tunnel->parms.o_key, - htonl(tunnel->o_seqno)); + flags, proto, tunnel->parms.o_key, + (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } -- cgit v1.2.3 From fde98ae91f79cab4e020f40c35ed23cbdc59661c Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 21 Apr 2022 15:08:38 -0700 Subject: ip6_gre: Make o_seqno start from 0 in native mode For IP6GRE and IP6GRETAP devices, currently o_seqno starts from 1 in native mode. According to RFC 2890 2.2., "The first datagram is sent with a sequence number of 0." Fix it. It is worth mentioning that o_seqno already starts from 0 in collect_md mode, see the "if (tunnel->parms.collect_md)" clause in __gre6_xmit(), where tunnel->o_seqno is passed to gre_build_header() before getting incremented. Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Signed-off-by: Peilin Ye Acked-by: William Tu Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 976236736146..d9e4ac94eab4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -724,6 +724,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, { struct ip6_tnl *tunnel = netdev_priv(dev); __be16 protocol; + __be16 flags; if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; @@ -739,7 +740,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, if (tunnel->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; - __be16 flags; int tun_hlen; tun_info = skb_tunnel_info_txcheck(skb); @@ -770,15 +770,14 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, : 0); } else { - if (tunnel->parms.o_flags & TUNNEL_SEQ) - tunnel->o_seqno++; - if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) return -ENOMEM; - gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, + flags = tunnel->parms.o_flags; + + gre_build_header(skb, tunnel->tun_hlen, flags, protocol, tunnel->parms.o_key, - htonl(tunnel->o_seqno)); + (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, -- cgit v1.2.3 From 31c417c948d7f6909cb63f0ac3298f3c38f8ce20 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 21 Apr 2022 15:09:02 -0700 Subject: ip_gre, ip6_gre: Fix race condition on o_seqno in collect_md mode As pointed out by Jakub Kicinski, currently using TUNNEL_SEQ in collect_md mode is racy for [IP6]GRE[TAP] devices. Consider the following sequence of events: 1. An [IP6]GRE[TAP] device is created in collect_md mode using "ip link add ... external". "ip" ignores "[o]seq" if "external" is specified, so TUNNEL_SEQ is off, and the device is marked as NETIF_F_LLTX (i.e. it uses lockless TX); 2. Someone sets TUNNEL_SEQ on outgoing skb's, using e.g. bpf_skb_set_tunnel_key() in an eBPF program attached to this device; 3. gre_fb_xmit() or __gre6_xmit() processes these skb's: gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); ^^^^^^^^^^^^^^^^^ Since we are not using the TX lock (&txq->_xmit_lock), multiple CPUs may try to do this tunnel->o_seqno++ in parallel, which is racy. Fix it by making o_seqno atomic_t. As mentioned by Eric Dumazet in commit b790e01aee74 ("ip_gre: lockless xmit"), making o_seqno atomic_t increases "chance for packets being out of order at receiver" when NETIF_F_LLTX is on. Maybe a better fix would be: 1. Do not ignore "oseq" in external mode. Users MUST specify "oseq" if they want the kernel to allow sequencing of outgoing packets; 2. Reject all outgoing TUNNEL_SEQ packets if the device was not created with "oseq". Unfortunately, that would break userspace. We could now make [IP6]GRE[TAP] devices always NETIF_F_LLTX, but let us do it in separate patches to keep this fix minimal. Suggested-by: Jakub Kicinski Fixes: 77a5196a804e ("gre: add sequence number for collect md mode.") Signed-off-by: Peilin Ye Acked-by: William Tu Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 2 +- include/net/ip_tunnels.h | 2 +- net/ipv4/ip_gre.c | 6 +++--- net/ipv6/ip6_gre.c | 7 ++++--- 4 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index a38c4f1e4e5c..74b369bddf49 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -58,7 +58,7 @@ struct ip6_tnl { /* These fields used only by GRE */ __u32 i_seqno; /* The last seen seqno */ - __u32 o_seqno; /* The last output seqno */ + atomic_t o_seqno; /* The last output seqno */ int hlen; /* tun_hlen + encap_hlen */ int tun_hlen; /* Precalculated header length */ int encap_hlen; /* Encap header length (FOU,GUE) */ diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 88dee57eac8a..c24fa934221d 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -116,7 +116,7 @@ struct ip_tunnel { /* These four fields used only by GRE */ u32 i_seqno; /* The last seen seqno */ - u32 o_seqno; /* The last output seqno */ + atomic_t o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ /* These four fields used only by ERSPAN */ diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 21a8943f6fa4..aacee9dd771b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -464,7 +464,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, /* Push GRE header. */ gre_build_header(skb, tunnel->tun_hlen, flags, proto, tunnel->parms.o_key, - (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); + (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } @@ -502,7 +502,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); gre_build_header(skb, tunnel_hlen, flags, proto, tunnel_id_to_key32(tun_info->key.tun_id), - (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); + (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); @@ -579,7 +579,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) } gre_build_header(skb, 8, TUNNEL_SEQ, - proto, 0, htonl(tunnel->o_seqno++)); + proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno))); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index d9e4ac94eab4..5136959b3dc5 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -766,7 +766,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), - (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) + (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); } else { @@ -777,7 +777,8 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tunnel->tun_hlen, flags, protocol, tunnel->parms.o_key, - (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); + (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) + : 0); } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, @@ -1055,7 +1056,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, /* Push GRE header. */ proto = (t->parms.erspan_ver == 1) ? htons(ETH_P_ERSPAN) : htons(ETH_P_ERSPAN2); - gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(t->o_seqno++)); + gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(atomic_fetch_inc(&t->o_seqno))); /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) -- cgit v1.2.3 From 7c762e70c50b462fabe44a597e2a6c3e56c236c0 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 22 Apr 2022 01:42:22 +0300 Subject: net: dsa: flood multicast to CPU when slave has IFF_PROMISC Certain DSA switches can eliminate flooding to the CPU when none of the ports have the IFF_ALLMULTI or IFF_PROMISC flags set. This is done by synthesizing a call to dsa_port_bridge_flags() for the CPU port, a call which normally comes from the bridge driver via switchdev. The bridge port flags and IFF_PROMISC|IFF_ALLMULTI have slightly different semantics, and due to inattention/lack of proper testing, the IFF_PROMISC flag allows unknown unicast to be flooded to the CPU, but not unknown multicast. This must be fixed by setting both BR_FLOOD (unicast) and BR_MCAST_FLOOD in the synthesized dsa_port_bridge_flags() call, since IFF_PROMISC means that packets should not be filtered regardless of their MAC DA. Fixes: 7569459a52c9 ("net: dsa: manage flooding on the CPU ports") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 41c69a6e7854..8022d50584db 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -285,7 +285,7 @@ static void dsa_port_manage_cpu_flood(struct dsa_port *dp) if (other_dp->slave->flags & IFF_ALLMULTI) flags.val |= BR_MCAST_FLOOD; if (other_dp->slave->flags & IFF_PROMISC) - flags.val |= BR_FLOOD; + flags.val |= BR_FLOOD | BR_MCAST_FLOOD; } err = dsa_port_pre_bridge_flags(dp, flags, NULL); -- cgit v1.2.3 From 4bfe744ff1644fbc0a991a2677dc874475dd6776 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 Apr 2022 17:34:07 -0700 Subject: tcp: fix potential xmit stalls caused by TCP_NOTSENT_LOWAT I had this bug sitting for too long in my pile, it is time to fix it. Thanks to Doug Porter for reminding me of it! We had various attempts in the past, including commit 0cbe6a8f089e ("tcp: remove SOCK_QUEUE_SHRUNK"), but the issue is that TCP stack currently only generates EPOLLOUT from input path, when tp->snd_una has advanced and skb(s) cleaned from rtx queue. If a flow has a big RTT, and/or receives SACKs, it is possible that the notsent part (tp->write_seq - tp->snd_nxt) reaches 0 and no more data can be sent until tp->snd_una finally advances. What is needed is to also check if POLLOUT needs to be generated whenever tp->snd_nxt is advanced, from output path. This bug triggers more often after an idle period, as we do not receive ACK for at least one RTT. tcp_notsent_lowat could be a fraction of what CWND and pacing rate would allow to send during this RTT. In a followup patch, I will remove the bogus call to tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED) from tcp_check_space(). Fact that we have decided to generate an EPOLLOUT does not mean the application has immediately refilled the transmit queue. This optimistic call might have been the reason the bug seemed not too serious. Tested: 200 ms rtt, 1% packet loss, 32 MB tcp_rmem[2] and tcp_wmem[2] $ echo 500000 >/proc/sys/net/ipv4/tcp_notsent_lowat $ cat bench_rr.sh SUM=0 for i in {1..10} do V=`netperf -H remote_host -l30 -t TCP_RR -- -r 10000000,10000 -o LOCAL_BYTES_SENT | egrep -v "MIGRATED|Bytes"` echo $V SUM=$(($SUM + $V)) done echo SUM=$SUM Before patch: $ bench_rr.sh 130000000 80000000 140000000 140000000 140000000 140000000 130000000 40000000 90000000 110000000 SUM=1140000000 After patch: $ bench_rr.sh 430000000 590000000 530000000 450000000 450000000 350000000 450000000 490000000 480000000 460000000 SUM=4680000000 # This is 410 % of the value before patch. Fixes: c9bee3b7fdec ("tcp: TCP_NOTSENT_LOWAT socket option") Signed-off-by: Eric Dumazet Reported-by: Doug Porter Cc: Soheil Hassas Yeganeh Cc: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 12 +++++++++++- net/ipv4/tcp_output.c | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index be712fb9ddd7..b99d9d9cbd99 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -620,6 +620,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); void tcp_fin(struct sock *sk); +void tcp_check_space(struct sock *sk); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2088f93fa37b..48f607522860 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5454,7 +5454,17 @@ static void tcp_new_space(struct sock *sk) INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk); } -static void tcp_check_space(struct sock *sk) +/* Caller made space either from: + * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced) + * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt) + * + * We might be able to generate EPOLLOUT to the application if: + * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2 + * 2) notsent amount (tp->write_seq - tp->snd_nxt) became + * small enough that tcp_stream_memory_free() decides it + * is time to generate EPOLLOUT. + */ +void tcp_check_space(struct sock *sk) { /* pairs with tcp_poll() */ smp_mb(); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9ede847f4199..1ca2f28c9981 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -82,6 +82,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); + tcp_check_space(sk); } /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one -- cgit v1.2.3 From ba5a4fdd63ae0c575707030db0b634b160baddd7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 Apr 2022 13:35:09 -0700 Subject: tcp: make sure treq->af_specific is initialized syzbot complained about a recent change in TCP stack, hitting a NULL pointer [1] tcp request sockets have an af_specific pointer, which was used before the blamed change only for SYNACK generation in non SYNCOOKIE mode. tcp requests sockets momentarily created when third packet coming from client in SYNCOOKIE mode were not using treq->af_specific. Make sure this field is populated, in the same way normal TCP requests sockets do in tcp_conn_request(). [1] TCP: request_sock_TCPv6: Possible SYN flooding on port 20002. Sending cookies. Check SNMP counters. general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 1 PID: 3695 Comm: syz-executor864 Not tainted 5.18.0-rc3-syzkaller-00224-g5fd1fe4807f9 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:tcp_create_openreq_child+0xe16/0x16b0 net/ipv4/tcp_minisocks.c:534 Code: 48 c1 ea 03 80 3c 02 00 0f 85 e5 07 00 00 4c 8b b3 28 01 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7e 08 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 c9 07 00 00 48 8b 3c 24 48 89 de 41 ff 56 08 48 RSP: 0018:ffffc90000de0588 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: ffff888076490330 RCX: 0000000000000100 RDX: 0000000000000001 RSI: ffffffff87d67ff0 RDI: 0000000000000008 RBP: ffff88806ee1c7f8 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff87d67f00 R11: 0000000000000000 R12: ffff88806ee1bfc0 R13: ffff88801b0e0368 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f517fe58700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffcead76960 CR3: 000000006f97b000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcp_v6_syn_recv_sock+0x199/0x23b0 net/ipv6/tcp_ipv6.c:1267 tcp_get_cookie_sock+0xc9/0x850 net/ipv4/syncookies.c:207 cookie_v6_check+0x15c3/0x2340 net/ipv6/syncookies.c:258 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:1131 [inline] tcp_v6_do_rcv+0x1148/0x13b0 net/ipv6/tcp_ipv6.c:1486 tcp_v6_rcv+0x3305/0x3840 net/ipv6/tcp_ipv6.c:1725 ip6_protocol_deliver_rcu+0x2e9/0x1900 net/ipv6/ip6_input.c:422 ip6_input_finish+0x14c/0x2c0 net/ipv6/ip6_input.c:464 NF_HOOK include/linux/netfilter.h:307 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] ip6_input+0x9c/0xd0 net/ipv6/ip6_input.c:473 dst_input include/net/dst.h:461 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:76 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] ipv6_rcv+0x27f/0x3b0 net/ipv6/ip6_input.c:297 __netif_receive_skb_one_core+0x114/0x180 net/core/dev.c:5405 __netif_receive_skb+0x24/0x1b0 net/core/dev.c:5519 process_backlog+0x3a0/0x7c0 net/core/dev.c:5847 __napi_poll+0xb3/0x6e0 net/core/dev.c:6413 napi_poll net/core/dev.c:6480 [inline] net_rx_action+0x8ec/0xc60 net/core/dev.c:6567 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 invoke_softirq kernel/softirq.c:432 [inline] __irq_exit_rcu+0x123/0x180 kernel/softirq.c:637 irq_exit_rcu+0x5/0x20 kernel/softirq.c:649 sysvec_apic_timer_interrupt+0x93/0xc0 arch/x86/kernel/apic/apic.c:1097 Fixes: 5b0b9e4c2c89 ("tcp: md5: incorrect tcp_header_len for incoming connections") Signed-off-by: Eric Dumazet Cc: Francesco Ruggeri Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/syncookies.c | 8 +++++++- net/ipv6/syncookies.c | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index b99d9d9cbd99..cc1295037533 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -480,6 +480,7 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, u32 cookie); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, + const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 2cb3b852d148..f33c31dd7366 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -281,6 +281,7 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, EXPORT_SYMBOL(cookie_ecn_ok); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, + const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) { @@ -297,6 +298,10 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, return NULL; treq = tcp_rsk(req); + + /* treq->af_specific might be used to perform TCP_MD5 lookup */ + treq->af_specific = af_ops; + treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; #if IS_ENABLED(CONFIG_MPTCP) treq->is_mptcp = sk_is_mptcp(sk); @@ -364,7 +369,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb); + req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, + &tcp_request_sock_ipv4_ops, sk, skb); if (!req) goto out; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index d1b61d00368e..9cc123f000fb 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -170,7 +170,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, sk, skb); + req = cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, + &tcp_request_sock_ipv6_ops, sk, skb); if (!req) goto out; -- cgit v1.2.3 From 97b9af7a70936e331170c79040cc9bf20071b566 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 22 Apr 2022 15:56:18 +0800 Subject: net/smc: Only save the original clcsock callback functions Both listen and fallback process will save the current clcsock callback functions and establish new ones. But if both of them happen, the saved callback functions will be overwritten. So this patch introduces some helpers to ensure that only save the original callback functions of clcsock. Fixes: 341adeec9ada ("net/smc: Forward wakeup to smc socket waitqueue after fallback") Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 55 +++++++++++++++++++++++++++++++++++------------------ net/smc/smc.h | 29 ++++++++++++++++++++++++++++ net/smc/smc_close.c | 3 ++- 3 files changed, 67 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index bbb1a4ce5050..d8433f17c5c9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -373,6 +373,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); mutex_init(&smc->clcsock_release_lock); + smc_init_saved_callbacks(smc); return sk; } @@ -782,9 +783,24 @@ static void smc_fback_error_report(struct sock *clcsk) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); } +static void smc_fback_replace_callbacks(struct smc_sock *smc) +{ + struct sock *clcsk = smc->clcsock->sk; + + clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + + smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, + &smc->clcsk_state_change); + smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready, + &smc->clcsk_data_ready); + smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space, + &smc->clcsk_write_space); + smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, + &smc->clcsk_error_report); +} + static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { - struct sock *clcsk; int rc = 0; mutex_lock(&smc->clcsock_release_lock); @@ -792,10 +808,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) rc = -EBADF; goto out; } - clcsk = smc->clcsock->sk; - if (smc->use_fallback) - goto out; smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -810,18 +823,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) * in smc sk->sk_wq and they should be woken up * as clcsock's wait queue is woken up. */ - smc->clcsk_state_change = clcsk->sk_state_change; - smc->clcsk_data_ready = clcsk->sk_data_ready; - smc->clcsk_write_space = clcsk->sk_write_space; - smc->clcsk_error_report = clcsk->sk_error_report; - - clcsk->sk_state_change = smc_fback_state_change; - clcsk->sk_data_ready = smc_fback_data_ready; - clcsk->sk_write_space = smc_fback_write_space; - clcsk->sk_error_report = smc_fback_error_report; - - smc->clcsock->sk->sk_user_data = - (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + smc_fback_replace_callbacks(smc); } out: mutex_unlock(&smc->clcsock_release_lock); @@ -1596,6 +1598,19 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) * function; switch it back to the original sk_data_ready function */ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready; + + /* if new clcsock has also inherited the fallback-specific callback + * functions, switch them back to the original ones. + */ + if (lsmc->use_fallback) { + if (lsmc->clcsk_state_change) + new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change; + if (lsmc->clcsk_write_space) + new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space; + if (lsmc->clcsk_error_report) + new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report; + } + (*new_smc)->clcsock = new_clcsock; out: return rc; @@ -2397,10 +2412,10 @@ static int smc_listen(struct socket *sock, int backlog) /* save original sk_data_ready function and establish * smc-specific sk_data_ready function */ - smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready; - smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready; smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, + smc_clcsock_data_ready, &smc->clcsk_data_ready); /* save original ops */ smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; @@ -2415,7 +2430,9 @@ static int smc_listen(struct socket *sock, int backlog) rc = kernel_listen(smc->clcsock, backlog); if (rc) { - smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); + smc->clcsock->sk->sk_user_data = NULL; goto out; } sk->sk_max_ack_backlog = backlog; diff --git a/net/smc/smc.h b/net/smc/smc.h index ea0620529ebe..5ed765ea0c73 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -288,12 +288,41 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } +static inline void smc_init_saved_callbacks(struct smc_sock *smc) +{ + smc->clcsk_state_change = NULL; + smc->clcsk_data_ready = NULL; + smc->clcsk_write_space = NULL; + smc->clcsk_error_report = NULL; +} + static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk) { return (struct smc_sock *) ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); } +/* save target_cb in saved_cb, and replace target_cb with new_cb */ +static inline void smc_clcsock_replace_cb(void (**target_cb)(struct sock *), + void (*new_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + /* only save once */ + if (!*saved_cb) + *saved_cb = *target_cb; + *target_cb = new_cb; +} + +/* restore target_cb to saved_cb, and reset saved_cb to NULL */ +static inline void smc_clcsock_restore_cb(void (**target_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + if (!*saved_cb) + return; + *target_cb = *saved_cb; + *saved_cb = NULL; +} + extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 676cb2333d3c..7bd1ef55b9df 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -214,7 +214,8 @@ again: sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { - smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } -- cgit v1.2.3 From 0558226cebee256aa3f8ec0cc5a800a10bf120a6 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 22 Apr 2022 15:56:19 +0800 Subject: net/smc: Fix slab-out-of-bounds issue in fallback syzbot reported a slab-out-of-bounds/use-after-free issue, which was caused by accessing an already freed smc sock in fallback-specific callback functions of clcsock. This patch fixes the issue by restoring fallback-specific callback functions to original ones and resetting clcsock sk_user_data to NULL before freeing smc sock. Meanwhile, this patch introduces sk_callback_lock to make the access and assignment to sk_user_data mutually exclusive. Reported-by: syzbot+b425899ed22c6943e00b@syzkaller.appspotmail.com Fixes: 341adeec9ada ("net/smc: Forward wakeup to smc socket waitqueue after fallback") Link: https://lore.kernel.org/r/00000000000013ca8105d7ae3ada@google.com/ Signed-off-by: Wen Gu Acked-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 80 ++++++++++++++++++++++++++++++++++++++--------------- net/smc/smc_close.c | 2 ++ 2 files changed, 59 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index d8433f17c5c9..fce16b9d6e1a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -243,11 +243,27 @@ struct proto smc_proto6 = { }; EXPORT_SYMBOL_GPL(smc_proto6); +static void smc_fback_restore_callbacks(struct smc_sock *smc) +{ + struct sock *clcsk = smc->clcsock->sk; + + write_lock_bh(&clcsk->sk_callback_lock); + clcsk->sk_user_data = NULL; + + smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change); + smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready); + smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space); + smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); +} + static void smc_restore_fallback_changes(struct smc_sock *smc) { if (smc->clcsock->file) { /* non-accepted sockets have no file yet */ smc->clcsock->file->private_data = smc->sk.sk_socket; smc->clcsock->file = NULL; + smc_fback_restore_callbacks(smc); } } @@ -745,48 +761,57 @@ out: static void smc_fback_state_change(struct sock *clcsk) { - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); + struct smc_sock *smc; - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change); + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_state_change); + read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_data_ready(struct sock *clcsk) { - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); + struct smc_sock *smc; - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready); + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_data_ready); + read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_write_space(struct sock *clcsk) { - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); + struct smc_sock *smc; - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space); + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_write_space); + read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_error_report(struct sock *clcsk) { - struct smc_sock *smc = - smc_clcsock_user_data(clcsk); + struct smc_sock *smc; - if (!smc) - return; - smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_error_report); + read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_replace_callbacks(struct smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; + write_lock_bh(&clcsk->sk_callback_lock); clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, @@ -797,6 +822,8 @@ static void smc_fback_replace_callbacks(struct smc_sock *smc) &smc->clcsk_write_space); smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); } static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) @@ -2370,17 +2397,20 @@ out: static void smc_clcsock_data_ready(struct sock *listen_clcsock) { - struct smc_sock *lsmc = - smc_clcsock_user_data(listen_clcsock); + struct smc_sock *lsmc; + read_lock_bh(&listen_clcsock->sk_callback_lock); + lsmc = smc_clcsock_user_data(listen_clcsock); if (!lsmc) - return; + goto out; lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) sock_put(&lsmc->sk); } +out: + read_unlock_bh(&listen_clcsock->sk_callback_lock); } static int smc_listen(struct socket *sock, int backlog) @@ -2412,10 +2442,12 @@ static int smc_listen(struct socket *sock, int backlog) /* save original sk_data_ready function and establish * smc-specific sk_data_ready function */ + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, smc_clcsock_data_ready, &smc->clcsk_data_ready); + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); /* save original ops */ smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; @@ -2430,9 +2462,11 @@ static int smc_listen(struct socket *sock, int backlog) rc = kernel_listen(smc->clcsock, backlog); if (rc) { + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); goto out; } sk->sk_max_ack_backlog = backlog; diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 7bd1ef55b9df..31db7438857c 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -214,9 +214,11 @@ again: sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } smc_close_cleanup_listen(sk); -- cgit v1.2.3 From b561275d633bcd8e0e8055ab86f1a13df75a0269 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Fri, 22 Apr 2022 19:43:40 +0800 Subject: mctp: defer the kfree of object mdev->addrs The function mctp_unregister() reclaims the device's relevant resource when a netcard detaches. However, a running routine may be unaware of this and cause the use-after-free of the mdev->addrs object. The race condition can be demonstrated below cleanup thread another thread | unregister_netdev() | mctp_sendmsg() ... | ... mctp_unregister() | rt = mctp_route_lookup() ... | mctl_local_output() kfree(mdev->addrs) | ... | saddr = rt->dev->addrs[0]; | An attacker can adopt the (recent provided) mtcpserial driver with pty to fake the device detaching and use the userfaultfd to increase the race success chance (in mctp_sendmsg). The KASan report for such a POC is shown below: [ 86.051955] ================================================================== [ 86.051955] BUG: KASAN: use-after-free in mctp_local_output+0x4e9/0xb7d [ 86.051955] Read of size 1 at addr ffff888005f298c0 by task poc/295 [ 86.051955] [ 86.051955] Call Trace: [ 86.051955] [ 86.051955] dump_stack_lvl+0x33/0x42 [ 86.051955] print_report.cold.13+0xb2/0x6b3 [ 86.051955] ? preempt_schedule_irq+0x57/0x80 [ 86.051955] ? mctp_local_output+0x4e9/0xb7d [ 86.051955] kasan_report+0xa5/0x120 [ 86.051955] ? mctp_local_output+0x4e9/0xb7d [ 86.051955] mctp_local_output+0x4e9/0xb7d [ 86.051955] ? mctp_dev_set_key+0x79/0x79 [ 86.051955] ? copyin+0x38/0x50 [ 86.051955] ? _copy_from_iter+0x1b6/0xf20 [ 86.051955] ? sysvec_apic_timer_interrupt+0x97/0xb0 [ 86.051955] ? asm_sysvec_apic_timer_interrupt+0x12/0x20 [ 86.051955] ? mctp_local_output+0x1/0xb7d [ 86.051955] mctp_sendmsg+0x64d/0xdb0 [ 86.051955] ? mctp_sk_close+0x20/0x20 [ 86.051955] ? __fget_light+0x2fd/0x4f0 [ 86.051955] ? mctp_sk_close+0x20/0x20 [ 86.051955] sock_sendmsg+0xdd/0x110 [ 86.051955] __sys_sendto+0x1cc/0x2a0 [ 86.051955] ? __ia32_sys_getpeername+0xa0/0xa0 [ 86.051955] ? new_sync_write+0x335/0x550 [ 86.051955] ? alloc_file+0x22f/0x500 [ 86.051955] ? __ip_do_redirect+0x820/0x1820 [ 86.051955] ? vfs_write+0x44d/0x7b0 [ 86.051955] ? vfs_write+0x44d/0x7b0 [ 86.051955] ? fput_many+0x15/0x120 [ 86.051955] ? ksys_write+0x155/0x1b0 [ 86.051955] ? __ia32_sys_read+0xa0/0xa0 [ 86.051955] __x64_sys_sendto+0xd8/0x1b0 [ 86.051955] ? exit_to_user_mode_prepare+0x2f/0x120 [ 86.051955] ? syscall_exit_to_user_mode+0x12/0x20 [ 86.051955] do_syscall_64+0x3a/0x80 [ 86.051955] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 86.051955] RIP: 0033:0x7f82118a56b3 [ 86.051955] RSP: 002b:00007ffdb154b110 EFLAGS: 00000293 ORIG_RAX: 000000000000002c [ 86.051955] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f82118a56b3 [ 86.051955] RDX: 0000000000000010 RSI: 00007f8211cd4000 RDI: 0000000000000007 [ 86.051955] RBP: 00007ffdb154c1d0 R08: 00007ffdb154b164 R09: 000000000000000c [ 86.051955] R10: 0000000000000000 R11: 0000000000000293 R12: 000055d779800db0 [ 86.051955] R13: 00007ffdb154c2b0 R14: 0000000000000000 R15: 0000000000000000 [ 86.051955] [ 86.051955] [ 86.051955] Allocated by task 295: [ 86.051955] kasan_save_stack+0x1c/0x40 [ 86.051955] __kasan_kmalloc+0x84/0xa0 [ 86.051955] mctp_rtm_newaddr+0x242/0x610 [ 86.051955] rtnetlink_rcv_msg+0x2fd/0x8b0 [ 86.051955] netlink_rcv_skb+0x11c/0x340 [ 86.051955] netlink_unicast+0x439/0x630 [ 86.051955] netlink_sendmsg+0x752/0xc00 [ 86.051955] sock_sendmsg+0xdd/0x110 [ 86.051955] __sys_sendto+0x1cc/0x2a0 [ 86.051955] __x64_sys_sendto+0xd8/0x1b0 [ 86.051955] do_syscall_64+0x3a/0x80 [ 86.051955] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 86.051955] [ 86.051955] Freed by task 301: [ 86.051955] kasan_save_stack+0x1c/0x40 [ 86.051955] kasan_set_track+0x21/0x30 [ 86.051955] kasan_set_free_info+0x20/0x30 [ 86.051955] __kasan_slab_free+0x104/0x170 [ 86.051955] kfree+0x8c/0x290 [ 86.051955] mctp_dev_notify+0x161/0x2c0 [ 86.051955] raw_notifier_call_chain+0x8b/0xc0 [ 86.051955] unregister_netdevice_many+0x299/0x1180 [ 86.051955] unregister_netdevice_queue+0x210/0x2f0 [ 86.051955] unregister_netdev+0x13/0x20 [ 86.051955] mctp_serial_close+0x6d/0xa0 [ 86.051955] tty_ldisc_kill+0x31/0xa0 [ 86.051955] tty_ldisc_hangup+0x24f/0x560 [ 86.051955] __tty_hangup.part.28+0x2ce/0x6b0 [ 86.051955] tty_release+0x327/0xc70 [ 86.051955] __fput+0x1df/0x8b0 [ 86.051955] task_work_run+0xca/0x150 [ 86.051955] exit_to_user_mode_prepare+0x114/0x120 [ 86.051955] syscall_exit_to_user_mode+0x12/0x20 [ 86.051955] do_syscall_64+0x46/0x80 [ 86.051955] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 86.051955] [ 86.051955] The buggy address belongs to the object at ffff888005f298c0 [ 86.051955] which belongs to the cache kmalloc-8 of size 8 [ 86.051955] The buggy address is located 0 bytes inside of [ 86.051955] 8-byte region [ffff888005f298c0, ffff888005f298c8) [ 86.051955] [ 86.051955] The buggy address belongs to the physical page: [ 86.051955] flags: 0x100000000000200(slab|node=0|zone=1) [ 86.051955] raw: 0100000000000200 dead000000000100 dead000000000122 ffff888005c42280 [ 86.051955] raw: 0000000000000000 0000000080660066 00000001ffffffff 0000000000000000 [ 86.051955] page dumped because: kasan: bad access detected [ 86.051955] [ 86.051955] Memory state around the buggy address: [ 86.051955] ffff888005f29780: 00 fc fc fc fc 00 fc fc fc fc 00 fc fc fc fc 00 [ 86.051955] ffff888005f29800: fc fc fc fc 00 fc fc fc fc 00 fc fc fc fc 00 fc [ 86.051955] >ffff888005f29880: fc fc fc fb fc fc fc fc fa fc fc fc fc fa fc fc [ 86.051955] ^ [ 86.051955] ffff888005f29900: fc fc 00 fc fc fc fc 00 fc fc fc fc 00 fc fc fc [ 86.051955] ffff888005f29980: fc 00 fc fc fc fc 00 fc fc fc fc 00 fc fc fc fc [ 86.051955] ================================================================== To this end, just like the commit e04480920d1e ("Bluetooth: defer cleanup of resources in hci_unregister_dev()") this patch defers the destructive kfree(mdev->addrs) in mctp_unregister to the mctp_dev_put, where the refcount of mdev is zero and the entire device is reclaimed. This prevents the use-after-free because the sendmsg thread holds the reference of mdev in the mctp_route object. Fixes: 583be982d934 (mctp: Add device handling and netlink interface) Signed-off-by: Lin Ma Acked-by: Jeremy Kerr Link: https://lore.kernel.org/r/20220422114340.32346-1-linma@zju.edu.cn Signed-off-by: Paolo Abeni --- net/mctp/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mctp/device.c b/net/mctp/device.c index f49be882e98e..99a3bda8852f 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -313,6 +313,7 @@ void mctp_dev_hold(struct mctp_dev *mdev) void mctp_dev_put(struct mctp_dev *mdev) { if (mdev && refcount_dec_and_test(&mdev->refs)) { + kfree(mdev->addrs); dev_put(mdev->dev); kfree_rcu(mdev, rcu); } @@ -441,7 +442,6 @@ static void mctp_unregister(struct net_device *dev) mctp_route_remove_dev(mdev); mctp_neigh_remove_dev(mdev); - kfree(mdev->addrs); mctp_dev_put(mdev); } -- cgit v1.2.3 From ba3beec2ec1d3b4fd8672ca6e781dac4b3267f6e Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 25 Apr 2022 17:37:45 +0200 Subject: xsk: Fix possible crash when multiple sockets are created Fix a crash that happens if an Rx only socket is created first, then a second socket is created that is Tx only and bound to the same umem as the first socket and also the same netdev and queue_id together with the XDP_SHARED_UMEM flag. In this specific case, the tx_descs array page pool was not created by the first socket as it was an Rx only socket. When the second socket is bound it needs this tx_descs array of this shared page pool as it has a Tx component, but unfortunately it was never allocated, leading to a crash. Note that this array is only used for zero-copy drivers using the batched Tx APIs, currently only ice and i40e. [ 5511.150360] BUG: kernel NULL pointer dereference, address: 0000000000000008 [ 5511.158419] #PF: supervisor write access in kernel mode [ 5511.164472] #PF: error_code(0x0002) - not-present page [ 5511.170416] PGD 0 P4D 0 [ 5511.173347] Oops: 0002 [#1] PREEMPT SMP PTI [ 5511.178186] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G E 5.18.0-rc1+ #97 [ 5511.187245] Hardware name: Intel Corp. GRANTLEY/GRANTLEY, BIOS GRRFCRB1.86B.0276.D07.1605190235 05/19/2016 [ 5511.198418] RIP: 0010:xsk_tx_peek_release_desc_batch+0x198/0x310 [ 5511.205375] Code: c0 83 c6 01 84 c2 74 6d 8d 46 ff 23 07 44 89 e1 48 83 c0 14 48 c1 e1 04 48 c1 e0 04 48 03 47 10 4c 01 c1 48 8b 50 08 48 8b 00 <48> 89 51 08 48 89 01 41 80 bd d7 00 00 00 00 75 82 48 8b 19 49 8b [ 5511.227091] RSP: 0018:ffffc90000003dd0 EFLAGS: 00010246 [ 5511.233135] RAX: 0000000000000000 RBX: ffff88810c8da600 RCX: 0000000000000000 [ 5511.241384] RDX: 000000000000003c RSI: 0000000000000001 RDI: ffff888115f555c0 [ 5511.249634] RBP: ffffc90000003e08 R08: 0000000000000000 R09: ffff889092296b48 [ 5511.257886] R10: 0000ffffffffffff R11: ffff889092296800 R12: 0000000000000000 [ 5511.266138] R13: ffff88810c8db500 R14: 0000000000000040 R15: 0000000000000100 [ 5511.274387] FS: 0000000000000000(0000) GS:ffff88903f800000(0000) knlGS:0000000000000000 [ 5511.283746] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 5511.290389] CR2: 0000000000000008 CR3: 00000001046e2001 CR4: 00000000003706f0 [ 5511.298640] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 5511.306892] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 5511.315142] Call Trace: [ 5511.317972] [ 5511.320301] ice_xmit_zc+0x68/0x2f0 [ice] [ 5511.324977] ? ktime_get+0x38/0xa0 [ 5511.328913] ice_napi_poll+0x7a/0x6a0 [ice] [ 5511.333784] __napi_poll+0x2c/0x160 [ 5511.337821] net_rx_action+0xdd/0x200 [ 5511.342058] __do_softirq+0xe6/0x2dd [ 5511.346198] irq_exit_rcu+0xb5/0x100 [ 5511.350339] common_interrupt+0xa4/0xc0 [ 5511.354777] [ 5511.357201] [ 5511.359625] asm_common_interrupt+0x1e/0x40 [ 5511.364466] RIP: 0010:cpuidle_enter_state+0xd2/0x360 [ 5511.370211] Code: 49 89 c5 0f 1f 44 00 00 31 ff e8 e9 00 7b ff 45 84 ff 74 12 9c 58 f6 c4 02 0f 85 72 02 00 00 31 ff e8 02 0c 80 ff fb 45 85 f6 <0f> 88 11 01 00 00 49 63 c6 4c 2b 2c 24 48 8d 14 40 48 8d 14 90 49 [ 5511.391921] RSP: 0018:ffffffff82a03e60 EFLAGS: 00000202 [ 5511.397962] RAX: ffff88903f800000 RBX: 0000000000000001 RCX: 000000000000001f [ 5511.406214] RDX: 0000000000000000 RSI: ffffffff823400b9 RDI: ffffffff8234c046 [ 5511.424646] RBP: ffff88810a384800 R08: 000005032a28c046 R09: 0000000000000008 [ 5511.443233] R10: 000000000000000b R11: 0000000000000006 R12: ffffffff82bcf700 [ 5511.461922] R13: 000005032a28c046 R14: 0000000000000001 R15: 0000000000000000 [ 5511.480300] cpuidle_enter+0x29/0x40 [ 5511.494329] do_idle+0x1c7/0x250 [ 5511.507610] cpu_startup_entry+0x19/0x20 [ 5511.521394] start_kernel+0x649/0x66e [ 5511.534626] secondary_startup_64_no_verify+0xc3/0xcb [ 5511.549230] Detect such case during bind() and allocate this memory region via newly introduced xp_alloc_tx_descs(). Also, use kvcalloc instead of kcalloc as for other buffer pool allocations, so that it matches the kvfree() from xp_destroy(). Fixes: d1bc532e99be ("i40e: xsk: Move tmp desc array from driver to pool") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220425153745.481322-1-maciej.fijalkowski@intel.com --- include/net/xsk_buff_pool.h | 1 + net/xdp/xsk.c | 13 +++++++++++++ net/xdp/xsk_buff_pool.c | 16 ++++++++++++---- 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 5554ee75e7da..647722e847b4 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -97,6 +97,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, u16 queue_id, u16 flags); int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, struct net_device *dev, u16 queue_id); +int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs); void xp_destroy(struct xsk_buff_pool *pool); void xp_get_pool(struct xsk_buff_pool *pool); bool xp_put_pool(struct xsk_buff_pool *pool); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 7d3a00cb24ec..3a9348030e20 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -967,6 +967,19 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xp_get_pool(umem_xs->pool); xs->pool = umem_xs->pool; + + /* If underlying shared umem was created without Tx + * ring, allocate Tx descs array that Tx batching API + * utilizes + */ + if (xs->tx && !xs->pool->tx_descs) { + err = xp_alloc_tx_descs(xs->pool, xs); + if (err) { + xp_put_pool(xs->pool); + sockfd_put(sock); + goto out_unlock; + } + } } xdp_get_umem(umem_xs->umem); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index af040ffa14ff..87bdd71c7bb6 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -42,6 +42,16 @@ void xp_destroy(struct xsk_buff_pool *pool) kvfree(pool); } +int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs) +{ + pool->tx_descs = kvcalloc(xs->tx->nentries, sizeof(*pool->tx_descs), + GFP_KERNEL); + if (!pool->tx_descs) + return -ENOMEM; + + return 0; +} + struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem) { @@ -59,11 +69,9 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, if (!pool->heads) goto out; - if (xs->tx) { - pool->tx_descs = kcalloc(xs->tx->nentries, sizeof(*pool->tx_descs), GFP_KERNEL); - if (!pool->tx_descs) + if (xs->tx) + if (xp_alloc_tx_descs(pool, xs)) goto out; - } pool->chunk_mask = ~((u64)umem->chunk_size - 1); pool->addrs_cnt = umem->size; -- cgit v1.2.3 From c86cc5a3ec70f5644f1fa21610b943d0441bc1f7 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 22 Apr 2022 12:58:16 -0700 Subject: Bluetooth: hci_event: Fix checking for invalid handle on error status Commit d5ebaa7c5f6f6 introduces checks for handle range (e.g HCI_CONN_HANDLE_MAX) but controllers like Intel AX200 don't seem to respect the valid range int case of error status: > HCI Event: Connect Complete (0x03) plen 11 Status: Page Timeout (0x04) Handle: 65535 Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment& Sound Products Inc) Link type: ACL (0x01) Encryption: Disabled (0x00) [1644965.827560] Bluetooth: hci0: Ignoring HCI_Connection_Complete for invalid handle Because of it is impossible to cleanup the connections properly since the stack would attempt to cancel the connection which is no longer in progress causing the following trace: < HCI Command: Create Connection Cancel (0x01|0x0008) plen 6 Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment& Sound Products Inc) = bluetoothd: src/profile.c:record_cb() Unable to get Hands-Free Voice gateway SDP record: Connection timed out > HCI Event: Command Complete (0x0e) plen 10 Create Connection Cancel (0x01|0x0008) ncmd 1 Status: Unknown Connection Identifier (0x02) Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment& Sound Products Inc) < HCI Command: Create Connection Cancel (0x01|0x0008) plen 6 Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment& Sound Products Inc) Fixes: d5ebaa7c5f6f6 ("Bluetooth: hci_event: Ignore multiple conn complete events") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_event.c | 65 +++++++++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 5cb095b09a94..69ef31cea582 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -578,6 +578,7 @@ enum { #define HCI_ERROR_CONNECTION_TIMEOUT 0x08 #define HCI_ERROR_REJ_LIMITED_RESOURCES 0x0d #define HCI_ERROR_REJ_BAD_ADDR 0x0f +#define HCI_ERROR_INVALID_PARAMETERS 0x12 #define HCI_ERROR_REMOTE_USER_TERM 0x13 #define HCI_ERROR_REMOTE_LOW_RESOURCES 0x14 #define HCI_ERROR_REMOTE_POWER_OFF 0x15 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index abaabfae19cc..3a9071b987f4 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3067,13 +3067,9 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, { struct hci_ev_conn_complete *ev = data; struct hci_conn *conn; + u8 status = ev->status; - if (__le16_to_cpu(ev->handle) > HCI_CONN_HANDLE_MAX) { - bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for invalid handle"); - return; - } - - bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); + bt_dev_dbg(hdev, "status 0x%2.2x", status); hci_dev_lock(hdev); @@ -3122,8 +3118,14 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, goto unlock; } - if (!ev->status) { + if (!status) { conn->handle = __le16_to_cpu(ev->handle); + if (conn->handle > HCI_CONN_HANDLE_MAX) { + bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", + conn->handle, HCI_CONN_HANDLE_MAX); + status = HCI_ERROR_INVALID_PARAMETERS; + goto done; + } if (conn->type == ACL_LINK) { conn->state = BT_CONFIG; @@ -3164,18 +3166,18 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, hci_send_cmd(hdev, HCI_OP_CHANGE_CONN_PTYPE, sizeof(cp), &cp); } - } else { - conn->state = BT_CLOSED; - if (conn->type == ACL_LINK) - mgmt_connect_failed(hdev, &conn->dst, conn->type, - conn->dst_type, ev->status); } if (conn->type == ACL_LINK) hci_sco_setup(conn, ev->status); - if (ev->status) { - hci_connect_cfm(conn, ev->status); +done: + if (status) { + conn->state = BT_CLOSED; + if (conn->type == ACL_LINK) + mgmt_connect_failed(hdev, &conn->dst, conn->type, + conn->dst_type, status); + hci_connect_cfm(conn, status); hci_conn_del(conn); } else if (ev->link_type == SCO_LINK) { switch (conn->setting & SCO_AIRMODE_MASK) { @@ -3185,7 +3187,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, break; } - hci_connect_cfm(conn, ev->status); + hci_connect_cfm(conn, status); } unlock: @@ -4676,6 +4678,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, { struct hci_ev_sync_conn_complete *ev = data; struct hci_conn *conn; + u8 status = ev->status; switch (ev->link_type) { case SCO_LINK: @@ -4690,12 +4693,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, return; } - if (__le16_to_cpu(ev->handle) > HCI_CONN_HANDLE_MAX) { - bt_dev_err(hdev, "Ignoring HCI_Sync_Conn_Complete for invalid handle"); - return; - } - - bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); + bt_dev_dbg(hdev, "status 0x%2.2x", status); hci_dev_lock(hdev); @@ -4729,9 +4727,17 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, goto unlock; } - switch (ev->status) { + switch (status) { case 0x00: conn->handle = __le16_to_cpu(ev->handle); + if (conn->handle > HCI_CONN_HANDLE_MAX) { + bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", + conn->handle, HCI_CONN_HANDLE_MAX); + status = HCI_ERROR_INVALID_PARAMETERS; + conn->state = BT_CLOSED; + break; + } + conn->state = BT_CONNECTED; conn->type = ev->link_type; @@ -4775,8 +4781,8 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, } } - hci_connect_cfm(conn, ev->status); - if (ev->status) + hci_connect_cfm(conn, status); + if (status) hci_conn_del(conn); unlock: @@ -5527,11 +5533,6 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, struct smp_irk *irk; u8 addr_type; - if (handle > HCI_CONN_HANDLE_MAX) { - bt_dev_err(hdev, "Ignoring HCI_LE_Connection_Complete for invalid handle"); - return; - } - hci_dev_lock(hdev); /* All controllers implicitly stop advertising in the event of a @@ -5603,6 +5604,12 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, conn->dst_type = ev_bdaddr_type(hdev, conn->dst_type, NULL); + if (handle > HCI_CONN_HANDLE_MAX) { + bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", handle, + HCI_CONN_HANDLE_MAX); + status = HCI_ERROR_INVALID_PARAMETERS; + } + if (status) { hci_le_conn_failed(conn, status); goto unlock; -- cgit v1.2.3 From aef2aa4fa98e18ea5d9345bf777ee698c8598728 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 22 Apr 2022 12:58:17 -0700 Subject: Bluetooth: hci_event: Fix creating hci_conn object on error status It is useless to create a hci_conn object if on error status as the result would be it being freed in the process and anyway it is likely the result of controller and host stack being out of sync. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 3a9071b987f4..5a6c8afc51a0 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3075,6 +3075,12 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); if (!conn) { + /* In case of error status and there is no connection pending + * just unlock as there is nothing to cleanup. + */ + if (ev->status) + goto unlock; + /* Connection may not exist if auto-connected. Check the bredr * allowlist to see if this device is allowed to auto connect. * If link is an ACL type, create a connection class @@ -5542,6 +5548,12 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, conn = hci_lookup_le_connect(hdev); if (!conn) { + /* In case of error status and there is no connection pending + * just unlock as there is nothing to cleanup. + */ + if (status) + goto unlock; + conn = hci_conn_add(hdev, LE_LINK, bdaddr, role); if (!conn) { bt_dev_err(hdev, "no memory for new connection"); -- cgit v1.2.3 From 9b3628d79b46f06157affc56fdb218fdd4988321 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 22 Apr 2022 12:58:18 -0700 Subject: Bluetooth: hci_sync: Cleanup hci_conn if it cannot be aborted This attempts to cleanup the hci_conn if it cannot be aborted as otherwise it would likely result in having the controller and host stack out of sync with respect to connection handle. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_conn.c | 32 ++++++++++++++++++++++++-------- net/bluetooth/hci_event.c | 13 ++++--------- net/bluetooth/hci_sync.c | 11 ++++++++++- 4 files changed, 39 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index d5377740e99c..8abd08245326 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1156,7 +1156,7 @@ int hci_conn_switch_role(struct hci_conn *conn, __u8 role); void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active); -void hci_le_conn_failed(struct hci_conn *conn, u8 status); +void hci_conn_failed(struct hci_conn *conn, u8 status); /* * hci_conn_get() and hci_conn_put() are used to control the life-time of an diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 84312c836549..fe803bee419a 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -670,7 +670,7 @@ static void le_conn_timeout(struct work_struct *work) /* Disable LE Advertising */ le_disable_advertising(hdev); hci_dev_lock(hdev); - hci_le_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT); + hci_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT); hci_dev_unlock(hdev); return; } @@ -873,7 +873,7 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type) EXPORT_SYMBOL(hci_get_route); /* This function requires the caller holds hdev->lock */ -void hci_le_conn_failed(struct hci_conn *conn, u8 status) +static void hci_le_conn_failed(struct hci_conn *conn, u8 status) { struct hci_dev *hdev = conn->hdev; struct hci_conn_params *params; @@ -886,8 +886,6 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) params->conn = NULL; } - conn->state = BT_CLOSED; - /* If the status indicates successful cancellation of * the attempt (i.e. Unknown Connection Id) there's no point of * notifying failure since we'll go back to keep trying to @@ -899,10 +897,6 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) mgmt_connect_failed(hdev, &conn->dst, conn->type, conn->dst_type, status); - hci_connect_cfm(conn, status); - - hci_conn_del(conn); - /* Since we may have temporarily stopped the background scanning in * favor of connection establishment, we should restart it. */ @@ -914,6 +908,28 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) hci_enable_advertising(hdev); } +/* This function requires the caller holds hdev->lock */ +void hci_conn_failed(struct hci_conn *conn, u8 status) +{ + struct hci_dev *hdev = conn->hdev; + + bt_dev_dbg(hdev, "status 0x%2.2x", status); + + switch (conn->type) { + case LE_LINK: + hci_le_conn_failed(conn, status); + break; + case ACL_LINK: + mgmt_connect_failed(hdev, &conn->dst, conn->type, + conn->dst_type, status); + break; + } + + conn->state = BT_CLOSED; + hci_connect_cfm(conn, status); + hci_conn_del(conn); +} + static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn = data; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 5a6c8afc51a0..66451661283c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2834,7 +2834,7 @@ static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) bt_dev_dbg(hdev, "status 0x%2.2x", status); /* All connection failure handling is taken care of by the - * hci_le_conn_failed function which is triggered by the HCI + * hci_conn_failed function which is triggered by the HCI * request completion callbacks used for connecting. */ if (status) @@ -2859,7 +2859,7 @@ static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status) bt_dev_dbg(hdev, "status 0x%2.2x", status); /* All connection failure handling is taken care of by the - * hci_le_conn_failed function which is triggered by the HCI + * hci_conn_failed function which is triggered by the HCI * request completion callbacks used for connecting. */ if (status) @@ -3179,12 +3179,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, done: if (status) { - conn->state = BT_CLOSED; - if (conn->type == ACL_LINK) - mgmt_connect_failed(hdev, &conn->dst, conn->type, - conn->dst_type, status); - hci_connect_cfm(conn, status); - hci_conn_del(conn); + hci_conn_failed(conn, status); } else if (ev->link_type == SCO_LINK) { switch (conn->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_CVSD: @@ -5623,7 +5618,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, } if (status) { - hci_le_conn_failed(conn, status); + hci_conn_failed(conn, status); goto unlock; } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 8f4c5698913d..13600bf120b0 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4408,12 +4408,21 @@ static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, static int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { + int err; + switch (conn->state) { case BT_CONNECTED: case BT_CONFIG: return hci_disconnect_sync(hdev, conn, reason); case BT_CONNECT: - return hci_connect_cancel_sync(hdev, conn); + err = hci_connect_cancel_sync(hdev, conn); + /* Cleanup hci_conn object if it cannot be cancelled as it + * likelly means the controller and host stack are out of sync. + */ + if (err) + hci_conn_failed(conn, err); + + return err; case BT_CONNECT2: return hci_reject_conn_sync(hdev, conn, reason); default: -- cgit v1.2.3 From 6510ea973d8d9d4a0cb2fb557b36bd1ab3eb49f6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 25 Apr 2022 18:39:46 +0200 Subject: net: Use this_cpu_inc() to increment net->core_stats The macro dev_core_stats_##FIELD##_inc() disables preemption and invokes netdev_core_stats_alloc() to return a per-CPU pointer. netdev_core_stats_alloc() will allocate memory on its first invocation which breaks on PREEMPT_RT because it requires non-atomic context for memory allocation. This can be avoided by enabling preemption in netdev_core_stats_alloc() assuming the caller always disables preemption. It might be better to replace local_inc() with this_cpu_inc() now that dev_core_stats_##FIELD##_inc() gained a preempt-disable section and does not rely on already disabled preemption. This results in less instructions on x86-64: local_inc: | incl %gs:__preempt_count(%rip) # __preempt_count | movq 488(%rdi), %rax # _1->core_stats, _22 | testq %rax, %rax # _22 | je .L585 #, | add %gs:this_cpu_off(%rip), %rax # this_cpu_off, tcp_ptr__ | .L586: | testq %rax, %rax # _27 | je .L587 #, | incq (%rax) # _6->a.counter | .L587: | decl %gs:__preempt_count(%rip) # __preempt_count this_cpu_inc(), this patch: | movq 488(%rdi), %rax # _1->core_stats, _5 | testq %rax, %rax # _5 | je .L591 #, | .L585: | incq %gs:(%rax) # _18->rx_dropped Use unsigned long as type for the counter. Use this_cpu_inc() to increment the counter. Use a plain read of the counter. Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/YmbO0pxgtKpCw4SY@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 21 +++++++++------------ net/core/dev.c | 14 +++++--------- 2 files changed, 14 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 59e27a2b7bf0..b1fbe21650bb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -199,10 +199,10 @@ struct net_device_stats { * Try to fit them in a single cache line, for dev_get_stats() sake. */ struct net_device_core_stats { - local_t rx_dropped; - local_t tx_dropped; - local_t rx_nohandler; -} __aligned(4 * sizeof(local_t)); + unsigned long rx_dropped; + unsigned long tx_dropped; + unsigned long rx_nohandler; +} __aligned(4 * sizeof(unsigned long)); #include #include @@ -3843,15 +3843,15 @@ static __always_inline bool __is_skb_forwardable(const struct net_device *dev, return false; } -struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev); +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev); -static inline struct net_device_core_stats *dev_core_stats(struct net_device *dev) +static inline struct net_device_core_stats __percpu *dev_core_stats(struct net_device *dev) { /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats); if (likely(p)) - return this_cpu_ptr(p); + return p; return netdev_core_stats_alloc(dev); } @@ -3859,14 +3859,11 @@ static inline struct net_device_core_stats *dev_core_stats(struct net_device *de #define DEV_CORE_STATS_INC(FIELD) \ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ { \ - struct net_device_core_stats *p; \ + struct net_device_core_stats __percpu *p; \ \ - preempt_disable(); \ p = dev_core_stats(dev); \ - \ if (p) \ - local_inc(&p->FIELD); \ - preempt_enable(); \ + this_cpu_inc(p->FIELD); \ } DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) diff --git a/net/core/dev.c b/net/core/dev.c index 8c6c08446556..1461c2d9dec8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10304,7 +10304,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, } EXPORT_SYMBOL(netdev_stats_to_stats64); -struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev) +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev) { struct net_device_core_stats __percpu *p; @@ -10315,11 +10315,7 @@ struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev) free_percpu(p); /* This READ_ONCE() pairs with the cmpxchg() above */ - p = READ_ONCE(dev->core_stats); - if (!p) - return NULL; - - return this_cpu_ptr(p); + return READ_ONCE(dev->core_stats); } EXPORT_SYMBOL(netdev_core_stats_alloc); @@ -10356,9 +10352,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, for_each_possible_cpu(i) { core_stats = per_cpu_ptr(p, i); - storage->rx_dropped += local_read(&core_stats->rx_dropped); - storage->tx_dropped += local_read(&core_stats->tx_dropped); - storage->rx_nohandler += local_read(&core_stats->rx_nohandler); + storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); + storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); + storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); } } return storage; -- cgit v1.2.3 From c7aab4f17021b636a0ee75bcf28e06fb7c94ab48 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Apr 2022 11:47:11 +0200 Subject: netfilter: nf_conntrack_tcp: re-init for syn packets only Jaco Kroon reported tcp problems that Eric Dumazet and Neal Cardwell pinpointed to nf_conntrack tcp_in_window() bug. tcp trace shows following sequence: I > R Flags [S], seq 3451342529, win 62580, options [.. tfo [|tcp]> R > I Flags [S.], seq 2699962254, ack 3451342530, win 65535, options [..] R > I Flags [P.], seq 1:89, ack 1, [..] Note 3rd ACK is from responder to initiator so following branch is taken: } else if (((state->state == TCP_CONNTRACK_SYN_SENT && dir == IP_CT_DIR_ORIGINAL) || (state->state == TCP_CONNTRACK_SYN_RECV && dir == IP_CT_DIR_REPLY)) && after(end, sender->td_end)) { ... because state == TCP_CONNTRACK_SYN_RECV and dir is REPLY. This causes the scaling factor to be reset to 0: window scale option is only present in syn(ack) packets. This in turn makes nf_conntrack mark valid packets as out-of-window. This was always broken, it exists even in original commit where window tracking was added to ip_conntrack (nf_conntrack predecessor) in 2.6.9-rc1 kernel. Restrict to 'tcph->syn', just like the 3rd condtional added in commit 82b72cb94666 ("netfilter: conntrack: re-init state for retransmitted syn-ack"). Upon closer look, those conditionals/branches can be merged: Because earlier checks prevent syn-ack from showing up in original direction, the 'dir' checks in the conditional quoted above are redundant, remove them. Return early for pure syn retransmitted in reply direction (simultaneous open). Fixes: 9fb9cbb1082d ("[NETFILTER]: Add nf_conntrack subsystem.") Reported-by: Jaco Kroon Signed-off-by: Florian Westphal Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 8ec55cd72572..204a5cdff5b1 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -556,24 +556,14 @@ static bool tcp_in_window(struct nf_conn *ct, } } - } else if (((state->state == TCP_CONNTRACK_SYN_SENT - && dir == IP_CT_DIR_ORIGINAL) - || (state->state == TCP_CONNTRACK_SYN_RECV - && dir == IP_CT_DIR_REPLY)) - && after(end, sender->td_end)) { + } else if (tcph->syn && + after(end, sender->td_end) && + (state->state == TCP_CONNTRACK_SYN_SENT || + state->state == TCP_CONNTRACK_SYN_RECV)) { /* * RFC 793: "if a TCP is reinitialized ... then it need * not wait at all; it must only be sure to use sequence * numbers larger than those recently used." - */ - sender->td_end = - sender->td_maxend = end; - sender->td_maxwin = (win == 0 ? 1 : win); - - tcp_options(skb, dataoff, tcph, sender); - } else if (tcph->syn && dir == IP_CT_DIR_REPLY && - state->state == TCP_CONNTRACK_SYN_SENT) { - /* Retransmitted syn-ack, or syn (simultaneous open). * * Re-init state for this direction, just like for the first * syn(-ack) reply, it might differ in seq, ack or tcp options. @@ -581,7 +571,8 @@ static bool tcp_in_window(struct nf_conn *ct, tcp_init_sender(sender, receiver, skb, dataoff, tcph, end, win); - if (!tcph->ack) + + if (dir == IP_CT_DIR_REPLY && !tcph->ack) return true; } -- cgit v1.2.3 From 626873c446f7559d5af8b48cefad903ffd85cf4e Mon Sep 17 00:00:00 2001 From: Volodymyr Mytnyk Date: Wed, 27 Apr 2022 14:09:00 +0300 Subject: netfilter: conntrack: fix udp offload timeout sysctl `nf_flowtable_udp_timeout` sysctl option is available only if CONFIG_NFT_FLOW_OFFLOAD enabled. But infra for this flow offload UDP timeout was added under CONFIG_NF_FLOW_TABLE config option. So, if you have CONFIG_NFT_FLOW_OFFLOAD disabled and CONFIG_NF_FLOW_TABLE enabled, the `nf_flowtable_udp_timeout` is not present in sysfs. Please note, that TCP flow offload timeout sysctl option is present even CONFIG_NFT_FLOW_OFFLOAD is disabled. I suppose it was a typo in commit that adds UDP flow offload timeout and CONFIG_NF_FLOW_TABLE should be used instead. Fixes: 975c57504da1 ("netfilter: conntrack: Introduce udp offload timeout configuration") Signed-off-by: Volodymyr Mytnyk Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_standalone.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 3e1afd10a9b6..55aa55b252b2 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -823,7 +823,7 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, -#if IS_ENABLED(CONFIG_NFT_FLOW_OFFLOAD) +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = { .procname = "nf_flowtable_udp_timeout", .maxlen = sizeof(unsigned int), -- cgit v1.2.3 From a0df71948e9548de819a6f1da68f5f1742258a52 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 26 Apr 2022 18:49:49 +0300 Subject: tls: Skip tls_append_frag on zero copy size Calling tls_append_frag when max_open_record_len == record->len might add an empty fragment to the TLS record if the call happens to be on the page boundary. Normally tls_append_frag coalesces the zero-sized fragment to the previous one, but not if it's on page boundary. If a resync happens then, the mlx5 driver posts dump WQEs in tx_post_resync_dump, and the empty fragment may become a data segment with byte_count == 0, which will confuse the NIC and lead to a CQE error. This commit fixes the described issue by skipping tls_append_frag on zero size to avoid adding empty fragments. The fix is not in the driver, because an empty fragment is hardly the desired behavior. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20220426154949.159055-1-maximmi@nvidia.com Signed-off-by: Jakub Kicinski --- net/tls/tls_device.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 12f7b56771d9..af875ad4a822 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -483,11 +483,13 @@ handle_error: copy = min_t(size_t, size, (pfrag->size - pfrag->offset)); copy = min_t(size_t, copy, (max_open_record_len - record->len)); - rc = tls_device_copy_data(page_address(pfrag->page) + - pfrag->offset, copy, msg_iter); - if (rc) - goto handle_error; - tls_append_frag(record, pfrag, copy); + if (copy) { + rc = tls_device_copy_data(page_address(pfrag->page) + + pfrag->offset, copy, msg_iter); + if (rc) + goto handle_error; + tls_append_frag(record, pfrag, copy); + } size -= copy; if (!size) { -- cgit v1.2.3 From 743b83f15d4069ea57c3e40996bf4a1077e0cdc1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 28 Apr 2022 09:39:21 +0200 Subject: netfilter: nft_socket: only do sk lookups when indev is available Check if the incoming interface is available and NFT_BREAK in case neither skb->sk nor input device are set. Because nf_sk_lookup_slow*() assume packet headers are in the 'in' direction, use in postrouting is not going to yield a meaningful result. Same is true for the forward chain, so restrict the use to prerouting, input and output. Use in output work if a socket is already attached to the skb. Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching") Reported-and-tested-by: Topi Miettinen Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_socket.c | 52 +++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 6d9e8e0a3a7d..05ae5a338b6f 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -54,6 +54,32 @@ nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo } #endif +static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt) +{ + const struct net_device *indev = nft_in(pkt); + const struct sk_buff *skb = pkt->skb; + struct sock *sk = NULL; + + if (!indev) + return NULL; + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, indev); + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, indev); + break; +#endif + default: + WARN_ON_ONCE(1); + break; + } + + return sk; +} + static void nft_socket_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -67,20 +93,7 @@ static void nft_socket_eval(const struct nft_expr *expr, sk = NULL; if (!sk) - switch(nft_pf(pkt)) { - case NFPROTO_IPV4: - sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt)); - break; -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) - case NFPROTO_IPV6: - sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt)); - break; -#endif - default: - WARN_ON_ONCE(1); - regs->verdict.code = NFT_BREAK; - return; - } + sk = nft_socket_do_lookup(pkt); if (!sk) { regs->verdict.code = NFT_BREAK; @@ -224,6 +237,16 @@ static bool nft_socket_reduce(struct nft_regs_track *track, return nft_expr_reduce_bitwise(track, expr); } +static int nft_socket_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT)); +} + static struct nft_expr_type nft_socket_type; static const struct nft_expr_ops nft_socket_ops = { .type = &nft_socket_type, @@ -231,6 +254,7 @@ static const struct nft_expr_ops nft_socket_ops = { .eval = nft_socket_eval, .init = nft_socket_init, .dump = nft_socket_dump, + .validate = nft_socket_validate, .reduce = nft_socket_reduce, }; -- cgit v1.2.3 From d9157f6806d1499e173770df1f1b234763de5c79 Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 26 Apr 2022 18:03:39 +0800 Subject: tcp: fix F-RTO may not work correctly when receiving DSACK Currently DSACK is regarded as a dupack, which may cause F-RTO to incorrectly enter "loss was real" when receiving DSACK. Packetdrill to demonstrate: // Enable F-RTO and TLP 0 `sysctl -q net.ipv4.tcp_frto=2` 0 `sysctl -q net.ipv4.tcp_early_retrans=3` 0 `sysctl -q net.ipv4.tcp_congestion_control=cubic` // Establish a connection +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 // RTT 10ms, RTO 210ms +.1 < S 0:0(0) win 32792 +0 > S. 0:0(0) ack 1 <...> +.01 < . 1:1(0) ack 1 win 257 +0 accept(3, ..., ...) = 4 // Send 2 data segments +0 write(4, ..., 2000) = 2000 +0 > P. 1:2001(2000) ack 1 // TLP +.022 > P. 1001:2001(1000) ack 1 // Continue to send 8 data segments +0 write(4, ..., 10000) = 10000 +0 > P. 2001:10001(8000) ack 1 // RTO +.188 > . 1:1001(1000) ack 1 // The original data is acked and new data is sent(F-RTO step 2.b) +0 < . 1:1(0) ack 2001 win 257 +0 > P. 10001:12001(2000) ack 1 // D-SACK caused by TLP is regarded as a dupack, this results in // the incorrect judgment of "loss was real"(F-RTO step 3.a) +.022 < . 1:1(0) ack 2001 win 257 // Never-retransmitted data(3001:4001) are acked and // expect to switch to open state(F-RTO step 3.b) +0 < . 1:1(0) ack 4001 win 257 +0 %{ assert tcpi_ca_state == 0, tcpi_ca_state }% Fixes: e33099f96d99 ("tcp: implement RFC5682 F-RTO") Signed-off-by: Pengcheng Yang Acked-by: Neal Cardwell Tested-by: Neal Cardwell Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/1650967419-2150-1-git-send-email-yangpc@wangsu.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 48f607522860..60f99e9fb6d1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3867,7 +3867,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_process_tlp_ack(sk, ack, flag); if (tcp_ack_is_dubious(sk, flag)) { - if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | + FLAG_NOT_DUP | FLAG_DSACKING_ACK))) { num_dupack = 1; /* Consider if pure acks were aggregated in tcp_add_backlog() */ if (!(flag & FLAG_DATA)) -- cgit v1.2.3