From d519f350967a60b85a574ad8aeac43f2b4384746 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:30 -0800 Subject: tcp: minor optimization in tcp_add_backlog() If the packet is going to be coalesced, the sk_sndbuf/sk_rcvbuf values are not used. Defer their access to the point we need them. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 13d868c43284..82a9e1b75405 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1800,8 +1800,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { - u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); - u32 tail_gso_size, tail_gso_segs; + u32 limit, tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; @@ -1909,7 +1908,7 @@ no_coalesce: * to reduce memory overhead, so add a little headroom here. * Few sockets backlog are possibly concurrently non empty. */ - limit += 64*1024; + limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024; if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); -- cgit v1.2.3 From 283c6b54bca13313a4f437719f600a3ad2135847 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:31 -0800 Subject: tcp: remove dead code in __tcp_v6_send_check() For some reason, I forgot to change __tcp_v6_send_check() at the same time I removed the (ip_summed == CHECKSUM_PARTIAL) check in __tcp_v4_send_check(). Fixes: 98be9b12096f ("tcp: remove dead code after CHECKSUM_PARTIAL adoption") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_checksum.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h index b3f4eaa88672..ea681910b7a3 100644 --- a/include/net/ip6_checksum.h +++ b/include/net/ip6_checksum.h @@ -65,15 +65,9 @@ static inline void __tcp_v6_send_check(struct sk_buff *skb, { struct tcphdr *th = tcp_hdr(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) { - th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - } else { - th->check = tcp_v6_check(skb->len, saddr, daddr, - csum_partial(th, th->doff << 2, - skb->csum)); - } + th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); } static inline void tcp_v6_gso_csum_prep(struct sk_buff *skb) -- cgit v1.2.3 From 373544020024668ea552a7699c9c9f100b6bc9d9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:32 -0800 Subject: tcp: small optimization in tcp_v6_send_check() For TCP flows, inet6_sk(sk)->saddr has the same value as sk->sk_v6_rcv_saddr. Using sk->sk_v6_rcv_saddr increases data locality. Signed-off-by: Eric Dumazet Signed-off-by: David S.
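The locality claim in the last patch above is worth unpacking: inet6_sk(sk)->saddr is reached through a pointer stored elsewhere in the socket, so using it costs an extra dependent load into a likely-cold area, while sk_v6_rcv_saddr is just an alias for a field in sock_common, which the TCP fast path already has in cache. Simplified from include/linux/ipv6.h and include/net/sock.h (the real inet6_sk() also checks sk_fullsock()):

/* Pointer chase: one extra dependent load, often into a cold line. */
static inline struct ipv6_pinfo *inet6_sk(const struct sock *sk)
{
	return inet_sk(sk)->pinet6;
}

/* Plain alias into sock_common, the hottest part of struct sock. */
#define sk_v6_rcv_saddr	__sk_common.skc_v6_rcv_saddr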
Miller --- net/ipv6/tcp_ipv6.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 551fce49841d..1f1a89f096de 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1893,9 +1893,7 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) { - struct ipv6_pinfo *np = inet6_sk(sk); - - __tcp_v6_send_check(skb, &np->saddr, &sk->sk_v6_daddr); + __tcp_v6_send_check(skb, &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr); } const struct inet_connection_sock_af_ops ipv6_specific = { -- cgit v1.2.3 From 42f67eea3ba36cef2dce2e853de6ddcb2e89eb39 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:33 -0800 Subject: net: use sk_is_tcp() in more places Move sk_is_tcp() to include/net/sock.h and use it where we can. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skmsg.h | 6 ------ include/net/sock.h | 5 +++++ net/core/skbuff.c | 6 ++---- net/core/sock.c | 6 ++---- 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 584d94be9c8b..18a717fe62eb 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -507,12 +507,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } -static inline bool sk_is_tcp(const struct sock *sk) -{ - return sk->sk_type == SOCK_STREAM && - sk->sk_protocol == IPPROTO_TCP; -} - static inline bool sk_is_udp(const struct sock *sk) { return sk->sk_type == SOCK_DGRAM && diff --git a/include/net/sock.h b/include/net/sock.h index b32906e1ab55..5bdeffdea5ec 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2638,6 +2638,11 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) &skb_shinfo(skb)->tskey); } +static inline bool sk_is_tcp(const struct sock *sk) +{ + return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; +} + /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ba2f38246f07..d57796f38a0b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4849,8 +4849,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->header.h4.iif = skb->dev ? 
skb->dev->ifindex : 0; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) + if (sk_is_tcp(sk)) serr->ee.ee_data -= sk->sk_tskey; } @@ -4919,8 +4918,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (tsonly) { #ifdef CONFIG_INET if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && - sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) { + sk_is_tcp(sk)) { skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ack_skb); opt_stats = true; diff --git a/net/core/sock.c b/net/core/sock.c index 8f2b2f2c0e7b..0be8e43f44b9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -872,8 +872,7 @@ int sock_set_timestamping(struct sock *sk, int optname, if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) { + if (sk_is_tcp(sk)) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; @@ -1370,8 +1369,7 @@ set_sndbuf: case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { - if (!((sk->sk_type == SOCK_STREAM && - sk->sk_protocol == IPPROTO_TCP) || + if (!(sk_is_tcp(sk) || (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP))) ret = -ENOTSUPP; -- cgit v1.2.3 From d0d598ca86bd9e595f16a39097707c90841afe80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:34 -0800 Subject: net: remove sk_route_forced_caps We were only using one bit, and we can replace it by sk_is_tcp() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 3 --- net/core/sock.c | 4 +++- net/ipv4/tcp.c | 1 - 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 5bdeffdea5ec..ebad629dd9ed 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -285,8 +285,6 @@ struct bpf_local_storage; * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) - * @sk_route_forced_caps: static, forced route capabilities - * (set in tcp_init_sock()) * @sk_gso_type: GSO type (e.g. 
%SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments @@ -461,7 +459,6 @@ struct sock { struct page_frag sk_frag; netdev_features_t sk_route_caps; netdev_features_t sk_route_nocaps; - netdev_features_t sk_route_forced_caps; int sk_gso_type; unsigned int sk_gso_max_size; gfp_t sk_allocation; diff --git a/net/core/sock.c b/net/core/sock.c index 0be8e43f44b9..257b5fa60480 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2244,7 +2244,9 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) u32 max_segs = 1; sk_dst_set(sk, dst); - sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; + sk->sk_route_caps = dst->dev->features; + if (sk_is_tcp(sk)) + sk->sk_route_caps |= NETIF_F_GSO; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; sk->sk_route_caps &= ~sk->sk_route_nocaps; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b7796b4cf0a0..4fa4b29260bd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -456,7 +456,6 @@ void tcp_init_sock(struct sock *sk) WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); sk_sockets_allocated_inc(sk); - sk->sk_route_forced_caps = NETIF_F_GSO; } EXPORT_SYMBOL(tcp_init_sock); -- cgit v1.2.3 From aba546565b613e74b84b8261999ea82b5561d3f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:35 -0800 Subject: net: remove sk_route_nocaps Instead of using a full netdev_features_t, we can use a single bit, as sk_route_nocaps is only used to remove NETIF_F_GSO_MASK from sk->sk_route_cap. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 11 +++++------ net/core/sock.c | 3 ++- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/tcp_output.c | 2 +- net/ipv6/ip6_output.c | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index ebad629dd9ed..985ddcd33504 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -284,7 +284,7 @@ struct bpf_local_storage; * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) - * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) + * @sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden. * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments @@ -458,7 +458,6 @@ struct sock { unsigned long sk_max_pacing_rate; struct page_frag sk_frag; netdev_features_t sk_route_caps; - netdev_features_t sk_route_nocaps; int sk_gso_type; unsigned int sk_gso_max_size; gfp_t sk_allocation; @@ -468,7 +467,7 @@ struct sock { * Because of non atomicity rules, all * changes are protected by socket lock. 
*/ - u8 sk_padding : 1, + u8 sk_gso_disabled : 1, sk_kern_sock : 1, sk_no_check_tx : 1, sk_no_check_rx : 1, @@ -2121,10 +2120,10 @@ static inline bool sk_can_gso(const struct sock *sk) void sk_setup_caps(struct sock *sk, struct dst_entry *dst); -static inline void sk_nocaps_add(struct sock *sk, netdev_features_t flags) +static inline void sk_gso_disable(struct sock *sk) { - sk->sk_route_nocaps |= flags; - sk->sk_route_caps &= ~flags; + sk->sk_gso_disabled = 1; + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, diff --git a/net/core/sock.c b/net/core/sock.c index 257b5fa60480..99738e14224c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2249,7 +2249,8 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_GSO; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; - sk->sk_route_caps &= ~sk->sk_route_nocaps; + if (unlikely(sk->sk_gso_disabled)) + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; if (sk_can_gso(sk)) { if (dst->header_len && !xfrm_dst_offload_ok(dst)) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 82a9e1b75405..5ad81bfb27b2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1182,7 +1182,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, if (!md5sig) return -ENOMEM; - sk_nocaps_add(sk, NETIF_F_GSO_MASK); + sk_gso_disable(sk); INIT_HLIST_HEAD(&md5sig->head); rcu_assign_pointer(tp->md5sig_info, md5sig); } @@ -1620,7 +1620,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, */ tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags, key->key, key->keylen, GFP_ATOMIC); - sk_nocaps_add(newsk, NETIF_F_GSO_MASK); + sk_gso_disable(newsk); } #endif diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2e6e5a70168e..5079832af5c1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1359,7 +1359,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { - sk_nocaps_add(sk, NETIF_F_GSO_MASK); + sk_gso_disable(sk); tp->af_specific->calc_md5_hash(opts.hash_location, md5, sk, skb); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2f044a49afa8..007e433d4d4d 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -977,7 +977,7 @@ slow_path: fail_toobig: if (skb->sk && dst_allfrag(skb_dst(skb))) - sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); + sk_gso_disable(skb->sk); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); err = -EMSGSIZE; -- cgit v1.2.3 From 1b31debca83284486cd736757b5f26d51719ef80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:36 -0800 Subject: ipv6: shrink struct ipcm6_cookie gso_size can be moved after tclass, to use an existing hole. (8 bytes saved on 64bit arches) Signed-off-by: Eric Dumazet Signed-off-by: David S. 
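To see where the 8 bytes come from, recall that the compiler may not reorder struct members, so a misplaced small field costs padding twice: once as a hole before an aligned member, once as tail padding. A generic before/after sketch (not the real ipcm6_cookie; offsets are for a typical LP64 target):

struct before {
	long	a;	/* offset 0, size 8 */
	short	b;	/* offset 8, size 2; compiler inserts 6 padding bytes */
	void	*p;	/* offset 16, size 8 */
	short	c;	/* offset 24, size 2; plus 6 bytes of tail padding */
};			/* sizeof == 32 */

struct after {
	long	a;	/* offset 0 */
	short	b;	/* offset 8 */
	short	c;	/* offset 10: moved into the existing hole */
	void	*p;	/* offset 16 */
};			/* sizeof == 24, i.e. 8 bytes saved */

pahole reports exactly these holes, which is how candidates like gso_size are usually found.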
Miller --- include/net/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index c19bf51ded1d..53ac7707ca70 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -345,9 +345,9 @@ struct ipcm6_cookie { struct sockcm_cookie sockc; __s16 hlimit; __s16 tclass; + __u16 gso_size; __s8 dontfrag; struct ipv6_txoptions *opt; - __u16 gso_size; }; static inline void ipcm6_init(struct ipcm6_cookie *ipc6) -- cgit v1.2.3 From 1ace2b4d2b4e1db8fc62d872ab54ab48f6215ecd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:37 -0800 Subject: net: shrink struct sock by 8 bytes Move sk_bind_phc next to sk_peer_lock to fill a hole. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/sock.h b/include/net/sock.h index 985ddcd33504..2333ab081789 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -489,6 +489,7 @@ struct sock { u16 sk_busy_poll_budget; #endif spinlock_t sk_peer_lock; + int sk_bind_phc; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; @@ -498,7 +499,6 @@ struct sock { seqlock_t sk_stamp_seq; #endif u16 sk_tsflags; - int sk_bind_phc; u8 sk_shutdown; u32 sk_tskey; atomic_t sk_zckey; -- cgit v1.2.3 From 6c302e799a0d4be1362f505453b714fe05d91f2a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:38 -0800 Subject: net: forward_alloc_get depends on CONFIG_MPTCP (struct proto)->sk_forward_alloc is currently only used by MPTCP. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 2333ab081789..cb97c448472a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1206,7 +1206,9 @@ struct proto { unsigned int inuse_idx; #endif +#if IS_ENABLED(CONFIG_MPTCP) int (*forward_alloc_get)(const struct sock *sk); +#endif bool (*stream_memory_free)(const struct sock *sk, int wake); bool (*sock_is_readable)(struct sock *sk); @@ -1295,10 +1297,11 @@ INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int static inline int sk_forward_alloc_get(const struct sock *sk) { - if (!sk->sk_prot->forward_alloc_get) - return sk->sk_forward_alloc; - - return sk->sk_prot->forward_alloc_get(sk); +#if IS_ENABLED(CONFIG_MPTCP) + if (sk->sk_prot->forward_alloc_get) + return sk->sk_prot->forward_alloc_get(sk); +#endif + return sk->sk_forward_alloc; } static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) -- cgit v1.2.3 From 91b6d325635617540b6a1646ddb138bb17cbd569 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:39 -0800 Subject: net: cache align tcp_memory_allocated, tcp_sockets_allocated tcp_memory_allocated and tcp_sockets_allocated often share a common cache line, source of false sharing. Also take care of udp_memory_allocated and mptcp_sockets_allocated. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 ++-- net/ipv4/udp.c | 2 +- net/mptcp/protocol.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4fa4b29260bd..862e8cb8dda5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -292,7 +292,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_tcp_mem); -atomic_long_t tcp_memory_allocated; /* Current allocated memory. 
*/ +atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); #if IS_ENABLED(CONFIG_SMC) @@ -303,7 +303,7 @@ EXPORT_SYMBOL(tcp_have_smc); /* * Current number of TCP sockets. */ -struct percpu_counter tcp_sockets_allocated; +struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; EXPORT_SYMBOL(tcp_sockets_allocated); /* diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0e2f1c05da28..7101e6d892d6 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -122,7 +122,7 @@ EXPORT_SYMBOL(udp_table); long sysctl_udp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_udp_mem); -atomic_long_t udp_memory_allocated; +atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; EXPORT_SYMBOL(udp_memory_allocated); #define MAX_UDP_PORTS 65536 diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b7e32e316738..6db93da59843 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -48,7 +48,7 @@ enum { MPTCP_CMSG_TS = BIT(0), }; -static struct percpu_counter mptcp_sockets_allocated; +static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp; static void __mptcp_destroy_sock(struct sock *sk); static void __mptcp_check_send_data_fin(struct sock *sk); -- cgit v1.2.3 From 93afcfd1db35882921b2521a637c78755c27b02c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:40 -0800 Subject: tcp: small optimization in tcp recvmsg() When reading large chunks of data, incoming packets might be added to the backlog from BH. tcp recvmsg() detects the backlog queue is not empty, and uses a release_sock()/lock_sock() pair to process this backlog. We now have __sk_flush_backlog() to perform this a bit faster. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 862e8cb8dda5..24d77a32c9cb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2409,8 +2409,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, if (copied >= target) { /* Do not sleep, just process backlog. */ - release_sock(sk); - lock_sock(sk); + __sk_flush_backlog(sk); } else { sk_wait_data(sk, &timeo, last); } -- cgit v1.2.3 From d2489c7b6d7d5ed4b32b56703c57c47bfbfe7fa5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:41 -0800 Subject: tcp: add RETPOLINE mitigation to sk_backlog_rcv Use INDIRECT_CALL_INET() to avoid an indirect call when/if CONFIG_RETPOLINE=y Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/sock.h | 8 +++++++- net/core/sock.c | 5 ++++- net/ipv6/tcp_ipv6.c | 5 +++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index cb97c448472a..2d40fe4c7718 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1018,12 +1018,18 @@ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); +INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)); +INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)); + static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { if (sk_memalloc_socks() && skb_pfmemalloc(skb)) return __sk_backlog_rcv(sk, skb); - return sk->sk_backlog_rcv(sk, skb); + return INDIRECT_CALL_INET(sk->sk_backlog_rcv, + tcp_v6_do_rcv, + tcp_v4_do_rcv, + sk, skb); } static inline void sk_incoming_cpu_update(struct sock *sk) diff --git a/net/core/sock.c b/net/core/sock.c index 99738e14224c..c57d9883f62c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -327,7 +327,10 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); noreclaim_flag = memalloc_noreclaim_save(); - ret = sk->sk_backlog_rcv(sk, skb); + ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, + tcp_v6_do_rcv, + tcp_v4_do_rcv, + sk, skb); memalloc_noreclaim_restore(noreclaim_flag); return ret; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1f1a89f096de..f41f14b70123 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -72,7 +72,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); -static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); +INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static const struct inet_connection_sock_af_ops ipv6_mapped; const struct inet_connection_sock_af_ops ipv6_specific; @@ -1466,7 +1466,8 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, * This is because we cannot sleep with the original spinlock * held. */ -static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct sk_buff *opt_skb = NULL; -- cgit v1.2.3 From 0307a0b74b3af6ecb1c8b7f727376130b15bbf44 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:42 -0800 Subject: tcp: annotate data-races on tp->segs_in and tp->data_segs_in tcp_segs_in() can be called from BH, while socket spinlock is held but socket owned by user, eventually reading these fields from tcp_get_info() Found by code inspection, no need to backport this patch to older kernels. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
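The annotations follow the usual KCSAN recipe: the lockless writer publishes with WRITE_ONCE() and the lockless reader snapshots with READ_ONCE(), so the compiler may neither tear nor fuse the accesses, and the data race is documented as intentional. A minimal sketch of the two sides (illustrative, not the kernel code):

/* Writer: BH context, socket spinlock held but socket owned by user. */
static void segs_in_update(struct tcp_sock *tp, u16 segs)
{
	WRITE_ONCE(tp->segs_in, tp->segs_in + segs);	/* one untorn store */
}

/* Reader: tcp_get_info() runs without the lock that guards the writer. */
static u32 segs_in_snapshot(const struct tcp_sock *tp)
{
	return READ_ONCE(tp->segs_in);			/* one untorn load */
}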
Miller --- include/net/tcp.h | 8 ++++++-- net/ipv4/tcp.c | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 4da22b41bde6..05c81677aaf7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2172,9 +2172,13 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) u16 segs_in; segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs); - tp->segs_in += segs_in; + + /* We update these fields while other threads might + * read them from tcp_get_info() + */ + WRITE_ONCE(tp->segs_in, tp->segs_in + segs_in); if (skb->len > tcp_hdrlen(skb)) - tp->data_segs_in += segs_in; + WRITE_ONCE(tp->data_segs_in, tp->data_segs_in + segs_in); } /* diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 24d77a32c9cb..267b2b18f048 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3769,10 +3769,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) tcp_get_info_chrono_stats(tp, info); info->tcpi_segs_out = tp->segs_out; - info->tcpi_segs_in = tp->segs_in; + + /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */ + info->tcpi_segs_in = READ_ONCE(tp->segs_in); + info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in); info->tcpi_min_rtt = tcp_min_rtt(tp); - info->tcpi_data_segs_in = tp->data_segs_in; info->tcpi_data_segs_out = tp->data_segs_out; info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0; -- cgit v1.2.3 From 7b6a893a5991f5e8a56795155ae86333b03080b7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:43 -0800 Subject: tcp: annotate races around tp->urg_data tcp_poll() and tcp_ioctl() are reading tp->urg_data without socket lock owned. Also, it is faster to first check tp->urg_data in tcp_poll(), then tp->urg_seq == tp->copied_seq, because tp->urg_seq is located in a different/cold cache line. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
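Two ideas are combined here, and both generalize: snapshot a racy field once into a local so every later test sees one consistent value, and order an && chain so the cheap, almost-always-false test short-circuits before the tests that touch colder cache lines. The snapshot idiom in isolation (a sketch; handle_urgent() is a placeholder, not a kernel function):

/* Racy: two plain loads of the same field may see two different values. */
if (tp->urg_data && (tp->urg_data & TCP_URG_VALID))
	handle_urgent();

/* Snapshot: one annotated load, a consistent view for every test. */
u16 urg_data = READ_ONCE(tp->urg_data);

if (urg_data && (urg_data & TCP_URG_VALID))
	handle_urgent();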
Miller --- net/ipv4/tcp.c | 17 +++++++++-------- net/ipv4/tcp_input.c | 4 ++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 267b2b18f048..313cf648c349 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -545,10 +545,11 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (state != TCP_SYN_SENT && (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); + u16 urg_data = READ_ONCE(tp->urg_data); - if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && - !sock_flag(sk, SOCK_URGINLINE) && - tp->urg_data) + if (urg_data && + READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && + !sock_flag(sk, SOCK_URGINLINE)) target++; if (tcp_stream_is_readable(sk, target)) @@ -573,7 +574,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) } else mask |= EPOLLOUT | EPOLLWRNORM; - if (tp->urg_data & TCP_URG_VALID) + if (urg_data & TCP_URG_VALID) mask |= EPOLLPRI; } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { /* Active TCP fastopen socket with defer_connect @@ -607,7 +608,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) unlock_sock_fast(sk, slow); break; case SIOCATMARK: - answ = tp->urg_data && + answ = READ_ONCE(tp->urg_data) && READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: @@ -1465,7 +1466,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) char c = tp->urg_data; if (!(flags & MSG_PEEK)) - tp->urg_data = TCP_URG_READ; + WRITE_ONCE(tp->urg_data, TCP_URG_READ); /* Read urgent data. */ msg->msg_flags |= MSG_OOB; @@ -2465,7 +2466,7 @@ found_ok_skb: skip_copy: if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { - tp->urg_data = 0; + WRITE_ONCE(tp->urg_data, 0); tcp_fast_path_check(sk); } @@ -2959,7 +2960,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); - tp->urg_data = 0; + WRITE_ONCE(tp->urg_data, 0); tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 246ab7b5e857..5ee07a337652 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5591,7 +5591,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) } } - tp->urg_data = TCP_URG_NOTYET; + WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET); WRITE_ONCE(tp->urg_seq, ptr); /* Disable header prediction. */ @@ -5617,7 +5617,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t u8 tmp; if (skb_copy_bits(skb, ptr, &tmp, 1)) BUG(); - tp->urg_data = TCP_URG_VALID | tmp; + WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); } -- cgit v1.2.3 From b96c51bd3bd826a3391cbf4b1281a1e0bf9df90a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:44 -0800 Subject: tcp: tp->urg_data is unlikely to be set Use some unlikely() hints in the fast path. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
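unlikely() is a code-layout hint, not a correctness change: it compiles down to __builtin_expect(), letting GCC/Clang place the cold branch out of line so the hot path falls straight through without a taken branch. The kernel's definitions are essentially:

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

Since urgent data is practically extinct on modern networks, every tp->urg_data test below is a textbook candidate.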
Miller --- net/ipv4/tcp.c | 10 +++++----- net/ipv4/tcp_input.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 313cf648c349..9175e0d729f5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -547,7 +547,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) int target = sock_rcvlowat(sk, 0, INT_MAX); u16 urg_data = READ_ONCE(tp->urg_data); - if (urg_data && + if (unlikely(urg_data) && READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && !sock_flag(sk, SOCK_URGINLINE)) target++; @@ -1633,7 +1633,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, len = skb->len - offset; /* Stop reading if we hit a patch of urgent data */ - if (tp->urg_data) { + if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - seq; if (urg_offset < len) len = urg_offset; @@ -2326,7 +2326,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, u32 offset; /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ - if (tp->urg_data && tp->urg_seq == *seq) { + if (unlikely(tp->urg_data) && tp->urg_seq == *seq) { if (copied) break; if (signal_pending(current)) { @@ -2431,7 +2431,7 @@ found_ok_skb: used = len; /* Do we have urgent data here? */ - if (tp->urg_data) { + if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - *seq; if (urg_offset < used) { if (!urg_offset) { @@ -2465,7 +2465,7 @@ found_ok_skb: tcp_rcv_space_adjust(sk); skip_copy: - if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { + if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) { WRITE_ONCE(tp->urg_data, 0); tcp_fast_path_check(sk); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5ee07a337652..3658b9c3dd2b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5604,11 +5604,11 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t struct tcp_sock *tp = tcp_sk(sk); /* Check if we get a new urgent pointer - normally not. */ - if (th->urg) + if (unlikely(th->urg)) tcp_check_urg(sk, th); /* Do we wait for any urgent data? - normally not... */ - if (tp->urg_data == TCP_URG_NOTYET) { + if (unlikely(tp->urg_data == TCP_URG_NOTYET)) { u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) - th->syn; -- cgit v1.2.3 From 3df684c1a3d08a4f649689053a3d527b3b5fda9e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:45 -0800 Subject: tcp: avoid indirect calls to sock_rfree TCP uses sk_eat_skb() when skbs can be removed from the receive queue. However, the call to skb_orphan() from __kfree_skb() incurs an indirect call to sock_rfree(), which is more expensive than a direct call, especially for CONFIG_RETPOLINE=y. Add a tcp_eat_recv_skb() function to make the call direct before __kfree_skb(). Signed-off-by: Eric Dumazet Signed-off-by: David S.
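The fix relies on a common devirtualization idiom: compare the function pointer against its overwhelmingly likely target and, on a match, call that target directly; only a mismatch pays for the indirect call. sock_rfree is what skb_set_owner_r() installs on receive-queue skbs, so the test almost always succeeds. The idiom in sketch form (queue unlinking and accounting details omitted):

if (likely(skb->destructor == sock_rfree)) {
	sock_rfree(skb);	/* direct call: cheap and predictable */
	skb->destructor = NULL;	/* keep __kfree_skb() from calling it again */
	skb->sk = NULL;
}
__kfree_skb(skb);		/* any other destructor still goes indirect */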
Miller --- net/ipv4/tcp.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9175e0d729f5..4e7011672aa9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1580,6 +1580,16 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } +static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) +{ + if (likely(skb->destructor == sock_rfree)) { + sock_rfree(skb); + skb->destructor = NULL; + skb->sk = NULL; + } + sk_eat_skb(sk, skb); +} + static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1599,7 +1609,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); } return NULL; } @@ -1665,11 +1675,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, continue; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); ++seq; break; } - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); if (!desc->count) break; WRITE_ONCE(tp->copied_seq, seq); @@ -2481,14 +2491,14 @@ skip_copy: if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. */ WRITE_ONCE(*seq, *seq + 1); if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); break; } while (len > 0); -- cgit v1.2.3 From f35f821935d8df76f9c92e2431a225bdff938169 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:46 -0800 Subject: tcp: defer skb freeing after socket lock is released tcp recvmsg() (or rx zerocopy) spends a fair amount of time freeing skbs after their payload has been consumed. A typical ~64KB GRO packet has to release ~45 page references, eventually going to the page allocator for each of them. Currently, this freeing is performed while socket lock is held, meaning that there is a high chance that BH handler has to queue incoming packets to tcp socket backlog. This can cause additional latencies, because the user thread has to process the backlog at release_sock() time, and while doing so, additional frames can be added by BH handler. This patch adds logic to defer these frees after socket lock is released, or directly from BH handler if possible. Being able to free these skbs from BH handler helps a lot, because this avoids the usual alloc/free asymmetry, when BH handler and user thread do not run on the same cpu or NUMA node. One cpu can now be fully utilized for the kernel->user copy, and another cpu is handling BH processing and skb/page allocs/frees (assuming RFS is not forcing use of a single CPU) Tested: 100Gbit NIC Max throughput for one TCP_STREAM flow, over 10 runs MTU : 1500 Before: 55 Gbit After: 66 Gbit MTU : 4096+(headers) Before: 82 Gbit After: 95 Gbit Signed-off-by: Eric Dumazet Signed-off-by: David S.
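The deferral below is built on the kernel's lock-free singly linked list, <linux/llist.h>: any context may push nodes with llist_add(), and the consumer detaches the entire batch atomically with llist_del_all(). A sketch of the API, using a file-scope list head instead of the per-socket sk->defer_list this patch introduces:

#include <linux/llist.h>

static LLIST_HEAD(defer_list);

static void defer_free(struct sk_buff *skb)
{
	/* Lockless push; safe from process, BH, or hardirq context. */
	llist_add(&skb->ll_node, &defer_list);
}

static void flush_deferred(void)
{
	/* Atomically take ownership of the whole list, then walk it. */
	struct llist_node *head = llist_del_all(&defer_list);
	struct sk_buff *skb, *n;

	llist_for_each_entry_safe(skb, n, head, ll_node)
		__kfree_skb(skb);
}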
Miller --- include/linux/skbuff.h | 2 ++ include/net/sock.h | 3 +++ include/net/tcp.h | 10 ++++++++++ net/ipv4/tcp.c | 27 +++++++++++++++++++++++++-- net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 6 files changed, 42 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 686a666d073d..b8b806512e16 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #if IS_ENABLED(CONFIG_NF_CONNTRACK) @@ -743,6 +744,7 @@ struct sk_buff { }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; + struct llist_node ll_node; }; union { diff --git a/include/net/sock.h b/include/net/sock.h index 2d40fe4c7718..2578d1f455a7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -408,6 +409,8 @@ struct sock { struct sk_buff *head; struct sk_buff *tail; } sk_backlog; + struct llist_head defer_list; + #define sk_rmem_alloc sk_backlog.rmem_alloc int sk_forward_alloc; diff --git a/include/net/tcp.h b/include/net/tcp.h index 05c81677aaf7..44e442bf23f9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1368,6 +1368,16 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); + +void __sk_defer_free_flush(struct sock *sk); + +static inline void sk_defer_free_flush(struct sock *sk) +{ + if (llist_empty(&sk->defer_list)) + return; + __sk_defer_free_flush(sk); +} + int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4e7011672aa9..33cd9a1c199c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1580,14 +1580,34 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } +void __sk_defer_free_flush(struct sock *sk) +{ + struct llist_node *head; + struct sk_buff *skb, *n; + + head = llist_del_all(&sk->defer_list); + llist_for_each_entry_safe(skb, n, head, ll_node) { + prefetch(n); + skb_mark_not_on_list(skb); + __kfree_skb(skb); + } +} +EXPORT_SYMBOL(__sk_defer_free_flush); + static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { + __skb_unlink(skb, &sk->sk_receive_queue); if (likely(skb->destructor == sock_rfree)) { sock_rfree(skb); skb->destructor = NULL; skb->sk = NULL; + if (!skb_queue_empty(&sk->sk_receive_queue) || + !llist_empty(&sk->defer_list)) { + llist_add(&skb->ll_node, &sk->defer_list); + return; + } } - sk_eat_skb(sk, skb); + __kfree_skb(skb); } static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) @@ -2422,6 +2442,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, /* Do not sleep, just process backlog. 
*/ __sk_flush_backlog(sk); } else { + sk_defer_free_flush(sk); sk_wait_data(sk, &timeo, last); } @@ -2540,6 +2561,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss, &cmsg_flags); release_sock(sk); + sk_defer_free_flush(sk); if (cmsg_flags && ret >= 0) { if (cmsg_flags & TCP_CMSG_TS) @@ -3065,7 +3087,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; } - + sk_defer_free_flush(sk); sk_error_report(sk); return 0; } @@ -4194,6 +4216,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); release_sock(sk); + sk_defer_free_flush(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; switch (len) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5ad81bfb27b2..3dd19a2bf06c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2102,6 +2102,7 @@ process: sk_incoming_cpu_update(sk); + sk_defer_free_flush(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f41f14b70123..3b7d6ede1364 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1758,6 +1758,7 @@ process: sk_incoming_cpu_update(sk); + sk_defer_free_flush(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; -- cgit v1.2.3 From 8bd172b787298124ef75c0e466101107c036d54d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:47 -0800 Subject: tcp: check local var (timeo) before socket fields in one test Testing timeo before sk_err/sk_state/sk_shutdown makes more sense. Modern applications use non-blocking IO, while a socket is terminated only once during its lifetime. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 33cd9a1c199c..7b1886103556 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2399,10 +2399,10 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, break; if (copied) { - if (sk->sk_err || + if (!timeo || + sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || - !timeo || signal_pending(current)) break; } else { -- cgit v1.2.3 From 29fbc26e6dfc7be351c23261938de3f93f5cde57 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:48 -0800 Subject: tcp: do not call tcp_cleanup_rbuf() if we have a backlog Under pressure, tcp recvmsg() has logic to process the socket backlog, but calls tcp_cleanup_rbuf() right before. Avoiding sending an ACK right before processing new segments makes a lot of sense, as this decreases the number of ACK packets, with no impact on effective ACK clocking. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7b1886103556..d1949fdb1462 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2436,12 +2436,11 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, } } - tcp_cleanup_rbuf(sk, copied); - if (copied >= target) { /* Do not sleep, just process backlog.
*/ __sk_flush_backlog(sk); } else { + tcp_cleanup_rbuf(sk, copied); sk_defer_free_flush(sk); sk_wait_data(sk, &timeo, last); } -- cgit v1.2.3 From 43f51df4172955971ef5498f09308a9dc0291766 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:49 -0800 Subject: net: move early demux fields close to sk_refcnt sk_rx_dst/sk_rx_dst_ifindex/sk_rx_dst_cookie are read in early demux, and currently span two cache lines. Moving them close to sk_refcnt makes more sense, as only one cache line is needed. The new layout for this hot cache line is: struct sock { struct sock_common __sk_common; /* 0 0x88 */ /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */ struct dst_entry * sk_rx_dst; /* 0x88 0x8 */ int sk_rx_dst_ifindex; /* 0x90 0x4 */ u32 sk_rx_dst_cookie; /* 0x94 0x4 */ socket_lock_t sk_lock; /* 0x98 0x20 */ atomic_t sk_drops; /* 0xb8 0x4 */ int sk_rcvlowat; /* 0xbc 0x4 */ /* --- cacheline 3 boundary (192 bytes) --- */ Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 2578d1f455a7..95cc03bd3fac 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -390,6 +390,11 @@ struct sock { #define sk_flags __sk_common.skc_flags #define sk_rxhash __sk_common.skc_rxhash + /* early demux fields */ + struct dst_entry *sk_rx_dst; + int sk_rx_dst_ifindex; + u32 sk_rx_dst_cookie; + socket_lock_t sk_lock; atomic_t sk_drops; int sk_rcvlowat; @@ -432,9 +437,6 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif - struct dst_entry *sk_rx_dst; - int sk_rx_dst_ifindex; - u32 sk_rx_dst_cookie; struct dst_entry __rcu *sk_dst_cache; atomic_t sk_omem_alloc; -- cgit v1.2.3
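The series closes on the two cache-line rules it has been applying throughout: fields written by different CPUs belong on different lines (the ____cacheline_aligned_in_smp patch earlier), and fields read together belong on the same line (this patch). A generic sketch of both, with layouts to be verified against pahole output rather than assumed; hot_object and its members are illustrative names, not kernel structures:

struct hot_object {
	/* Read-mostly fields consumed together on every packet:
	 * pack them so one cache line fill serves all three.
	 */
	void		*lookup_dst;
	int		lookup_ifindex;
	u32		lookup_cookie;

	/* Heavily written counter: give it its own line so updates
	 * do not invalidate the read-mostly line above (false sharing).
	 */
	atomic_long_t	events ____cacheline_aligned_in_smp;
};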