summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2021-11-16 16:10:35 +0300
committerDavid S. Miller <davem@davemloft.net>2021-11-16 16:10:35 +0300
commit6fcc06205c15bf1bb90896efdf5967028c154aba (patch)
tree9485fbb54c44a7933e5e7904aaebaedbe0f82907 /net/ipv4
parent3ad4b7c81a992463c29ae130332c217607fe4452 (diff)
parent43f51df4172955971ef5498f09308a9dc0291766 (diff)
downloadlinux-6fcc06205c15bf1bb90896efdf5967028c154aba.tar.xz
Merge branch 'tcp-optimizations'
Eric Dumazet says: ==================== tcp: optimizations for linux-5.17 Mostly small improvements in this series. The notable change is in "defer skb freeing after socket lock is released" in recvmsg() (and RX zerocopy) The idea is to try to let skb freeing to BH handler, whenever possible, or at least perform the freeing outside of the socket lock section, for much improved performance. This idea can probably be extended to other protocols. Tests on a 100Gbit NIC Max throughput for one TCP_STREAM flow, over 10 runs. MTU : 1500 (1428 bytes of TCP payload per MSS) Before: 55 Gbit After: 66 Gbit MTU : 4096+ (4096 bytes of TCP payload, plus TCP/IPv6 headers) Before: 82 Gbit After: 95 Gbit ==================== Acked-by: Soheil Hassas Yeganeh <soheil@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/tcp.c91
-rw-r--r--net/ipv4/tcp_input.c8
-rw-r--r--net/ipv4/tcp_ipv4.c10
-rw-r--r--net/ipv4/tcp_output.c2
-rw-r--r--net/ipv4/udp.c2
5 files changed, 73 insertions, 40 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b7796b4cf0a0..d1949fdb1462 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -292,7 +292,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);
-atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
+atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);
#if IS_ENABLED(CONFIG_SMC)
@@ -303,7 +303,7 @@ EXPORT_SYMBOL(tcp_have_smc);
/*
* Current number of TCP sockets.
*/
-struct percpu_counter tcp_sockets_allocated;
+struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(tcp_sockets_allocated);
/*
@@ -456,7 +456,6 @@ void tcp_init_sock(struct sock *sk)
WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
sk_sockets_allocated_inc(sk);
- sk->sk_route_forced_caps = NETIF_F_GSO;
}
EXPORT_SYMBOL(tcp_init_sock);
@@ -546,10 +545,11 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (state != TCP_SYN_SENT &&
(state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
+ u16 urg_data = READ_ONCE(tp->urg_data);
- if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
- !sock_flag(sk, SOCK_URGINLINE) &&
- tp->urg_data)
+ if (unlikely(urg_data) &&
+ READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
+ !sock_flag(sk, SOCK_URGINLINE))
target++;
if (tcp_stream_is_readable(sk, target))
@@ -574,7 +574,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
} else
mask |= EPOLLOUT | EPOLLWRNORM;
- if (tp->urg_data & TCP_URG_VALID)
+ if (urg_data & TCP_URG_VALID)
mask |= EPOLLPRI;
} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
/* Active TCP fastopen socket with defer_connect
@@ -608,7 +608,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
- answ = tp->urg_data &&
+ answ = READ_ONCE(tp->urg_data) &&
READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
break;
case SIOCOUTQ:
@@ -1466,7 +1466,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
char c = tp->urg_data;
if (!(flags & MSG_PEEK))
- tp->urg_data = TCP_URG_READ;
+ WRITE_ONCE(tp->urg_data, TCP_URG_READ);
/* Read urgent data. */
msg->msg_flags |= MSG_OOB;
@@ -1580,6 +1580,36 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
tcp_send_ack(sk);
}
+void __sk_defer_free_flush(struct sock *sk)
+{
+ struct llist_node *head;
+ struct sk_buff *skb, *n;
+
+ head = llist_del_all(&sk->defer_list);
+ llist_for_each_entry_safe(skb, n, head, ll_node) {
+ prefetch(n);
+ skb_mark_not_on_list(skb);
+ __kfree_skb(skb);
+ }
+}
+EXPORT_SYMBOL(__sk_defer_free_flush);
+
+static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ if (likely(skb->destructor == sock_rfree)) {
+ sock_rfree(skb);
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ !llist_empty(&sk->defer_list)) {
+ llist_add(&skb->ll_node, &sk->defer_list);
+ return;
+ }
+ }
+ __kfree_skb(skb);
+}
+
static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
@@ -1599,7 +1629,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
* splitted a fat GRO packet, while we released socket lock
* in skb_splice_bits()
*/
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
}
return NULL;
}
@@ -1633,7 +1663,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
len = skb->len - offset;
/* Stop reading if we hit a patch of urgent data */
- if (tp->urg_data) {
+ if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - seq;
if (urg_offset < len)
len = urg_offset;
@@ -1665,11 +1695,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
continue;
}
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
++seq;
break;
}
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
if (!desc->count)
break;
WRITE_ONCE(tp->copied_seq, seq);
@@ -2326,7 +2356,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
u32 offset;
/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
- if (tp->urg_data && tp->urg_seq == *seq) {
+ if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
if (copied)
break;
if (signal_pending(current)) {
@@ -2369,10 +2399,10 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
break;
if (copied) {
- if (sk->sk_err ||
+ if (!timeo ||
+ sk->sk_err ||
sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
- !timeo ||
signal_pending(current))
break;
} else {
@@ -2406,13 +2436,12 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
}
}
- tcp_cleanup_rbuf(sk, copied);
-
if (copied >= target) {
/* Do not sleep, just process backlog. */
- release_sock(sk);
- lock_sock(sk);
+ __sk_flush_backlog(sk);
} else {
+ tcp_cleanup_rbuf(sk, copied);
+ sk_defer_free_flush(sk);
sk_wait_data(sk, &timeo, last);
}
@@ -2432,7 +2461,7 @@ found_ok_skb:
used = len;
/* Do we have urgent data here? */
- if (tp->urg_data) {
+ if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - *seq;
if (urg_offset < used) {
if (!urg_offset) {
@@ -2466,8 +2495,8 @@ found_ok_skb:
tcp_rcv_space_adjust(sk);
skip_copy:
- if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
- tp->urg_data = 0;
+ if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
+ WRITE_ONCE(tp->urg_data, 0);
tcp_fast_path_check(sk);
}
@@ -2482,14 +2511,14 @@ skip_copy:
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
continue;
found_fin_ok:
/* Process the FIN. */
WRITE_ONCE(*seq, *seq + 1);
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
break;
} while (len > 0);
@@ -2531,6 +2560,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
&cmsg_flags);
release_sock(sk);
+ sk_defer_free_flush(sk);
if (cmsg_flags && ret >= 0) {
if (cmsg_flags & TCP_CMSG_TS)
@@ -2961,7 +2991,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
- tp->urg_data = 0;
+ WRITE_ONCE(tp->urg_data, 0);
tcp_write_queue_purge(sk);
tcp_fastopen_active_disable_ofo_check(sk);
skb_rbtree_purge(&tp->out_of_order_queue);
@@ -3056,7 +3086,7 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_frag.page = NULL;
sk->sk_frag.offset = 0;
}
-
+ sk_defer_free_flush(sk);
sk_error_report(sk);
return 0;
}
@@ -3771,10 +3801,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
tcp_get_info_chrono_stats(tp, info);
info->tcpi_segs_out = tp->segs_out;
- info->tcpi_segs_in = tp->segs_in;
+
+ /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */
+ info->tcpi_segs_in = READ_ONCE(tp->segs_in);
+ info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
info->tcpi_min_rtt = tcp_min_rtt(tp);
- info->tcpi_data_segs_in = tp->data_segs_in;
info->tcpi_data_segs_out = tp->data_segs_out;
info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
@@ -4183,6 +4215,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
&zc, &len, err);
release_sock(sk);
+ sk_defer_free_flush(sk);
if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
goto zerocopy_rcv_cmsg;
switch (len) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 246ab7b5e857..3658b9c3dd2b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5591,7 +5591,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
}
}
- tp->urg_data = TCP_URG_NOTYET;
+ WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET);
WRITE_ONCE(tp->urg_seq, ptr);
/* Disable header prediction. */
@@ -5604,11 +5604,11 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
struct tcp_sock *tp = tcp_sk(sk);
/* Check if we get a new urgent pointer - normally not. */
- if (th->urg)
+ if (unlikely(th->urg))
tcp_check_urg(sk, th);
/* Do we wait for any urgent data? - normally not... */
- if (tp->urg_data == TCP_URG_NOTYET) {
+ if (unlikely(tp->urg_data == TCP_URG_NOTYET)) {
u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
th->syn;
@@ -5617,7 +5617,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
u8 tmp;
if (skb_copy_bits(skb, ptr, &tmp, 1))
BUG();
- tp->urg_data = TCP_URG_VALID | tmp;
+ WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 13d868c43284..3dd19a2bf06c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1182,7 +1182,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
if (!md5sig)
return -ENOMEM;
- sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+ sk_gso_disable(sk);
INIT_HLIST_HEAD(&md5sig->head);
rcu_assign_pointer(tp->md5sig_info, md5sig);
}
@@ -1620,7 +1620,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
*/
tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
key->key, key->keylen, GFP_ATOMIC);
- sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
+ sk_gso_disable(newsk);
}
#endif
@@ -1800,8 +1800,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
- u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
- u32 tail_gso_size, tail_gso_segs;
+ u32 limit, tail_gso_size, tail_gso_segs;
struct skb_shared_info *shinfo;
const struct tcphdr *th;
struct tcphdr *thtail;
@@ -1909,7 +1908,7 @@ no_coalesce:
* to reduce memory overhead, so add a little headroom here.
* Few sockets backlog are possibly concurrently non empty.
*/
- limit += 64*1024;
+ limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
if (unlikely(sk_add_backlog(sk, skb, limit))) {
bh_unlock_sock(sk);
@@ -2103,6 +2102,7 @@ process:
sk_incoming_cpu_update(sk);
+ sk_defer_free_flush(sk);
bh_lock_sock_nested(sk);
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2e6e5a70168e..5079832af5c1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1359,7 +1359,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
- sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+ sk_gso_disable(sk);
tp->af_specific->calc_md5_hash(opts.hash_location,
md5, sk, skb);
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0e2f1c05da28..7101e6d892d6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -122,7 +122,7 @@ EXPORT_SYMBOL(udp_table);
long sysctl_udp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_udp_mem);
-atomic_long_t udp_memory_allocated;
+atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(udp_memory_allocated);
#define MAX_UDP_PORTS 65536