summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/netdevice.h5
-rw-r--r--include/linux/skbuff.h3
-rw-r--r--include/net/sock.h2
-rw-r--r--include/net/tcp.h12
-rw-r--r--net/core/dev.c31
-rw-r--r--net/core/skbuff.c51
-rw-r--r--net/core/sock.c3
-rw-r--r--net/ipv4/tcp.c25
-rw-r--r--net/ipv4/tcp_ipv4.c1
-rw-r--r--net/ipv6/tcp_ipv6.c1
-rw-r--r--net/tls/tls_sw.c2
11 files changed, 90 insertions, 46 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7dccbfd1bf56..ac8a5f71220a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3081,6 +3081,11 @@ struct softnet_data {
struct sk_buff_head input_pkt_queue;
struct napi_struct backlog;
+ /* Another possibly contended cache line */
+ spinlock_t defer_lock ____cacheline_aligned_in_smp;
+ int defer_count;
+ struct sk_buff *defer_list;
+ call_single_data_t defer_csd;
};
static inline void input_queue_head_incr(struct softnet_data *sd)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 84d78df60453..5cbc184ca685 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -888,6 +888,7 @@ typedef unsigned char *sk_buff_data_t;
* delivery_time at egress.
* @napi_id: id of the NAPI struct this skb came from
* @sender_cpu: (aka @napi_id) source CPU in XPS
+ * @alloc_cpu: CPU which did the skb allocation.
* @secmark: security marking
* @mark: Generic packet mark
* @reserved_tailroom: (aka @mark) number of bytes of free space available
@@ -1080,6 +1081,7 @@ struct sk_buff {
unsigned int sender_cpu;
};
#endif
+ u16 alloc_cpu;
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
@@ -1321,6 +1323,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb_around(struct sk_buff *skb,
void *data, unsigned int frag_size);
+void skb_attempt_defer_free(struct sk_buff *skb);
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size);
diff --git a/include/net/sock.h b/include/net/sock.h
index a01d6c421aa2..f9f8ecae0f8d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -292,7 +292,6 @@ struct sk_filter;
* @sk_pacing_shift: scaling factor for TCP Small Queues
* @sk_lingertime: %SO_LINGER l_linger setting
* @sk_backlog: always used with the per-socket spinlock held
- * @defer_list: head of llist storing skbs to be freed
* @sk_callback_lock: used with the callbacks in the end of this struct
* @sk_error_queue: rarely used
* @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
@@ -417,7 +416,6 @@ struct sock {
struct sk_buff *head;
struct sk_buff *tail;
} sk_backlog;
- struct llist_head defer_list;
#define sk_rmem_alloc sk_backlog.rmem_alloc
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 679b1964d494..94a52ad1101c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1375,18 +1375,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
enum skb_drop_reason *reason);
-#ifdef CONFIG_INET
-void __sk_defer_free_flush(struct sock *sk);
-
-static inline void sk_defer_free_flush(struct sock *sk)
-{
- if (llist_empty(&sk->defer_list))
- return;
- __sk_defer_free_flush(sk);
-}
-#else
-static inline void sk_defer_free_flush(struct sock *sk) {}
-#endif
int tcp_filter(struct sock *sk, struct sk_buff *skb);
void tcp_set_state(struct sock *sk, int state);
diff --git a/net/core/dev.c b/net/core/dev.c
index 4a77ebda4fb1..611bd7197064 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4545,6 +4545,12 @@ static void rps_trigger_softirq(void *data)
#endif /* CONFIG_RPS */
+/* Called from hardirq (IPI) context */
+static void trigger_rx_softirq(void *data __always_unused)
+{
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+}
+
/*
* Check if this softnet_data structure is another cpu one
* If yes, queue it to our IPI list and return 1
@@ -6571,6 +6577,28 @@ static int napi_threaded_poll(void *data)
return 0;
}
+static void skb_defer_free_flush(struct softnet_data *sd)
+{
+ struct sk_buff *skb, *next;
+ unsigned long flags;
+
+ /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
+ if (!READ_ONCE(sd->defer_list))
+ return;
+
+ spin_lock_irqsave(&sd->defer_lock, flags);
+ skb = sd->defer_list;
+ sd->defer_list = NULL;
+ sd->defer_count = 0;
+ spin_unlock_irqrestore(&sd->defer_lock, flags);
+
+ while (skb != NULL) {
+ next = skb->next;
+ __kfree_skb(skb);
+ skb = next;
+ }
+}
+
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -6616,6 +6644,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
+ skb_defer_free_flush(sd);
}
struct netdev_adjacent {
@@ -11326,6 +11355,8 @@ static int __init net_dev_init(void)
INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
sd->cpu = i;
#endif
+ INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL);
+ spin_lock_init(&sd->defer_lock);
init_gro_hash(&sd->backlog);
sd->backlog.poll = process_backlog;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 30b523fa4ad2..028a280fbabd 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -204,7 +204,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data,
skb_set_end_offset(skb, size);
skb->mac_header = (typeof(skb->mac_header))~0U;
skb->transport_header = (typeof(skb->transport_header))~0U;
-
+ skb->alloc_cpu = raw_smp_processor_id();
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
@@ -1037,6 +1037,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#ifdef CONFIG_NET_RX_BUSY_POLL
CHECK_SKB_FIELD(napi_id);
#endif
+ CHECK_SKB_FIELD(alloc_cpu);
#ifdef CONFIG_XPS
CHECK_SKB_FIELD(sender_cpu);
#endif
@@ -6486,3 +6487,51 @@ free_now:
}
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */
+
+/**
+ * skb_attempt_defer_free - queue skb for remote freeing
+ * @skb: buffer
+ *
+ * Put @skb in a per-cpu list, using the cpu which
+ * allocated the skb/pages to reduce false sharing
+ * and memory zone spinlock contention.
+ */
+void skb_attempt_defer_free(struct sk_buff *skb)
+{
+ int cpu = skb->alloc_cpu;
+ struct softnet_data *sd;
+ unsigned long flags;
+ bool kick;
+
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
+ !cpu_online(cpu) ||
+ cpu == raw_smp_processor_id()) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ sd = &per_cpu(softnet_data, cpu);
+ /* We do not send an IPI or any signal.
+ * Remote cpu will eventually call skb_defer_free_flush()
+ */
+ spin_lock_irqsave(&sd->defer_lock, flags);
+ skb->next = sd->defer_list;
+ /* Paired with READ_ONCE() in skb_defer_free_flush() */
+ WRITE_ONCE(sd->defer_list, skb);
+ sd->defer_count++;
+
+ /* kick every time queue length reaches 128.
+ * This should avoid blocking in smp_call_function_single_async().
+ * This condition should hardly be bit under normal conditions,
+ * unless cpu suddenly stopped to receive NIC interrupts.
+ */
+ kick = sd->defer_count == 128;
+
+ spin_unlock_irqrestore(&sd->defer_lock, flags);
+
+ /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
+ * if we are unlucky enough (this seems very unlikely).
+ */
+ if (unlikely(kick))
+ smp_call_function_single_async(cpu, &sd->defer_csd);
+}
diff --git a/net/core/sock.c b/net/core/sock.c
index 29abec3eabd8..a0f3989de3d6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2082,9 +2082,6 @@ void sk_destruct(struct sock *sk)
{
bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
- WARN_ON_ONCE(!llist_empty(&sk->defer_list));
- sk_defer_free_flush(sk);
-
if (rcu_access_pointer(sk->sk_reuseport_cb)) {
reuseport_detach_sock(sk);
use_call_rcu = true;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e20b87b3bf90..db55af9eb37b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -843,7 +843,6 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
}
release_sock(sk);
- sk_defer_free_flush(sk);
if (spliced)
return spliced;
@@ -1589,20 +1588,6 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
tcp_send_ack(sk);
}
-void __sk_defer_free_flush(struct sock *sk)
-{
- struct llist_node *head;
- struct sk_buff *skb, *n;
-
- head = llist_del_all(&sk->defer_list);
- llist_for_each_entry_safe(skb, n, head, ll_node) {
- prefetch(n);
- skb_mark_not_on_list(skb);
- __kfree_skb(skb);
- }
-}
-EXPORT_SYMBOL(__sk_defer_free_flush);
-
static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
{
__skb_unlink(skb, &sk->sk_receive_queue);
@@ -1610,11 +1595,7 @@ static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
sock_rfree(skb);
skb->destructor = NULL;
skb->sk = NULL;
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
- !llist_empty(&sk->defer_list)) {
- llist_add(&skb->ll_node, &sk->defer_list);
- return;
- }
+ return skb_attempt_defer_free(skb);
}
__kfree_skb(skb);
}
@@ -2453,7 +2434,6 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
__sk_flush_backlog(sk);
} else {
tcp_cleanup_rbuf(sk, copied);
- sk_defer_free_flush(sk);
sk_wait_data(sk, &timeo, last);
}
@@ -2571,7 +2551,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
lock_sock(sk);
ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
release_sock(sk);
- sk_defer_free_flush(sk);
if (cmsg_flags && ret >= 0) {
if (cmsg_flags & TCP_CMSG_TS)
@@ -3096,7 +3075,6 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_frag.page = NULL;
sk->sk_frag.offset = 0;
}
- sk_defer_free_flush(sk);
sk_error_report(sk);
return 0;
}
@@ -4225,7 +4203,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
&zc, &len, err);
release_sock(sk);
- sk_defer_free_flush(sk);
if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
goto zerocopy_rcv_cmsg;
switch (len) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2c2d42142555..918816ec5dd4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2065,7 +2065,6 @@ process:
sk_incoming_cpu_update(sk);
- sk_defer_free_flush(sk);
bh_lock_sock_nested(sk);
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 54277de7474b..60bdec257ba7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1728,7 +1728,6 @@ process:
sk_incoming_cpu_update(sk);
- sk_defer_free_flush(sk);
bh_lock_sock_nested(sk);
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index ddbe05ec5489..bc54f6c5b1a4 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1911,7 +1911,6 @@ recv_end:
end:
release_sock(sk);
- sk_defer_free_flush(sk);
if (psock)
sk_psock_put(sk, psock);
return copied ? : err;
@@ -1983,7 +1982,6 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
splice_read_end:
release_sock(sk);
- sk_defer_free_flush(sk);
return copied ? : err;
}