summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/skbuff.h4
-rw-r--r--net/core/dev.c16
-rw-r--r--net/core/skbuff.c428
3 files changed, 242 insertions, 206 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0a4e91a2f873..6d0a33d1c0db 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1087,6 +1087,8 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb_around(struct sk_buff *skb,
void *data, unsigned int frag_size);
+struct sk_buff *napi_build_skb(void *data, unsigned int frag_size);
+
/**
* alloc_skb - allocate a network buffer
* @size: size to allocate
@@ -2919,7 +2921,7 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
}
void napi_consume_skb(struct sk_buff *skb, int budget);
-void __kfree_skb_flush(void);
+void napi_skb_free_stolen_head(struct sk_buff *skb);
void __kfree_skb_defer(struct sk_buff *skb);
/**
diff --git a/net/core/dev.c b/net/core/dev.c
index ce6291bc2e16..ea9b46318d23 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4944,8 +4944,6 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
else
__kfree_skb_defer(skb);
}
-
- __kfree_skb_flush();
}
if (sd->output_queue) {
@@ -6097,13 +6095,6 @@ struct packet_offload *gro_find_complete_by_type(__be16 type)
}
EXPORT_SYMBOL(gro_find_complete_by_type);
-static void napi_skb_free_stolen_head(struct sk_buff *skb)
-{
- skb_dst_drop(skb);
- skb_ext_put(skb);
- kmem_cache_free(skbuff_head_cache, skb);
-}
-
static gro_result_t napi_skb_finish(struct napi_struct *napi,
struct sk_buff *skb,
gro_result_t ret)
@@ -6117,7 +6108,7 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
napi_skb_free_stolen_head(skb);
else
- __kfree_skb(skb);
+ __kfree_skb_defer(skb);
break;
case GRO_HELD:
@@ -7012,7 +7003,6 @@ static int napi_threaded_poll(void *data)
__napi_poll(napi, &repoll);
netpoll_poll_unlock(have);
- __kfree_skb_flush();
local_bh_enable();
if (!repoll)
@@ -7042,7 +7032,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
- goto out;
+ return;
break;
}
@@ -7069,8 +7059,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
-out:
- __kfree_skb_flush();
}
struct netdev_adjacent {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d380c7b5a12d..545a472273a5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -119,151 +119,75 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
skb_panic(skb, sz, addr, __func__);
}
-/*
- * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
- * the caller if emergency pfmemalloc reserves are being used. If it is and
- * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
- * may be used. Otherwise, the packet data may be discarded until enough
- * memory is free
- */
-#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
- __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
-
-static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
- unsigned long ip, bool *pfmemalloc)
-{
- void *obj;
- bool ret_pfmemalloc = false;
+#define NAPI_SKB_CACHE_SIZE 64
+#define NAPI_SKB_CACHE_BULK 16
+#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
- /*
- * Try a regular allocation, when that fails and we're not entitled
- * to the reserves, fail.
- */
- obj = kmalloc_node_track_caller(size,
- flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
- node);
- if (obj || !(gfp_pfmemalloc_allowed(flags)))
- goto out;
+struct napi_alloc_cache {
+ struct page_frag_cache page;
+ unsigned int skb_count;
+ void *skb_cache[NAPI_SKB_CACHE_SIZE];
+};
- /* Try again but now we are using pfmemalloc reserves */
- ret_pfmemalloc = true;
- obj = kmalloc_node_track_caller(size, flags, node);
+static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
-out:
- if (pfmemalloc)
- *pfmemalloc = ret_pfmemalloc;
+static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask,
+ unsigned int align_mask)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- return obj;
+ return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask);
}
-/* Allocate a new skbuff. We do this ourselves so we can fill in a few
- * 'private' fields and also do memory statistics to find all the
- * [BEEP] leaks.
- *
- */
-
-/**
- * __alloc_skb - allocate a network buffer
- * @size: size to allocate
- * @gfp_mask: allocation mask
- * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
- * instead of head cache and allocate a cloned (child) skb.
- * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
- * allocations in case the data is required for writeback
- * @node: numa node to allocate memory on
- *
- * Allocate a new &sk_buff. The returned buffer has no headroom and a
- * tail room of at least size bytes. The object has a reference count
- * of one. The return is the buffer. On a failure the return is %NULL.
- *
- * Buffers may only be allocated from interrupts using a @gfp_mask of
- * %GFP_ATOMIC.
- */
-struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
- int flags, int node)
+void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
- struct kmem_cache *cache;
- struct skb_shared_info *shinfo;
- struct sk_buff *skb;
- u8 *data;
- bool pfmemalloc;
-
- cache = (flags & SKB_ALLOC_FCLONE)
- ? skbuff_fclone_cache : skbuff_head_cache;
-
- if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
- gfp_mask |= __GFP_MEMALLOC;
-
- /* Get the HEAD */
- skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
- if (!skb)
- goto out;
- prefetchw(skb);
-
- /* We do our best to align skb_shared_info on a separate cache
- * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
- * aligned memory blocks, unless SLUB/SLAB debug is enabled.
- * Both skb->head and skb_shared_info are cache line aligned.
- */
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
- if (!data)
- goto nodata;
- /* kmalloc(size) might give us more room than requested.
- * Put skb_shared_info exactly at the end of allocated zone,
- * to allow max possible filling before reallocation.
- */
- size = SKB_WITH_OVERHEAD(ksize(data));
- prefetchw(data + size);
-
- /*
- * Only clear those fields we need to clear, not those that we will
- * actually initialise below. Hence, don't put any more fields after
- * the tail pointer in struct sk_buff!
- */
- memset(skb, 0, offsetof(struct sk_buff, tail));
- /* Account for allocated memory : skb + skb->head */
- skb->truesize = SKB_TRUESIZE(size);
- skb->pfmemalloc = pfmemalloc;
- refcount_set(&skb->users, 1);
- skb->head = data;
- skb->data = data;
- skb_reset_tail_pointer(skb);
- skb->end = skb->tail + size;
- skb->mac_header = (typeof(skb->mac_header))~0U;
- skb->transport_header = (typeof(skb->transport_header))~0U;
+ fragsz = SKB_DATA_ALIGN(fragsz);
- /* make sure we initialize shinfo sequentially */
- shinfo = skb_shinfo(skb);
- memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
- atomic_set(&shinfo->dataref, 1);
+ return __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
+}
+EXPORT_SYMBOL(__napi_alloc_frag_align);
- if (flags & SKB_ALLOC_FCLONE) {
- struct sk_buff_fclones *fclones;
+void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
+{
+ struct page_frag_cache *nc;
+ void *data;
- fclones = container_of(skb, struct sk_buff_fclones, skb1);
+ fragsz = SKB_DATA_ALIGN(fragsz);
+ if (in_irq() || irqs_disabled()) {
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
+ } else {
+ local_bh_disable();
+ data = __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
+ local_bh_enable();
+ }
+ return data;
+}
+EXPORT_SYMBOL(__netdev_alloc_frag_align);
- skb->fclone = SKB_FCLONE_ORIG;
- refcount_set(&fclones->fclone_ref, 1);
+static struct sk_buff *napi_skb_cache_get(void)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct sk_buff *skb;
- fclones->skb2.fclone = SKB_FCLONE_CLONE;
- }
+ if (unlikely(!nc->skb_count))
+ nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
+ GFP_ATOMIC,
+ NAPI_SKB_CACHE_BULK,
+ nc->skb_cache);
+ if (unlikely(!nc->skb_count))
+ return NULL;
- skb_set_kcov_handle(skb, kcov_common_handle());
+ skb = nc->skb_cache[--nc->skb_count];
+ kasan_unpoison_object_data(skbuff_head_cache, skb);
-out:
return skb;
-nodata:
- kmem_cache_free(cache, skb);
- skb = NULL;
- goto out;
}
-EXPORT_SYMBOL(__alloc_skb);
/* Caller must provide SKB that is memset cleared */
-static struct sk_buff *__build_skb_around(struct sk_buff *skb,
- void *data, unsigned int frag_size)
+static void __build_skb_around(struct sk_buff *skb, void *data,
+ unsigned int frag_size)
{
struct skb_shared_info *shinfo;
unsigned int size = frag_size ? : ksize(data);
@@ -286,8 +210,6 @@ static struct sk_buff *__build_skb_around(struct sk_buff *skb,
atomic_set(&shinfo->dataref, 1);
skb_set_kcov_handle(skb, kcov_common_handle());
-
- return skb;
}
/**
@@ -318,8 +240,9 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
return NULL;
memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, frag_size);
- return __build_skb_around(skb, data, frag_size);
+ return skb;
}
/* build_skb() is wrapper over __build_skb(), that specifically
@@ -352,9 +275,9 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
if (unlikely(!skb))
return NULL;
- skb = __build_skb_around(skb, data, frag_size);
+ __build_skb_around(skb, data, frag_size);
- if (skb && frag_size) {
+ if (frag_size) {
skb->head_frag = 1;
if (page_is_pfmemalloc(virt_to_head_page(data)))
skb->pfmemalloc = 1;
@@ -363,50 +286,178 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
}
EXPORT_SYMBOL(build_skb_around);
-#define NAPI_SKB_CACHE_SIZE 64
+/**
+ * __napi_build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data, or 0 if head was kmalloced
+ *
+ * Version of __build_skb() that uses NAPI percpu caches to obtain
+ * skbuff_head instead of inplace allocation.
+ *
+ * Returns a new &sk_buff on success, %NULL on allocation failure.
+ */
+static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
+{
+ struct sk_buff *skb;
-struct napi_alloc_cache {
- struct page_frag_cache page;
- unsigned int skb_count;
- void *skb_cache[NAPI_SKB_CACHE_SIZE];
-};
+ skb = napi_skb_cache_get();
+ if (unlikely(!skb))
+ return NULL;
-static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, frag_size);
-static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask,
- unsigned int align_mask)
+ return skb;
+}
+
+/**
+ * napi_build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data, or 0 if head was kmalloced
+ *
+ * Version of __napi_build_skb() that takes care of skb->head_frag
+ * and skb->pfmemalloc when the data is a page or page fragment.
+ *
+ * Returns a new &sk_buff on success, %NULL on allocation failure.
+ */
+struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct sk_buff *skb = __napi_build_skb(data, frag_size);
- return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask);
+ if (likely(skb) && frag_size) {
+ skb->head_frag = 1;
+ skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
+ }
+
+ return skb;
}
+EXPORT_SYMBOL(napi_build_skb);
-void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
+ bool *pfmemalloc)
{
- fragsz = SKB_DATA_ALIGN(fragsz);
+ void *obj;
+ bool ret_pfmemalloc = false;
- return __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
+ /*
+ * Try a regular allocation, when that fails and we're not entitled
+ * to the reserves, fail.
+ */
+ obj = kmalloc_node_track_caller(size,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
+
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+ if (pfmemalloc)
+ *pfmemalloc = ret_pfmemalloc;
+
+ return obj;
}
-EXPORT_SYMBOL(__napi_alloc_frag_align);
-void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
+/* Allocate a new skbuff. We do this ourselves so we can fill in a few
+ * 'private' fields and also do memory statistics to find all the
+ * [BEEP] leaks.
+ *
+ */
+
+/**
+ * __alloc_skb - allocate a network buffer
+ * @size: size to allocate
+ * @gfp_mask: allocation mask
+ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ * instead of head cache and allocate a cloned (child) skb.
+ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case the data is required for writeback
+ * @node: numa node to allocate memory on
+ *
+ * Allocate a new &sk_buff. The returned buffer has no headroom and a
+ * tail room of at least size bytes. The object has a reference count
+ * of one. The return is the buffer. On a failure the return is %NULL.
+ *
+ * Buffers may only be allocated from interrupts using a @gfp_mask of
+ * %GFP_ATOMIC.
+ */
+struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+ int flags, int node)
{
- struct page_frag_cache *nc;
- void *data;
+ struct kmem_cache *cache;
+ struct sk_buff *skb;
+ u8 *data;
+ bool pfmemalloc;
- fragsz = SKB_DATA_ALIGN(fragsz);
- if (in_irq() || irqs_disabled()) {
- nc = this_cpu_ptr(&netdev_alloc_cache);
- data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
- } else {
- local_bh_disable();
- data = __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
- local_bh_enable();
+ cache = (flags & SKB_ALLOC_FCLONE)
+ ? skbuff_fclone_cache : skbuff_head_cache;
+
+ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+ gfp_mask |= __GFP_MEMALLOC;
+
+ /* Get the HEAD */
+ if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
+ likely(node == NUMA_NO_NODE || node == numa_mem_id()))
+ skb = napi_skb_cache_get();
+ else
+ skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
+ if (unlikely(!skb))
+ return NULL;
+ prefetchw(skb);
+
+ /* We do our best to align skb_shared_info on a separate cache
+ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
+ * aligned memory blocks, unless SLUB/SLAB debug is enabled.
+ * Both skb->head and skb_shared_info are cache line aligned.
+ */
+ size = SKB_DATA_ALIGN(size);
+ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
+ if (unlikely(!data))
+ goto nodata;
+ /* kmalloc(size) might give us more room than requested.
+ * Put skb_shared_info exactly at the end of allocated zone,
+ * to allow max possible filling before reallocation.
+ */
+ size = SKB_WITH_OVERHEAD(ksize(data));
+ prefetchw(data + size);
+
+ /*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise below. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, 0);
+ skb->pfmemalloc = pfmemalloc;
+
+ if (flags & SKB_ALLOC_FCLONE) {
+ struct sk_buff_fclones *fclones;
+
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+
+ skb->fclone = SKB_FCLONE_ORIG;
+ refcount_set(&fclones->fclone_ref, 1);
+
+ fclones->skb2.fclone = SKB_FCLONE_CLONE;
}
- return data;
+
+ return skb;
+
+nodata:
+ kmem_cache_free(cache, skb);
+ return NULL;
}
-EXPORT_SYMBOL(__netdev_alloc_frag_align);
+EXPORT_SYMBOL(__alloc_skb);
/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
@@ -511,7 +562,8 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
if (len <= SKB_WITH_OVERHEAD(1024) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
- skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
+ NUMA_NO_NODE);
if (!skb)
goto skb_fail;
goto skb_success;
@@ -528,7 +580,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
if (unlikely(!data))
return NULL;
- skb = __build_skb(data, len);
+ skb = __napi_build_skb(data, len);
if (unlikely(!skb)) {
skb_free_frag(data);
return NULL;
@@ -859,43 +911,36 @@ void __consume_stateless_skb(struct sk_buff *skb)
kfree_skbmem(skb);
}
-void __kfree_skb_flush(void)
+static void napi_skb_cache_put(struct sk_buff *skb)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ u32 i;
- /* flush skb_cache if containing objects */
- if (nc->skb_count) {
- kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
- nc->skb_cache);
- nc->skb_count = 0;
- }
-}
-
-static inline void _kfree_skb_defer(struct sk_buff *skb)
-{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
-
- /* drop skb->head and call any destructors for packet */
- skb_release_all(skb);
-
- /* record skb to CPU local list */
+ kasan_poison_object_data(skbuff_head_cache, skb);
nc->skb_cache[nc->skb_count++] = skb;
-#ifdef CONFIG_SLUB
- /* SLUB writes into objects when freeing */
- prefetchw(skb);
-#endif
-
- /* flush skb_cache if it is filled */
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
- kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
- nc->skb_cache);
- nc->skb_count = 0;
+ for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
+ kasan_unpoison_object_data(skbuff_head_cache,
+ nc->skb_cache[i]);
+
+ kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF,
+ nc->skb_cache + NAPI_SKB_CACHE_HALF);
+ nc->skb_count = NAPI_SKB_CACHE_HALF;
}
}
+
void __kfree_skb_defer(struct sk_buff *skb)
{
- _kfree_skb_defer(skb);
+ skb_release_all(skb);
+ napi_skb_cache_put(skb);
+}
+
+void napi_skb_free_stolen_head(struct sk_buff *skb)
+{
+ skb_dst_drop(skb);
+ skb_ext_put(skb);
+ napi_skb_cache_put(skb);
}
void napi_consume_skb(struct sk_buff *skb, int budget)
@@ -920,7 +965,8 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- _kfree_skb_defer(skb);
+ skb_release_all(skb);
+ napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);