diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-19 20:05:34 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-19 20:05:34 +0300 |
commit | 1200b6809dfd9d73bc4c7db76d288c35fa4b2ebe (patch) | |
tree | 552e03de245cdbd0780ca1215914edc4a26540f7 /net/core/skbuff.c | |
parent | 6b5f04b6cf8ebab9a65d9c0026c650bb2538fd0f (diff) | |
parent | fe30937b65354c7fec244caebbdaae68e28ca797 (diff) | |
download | linux-1200b6809dfd9d73bc4c7db76d288c35fa4b2ebe.tar.xz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
"Highlights:
1) Support more Realtek wireless chips, from Jes Sorenson.
2) New BPF types for per-cpu hash and arrap maps, from Alexei
Starovoitov.
3) Make several TCP sysctls per-namespace, from Nikolay Borisov.
4) Allow the use of SO_REUSEPORT in order to do per-thread processing
of incoming TCP/UDP connections. The muxing can be done using a
BPF program which hashes the incoming packet. From Craig Gallek.
5) Add a multiplexer for TCP streams, to provide a messaged based
interface. BPF programs can be used to determine the message
boundaries. From Tom Herbert.
6) Add 802.1AE MACSEC support, from Sabrina Dubroca.
7) Avoid factorial complexity when taking down an inetdev interface
with lots of configured addresses. We were doing things like
traversing the entire address less for each address removed, and
flushing the entire netfilter conntrack table for every address as
well.
8) Add and use SKB bulk free infrastructure, from Jesper Brouer.
9) Allow offloading u32 classifiers to hardware, and implement for
ixgbe, from John Fastabend.
10) Allow configuring IRQ coalescing parameters on a per-queue basis,
from Kan Liang.
11) Extend ethtool so that larger link mode masks can be supported.
From David Decotigny.
12) Introduce devlink, which can be used to configure port link types
(ethernet vs Infiniband, etc.), port splitting, and switch device
level attributes as a whole. From Jiri Pirko.
13) Hardware offload support for flower classifiers, from Amir Vadai.
14) Add "Local Checksum Offload". Basically, for a tunneled packet
the checksum of the outer header is 'constant' (because with the
checksum field filled into the inner protocol header, the payload
of the outer frame checksums to 'zero'), and we can take advantage
of that in various ways. From Edward Cree"
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1548 commits)
bonding: fix bond_get_stats()
net: bcmgenet: fix dma api length mismatch
net/mlx4_core: Fix backward compatibility on VFs
phy: mdio-thunder: Fix some Kconfig typos
lan78xx: add ndo_get_stats64
lan78xx: handle statistics counter rollover
RDS: TCP: Remove unused constant
RDS: TCP: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket
net: smc911x: convert pxa dma to dmaengine
team: remove duplicate set of flag IFF_MULTICAST
bonding: remove duplicate set of flag IFF_MULTICAST
net: fix a comment typo
ethernet: micrel: fix some error codes
ip_tunnels, bpf: define IP_TUNNEL_OPTS_MAX and use it
bpf, dst: add and use dst_tclassid helper
bpf: make skb->tc_classid also readable
net: mvneta: bm: clarify dependencies
cls_bpf: reset class and reuse major in da
ldmvsw: Checkpatch sunvnet.c and sunvnet_common.c
ldmvsw: Add ldmvsw.c driver code
...
Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r-- | net/core/skbuff.c | 165 |
1 files changed, 119 insertions, 46 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8616d1147c93..f044f970f1a6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -349,8 +349,16 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(build_skb); +#define NAPI_SKB_CACHE_SIZE 64 + +struct napi_alloc_cache { + struct page_frag_cache page; + size_t skb_count; + void *skb_cache[NAPI_SKB_CACHE_SIZE]; +}; + static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache); +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { @@ -380,9 +388,9 @@ EXPORT_SYMBOL(netdev_alloc_frag); static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return __alloc_page_frag(nc, fragsz, gfp_mask); + return __alloc_page_frag(&nc->page, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -476,7 +484,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; void *data; @@ -496,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __alloc_page_frag(nc, len, gfp_mask); + data = __alloc_page_frag(&nc->page, len, gfp_mask); if (unlikely(!data)) return NULL; @@ -507,7 +515,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, } /* use OR instead of assignment to avoid clearing of bits in mask */ - if (nc->pfmemalloc) + if (nc->page.pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; @@ -749,6 +757,73 @@ void consume_skb(struct sk_buff *skb) } EXPORT_SYMBOL(consume_skb); +void __kfree_skb_flush(void) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + /* flush skb_cache if containing objects */ + if (nc->skb_count) { + kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count, + nc->skb_cache); + nc->skb_count = 0; + } +} + +static inline void _kfree_skb_defer(struct sk_buff *skb) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + /* drop skb->head and call any destructors for packet */ + skb_release_all(skb); + + /* record skb to CPU local list */ + nc->skb_cache[nc->skb_count++] = skb; + +#ifdef CONFIG_SLUB + /* SLUB writes into objects when freeing */ + prefetchw(skb); +#endif + + /* flush skb_cache if it is filled */ + if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { + kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE, + nc->skb_cache); + nc->skb_count = 0; + } +} +void __kfree_skb_defer(struct sk_buff *skb) +{ + _kfree_skb_defer(skb); +} + +void napi_consume_skb(struct sk_buff *skb, int budget) +{ + if (unlikely(!skb)) + return; + + /* Zero budget indicate non-NAPI context called us, like netpoll */ + if (unlikely(!budget)) { + dev_consume_skb_any(skb); + return; + } + + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + /* if reaching here SKB is ready to free */ + trace_consume_skb(skb); + + /* if SKB is a clone, don't handle this case */ + if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { + __kfree_skb(skb); + return; + } + + _kfree_skb_defer(skb); +} +EXPORT_SYMBOL(napi_consume_skb); + /* Make sure a field is enclosed inside headers_start/headers_end section */ #define CHECK_SKB_FIELD(field) \ BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ @@ -1843,6 +1918,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, struct splice_pipe_desc *spd, struct sock *sk) { int seg; + struct sk_buff *iter; /* map the linear part : * If skb->head_frag is set, this 'linear' part is backed by a @@ -1869,6 +1945,19 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, return true; } + skb_walk_frags(skb, iter) { + if (*offset >= iter->len) { + *offset -= iter->len; + continue; + } + /* __skb_splice_bits() only fails if the output has no room + * left, so no point in going over the frag_list for the error + * case. + */ + if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) + return true; + } + return false; } @@ -1895,9 +1984,7 @@ ssize_t skb_socket_splice(struct sock *sk, /* * Map data from the skb to a pipe. Should handle both the linear part, - * the fragments, and the frag list. It does NOT handle frag lists within - * the frag list, if such a thing exists. We'd probably need to recurse to - * handle that cleanly. + * the fragments, and the frag list. */ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, @@ -1916,29 +2003,10 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; - struct sk_buff *frag_iter; int ret = 0; - /* - * __skb_splice_bits() only fails if the output has no room left, - * so no point in going over the frag_list for the error case. - */ - if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) - goto done; - else if (!tlen) - goto done; + __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); - /* - * now see if we have a frag_list to map - */ - skb_walk_frags(skb, frag_iter) { - if (!tlen) - break; - if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) - break; - } - -done: if (spd.nr_pages) ret = splice_cb(sk, pipe, &spd); @@ -3024,8 +3092,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (unlikely(!proto)) return ERR_PTR(-EINVAL); - csum = !head_skb->encap_hdr_csum && - !!can_checksum_protocol(features, proto); + csum = !!can_checksum_protocol(features, proto); headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -3118,13 +3185,15 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (nskb->len == len + doffset) goto perform_csum_check; - if (!sg && !nskb->remcsum_offload) { - nskb->ip_summed = CHECKSUM_NONE; - nskb->csum = skb_copy_and_csum_bits(head_skb, offset, - skb_put(nskb, len), - len, 0); + if (!sg) { + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_copy_and_csum_bits(head_skb, offset, + skb_put(nskb, len), + len, 0); SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + doffset; + skb_headroom(nskb) + doffset; continue; } @@ -3190,12 +3259,19 @@ skip_fraglist: nskb->truesize += nskb->data_len; perform_csum_check: - if (!csum && !nskb->remcsum_offload) { - nskb->csum = skb_checksum(nskb, doffset, - nskb->len - doffset, 0); - nskb->ip_summed = CHECKSUM_NONE; + if (!csum) { + if (skb_has_shared_frag(nskb)) { + err = __skb_linearize(nskb); + if (err) + goto err; + } + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_checksum(nskb, doffset, + nskb->len - doffset, 0); SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + doffset; + skb_headroom(nskb) + doffset; } } while ((offset += len) < head_skb->len); @@ -4237,7 +4313,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - skb_sender_cpu_clear(skb); secpath_reset(skb); nf_reset(skb); nf_reset_trace(skb); @@ -4433,9 +4508,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) skb->mac_len += VLAN_HLEN; __skb_pull(skb, offset); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(skb->data - + (2 * ETH_ALEN), VLAN_HLEN, 0)); + skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; |