From 47e4075df300050a920b99299c4db3dad9adaba9 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 22 Sep 2021 09:56:02 +0200 Subject: xsk: Batched buffer allocation for the pool Add a new driver interface xsk_buff_alloc_batch() offering batched buffer allocations to improve performance. The new interface takes three arguments: the buffer pool to allocated from, a pointer to an array of struct xdp_buff pointers which will contain pointers to the allocated xdp_buffs, and an unsigned integer specifying the max number of buffers to allocate. The return value is the actual number of buffers that the allocator managed to allocate and it will be in the range 0 <= N <= max, where max is the third parameter to the function. u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max); A second driver interface is also introduced that need to be used in conjunction with xsk_buff_alloc_batch(). It is a helper that sets the size of struct xdp_buff and is used by the NIC Rx irq routine when receiving a packet. This helper sets the three struct members data, data_meta, and data_end. The two first ones is in the xsk_buff_alloc() case set in the allocation routine and data_end is set when a packet is received in the receive irq function. This unfortunately leads to worse performance since the xdp_buff is touched twice with a long time period in between leading to an extra cache miss. Instead, we fill out the xdp_buff with all 3 fields at one single point in time in the driver, when the size of the packet is known. Hence this helper. Note that the driver has to use this helper (or set all three fields itself) when using xsk_buff_alloc_batch(). xsk_buff_alloc() works as before and does not require this. void xsk_buff_set_size(struct xdp_buff *xdp, u32 size); Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210922075613.12186-3-magnus.karlsson@gmail.com --- net/xdp/xsk_buff_pool.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++ net/xdp/xsk_queue.h | 12 ++++--- 2 files changed, 95 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 8de01aaac4a0..884d95d70f5e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -507,6 +507,93 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) } EXPORT_SYMBOL(xp_alloc); +static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) +{ + u32 i, cached_cons, nb_entries; + + if (max > pool->free_heads_cnt) + max = pool->free_heads_cnt; + max = xskq_cons_nb_entries(pool->fq, max); + + cached_cons = pool->fq->cached_cons; + nb_entries = max; + i = max; + while (i--) { + struct xdp_buff_xsk *xskb; + u64 addr; + bool ok; + + __xskq_cons_read_addr_unchecked(pool->fq, cached_cons++, &addr); + + ok = pool->unaligned ? xp_check_unaligned(pool, &addr) : + xp_check_aligned(pool, &addr); + if (unlikely(!ok)) { + pool->fq->invalid_descs++; + nb_entries--; + continue; + } + + xskb = pool->free_heads[--pool->free_heads_cnt]; + *xdp = &xskb->xdp; + xskb->orig_addr = addr; + xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; + xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + (addr & ~PAGE_MASK); + xskb->dma = xskb->frame_dma + pool->headroom + XDP_PACKET_HEADROOM; + xdp++; + } + + xskq_cons_release_n(pool->fq, max); + return nb_entries; +} + +static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 nb_entries) +{ + struct xdp_buff_xsk *xskb; + u32 i; + + nb_entries = min_t(u32, nb_entries, pool->free_list_cnt); + + i = nb_entries; + while (i--) { + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node); + list_del(&xskb->free_list_node); + + *xdp = &xskb->xdp; + xdp++; + } + pool->free_list_cnt -= nb_entries; + + return nb_entries; +} + +u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) +{ + u32 nb_entries1 = 0, nb_entries2; + + if (unlikely(pool->dma_need_sync)) { + /* Slow path */ + *xdp = xp_alloc(pool); + return !!*xdp; + } + + if (unlikely(pool->free_list_cnt)) { + nb_entries1 = xp_alloc_reused(pool, xdp, max); + if (nb_entries1 == max) + return nb_entries1; + + max -= nb_entries1; + xdp += nb_entries1; + } + + nb_entries2 = xp_alloc_new_from_fq(pool, xdp, max); + if (!nb_entries2) + pool->fq->queue_empty_descs++; + + return nb_entries1 + nb_entries2; +} +EXPORT_SYMBOL(xp_alloc_batch); + bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) { if (pool->free_list_cnt >= count) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9ae13cccfb28..e9aa2c236356 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -111,14 +111,18 @@ struct xsk_queue { /* Functions that read and validate content from consumer rings. */ -static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) +static inline void __xskq_cons_read_addr_unchecked(struct xsk_queue *q, u32 cached_cons, u64 *addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 idx = cached_cons & q->ring_mask; - if (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; + *addr = ring->desc[idx]; +} - *addr = ring->desc[idx]; +static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) +{ + if (q->cached_cons != q->cached_prod) { + __xskq_cons_read_addr_unchecked(q, q->cached_cons, addr); return true; } -- cgit v1.2.3 From 94033cd8e73b8632bab7c8b7bb54caa4f5616db7 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 22 Sep 2021 09:56:06 +0200 Subject: xsk: Optimize for aligned case Optimize for the aligned case by precomputing the parameter values of the xdp_buff_xsk and xdp_buff structures in the heads array. We can do this as the heads array size is equal to the number of chunks in the umem for the aligned case. Then every entry in this array will reflect a certain chunk/frame and can therefore be prepopulated with the correct values and we can drop the use of the free_heads stack. Note that it is not possible to allocate more buffers than what has been allocated in the aligned case since each chunk can only contain a single buffer. We can unfortunately not do this in the unaligned case as one chunk might contain multiple buffers. In this case, we keep the old scheme of populating a heads entry every time it is used and using the free_heads stack. Also move xp_release() and xp_get_handle() to xsk_buff_pool.h. They were for some reason in xsk.c even though they are buffer pool operations. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210922075613.12186-7-magnus.karlsson@gmail.com --- include/net/xsk_buff_pool.h | 46 ++++++++++++++++++++++++++++++++++++- net/xdp/xsk.c | 15 ------------ net/xdp/xsk_buff_pool.c | 56 +++++++++++++++++++++++++++------------------ 3 files changed, 79 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index b7068f97639f..ddeefc4a1040 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -7,6 +7,7 @@ #include #include #include +#include #include struct xsk_buff_pool; @@ -66,6 +67,7 @@ struct xsk_buff_pool { u32 free_heads_cnt; u32 headroom; u32 chunk_size; + u32 chunk_shift; u32 frame_len; u8 cached_need_wakeup; bool uses_need_wakeup; @@ -80,6 +82,13 @@ struct xsk_buff_pool { struct xdp_buff_xsk *free_heads[]; }; +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + /* AF_XDP core. */ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem); @@ -88,7 +97,6 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, struct net_device *dev, u16 queue_id); void xp_destroy(struct xsk_buff_pool *pool); -void xp_release(struct xdp_buff_xsk *xskb); void xp_get_pool(struct xsk_buff_pool *pool); bool xp_put_pool(struct xsk_buff_pool *pool); void xp_clear_dev(struct xsk_buff_pool *pool); @@ -98,6 +106,21 @@ void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); /* AF_XDP, and XDP core. */ void xp_free(struct xdp_buff_xsk *xskb); +static inline void xp_init_xskb_addr(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, + u64 addr) +{ + xskb->orig_addr = addr; + xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; +} + +static inline void xp_init_xskb_dma(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, + dma_addr_t *dma_pages, u64 addr) +{ + xskb->frame_dma = (dma_pages[addr >> PAGE_SHIFT] & ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); + xskb->dma = xskb->frame_dma + pool->headroom + XDP_PACKET_HEADROOM; +} + /* AF_XDP ZC drivers, via xdp_sock_buff.h */ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, @@ -180,4 +203,25 @@ static inline u64 xp_unaligned_add_offset_to_addr(u64 addr) xp_unaligned_extract_offset(addr); } +static inline u32 xp_aligned_extract_idx(struct xsk_buff_pool *pool, u64 addr) +{ + return xp_aligned_extract_addr(pool, addr) >> pool->chunk_shift; +} + +static inline void xp_release(struct xdp_buff_xsk *xskb) +{ + if (xskb->pool->unaligned) + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; +} + +static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb) +{ + u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + + offset += xskb->pool->headroom; + if (!xskb->pool->unaligned) + return xskb->orig_addr + offset; + return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); +} + #endif /* XSK_BUFF_POOL_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index d6b500dc4208..f16074eb53c7 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -134,21 +134,6 @@ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, return 0; } -void xp_release(struct xdp_buff_xsk *xskb) -{ - xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; -} - -static u64 xp_get_handle(struct xdp_buff_xsk *xskb) -{ - u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; - - offset += xskb->pool->headroom; - if (!xskb->pool->unaligned) - return xskb->orig_addr + offset; - return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); -} - static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 884d95d70f5e..96b14e51ba7e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -44,12 +44,13 @@ void xp_destroy(struct xsk_buff_pool *pool) struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem) { + bool unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; struct xsk_buff_pool *pool; struct xdp_buff_xsk *xskb; - u32 i; + u32 i, entries; - pool = kvzalloc(struct_size(pool, free_heads, umem->chunks), - GFP_KERNEL); + entries = unaligned ? umem->chunks : 0; + pool = kvzalloc(struct_size(pool, free_heads, entries), GFP_KERNEL); if (!pool) goto out; @@ -63,7 +64,8 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->free_heads_cnt = umem->chunks; pool->headroom = umem->headroom; pool->chunk_size = umem->chunk_size; - pool->unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; + pool->chunk_shift = ffs(umem->chunk_size) - 1; + pool->unaligned = unaligned; pool->frame_len = umem->chunk_size - umem->headroom - XDP_PACKET_HEADROOM; pool->umem = umem; @@ -81,7 +83,10 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb = &pool->heads[i]; xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; - pool->free_heads[i] = xskb; + if (pool->unaligned) + pool->free_heads[i] = xskb; + else + xp_init_xskb_addr(xskb, pool, i * pool->chunk_size); } return pool; @@ -406,6 +411,12 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, if (pool->unaligned) xp_check_dma_contiguity(dma_map); + else + for (i = 0; i < pool->heads_cnt; i++) { + struct xdp_buff_xsk *xskb = &pool->heads[i]; + + xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); + } err = xp_init_dma_info(pool, dma_map); if (err) { @@ -448,8 +459,6 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) if (pool->free_heads_cnt == 0) return NULL; - xskb = pool->free_heads[--pool->free_heads_cnt]; - for (;;) { if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { pool->fq->queue_empty_descs++; @@ -466,17 +475,17 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) } break; } - xskq_cons_release(pool->fq); - xskb->orig_addr = addr; - xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; - if (pool->dma_pages_cnt) { - xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] & - ~XSK_NEXT_PG_CONTIG_MASK) + - (addr & ~PAGE_MASK); - xskb->dma = xskb->frame_dma + pool->headroom + - XDP_PACKET_HEADROOM; + if (pool->unaligned) { + xskb = pool->free_heads[--pool->free_heads_cnt]; + xp_init_xskb_addr(xskb, pool, addr); + if (pool->dma_pages_cnt) + xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); + } else { + xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; } + + xskq_cons_release(pool->fq); return xskb; } @@ -533,13 +542,16 @@ static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xd continue; } - xskb = pool->free_heads[--pool->free_heads_cnt]; + if (pool->unaligned) { + xskb = pool->free_heads[--pool->free_heads_cnt]; + xp_init_xskb_addr(xskb, pool, addr); + if (pool->dma_pages_cnt) + xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); + } else { + xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; + } + *xdp = &xskb->xdp; - xskb->orig_addr = addr; - xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; - xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] & - ~XSK_NEXT_PG_CONTIG_MASK) + (addr & ~PAGE_MASK); - xskb->dma = xskb->frame_dma + pool->headroom + XDP_PACKET_HEADROOM; xdp++; } -- cgit v1.2.3 From 3103836496e75095ac208cc180a9fe8f7ff33fd8 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 29 Sep 2021 08:14:03 +0200 Subject: xsk: Fix clang build error in __xp_alloc Fix a build error with clang in __xp_alloc(): [...] net/xdp/xsk_buff_pool.c:465:15: error: variable 'xskb' is uninitialized when used here [-Werror,-Wuninitialized] xp_release(xskb); ^~~~ This is correctly detected by clang, but not gcc. In fact, the xp_release() statement should not be there at all in the refactored code, just remove it. Fixes: 94033cd8e73b ("xsk: Optimize for aligned case") Reported-by: Nathan Chancellor Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20210929061403.8587-1-magnus.karlsson@gmail.com --- net/xdp/xsk_buff_pool.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 96b14e51ba7e..90c4e1e819d3 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -462,7 +462,6 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) for (;;) { if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { pool->fq->queue_empty_descs++; - xp_release(xskb); return NULL; } -- cgit v1.2.3 From de21d8bf777240c6d6dfefa39b4925729e32c0fd Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Tue, 28 Sep 2021 10:30:59 +0100 Subject: bpf: Do not invoke the XDP dispatcher for PROG_RUN with single repeat We have a unit test that invokes an XDP program with 1m different inputs, aka 1m BPF_PROG_RUN syscalls. We run this test concurrently with slight variations in how we generated the input. Since commit f23c4b3924d2 ("bpf: Start using the BPF dispatcher in BPF_TEST_RUN") the unit test has slowed down significantly. Digging deeper reveals that the concurrent tests are serialised in the kernel on the XDP dispatcher. This is a global resource that is protected by a mutex, on which we contend. Fix this by not calling into the XDP dispatcher if we only want to perform a single run of the BPF program. See: https://lore.kernel.org/bpf/CACAyw9_y4QumOW35qpgTbLsJ532uGq-kVW-VESJzGyiZkypnvw@mail.gmail.com/ Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210928093100.27124-1-lmb@cloudflare.com --- net/bpf/test_run.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index fcb2f493f710..6593a71dba5f 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -803,7 +803,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (ret) goto free_data; - bpf_prog_change_xdp(NULL, prog); + if (repeat > 1) + bpf_prog_change_xdp(NULL, prog); ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); /* We convert the xdp_buff back to an xdp_md before checking the return * code so the reference count of any held netdevice will be decremented @@ -824,7 +825,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, sizeof(struct xdp_md)); out: - bpf_prog_change_xdp(prog, NULL); + if (repeat > 1) + bpf_prog_change_xdp(prog, NULL); free_data: kfree(data); free_ctx: -- cgit v1.2.3