summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/bpf/bpf_devel_QA.rst15
-rw-r--r--MAINTAINERS6
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_main.c28
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_txrx.c134
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_txrx.h17
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_txrx_common.h40
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_type.h5
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_xsk.c379
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_xsk.h3
-rw-r--r--drivers/net/ethernet/intel/ice/ice_base.c16
-rw-r--r--drivers/net/ethernet/intel/ice/ice_txrx.h8
-rw-r--r--drivers/net/ethernet/intel/ice/ice_xsk.c378
-rw-r--r--drivers/net/ethernet/intel/ice/ice_xsk.h13
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe.h9
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_main.c15
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h2
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c309
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en.h7
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/params.c13
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c33
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c113
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h25
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c9
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c51
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c25
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_rx.c34
-rw-r--r--drivers/net/hyperv/netvsc_bpf.c1
-rw-r--r--include/linux/bpf-cgroup.h1
-rw-r--r--include/net/xdp.h9
-rw-r--r--include/net/xdp_sock.h287
-rw-r--r--include/net/xdp_sock_drv.h232
-rw-r--r--include/net/xsk_buff_pool.h140
-rw-r--r--include/trace/events/xdp.h2
-rw-r--r--include/uapi/linux/bpf.h4
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/syscall.c12
-rw-r--r--kernel/bpf/verifier.c45
-rw-r--r--net/bpf/test_run.c8
-rw-r--r--net/core/filter.c4
-rw-r--r--net/core/xdp.c51
-rw-r--r--net/ethtool/channels.c2
-rw-r--r--net/ethtool/ioctl.c2
-rw-r--r--net/ipv4/af_inet.c8
-rw-r--r--net/ipv6/af_inet6.c9
-rw-r--r--net/xdp/Makefile3
-rw-r--r--net/xdp/xdp_umem.c55
-rw-r--r--net/xdp/xdp_umem.h2
-rw-r--r--net/xdp/xsk.c204
-rw-r--r--net/xdp/xsk.h30
-rw-r--r--net/xdp/xsk_buff_pool.c336
-rw-r--r--net/xdp/xsk_diag.c2
-rw-r--r--net/xdp/xsk_queue.c63
-rw-r--r--net/xdp/xsk_queue.h117
-rw-r--r--net/xdp/xskmap.c (renamed from kernel/bpf/xskmap.c)2
-rw-r--r--samples/bpf/.gitignore1
-rw-r--r--samples/bpf/Makefile16
-rw-r--r--samples/bpf/sampleip_kern.c12
-rw-r--r--samples/bpf/sampleip_user.c7
-rw-r--r--samples/bpf/sockex3_kern.c36
-rw-r--r--samples/bpf/sockex3_user.c64
-rw-r--r--samples/bpf/trace_common.h13
-rw-r--r--samples/bpf/trace_event_kern.c24
-rw-r--r--samples/bpf/trace_event_user.c9
-rw-r--r--samples/bpf/tracex1_user.c37
-rw-r--r--samples/bpf/tracex2_kern.c27
-rw-r--r--samples/bpf/tracex2_user.c51
-rw-r--r--samples/bpf/tracex3_kern.c24
-rw-r--r--samples/bpf/tracex3_user.c61
-rw-r--r--samples/bpf/tracex4_kern.c12
-rw-r--r--samples/bpf/tracex4_user.c51
-rw-r--r--samples/bpf/tracex5_kern.c14
-rw-r--r--samples/bpf/tracex5_user.c66
-rw-r--r--samples/bpf/tracex6_kern.c38
-rw-r--r--samples/bpf/tracex6_user.c49
-rw-r--r--samples/bpf/tracex7_user.c39
-rw-r--r--samples/bpf/xdp_redirect_cpu_user.c5
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-cgroup.rst10
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-prog.rst3
-rw-r--r--tools/bpf/bpftool/bash-completion/bpftool15
-rw-r--r--tools/bpf/bpftool/cgroup.c7
-rw-r--r--tools/bpf/bpftool/main.h4
-rw-r--r--tools/bpf/bpftool/prog.c6
-rw-r--r--tools/include/uapi/linux/bpf.h10
-rw-r--r--tools/lib/bpf/hashmap.c5
-rw-r--r--tools/lib/bpf/hashmap.h1
-rw-r--r--tools/lib/bpf/libbpf.c8
-rw-r--r--tools/testing/selftests/bpf/README.rst2
-rw-r--r--tools/testing/selftests/bpf/config2
-rw-r--r--tools/testing/selftests/bpf/network_helpers.c11
-rw-r--r--tools/testing/selftests/bpf/network_helpers.h1
-rw-r--r--tools/testing/selftests/bpf/prog_tests/align.c (renamed from tools/testing/selftests/bpf/test_align.c)109
-rw-r--r--tools/testing/selftests/bpf/prog_tests/connect_force_port.c107
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c16
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c16
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_netlink.c16
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_task.c16
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_task_file.c18
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c15
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c15
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h16
-rw-r--r--tools/testing/selftests/bpf/progs/connect_force_port4.c59
-rw-r--r--tools/testing/selftests/bpf/progs/connect_force_port6.c70
-rw-r--r--tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c1
-rw-r--r--tools/testing/selftests/bpf/progs/test_sockmap_kern.h (renamed from tools/testing/selftests/bpf/test_sockmap_kern.h)158
-rw-r--r--tools/testing/selftests/bpf/test_sockmap.c913
-rw-r--r--tools/testing/selftests/bpf/verifier/ref_tracking.c33
-rw-r--r--tools/testing/selftests/bpf/verifier/value_or_null.c19
109 files changed, 2776 insertions, 2887 deletions
diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst
index 38c15c6fcb14..0b3db91dc100 100644
--- a/Documentation/bpf/bpf_devel_QA.rst
+++ b/Documentation/bpf/bpf_devel_QA.rst
@@ -437,6 +437,21 @@ needed::
See the kernels selftest `Documentation/dev-tools/kselftest.rst`_
document for further documentation.
+To maximize the number of tests passing, the .config of the kernel
+under test should match the config file fragment in
+tools/testing/selftests/bpf as closely as possible.
+
+Finally to ensure support for latest BPF Type Format features -
+discussed in `Documentation/bpf/btf.rst`_ - pahole version 1.16
+is required for kernels built with CONFIG_DEBUG_INFO_BTF=y.
+pahole is delivered in the dwarves package or can be built
+from source at
+
+https://github.com/acmel/dwarves
+
+Some distros have pahole version 1.16 packaged already, e.g.
+Fedora, Gentoo.
+
Q: Which BPF kernel selftests version should I run my kernel against?
---------------------------------------------------------------------
A: If you run a kernel ``xyz``, then always run the BPF kernel selftests
diff --git a/MAINTAINERS b/MAINTAINERS
index b7844f6cfa4a..087e68b21f9f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18443,8 +18443,12 @@ R: Jonathan Lemon <jonathan.lemon@gmail.com>
L: netdev@vger.kernel.org
L: bpf@vger.kernel.org
S: Maintained
-F: kernel/bpf/xskmap.c
+F: include/net/xdp_sock*
+F: include/net/xsk_buffer_pool.h
+F: include/uapi/linux/if_xdp.h
F: net/xdp/
+F: samples/bpf/xdpsock*
+F: tools/lib/bpf/xsk*
XEN BLOCK SUBSYSTEM
M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 2a037ec244b9..ea7395b391e5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11,7 +11,7 @@
#include "i40e_diag.h"
#include "i40e_xsk.h"
#include <net/udp_tunnel.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
/* All i40e tracepoints are defined by the include below, which
* must be included exactly once across the whole kernel with
* CREATE_TRACE_POINTS defined
@@ -3260,26 +3260,31 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
if (ring->vsi->type == I40E_VSI_MAIN)
xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
+ kfree(ring->rx_bi);
ring->xsk_umem = i40e_xsk_umem(ring);
if (ring->xsk_umem) {
- ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr -
- XDP_PACKET_HEADROOM;
+ ret = i40e_alloc_rx_bi_zc(ring);
+ if (ret)
+ return ret;
+ ring->rx_buf_len = xsk_umem_get_rx_frame_size(ring->xsk_umem);
/* For AF_XDP ZC, we disallow packets to span on
* multiple buffers, thus letting us skip that
* handling in the fast-path.
*/
chain_len = 1;
- ring->zca.free = i40e_zca_free;
ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
- MEM_TYPE_ZERO_COPY,
- &ring->zca);
+ MEM_TYPE_XSK_BUFF_POOL,
+ NULL);
if (ret)
return ret;
dev_info(&vsi->back->pdev->dev,
- "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n",
+ "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
ring->queue_index);
} else {
+ ret = i40e_alloc_rx_bi(ring);
+ if (ret)
+ return ret;
ring->rx_buf_len = vsi->rx_buf_len;
if (ring->vsi->type == I40E_VSI_MAIN) {
ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
@@ -3344,9 +3349,12 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
writel(0, ring->tail);
- ok = ring->xsk_umem ?
- i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring)) :
- !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
+ if (ring->xsk_umem) {
+ xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
+ ok = i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring));
+ } else {
+ ok = !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
+ }
if (!ok) {
/* Log this in case the user has forgotten to give the kernel
* any buffers, even later in the application.
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index a3772beffe02..f613782f2f56 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -521,28 +521,29 @@ int i40e_add_del_fdir(struct i40e_vsi *vsi,
/**
* i40e_fd_handle_status - check the Programming Status for FD
* @rx_ring: the Rx ring for this descriptor
- * @rx_desc: the Rx descriptor for programming Status, not a packet descriptor.
+ * @qword0_raw: qword0
+ * @qword1: qword1 after le_to_cpu
* @prog_id: the id originally used for programming
*
* This is used to verify if the FD programming or invalidation
* requested by SW to the HW is successful or not and take actions accordingly.
**/
-void i40e_fd_handle_status(struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc, u8 prog_id)
+static void i40e_fd_handle_status(struct i40e_ring *rx_ring, u64 qword0_raw,
+ u64 qword1, u8 prog_id)
{
struct i40e_pf *pf = rx_ring->vsi->back;
struct pci_dev *pdev = pf->pdev;
+ struct i40e_32b_rx_wb_qw0 *qw0;
u32 fcnt_prog, fcnt_avail;
u32 error;
- u64 qw;
- qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
- error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
+ qw0 = (struct i40e_32b_rx_wb_qw0 *)&qword0_raw;
+ error = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT;
if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) {
- pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id);
- if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) ||
+ pf->fd_inv = le32_to_cpu(qw0->hi_dword.fd_id);
+ if (qw0->hi_dword.fd_id != 0 ||
(I40E_DEBUG_FD & pf->hw.debug_mask))
dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n",
pf->fd_inv);
@@ -560,7 +561,7 @@ void i40e_fd_handle_status(struct i40e_ring *rx_ring,
/* store the current atr filter count */
pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf);
- if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) &&
+ if (qw0->hi_dword.fd_id == 0 &&
test_bit(__I40E_FD_SB_AUTO_DISABLED, pf->state)) {
/* These set_bit() calls aren't atomic with the
* test_bit() here, but worse case we potentially
@@ -589,7 +590,7 @@ void i40e_fd_handle_status(struct i40e_ring *rx_ring,
} else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) {
if (I40E_DEBUG_FD & pf->hw.debug_mask)
dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n",
- rx_desc->wb.qword0.hi_dword.fd_id);
+ qw0->hi_dword.fd_id);
}
}
@@ -1195,6 +1196,11 @@ clear_counts:
rc->total_packets = 0;
}
+static struct i40e_rx_buffer *i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx)
+{
+ return &rx_ring->rx_bi[idx];
+}
+
/**
* i40e_reuse_rx_page - page flip buffer and store it back on the ring
* @rx_ring: rx descriptor ring to store buffers on
@@ -1208,7 +1214,7 @@ static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
struct i40e_rx_buffer *new_buff;
u16 nta = rx_ring->next_to_alloc;
- new_buff = &rx_ring->rx_bi[nta];
+ new_buff = i40e_rx_bi(rx_ring, nta);
/* update, and store next to alloc */
nta++;
@@ -1227,29 +1233,10 @@ static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
}
/**
- * i40e_rx_is_programming_status - check for programming status descriptor
- * @qw: qword representing status_error_len in CPU ordering
- *
- * The value of in the descriptor length field indicate if this
- * is a programming status descriptor for flow director or FCoE
- * by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise
- * it is a packet descriptor.
- **/
-static inline bool i40e_rx_is_programming_status(u64 qw)
-{
- /* The Rx filter programming status and SPH bit occupy the same
- * spot in the descriptor. Since we don't support packet split we
- * can just reuse the bit as an indication that this is a
- * programming status descriptor.
- */
- return qw & I40E_RXD_QW1_LENGTH_SPH_MASK;
-}
-
-/**
- * i40e_clean_programming_status - try clean the programming status descriptor
+ * i40e_clean_programming_status - clean the programming status descriptor
* @rx_ring: the rx ring that has this descriptor
- * @rx_desc: the rx descriptor written back by HW
- * @qw: qword representing status_error_len in CPU ordering
+ * @qword0_raw: qword0
+ * @qword1: qword1 representing status_error_len in CPU ordering
*
* Flow director should handle FD_FILTER_STATUS to check its filter programming
* status being successful or not and take actions accordingly. FCoE should
@@ -1257,34 +1244,16 @@ static inline bool i40e_rx_is_programming_status(u64 qw)
*
* Returns an i40e_rx_buffer to reuse if the cleanup occurred, otherwise NULL.
**/
-struct i40e_rx_buffer *i40e_clean_programming_status(
- struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc,
- u64 qw)
+void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw,
+ u64 qword1)
{
- struct i40e_rx_buffer *rx_buffer;
- u32 ntc;
u8 id;
- if (!i40e_rx_is_programming_status(qw))
- return NULL;
-
- ntc = rx_ring->next_to_clean;
-
- /* fetch, update, and store next to clean */
- rx_buffer = &rx_ring->rx_bi[ntc++];
- ntc = (ntc < rx_ring->count) ? ntc : 0;
- rx_ring->next_to_clean = ntc;
-
- prefetch(I40E_RX_DESC(rx_ring, ntc));
-
- id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
+ id = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;
if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS)
- i40e_fd_handle_status(rx_ring, rx_desc, id);
-
- return rx_buffer;
+ i40e_fd_handle_status(rx_ring, qword0_raw, qword1, id);
}
/**
@@ -1336,13 +1305,25 @@ err:
return -ENOMEM;
}
+int i40e_alloc_rx_bi(struct i40e_ring *rx_ring)
+{
+ unsigned long sz = sizeof(*rx_ring->rx_bi) * rx_ring->count;
+
+ rx_ring->rx_bi = kzalloc(sz, GFP_KERNEL);
+ return rx_ring->rx_bi ? 0 : -ENOMEM;
+}
+
+static void i40e_clear_rx_bi(struct i40e_ring *rx_ring)
+{
+ memset(rx_ring->rx_bi, 0, sizeof(*rx_ring->rx_bi) * rx_ring->count);
+}
+
/**
* i40e_clean_rx_ring - Free Rx buffers
* @rx_ring: ring to be cleaned
**/
void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
{
- unsigned long bi_size;
u16 i;
/* ring already cleared, nothing to do */
@@ -1361,7 +1342,7 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
/* Free all the Rx ring sk_buffs */
for (i = 0; i < rx_ring->count; i++) {
- struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
+ struct i40e_rx_buffer *rx_bi = i40e_rx_bi(rx_ring, i);
if (!rx_bi->page)
continue;
@@ -1388,8 +1369,10 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
}
skip_free:
- bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
- memset(rx_ring->rx_bi, 0, bi_size);
+ if (rx_ring->xsk_umem)
+ i40e_clear_rx_bi_zc(rx_ring);
+ else
+ i40e_clear_rx_bi(rx_ring);
/* Zero out the descriptor ring */
memset(rx_ring->desc, 0, rx_ring->size);
@@ -1430,15 +1413,7 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring)
int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
{
struct device *dev = rx_ring->dev;
- int err = -ENOMEM;
- int bi_size;
-
- /* warn if we are about to overwrite the pointer */
- WARN_ON(rx_ring->rx_bi);
- bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
- rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
- if (!rx_ring->rx_bi)
- goto err;
+ int err;
u64_stats_init(&rx_ring->syncp);
@@ -1451,7 +1426,7 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
if (!rx_ring->desc) {
dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
rx_ring->size);
- goto err;
+ return -ENOMEM;
}
rx_ring->next_to_alloc = 0;
@@ -1463,16 +1438,12 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
rx_ring->queue_index);
if (err < 0)
- goto err;
+ return err;
}
rx_ring->xdp_prog = rx_ring->vsi->xdp_prog;
return 0;
-err:
- kfree(rx_ring->rx_bi);
- rx_ring->rx_bi = NULL;
- return err;
}
/**
@@ -1592,7 +1563,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
return false;
rx_desc = I40E_RX_DESC(rx_ring, ntu);
- bi = &rx_ring->rx_bi[ntu];
+ bi = i40e_rx_bi(rx_ring, ntu);
do {
if (!i40e_alloc_mapped_page(rx_ring, bi))
@@ -1614,7 +1585,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
ntu++;
if (unlikely(ntu == rx_ring->count)) {
rx_desc = I40E_RX_DESC(rx_ring, 0);
- bi = rx_ring->rx_bi;
+ bi = i40e_rx_bi(rx_ring, 0);
ntu = 0;
}
@@ -1981,7 +1952,7 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring,
{
struct i40e_rx_buffer *rx_buffer;
- rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
+ rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
prefetchw(rx_buffer->page);
/* we are reusing so sync this buffer for CPU use */
@@ -2382,9 +2353,12 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
*/
dma_rmb();
- rx_buffer = i40e_clean_programming_status(rx_ring, rx_desc,
- qword);
- if (unlikely(rx_buffer)) {
+ if (i40e_rx_is_programming_status(qword)) {
+ i40e_clean_programming_status(rx_ring,
+ rx_desc->raw.qword[0],
+ qword);
+ rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+ i40e_inc_ntc(rx_ring);
i40e_reuse_rx_page(rx_ring, rx_buffer);
cleaned_count++;
continue;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 36d37f31a287..5c255977fd58 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -296,17 +296,9 @@ struct i40e_tx_buffer {
struct i40e_rx_buffer {
dma_addr_t dma;
- union {
- struct {
- struct page *page;
- __u32 page_offset;
- __u16 pagecnt_bias;
- };
- struct {
- void *addr;
- u64 handle;
- };
- };
+ struct page *page;
+ __u32 page_offset;
+ __u16 pagecnt_bias;
};
struct i40e_queue_stats {
@@ -358,6 +350,7 @@ struct i40e_ring {
union {
struct i40e_tx_buffer *tx_bi;
struct i40e_rx_buffer *rx_bi;
+ struct xdp_buff **rx_bi_zc;
};
DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS);
u16 queue_index; /* Queue number of ring */
@@ -419,7 +412,6 @@ struct i40e_ring {
struct i40e_channel *ch;
struct xdp_rxq_info xdp_rxq;
struct xdp_umem *xsk_umem;
- struct zero_copy_allocator zca; /* ZC allocator anchor */
} ____cacheline_internodealigned_in_smp;
static inline bool ring_uses_build_skb(struct i40e_ring *ring)
@@ -495,6 +487,7 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
bool __i40e_chk_linearize(struct sk_buff *skb);
int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
u32 flags);
+int i40e_alloc_rx_bi(struct i40e_ring *rx_ring);
/**
* i40e_get_head - Retrieve head from head writeback
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
index 8af0e99c6c0d..667c4dc4b39f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
@@ -4,13 +4,9 @@
#ifndef I40E_TXRX_COMMON_
#define I40E_TXRX_COMMON_
-void i40e_fd_handle_status(struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc, u8 prog_id);
int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, struct i40e_ring *xdp_ring);
-struct i40e_rx_buffer *i40e_clean_programming_status(
- struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc,
- u64 qw);
+void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw,
+ u64 qword1);
void i40e_process_skb_fields(struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc, struct sk_buff *skb);
void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring);
@@ -84,6 +80,38 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring,
}
}
+/**
+ * i40e_rx_is_programming_status - check for programming status descriptor
+ * @qword1: qword1 representing status_error_len in CPU ordering
+ *
+ * The value of in the descriptor length field indicate if this
+ * is a programming status descriptor for flow director or FCoE
+ * by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise
+ * it is a packet descriptor.
+ **/
+static inline bool i40e_rx_is_programming_status(u64 qword1)
+{
+ /* The Rx filter programming status and SPH bit occupy the same
+ * spot in the descriptor. Since we don't support packet split we
+ * can just reuse the bit as an indication that this is a
+ * programming status descriptor.
+ */
+ return qword1 & I40E_RXD_QW1_LENGTH_SPH_MASK;
+}
+
+/**
+ * i40e_inc_ntc: Advance the next_to_clean index
+ * @rx_ring: Rx ring
+ **/
+static inline void i40e_inc_ntc(struct i40e_ring *rx_ring)
+{
+ u32 ntc = rx_ring->next_to_clean + 1;
+
+ ntc = (ntc < rx_ring->count) ? ntc : 0;
+ rx_ring->next_to_clean = ntc;
+ prefetch(I40E_RX_DESC(rx_ring, ntc));
+}
+
void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring);
void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring);
bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
index 6ea2867ff60f..63e098f7cb63 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
@@ -689,7 +689,7 @@ union i40e_32byte_rx_desc {
__le64 rsvd2;
} read;
struct {
- struct {
+ struct i40e_32b_rx_wb_qw0 {
struct {
union {
__le16 mirroring_status;
@@ -727,6 +727,9 @@ union i40e_32byte_rx_desc {
} hi_dword;
} qword3;
} wb; /* writeback */
+ struct {
+ u64 qword[4];
+ } raw;
};
enum i40e_rx_desc_status_bits {
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 2b9184aead5f..f3953744c505 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -2,68 +2,30 @@
/* Copyright(c) 2018 Intel Corporation. */
#include <linux/bpf_trace.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
#include "i40e.h"
#include "i40e_txrx_common.h"
#include "i40e_xsk.h"
-/**
- * i40e_xsk_umem_dma_map - DMA maps all UMEM memory for the netdev
- * @vsi: Current VSI
- * @umem: UMEM to DMA map
- *
- * Returns 0 on success, <0 on failure
- **/
-static int i40e_xsk_umem_dma_map(struct i40e_vsi *vsi, struct xdp_umem *umem)
+int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring)
{
- struct i40e_pf *pf = vsi->back;
- struct device *dev;
- unsigned int i, j;
- dma_addr_t dma;
-
- dev = &pf->pdev->dev;
- for (i = 0; i < umem->npgs; i++) {
- dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE,
- DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
- if (dma_mapping_error(dev, dma))
- goto out_unmap;
+ unsigned long sz = sizeof(*rx_ring->rx_bi_zc) * rx_ring->count;
- umem->pages[i].dma = dma;
- }
-
- return 0;
-
-out_unmap:
- for (j = 0; j < i; j++) {
- dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
- umem->pages[i].dma = 0;
- }
-
- return -1;
+ rx_ring->rx_bi_zc = kzalloc(sz, GFP_KERNEL);
+ return rx_ring->rx_bi_zc ? 0 : -ENOMEM;
}
-/**
- * i40e_xsk_umem_dma_unmap - DMA unmaps all UMEM memory for the netdev
- * @vsi: Current VSI
- * @umem: UMEM to DMA map
- **/
-static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, struct xdp_umem *umem)
+void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring)
{
- struct i40e_pf *pf = vsi->back;
- struct device *dev;
- unsigned int i;
-
- dev = &pf->pdev->dev;
-
- for (i = 0; i < umem->npgs; i++) {
- dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
+ memset(rx_ring->rx_bi_zc, 0,
+ sizeof(*rx_ring->rx_bi_zc) * rx_ring->count);
+}
- umem->pages[i].dma = 0;
- }
+static struct xdp_buff **i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx)
+{
+ return &rx_ring->rx_bi_zc[idx];
}
/**
@@ -78,7 +40,6 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
u16 qid)
{
struct net_device *netdev = vsi->netdev;
- struct xdp_umem_fq_reuse *reuseq;
bool if_running;
int err;
@@ -92,13 +53,7 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
qid >= netdev->real_num_tx_queues)
return -EINVAL;
- reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count);
- if (!reuseq)
- return -ENOMEM;
-
- xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
-
- err = i40e_xsk_umem_dma_map(vsi, umem);
+ err = xsk_buff_dma_map(umem, &vsi->back->pdev->dev, I40E_RX_DMA_ATTR);
if (err)
return err;
@@ -151,7 +106,7 @@ static int i40e_xsk_umem_disable(struct i40e_vsi *vsi, u16 qid)
}
clear_bit(qid, vsi->af_xdp_zc_qps);
- i40e_xsk_umem_dma_unmap(vsi, umem);
+ xsk_buff_dma_unmap(umem, I40E_RX_DMA_ATTR);
if (if_running) {
err = i40e_queue_pair_enable(vsi, qid);
@@ -190,11 +145,9 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
**/
static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
{
- struct xdp_umem *umem = rx_ring->xsk_umem;
int err, result = I40E_XDP_PASS;
struct i40e_ring *xdp_ring;
struct bpf_prog *xdp_prog;
- u64 offset;
u32 act;
rcu_read_lock();
@@ -203,9 +156,6 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
*/
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
act = bpf_prog_run_xdp(xdp_prog, xdp);
- offset = xdp->data - xdp->data_hard_start;
-
- xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
switch (act) {
case XDP_PASS:
@@ -232,107 +182,26 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
return result;
}
-/**
- * i40e_alloc_buffer_zc - Allocates an i40e_rx_buffer
- * @rx_ring: Rx ring
- * @bi: Rx buffer to populate
- *
- * This function allocates an Rx buffer. The buffer can come from fill
- * queue, or via the recycle queue (next_to_alloc).
- *
- * Returns true for a successful allocation, false otherwise
- **/
-static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *bi)
-{
- struct xdp_umem *umem = rx_ring->xsk_umem;
- void *addr = bi->addr;
- u64 handle, hr;
-
- if (addr) {
- rx_ring->rx_stats.page_reuse_count++;
- return true;
- }
-
- if (!xsk_umem_peek_addr(umem, &handle)) {
- rx_ring->rx_stats.alloc_page_failed++;
- return false;
- }
-
- hr = umem->headroom + XDP_PACKET_HEADROOM;
-
- bi->dma = xdp_umem_get_dma(umem, handle);
- bi->dma += hr;
-
- bi->addr = xdp_umem_get_data(umem, handle);
- bi->addr += hr;
-
- bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
-
- xsk_umem_release_addr(umem);
- return true;
-}
-
-/**
- * i40e_alloc_buffer_slow_zc - Allocates an i40e_rx_buffer
- * @rx_ring: Rx ring
- * @bi: Rx buffer to populate
- *
- * This function allocates an Rx buffer. The buffer can come from fill
- * queue, or via the reuse queue.
- *
- * Returns true for a successful allocation, false otherwise
- **/
-static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *bi)
-{
- struct xdp_umem *umem = rx_ring->xsk_umem;
- u64 handle, hr;
-
- if (!xsk_umem_peek_addr_rq(umem, &handle)) {
- rx_ring->rx_stats.alloc_page_failed++;
- return false;
- }
-
- handle &= rx_ring->xsk_umem->chunk_mask;
-
- hr = umem->headroom + XDP_PACKET_HEADROOM;
-
- bi->dma = xdp_umem_get_dma(umem, handle);
- bi->dma += hr;
-
- bi->addr = xdp_umem_get_data(umem, handle);
- bi->addr += hr;
-
- bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
-
- xsk_umem_release_addr_rq(umem);
- return true;
-}
-
-static __always_inline bool
-__i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count,
- bool alloc(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *bi))
+bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
{
u16 ntu = rx_ring->next_to_use;
union i40e_rx_desc *rx_desc;
- struct i40e_rx_buffer *bi;
+ struct xdp_buff **bi, *xdp;
+ dma_addr_t dma;
bool ok = true;
rx_desc = I40E_RX_DESC(rx_ring, ntu);
- bi = &rx_ring->rx_bi[ntu];
+ bi = i40e_rx_bi(rx_ring, ntu);
do {
- if (!alloc(rx_ring, bi)) {
+ xdp = xsk_buff_alloc(rx_ring->xsk_umem);
+ if (!xdp) {
ok = false;
goto no_buffers;
}
-
- dma_sync_single_range_for_device(rx_ring->dev, bi->dma, 0,
- rx_ring->rx_buf_len,
- DMA_BIDIRECTIONAL);
-
- rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
+ *bi = xdp;
+ dma = xsk_buff_xdp_get_dma(xdp);
+ rx_desc->read.pkt_addr = cpu_to_le64(dma);
+ rx_desc->read.hdr_addr = 0;
rx_desc++;
bi++;
@@ -340,11 +209,10 @@ __i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count,
if (unlikely(ntu == rx_ring->count)) {
rx_desc = I40E_RX_DESC(rx_ring, 0);
- bi = rx_ring->rx_bi;
+ bi = i40e_rx_bi(rx_ring, 0);
ntu = 0;
}
- rx_desc->wb.qword1.status_error_len = 0;
count--;
} while (count);
@@ -356,127 +224,8 @@ no_buffers:
}
/**
- * i40e_alloc_rx_buffers_zc - Allocates a number of Rx buffers
- * @rx_ring: Rx ring
- * @count: The number of buffers to allocate
- *
- * This function allocates a number of Rx buffers from the reuse queue
- * or fill ring and places them on the Rx ring.
- *
- * Returns true for a successful allocation, false otherwise
- **/
-bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
-{
- return __i40e_alloc_rx_buffers_zc(rx_ring, count,
- i40e_alloc_buffer_slow_zc);
-}
-
-/**
- * i40e_alloc_rx_buffers_fast_zc - Allocates a number of Rx buffers
- * @rx_ring: Rx ring
- * @count: The number of buffers to allocate
- *
- * This function allocates a number of Rx buffers from the fill ring
- * or the internal recycle mechanism and places them on the Rx ring.
- *
- * Returns true for a successful allocation, false otherwise
- **/
-static bool i40e_alloc_rx_buffers_fast_zc(struct i40e_ring *rx_ring, u16 count)
-{
- return __i40e_alloc_rx_buffers_zc(rx_ring, count,
- i40e_alloc_buffer_zc);
-}
-
-/**
- * i40e_get_rx_buffer_zc - Return the current Rx buffer
- * @rx_ring: Rx ring
- * @size: The size of the rx buffer (read from descriptor)
- *
- * This function returns the current, received Rx buffer, and also
- * does DMA synchronization. the Rx ring.
- *
- * Returns the received Rx buffer
- **/
-static struct i40e_rx_buffer *i40e_get_rx_buffer_zc(struct i40e_ring *rx_ring,
- const unsigned int size)
-{
- struct i40e_rx_buffer *bi;
-
- bi = &rx_ring->rx_bi[rx_ring->next_to_clean];
-
- /* we are reusing so sync this buffer for CPU use */
- dma_sync_single_range_for_cpu(rx_ring->dev,
- bi->dma, 0,
- size,
- DMA_BIDIRECTIONAL);
-
- return bi;
-}
-
-/**
- * i40e_reuse_rx_buffer_zc - Recycle an Rx buffer
- * @rx_ring: Rx ring
- * @old_bi: The Rx buffer to recycle
- *
- * This function recycles a finished Rx buffer, and places it on the
- * recycle queue (next_to_alloc).
- **/
-static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *old_bi)
-{
- struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc];
- u16 nta = rx_ring->next_to_alloc;
-
- /* update, and store next to alloc */
- nta++;
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- /* transfer page from old buffer to new buffer */
- new_bi->dma = old_bi->dma;
- new_bi->addr = old_bi->addr;
- new_bi->handle = old_bi->handle;
-
- old_bi->addr = NULL;
-}
-
-/**
- * i40e_zca_free - Free callback for MEM_TYPE_ZERO_COPY allocations
- * @alloc: Zero-copy allocator
- * @handle: Buffer handle
- **/
-void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
-{
- struct i40e_rx_buffer *bi;
- struct i40e_ring *rx_ring;
- u64 hr, mask;
- u16 nta;
-
- rx_ring = container_of(alloc, struct i40e_ring, zca);
- hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
- mask = rx_ring->xsk_umem->chunk_mask;
-
- nta = rx_ring->next_to_alloc;
- bi = &rx_ring->rx_bi[nta];
-
- nta++;
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- handle &= mask;
-
- bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle);
- bi->dma += hr;
-
- bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
- bi->addr += hr;
-
- bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
- rx_ring->xsk_umem->headroom);
-}
-
-/**
* i40e_construct_skb_zc - Create skbufff from zero-copy Rx buffer
* @rx_ring: Rx ring
- * @bi: Rx buffer
* @xdp: xdp_buff
*
* This functions allocates a new skb from a zero-copy Rx buffer.
@@ -484,7 +233,6 @@ void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
* Returns the skb, or NULL on failure.
**/
static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *bi,
struct xdp_buff *xdp)
{
unsigned int metasize = xdp->data - xdp->data_meta;
@@ -503,24 +251,11 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
if (metasize)
skb_metadata_set(skb, metasize);
- i40e_reuse_rx_buffer_zc(rx_ring, bi);
+ xsk_buff_free(xdp);
return skb;
}
/**
- * i40e_inc_ntc: Advance the next_to_clean index
- * @rx_ring: Rx ring
- **/
-static void i40e_inc_ntc(struct i40e_ring *rx_ring)
-{
- u32 ntc = rx_ring->next_to_clean + 1;
-
- ntc = (ntc < rx_ring->count) ? ntc : 0;
- rx_ring->next_to_clean = ntc;
- prefetch(I40E_RX_DESC(rx_ring, ntc));
-}
-
-/**
* i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring
* @rx_ring: Rx ring
* @budget: NAPI budget
@@ -531,25 +266,20 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
{
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
- struct xdp_umem *umem = rx_ring->xsk_umem;
unsigned int xdp_res, xdp_xmit = 0;
bool failure = false;
struct sk_buff *skb;
- struct xdp_buff xdp;
-
- xdp.rxq = &rx_ring->xdp_rxq;
- xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
while (likely(total_rx_packets < (unsigned int)budget)) {
- struct i40e_rx_buffer *bi;
union i40e_rx_desc *rx_desc;
+ struct xdp_buff **bi;
unsigned int size;
u64 qword;
if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
failure = failure ||
- !i40e_alloc_rx_buffers_fast_zc(rx_ring,
- cleaned_count);
+ !i40e_alloc_rx_buffers_zc(rx_ring,
+ cleaned_count);
cleaned_count = 0;
}
@@ -562,35 +292,36 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
*/
dma_rmb();
- bi = i40e_clean_programming_status(rx_ring, rx_desc,
- qword);
- if (unlikely(bi)) {
- i40e_reuse_rx_buffer_zc(rx_ring, bi);
+ if (i40e_rx_is_programming_status(qword)) {
+ i40e_clean_programming_status(rx_ring,
+ rx_desc->raw.qword[0],
+ qword);
+ bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+ xsk_buff_free(*bi);
+ *bi = NULL;
cleaned_count++;
+ i40e_inc_ntc(rx_ring);
continue;
}
+ bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
if (!size)
break;
- bi = i40e_get_rx_buffer_zc(rx_ring, size);
- xdp.data = bi->addr;
- xdp.data_meta = xdp.data;
- xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
- xdp.data_end = xdp.data + size;
- xdp.handle = bi->handle;
+ bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+ (*bi)->data_end = (*bi)->data + size;
+ xsk_buff_dma_sync_for_cpu(*bi);
- xdp_res = i40e_run_xdp_zc(rx_ring, &xdp);
+ xdp_res = i40e_run_xdp_zc(rx_ring, *bi);
if (xdp_res) {
- if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) {
+ if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR))
xdp_xmit |= xdp_res;
- bi->addr = NULL;
- } else {
- i40e_reuse_rx_buffer_zc(rx_ring, bi);
- }
+ else
+ xsk_buff_free(*bi);
+ *bi = NULL;
total_rx_bytes += size;
total_rx_packets++;
@@ -606,7 +337,8 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
* BIT(I40E_RXD_QW1_ERROR_SHIFT). This is due to that
* SBP is *not* set in PRT_SBPVSI (default not set).
*/
- skb = i40e_construct_skb_zc(rx_ring, bi, &xdp);
+ skb = i40e_construct_skb_zc(rx_ring, *bi);
+ *bi = NULL;
if (!skb) {
rx_ring->rx_stats.alloc_buff_failed++;
break;
@@ -664,10 +396,9 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
break;
- dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
-
- dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
- DMA_BIDIRECTIONAL);
+ dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr);
+ xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma,
+ desc.len);
tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
tx_bi->bytecount = desc.len;
@@ -826,13 +557,13 @@ void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring)
u16 i;
for (i = 0; i < rx_ring->count; i++) {
- struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
+ struct xdp_buff *rx_bi = *i40e_rx_bi(rx_ring, i);
- if (!rx_bi->addr)
+ if (!rx_bi)
continue;
- xsk_umem_fq_reuse(rx_ring->xsk_umem, rx_bi->handle);
- rx_bi->addr = NULL;
+ xsk_buff_free(rx_bi);
+ rx_bi = NULL;
}
}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h
index 9ed59c14eb55..ea919a7d60ec 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h
@@ -12,12 +12,13 @@ int i40e_queue_pair_disable(struct i40e_vsi *vsi, int queue_pair);
int i40e_queue_pair_enable(struct i40e_vsi *vsi, int queue_pair);
int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
u16 qid);
-void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle);
bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count);
int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget);
bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
struct i40e_ring *tx_ring, int napi_budget);
int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags);
+int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring);
+void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring);
#endif /* _I40E_XSK_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c
index 4c835c144907..f066befadd39 100644
--- a/drivers/net/ethernet/intel/ice/ice_base.c
+++ b/drivers/net/ethernet/intel/ice/ice_base.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019, Intel Corporation. */
+#include <net/xdp_sock_drv.h>
#include "ice_base.h"
#include "ice_dcb_lib.h"
@@ -308,24 +309,23 @@ int ice_setup_rx_ctx(struct ice_ring *ring)
if (ring->xsk_umem) {
xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
- ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr -
- XDP_PACKET_HEADROOM;
+ ring->rx_buf_len =
+ xsk_umem_get_rx_frame_size(ring->xsk_umem);
/* For AF_XDP ZC, we disallow packets to span on
* multiple buffers, thus letting us skip that
* handling in the fast-path.
*/
chain_len = 1;
- ring->zca.free = ice_zca_free;
err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
- MEM_TYPE_ZERO_COPY,
- &ring->zca);
+ MEM_TYPE_XSK_BUFF_POOL,
+ NULL);
if (err)
return err;
+ xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
- dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n",
+ dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
ring->q_index);
} else {
- ring->zca.free = NULL;
if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
/* coverity[check_return] */
xdp_rxq_info_reg(&ring->xdp_rxq,
@@ -426,7 +426,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring)
writel(0, ring->tail);
err = ring->xsk_umem ?
- ice_alloc_rx_bufs_slow_zc(ring, ICE_DESC_UNUSED(ring)) :
+ ice_alloc_rx_bufs_zc(ring, ICE_DESC_UNUSED(ring)) :
ice_alloc_rx_bufs(ring, ICE_DESC_UNUSED(ring));
if (err)
dev_info(ice_pf_to_dev(vsi->back), "Failed allocate some buffers on %sRx ring %d (pf_q %d)\n",
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index 025dd642cf28..5f5a6ce2b7e5 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -158,17 +158,16 @@ struct ice_tx_offload_params {
};
struct ice_rx_buf {
- struct sk_buff *skb;
- dma_addr_t dma;
union {
struct {
+ struct sk_buff *skb;
+ dma_addr_t dma;
struct page *page;
unsigned int page_offset;
u16 pagecnt_bias;
};
struct {
- void *addr;
- u64 handle;
+ struct xdp_buff *xdp;
};
};
};
@@ -292,7 +291,6 @@ struct ice_ring {
struct rcu_head rcu; /* to avoid race on free */
struct bpf_prog *xdp_prog;
struct xdp_umem *xsk_umem;
- struct zero_copy_allocator zca;
/* CL3 - 3rd cacheline starts here */
struct xdp_rxq_info xdp_rxq;
/* CLX - the below items are only accessed infrequently and should be
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index 20ac54e3156d..b6f928c9e9c9 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -2,7 +2,7 @@
/* Copyright (c) 2019, Intel Corporation. */
#include <linux/bpf_trace.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
#include "ice.h"
#include "ice_base.h"
@@ -280,28 +280,6 @@ static int ice_xsk_alloc_umems(struct ice_vsi *vsi)
}
/**
- * ice_xsk_add_umem - add a UMEM region for XDP sockets
- * @vsi: VSI to which the UMEM will be added
- * @umem: pointer to a requested UMEM region
- * @qid: queue ID
- *
- * Returns 0 on success, negative on error
- */
-static int ice_xsk_add_umem(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid)
-{
- int err;
-
- err = ice_xsk_alloc_umems(vsi);
- if (err)
- return err;
-
- vsi->xsk_umems[qid] = umem;
- vsi->num_xsk_umems_used++;
-
- return 0;
-}
-
-/**
* ice_xsk_remove_umem - Remove an UMEM for a certain ring/qid
* @vsi: VSI from which the VSI will be removed
* @qid: Ring/qid associated with the UMEM
@@ -318,65 +296,6 @@ static void ice_xsk_remove_umem(struct ice_vsi *vsi, u16 qid)
}
}
-/**
- * ice_xsk_umem_dma_map - DMA map UMEM region for XDP sockets
- * @vsi: VSI to map the UMEM region
- * @umem: UMEM to map
- *
- * Returns 0 on success, negative on error
- */
-static int ice_xsk_umem_dma_map(struct ice_vsi *vsi, struct xdp_umem *umem)
-{
- struct ice_pf *pf = vsi->back;
- struct device *dev;
- unsigned int i;
-
- dev = ice_pf_to_dev(pf);
- for (i = 0; i < umem->npgs; i++) {
- dma_addr_t dma = dma_map_page_attrs(dev, umem->pgs[i], 0,
- PAGE_SIZE,
- DMA_BIDIRECTIONAL,
- ICE_RX_DMA_ATTR);
- if (dma_mapping_error(dev, dma)) {
- dev_dbg(dev, "XSK UMEM DMA mapping error on page num %d\n",
- i);
- goto out_unmap;
- }
-
- umem->pages[i].dma = dma;
- }
-
- return 0;
-
-out_unmap:
- for (; i > 0; i--) {
- dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR);
- umem->pages[i].dma = 0;
- }
-
- return -EFAULT;
-}
-
-/**
- * ice_xsk_umem_dma_unmap - DMA unmap UMEM region for XDP sockets
- * @vsi: VSI from which the UMEM will be unmapped
- * @umem: UMEM to unmap
- */
-static void ice_xsk_umem_dma_unmap(struct ice_vsi *vsi, struct xdp_umem *umem)
-{
- struct ice_pf *pf = vsi->back;
- struct device *dev;
- unsigned int i;
-
- dev = ice_pf_to_dev(pf);
- for (i = 0; i < umem->npgs; i++) {
- dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR);
-
- umem->pages[i].dma = 0;
- }
-}
/**
* ice_xsk_umem_disable - disable a UMEM region
@@ -391,7 +310,7 @@ static int ice_xsk_umem_disable(struct ice_vsi *vsi, u16 qid)
!vsi->xsk_umems[qid])
return -EINVAL;
- ice_xsk_umem_dma_unmap(vsi, vsi->xsk_umems[qid]);
+ xsk_buff_dma_unmap(vsi->xsk_umems[qid], ICE_RX_DMA_ATTR);
ice_xsk_remove_umem(vsi, qid);
return 0;
@@ -408,7 +327,6 @@ static int ice_xsk_umem_disable(struct ice_vsi *vsi, u16 qid)
static int
ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid)
{
- struct xdp_umem_fq_reuse *reuseq;
int err;
if (vsi->type != ICE_VSI_PF)
@@ -419,20 +337,18 @@ ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid)
if (qid >= vsi->num_xsk_umems)
return -EINVAL;
+ err = ice_xsk_alloc_umems(vsi);
+ if (err)
+ return err;
+
if (vsi->xsk_umems && vsi->xsk_umems[qid])
return -EBUSY;
- reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count);
- if (!reuseq)
- return -ENOMEM;
-
- xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
-
- err = ice_xsk_umem_dma_map(vsi, umem);
- if (err)
- return err;
+ vsi->xsk_umems[qid] = umem;
+ vsi->num_xsk_umems_used++;
- err = ice_xsk_add_umem(vsi, umem, qid);
+ err = xsk_buff_dma_map(vsi->xsk_umems[qid], ice_pf_to_dev(vsi->back),
+ ICE_RX_DMA_ATTR);
if (err)
return err;
@@ -484,137 +400,22 @@ xsk_umem_if_up:
}
/**
- * ice_zca_free - Callback for MEM_TYPE_ZERO_COPY allocations
- * @zca: zero-cpoy allocator
- * @handle: Buffer handle
- */
-void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle)
-{
- struct ice_rx_buf *rx_buf;
- struct ice_ring *rx_ring;
- struct xdp_umem *umem;
- u64 hr, mask;
- u16 nta;
-
- rx_ring = container_of(zca, struct ice_ring, zca);
- umem = rx_ring->xsk_umem;
- hr = umem->headroom + XDP_PACKET_HEADROOM;
-
- mask = umem->chunk_mask;
-
- nta = rx_ring->next_to_alloc;
- rx_buf = &rx_ring->rx_buf[nta];
-
- nta++;
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- handle &= mask;
-
- rx_buf->dma = xdp_umem_get_dma(umem, handle);
- rx_buf->dma += hr;
-
- rx_buf->addr = xdp_umem_get_data(umem, handle);
- rx_buf->addr += hr;
-
- rx_buf->handle = (u64)handle + umem->headroom;
-}
-
-/**
- * ice_alloc_buf_fast_zc - Retrieve buffer address from XDP umem
- * @rx_ring: ring with an xdp_umem bound to it
- * @rx_buf: buffer to which xsk page address will be assigned
- *
- * This function allocates an Rx buffer in the hot path.
- * The buffer can come from fill queue or recycle queue.
- *
- * Returns true if an assignment was successful, false if not.
- */
-static __always_inline bool
-ice_alloc_buf_fast_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
-{
- struct xdp_umem *umem = rx_ring->xsk_umem;
- void *addr = rx_buf->addr;
- u64 handle, hr;
-
- if (addr) {
- rx_ring->rx_stats.page_reuse_count++;
- return true;
- }
-
- if (!xsk_umem_peek_addr(umem, &handle)) {
- rx_ring->rx_stats.alloc_page_failed++;
- return false;
- }
-
- hr = umem->headroom + XDP_PACKET_HEADROOM;
-
- rx_buf->dma = xdp_umem_get_dma(umem, handle);
- rx_buf->dma += hr;
-
- rx_buf->addr = xdp_umem_get_data(umem, handle);
- rx_buf->addr += hr;
-
- rx_buf->handle = handle + umem->headroom;
-
- xsk_umem_release_addr(umem);
- return true;
-}
-
-/**
- * ice_alloc_buf_slow_zc - Retrieve buffer address from XDP umem
- * @rx_ring: ring with an xdp_umem bound to it
- * @rx_buf: buffer to which xsk page address will be assigned
- *
- * This function allocates an Rx buffer in the slow path.
- * The buffer can come from fill queue or recycle queue.
- *
- * Returns true if an assignment was successful, false if not.
- */
-static __always_inline bool
-ice_alloc_buf_slow_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
-{
- struct xdp_umem *umem = rx_ring->xsk_umem;
- u64 handle, headroom;
-
- if (!xsk_umem_peek_addr_rq(umem, &handle)) {
- rx_ring->rx_stats.alloc_page_failed++;
- return false;
- }
-
- handle &= umem->chunk_mask;
- headroom = umem->headroom + XDP_PACKET_HEADROOM;
-
- rx_buf->dma = xdp_umem_get_dma(umem, handle);
- rx_buf->dma += headroom;
-
- rx_buf->addr = xdp_umem_get_data(umem, handle);
- rx_buf->addr += headroom;
-
- rx_buf->handle = handle + umem->headroom;
-
- xsk_umem_release_addr_rq(umem);
- return true;
-}
-
-/**
* ice_alloc_rx_bufs_zc - allocate a number of Rx buffers
* @rx_ring: Rx ring
* @count: The number of buffers to allocate
- * @alloc: the function pointer to call for allocation
*
* This function allocates a number of Rx buffers from the fill ring
* or the internal recycle mechanism and places them on the Rx ring.
*
* Returns false if all allocations were successful, true if any fail.
*/
-static bool
-ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count,
- bool (*alloc)(struct ice_ring *, struct ice_rx_buf *))
+bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count)
{
union ice_32b_rx_flex_desc *rx_desc;
u16 ntu = rx_ring->next_to_use;
struct ice_rx_buf *rx_buf;
bool ret = false;
+ dma_addr_t dma;
if (!count)
return false;
@@ -623,16 +424,14 @@ ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count,
rx_buf = &rx_ring->rx_buf[ntu];
do {
- if (!alloc(rx_ring, rx_buf)) {
+ rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_umem);
+ if (!rx_buf->xdp) {
ret = true;
break;
}
- dma_sync_single_range_for_device(rx_ring->dev, rx_buf->dma, 0,
- rx_ring->rx_buf_len,
- DMA_BIDIRECTIONAL);
-
- rx_desc->read.pkt_addr = cpu_to_le64(rx_buf->dma);
+ dma = xsk_buff_xdp_get_dma(rx_buf->xdp);
+ rx_desc->read.pkt_addr = cpu_to_le64(dma);
rx_desc->wb.status_error0 = 0;
rx_desc++;
@@ -653,32 +452,6 @@ ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count,
}
/**
- * ice_alloc_rx_bufs_fast_zc - allocate zero copy bufs in the hot path
- * @rx_ring: Rx ring
- * @count: number of bufs to allocate
- *
- * Returns false on success, true on failure.
- */
-static bool ice_alloc_rx_bufs_fast_zc(struct ice_ring *rx_ring, u16 count)
-{
- return ice_alloc_rx_bufs_zc(rx_ring, count,
- ice_alloc_buf_fast_zc);
-}
-
-/**
- * ice_alloc_rx_bufs_slow_zc - allocate zero copy bufs in the slow path
- * @rx_ring: Rx ring
- * @count: number of bufs to allocate
- *
- * Returns false on success, true on failure.
- */
-bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count)
-{
- return ice_alloc_rx_bufs_zc(rx_ring, count,
- ice_alloc_buf_slow_zc);
-}
-
-/**
* ice_bump_ntc - Bump the next_to_clean counter of an Rx ring
* @rx_ring: Rx ring
*/
@@ -692,76 +465,21 @@ static void ice_bump_ntc(struct ice_ring *rx_ring)
}
/**
- * ice_get_rx_buf_zc - Fetch the current Rx buffer
- * @rx_ring: Rx ring
- * @size: size of a buffer
- *
- * This function returns the current, received Rx buffer and does
- * DMA synchronization.
- *
- * Returns a pointer to the received Rx buffer.
- */
-static struct ice_rx_buf *ice_get_rx_buf_zc(struct ice_ring *rx_ring, int size)
-{
- struct ice_rx_buf *rx_buf;
-
- rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
-
- dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma, 0,
- size, DMA_BIDIRECTIONAL);
-
- return rx_buf;
-}
-
-/**
- * ice_reuse_rx_buf_zc - reuse an Rx buffer
- * @rx_ring: Rx ring
- * @old_buf: The buffer to recycle
- *
- * This function recycles a finished Rx buffer, and places it on the recycle
- * queue (next_to_alloc).
- */
-static void
-ice_reuse_rx_buf_zc(struct ice_ring *rx_ring, struct ice_rx_buf *old_buf)
-{
- unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask;
- u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
- u16 nta = rx_ring->next_to_alloc;
- struct ice_rx_buf *new_buf;
-
- new_buf = &rx_ring->rx_buf[nta++];
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- new_buf->dma = old_buf->dma & mask;
- new_buf->dma += hr;
-
- new_buf->addr = (void *)((unsigned long)old_buf->addr & mask);
- new_buf->addr += hr;
-
- new_buf->handle = old_buf->handle & mask;
- new_buf->handle += rx_ring->xsk_umem->headroom;
-
- old_buf->addr = NULL;
-}
-
-/**
* ice_construct_skb_zc - Create an sk_buff from zero-copy buffer
* @rx_ring: Rx ring
* @rx_buf: zero-copy Rx buffer
- * @xdp: XDP buffer
*
* This function allocates a new skb from a zero-copy Rx buffer.
*
* Returns the skb on success, NULL on failure.
*/
static struct sk_buff *
-ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
- struct xdp_buff *xdp)
+ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
{
- unsigned int metasize = xdp->data - xdp->data_meta;
- unsigned int datasize = xdp->data_end - xdp->data;
- unsigned int datasize_hard = xdp->data_end -
- xdp->data_hard_start;
+ unsigned int metasize = rx_buf->xdp->data - rx_buf->xdp->data_meta;
+ unsigned int datasize = rx_buf->xdp->data_end - rx_buf->xdp->data;
+ unsigned int datasize_hard = rx_buf->xdp->data_end -
+ rx_buf->xdp->data_hard_start;
struct sk_buff *skb;
skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard,
@@ -769,13 +487,13 @@ ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
if (unlikely(!skb))
return NULL;
- skb_reserve(skb, xdp->data - xdp->data_hard_start);
- memcpy(__skb_put(skb, datasize), xdp->data, datasize);
+ skb_reserve(skb, rx_buf->xdp->data - rx_buf->xdp->data_hard_start);
+ memcpy(__skb_put(skb, datasize), rx_buf->xdp->data, datasize);
if (metasize)
skb_metadata_set(skb, metasize);
- ice_reuse_rx_buf_zc(rx_ring, rx_buf);
-
+ xsk_buff_free(rx_buf->xdp);
+ rx_buf->xdp = NULL;
return skb;
}
@@ -802,7 +520,6 @@ ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp)
}
act = bpf_prog_run_xdp(xdp_prog, xdp);
- xdp->handle += xdp->data - xdp->data_hard_start;
switch (act) {
case XDP_PASS:
break;
@@ -840,13 +557,8 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
{
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
- struct xdp_umem *umem = rx_ring->xsk_umem;
unsigned int xdp_xmit = 0;
bool failure = false;
- struct xdp_buff xdp;
-
- xdp.rxq = &rx_ring->xdp_rxq;
- xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
while (likely(total_rx_packets < (unsigned int)budget)) {
union ice_32b_rx_flex_desc *rx_desc;
@@ -858,8 +570,8 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
u8 rx_ptype;
if (cleaned_count >= ICE_RX_BUF_WRITE) {
- failure |= ice_alloc_rx_bufs_fast_zc(rx_ring,
- cleaned_count);
+ failure |= ice_alloc_rx_bufs_zc(rx_ring,
+ cleaned_count);
cleaned_count = 0;
}
@@ -880,25 +592,19 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
if (!size)
break;
- rx_buf = ice_get_rx_buf_zc(rx_ring, size);
- if (!rx_buf->addr)
- break;
- xdp.data = rx_buf->addr;
- xdp.data_meta = xdp.data;
- xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
- xdp.data_end = xdp.data + size;
- xdp.handle = rx_buf->handle;
+ rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
+ rx_buf->xdp->data_end = rx_buf->xdp->data + size;
+ xsk_buff_dma_sync_for_cpu(rx_buf->xdp);
- xdp_res = ice_run_xdp_zc(rx_ring, &xdp);
+ xdp_res = ice_run_xdp_zc(rx_ring, rx_buf->xdp);
if (xdp_res) {
- if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) {
+ if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))
xdp_xmit |= xdp_res;
- rx_buf->addr = NULL;
- } else {
- ice_reuse_rx_buf_zc(rx_ring, rx_buf);
- }
+ else
+ xsk_buff_free(rx_buf->xdp);
+ rx_buf->xdp = NULL;
total_rx_bytes += size;
total_rx_packets++;
cleaned_count++;
@@ -908,7 +614,7 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
}
/* XDP_PASS path */
- skb = ice_construct_skb_zc(rx_ring, rx_buf, &xdp);
+ skb = ice_construct_skb_zc(rx_ring, rx_buf);
if (!skb) {
rx_ring->rx_stats.alloc_buf_failed++;
break;
@@ -979,10 +685,9 @@ static bool ice_xmit_zc(struct ice_ring *xdp_ring, int budget)
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
break;
- dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
-
- dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
- DMA_BIDIRECTIONAL);
+ dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr);
+ xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma,
+ desc.len);
tx_buf->bytecount = desc.len;
@@ -1165,11 +870,10 @@ void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring)
for (i = 0; i < rx_ring->count; i++) {
struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i];
- if (!rx_buf->addr)
+ if (!rx_buf->xdp)
continue;
- xsk_umem_fq_reuse(rx_ring->xsk_umem, rx_buf->handle);
- rx_buf->addr = NULL;
+ rx_buf->xdp = NULL;
}
}
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.h b/drivers/net/ethernet/intel/ice/ice_xsk.h
index 8a4ba7c6d549..fc1a06b4df36 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.h
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.h
@@ -10,11 +10,10 @@ struct ice_vsi;
#ifdef CONFIG_XDP_SOCKETS
int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid);
-void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle);
int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget);
bool ice_clean_tx_irq_zc(struct ice_ring *xdp_ring, int budget);
int ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags);
-bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count);
+bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count);
bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi);
void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring);
void ice_xsk_clean_xdp_ring(struct ice_ring *xdp_ring);
@@ -27,12 +26,6 @@ ice_xsk_umem_setup(struct ice_vsi __always_unused *vsi,
return -EOPNOTSUPP;
}
-static inline void
-ice_zca_free(struct zero_copy_allocator __always_unused *zca,
- unsigned long __always_unused handle)
-{
-}
-
static inline int
ice_clean_rx_irq_zc(struct ice_ring __always_unused *rx_ring,
int __always_unused budget)
@@ -48,8 +41,8 @@ ice_clean_tx_irq_zc(struct ice_ring __always_unused *xdp_ring,
}
static inline bool
-ice_alloc_rx_bufs_slow_zc(struct ice_ring __always_unused *rx_ring,
- u16 __always_unused count)
+ice_alloc_rx_bufs_zc(struct ice_ring __always_unused *rx_ring,
+ u16 __always_unused count)
{
return false;
}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 2833e4f041ce..5ddfc83a1e46 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -224,17 +224,17 @@ struct ixgbe_tx_buffer {
};
struct ixgbe_rx_buffer {
- struct sk_buff *skb;
- dma_addr_t dma;
union {
struct {
+ struct sk_buff *skb;
+ dma_addr_t dma;
struct page *page;
__u32 page_offset;
__u16 pagecnt_bias;
};
struct {
- void *addr;
- u64 handle;
+ bool discard;
+ struct xdp_buff *xdp;
};
};
};
@@ -351,7 +351,6 @@ struct ixgbe_ring {
};
struct xdp_rxq_info xdp_rxq;
struct xdp_umem *xsk_umem;
- struct zero_copy_allocator zca; /* ZC allocator anchor */
u16 ring_idx; /* {rx,tx,xdp}_ring back reference idx */
u16 rx_buf_len;
} ____cacheline_internodealigned_in_smp;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index eab5934b04f5..45fc7ce1a543 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -35,7 +35,7 @@
#include <net/tc_act/tc_mirred.h>
#include <net/vxlan.h>
#include <net/mpls.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include <net/xfrm.h>
#include "ixgbe.h"
@@ -3745,8 +3745,7 @@ static void ixgbe_configure_srrctl(struct ixgbe_adapter *adapter,
/* configure the packet buffer length */
if (rx_ring->xsk_umem) {
- u32 xsk_buf_len = rx_ring->xsk_umem->chunk_size_nohr -
- XDP_PACKET_HEADROOM;
+ u32 xsk_buf_len = xsk_umem_get_rx_frame_size(rx_ring->xsk_umem);
/* If the MAC support setting RXDCTL.RLPML, the
* SRRCTL[n].BSIZEPKT is set to PAGE_SIZE and
@@ -4093,11 +4092,10 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
ring->xsk_umem = ixgbe_xsk_umem(adapter, ring);
if (ring->xsk_umem) {
- ring->zca.free = ixgbe_zca_free;
WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
- MEM_TYPE_ZERO_COPY,
- &ring->zca));
-
+ MEM_TYPE_XSK_BUFF_POOL,
+ NULL));
+ xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
} else {
WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
MEM_TYPE_PAGE_SHARED, NULL));
@@ -4153,8 +4151,7 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
}
if (ring->xsk_umem && hw->mac.type != ixgbe_mac_82599EB) {
- u32 xsk_buf_len = ring->xsk_umem->chunk_size_nohr -
- XDP_PACKET_HEADROOM;
+ u32 xsk_buf_len = xsk_umem_get_rx_frame_size(ring->xsk_umem);
rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK |
IXGBE_RXDCTL_RLPML_EN);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h
index 6d01700b46bc..7887ae4aaf4f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h
@@ -35,7 +35,7 @@ int ixgbe_xsk_umem_setup(struct ixgbe_adapter *adapter, struct xdp_umem *umem,
void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle);
-void ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count);
+bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count);
int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
struct ixgbe_ring *rx_ring,
const int budget);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index a656ee9a1fae..86add9fbd36c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -2,7 +2,7 @@
/* Copyright(c) 2018 Intel Corporation. */
#include <linux/bpf_trace.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
#include "ixgbe.h"
@@ -20,54 +20,11 @@ struct xdp_umem *ixgbe_xsk_umem(struct ixgbe_adapter *adapter,
return xdp_get_umem_from_qid(adapter->netdev, qid);
}
-static int ixgbe_xsk_umem_dma_map(struct ixgbe_adapter *adapter,
- struct xdp_umem *umem)
-{
- struct device *dev = &adapter->pdev->dev;
- unsigned int i, j;
- dma_addr_t dma;
-
- for (i = 0; i < umem->npgs; i++) {
- dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE,
- DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR);
- if (dma_mapping_error(dev, dma))
- goto out_unmap;
-
- umem->pages[i].dma = dma;
- }
-
- return 0;
-
-out_unmap:
- for (j = 0; j < i; j++) {
- dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR);
- umem->pages[i].dma = 0;
- }
-
- return -1;
-}
-
-static void ixgbe_xsk_umem_dma_unmap(struct ixgbe_adapter *adapter,
- struct xdp_umem *umem)
-{
- struct device *dev = &adapter->pdev->dev;
- unsigned int i;
-
- for (i = 0; i < umem->npgs; i++) {
- dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR);
-
- umem->pages[i].dma = 0;
- }
-}
-
static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter,
struct xdp_umem *umem,
u16 qid)
{
struct net_device *netdev = adapter->netdev;
- struct xdp_umem_fq_reuse *reuseq;
bool if_running;
int err;
@@ -78,13 +35,7 @@ static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter,
qid >= netdev->real_num_tx_queues)
return -EINVAL;
- reuseq = xsk_reuseq_prepare(adapter->rx_ring[0]->count);
- if (!reuseq)
- return -ENOMEM;
-
- xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
-
- err = ixgbe_xsk_umem_dma_map(adapter, umem);
+ err = xsk_buff_dma_map(umem, &adapter->pdev->dev, IXGBE_RX_DMA_ATTR);
if (err)
return err;
@@ -124,7 +75,7 @@ static int ixgbe_xsk_umem_disable(struct ixgbe_adapter *adapter, u16 qid)
ixgbe_txrx_ring_disable(adapter, qid);
clear_bit(qid, adapter->af_xdp_zc_qps);
- ixgbe_xsk_umem_dma_unmap(adapter, umem);
+ xsk_buff_dma_unmap(umem, IXGBE_RX_DMA_ATTR);
if (if_running)
ixgbe_txrx_ring_enable(adapter, qid);
@@ -143,19 +94,14 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
struct ixgbe_ring *rx_ring,
struct xdp_buff *xdp)
{
- struct xdp_umem *umem = rx_ring->xsk_umem;
int err, result = IXGBE_XDP_PASS;
struct bpf_prog *xdp_prog;
struct xdp_frame *xdpf;
- u64 offset;
u32 act;
rcu_read_lock();
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
act = bpf_prog_run_xdp(xdp_prog, xdp);
- offset = xdp->data - xdp->data_hard_start;
-
- xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
switch (act) {
case XDP_PASS:
@@ -186,140 +132,16 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
return result;
}
-static struct
-ixgbe_rx_buffer *ixgbe_get_rx_buffer_zc(struct ixgbe_ring *rx_ring,
- unsigned int size)
-{
- struct ixgbe_rx_buffer *bi;
-
- bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
-
- /* we are reusing so sync this buffer for CPU use */
- dma_sync_single_range_for_cpu(rx_ring->dev,
- bi->dma, 0,
- size,
- DMA_BIDIRECTIONAL);
-
- return bi;
-}
-
-static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring,
- struct ixgbe_rx_buffer *obi)
-{
- u16 nta = rx_ring->next_to_alloc;
- struct ixgbe_rx_buffer *nbi;
-
- nbi = &rx_ring->rx_buffer_info[rx_ring->next_to_alloc];
- /* update, and store next to alloc */
- nta++;
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- /* transfer page from old buffer to new buffer */
- nbi->dma = obi->dma;
- nbi->addr = obi->addr;
- nbi->handle = obi->handle;
-
- obi->addr = NULL;
- obi->skb = NULL;
-}
-
-void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
-{
- struct ixgbe_rx_buffer *bi;
- struct ixgbe_ring *rx_ring;
- u64 hr, mask;
- u16 nta;
-
- rx_ring = container_of(alloc, struct ixgbe_ring, zca);
- hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
- mask = rx_ring->xsk_umem->chunk_mask;
-
- nta = rx_ring->next_to_alloc;
- bi = rx_ring->rx_buffer_info;
-
- nta++;
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- handle &= mask;
-
- bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle);
- bi->dma += hr;
-
- bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
- bi->addr += hr;
-
- bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
- rx_ring->xsk_umem->headroom);
-}
-
-static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring,
- struct ixgbe_rx_buffer *bi)
-{
- struct xdp_umem *umem = rx_ring->xsk_umem;
- void *addr = bi->addr;
- u64 handle, hr;
-
- if (addr)
- return true;
-
- if (!xsk_umem_peek_addr(umem, &handle)) {
- rx_ring->rx_stats.alloc_rx_page_failed++;
- return false;
- }
-
- hr = umem->headroom + XDP_PACKET_HEADROOM;
-
- bi->dma = xdp_umem_get_dma(umem, handle);
- bi->dma += hr;
-
- bi->addr = xdp_umem_get_data(umem, handle);
- bi->addr += hr;
-
- bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
-
- xsk_umem_release_addr(umem);
- return true;
-}
-
-static bool ixgbe_alloc_buffer_slow_zc(struct ixgbe_ring *rx_ring,
- struct ixgbe_rx_buffer *bi)
-{
- struct xdp_umem *umem = rx_ring->xsk_umem;
- u64 handle, hr;
-
- if (!xsk_umem_peek_addr_rq(umem, &handle)) {
- rx_ring->rx_stats.alloc_rx_page_failed++;
- return false;
- }
-
- handle &= rx_ring->xsk_umem->chunk_mask;
-
- hr = umem->headroom + XDP_PACKET_HEADROOM;
-
- bi->dma = xdp_umem_get_dma(umem, handle);
- bi->dma += hr;
-
- bi->addr = xdp_umem_get_data(umem, handle);
- bi->addr += hr;
-
- bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
-
- xsk_umem_release_addr_rq(umem);
- return true;
-}
-
-static __always_inline bool
-__ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count,
- bool alloc(struct ixgbe_ring *rx_ring,
- struct ixgbe_rx_buffer *bi))
+bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count)
{
union ixgbe_adv_rx_desc *rx_desc;
struct ixgbe_rx_buffer *bi;
u16 i = rx_ring->next_to_use;
+ dma_addr_t dma;
bool ok = true;
/* nothing to do */
- if (!cleaned_count)
+ if (!count)
return true;
rx_desc = IXGBE_RX_DESC(rx_ring, i);
@@ -327,21 +149,18 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count,
i -= rx_ring->count;
do {
- if (!alloc(rx_ring, bi)) {
+ bi->xdp = xsk_buff_alloc(rx_ring->xsk_umem);
+ if (!bi->xdp) {
ok = false;
break;
}
- /* sync the buffer for use by the device */
- dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
- bi->page_offset,
- rx_ring->rx_buf_len,
- DMA_BIDIRECTIONAL);
+ dma = xsk_buff_xdp_get_dma(bi->xdp);
/* Refresh the desc even if buffer_addrs didn't change
* because each write-back erases this info.
*/
- rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
+ rx_desc->read.pkt_addr = cpu_to_le64(dma);
rx_desc++;
bi++;
@@ -355,17 +174,14 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count,
/* clear the length for the next_to_use descriptor */
rx_desc->wb.upper.length = 0;
- cleaned_count--;
- } while (cleaned_count);
+ count--;
+ } while (count);
i += rx_ring->count;
if (rx_ring->next_to_use != i) {
rx_ring->next_to_use = i;
- /* update next to alloc since we have filled the ring */
- rx_ring->next_to_alloc = i;
-
/* Force memory writes to complete before letting h/w
* know there are new descriptors to fetch. (Only
* applicable for weak-ordered memory model archs,
@@ -378,40 +194,27 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count,
return ok;
}
-void ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count)
-{
- __ixgbe_alloc_rx_buffers_zc(rx_ring, count,
- ixgbe_alloc_buffer_slow_zc);
-}
-
-static bool ixgbe_alloc_rx_buffers_fast_zc(struct ixgbe_ring *rx_ring,
- u16 count)
-{
- return __ixgbe_alloc_rx_buffers_zc(rx_ring, count,
- ixgbe_alloc_buffer_zc);
-}
-
static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring,
- struct ixgbe_rx_buffer *bi,
- struct xdp_buff *xdp)
+ struct ixgbe_rx_buffer *bi)
{
- unsigned int metasize = xdp->data - xdp->data_meta;
- unsigned int datasize = xdp->data_end - xdp->data;
+ unsigned int metasize = bi->xdp->data - bi->xdp->data_meta;
+ unsigned int datasize = bi->xdp->data_end - bi->xdp->data;
struct sk_buff *skb;
/* allocate a skb to store the frags */
skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
- xdp->data_end - xdp->data_hard_start,
+ bi->xdp->data_end - bi->xdp->data_hard_start,
GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!skb))
return NULL;
- skb_reserve(skb, xdp->data - xdp->data_hard_start);
- memcpy(__skb_put(skb, datasize), xdp->data, datasize);
+ skb_reserve(skb, bi->xdp->data - bi->xdp->data_hard_start);
+ memcpy(__skb_put(skb, datasize), bi->xdp->data, datasize);
if (metasize)
skb_metadata_set(skb, metasize);
- ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
+ xsk_buff_free(bi->xdp);
+ bi->xdp = NULL;
return skb;
}
@@ -431,14 +234,9 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
struct ixgbe_adapter *adapter = q_vector->adapter;
u16 cleaned_count = ixgbe_desc_unused(rx_ring);
- struct xdp_umem *umem = rx_ring->xsk_umem;
unsigned int xdp_res, xdp_xmit = 0;
bool failure = false;
struct sk_buff *skb;
- struct xdp_buff xdp;
-
- xdp.rxq = &rx_ring->xdp_rxq;
- xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
while (likely(total_rx_packets < budget)) {
union ixgbe_adv_rx_desc *rx_desc;
@@ -448,8 +246,8 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
/* return some buffers to hardware, one at a time is too slow */
if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
failure = failure ||
- !ixgbe_alloc_rx_buffers_fast_zc(rx_ring,
- cleaned_count);
+ !ixgbe_alloc_rx_buffers_zc(rx_ring,
+ cleaned_count);
cleaned_count = 0;
}
@@ -464,42 +262,40 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
*/
dma_rmb();
- bi = ixgbe_get_rx_buffer_zc(rx_ring, size);
+ bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
if (unlikely(!ixgbe_test_staterr(rx_desc,
IXGBE_RXD_STAT_EOP))) {
struct ixgbe_rx_buffer *next_bi;
- ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
+ xsk_buff_free(bi->xdp);
+ bi->xdp = NULL;
ixgbe_inc_ntc(rx_ring);
next_bi =
&rx_ring->rx_buffer_info[rx_ring->next_to_clean];
- next_bi->skb = ERR_PTR(-EINVAL);
+ next_bi->discard = true;
continue;
}
- if (unlikely(bi->skb)) {
- ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
+ if (unlikely(bi->discard)) {
+ xsk_buff_free(bi->xdp);
+ bi->xdp = NULL;
+ bi->discard = false;
ixgbe_inc_ntc(rx_ring);
continue;
}
- xdp.data = bi->addr;
- xdp.data_meta = xdp.data;
- xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
- xdp.data_end = xdp.data + size;
- xdp.handle = bi->handle;
-
- xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, &xdp);
+ bi->xdp->data_end = bi->xdp->data + size;
+ xsk_buff_dma_sync_for_cpu(bi->xdp);
+ xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, bi->xdp);
if (xdp_res) {
- if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR)) {
+ if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR))
xdp_xmit |= xdp_res;
- bi->addr = NULL;
- bi->skb = NULL;
- } else {
- ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
- }
+ else
+ xsk_buff_free(bi->xdp);
+
+ bi->xdp = NULL;
total_rx_packets++;
total_rx_bytes += size;
@@ -509,7 +305,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
}
/* XDP_PASS path */
- skb = ixgbe_construct_skb_zc(rx_ring, bi, &xdp);
+ skb = ixgbe_construct_skb_zc(rx_ring, bi);
if (!skb) {
rx_ring->rx_stats.alloc_rx_buff_failed++;
break;
@@ -561,17 +357,17 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
void ixgbe_xsk_clean_rx_ring(struct ixgbe_ring *rx_ring)
{
- u16 i = rx_ring->next_to_clean;
- struct ixgbe_rx_buffer *bi = &rx_ring->rx_buffer_info[i];
+ struct ixgbe_rx_buffer *bi;
+ u16 i;
- while (i != rx_ring->next_to_alloc) {
- xsk_umem_fq_reuse(rx_ring->xsk_umem, bi->handle);
- i++;
- bi++;
- if (i == rx_ring->count) {
- i = 0;
- bi = rx_ring->rx_buffer_info;
- }
+ for (i = 0; i < rx_ring->count; i++) {
+ bi = &rx_ring->rx_buffer_info[i];
+
+ if (!bi->xdp)
+ continue;
+
+ xsk_buff_free(bi->xdp);
+ bi->xdp = NULL;
}
}
@@ -594,10 +390,9 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
break;
- dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
-
- dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
- DMA_BIDIRECTIONAL);
+ dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr);
+ xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma,
+ desc.len);
tx_bi = &xdp_ring->tx_buffer_info[xdp_ring->next_to_use];
tx_bi->bytecount = desc.len;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 81fd53569463..4906aee6798d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -365,10 +365,7 @@ struct mlx5e_dma_info {
dma_addr_t addr;
union {
struct page *page;
- struct {
- u64 handle;
- void *data;
- } xsk;
+ struct xdp_buff *xsk;
};
};
@@ -581,7 +578,6 @@ struct mlx5e_rq {
} mpwqe;
};
struct {
- u16 umem_headroom;
u16 headroom;
u32 frame0_sz;
u8 map_dir; /* dma map direction */
@@ -614,7 +610,6 @@ struct mlx5e_rq {
struct page_pool *page_pool;
/* AF_XDP zero-copy */
- struct zero_copy_allocator zca;
struct xdp_umem *umem;
struct work_struct recover_work;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index eb2e1f2138e4..38e4f19d69f8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -12,15 +12,16 @@ static inline bool mlx5e_rx_is_xdp(struct mlx5e_params *params,
u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
- u16 headroom = NET_IP_ALIGN;
+ u16 headroom;
- if (mlx5e_rx_is_xdp(params, xsk)) {
+ if (xsk)
+ return xsk->headroom;
+
+ headroom = NET_IP_ALIGN;
+ if (mlx5e_rx_is_xdp(params, xsk))
headroom += XDP_PACKET_HEADROOM;
- if (xsk)
- headroom += xsk->headroom;
- } else {
+ else
headroom += MLX5_RX_HEADROOM;
- }
return headroom;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 42202d19245c..3bea1d4be53b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -31,7 +31,7 @@
*/
#include <linux/bpf_trace.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include "en/xdp.h"
#include "en/params.h"
@@ -71,7 +71,7 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
xdptxd.data = xdpf->data;
xdptxd.len = xdpf->len;
- if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) {
+ if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
/* The xdp_buff was in the UMEM and was copied into a newly
* allocated page. The UMEM page was returned via the ZCA, and
* this new page has to be mapped at this point and has to be
@@ -119,50 +119,33 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
/* returns true if packet was consumed by xdp */
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
- void *va, u16 *rx_headroom, u32 *len, bool xsk)
+ u32 *len, struct xdp_buff *xdp)
{
struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
- struct xdp_umem *umem = rq->umem;
- struct xdp_buff xdp;
u32 act;
int err;
if (!prog)
return false;
- xdp.data = va + *rx_headroom;
- xdp_set_data_meta_invalid(&xdp);
- xdp.data_end = xdp.data + *len;
- xdp.data_hard_start = va;
- if (xsk)
- xdp.handle = di->xsk.handle;
- xdp.rxq = &rq->xdp_rxq;
- xdp.frame_sz = rq->buff.frame0_sz;
-
- act = bpf_prog_run_xdp(prog, &xdp);
- if (xsk) {
- u64 off = xdp.data - xdp.data_hard_start;
-
- xdp.handle = xsk_umem_adjust_offset(umem, xdp.handle, off);
- }
+ act = bpf_prog_run_xdp(prog, xdp);
switch (act) {
case XDP_PASS:
- *rx_headroom = xdp.data - xdp.data_hard_start;
- *len = xdp.data_end - xdp.data;
+ *len = xdp->data_end - xdp->data;
return false;
case XDP_TX:
- if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, &xdp)))
+ if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, xdp)))
goto xdp_abort;
__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */
return true;
case XDP_REDIRECT:
/* When XDP enabled then page-refcnt==1 here */
- err = xdp_do_redirect(rq->netdev, &xdp, prog);
+ err = xdp_do_redirect(rq->netdev, xdp, prog);
if (unlikely(err))
goto xdp_abort;
__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
__set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags);
- if (!xsk)
+ if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL)
mlx5e_page_dma_unmap(rq, di);
rq->stats->xdp_redirect++;
return true;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
index be64eb68f4e5..ca48c293151b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
@@ -61,7 +61,7 @@
struct mlx5e_xsk_param;
int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk);
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
- void *va, u16 *rx_headroom, u32 *len, bool xsk);
+ u32 *len, struct xdp_buff *xdp);
void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq);
bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq);
void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 62fc8a128a8d..a33a1f762c70 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -3,71 +3,10 @@
#include "rx.h"
#include "en/xdp.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
/* RX data path */
-bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count)
-{
- /* Check in advance that we have enough frames, instead of allocating
- * one-by-one, failing and moving frames to the Reuse Ring.
- */
- return xsk_umem_has_addrs_rq(rq->umem, count);
-}
-
-int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
- struct mlx5e_dma_info *dma_info)
-{
- struct xdp_umem *umem = rq->umem;
- u64 handle;
-
- if (!xsk_umem_peek_addr_rq(umem, &handle))
- return -ENOMEM;
-
- dma_info->xsk.handle = xsk_umem_adjust_offset(umem, handle,
- rq->buff.umem_headroom);
- dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle);
-
- /* No need to add headroom to the DMA address. In striding RQ case, we
- * just provide pages for UMR, and headroom is counted at the setup
- * stage when creating a WQE. In non-striding RQ case, headroom is
- * accounted in mlx5e_alloc_rx_wqe.
- */
- dma_info->addr = xdp_umem_get_dma(umem, handle);
-
- xsk_umem_release_addr_rq(umem);
-
- dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE,
- DMA_BIDIRECTIONAL);
-
- return 0;
-}
-
-static inline void mlx5e_xsk_recycle_frame(struct mlx5e_rq *rq, u64 handle)
-{
- xsk_umem_fq_reuse(rq->umem, handle & rq->umem->chunk_mask);
-}
-
-/* XSKRQ uses pages from UMEM, they must not be released. They are returned to
- * the userspace if possible, and if not, this function is called to reuse them
- * in the driver.
- */
-void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
- struct mlx5e_dma_info *dma_info)
-{
- mlx5e_xsk_recycle_frame(rq, dma_info->xsk.handle);
-}
-
-/* Return a frame back to the hardware to fill in again. It is used by XDP when
- * the XDP program returns XDP_TX or XDP_REDIRECT not to an XSKMAP.
- */
-void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle)
-{
- struct mlx5e_rq *rq = container_of(zca, struct mlx5e_rq, zca);
-
- mlx5e_xsk_recycle_frame(rq, handle);
-}
-
static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data,
u32 cqe_bcnt)
{
@@ -90,11 +29,8 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
u32 head_offset,
u32 page_idx)
{
- struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
- u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
+ struct xdp_buff *xdp = wi->umr.dma_info[page_idx].xsk;
u32 cqe_bcnt32 = cqe_bcnt;
- void *va, *data;
- u32 frag_size;
bool consumed;
/* Check packet size. Note LRO doesn't use linear SKB */
@@ -103,22 +39,20 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
return NULL;
}
- /* head_offset is not used in this function, because di->xsk.data and
- * di->addr point directly to the necessary place. Furthermore, in the
- * current implementation, UMR pages are mapped to XSK frames, so
+ /* head_offset is not used in this function, because xdp->data and the
+ * DMA address point directly to the necessary place. Furthermore, in
+ * the current implementation, UMR pages are mapped to XSK frames, so
* head_offset should always be 0.
*/
WARN_ON_ONCE(head_offset);
- va = di->xsk.data;
- data = va + rx_headroom;
- frag_size = rq->buff.headroom + cqe_bcnt32;
-
- dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
- prefetch(data);
+ xdp->data_end = xdp->data + cqe_bcnt32;
+ xdp_set_data_meta_invalid(xdp);
+ xsk_buff_dma_sync_for_cpu(xdp);
+ prefetch(xdp->data);
rcu_read_lock();
- consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, true);
+ consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt32, xdp);
rcu_read_unlock();
/* Possible flows:
@@ -145,7 +79,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
/* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the
* frame. On SKB allocation failure, NULL is returned.
*/
- return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt32);
+ return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt32);
}
struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
@@ -153,25 +87,20 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *wi,
u32 cqe_bcnt)
{
- struct mlx5e_dma_info *di = wi->di;
- u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
- void *va, *data;
+ struct xdp_buff *xdp = wi->di->xsk;
bool consumed;
- u32 frag_size;
- /* wi->offset is not used in this function, because di->xsk.data and
- * di->addr point directly to the necessary place. Furthermore, in the
- * current implementation, one page = one packet = one frame, so
+ /* wi->offset is not used in this function, because xdp->data and the
+ * DMA address point directly to the necessary place. Furthermore, the
+ * XSK allocator allocates frames per packet, instead of pages, so
* wi->offset should always be 0.
*/
WARN_ON_ONCE(wi->offset);
- va = di->xsk.data;
- data = va + rx_headroom;
- frag_size = rq->buff.headroom + cqe_bcnt;
-
- dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
- prefetch(data);
+ xdp->data_end = xdp->data + cqe_bcnt;
+ xdp_set_data_meta_invalid(xdp);
+ xsk_buff_dma_sync_for_cpu(xdp);
+ prefetch(xdp->data);
if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) {
rq->stats->wqe_err++;
@@ -179,7 +108,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
}
rcu_read_lock();
- consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, true);
+ consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt, xdp);
rcu_read_unlock();
if (likely(consumed))
@@ -189,5 +118,5 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
* will be handled by mlx5e_put_rx_frag.
* On SKB allocation failure, NULL is returned.
*/
- return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt);
+ return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt);
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
index cab0e93497ae..d147b2f13b54 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -5,16 +5,10 @@
#define __MLX5_EN_XSK_RX_H__
#include "en.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
/* RX data path */
-bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count);
-int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
- struct mlx5e_dma_info *dma_info);
-void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
- struct mlx5e_dma_info *dma_info);
-void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle);
struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
struct mlx5e_mpw_info *wi,
u16 cqe_bcnt,
@@ -25,6 +19,23 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *wi,
u32 cqe_bcnt);
+static inline int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
+ struct mlx5e_dma_info *dma_info)
+{
+ dma_info->xsk = xsk_buff_alloc(rq->umem);
+ if (!dma_info->xsk)
+ return -ENOMEM;
+
+ /* Store the DMA address without headroom. In striding RQ case, we just
+ * provide pages for UMR, and headroom is counted at the setup stage
+ * when creating a WQE. In non-striding RQ case, headroom is accounted
+ * in mlx5e_alloc_rx_wqe.
+ */
+ dma_info->addr = xsk_buff_xdp_get_frame_dma(dma_info->xsk);
+
+ return 0;
+}
+
static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err)
{
if (!xsk_umem_uses_need_wakeup(rq->umem))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
index 3bcdb5b2fc20..83dce9cdb8c2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
@@ -5,7 +5,7 @@
#include "umem.h"
#include "en/xdp.h"
#include "en/params.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
{
@@ -92,12 +92,11 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
break;
}
- xdptxd.dma_addr = xdp_umem_get_dma(umem, desc.addr);
- xdptxd.data = xdp_umem_get_data(umem, desc.addr);
+ xdptxd.dma_addr = xsk_buff_raw_get_dma(umem, desc.addr);
+ xdptxd.data = xsk_buff_raw_get_data(umem, desc.addr);
xdptxd.len = desc.len;
- dma_sync_single_for_device(sq->pdev, xdptxd.dma_addr,
- xdptxd.len, DMA_BIDIRECTIONAL);
+ xsk_buff_raw_dma_sync_for_device(umem, xdptxd.dma_addr, xdptxd.len);
if (unlikely(!sq->xmit_xdp_frame(sq, &xdptxd, &xdpi, check_result))) {
if (sq->mpwqe.wqe)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
index 79b487d89757..39fa0a705856 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
@@ -5,7 +5,7 @@
#define __MLX5_EN_XSK_TX_H__
#include "en.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
/* TX data path */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c
index 4baaa5788320..7b17fcd0a56d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include "umem.h"
#include "setup.h"
#include "en/params.h"
@@ -10,40 +10,14 @@ static int mlx5e_xsk_map_umem(struct mlx5e_priv *priv,
struct xdp_umem *umem)
{
struct device *dev = priv->mdev->device;
- u32 i;
- for (i = 0; i < umem->npgs; i++) {
- dma_addr_t dma = dma_map_page(dev, umem->pgs[i], 0, PAGE_SIZE,
- DMA_BIDIRECTIONAL);
-
- if (unlikely(dma_mapping_error(dev, dma)))
- goto err_unmap;
- umem->pages[i].dma = dma;
- }
-
- return 0;
-
-err_unmap:
- while (i--) {
- dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL);
- umem->pages[i].dma = 0;
- }
-
- return -ENOMEM;
+ return xsk_buff_dma_map(umem, dev, 0);
}
static void mlx5e_xsk_unmap_umem(struct mlx5e_priv *priv,
struct xdp_umem *umem)
{
- struct device *dev = priv->mdev->device;
- u32 i;
-
- for (i = 0; i < umem->npgs; i++) {
- dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL);
- umem->pages[i].dma = 0;
- }
+ return xsk_buff_dma_unmap(umem, 0);
}
static int mlx5e_xsk_get_umems(struct mlx5e_xsk *xsk)
@@ -90,13 +64,14 @@ static void mlx5e_xsk_remove_umem(struct mlx5e_xsk *xsk, u16 ix)
static bool mlx5e_xsk_is_umem_sane(struct xdp_umem *umem)
{
- return umem->headroom <= 0xffff && umem->chunk_size_nohr <= 0xffff;
+ return xsk_umem_get_headroom(umem) <= 0xffff &&
+ xsk_umem_get_chunk_size(umem) <= 0xffff;
}
void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk)
{
- xsk->headroom = umem->headroom;
- xsk->chunk_size = umem->chunk_size_nohr + umem->headroom;
+ xsk->headroom = xsk_umem_get_headroom(umem);
+ xsk->chunk_size = xsk_umem_get_chunk_size(umem);
}
static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv,
@@ -241,18 +216,6 @@ int mlx5e_xsk_setup_umem(struct net_device *dev, struct xdp_umem *umem, u16 qid)
mlx5e_xsk_disable_umem(priv, ix);
}
-int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries)
-{
- struct xdp_umem_fq_reuse *reuseq;
-
- reuseq = xsk_reuseq_prepare(nentries);
- if (unlikely(!reuseq))
- return -ENOMEM;
- xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
-
- return 0;
-}
-
u16 mlx5e_xsk_first_unused_channel(struct mlx5e_params *params, struct mlx5e_xsk *xsk)
{
u16 res = xsk->refcnt ? params->num_channels : 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 07823abe5557..8b86c6ded302 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -38,7 +38,7 @@
#include <linux/bpf.h>
#include <linux/if_bridge.h>
#include <net/page_pool.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include "eswitch.h"
#include "en.h"
#include "en/txrx.h"
@@ -373,7 +373,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
struct mlx5_core_dev *mdev = c->mdev;
void *rqc = rqp->rqc;
void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
- u32 num_xsk_frames = 0;
u32 rq_xdp_ix;
u32 pool_size;
int wq_sz;
@@ -413,7 +412,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, xsk);
- rq->buff.umem_headroom = xsk ? xsk->headroom : 0;
pool_size = 1 << params->log_rq_mtu_frames;
switch (rq->wq_type) {
@@ -427,10 +425,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
- if (xsk)
- num_xsk_frames = wq_sz <<
- mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk);
-
pool_size = MLX5_MPWRQ_PAGES_PER_WQE <<
mlx5e_mpwqe_get_log_rq_size(params, xsk);
@@ -482,9 +476,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq);
- if (xsk)
- num_xsk_frames = wq_sz << rq->wqe.info.log_num_frags;
-
rq->wqe.info = rqp->frags_info;
rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride;
@@ -525,19 +516,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
}
if (xsk) {
- rq->buff.frame0_sz = xsk_umem_xdp_frame_sz(umem);
-
- err = mlx5e_xsk_resize_reuseq(umem, num_xsk_frames);
- if (unlikely(err)) {
- mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n",
- num_xsk_frames);
- goto err_free;
- }
-
- rq->zca.free = mlx5e_xsk_zca_free;
err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
- MEM_TYPE_ZERO_COPY,
- &rq->zca);
+ MEM_TYPE_XSK_BUFF_POOL, NULL);
+ xsk_buff_set_rxq_info(rq->umem, &rq->xdp_rxq);
} else {
/* Create a page_pool and register it with rxq */
pp_params.order = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index a514685fb560..fdba52136c5d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -300,7 +300,7 @@ static inline void mlx5e_page_release(struct mlx5e_rq *rq,
* put into the Reuse Ring, because there is no way to return
* the page to the userspace when the interface goes down.
*/
- mlx5e_xsk_page_release(rq, dma_info);
+ xsk_buff_free(dma_info->xsk);
else
mlx5e_page_release_dynamic(rq, dma_info, recycle);
}
@@ -385,7 +385,11 @@ static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
if (rq->umem) {
int pages_desired = wqe_bulk << rq->wqe.info.log_num_frags;
- if (unlikely(!mlx5e_xsk_pages_enough_umem(rq, pages_desired)))
+ /* Check in advance that we have enough frames, instead of
+ * allocating one-by-one, failing and moving frames to the
+ * Reuse Ring.
+ */
+ if (unlikely(!xsk_buff_can_alloc(rq->umem, pages_desired)))
return -ENOMEM;
}
@@ -480,8 +484,11 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
int err;
int i;
+ /* Check in advance that we have enough frames, instead of allocating
+ * one-by-one, failing and moving frames to the Reuse Ring.
+ */
if (rq->umem &&
- unlikely(!mlx5e_xsk_pages_enough_umem(rq, MLX5_MPWRQ_PAGES_PER_WQE))) {
+ unlikely(!xsk_buff_can_alloc(rq->umem, MLX5_MPWRQ_PAGES_PER_WQE))) {
err = -ENOMEM;
goto err;
}
@@ -1044,12 +1051,24 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
return skb;
}
+static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom,
+ u32 len, struct xdp_buff *xdp)
+{
+ xdp->data_hard_start = va;
+ xdp_set_data_meta_invalid(xdp);
+ xdp->data = va + headroom;
+ xdp->data_end = xdp->data + len;
+ xdp->rxq = &rq->xdp_rxq;
+ xdp->frame_sz = rq->buff.frame0_sz;
+}
+
struct sk_buff *
mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
{
struct mlx5e_dma_info *di = wi->di;
u16 rx_headroom = rq->buff.headroom;
+ struct xdp_buff xdp;
struct sk_buff *skb;
void *va, *data;
bool consumed;
@@ -1065,11 +1084,13 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
prefetch(data);
rcu_read_lock();
- consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, false);
+ mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
+ consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt, &xdp);
rcu_read_unlock();
if (consumed)
return NULL; /* page/packet was consumed by XDP */
+ rx_headroom = xdp.data - xdp.data_hard_start;
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt);
if (unlikely(!skb))
@@ -1343,6 +1364,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
u16 rx_headroom = rq->buff.headroom;
u32 cqe_bcnt32 = cqe_bcnt;
+ struct xdp_buff xdp;
struct sk_buff *skb;
void *va, *data;
u32 frag_size;
@@ -1364,7 +1386,8 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
prefetch(data);
rcu_read_lock();
- consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, false);
+ mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt32, &xdp);
+ consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt32, &xdp);
rcu_read_unlock();
if (consumed) {
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
@@ -1372,6 +1395,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
return NULL; /* page/packet was consumed by XDP */
}
+ rx_headroom = xdp.data - xdp.data_hard_start;
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32);
if (unlikely(!skb))
diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c
index 1e0c024b0a93..8e4141552423 100644
--- a/drivers/net/hyperv/netvsc_bpf.c
+++ b/drivers/net/hyperv/netvsc_bpf.c
@@ -50,7 +50,6 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
xdp->data_end = xdp->data + len;
xdp->rxq = &nvchan->xdp_rxq;
xdp->frame_sz = PAGE_SIZE;
- xdp->handle = 0;
memcpy(xdp->data, data, len);
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 272626cc3fc9..c66c545e161a 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -396,6 +396,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
}
#define cgroup_bpf_enabled (0)
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; })
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 3094fccf5a88..90f11760bd12 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -39,7 +39,7 @@ enum xdp_mem_type {
MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */
MEM_TYPE_PAGE_POOL,
- MEM_TYPE_ZERO_COPY,
+ MEM_TYPE_XSK_BUFF_POOL,
MEM_TYPE_MAX,
};
@@ -54,10 +54,6 @@ struct xdp_mem_info {
struct page_pool;
-struct zero_copy_allocator {
- void (*free)(struct zero_copy_allocator *zca, unsigned long handle);
-};
-
struct xdp_rxq_info {
struct net_device *dev;
u32 queue_index;
@@ -70,7 +66,6 @@ struct xdp_buff {
void *data_end;
void *data_meta;
void *data_hard_start;
- unsigned long handle;
struct xdp_rxq_info *rxq;
u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
};
@@ -119,7 +114,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
int metasize;
int headroom;
- if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
+ if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
return xdp_convert_zc_to_xdp_frame(xdp);
/* Assure headroom is available for storing info */
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index abd72de25fa4..96bfc5f5f24e 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -15,40 +15,15 @@
struct net_device;
struct xsk_queue;
-
-/* Masks for xdp_umem_page flags.
- * The low 12-bits of the addr will be 0 since this is the page address, so we
- * can use them for flags.
- */
-#define XSK_NEXT_PG_CONTIG_SHIFT 0
-#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT)
-
-struct xdp_umem_page {
- void *addr;
- dma_addr_t dma;
-};
-
-struct xdp_umem_fq_reuse {
- u32 nentries;
- u32 length;
- u64 handles[];
-};
-
-/* Flags for the umem flags field.
- *
- * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
- * flags. See inlude/uapi/include/linux/if_xdp.h.
- */
-#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1)
+struct xdp_buff;
struct xdp_umem {
struct xsk_queue *fq;
struct xsk_queue *cq;
- struct xdp_umem_page *pages;
- u64 chunk_mask;
+ struct xsk_buff_pool *pool;
u64 size;
u32 headroom;
- u32 chunk_size_nohr;
+ u32 chunk_size;
struct user_struct *user;
refcount_t users;
struct work_struct work;
@@ -59,28 +34,17 @@ struct xdp_umem {
u8 flags;
int id;
struct net_device *dev;
- struct xdp_umem_fq_reuse *fq_reuse;
bool zc;
spinlock_t xsk_tx_list_lock;
struct list_head xsk_tx_list;
};
-/* Nodes are linked in the struct xdp_sock map_list field, and used to
- * track which maps a certain socket reside in.
- */
-
struct xsk_map {
struct bpf_map map;
spinlock_t lock; /* Synchronize map updates */
struct xdp_sock *xsk_map[];
};
-struct xsk_map_node {
- struct list_head node;
- struct xsk_map *map;
- struct xdp_sock **map_entry;
-};
-
struct xdp_sock {
/* struct sock must be the first member of struct xdp_sock */
struct sock sk;
@@ -111,32 +75,9 @@ struct xdp_sock {
spinlock_t map_list_lock;
};
-struct xdp_buff;
#ifdef CONFIG_XDP_SOCKETS
-int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
-bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
-/* Used from netdev driver */
-bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt);
-bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
-void xsk_umem_release_addr(struct xdp_umem *umem);
-void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
-bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc);
-void xsk_umem_consume_tx_done(struct xdp_umem *umem);
-struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
-struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
- struct xdp_umem_fq_reuse *newq);
-void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
-struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
-void xsk_set_rx_need_wakeup(struct xdp_umem *umem);
-void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
-void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
-void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
-bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
-void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
- struct xdp_sock **map_entry);
-int xsk_map_inc(struct xsk_map *map);
-void xsk_map_put(struct xsk_map *map);
+int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
void __xsk_map_flush(void);
@@ -153,230 +94,13 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
return xs;
}
-static inline u64 xsk_umem_extract_addr(u64 addr)
-{
- return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
-}
-
-static inline u64 xsk_umem_extract_offset(u64 addr)
-{
- return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
-}
-
-static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
-{
- return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr);
-}
-
-static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
-{
- unsigned long page_addr;
-
- addr = xsk_umem_add_offset_to_addr(addr);
- page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr;
-
- return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK);
-}
-
-static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
-{
- addr = xsk_umem_add_offset_to_addr(addr);
-
- return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK);
-}
-
-/* Reuse-queue aware version of FILL queue helpers */
-static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
-{
- struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
-
- if (rq->length >= cnt)
- return true;
-
- return xsk_umem_has_addrs(umem, cnt - rq->length);
-}
-
-static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
-{
- struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
-
- if (!rq->length)
- return xsk_umem_peek_addr(umem, addr);
-
- *addr = rq->handles[rq->length - 1];
- return addr;
-}
-
-static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem)
-{
- struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
-
- if (!rq->length)
- xsk_umem_release_addr(umem);
- else
- rq->length--;
-}
-
-static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
-{
- struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
-
- rq->handles[rq->length++] = addr;
-}
-
-/* Handle the offset appropriately depending on aligned or unaligned mode.
- * For unaligned mode, we store the offset in the upper 16-bits of the address.
- * For aligned mode, we simply add the offset to the address.
- */
-static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address,
- u64 offset)
-{
- if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG)
- return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
- else
- return address + offset;
-}
-
-static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem)
-{
- return umem->chunk_size_nohr + umem->headroom;
-}
-
#else
+
static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
return -ENOTSUPP;
}
-static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
-{
- return false;
-}
-
-static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
-{
- return false;
-}
-
-static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
-{
- return NULL;
-}
-
-static inline void xsk_umem_release_addr(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
-{
-}
-
-static inline bool xsk_umem_consume_tx(struct xdp_umem *umem,
- struct xdp_desc *desc)
-{
- return false;
-}
-
-static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
-{
-}
-
-static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
-{
- return NULL;
-}
-
-static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap(
- struct xdp_umem *umem,
- struct xdp_umem_fq_reuse *newq)
-{
- return NULL;
-}
-static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
-{
-}
-
-static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
- u16 queue_id)
-{
- return NULL;
-}
-
-static inline u64 xsk_umem_extract_addr(u64 addr)
-{
- return 0;
-}
-
-static inline u64 xsk_umem_extract_offset(u64 addr)
-{
- return 0;
-}
-
-static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
-{
- return 0;
-}
-
-static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
-{
- return NULL;
-}
-
-static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
-{
- return 0;
-}
-
-static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
-{
- return false;
-}
-
-static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
-{
- return NULL;
-}
-
-static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
-{
-}
-
-static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
-{
-}
-
-static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
-{
- return false;
-}
-
-static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle,
- u64 offset)
-{
- return 0;
-}
-
-static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem)
-{
- return 0;
-}
-
static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
return -EOPNOTSUPP;
@@ -391,6 +115,7 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
{
return NULL;
}
+
#endif /* CONFIG_XDP_SOCKETS */
#endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
new file mode 100644
index 000000000000..ccf848f7efa4
--- /dev/null
+++ b/include/net/xdp_sock_drv.h
@@ -0,0 +1,232 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Interface for implementing AF_XDP zero-copy support in drivers.
+ * Copyright(c) 2020 Intel Corporation.
+ */
+
+#ifndef _LINUX_XDP_SOCK_DRV_H
+#define _LINUX_XDP_SOCK_DRV_H
+
+#include <net/xdp_sock.h>
+#include <net/xsk_buff_pool.h>
+
+#ifdef CONFIG_XDP_SOCKETS
+
+void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
+bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc);
+void xsk_umem_consume_tx_done(struct xdp_umem *umem);
+struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
+void xsk_set_rx_need_wakeup(struct xdp_umem *umem);
+void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
+void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
+void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
+bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
+
+static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem)
+{
+ return XDP_PACKET_HEADROOM + umem->headroom;
+}
+
+static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem)
+{
+ return umem->chunk_size;
+}
+
+static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem)
+{
+ return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem);
+}
+
+static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem,
+ struct xdp_rxq_info *rxq)
+{
+ xp_set_rxq_info(umem->pool, rxq);
+}
+
+static inline void xsk_buff_dma_unmap(struct xdp_umem *umem,
+ unsigned long attrs)
+{
+ xp_dma_unmap(umem->pool, attrs);
+}
+
+static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev,
+ unsigned long attrs)
+{
+ return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs);
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp)
+{
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+ return xp_get_dma(xskb);
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp)
+{
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+ return xp_get_frame_dma(xskb);
+}
+
+static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem)
+{
+ return xp_alloc(umem->pool);
+}
+
+static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count)
+{
+ return xp_can_alloc(umem->pool, count);
+}
+
+static inline void xsk_buff_free(struct xdp_buff *xdp)
+{
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+ xp_free(xskb);
+}
+
+static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr)
+{
+ return xp_raw_get_dma(umem->pool, addr);
+}
+
+static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr)
+{
+ return xp_raw_get_data(umem->pool, addr);
+}
+
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+{
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+ xp_dma_sync_for_cpu(xskb);
+}
+
+static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem,
+ dma_addr_t dma,
+ size_t size)
+{
+ xp_dma_sync_for_device(umem->pool, dma, size);
+}
+
+#else
+
+static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
+{
+}
+
+static inline bool xsk_umem_consume_tx(struct xdp_umem *umem,
+ struct xdp_desc *desc)
+{
+ return false;
+}
+
+static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
+{
+}
+
+static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
+ u16 queue_id)
+{
+ return NULL;
+}
+
+static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+{
+ return false;
+}
+
+static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem)
+{
+ return 0;
+}
+
+static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem)
+{
+ return 0;
+}
+
+static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem)
+{
+ return 0;
+}
+
+static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem,
+ struct xdp_rxq_info *rxq)
+{
+}
+
+static inline void xsk_buff_dma_unmap(struct xdp_umem *umem,
+ unsigned long attrs)
+{
+}
+
+static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev,
+ unsigned long attrs)
+{
+ return 0;
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp)
+{
+ return 0;
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp)
+{
+ return 0;
+}
+
+static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem)
+{
+ return NULL;
+}
+
+static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count)
+{
+ return false;
+}
+
+static inline void xsk_buff_free(struct xdp_buff *xdp)
+{
+}
+
+static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr)
+{
+ return 0;
+}
+
+static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr)
+{
+ return NULL;
+}
+
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+{
+}
+
+static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem,
+ dma_addr_t dma,
+ size_t size)
+{
+}
+
+#endif /* CONFIG_XDP_SOCKETS */
+
+#endif /* _LINUX_XDP_SOCK_DRV_H */
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
new file mode 100644
index 000000000000..a4ff226505c9
--- /dev/null
+++ b/include/net/xsk_buff_pool.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2020 Intel Corporation. */
+
+#ifndef XSK_BUFF_POOL_H_
+#define XSK_BUFF_POOL_H_
+
+#include <linux/if_xdp.h>
+#include <linux/types.h>
+#include <linux/dma-mapping.h>
+#include <net/xdp.h>
+
+struct xsk_buff_pool;
+struct xdp_rxq_info;
+struct xsk_queue;
+struct xdp_desc;
+struct device;
+struct page;
+
+struct xdp_buff_xsk {
+ struct xdp_buff xdp;
+ dma_addr_t dma;
+ dma_addr_t frame_dma;
+ struct xsk_buff_pool *pool;
+ bool unaligned;
+ u64 orig_addr;
+ struct list_head free_list_node;
+};
+
+struct xsk_buff_pool {
+ struct xsk_queue *fq;
+ struct list_head free_list;
+ dma_addr_t *dma_pages;
+ struct xdp_buff_xsk *heads;
+ u64 chunk_mask;
+ u64 addrs_cnt;
+ u32 free_list_cnt;
+ u32 dma_pages_cnt;
+ u32 heads_cnt;
+ u32 free_heads_cnt;
+ u32 headroom;
+ u32 chunk_size;
+ u32 frame_len;
+ bool cheap_dma;
+ bool unaligned;
+ void *addrs;
+ struct device *dev;
+ struct xdp_buff_xsk *free_heads[];
+};
+
+/* AF_XDP core. */
+struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks,
+ u32 chunk_size, u32 headroom, u64 size,
+ bool unaligned);
+void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq);
+void xp_destroy(struct xsk_buff_pool *pool);
+void xp_release(struct xdp_buff_xsk *xskb);
+
+/* AF_XDP, and XDP core. */
+void xp_free(struct xdp_buff_xsk *xskb);
+
+/* AF_XDP ZC drivers, via xdp_sock_buff.h */
+void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq);
+int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
+ unsigned long attrs, struct page **pages, u32 nr_pages);
+void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs);
+struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool);
+bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count);
+void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr);
+dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr);
+static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb)
+{
+ return xskb->dma;
+}
+
+static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb)
+{
+ return xskb->frame_dma;
+}
+
+void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb);
+static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb)
+{
+ if (xskb->pool->cheap_dma)
+ return;
+
+ xp_dma_sync_for_cpu_slow(xskb);
+}
+
+void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
+ size_t size);
+static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
+ dma_addr_t dma, size_t size)
+{
+ if (pool->cheap_dma)
+ return;
+
+ xp_dma_sync_for_device_slow(pool, dma, size);
+}
+
+/* Masks for xdp_umem_page flags.
+ * The low 12-bits of the addr will be 0 since this is the page address, so we
+ * can use them for flags.
+ */
+#define XSK_NEXT_PG_CONTIG_SHIFT 0
+#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
+
+static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
+ u64 addr, u32 len)
+{
+ bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
+
+ if (pool->dma_pages_cnt && cross_pg) {
+ return !(pool->dma_pages[addr >> PAGE_SHIFT] &
+ XSK_NEXT_PG_CONTIG_MASK);
+ }
+ return false;
+}
+
+static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr)
+{
+ return addr & pool->chunk_mask;
+}
+
+static inline u64 xp_unaligned_extract_addr(u64 addr)
+{
+ return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
+}
+
+static inline u64 xp_unaligned_extract_offset(u64 addr)
+{
+ return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
+}
+
+static inline u64 xp_unaligned_add_offset_to_addr(u64 addr)
+{
+ return xp_unaligned_extract_addr(addr) +
+ xp_unaligned_extract_offset(addr);
+}
+
+#endif /* XSK_BUFF_POOL_H_ */
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index b95d65e8c628..b73d3e141323 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -287,7 +287,7 @@ TRACE_EVENT(xdp_devmap_xmit,
FN(PAGE_SHARED) \
FN(PAGE_ORDER0) \
FN(PAGE_POOL) \
- FN(ZERO_COPY)
+ FN(XSK_BUFF_POOL)
#define __MEM_TYPE_TP_FN(x) \
TRACE_DEFINE_ENUM(MEM_TYPE_##x);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b9b8a0f63b91..97e1fd19ff58 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -220,6 +220,10 @@ enum bpf_attach_type {
BPF_MODIFY_RETURN,
BPF_LSM_MAC,
BPF_TRACE_ITER,
+ BPF_CGROUP_INET4_GETPEERNAME,
+ BPF_CGROUP_INET6_GETPEERNAME,
+ BPF_CGROUP_INET4_GETSOCKNAME,
+ BPF_CGROUP_INET6_GETSOCKNAME,
__MAX_BPF_ATTACH_TYPE
};
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 37b2d8620153..375b933010dd 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -12,9 +12,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
-ifeq ($(CONFIG_XDP_SOCKETS),y)
-obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
-endif
obj-$(CONFIG_BPF_SYSCALL) += offload.o
endif
ifeq ($(CONFIG_PERF_EVENTS),y)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 57dfc98289d5..431241c74614 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
@@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
@@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 25b14ee0e26d..d2e27dba4ac6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -393,6 +393,15 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
type == PTR_TO_XDP_SOCK;
}
+static bool reg_type_not_null(enum bpf_reg_type type)
+{
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_MAP_VALUE ||
+ type == PTR_TO_SOCK_COMMON ||
+ type == PTR_TO_BTF_ID;
+}
+
static bool reg_type_may_be_null(enum bpf_reg_type type)
{
return type == PTR_TO_MAP_VALUE_OR_NULL ||
@@ -6308,8 +6317,25 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
bool is_jmp32)
{
- if (__is_pointer_value(false, reg))
- return -1;
+ if (__is_pointer_value(false, reg)) {
+ if (!reg_type_not_null(reg->type))
+ return -1;
+
+ /* If pointer is valid tests against zero will fail so we can
+ * use this to direct branch taken.
+ */
+ if (val != 0)
+ return -1;
+
+ switch (opcode) {
+ case BPF_JEQ:
+ return 0;
+ case BPF_JNE:
+ return 1;
+ default:
+ return -1;
+ }
+ }
if (is_jmp32)
return is_branch32_taken(reg, val, opcode);
@@ -6808,7 +6834,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
if (pred >= 0) {
- err = mark_chain_precision(env, insn->dst_reg);
+ /* If we get here with a dst_reg pointer type it is because
+ * above is_branch_taken() special cased the 0 comparison.
+ */
+ if (!__is_pointer_value(false, dst_reg))
+ err = mark_chain_precision(env, insn->dst_reg);
if (BPF_SRC(insn->code) == BPF_X && !err)
err = mark_chain_precision(env, insn->src_reg);
if (err)
@@ -7094,7 +7124,11 @@ static int check_return_code(struct bpf_verifier_env *env)
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
- env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
+ env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
range = tnum_range(1, 1);
break;
case BPF_PROG_TYPE_CGROUP_SKB:
@@ -7120,10 +7154,11 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_TRACE_FEXIT:
range = tnum_const(0);
break;
- case BPF_TRACE_ITER:
case BPF_TRACE_RAW_TP:
case BPF_MODIFY_RETURN:
return 0;
+ case BPF_TRACE_ITER:
+ break;
default:
return -ENOTSUPP;
}
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 30ba7d38941d..bfd4ccd80847 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -160,16 +160,20 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
u32 headroom, u32 tailroom)
{
void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
+ u32 user_size = kattr->test.data_size_in;
void *data;
if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom)
return ERR_PTR(-EINVAL);
+ if (user_size > size)
+ return ERR_PTR(-EMSGSIZE);
+
data = kzalloc(size + headroom + tailroom, GFP_USER);
if (!data)
return ERR_PTR(-ENOMEM);
- if (copy_from_user(data + headroom, data_in, size)) {
+ if (copy_from_user(data + headroom, data_in, user_size)) {
kfree(data);
return ERR_PTR(-EFAULT);
}
@@ -486,8 +490,6 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
/* XDP have extra tailroom as (most) drivers use full page */
max_data_sz = 4096 - headroom - tailroom;
- if (size > max_data_sz)
- return -EINVAL;
data = bpf_test_init(kattr, max_data_sz, headroom, tailroom);
if (IS_ERR(data))
diff --git a/net/core/filter.c b/net/core/filter.c
index 822d662f97ef..bd2853d23b50 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7049,6 +7049,8 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
break;
@@ -7060,6 +7062,8 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP6_RECVMSG:
break;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 490b8f5fa8ee..90f44f382115 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -17,6 +17,7 @@
#include <net/xdp.h>
#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
#include <trace/events/xdp.h>
+#include <net/xdp_sock_drv.h>
#define REG_STATE_NEW 0x0
#define REG_STATE_REGISTERED 0x1
@@ -109,27 +110,6 @@ static void mem_allocator_disconnect(void *allocator)
mutex_unlock(&mem_id_lock);
}
-static void mem_id_disconnect(int id)
-{
- struct xdp_mem_allocator *xa;
-
- mutex_lock(&mem_id_lock);
-
- xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
- if (!xa) {
- mutex_unlock(&mem_id_lock);
- WARN(1, "Request remove non-existing id(%d), driver bug?", id);
- return;
- }
-
- trace_mem_disconnect(xa);
-
- if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
- call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
-
- mutex_unlock(&mem_id_lock);
-}
-
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
@@ -143,9 +123,6 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
if (id == 0)
return;
- if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY)
- return mem_id_disconnect(id);
-
if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
rcu_read_lock();
xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
@@ -301,7 +278,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
xdp_rxq->mem.type = type;
if (!allocator) {
- if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
+ if (type == MEM_TYPE_PAGE_POOL)
return -EINVAL; /* Setup time check page_pool req */
return 0;
}
@@ -358,10 +335,11 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
* scenarios (e.g. queue full), it is possible to return the xdp_frame
* while still leveraging this protection. The @napi_direct boolean
* is used for those calls sites. Thus, allowing for faster recycling
- * of xdp_frames/pages in those cases.
+ * of xdp_frames/pages in those cases. This path is never used by the
+ * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of
+ * the switch-statement.
*/
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
- unsigned long handle)
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
{
struct xdp_mem_allocator *xa;
struct page *page;
@@ -383,36 +361,29 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
page = virt_to_page(data); /* Assumes order0 page*/
put_page(page);
break;
- case MEM_TYPE_ZERO_COPY:
- /* NB! Only valid from an xdp_buff! */
- rcu_read_lock();
- /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
- xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- xa->zc_alloc->free(xa->zc_alloc, handle);
- rcu_read_unlock();
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+ WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
break;
}
}
void xdp_return_frame(struct xdp_frame *xdpf)
{
- __xdp_return(xdpf->data, &xdpf->mem, false, 0);
+ __xdp_return(xdpf->data, &xdpf->mem, false);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
- __xdp_return(xdpf->data, &xdpf->mem, true, 0);
+ __xdp_return(xdpf->data, &xdpf->mem, true);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
void xdp_return_buff(struct xdp_buff *xdp)
{
- __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
+ __xdp_return(xdp->data, &xdp->rxq->mem, true);
}
-EXPORT_SYMBOL_GPL(xdp_return_buff);
/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
@@ -493,7 +464,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
xdpf->metasize = metasize;
xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
- xdp_return_buff(xdp);
+ xsk_buff_free(xdp);
return xdpf;
}
EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c
index 3aa4975919d7..9ef54cdcf662 100644
--- a/net/ethtool/channels.c
+++ b/net/ethtool/channels.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include "netlink.h"
#include "common.h"
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index eeb1137a3f23..31e0b4e88a9d 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -24,7 +24,7 @@
#include <linux/sched/signal.h>
#include <linux/net.h>
#include <net/devlink.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include <net/flow_offload.h>
#include <linux/ethtool_netlink.h>
#include <generated/utsrelease.h>
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c35a8b2e0499..02aa5cb3a4fd 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -756,12 +756,11 @@ do_err:
}
EXPORT_SYMBOL(inet_accept);
-
/*
* This does both peername and sockname.
*/
int inet_getname(struct socket *sock, struct sockaddr *uaddr,
- int peer)
+ int peer)
{
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
@@ -782,6 +781,11 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
}
+ if (cgroup_bpf_enabled)
+ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+ peer ? BPF_CGROUP_INET4_GETPEERNAME :
+ BPF_CGROUP_INET4_GETSOCKNAME,
+ NULL);
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
return sizeof(*sin);
}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b69496eaf922..0625a97a8894 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -505,9 +505,8 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock);
/*
* This does both peername and sockname.
*/
-
int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
- int peer)
+ int peer)
{
struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
struct sock *sk = sock->sk;
@@ -532,9 +531,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
sin->sin6_addr = np->saddr;
else
sin->sin6_addr = sk->sk_v6_rcv_saddr;
-
sin->sin6_port = inet->inet_sport;
}
+ if (cgroup_bpf_enabled)
+ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+ peer ? BPF_CGROUP_INET6_GETPEERNAME :
+ BPF_CGROUP_INET6_GETSOCKNAME,
+ NULL);
sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
sk->sk_bound_dev_if);
return sizeof(*sin);
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
index 71e2bdafb2ce..30cdc4315f42 100644
--- a/net/xdp/Makefile
+++ b/net/xdp/Makefile
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o
obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 37ace3bc0d48..19e59d1a5e9f 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -179,37 +179,6 @@ void xdp_umem_clear_dev(struct xdp_umem *umem)
umem->zc = false;
}
-static void xdp_umem_unmap_pages(struct xdp_umem *umem)
-{
- unsigned int i;
-
- for (i = 0; i < umem->npgs; i++)
- if (PageHighMem(umem->pgs[i]))
- vunmap(umem->pages[i].addr);
-}
-
-static int xdp_umem_map_pages(struct xdp_umem *umem)
-{
- unsigned int i;
- void *addr;
-
- for (i = 0; i < umem->npgs; i++) {
- if (PageHighMem(umem->pgs[i]))
- addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);
- else
- addr = page_address(umem->pgs[i]);
-
- if (!addr) {
- xdp_umem_unmap_pages(umem);
- return -ENOMEM;
- }
-
- umem->pages[i].addr = addr;
- }
-
- return 0;
-}
-
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
@@ -244,14 +213,9 @@ static void xdp_umem_release(struct xdp_umem *umem)
umem->cq = NULL;
}
- xsk_reuseq_destroy(umem);
-
- xdp_umem_unmap_pages(umem);
+ xp_destroy(umem->pool);
xdp_umem_unpin_pages(umem);
- kvfree(umem->pages);
- umem->pages = NULL;
-
xdp_umem_unaccount_pages(umem);
kfree(umem);
}
@@ -385,11 +349,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
return -EINVAL;
- umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
- : ~((u64)chunk_size - 1);
umem->size = size;
umem->headroom = headroom;
- umem->chunk_size_nohr = chunk_size - headroom;
+ umem->chunk_size = chunk_size;
umem->npgs = size / PAGE_SIZE;
umem->pgs = NULL;
umem->user = NULL;
@@ -407,18 +369,13 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (err)
goto out_account;
- umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages),
- GFP_KERNEL_ACCOUNT);
- if (!umem->pages) {
+ umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size,
+ headroom, size, unaligned_chunks);
+ if (!umem->pool) {
err = -ENOMEM;
goto out_pin;
}
-
- err = xdp_umem_map_pages(umem);
- if (!err)
- return 0;
-
- kvfree(umem->pages);
+ return 0;
out_pin:
xdp_umem_unpin_pages(umem);
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index a63a9fb251f5..32067fe98f65 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -6,7 +6,7 @@
#ifndef XDP_UMEM_H_
#define XDP_UMEM_H_
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
u16 queue_id, u16 flags);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 45ffd67b367d..b6c0f08bd80d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -22,7 +22,7 @@
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
#include "xsk_queue.h"
@@ -39,24 +39,6 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
READ_ONCE(xs->umem->fq);
}
-bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
-{
- return xskq_cons_has_entries(umem->fq, cnt);
-}
-EXPORT_SYMBOL(xsk_umem_has_addrs);
-
-bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
-{
- return xskq_cons_peek_addr(umem->fq, addr, umem);
-}
-EXPORT_SYMBOL(xsk_umem_peek_addr);
-
-void xsk_umem_release_addr(struct xdp_umem *umem)
-{
- xskq_cons_release(umem->fq);
-}
-EXPORT_SYMBOL(xsk_umem_release_addr);
-
void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
if (umem->need_wakeup & XDP_WAKEUP_RX)
@@ -117,76 +99,82 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
}
EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
-/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for
- * each page. This is only required in copy mode.
- */
-static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
- u32 len, u32 metalen)
+void xp_release(struct xdp_buff_xsk *xskb)
{
- void *to_buf = xdp_umem_get_data(umem, addr);
-
- addr = xsk_umem_add_offset_to_addr(addr);
- if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) {
- void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
- u64 page_start = addr & ~(PAGE_SIZE - 1);
- u64 first_len = PAGE_SIZE - (addr - page_start);
-
- memcpy(to_buf, from_buf, first_len);
- memcpy(next_pg_addr, from_buf + first_len,
- len + metalen - first_len);
+ xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
+}
- return;
- }
+static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
+{
+ u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
- memcpy(to_buf, from_buf, len + metalen);
+ offset += xskb->pool->headroom;
+ if (!xskb->pool->unaligned)
+ return xskb->orig_addr + offset;
+ return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
}
-static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
- u64 offset = xs->umem->headroom;
- u64 addr, memcpy_addr;
- void *from_buf;
- u32 metalen;
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+ u64 addr;
int err;
- if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
- len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
+ addr = xp_get_handle(xskb);
+ err = xskq_prod_reserve_desc(xs->rx, addr, len);
+ if (err) {
xs->rx_dropped++;
- return -ENOSPC;
+ return err;
}
- if (unlikely(xdp_data_meta_unsupported(xdp))) {
- from_buf = xdp->data;
- metalen = 0;
- } else {
- from_buf = xdp->data_meta;
- metalen = xdp->data - xdp->data_meta;
- }
+ xp_release(xskb);
+ return 0;
+}
- memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
- __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
+static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
+{
+ void *from_buf, *to_buf;
+ u32 metalen;
- offset += metalen;
- addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
- err = xskq_prod_reserve_desc(xs->rx, addr, len);
- if (!err) {
- xskq_cons_release(xs->umem->fq);
- xdp_return_buff(xdp);
- return 0;
+ if (unlikely(xdp_data_meta_unsupported(from))) {
+ from_buf = from->data;
+ to_buf = to->data;
+ metalen = 0;
+ } else {
+ from_buf = from->data_meta;
+ metalen = from->data - from->data_meta;
+ to_buf = to->data - metalen;
}
- xs->rx_dropped++;
- return err;
+ memcpy(to_buf, from_buf, len + metalen);
}
-static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
+ bool explicit_free)
{
- int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len);
+ struct xdp_buff *xsk_xdp;
+ int err;
- if (err)
+ if (len > xsk_umem_get_rx_frame_size(xs->umem)) {
xs->rx_dropped++;
+ return -ENOSPC;
+ }
- return err;
+ xsk_xdp = xsk_buff_alloc(xs->umem);
+ if (!xsk_xdp) {
+ xs->rx_dropped++;
+ return -ENOSPC;
+ }
+
+ xsk_copy_xdp(xsk_xdp, xdp, len);
+ err = __xsk_rcv_zc(xs, xsk_xdp, len);
+ if (err) {
+ xsk_buff_free(xsk_xdp);
+ return err;
+ }
+ if (explicit_free)
+ xdp_return_buff(xdp);
+ return 0;
}
static bool xsk_is_bound(struct xdp_sock *xs)
@@ -199,7 +187,8 @@ static bool xsk_is_bound(struct xdp_sock *xs)
return false;
}
-static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
+ bool explicit_free)
{
u32 len;
@@ -211,8 +200,9 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
len = xdp->data_end - xdp->data;
- return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
- __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
+ return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
+ __xsk_rcv_zc(xs, xdp, len) :
+ __xsk_rcv(xs, xdp, len, explicit_free);
}
static void xsk_flush(struct xdp_sock *xs)
@@ -224,46 +214,11 @@ static void xsk_flush(struct xdp_sock *xs)
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
- u32 metalen = xdp->data - xdp->data_meta;
- u32 len = xdp->data_end - xdp->data;
- u64 offset = xs->umem->headroom;
- void *buffer;
- u64 addr;
int err;
spin_lock_bh(&xs->rx_lock);
-
- if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
- err = -EINVAL;
- goto out_unlock;
- }
-
- if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
- len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
- err = -ENOSPC;
- goto out_drop;
- }
-
- addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
- buffer = xdp_umem_get_data(xs->umem, addr);
- memcpy(buffer, xdp->data_meta, len + metalen);
-
- addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
- err = xskq_prod_reserve_desc(xs->rx, addr, len);
- if (err)
- goto out_drop;
-
- xskq_cons_release(xs->umem->fq);
- xskq_prod_submit(xs->rx);
-
- spin_unlock_bh(&xs->rx_lock);
-
- xs->sk.sk_data_ready(&xs->sk);
- return 0;
-
-out_drop:
- xs->rx_dropped++;
-out_unlock:
+ err = xsk_rcv(xs, xdp, false);
+ xsk_flush(xs);
spin_unlock_bh(&xs->rx_lock);
return err;
}
@@ -273,7 +228,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
int err;
- err = xsk_rcv(xs, xdp);
+ err = xsk_rcv(xs, xdp, true);
if (err)
return err;
@@ -404,7 +359,7 @@ static int xsk_generic_xmit(struct sock *sk)
skb_put(skb, len);
addr = desc.addr;
- buffer = xdp_umem_get_data(xs->umem, addr);
+ buffer = xsk_buff_raw_get_data(xs->umem, addr);
err = skb_store_bits(skb, 0, buffer, len);
/* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
@@ -629,24 +584,6 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd)
return sock;
}
-/* Check if umem pages are contiguous.
- * If zero-copy mode, use the DMA address to do the page contiguity check
- * For all other modes we use addr (kernel virtual address)
- * Store the result in the low bits of addr.
- */
-static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
-{
- struct xdp_umem_page *pgs = umem->pages;
- int i, is_contig;
-
- for (i = 0; i < umem->npgs - 1; i++) {
- is_contig = (flags & XDP_ZEROCOPY) ?
- (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
- (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
- pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
- }
-}
-
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
@@ -729,23 +666,14 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
goto out_unlock;
} else {
/* This xsk has its own umem. */
- xskq_set_umem(xs->umem->fq, xs->umem->size,
- xs->umem->chunk_mask);
- xskq_set_umem(xs->umem->cq, xs->umem->size,
- xs->umem->chunk_mask);
-
err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
if (err)
goto out_unlock;
-
- xsk_check_page_contiguity(xs->umem, flags);
}
xs->dev = dev;
xs->zc = xs->umem->zc;
xs->queue_id = qid;
- xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
- xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
xdp_add_sk_umem(xs->umem, xs);
out_unlock:
@@ -860,6 +788,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
&xs->umem->cq;
err = xsk_init_queue(entries, q, true);
+ if (optname == XDP_UMEM_FILL_RING)
+ xp_set_fq(xs->umem->pool, *q);
mutex_unlock(&xs->mutex);
return err;
}
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index 4cfd106bdb53..455ddd480f3d 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -4,6 +4,20 @@
#ifndef XSK_H_
#define XSK_H_
+/* Masks for xdp_umem_page flags.
+ * The low 12-bits of the addr will be 0 since this is the page address, so we
+ * can use them for flags.
+ */
+#define XSK_NEXT_PG_CONTIG_SHIFT 0
+#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
+
+/* Flags for the umem flags field.
+ *
+ * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
+ * flags. See inlude/uapi/include/linux/if_xdp.h.
+ */
+#define XDP_UMEM_USES_NEED_WAKEUP BIT(1)
+
struct xdp_ring_offset_v1 {
__u64 producer;
__u64 consumer;
@@ -17,9 +31,25 @@ struct xdp_mmap_offsets_v1 {
struct xdp_ring_offset_v1 cr;
};
+/* Nodes are linked in the struct xdp_sock map_list field, and used to
+ * track which maps a certain socket reside in.
+ */
+
+struct xsk_map_node {
+ struct list_head node;
+ struct xsk_map *map;
+ struct xdp_sock **map_entry;
+};
+
static inline struct xdp_sock *xdp_sk(struct sock *sk)
{
return (struct xdp_sock *)sk;
}
+bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+ struct xdp_sock **map_entry);
+int xsk_map_inc(struct xsk_map *map);
+void xsk_map_put(struct xsk_map *map);
+
#endif /* XSK_H_ */
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
new file mode 100644
index 000000000000..540ed75e4482
--- /dev/null
+++ b/net/xdp/xsk_buff_pool.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <net/xsk_buff_pool.h>
+#include <net/xdp_sock.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
+#include <linux/swiotlb.h>
+
+#include "xsk_queue.h"
+
+static void xp_addr_unmap(struct xsk_buff_pool *pool)
+{
+ vunmap(pool->addrs);
+}
+
+static int xp_addr_map(struct xsk_buff_pool *pool,
+ struct page **pages, u32 nr_pages)
+{
+ pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (!pool->addrs)
+ return -ENOMEM;
+ return 0;
+}
+
+void xp_destroy(struct xsk_buff_pool *pool)
+{
+ if (!pool)
+ return;
+
+ xp_addr_unmap(pool);
+ kvfree(pool->heads);
+ kvfree(pool);
+}
+
+struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks,
+ u32 chunk_size, u32 headroom, u64 size,
+ bool unaligned)
+{
+ struct xsk_buff_pool *pool;
+ struct xdp_buff_xsk *xskb;
+ int err;
+ u32 i;
+
+ pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL);
+ if (!pool)
+ goto out;
+
+ pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL);
+ if (!pool->heads)
+ goto out;
+
+ pool->chunk_mask = ~((u64)chunk_size - 1);
+ pool->addrs_cnt = size;
+ pool->heads_cnt = chunks;
+ pool->free_heads_cnt = chunks;
+ pool->headroom = headroom;
+ pool->chunk_size = chunk_size;
+ pool->cheap_dma = true;
+ pool->unaligned = unaligned;
+ pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM;
+ INIT_LIST_HEAD(&pool->free_list);
+
+ for (i = 0; i < pool->free_heads_cnt; i++) {
+ xskb = &pool->heads[i];
+ xskb->pool = pool;
+ xskb->xdp.frame_sz = chunk_size - headroom;
+ pool->free_heads[i] = xskb;
+ }
+
+ err = xp_addr_map(pool, pages, nr_pages);
+ if (!err)
+ return pool;
+
+out:
+ xp_destroy(pool);
+ return NULL;
+}
+
+void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq)
+{
+ pool->fq = fq;
+}
+
+void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
+{
+ u32 i;
+
+ for (i = 0; i < pool->heads_cnt; i++)
+ pool->heads[i].xdp.rxq = rxq;
+}
+EXPORT_SYMBOL(xp_set_rxq_info);
+
+void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
+{
+ dma_addr_t *dma;
+ u32 i;
+
+ if (pool->dma_pages_cnt == 0)
+ return;
+
+ for (i = 0; i < pool->dma_pages_cnt; i++) {
+ dma = &pool->dma_pages[i];
+ if (*dma) {
+ dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE,
+ DMA_BIDIRECTIONAL, attrs);
+ *dma = 0;
+ }
+ }
+
+ kvfree(pool->dma_pages);
+ pool->dma_pages_cnt = 0;
+ pool->dev = NULL;
+}
+EXPORT_SYMBOL(xp_dma_unmap);
+
+static void xp_check_dma_contiguity(struct xsk_buff_pool *pool)
+{
+ u32 i;
+
+ for (i = 0; i < pool->dma_pages_cnt - 1; i++) {
+ if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1])
+ pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
+ else
+ pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
+ }
+}
+
+static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool)
+{
+#if defined(CONFIG_SWIOTLB)
+ phys_addr_t paddr;
+ u32 i;
+
+ for (i = 0; i < pool->dma_pages_cnt; i++) {
+ paddr = dma_to_phys(pool->dev, pool->dma_pages[i]);
+ if (is_swiotlb_buffer(paddr))
+ return false;
+ }
+#endif
+ return true;
+}
+
+static bool xp_check_cheap_dma(struct xsk_buff_pool *pool)
+{
+#if defined(CONFIG_HAS_DMA)
+ const struct dma_map_ops *ops = get_dma_ops(pool->dev);
+
+ if (ops) {
+ return !ops->sync_single_for_cpu &&
+ !ops->sync_single_for_device;
+ }
+
+ if (!dma_is_direct(ops))
+ return false;
+
+ if (!xp_check_swiotlb_dma(pool))
+ return false;
+
+ if (!dev_is_dma_coherent(pool->dev)) {
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE)
+ return false;
+#endif
+ }
+#endif
+ return true;
+}
+
+int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
+ unsigned long attrs, struct page **pages, u32 nr_pages)
+{
+ dma_addr_t dma;
+ u32 i;
+
+ pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages),
+ GFP_KERNEL);
+ if (!pool->dma_pages)
+ return -ENOMEM;
+
+ pool->dev = dev;
+ pool->dma_pages_cnt = nr_pages;
+
+ for (i = 0; i < pool->dma_pages_cnt; i++) {
+ dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
+ DMA_BIDIRECTIONAL, attrs);
+ if (dma_mapping_error(dev, dma)) {
+ xp_dma_unmap(pool, attrs);
+ return -ENOMEM;
+ }
+ pool->dma_pages[i] = dma;
+ }
+
+ if (pool->unaligned)
+ xp_check_dma_contiguity(pool);
+
+ pool->dev = dev;
+ pool->cheap_dma = xp_check_cheap_dma(pool);
+ return 0;
+}
+EXPORT_SYMBOL(xp_dma_map);
+
+static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool,
+ u64 addr)
+{
+ return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size);
+}
+
+static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr)
+{
+ *addr = xp_unaligned_extract_addr(*addr);
+ if (*addr >= pool->addrs_cnt ||
+ *addr + pool->chunk_size > pool->addrs_cnt ||
+ xp_addr_crosses_non_contig_pg(pool, *addr))
+ return false;
+ return true;
+}
+
+static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr)
+{
+ *addr = xp_aligned_extract_addr(pool, *addr);
+ return *addr < pool->addrs_cnt;
+}
+
+static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool)
+{
+ struct xdp_buff_xsk *xskb;
+ u64 addr;
+ bool ok;
+
+ if (pool->free_heads_cnt == 0)
+ return NULL;
+
+ xskb = pool->free_heads[--pool->free_heads_cnt];
+
+ for (;;) {
+ if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) {
+ xp_release(xskb);
+ return NULL;
+ }
+
+ ok = pool->unaligned ? xp_check_unaligned(pool, &addr) :
+ xp_check_aligned(pool, &addr);
+ if (!ok) {
+ pool->fq->invalid_descs++;
+ xskq_cons_release(pool->fq);
+ continue;
+ }
+ break;
+ }
+ xskq_cons_release(pool->fq);
+
+ xskb->orig_addr = addr;
+ xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom;
+ if (pool->dma_pages_cnt) {
+ xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] &
+ ~XSK_NEXT_PG_CONTIG_MASK) +
+ (addr & ~PAGE_MASK);
+ xskb->dma = xskb->frame_dma + pool->headroom +
+ XDP_PACKET_HEADROOM;
+ }
+ return xskb;
+}
+
+struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
+{
+ struct xdp_buff_xsk *xskb;
+
+ if (!pool->free_list_cnt) {
+ xskb = __xp_alloc(pool);
+ if (!xskb)
+ return NULL;
+ } else {
+ pool->free_list_cnt--;
+ xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk,
+ free_list_node);
+ list_del(&xskb->free_list_node);
+ }
+
+ xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM;
+ xskb->xdp.data_meta = xskb->xdp.data;
+
+ if (!pool->cheap_dma) {
+ dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
+ pool->frame_len,
+ DMA_BIDIRECTIONAL);
+ }
+ return &xskb->xdp;
+}
+EXPORT_SYMBOL(xp_alloc);
+
+bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count)
+{
+ if (pool->free_list_cnt >= count)
+ return true;
+ return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt);
+}
+EXPORT_SYMBOL(xp_can_alloc);
+
+void xp_free(struct xdp_buff_xsk *xskb)
+{
+ xskb->pool->free_list_cnt++;
+ list_add(&xskb->free_list_node, &xskb->pool->free_list);
+}
+EXPORT_SYMBOL(xp_free);
+
+void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
+{
+ addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
+ return pool->addrs + addr;
+}
+EXPORT_SYMBOL(xp_raw_get_data);
+
+dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
+{
+ addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
+ return (pool->dma_pages[addr >> PAGE_SHIFT] &
+ ~XSK_NEXT_PG_CONTIG_MASK) +
+ (addr & ~PAGE_MASK);
+}
+EXPORT_SYMBOL(xp_raw_get_dma);
+
+void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb)
+{
+ dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0,
+ xskb->pool->frame_len, DMA_BIDIRECTIONAL);
+}
+EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow);
+
+void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
+ size_t size)
+{
+ dma_sync_single_range_for_device(pool->dev, dma, 0,
+ size, DMA_BIDIRECTIONAL);
+}
+EXPORT_SYMBOL(xp_dma_sync_for_device_slow);
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index f59791ba43a0..0163b26aaf63 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
du.id = umem->id;
du.size = umem->size;
du.num_pages = umem->npgs;
- du.chunk_size = umem->chunk_size_nohr + umem->headroom;
+ du.chunk_size = umem->chunk_size;
du.headroom = umem->headroom;
du.ifindex = umem->dev ? umem->dev->ifindex : 0;
du.queue_id = umem->queue_id;
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index 57fb81bd593c..6cf9586e5027 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -6,18 +6,10 @@
#include <linux/log2.h>
#include <linux/slab.h>
#include <linux/overflow.h>
+#include <net/xdp_sock_drv.h>
#include "xsk_queue.h"
-void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask)
-{
- if (!q)
- return;
-
- q->umem_size = umem_size;
- q->chunk_mask = chunk_mask;
-}
-
static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue)
{
struct xdp_umem_ring *umem_ring;
@@ -63,56 +55,3 @@ void xskq_destroy(struct xsk_queue *q)
page_frag_free(q->ring);
kfree(q);
}
-
-struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
-{
- struct xdp_umem_fq_reuse *newq;
-
- /* Check for overflow */
- if (nentries > (u32)roundup_pow_of_two(nentries))
- return NULL;
- nentries = roundup_pow_of_two(nentries);
-
- newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL);
- if (!newq)
- return NULL;
- memset(newq, 0, offsetof(typeof(*newq), handles));
-
- newq->nentries = nentries;
- return newq;
-}
-EXPORT_SYMBOL_GPL(xsk_reuseq_prepare);
-
-struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
- struct xdp_umem_fq_reuse *newq)
-{
- struct xdp_umem_fq_reuse *oldq = umem->fq_reuse;
-
- if (!oldq) {
- umem->fq_reuse = newq;
- return NULL;
- }
-
- if (newq->nentries < oldq->length)
- return newq;
-
- memcpy(newq->handles, oldq->handles,
- array_size(oldq->length, sizeof(u64)));
- newq->length = oldq->length;
-
- umem->fq_reuse = newq;
- return oldq;
-}
-EXPORT_SYMBOL_GPL(xsk_reuseq_swap);
-
-void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
-{
- kvfree(rq);
-}
-EXPORT_SYMBOL_GPL(xsk_reuseq_free);
-
-void xsk_reuseq_destroy(struct xdp_umem *umem)
-{
- xsk_reuseq_free(umem->fq_reuse);
- umem->fq_reuse = NULL;
-}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 648733ec24ac..5b5d24d2dd37 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -9,6 +9,9 @@
#include <linux/types.h>
#include <linux/if_xdp.h>
#include <net/xdp_sock.h>
+#include <net/xsk_buff_pool.h>
+
+#include "xsk.h"
struct xdp_ring {
u32 producer ____cacheline_aligned_in_smp;
@@ -29,8 +32,6 @@ struct xdp_umem_ring {
};
struct xsk_queue {
- u64 chunk_mask;
- u64 umem_size;
u32 ring_mask;
u32 nentries;
u32 cached_prod;
@@ -103,98 +104,73 @@ struct xsk_queue {
/* Functions that read and validate content from consumer rings. */
-static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem,
- u64 addr,
- u64 length)
+static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
{
- bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE;
- bool next_pg_contig =
- (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr &
- XSK_NEXT_PG_CONTIG_MASK;
-
- return cross_pg && !next_pg_contig;
-}
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
-static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q,
- u64 addr,
- u64 length,
- struct xdp_umem *umem)
-{
- u64 base_addr = xsk_umem_extract_addr(addr);
+ if (q->cached_cons != q->cached_prod) {
+ u32 idx = q->cached_cons & q->ring_mask;
- addr = xsk_umem_add_offset_to_addr(addr);
- if (base_addr >= q->umem_size || addr >= q->umem_size ||
- xskq_cons_crosses_non_contig_pg(umem, addr, length)) {
- q->invalid_descs++;
- return false;
+ *addr = ring->desc[idx];
+ return true;
}
- return true;
+ return false;
}
-static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr)
+static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
{
- if (addr >= q->umem_size) {
- q->invalid_descs++;
+ u64 chunk, chunk_end;
+
+ chunk = xp_aligned_extract_addr(pool, desc->addr);
+ chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len);
+ if (chunk != chunk_end)
+ return false;
+
+ if (chunk >= pool->addrs_cnt)
return false;
- }
+ if (desc->options)
+ return false;
return true;
}
-static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr,
- struct xdp_umem *umem)
+static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
{
- struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
-
- while (q->cached_cons != q->cached_prod) {
- u32 idx = q->cached_cons & q->ring_mask;
+ u64 addr, base_addr;
- *addr = ring->desc[idx] & q->chunk_mask;
+ base_addr = xp_unaligned_extract_addr(desc->addr);
+ addr = xp_unaligned_add_offset_to_addr(desc->addr);
- if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
- if (xskq_cons_is_valid_unaligned(q, *addr,
- umem->chunk_size_nohr,
- umem))
- return true;
- goto out;
- }
+ if (desc->len > pool->chunk_size)
+ return false;
- if (xskq_cons_is_valid_addr(q, *addr))
- return true;
+ if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt ||
+ xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
+ return false;
-out:
- q->cached_cons++;
- }
+ if (desc->options)
+ return false;
+ return true;
+}
- return false;
+static inline bool xp_validate_desc(struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
+{
+ return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) :
+ xp_aligned_validate_desc(pool, desc);
}
static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
struct xdp_desc *d,
struct xdp_umem *umem)
{
- if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
- if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem))
- return false;
-
- if (d->len > umem->chunk_size_nohr || d->options) {
- q->invalid_descs++;
- return false;
- }
-
- return true;
- }
-
- if (!xskq_cons_is_valid_addr(q, d->addr))
- return false;
-
- if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) ||
- d->options) {
+ if (!xp_validate_desc(umem->pool, d)) {
q->invalid_descs++;
return false;
}
-
return true;
}
@@ -250,12 +226,11 @@ static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
return entries >= cnt;
}
-static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr,
- struct xdp_umem *umem)
+static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
{
if (q->cached_prod == q->cached_cons)
xskq_cons_get_entries(q);
- return xskq_cons_read_addr(q, addr, umem);
+ return xskq_cons_read_addr_unchecked(q, addr);
}
static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
@@ -379,11 +354,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
return q ? q->invalid_descs : 0;
}
-void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask);
struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
void xskq_destroy(struct xsk_queue *q_ops);
-/* Executed by the core when the entire UMEM gets freed */
-void xsk_reuseq_destroy(struct xdp_umem *umem);
-
#endif /* _LINUX_XSK_QUEUE_H */
diff --git a/kernel/bpf/xskmap.c b/net/xdp/xskmap.c
index 2cc5c8f4c800..1dc7208c71ba 100644
--- a/kernel/bpf/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -9,6 +9,8 @@
#include <linux/slab.h>
#include <linux/sched.h>
+#include "xsk.h"
+
int xsk_map_inc(struct xsk_map *map)
{
bpf_map_inc(&map->map);
diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore
index 23837f2ed458..034800c4d1e6 100644
--- a/samples/bpf/.gitignore
+++ b/samples/bpf/.gitignore
@@ -50,3 +50,4 @@ xdp_rxq_info
xdp_sample_pkts
xdp_tx_iptunnel
xdpsock
+testfile.img
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 424f6fe7ce38..8403e4762306 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -63,14 +63,14 @@ TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
fds_example-objs := fds_example.o
sockex1-objs := sockex1_user.o
sockex2-objs := sockex2_user.o
-sockex3-objs := bpf_load.o sockex3_user.o
-tracex1-objs := bpf_load.o tracex1_user.o $(TRACE_HELPERS)
-tracex2-objs := bpf_load.o tracex2_user.o
-tracex3-objs := bpf_load.o tracex3_user.o
-tracex4-objs := bpf_load.o tracex4_user.o
-tracex5-objs := bpf_load.o tracex5_user.o $(TRACE_HELPERS)
-tracex6-objs := bpf_load.o tracex6_user.o
-tracex7-objs := bpf_load.o tracex7_user.o
+sockex3-objs := sockex3_user.o
+tracex1-objs := tracex1_user.o $(TRACE_HELPERS)
+tracex2-objs := tracex2_user.o
+tracex3-objs := tracex3_user.o
+tracex4-objs := tracex4_user.o
+tracex5-objs := tracex5_user.o $(TRACE_HELPERS)
+tracex6-objs := tracex6_user.o
+tracex7-objs := tracex7_user.o
test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o
trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS)
lathist-objs := bpf_load.o lathist_user.o
diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c
index e504dc308371..f24806ac24e7 100644
--- a/samples/bpf/sampleip_kern.c
+++ b/samples/bpf/sampleip_kern.c
@@ -13,12 +13,12 @@
#define MAX_IPS 8192
-struct bpf_map_def SEC("maps") ip_map = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(u64),
- .value_size = sizeof(u32),
- .max_entries = MAX_IPS,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u64);
+ __type(value, u32);
+ __uint(max_entries, MAX_IPS);
+} ip_map SEC(".maps");
SEC("perf_event")
int do_sample(struct bpf_perf_event_data *ctx)
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
index 4372d2da2f9e..921c505bb567 100644
--- a/samples/bpf/sampleip_user.c
+++ b/samples/bpf/sampleip_user.c
@@ -18,9 +18,6 @@
#include "perf-sys.h"
#include "trace_helpers.h"
-#define __must_check
-#include <linux/err.h>
-
#define DEFAULT_FREQ 99
#define DEFAULT_SECS 5
#define MAX_IPS 8192
@@ -57,7 +54,7 @@ static int sampling_start(int freq, struct bpf_program *prog,
return 1;
}
links[i] = bpf_program__attach_perf_event(prog, pmu_fd);
- if (IS_ERR(links[i])) {
+ if (libbpf_get_error(links[i])) {
fprintf(stderr, "ERROR: Attach perf event\n");
links[i] = NULL;
close(pmu_fd);
@@ -182,7 +179,7 @@ int main(int argc, char **argv)
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
obj = bpf_object__open_file(filename, NULL);
- if (IS_ERR(obj)) {
+ if (libbpf_get_error(obj)) {
fprintf(stderr, "ERROR: opening BPF object file failed\n");
obj = NULL;
goto cleanup;
diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c
index 779a5249c418..cab9cca0b8eb 100644
--- a/samples/bpf/sockex3_kern.c
+++ b/samples/bpf/sockex3_kern.c
@@ -19,12 +19,12 @@
#define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F
-struct bpf_map_def SEC("maps") jmp_table = {
- .type = BPF_MAP_TYPE_PROG_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u32),
- .max_entries = 8,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u32));
+ __uint(max_entries, 8);
+} jmp_table SEC(".maps");
#define PARSE_VLAN 1
#define PARSE_MPLS 2
@@ -92,12 +92,12 @@ struct globals {
struct flow_key_record flow;
};
-struct bpf_map_def SEC("maps") percpu_map = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(__u32),
- .value_size = sizeof(struct globals),
- .max_entries = 32,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, __u32);
+ __type(value, struct globals);
+ __uint(max_entries, 32);
+} percpu_map SEC(".maps");
/* user poor man's per_cpu until native support is ready */
static struct globals *this_cpu_globals(void)
@@ -113,12 +113,12 @@ struct pair {
__u64 bytes;
};
-struct bpf_map_def SEC("maps") hash_map = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(struct flow_key_record),
- .value_size = sizeof(struct pair),
- .max_entries = 1024,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct flow_key_record);
+ __type(value, struct pair);
+ __uint(max_entries, 1024);
+} hash_map SEC(".maps");
static void update_stats(struct __sk_buff *skb, struct globals *g)
{
diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c
index bbb1cd0666a9..4dbee7427d47 100644
--- a/samples/bpf/sockex3_user.c
+++ b/samples/bpf/sockex3_user.c
@@ -1,18 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <assert.h>
-#include <linux/bpf.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
#include "sock_example.h"
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/resource.h>
-#define PARSE_IP 3
-#define PARSE_IP_PROG_FD (prog_fd[0])
-#define PROG_ARRAY_FD (map_fd[0])
-
struct flow_key_record {
__be32 src;
__be32 dst;
@@ -30,31 +25,55 @@ struct pair {
int main(int argc, char **argv)
{
+ int i, sock, key, fd, main_prog_fd, jmp_table_fd, hash_map_fd;
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
+ const char *title;
FILE *f;
- int i, sock, err, id, key = PARSE_IP;
- struct bpf_prog_info info = {};
- uint32_t info_len = sizeof(info);
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
setrlimit(RLIMIT_MEMLOCK, &r);
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ jmp_table_fd = bpf_object__find_map_fd_by_name(obj, "jmp_table");
+ hash_map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map");
+ if (jmp_table_fd < 0 || hash_map_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
}
- /* Test fd array lookup which returns the id of the bpf_prog */
- err = bpf_obj_get_info_by_fd(PARSE_IP_PROG_FD, &info, &info_len);
- assert(!err);
- err = bpf_map_lookup_elem(PROG_ARRAY_FD, &key, &id);
- assert(!err);
- assert(id == info.id);
+ bpf_object__for_each_program(prog, obj) {
+ fd = bpf_program__fd(prog);
+
+ title = bpf_program__title(prog, false);
+ if (sscanf(title, "socket/%d", &key) != 1) {
+ fprintf(stderr, "ERROR: finding prog failed\n");
+ goto cleanup;
+ }
+
+ if (key == 0)
+ main_prog_fd = fd;
+ else
+ bpf_map_update_elem(jmp_table_fd, &key, &fd, BPF_ANY);
+ }
sock = open_raw_sock("lo");
- assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd[4],
+ /* attach BPF program to socket */
+ assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &main_prog_fd,
sizeof(__u32)) == 0);
if (argc > 1)
@@ -69,8 +88,8 @@ int main(int argc, char **argv)
sleep(1);
printf("IP src.port -> dst.port bytes packets\n");
- while (bpf_map_get_next_key(map_fd[2], &key, &next_key) == 0) {
- bpf_map_lookup_elem(map_fd[2], &next_key, &value);
+ while (bpf_map_get_next_key(hash_map_fd, &key, &next_key) == 0) {
+ bpf_map_lookup_elem(hash_map_fd, &next_key, &value);
printf("%s.%05d -> %s.%05d %12lld %12lld\n",
inet_ntoa((struct in_addr){htonl(next_key.src)}),
next_key.port16[0],
@@ -80,5 +99,8 @@ int main(int argc, char **argv)
key = next_key;
}
}
+
+cleanup:
+ bpf_object__close(obj);
return 0;
}
diff --git a/samples/bpf/trace_common.h b/samples/bpf/trace_common.h
new file mode 100644
index 000000000000..8cb5400aed1f
--- /dev/null
+++ b/samples/bpf/trace_common.h
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __TRACE_COMMON_H
+#define __TRACE_COMMON_H
+
+#ifdef __x86_64__
+#define SYSCALL(SYS) "__x64_" __stringify(SYS)
+#elif defined(__s390x__)
+#define SYSCALL(SYS) "__s390x_" __stringify(SYS)
+#else
+#define SYSCALL(SYS) __stringify(SYS)
+#endif
+
+#endif
diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c
index da1d69e20645..7d3c66fb3f88 100644
--- a/samples/bpf/trace_event_kern.c
+++ b/samples/bpf/trace_event_kern.c
@@ -18,19 +18,19 @@ struct key_t {
u32 userstack;
};
-struct bpf_map_def SEC("maps") counts = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(struct key_t),
- .value_size = sizeof(u64),
- .max_entries = 10000,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct key_t);
+ __type(value, u64);
+ __uint(max_entries, 10000);
+} counts SEC(".maps");
-struct bpf_map_def SEC("maps") stackmap = {
- .type = BPF_MAP_TYPE_STACK_TRACE,
- .key_size = sizeof(u32),
- .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
- .max_entries = 10000,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
+ __uint(max_entries, 10000);
+} stackmap SEC(".maps");
#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
#define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK)
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
index b6cd358d0418..ac1ba368195c 100644
--- a/samples/bpf/trace_event_user.c
+++ b/samples/bpf/trace_event_user.c
@@ -16,9 +16,6 @@
#include "perf-sys.h"
#include "trace_helpers.h"
-#define __must_check
-#include <linux/err.h>
-
#define SAMPLE_FREQ 50
static int pid;
@@ -159,7 +156,7 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr)
goto all_cpu_err;
}
links[i] = bpf_program__attach_perf_event(prog, pmu_fd);
- if (IS_ERR(links[i])) {
+ if (libbpf_get_error(links[i])) {
printf("bpf_program__attach_perf_event failed\n");
links[i] = NULL;
close(pmu_fd);
@@ -198,7 +195,7 @@ static void test_perf_event_task(struct perf_event_attr *attr)
goto err;
}
link = bpf_program__attach_perf_event(prog, pmu_fd);
- if (IS_ERR(link)) {
+ if (libbpf_get_error(link)) {
printf("bpf_program__attach_perf_event failed\n");
link = NULL;
close(pmu_fd);
@@ -314,7 +311,7 @@ int main(int argc, char **argv)
}
obj = bpf_object__open_file(filename, NULL);
- if (IS_ERR(obj)) {
+ if (libbpf_get_error(obj)) {
printf("opening BPF object file failed\n");
obj = NULL;
goto cleanup;
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
index 55fddbd08702..9d4adb7fd834 100644
--- a/samples/bpf/tracex1_user.c
+++ b/samples/bpf/tracex1_user.c
@@ -1,21 +1,41 @@
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
-#include <linux/bpf.h>
#include <unistd.h>
-#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
#include "trace_helpers.h"
int main(int ac, char **argv)
{
- FILE *f;
+ struct bpf_link *link = NULL;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
+ FILE *f;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+ if (!prog) {
+ fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
+ goto cleanup;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ link = bpf_program__attach(prog);
+ if (libbpf_get_error(link)) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ link = NULL;
+ goto cleanup;
}
f = popen("taskset 1 ping -c5 localhost", "r");
@@ -23,5 +43,8 @@ int main(int ac, char **argv)
read_trace_pipe();
+cleanup:
+ bpf_link__destroy(link);
+ bpf_object__close(obj);
return 0;
}
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
index d865bb309bcb..5bc696bac27d 100644
--- a/samples/bpf/tracex2_kern.c
+++ b/samples/bpf/tracex2_kern.c
@@ -10,13 +10,14 @@
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
+#include "trace_common.h"
-struct bpf_map_def SEC("maps") my_map = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(long),
- .value_size = sizeof(long),
- .max_entries = 1024,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, long);
+ __type(value, long);
+ __uint(max_entries, 1024);
+} my_map SEC(".maps");
/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
* example will no longer be meaningful
@@ -70,14 +71,14 @@ struct hist_key {
u64 index;
};
-struct bpf_map_def SEC("maps") my_hist_map = {
- .type = BPF_MAP_TYPE_PERCPU_HASH,
- .key_size = sizeof(struct hist_key),
- .value_size = sizeof(long),
- .max_entries = 1024,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(key_size, sizeof(struct hist_key));
+ __uint(value_size, sizeof(long));
+ __uint(max_entries, 1024);
+} my_hist_map SEC(".maps");
-SEC("kprobe/sys_write")
+SEC("kprobe/" SYSCALL(sys_write))
int bpf_prog3(struct pt_regs *ctx)
{
long write_size = PT_REGS_PARM3(ctx);
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
index c9544a4ce61a..3e36b3e4e3ef 100644
--- a/samples/bpf/tracex2_user.c
+++ b/samples/bpf/tracex2_user.c
@@ -3,17 +3,19 @@
#include <unistd.h>
#include <stdlib.h>
#include <signal.h>
-#include <linux/bpf.h>
#include <string.h>
#include <sys/resource.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
#include "bpf_util.h"
#define MAX_INDEX 64
#define MAX_STARS 38
+/* my_map, my_hist_map */
+static int map_fd[2];
+
static void stars(char *str, long val, long max, int width)
{
int i;
@@ -115,18 +117,39 @@ static void int_exit(int sig)
int main(int ac, char **argv)
{
struct rlimit r = {1024*1024, RLIM_INFINITY};
- char filename[256];
long key, next_key, value;
+ struct bpf_link *links[2];
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ char filename[256];
+ int i, j = 0;
FILE *f;
- int i;
-
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK)");
return 1;
}
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd[0] = bpf_object__find_map_fd_by_name(obj, "my_map");
+ map_fd[1] = bpf_object__find_map_fd_by_name(obj, "my_hist_map");
+ if (map_fd[0] < 0 || map_fd[1] < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
signal(SIGINT, int_exit);
signal(SIGTERM, int_exit);
@@ -138,9 +161,14 @@ int main(int ac, char **argv)
f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
(void) f;
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ bpf_object__for_each_program(prog, obj) {
+ links[j] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[j])) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ links[j] = NULL;
+ goto cleanup;
+ }
+ j++;
}
for (i = 0; i < 5; i++) {
@@ -156,5 +184,10 @@ int main(int ac, char **argv)
}
print_hist(map_fd[1]);
+cleanup:
+ for (j--; j >= 0; j--)
+ bpf_link__destroy(links[j]);
+
+ bpf_object__close(obj);
return 0;
}
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
index fe21c14feb8d..659613c19a82 100644
--- a/samples/bpf/tracex3_kern.c
+++ b/samples/bpf/tracex3_kern.c
@@ -11,12 +11,12 @@
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
-struct bpf_map_def SEC("maps") my_map = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(long),
- .value_size = sizeof(u64),
- .max_entries = 4096,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, long);
+ __type(value, u64);
+ __uint(max_entries, 4096);
+} my_map SEC(".maps");
/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
* example will no longer be meaningful
@@ -42,12 +42,12 @@ static unsigned int log2l(unsigned long long n)
#define SLOTS 100
-struct bpf_map_def SEC("maps") lat_map = {
- .type = BPF_MAP_TYPE_PERCPU_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u64),
- .max_entries = SLOTS,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u64));
+ __uint(max_entries, SLOTS);
+} lat_map SEC(".maps");
SEC("kprobe/blk_account_io_completion")
int bpf_prog2(struct pt_regs *ctx)
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
index cf8fedc773f2..70e987775c15 100644
--- a/samples/bpf/tracex3_user.c
+++ b/samples/bpf/tracex3_user.c
@@ -7,11 +7,10 @@
#include <unistd.h>
#include <stdbool.h>
#include <string.h>
-#include <linux/bpf.h>
#include <sys/resource.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
#include "bpf_util.h"
#define SLOTS 100
@@ -109,20 +108,11 @@ static void print_hist(int fd)
int main(int ac, char **argv)
{
struct rlimit r = {1024*1024, RLIM_INFINITY};
+ struct bpf_link *links[2];
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
- int i;
-
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-
- if (setrlimit(RLIMIT_MEMLOCK, &r)) {
- perror("setrlimit(RLIMIT_MEMLOCK)");
- return 1;
- }
-
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
- }
+ int map_fd, i, j = 0;
for (i = 1; i < ac; i++) {
if (strcmp(argv[i], "-a") == 0) {
@@ -137,6 +127,40 @@ int main(int ac, char **argv)
}
}
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd = bpf_object__find_map_fd_by_name(obj, "lat_map");
+ if (map_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ bpf_object__for_each_program(prog, obj) {
+ links[j] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[j])) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ links[j] = NULL;
+ goto cleanup;
+ }
+ j++;
+ }
+
printf(" heatmap of IO latency\n");
if (text_only)
printf(" %s", sym[num_colors - 1]);
@@ -153,9 +177,14 @@ int main(int ac, char **argv)
for (i = 0; ; i++) {
if (i % 20 == 0)
print_banner();
- print_hist(map_fd[1]);
+ print_hist(map_fd);
sleep(2);
}
+cleanup:
+ for (j--; j >= 0; j--)
+ bpf_link__destroy(links[j]);
+
+ bpf_object__close(obj);
return 0;
}
diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c
index b1bb9df88f8e..eb0f8fdd14bf 100644
--- a/samples/bpf/tracex4_kern.c
+++ b/samples/bpf/tracex4_kern.c
@@ -15,12 +15,12 @@ struct pair {
u64 ip;
};
-struct bpf_map_def SEC("maps") my_map = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(long),
- .value_size = sizeof(struct pair),
- .max_entries = 1000000,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, long);
+ __type(value, struct pair);
+ __uint(max_entries, 1000000);
+} my_map SEC(".maps");
/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
* example will no longer be meaningful
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
index ec52203fce39..e8faf8f184ae 100644
--- a/samples/bpf/tracex4_user.c
+++ b/samples/bpf/tracex4_user.c
@@ -8,11 +8,10 @@
#include <stdbool.h>
#include <string.h>
#include <time.h>
-#include <linux/bpf.h>
#include <sys/resource.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
struct pair {
long long val;
@@ -36,8 +35,8 @@ static void print_old_objects(int fd)
key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */
key = -1;
- while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) {
- bpf_map_lookup_elem(map_fd[0], &next_key, &v);
+ while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
+ bpf_map_lookup_elem(fd, &next_key, &v);
key = next_key;
if (val - v.val < 1000000000ll)
/* object was allocated more then 1 sec ago */
@@ -50,25 +49,55 @@ static void print_old_objects(int fd)
int main(int ac, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_link *links[2];
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
- int i;
-
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ int map_fd, i, j = 0;
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
return 1;
}
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
+ if (map_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ bpf_object__for_each_program(prog, obj) {
+ links[j] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[j])) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ links[j] = NULL;
+ goto cleanup;
+ }
+ j++;
}
for (i = 0; ; i++) {
- print_old_objects(map_fd[1]);
+ print_old_objects(map_fd);
sleep(1);
}
+cleanup:
+ for (j--; j >= 0; j--)
+ bpf_link__destroy(links[j]);
+
+ bpf_object__close(obj);
return 0;
}
diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c
index 481790fde864..32b49e8ab6bd 100644
--- a/samples/bpf/tracex5_kern.c
+++ b/samples/bpf/tracex5_kern.c
@@ -15,16 +15,16 @@
#define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F
-struct bpf_map_def SEC("maps") progs = {
- .type = BPF_MAP_TYPE_PROG_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u32),
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u32));
#ifdef __mips__
- .max_entries = 6000, /* MIPS n64 syscalls start at 5000 */
+ __uint(max_entries, 6000); /* MIPS n64 syscalls start at 5000 */
#else
- .max_entries = 1024,
+ __uint(max_entries, 1024);
#endif
-};
+} progs SEC(".maps");
SEC("kprobe/__seccomp_filter")
int bpf_prog1(struct pt_regs *ctx)
diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c
index c2317b39e0d2..98dad57a96c4 100644
--- a/samples/bpf/tracex5_user.c
+++ b/samples/bpf/tracex5_user.c
@@ -1,15 +1,21 @@
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
-#include <linux/bpf.h>
+#include <stdlib.h>
#include <unistd.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sys/prctl.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
#include <sys/resource.h>
#include "trace_helpers.h"
+#ifdef __mips__
+#define MAX_ENTRIES 6000 /* MIPS n64 syscalls start at 5000 */
+#else
+#define MAX_ENTRIES 1024
+#endif
+
/* install fake seccomp program to enable seccomp code path inside the kernel,
* so that our kprobe attached to seccomp_phase1() can be triggered
*/
@@ -28,16 +34,57 @@ static void install_accept_all_seccomp(void)
int main(int ac, char **argv)
{
- FILE *f;
- char filename[256];
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_link *link = NULL;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ int key, fd, progs_fd;
+ char filename[256];
+ const char *title;
+ FILE *f;
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
setrlimit(RLIMIT_MEMLOCK, &r);
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+ if (!prog) {
+ printf("finding a prog in obj file failed\n");
+ goto cleanup;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ link = bpf_program__attach(prog);
+ if (libbpf_get_error(link)) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ link = NULL;
+ goto cleanup;
+ }
+
+ progs_fd = bpf_object__find_map_fd_by_name(obj, "progs");
+ if (progs_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ bpf_object__for_each_program(prog, obj) {
+ title = bpf_program__title(prog, false);
+ /* register only syscalls to PROG_ARRAY */
+ if (sscanf(title, "kprobe/%d", &key) != 1)
+ continue;
+
+ fd = bpf_program__fd(prog);
+ bpf_map_update_elem(progs_fd, &key, &fd, BPF_ANY);
}
install_accept_all_seccomp();
@@ -47,5 +94,8 @@ int main(int ac, char **argv)
read_trace_pipe();
+cleanup:
+ bpf_link__destroy(link);
+ bpf_object__close(obj);
return 0;
}
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
index 96c234efa852..acad5712d8b4 100644
--- a/samples/bpf/tracex6_kern.c
+++ b/samples/bpf/tracex6_kern.c
@@ -3,24 +3,26 @@
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
-struct bpf_map_def SEC("maps") counters = {
- .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(u32),
- .max_entries = 64,
-};
-struct bpf_map_def SEC("maps") values = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(int),
- .value_size = sizeof(u64),
- .max_entries = 64,
-};
-struct bpf_map_def SEC("maps") values2 = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(int),
- .value_size = sizeof(struct bpf_perf_event_value),
- .max_entries = 64,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(u32));
+ __uint(max_entries, 64);
+} counters SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int);
+ __type(value, u64);
+ __uint(max_entries, 64);
+} values SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int);
+ __type(value, struct bpf_perf_event_value);
+ __uint(max_entries, 64);
+} values2 SEC(".maps");
SEC("kprobe/htab_map_get_next_key")
int bpf_prog1(struct pt_regs *ctx)
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
index 4bb3c830adb2..33df9784775d 100644
--- a/samples/bpf/tracex6_user.c
+++ b/samples/bpf/tracex6_user.c
@@ -4,7 +4,6 @@
#include <assert.h>
#include <fcntl.h>
#include <linux/perf_event.h>
-#include <linux/bpf.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
@@ -15,12 +14,15 @@
#include <sys/wait.h>
#include <unistd.h>
-#include "bpf_load.h"
#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
#include "perf-sys.h"
#define SAMPLE_PERIOD 0x7fffffffffffffffULL
+/* counters, values, values2 */
+static int map_fd[3];
+
static void check_on_cpu(int cpu, struct perf_event_attr *attr)
{
struct bpf_perf_event_value value2;
@@ -174,16 +176,51 @@ static void test_bpf_perf_event(void)
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_link *links[2];
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
+ int i = 0;
+
+ setrlimit(RLIMIT_MEMLOCK, &r);
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
- setrlimit(RLIMIT_MEMLOCK, &r);
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counters");
+ map_fd[1] = bpf_object__find_map_fd_by_name(obj, "values");
+ map_fd[2] = bpf_object__find_map_fd_by_name(obj, "values2");
+ if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ bpf_object__for_each_program(prog, obj) {
+ links[i] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[i])) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ links[i] = NULL;
+ goto cleanup;
+ }
+ i++;
}
test_bpf_perf_event();
+
+cleanup:
+ for (i--; i >= 0; i--)
+ bpf_link__destroy(links[i]);
+
+ bpf_object__close(obj);
return 0;
}
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
index ea6dae78f0df..fdcd6580dd73 100644
--- a/samples/bpf/tracex7_user.c
+++ b/samples/bpf/tracex7_user.c
@@ -1,28 +1,51 @@
#define _GNU_SOURCE
#include <stdio.h>
-#include <linux/bpf.h>
#include <unistd.h>
-#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
int main(int argc, char **argv)
{
- FILE *f;
+ struct bpf_link *link = NULL;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
char command[256];
- int ret;
+ int ret = 0;
+ FILE *f;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+ if (!prog) {
+ fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
+ goto cleanup;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ link = bpf_program__attach(prog);
+ if (libbpf_get_error(link)) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ link = NULL;
+ goto cleanup;
}
snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
f = popen(command, "r");
ret = pclose(f);
+cleanup:
+ bpf_link__destroy(link);
+ bpf_object__close(obj);
return ret ? 0 : 1;
}
diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c
index 9b8f21abeac4..f3468168982e 100644
--- a/samples/bpf/xdp_redirect_cpu_user.c
+++ b/samples/bpf/xdp_redirect_cpu_user.c
@@ -19,9 +19,6 @@ static const char *__doc__ =
#include <time.h>
#include <linux/limits.h>
-#define __must_check
-#include <linux/err.h>
-
#include <arpa/inet.h>
#include <linux/if_link.h>
@@ -622,7 +619,7 @@ static struct bpf_link * attach_tp(struct bpf_object *obj,
}
link = bpf_program__attach_tracepoint(prog, tp_category, tp_name);
- if (IS_ERR(link))
+ if (libbpf_get_error(link))
exit(EXIT_FAIL_BPF);
return link;
diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index e4d9da654e84..a226aee3574f 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -29,8 +29,8 @@ CGROUP COMMANDS
| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
| *ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** |
| **bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** |
-| **sendmsg4** | **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** |
-| **getsockopt** | **setsockopt** }
+| **getpeername4** | **getpeername6** | **getsockname4** | **getsockname6** | **sendmsg4** |
+| **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** }
| *ATTACH_FLAGS* := { **multi** | **override** }
DESCRIPTION
@@ -101,7 +101,11 @@ DESCRIPTION
an unconnected udp6 socket (since 5.2);
**sysctl** sysctl access (since 5.2);
**getsockopt** call to getsockopt (since 5.3);
- **setsockopt** call to setsockopt (since 5.3).
+ **setsockopt** call to setsockopt (since 5.3);
+ **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8);
+ **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8);
+ **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8);
+ **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8).
**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
Detach *PROG* from the cgroup *CGROUP* and attach type
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 5948e9d89c8d..2b254959d488 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -41,7 +41,8 @@ PROG COMMANDS
| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** |
| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** |
| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
-| **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** |
+| **cgroup/connect4** | **cgroup/connect6** | **cgroup/getpeername4** | **cgroup/getpeername6** |
+| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** |
| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** |
| **cgroup/getsockopt** | **cgroup/setsockopt** |
| **struct_ops** | **fentry** | **fexit** | **freplace**
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 9f0f20e73b87..25b25aca1112 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -472,6 +472,8 @@ _bpftool()
lwt_seg6local sockops sk_skb sk_msg \
lirc_mode2 cgroup/bind4 cgroup/bind6 \
cgroup/connect4 cgroup/connect6 \
+ cgroup/getpeername4 cgroup/getpeername6 \
+ cgroup/getsockname4 cgroup/getsockname6 \
cgroup/sendmsg4 cgroup/sendmsg6 \
cgroup/recvmsg4 cgroup/recvmsg6 \
cgroup/post_bind4 cgroup/post_bind6 \
@@ -966,9 +968,10 @@ _bpftool()
;;
attach|detach)
local ATTACH_TYPES='ingress egress sock_create sock_ops \
- device bind4 bind6 post_bind4 post_bind6 connect4 \
- connect6 sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl \
- getsockopt setsockopt'
+ device bind4 bind6 post_bind4 post_bind6 connect4 connect6 \
+ getpeername4 getpeername6 getsockname4 getsockname6 \
+ sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl getsockopt \
+ setsockopt'
local ATTACH_FLAGS='multi override'
local PROG_TYPE='id pinned tag name'
case $prev in
@@ -977,9 +980,9 @@ _bpftool()
return 0
;;
ingress|egress|sock_create|sock_ops|device|bind4|bind6|\
- post_bind4|post_bind6|connect4|connect6|sendmsg4|\
- sendmsg6|recvmsg4|recvmsg6|sysctl|getsockopt|\
- setsockopt)
+ post_bind4|post_bind6|connect4|connect6|getpeername4|\
+ getpeername6|getsockname4|getsockname6|sendmsg4|sendmsg6|\
+ recvmsg4|recvmsg6|sysctl|getsockopt|setsockopt)
COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
"$cur" ) )
return 0
diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index 1693c802bb20..27931db421d8 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -25,9 +25,10 @@
" ATTACH_TYPE := { ingress | egress | sock_create |\n" \
" sock_ops | device | bind4 | bind6 |\n" \
" post_bind4 | post_bind6 | connect4 |\n" \
- " connect6 | sendmsg4 | sendmsg6 |\n" \
- " recvmsg4 | recvmsg6 | sysctl |\n" \
- " getsockopt | setsockopt }"
+ " connect6 | getpeername4 | getpeername6 |\n" \
+ " getsockname4 | getsockname6 | sendmsg4 |\n" \
+ " sendmsg6 | recvmsg4 | recvmsg6 |\n" \
+ " sysctl | getsockopt | setsockopt }"
static unsigned int query_flags;
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index f89ac70ef973..5cdf0bc049bd 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -100,6 +100,10 @@ static const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = {
[BPF_CGROUP_INET6_CONNECT] = "connect6",
[BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
[BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
+ [BPF_CGROUP_INET4_GETPEERNAME] = "getpeername4",
+ [BPF_CGROUP_INET6_GETPEERNAME] = "getpeername6",
+ [BPF_CGROUP_INET4_GETSOCKNAME] = "getsockname4",
+ [BPF_CGROUP_INET6_GETSOCKNAME] = "getsockname6",
[BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
[BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
[BPF_CGROUP_SYSCTL] = "sysctl",
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index b6e5ba568f98..245f941fdbcf 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -2012,8 +2012,10 @@ static int do_help(int argc, char **argv)
" sk_reuseport | flow_dissector | cgroup/sysctl |\n"
" cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n"
" cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n"
- " cgroup/sendmsg4 | cgroup/sendmsg6 | cgroup/recvmsg4 |\n"
- " cgroup/recvmsg6 | cgroup/getsockopt | cgroup/setsockopt |\n"
+ " cgroup/getpeername4 | cgroup/getpeername6 |\n"
+ " cgroup/getsockname4 | cgroup/getsockname6 | cgroup/sendmsg4 |\n"
+ " cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n"
+ " cgroup/getsockopt | cgroup/setsockopt |\n"
" struct_ops | fentry | fexit | freplace }\n"
" ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n"
" flow_dissector }\n"
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 146c742f1d49..97e1fd19ff58 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -73,7 +73,7 @@ struct bpf_insn {
/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */
struct bpf_lpm_trie_key {
__u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */
- __u8 data[]; /* Arbitrary size */
+ __u8 data[0]; /* Arbitrary size */
};
struct bpf_cgroup_storage_key {
@@ -220,6 +220,10 @@ enum bpf_attach_type {
BPF_MODIFY_RETURN,
BPF_LSM_MAC,
BPF_TRACE_ITER,
+ BPF_CGROUP_INET4_GETPEERNAME,
+ BPF_CGROUP_INET6_GETPEERNAME,
+ BPF_CGROUP_INET4_GETSOCKNAME,
+ BPF_CGROUP_INET6_GETSOCKNAME,
__MAX_BPF_ATTACH_TYPE
};
@@ -2015,8 +2019,8 @@ union bpf_attr {
* int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
* Description
* Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
- * only possible to shrink the packet as of this writing,
- * therefore *delta* must be a negative integer.
+ * possible to both shrink and grow the packet tail.
+ * Shrink done via *delta* being a negative integer.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
diff --git a/tools/lib/bpf/hashmap.c b/tools/lib/bpf/hashmap.c
index cffb96202e0d..a405dad068f5 100644
--- a/tools/lib/bpf/hashmap.c
+++ b/tools/lib/bpf/hashmap.c
@@ -60,7 +60,7 @@ struct hashmap *hashmap__new(hashmap_hash_fn hash_fn,
void hashmap__clear(struct hashmap *map)
{
struct hashmap_entry *cur, *tmp;
- int bkt;
+ size_t bkt;
hashmap__for_each_entry_safe(map, cur, tmp, bkt) {
free(cur);
@@ -100,8 +100,7 @@ static int hashmap_grow(struct hashmap *map)
struct hashmap_entry **new_buckets;
struct hashmap_entry *cur, *tmp;
size_t new_cap_bits, new_cap;
- size_t h;
- int bkt;
+ size_t h, bkt;
new_cap_bits = map->cap_bits + 1;
if (new_cap_bits < HASHMAP_MIN_CAP_BITS)
diff --git a/tools/lib/bpf/hashmap.h b/tools/lib/bpf/hashmap.h
index bae8879cdf58..e823b35e7371 100644
--- a/tools/lib/bpf/hashmap.h
+++ b/tools/lib/bpf/hashmap.h
@@ -15,7 +15,6 @@
#else
#include <bits/reg.h>
#endif
-#include "libbpf_internal.h"
static inline size_t hash_bits(size_t h, int bits)
{
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 292257995487..fa04cbe547ed 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -6705,6 +6705,14 @@ static const struct bpf_sec_def section_defs[] = {
BPF_CGROUP_UDP4_RECVMSG),
BPF_EAPROG_SEC("cgroup/recvmsg6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
BPF_CGROUP_UDP6_RECVMSG),
+ BPF_EAPROG_SEC("cgroup/getpeername4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ BPF_CGROUP_INET4_GETPEERNAME),
+ BPF_EAPROG_SEC("cgroup/getpeername6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ BPF_CGROUP_INET6_GETPEERNAME),
+ BPF_EAPROG_SEC("cgroup/getsockname4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ BPF_CGROUP_INET4_GETSOCKNAME),
+ BPF_EAPROG_SEC("cgroup/getsockname6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ BPF_CGROUP_INET6_GETSOCKNAME),
BPF_EAPROG_SEC("cgroup/sysctl", BPF_PROG_TYPE_CGROUP_SYSCTL,
BPF_CGROUP_SYSCTL),
BPF_EAPROG_SEC("cgroup/getsockopt", BPF_PROG_TYPE_CGROUP_SOCKOPT,
diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst
index 0f67f1b470b0..e885d351595f 100644
--- a/tools/testing/selftests/bpf/README.rst
+++ b/tools/testing/selftests/bpf/README.rst
@@ -1,6 +1,8 @@
==================
BPF Selftest Notes
==================
+General instructions on running selftests can be found in
+`Documentation/bpf/bpf_devel_QA.rst`_.
Additional information about selftest failures are
documented here.
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 60e3ae5d4e48..2118e23ac07a 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -25,6 +25,7 @@ CONFIG_XDP_SOCKETS=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_IPV6_TUNNEL=y
CONFIG_IPV6_GRE=y
+CONFIG_IPV6_SEG6_BPF=y
CONFIG_NET_FOU=m
CONFIG_NET_FOU_IP_TUNNELS=y
CONFIG_IPV6_FOU=m
@@ -37,3 +38,4 @@ CONFIG_IPV6_SIT=m
CONFIG_BPF_JIT=y
CONFIG_BPF_LSM=y
CONFIG_SECURITY=y
+CONFIG_LIRC=y
diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
index 999a775484c1..e36dd1a1780d 100644
--- a/tools/testing/selftests/bpf/network_helpers.c
+++ b/tools/testing/selftests/bpf/network_helpers.c
@@ -5,6 +5,8 @@
#include <string.h>
#include <unistd.h>
+#include <arpa/inet.h>
+
#include <sys/epoll.h>
#include <linux/err.h>
@@ -35,7 +37,7 @@ struct ipv6_packet pkt_v6 = {
.tcp.doff = 5,
};
-int start_server(int family, int type)
+int start_server_with_port(int family, int type, __u16 port)
{
struct sockaddr_storage addr = {};
socklen_t len;
@@ -45,11 +47,13 @@ int start_server(int family, int type)
struct sockaddr_in *sin = (void *)&addr;
sin->sin_family = AF_INET;
+ sin->sin_port = htons(port);
len = sizeof(*sin);
} else {
struct sockaddr_in6 *sin6 = (void *)&addr;
sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = htons(port);
len = sizeof(*sin6);
}
@@ -76,6 +80,11 @@ int start_server(int family, int type)
return fd;
}
+int start_server(int family, int type)
+{
+ return start_server_with_port(family, type, 0);
+}
+
static const struct timeval timeo_sec = { .tv_sec = 3 };
static const size_t timeo_optlen = sizeof(timeo_sec);
diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
index 86914e6e7b53..6a8009605670 100644
--- a/tools/testing/selftests/bpf/network_helpers.h
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -34,6 +34,7 @@ struct ipv6_packet {
extern struct ipv6_packet pkt_v6;
int start_server(int family, int type);
+int start_server_with_port(int family, int type, __u16 port);
int connect_to_fd(int family, int type, int server_fd);
int connect_fd_to_fd(int client_fd, int server_fd);
int connect_wait(int client_fd);
diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/prog_tests/align.c
index 0262f7b374f9..c548aded6585 100644
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/prog_tests/align.c
@@ -1,24 +1,5 @@
-#include <asm/types.h>
-#include <linux/types.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <stddef.h>
-#include <stdbool.h>
-
-#include <linux/unistd.h>
-#include <linux/filter.h>
-#include <linux/bpf_perf_event.h>
-#include <linux/bpf.h>
-
-#include <bpf/bpf.h>
-
-#include "../../../include/linux/filter.h"
-#include "bpf_rlimit.h"
-#include "bpf_util.h"
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
#define MAX_INSNS 512
#define MAX_MATCHES 16
@@ -359,15 +340,15 @@ static struct bpf_align_test tests[] = {
* is still (4n), fixed offset is not changed.
* Also, we create a new reg->id.
*/
- {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc))"},
+ {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
/* At the time the word size load is performed from R5,
* its total fixed offset is NET_IP_ALIGN + reg->off (18)
* which is 20. Then the variable offset is (4n), so
* the total offset is 4-byte aligned and meets the
* load's requirements.
*/
- {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"},
- {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"},
+ {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
+ {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
},
},
{
@@ -410,15 +391,15 @@ static struct bpf_align_test tests[] = {
/* Adding 14 makes R6 be (4n+2) */
{9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
/* Packet pointer has (4n+2) offset */
- {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
- {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
+ {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
+ {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
/* At the time the word size load is performed from R5,
* its total fixed offset is NET_IP_ALIGN + reg->off (0)
* which is 2. Then the variable offset is (4n+2), so
* the total offset is 4-byte aligned and meets the
* load's requirements.
*/
- {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
+ {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
/* Newly read value in R6 was shifted left by 2, so has
* known alignment of 4.
*/
@@ -426,15 +407,15 @@ static struct bpf_align_test tests[] = {
/* Added (4n) to packet pointer's (4n+2) var_off, giving
* another (4n+2).
*/
- {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"},
- {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"},
+ {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
+ {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
/* At the time the word size load is performed from R5,
* its total fixed offset is NET_IP_ALIGN + reg->off (0)
* which is 2. Then the variable offset is (4n+2), so
* the total offset is 4-byte aligned and meets the
* load's requirements.
*/
- {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"},
+ {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
},
},
{
@@ -469,16 +450,16 @@ static struct bpf_align_test tests[] = {
.matches = {
{4, "R5_w=pkt_end(id=0,off=0,imm=0)"},
/* (ptr - ptr) << 2 == unknown, (4n) */
- {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"},
+ {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"},
/* (4n) + 14 == (4n+2). We blow our bounds, because
* the add could overflow.
*/
- {7, "R5_w=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"},
+ {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
/* Checked s>=0 */
- {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
+ {9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
/* packet pointer + nonnegative (4n+2) */
- {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
- {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
+ {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
+ {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
/* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
* We checked the bounds, but it might have been able
* to overflow if the packet pointer started in the
@@ -486,7 +467,7 @@ static struct bpf_align_test tests[] = {
* So we did not get a 'range' on R6, and the access
* attempt will fail.
*/
- {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
+ {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
}
},
{
@@ -528,7 +509,7 @@ static struct bpf_align_test tests[] = {
/* New unknown value in R7 is (4n) */
{11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
/* Subtracting it from R6 blows our unsigned bounds */
- {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,var_off=(0x2; 0xfffffffffffffffc))"},
+ {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
/* Checked s>= 0 */
{14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"},
/* At the time the word size load is performed from R5,
@@ -537,7 +518,8 @@ static struct bpf_align_test tests[] = {
* the total offset is 4-byte aligned and meets the
* load's requirements.
*/
- {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"},
+ {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"},
+
},
},
{
@@ -579,18 +561,18 @@ static struct bpf_align_test tests[] = {
/* Adding 14 makes R6 be (4n+2) */
{11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"},
/* Subtracting from packet pointer overflows ubounds */
- {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c))"},
+ {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"},
/* New unknown value in R7 is (4n), >= 76 */
{15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"},
/* Adding it to packet pointer gives nice bounds again */
- {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"},
+ {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
/* At the time the word size load is performed from R5,
* its total fixed offset is NET_IP_ALIGN + reg->off (0)
* which is 2. Then the variable offset is (4n+2), so
* the total offset is 4-byte aligned and meets the
* load's requirements.
*/
- {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"},
+ {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
},
},
};
@@ -669,51 +651,16 @@ static int do_test_single(struct bpf_align_test *test)
return ret;
}
-static int do_test(unsigned int from, unsigned int to)
+void test_align(void)
{
- int all_pass = 0;
- int all_fail = 0;
unsigned int i;
- for (i = from; i < to; i++) {
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
struct bpf_align_test *test = &tests[i];
- int fail;
- printf("Test %3d: %s ... ",
- i, test->descr);
- fail = do_test_single(test);
- if (fail) {
- all_fail++;
- printf("FAIL\n");
- } else {
- all_pass++;
- printf("PASS\n");
- }
- }
- printf("Results: %d pass %d fail\n",
- all_pass, all_fail);
- return all_fail ? EXIT_FAILURE : EXIT_SUCCESS;
-}
+ if (!test__start_subtest(test->descr))
+ continue;
-int main(int argc, char **argv)
-{
- unsigned int from = 0, to = ARRAY_SIZE(tests);
-
- if (argc == 3) {
- unsigned int l = atoi(argv[argc - 2]);
- unsigned int u = atoi(argv[argc - 1]);
-
- if (l < to && u < to) {
- from = l;
- to = u + 1;
- }
- } else if (argc == 2) {
- unsigned int t = atoi(argv[argc - 1]);
-
- if (t < to) {
- from = t;
- to = t + 1;
- }
+ CHECK_FAIL(do_test_single(test));
}
- return do_test(from, to);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
index 47fbb20cb6a6..17bbf76812ca 100644
--- a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
+++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
@@ -4,7 +4,8 @@
#include "cgroup_helpers.h"
#include "network_helpers.h"
-static int verify_port(int family, int fd, int expected)
+static int verify_ports(int family, int fd,
+ __u16 expected_local, __u16 expected_peer)
{
struct sockaddr_storage addr;
socklen_t len = sizeof(addr);
@@ -20,9 +21,25 @@ static int verify_port(int family, int fd, int expected)
else
port = ((struct sockaddr_in6 *)&addr)->sin6_port;
- if (ntohs(port) != expected) {
- log_err("Unexpected port %d, expected %d", ntohs(port),
- expected);
+ if (ntohs(port) != expected_local) {
+ log_err("Unexpected local port %d, expected %d", ntohs(port),
+ expected_local);
+ return -1;
+ }
+
+ if (getpeername(fd, (struct sockaddr *)&addr, &len)) {
+ log_err("Failed to get peer addr");
+ return -1;
+ }
+
+ if (family == AF_INET)
+ port = ((struct sockaddr_in *)&addr)->sin_port;
+ else
+ port = ((struct sockaddr_in6 *)&addr)->sin6_port;
+
+ if (ntohs(port) != expected_peer) {
+ log_err("Unexpected peer port %d, expected %d", ntohs(port),
+ expected_peer);
return -1;
}
@@ -31,33 +48,67 @@ static int verify_port(int family, int fd, int expected)
static int run_test(int cgroup_fd, int server_fd, int family, int type)
{
+ bool v4 = family == AF_INET;
+ __u16 expected_local_port = v4 ? 22222 : 22223;
+ __u16 expected_peer_port = 60000;
struct bpf_prog_load_attr attr = {
- .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ .file = v4 ? "./connect_force_port4.o" :
+ "./connect_force_port6.o",
};
+ struct bpf_program *prog;
struct bpf_object *obj;
- int expected_port;
- int prog_fd;
- int err;
- int fd;
-
- if (family == AF_INET) {
- attr.file = "./connect_force_port4.o";
- attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT;
- expected_port = 22222;
- } else {
- attr.file = "./connect_force_port6.o";
- attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT;
- expected_port = 22223;
- }
+ int xlate_fd, fd, err;
+ __u32 duration = 0;
- err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
+ err = bpf_prog_load_xattr(&attr, &obj, &xlate_fd);
if (err) {
log_err("Failed to load BPF object");
return -1;
}
- err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type,
- 0);
+ prog = bpf_object__find_program_by_title(obj, v4 ?
+ "cgroup/connect4" :
+ "cgroup/connect6");
+ if (CHECK(!prog, "find_prog", "connect prog not found\n")) {
+ err = -EIO;
+ goto close_bpf_object;
+ }
+
+ err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
+ BPF_CGROUP_INET4_CONNECT :
+ BPF_CGROUP_INET6_CONNECT, 0);
+ if (err) {
+ log_err("Failed to attach BPF program");
+ goto close_bpf_object;
+ }
+
+ prog = bpf_object__find_program_by_title(obj, v4 ?
+ "cgroup/getpeername4" :
+ "cgroup/getpeername6");
+ if (CHECK(!prog, "find_prog", "getpeername prog not found\n")) {
+ err = -EIO;
+ goto close_bpf_object;
+ }
+
+ err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
+ BPF_CGROUP_INET4_GETPEERNAME :
+ BPF_CGROUP_INET6_GETPEERNAME, 0);
+ if (err) {
+ log_err("Failed to attach BPF program");
+ goto close_bpf_object;
+ }
+
+ prog = bpf_object__find_program_by_title(obj, v4 ?
+ "cgroup/getsockname4" :
+ "cgroup/getsockname6");
+ if (CHECK(!prog, "find_prog", "getsockname prog not found\n")) {
+ err = -EIO;
+ goto close_bpf_object;
+ }
+
+ err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
+ BPF_CGROUP_INET4_GETSOCKNAME :
+ BPF_CGROUP_INET6_GETSOCKNAME, 0);
if (err) {
log_err("Failed to attach BPF program");
goto close_bpf_object;
@@ -69,8 +120,8 @@ static int run_test(int cgroup_fd, int server_fd, int family, int type)
goto close_bpf_object;
}
- err = verify_port(family, fd, expected_port);
-
+ err = verify_ports(family, fd, expected_local_port,
+ expected_peer_port);
close(fd);
close_bpf_object:
@@ -86,25 +137,25 @@ void test_connect_force_port(void)
if (CHECK_FAIL(cgroup_fd < 0))
return;
- server_fd = start_server(AF_INET, SOCK_STREAM);
+ server_fd = start_server_with_port(AF_INET, SOCK_STREAM, 60123);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM));
close(server_fd);
- server_fd = start_server(AF_INET6, SOCK_STREAM);
+ server_fd = start_server_with_port(AF_INET6, SOCK_STREAM, 60124);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM));
close(server_fd);
- server_fd = start_server(AF_INET, SOCK_DGRAM);
+ server_fd = start_server_with_port(AF_INET, SOCK_DGRAM, 60123);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM));
close(server_fd);
- server_fd = start_server(AF_INET6, SOCK_DGRAM);
+ server_fd = start_server_with_port(AF_INET6, SOCK_DGRAM, 60124);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM));
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
index 4867cd3445c8..b57bd6fef208 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
@@ -1,11 +1,27 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__bpf_map
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
char _license[] SEC("license") = "GPL";
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__bpf_map {
+ struct bpf_iter_meta *meta;
+ struct bpf_map *map;
+} __attribute__((preserve_access_index));
+
SEC("iter/bpf_map")
int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
{
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
index ab9e2650e021..c8e9ca74c87b 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
@@ -1,9 +1,25 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__ipv6_route bpf_iter__ipv6_route___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__ipv6_route
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__ipv6_route {
+ struct bpf_iter_meta *meta;
+ struct fib6_info *rt;
+} __attribute__((preserve_access_index));
+
char _license[] SEC("license") = "GPL";
extern bool CONFIG_IPV6_SUBTREES __kconfig __weak;
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
index 6b40a233d4e0..e7b8753eac0b 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
@@ -1,6 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__netlink bpf_iter__netlink___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__netlink
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
@@ -9,6 +14,17 @@ char _license[] SEC("license") = "GPL";
#define sk_rmem_alloc sk_backlog.rmem_alloc
#define sk_refcnt __sk_common.skc_refcnt
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__netlink {
+ struct bpf_iter_meta *meta;
+ struct netlink_sock *sk;
+} __attribute__((preserve_access_index));
+
static inline struct inode *SOCK_INODE(struct socket *socket)
{
return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
index 90f9011c57ca..ee754021f98e 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
@@ -1,11 +1,27 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__task bpf_iter__task___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__task
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
char _license[] SEC("license") = "GPL";
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__task {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+} __attribute__((preserve_access_index));
+
SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
index c6ced38f0880..0f0ec3db20ba 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
@@ -1,11 +1,29 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__task_file bpf_iter__task_file___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__task_file
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
char _license[] SEC("license") = "GPL";
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__task_file {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+ __u32 fd;
+ struct file *file;
+} __attribute__((preserve_access_index));
+
SEC("iter/task_file")
int dump_task_file(struct bpf_iter__task_file *ctx)
{
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
index 636a00fa074d..13c2c90c835f 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
@@ -1,10 +1,25 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__task bpf_iter__task___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__task
#include <bpf/bpf_helpers.h>
char _license[] SEC("license") = "GPL";
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__task {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+} __attribute__((preserve_access_index));
+
SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
index b18dc0471d07..0aa71b333cf3 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
@@ -1,10 +1,25 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__bpf_map
#include <bpf/bpf_helpers.h>
char _license[] SEC("license") = "GPL";
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__bpf_map {
+ struct bpf_iter_meta *meta;
+ struct bpf_map *map;
+} __attribute__((preserve_access_index));
+
__u32 map1_id = 0, map2_id = 0;
__u32 map1_accessed = 0, map2_accessed = 0;
__u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0;
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
index bdd51cf14b54..dee1339e6905 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
@@ -1,11 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__task bpf_iter__task___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__task
#include <bpf/bpf_helpers.h>
char _license[] SEC("license") = "GPL";
int count = 0;
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__task {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+} __attribute__((preserve_access_index));
+
SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c
index 1b8eb34b2db0..7396308677a3 100644
--- a/tools/testing/selftests/bpf/progs/connect_force_port4.c
+++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <string.h>
+#include <stdbool.h>
#include <linux/bpf.h>
#include <linux/in.h>
@@ -12,17 +13,71 @@
char _license[] SEC("license") = "GPL";
int _version SEC("version") = 1;
+struct svc_addr {
+ __be32 addr;
+ __be16 port;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct svc_addr);
+} service_mapping SEC(".maps");
+
SEC("cgroup/connect4")
-int _connect4(struct bpf_sock_addr *ctx)
+int connect4(struct bpf_sock_addr *ctx)
{
struct sockaddr_in sa = {};
+ struct svc_addr *orig;
+ /* Force local address to 127.0.0.1:22222. */
sa.sin_family = AF_INET;
sa.sin_port = bpf_htons(22222);
- sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1 */
+ sa.sin_addr.s_addr = bpf_htonl(0x7f000001);
if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0)
return 0;
+ /* Rewire service 1.2.3.4:60000 to backend 127.0.0.1:60123. */
+ if (ctx->user_port == bpf_htons(60000)) {
+ orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!orig)
+ return 0;
+
+ orig->addr = ctx->user_ip4;
+ orig->port = ctx->user_port;
+
+ ctx->user_ip4 = bpf_htonl(0x7f000001);
+ ctx->user_port = bpf_htons(60123);
+ }
+ return 1;
+}
+
+SEC("cgroup/getsockname4")
+int getsockname4(struct bpf_sock_addr *ctx)
+{
+ /* Expose local server as 1.2.3.4:60000 to client. */
+ if (ctx->user_port == bpf_htons(60123)) {
+ ctx->user_ip4 = bpf_htonl(0x01020304);
+ ctx->user_port = bpf_htons(60000);
+ }
+ return 1;
+}
+
+SEC("cgroup/getpeername4")
+int getpeername4(struct bpf_sock_addr *ctx)
+{
+ struct svc_addr *orig;
+
+ /* Expose service 1.2.3.4:60000 as peer instead of backend. */
+ if (ctx->user_port == bpf_htons(60123)) {
+ orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0);
+ if (orig) {
+ ctx->user_ip4 = orig->addr;
+ ctx->user_port = orig->port;
+ }
+ }
return 1;
}
diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c
index ae6f7d750b4c..c1a2b555e9ad 100644
--- a/tools/testing/selftests/bpf/progs/connect_force_port6.c
+++ b/tools/testing/selftests/bpf/progs/connect_force_port6.c
@@ -12,17 +12,83 @@
char _license[] SEC("license") = "GPL";
int _version SEC("version") = 1;
+struct svc_addr {
+ __be32 addr[4];
+ __be16 port;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct svc_addr);
+} service_mapping SEC(".maps");
+
SEC("cgroup/connect6")
-int _connect6(struct bpf_sock_addr *ctx)
+int connect6(struct bpf_sock_addr *ctx)
{
struct sockaddr_in6 sa = {};
+ struct svc_addr *orig;
+ /* Force local address to [::1]:22223. */
sa.sin6_family = AF_INET6;
sa.sin6_port = bpf_htons(22223);
- sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); /* ::1 */
+ sa.sin6_addr.s6_addr32[3] = bpf_htonl(1);
if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0)
return 0;
+ /* Rewire service [fc00::1]:60000 to backend [::1]:60124. */
+ if (ctx->user_port == bpf_htons(60000)) {
+ orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!orig)
+ return 0;
+
+ orig->addr[0] = ctx->user_ip6[0];
+ orig->addr[1] = ctx->user_ip6[1];
+ orig->addr[2] = ctx->user_ip6[2];
+ orig->addr[3] = ctx->user_ip6[3];
+ orig->port = ctx->user_port;
+
+ ctx->user_ip6[0] = 0;
+ ctx->user_ip6[1] = 0;
+ ctx->user_ip6[2] = 0;
+ ctx->user_ip6[3] = bpf_htonl(1);
+ ctx->user_port = bpf_htons(60124);
+ }
+ return 1;
+}
+
+SEC("cgroup/getsockname6")
+int getsockname6(struct bpf_sock_addr *ctx)
+{
+ /* Expose local server as [fc00::1]:60000 to client. */
+ if (ctx->user_port == bpf_htons(60124)) {
+ ctx->user_ip6[0] = bpf_htonl(0xfc000000);
+ ctx->user_ip6[1] = 0;
+ ctx->user_ip6[2] = 0;
+ ctx->user_ip6[3] = bpf_htonl(1);
+ ctx->user_port = bpf_htons(60000);
+ }
+ return 1;
+}
+
+SEC("cgroup/getpeername6")
+int getpeername6(struct bpf_sock_addr *ctx)
+{
+ struct svc_addr *orig;
+
+ /* Expose service [fc00::1]:60000 as peer instead of backend. */
+ if (ctx->user_port == bpf_htons(60124)) {
+ orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0);
+ if (orig) {
+ ctx->user_ip6[0] = orig->addr[0];
+ ctx->user_ip6[1] = orig->addr[1];
+ ctx->user_ip6[2] = orig->addr[2];
+ ctx->user_ip6[3] = orig->addr[3];
+ ctx->user_port = orig->port;
+ }
+ }
return 1;
}
diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c
index d2b38fa6a5b0..e83d0b48d80c 100644
--- a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c
@@ -73,6 +73,7 @@ int bpf_sk_lookup_test0(struct __sk_buff *skb)
tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);
+ bpf_printk("sk=%d\n", sk ? 1 : 0);
if (sk)
bpf_sk_release(sk);
return sk ? TC_ACT_OK : TC_ACT_UNSPEC;
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h
index 9b4d3a68a91a..a443d3637db3 100644
--- a/tools/testing/selftests/bpf/test_sockmap_kern.h
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h
@@ -110,8 +110,6 @@ int bpf_prog2(struct __sk_buff *skb)
flags = *f;
}
- bpf_printk("sk_skb2: redirect(%iB) flags=%i\n",
- len, flags);
#ifdef SOCKMAP
return bpf_sk_redirect_map(skb, &sock_map, ret, flags);
#else
@@ -143,8 +141,6 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
err = bpf_sock_hash_update(skops, &sock_map, &ret,
BPF_NOEXIST);
#endif
- bpf_printk("passive(%i -> %i) map ctx update err: %d\n",
- lport, bpf_ntohl(rport), err);
}
break;
case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
@@ -160,8 +156,6 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
err = bpf_sock_hash_update(skops, &sock_map, &ret,
BPF_NOEXIST);
#endif
- bpf_printk("active(%i -> %i) map ctx update err: %d\n",
- lport, bpf_ntohl(rport), err);
}
break;
default:
@@ -199,72 +193,6 @@ int bpf_prog4(struct sk_msg_md *msg)
}
SEC("sk_msg2")
-int bpf_prog5(struct sk_msg_md *msg)
-{
- int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
- int *start, *end, *start_push, *end_push, *start_pop, *pop;
- int *bytes, len1, len2 = 0, len3, len4;
- int err1 = -1, err2 = -1;
-
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes)
- err1 = bpf_msg_apply_bytes(msg, *bytes);
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes)
- err2 = bpf_msg_cork_bytes(msg, *bytes);
- len1 = (__u64)msg->data_end - (__u64)msg->data;
- start = bpf_map_lookup_elem(&sock_bytes, &zero);
- end = bpf_map_lookup_elem(&sock_bytes, &one);
- if (start && end) {
- int err;
-
- bpf_printk("sk_msg2: pull(%i:%i)\n",
- start ? *start : 0, end ? *end : 0);
- err = bpf_msg_pull_data(msg, *start, *end, 0);
- if (err)
- bpf_printk("sk_msg2: pull_data err %i\n",
- err);
- len2 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg2: length update %i->%i\n",
- len1, len2);
- }
-
- start_push = bpf_map_lookup_elem(&sock_bytes, &two);
- end_push = bpf_map_lookup_elem(&sock_bytes, &three);
- if (start_push && end_push) {
- int err;
-
- bpf_printk("sk_msg2: push(%i:%i)\n",
- start_push ? *start_push : 0,
- end_push ? *end_push : 0);
- err = bpf_msg_push_data(msg, *start_push, *end_push, 0);
- if (err)
- bpf_printk("sk_msg2: push_data err %i\n", err);
- len3 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg2: length push_update %i->%i\n",
- len2 ? len2 : len1, len3);
- }
- start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
- pop = bpf_map_lookup_elem(&sock_bytes, &five);
- if (start_pop && pop) {
- int err;
-
- bpf_printk("sk_msg2: pop(%i@%i)\n",
- start_pop, pop);
- err = bpf_msg_pop_data(msg, *start_pop, *pop, 0);
- if (err)
- bpf_printk("sk_msg2: pop_data err %i\n", err);
- len4 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg2: length pop_data %i->%i\n",
- len1 ? len1 : 0, len4);
- }
-
- bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
- len1, err1, err2);
- return SK_PASS;
-}
-
-SEC("sk_msg3")
int bpf_prog6(struct sk_msg_md *msg)
{
int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0;
@@ -305,86 +233,7 @@ int bpf_prog6(struct sk_msg_md *msg)
#endif
}
-SEC("sk_msg4")
-int bpf_prog7(struct sk_msg_md *msg)
-{
- int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f;
- int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
- int len1, len2 = 0, len3, len4;
- int err1 = 0, err2 = 0, key = 0;
- __u64 flags = 0;
-
- int err;
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes)
- err1 = bpf_msg_apply_bytes(msg, *bytes);
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes)
- err2 = bpf_msg_cork_bytes(msg, *bytes);
- len1 = (__u64)msg->data_end - (__u64)msg->data;
-
- start = bpf_map_lookup_elem(&sock_bytes, &zero);
- end = bpf_map_lookup_elem(&sock_bytes, &one);
- if (start && end) {
- bpf_printk("sk_msg2: pull(%i:%i)\n",
- start ? *start : 0, end ? *end : 0);
- err = bpf_msg_pull_data(msg, *start, *end, 0);
- if (err)
- bpf_printk("sk_msg2: pull_data err %i\n",
- err);
- len2 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg2: length update %i->%i\n",
- len1, len2);
- }
-
- start_push = bpf_map_lookup_elem(&sock_bytes, &two);
- end_push = bpf_map_lookup_elem(&sock_bytes, &three);
- if (start_push && end_push) {
- bpf_printk("sk_msg4: push(%i:%i)\n",
- start_push ? *start_push : 0,
- end_push ? *end_push : 0);
- err = bpf_msg_push_data(msg, *start_push, *end_push, 0);
- if (err)
- bpf_printk("sk_msg4: push_data err %i\n",
- err);
- len3 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg4: length push_update %i->%i\n",
- len2 ? len2 : len1, len3);
- }
-
- start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
- pop = bpf_map_lookup_elem(&sock_bytes, &five);
- if (start_pop && pop) {
- int err;
-
- bpf_printk("sk_msg4: pop(%i@%i)\n",
- start_pop, pop);
- err = bpf_msg_pop_data(msg, *start_pop, *pop, 0);
- if (err)
- bpf_printk("sk_msg4: pop_data err %i\n", err);
- len4 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg4: length pop_data %i->%i\n",
- len1 ? len1 : 0, len4);
- }
-
-
- f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
- if (f && *f) {
- key = 2;
- flags = *f;
- }
- bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n",
- len1, flags, err1 ? err1 : err2);
-#ifdef SOCKMAP
- err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
-#else
- err = bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags);
-#endif
- bpf_printk("sk_msg3: err %i\n", err);
- return err;
-}
-
-SEC("sk_msg5")
+SEC("sk_msg3")
int bpf_prog8(struct sk_msg_md *msg)
{
void *data_end = (void *)(long) msg->data_end;
@@ -401,7 +250,7 @@ int bpf_prog8(struct sk_msg_md *msg)
}
return SK_PASS;
}
-SEC("sk_msg6")
+SEC("sk_msg4")
int bpf_prog9(struct sk_msg_md *msg)
{
void *data_end = (void *)(long) msg->data_end;
@@ -419,7 +268,7 @@ int bpf_prog9(struct sk_msg_md *msg)
return SK_PASS;
}
-SEC("sk_msg7")
+SEC("sk_msg5")
int bpf_prog10(struct sk_msg_md *msg)
{
int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop;
@@ -443,7 +292,6 @@ int bpf_prog10(struct sk_msg_md *msg)
pop = bpf_map_lookup_elem(&sock_bytes, &five);
if (start_pop && pop)
bpf_msg_pop_data(msg, *start_pop, *pop, 0);
- bpf_printk("return sk drop\n");
return SK_DROP;
}
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 779e11da979c..c80643828b82 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -54,7 +54,7 @@ static void running_handler(int a);
#define S1_PORT 10000
#define S2_PORT 10001
-#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o"
+#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o"
#define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o"
#define CG_PATH "/sockmap"
@@ -68,9 +68,7 @@ struct bpf_map *maps[8];
int prog_fd[11];
int txmsg_pass;
-int txmsg_noisy;
int txmsg_redir;
-int txmsg_redir_noisy;
int txmsg_drop;
int txmsg_apply;
int txmsg_cork;
@@ -89,15 +87,13 @@ static const struct option long_options[] = {
{"help", no_argument, NULL, 'h' },
{"cgroup", required_argument, NULL, 'c' },
{"rate", required_argument, NULL, 'r' },
- {"verbose", no_argument, NULL, 'v' },
+ {"verbose", optional_argument, NULL, 'v' },
{"iov_count", required_argument, NULL, 'i' },
{"length", required_argument, NULL, 'l' },
{"test", required_argument, NULL, 't' },
{"data_test", no_argument, NULL, 'd' },
{"txmsg", no_argument, &txmsg_pass, 1 },
- {"txmsg_noisy", no_argument, &txmsg_noisy, 1 },
{"txmsg_redir", no_argument, &txmsg_redir, 1 },
- {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1},
{"txmsg_drop", no_argument, &txmsg_drop, 1 },
{"txmsg_apply", required_argument, NULL, 'a'},
{"txmsg_cork", required_argument, NULL, 'k'},
@@ -111,9 +107,104 @@ static const struct option long_options[] = {
{"txmsg_skb", no_argument, &txmsg_skb, 1 },
{"ktls", no_argument, &ktls, 1 },
{"peek", no_argument, &peek_flag, 1 },
+ {"whitelist", required_argument, NULL, 'n' },
+ {"blacklist", required_argument, NULL, 'b' },
{0, 0, NULL, 0 }
};
+struct test_env {
+ const char *type;
+ const char *subtest;
+ const char *prepend;
+
+ int test_num;
+ int subtest_num;
+
+ int succ_cnt;
+ int fail_cnt;
+ int fail_last;
+};
+
+struct test_env env;
+
+struct sockmap_options {
+ int verbose;
+ bool base;
+ bool sendpage;
+ bool data_test;
+ bool drop_expected;
+ int iov_count;
+ int iov_length;
+ int rate;
+ char *map;
+ char *whitelist;
+ char *blacklist;
+ char *prepend;
+};
+
+struct _test {
+ char *title;
+ void (*tester)(int cg_fd, struct sockmap_options *opt);
+};
+
+static void test_start(void)
+{
+ env.subtest_num++;
+}
+
+static void test_fail(void)
+{
+ env.fail_cnt++;
+}
+
+static void test_pass(void)
+{
+ env.succ_cnt++;
+}
+
+static void test_reset(void)
+{
+ txmsg_start = txmsg_end = 0;
+ txmsg_start_pop = txmsg_pop = 0;
+ txmsg_start_push = txmsg_end_push = 0;
+ txmsg_pass = txmsg_drop = txmsg_redir = 0;
+ txmsg_apply = txmsg_cork = 0;
+ txmsg_ingress = txmsg_skb = 0;
+}
+
+static int test_start_subtest(const struct _test *t, struct sockmap_options *o)
+{
+ env.type = o->map;
+ env.subtest = t->title;
+ env.prepend = o->prepend;
+ env.test_num++;
+ env.subtest_num = 0;
+ env.fail_last = env.fail_cnt;
+ test_reset();
+ return 0;
+}
+
+static void test_end_subtest(void)
+{
+ int error = env.fail_cnt - env.fail_last;
+ int type = strcmp(env.type, BPF_SOCKMAP_FILENAME);
+
+ if (!error)
+ test_pass();
+
+ fprintf(stdout, "#%2d/%2d %8s:%s:%s:%s\n",
+ env.test_num, env.subtest_num,
+ !type ? "sockmap" : "sockhash",
+ env.prepend ? : "",
+ env.subtest, error ? "FAIL" : "OK");
+}
+
+static void test_print_results(void)
+{
+ fprintf(stdout, "Pass: %d Fail: %d\n",
+ env.succ_cnt, env.fail_cnt);
+}
+
static void usage(char *argv[])
{
int i;
@@ -296,7 +387,7 @@ static int sockmap_init_sockets(int verbose)
return errno;
}
- if (verbose) {
+ if (verbose > 1) {
printf("connected sockets: c1 <-> p1, c2 <-> p2\n");
printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n",
c1, s1, c2, s2);
@@ -311,17 +402,6 @@ struct msg_stats {
struct timespec end;
};
-struct sockmap_options {
- int verbose;
- bool base;
- bool sendpage;
- bool data_test;
- bool drop_expected;
- int iov_count;
- int iov_length;
- int rate;
-};
-
static int msg_loop_sendpage(int fd, int iov_length, int cnt,
struct msg_stats *s,
struct sockmap_options *opt)
@@ -345,14 +425,18 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt,
clock_gettime(CLOCK_MONOTONIC, &s->start);
for (i = 0; i < cnt; i++) {
- int sent = sendfile(fd, fp, NULL, iov_length);
+ int sent;
+
+ errno = 0;
+ sent = sendfile(fd, fp, NULL, iov_length);
if (!drop && sent < 0) {
- perror("send loop error");
+ perror("sendpage loop error");
fclose(file);
return sent;
} else if (drop && sent >= 0) {
- printf("sendpage loop error expected: %i\n", sent);
+ printf("sendpage loop error expected: %i errno %i\n",
+ sent, errno);
fclose(file);
return -EIO;
}
@@ -464,13 +548,18 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
if (tx) {
clock_gettime(CLOCK_MONOTONIC, &s->start);
for (i = 0; i < cnt; i++) {
- int sent = sendmsg(fd, &msg, flags);
+ int sent;
+
+ errno = 0;
+ sent = sendmsg(fd, &msg, flags);
if (!drop && sent < 0) {
- perror("send loop error");
+ perror("sendmsg loop error");
goto out_errno;
} else if (drop && sent >= 0) {
- printf("send loop error expected: %i\n", sent);
+ fprintf(stderr,
+ "sendmsg loop error expected: %i errno %i\n",
+ sent, errno);
errno = -EIO;
goto out_errno;
}
@@ -497,9 +586,10 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
* paths.
*/
total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
- txmsg_pop_total = txmsg_pop;
if (txmsg_apply)
- txmsg_pop_total *= (total_bytes / txmsg_apply);
+ txmsg_pop_total = txmsg_pop * (total_bytes / txmsg_apply);
+ else
+ txmsg_pop_total = txmsg_pop * cnt;
total_bytes -= txmsg_pop_total;
err = clock_gettime(CLOCK_MONOTONIC, &s->start);
if (err < 0)
@@ -633,14 +723,18 @@ static int sendmsg_test(struct sockmap_options *opt)
rxpid = fork();
if (rxpid == 0) {
+ iov_buf -= (txmsg_pop - txmsg_start_pop + 1);
if (opt->drop_expected)
- exit(0);
+ _exit(0);
+
+ if (!iov_buf) /* zero bytes sent case */
+ _exit(0);
if (opt->sendpage)
iov_count = 1;
err = msg_loop(rx_fd, iov_count, iov_buf,
cnt, &s, false, opt);
- if (opt->verbose)
+ if (opt->verbose > 1)
fprintf(stderr,
"msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
iov_count, iov_buf, cnt, err);
@@ -648,7 +742,7 @@ static int sendmsg_test(struct sockmap_options *opt)
sent_Bps = sentBps(s);
recvd_Bps = recvdBps(s);
}
- if (opt->verbose)
+ if (opt->verbose > 1)
fprintf(stdout,
"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s %s\n",
s.bytes_sent, sent_Bps, sent_Bps/giga,
@@ -678,7 +772,7 @@ static int sendmsg_test(struct sockmap_options *opt)
sent_Bps = sentBps(s);
recvd_Bps = recvdBps(s);
}
- if (opt->verbose)
+ if (opt->verbose > 1)
fprintf(stdout,
"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
s.bytes_sent, sent_Bps, sent_Bps/giga,
@@ -694,14 +788,14 @@ static int sendmsg_test(struct sockmap_options *opt)
if (WIFEXITED(rx_status)) {
err = WEXITSTATUS(rx_status);
if (err) {
- fprintf(stderr, "rx thread exited with err %d. ", err);
+ fprintf(stderr, "rx thread exited with err %d.\n", err);
goto out;
}
}
if (WIFEXITED(tx_status)) {
err = WEXITSTATUS(tx_status);
if (err)
- fprintf(stderr, "tx thread exited with err %d. ", err);
+ fprintf(stderr, "tx thread exited with err %d.\n", err);
}
out:
return err;
@@ -783,6 +877,7 @@ static int forever_ping_pong(int rate, struct sockmap_options *opt)
}
enum {
+ SELFTESTS,
PING_PONG,
SENDMSG,
BASE,
@@ -834,19 +929,14 @@ run:
/* Attach txmsg program to sockmap */
if (txmsg_pass)
tx_prog_fd = prog_fd[3];
- else if (txmsg_noisy)
- tx_prog_fd = prog_fd[4];
else if (txmsg_redir)
+ tx_prog_fd = prog_fd[4];
+ else if (txmsg_apply)
tx_prog_fd = prog_fd[5];
- else if (txmsg_redir_noisy)
+ else if (txmsg_cork)
tx_prog_fd = prog_fd[6];
else if (txmsg_drop)
- tx_prog_fd = prog_fd[9];
- /* apply and cork must be last */
- else if (txmsg_apply)
tx_prog_fd = prog_fd[7];
- else if (txmsg_cork)
- tx_prog_fd = prog_fd[8];
else
tx_prog_fd = 0;
@@ -870,7 +960,7 @@ run:
goto out;
}
- if (txmsg_redir || txmsg_redir_noisy)
+ if (txmsg_redir)
redir_fd = c2;
else
redir_fd = c1;
@@ -1112,12 +1202,8 @@ static void test_options(char *options)
if (txmsg_pass)
strncat(options, "pass,", OPTSTRING);
- if (txmsg_noisy)
- strncat(options, "pass_noisy,", OPTSTRING);
if (txmsg_redir)
strncat(options, "redir,", OPTSTRING);
- if (txmsg_redir_noisy)
- strncat(options, "redir_noisy,", OPTSTRING);
if (txmsg_drop)
strncat(options, "drop,", OPTSTRING);
if (txmsg_apply) {
@@ -1168,416 +1254,283 @@ static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
test_options(options);
- fprintf(stdout,
- "[TEST %i]: (%i, %i, %i, %s, %s): ",
- test_cnt, opt->rate, opt->iov_count, opt->iov_length,
- test_to_str(test), options);
- fflush(stdout);
+ if (opt->verbose) {
+ fprintf(stdout,
+ " [TEST %i]: (%i, %i, %i, %s, %s): ",
+ test_cnt, opt->rate, opt->iov_count, opt->iov_length,
+ test_to_str(test), options);
+ fflush(stdout);
+ }
err = run_options(opt, cgrp, test);
- fprintf(stdout, "%s\n", !err ? "PASS" : "FAILED");
+ if (opt->verbose)
+ fprintf(stdout, " %s\n", !err ? "PASS" : "FAILED");
test_cnt++;
!err ? passed++ : failed++;
free(options);
return err;
}
-static int test_exec(int cgrp, struct sockmap_options *opt)
-{
- int err = __test_exec(cgrp, SENDMSG, opt);
-
- if (err)
- goto out;
-
- err = __test_exec(cgrp, SENDPAGE, opt);
-out:
- return err;
-}
-
-static int test_loop(int cgrp)
-{
- struct sockmap_options opt;
-
- int err, i, l, r;
-
- opt.verbose = 0;
- opt.base = false;
- opt.sendpage = false;
- opt.data_test = false;
- opt.drop_expected = false;
- opt.iov_count = 0;
- opt.iov_length = 0;
- opt.rate = 0;
-
- r = 1;
- for (i = 1; i < 100; i += 33) {
- for (l = 1; l < 100; l += 33) {
- opt.rate = r;
- opt.iov_count = i;
- opt.iov_length = l;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
- }
- }
- sched_yield();
-out:
- return err;
-}
-
-static int test_txmsg(int cgrp)
+static void test_exec(int cgrp, struct sockmap_options *opt)
{
+ int type = strcmp(opt->map, BPF_SOCKMAP_FILENAME);
int err;
- txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
- txmsg_apply = txmsg_cork = 0;
- txmsg_ingress = txmsg_skb = 0;
-
- txmsg_pass = 1;
- err = test_loop(cgrp);
- txmsg_pass = 0;
- if (err)
- goto out;
-
- txmsg_redir = 1;
- err = test_loop(cgrp);
- txmsg_redir = 0;
- if (err)
- goto out;
-
- txmsg_drop = 1;
- err = test_loop(cgrp);
- txmsg_drop = 0;
- if (err)
- goto out;
-
- txmsg_redir = 1;
- txmsg_ingress = 1;
- err = test_loop(cgrp);
- txmsg_redir = 0;
- txmsg_ingress = 0;
- if (err)
- goto out;
-out:
- txmsg_pass = 0;
- txmsg_redir = 0;
- txmsg_drop = 0;
- return err;
+ if (type == 0) {
+ test_start();
+ err = __test_exec(cgrp, SENDMSG, opt);
+ if (err)
+ test_fail();
+ } else {
+ test_start();
+ err = __test_exec(cgrp, SENDPAGE, opt);
+ if (err)
+ test_fail();
+ }
}
-static int test_send(struct sockmap_options *opt, int cgrp)
+static void test_send_one(struct sockmap_options *opt, int cgrp)
{
- int err;
-
opt->iov_length = 1;
opt->iov_count = 1;
opt->rate = 1;
- err = test_exec(cgrp, opt);
- if (err)
- goto out;
+ test_exec(cgrp, opt);
opt->iov_length = 1;
opt->iov_count = 1024;
opt->rate = 1;
- err = test_exec(cgrp, opt);
- if (err)
- goto out;
+ test_exec(cgrp, opt);
opt->iov_length = 1024;
opt->iov_count = 1;
opt->rate = 1;
- err = test_exec(cgrp, opt);
- if (err)
- goto out;
+ test_exec(cgrp, opt);
- opt->iov_length = 1;
+}
+
+static void test_send_many(struct sockmap_options *opt, int cgrp)
+{
+ opt->iov_length = 3;
opt->iov_count = 1;
opt->rate = 512;
- err = test_exec(cgrp, opt);
- if (err)
- goto out;
-
- opt->iov_length = 256;
- opt->iov_count = 1024;
- opt->rate = 2;
- err = test_exec(cgrp, opt);
- if (err)
- goto out;
+ test_exec(cgrp, opt);
opt->rate = 100;
opt->iov_count = 1;
opt->iov_length = 5;
- err = test_exec(cgrp, opt);
- if (err)
- goto out;
-out:
- sched_yield();
- return err;
+ test_exec(cgrp, opt);
}
-static int test_mixed(int cgrp)
+static void test_send_large(struct sockmap_options *opt, int cgrp)
{
- struct sockmap_options opt = {0};
- int err;
+ opt->iov_length = 256;
+ opt->iov_count = 1024;
+ opt->rate = 2;
+ test_exec(cgrp, opt);
+}
- txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
- txmsg_apply = txmsg_cork = 0;
- txmsg_start = txmsg_end = 0;
- txmsg_start_push = txmsg_end_push = 0;
- txmsg_start_pop = txmsg_pop = 0;
+static void test_send(struct sockmap_options *opt, int cgrp)
+{
+ test_send_one(opt, cgrp);
+ test_send_many(opt, cgrp);
+ test_send_large(opt, cgrp);
+ sched_yield();
+}
+static void test_txmsg_pass(int cgrp, struct sockmap_options *opt)
+{
/* Test small and large iov_count values with pass/redir/apply/cork */
txmsg_pass = 1;
- txmsg_redir = 0;
- txmsg_apply = 1;
- txmsg_cork = 0;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ test_send(opt, cgrp);
+}
- txmsg_pass = 1;
- txmsg_redir = 0;
- txmsg_apply = 0;
- txmsg_cork = 1;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+static void test_txmsg_redir(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_redir = 1;
+ test_send(opt, cgrp);
+}
- txmsg_pass = 1;
- txmsg_redir = 0;
- txmsg_apply = 1;
- txmsg_cork = 1;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+static void test_txmsg_drop(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_drop = 1;
+ test_send(opt, cgrp);
+}
- txmsg_pass = 1;
- txmsg_redir = 0;
- txmsg_apply = 1024;
- txmsg_cork = 0;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_pass = txmsg_drop = 0;
+ txmsg_ingress = txmsg_redir = 1;
+ test_send(opt, cgrp);
+}
+/* Test cork with hung data. This tests poor usage patterns where
+ * cork can leave data on the ring if user program is buggy and
+ * doesn't flush them somehow. They do take some time however
+ * because they wait for a timeout. Test pass, redir and cork with
+ * apply logic. Use cork size of 4097 with send_large to avoid
+ * aligning cork size with send size.
+ */
+static void test_txmsg_cork_hangs(int cgrp, struct sockmap_options *opt)
+{
txmsg_pass = 1;
txmsg_redir = 0;
+ txmsg_cork = 4097;
+ txmsg_apply = 4097;
+ test_send_large(opt, cgrp);
+
+ txmsg_pass = 0;
+ txmsg_redir = 1;
txmsg_apply = 0;
- txmsg_cork = 1024;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ txmsg_cork = 4097;
+ test_send_large(opt, cgrp);
- txmsg_pass = 1;
+ txmsg_pass = 0;
+ txmsg_redir = 1;
+ txmsg_apply = 4097;
+ txmsg_cork = 4097;
+ test_send_large(opt, cgrp);
+}
+
+static void test_txmsg_pull(int cgrp, struct sockmap_options *opt)
+{
+ /* Test basic start/end */
+ txmsg_start = 1;
+ txmsg_end = 2;
+ test_send(opt, cgrp);
+
+ /* Test >4k pull */
+ txmsg_start = 4096;
+ txmsg_end = 9182;
+ test_send_large(opt, cgrp);
+
+ /* Test pull + redirect */
txmsg_redir = 0;
- txmsg_apply = 1024;
- txmsg_cork = 1024;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ txmsg_start = 1;
+ txmsg_end = 2;
+ test_send(opt, cgrp);
- txmsg_pass = 1;
+ /* Test pull + cork */
txmsg_redir = 0;
- txmsg_cork = 4096;
- txmsg_apply = 4096;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ txmsg_cork = 512;
+ txmsg_start = 1;
+ txmsg_end = 2;
+ test_send_many(opt, cgrp);
- txmsg_pass = 0;
+ /* Test pull + cork + redirect */
txmsg_redir = 1;
- txmsg_apply = 1;
- txmsg_cork = 0;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ txmsg_cork = 512;
+ txmsg_start = 1;
+ txmsg_end = 2;
+ test_send_many(opt, cgrp);
+}
- txmsg_pass = 0;
- txmsg_redir = 1;
- txmsg_apply = 0;
- txmsg_cork = 1;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+static void test_txmsg_pop(int cgrp, struct sockmap_options *opt)
+{
+ /* Test basic pop */
+ txmsg_start_pop = 1;
+ txmsg_pop = 2;
+ test_send_many(opt, cgrp);
- txmsg_pass = 0;
- txmsg_redir = 1;
- txmsg_apply = 1024;
- txmsg_cork = 0;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ /* Test pop with >4k */
+ txmsg_start_pop = 4096;
+ txmsg_pop = 4096;
+ test_send_large(opt, cgrp);
- txmsg_pass = 0;
+ /* Test pop + redirect */
txmsg_redir = 1;
- txmsg_apply = 0;
- txmsg_cork = 1024;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ txmsg_start_pop = 1;
+ txmsg_pop = 2;
+ test_send_many(opt, cgrp);
- txmsg_pass = 0;
- txmsg_redir = 1;
- txmsg_apply = 1024;
- txmsg_cork = 1024;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ /* Test pop + cork */
+ txmsg_redir = 0;
+ txmsg_cork = 512;
+ txmsg_start_pop = 1;
+ txmsg_pop = 2;
+ test_send_many(opt, cgrp);
- txmsg_pass = 0;
+ /* Test pop + redirect + cork */
txmsg_redir = 1;
- txmsg_cork = 4096;
- txmsg_apply = 4096;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
-out:
- return err;
+ txmsg_cork = 4;
+ txmsg_start_pop = 1;
+ txmsg_pop = 2;
+ test_send_many(opt, cgrp);
}
-static int test_start_end(int cgrp)
+static void test_txmsg_push(int cgrp, struct sockmap_options *opt)
{
- struct sockmap_options opt = {0};
- int err, i;
+ /* Test basic push */
+ txmsg_start_push = 1;
+ txmsg_end_push = 1;
+ test_send(opt, cgrp);
- /* Test basic start/end with lots of iov_count and iov_lengths */
- txmsg_start = 1;
- txmsg_end = 2;
+ /* Test push 4kB >4k */
+ txmsg_start_push = 4096;
+ txmsg_end_push = 4096;
+ test_send_large(opt, cgrp);
+
+ /* Test push + redirect */
+ txmsg_redir = 1;
txmsg_start_push = 1;
txmsg_end_push = 2;
- txmsg_start_pop = 1;
- txmsg_pop = 1;
- err = test_txmsg(cgrp);
- if (err)
- goto out;
+ test_send_many(opt, cgrp);
- /* Cut a byte of pushed data but leave reamining in place */
- txmsg_start = 1;
- txmsg_end = 2;
+ /* Test push + cork */
+ txmsg_redir = 0;
+ txmsg_cork = 512;
txmsg_start_push = 1;
- txmsg_end_push = 3;
- txmsg_start_pop = 1;
- txmsg_pop = 1;
- err = test_txmsg(cgrp);
- if (err)
- goto out;
-
- /* Test start/end with cork */
- opt.rate = 16;
- opt.iov_count = 1;
- opt.iov_length = 100;
- txmsg_cork = 1600;
-
- txmsg_start_pop = 0;
- txmsg_pop = 0;
-
- for (i = 99; i <= 1600; i += 500) {
- txmsg_start = 0;
- txmsg_end = i;
- txmsg_start_push = 0;
- txmsg_end_push = i;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
- }
-
- /* Test pop data in middle of cork */
- for (i = 99; i <= 1600; i += 500) {
- txmsg_start_pop = 10;
- txmsg_pop = i;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
- }
- txmsg_start_pop = 0;
- txmsg_pop = 0;
-
- /* Test start/end with cork but pull data in middle */
- for (i = 199; i <= 1600; i += 500) {
- txmsg_start = 100;
- txmsg_end = i;
- txmsg_start_push = 100;
- txmsg_end_push = i;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
- }
-
- /* Test start/end with cork pulling last sg entry */
- txmsg_start = 1500;
- txmsg_end = 1600;
- txmsg_start_push = 1500;
- txmsg_end_push = 1600;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+ txmsg_end_push = 2;
+ test_send_many(opt, cgrp);
+}
- /* Test pop with cork pulling last sg entry */
- txmsg_start_pop = 1500;
- txmsg_pop = 1600;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
- txmsg_start_pop = 0;
- txmsg_pop = 0;
-
- /* Test start/end pull of single byte in last page */
- txmsg_start = 1111;
- txmsg_end = 1112;
- txmsg_start_push = 1111;
- txmsg_end_push = 1112;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+static void test_txmsg_push_pop(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_start_push = 1;
+ txmsg_end_push = 10;
+ txmsg_start_pop = 5;
+ txmsg_pop = 4;
+ test_send_large(opt, cgrp);
+}
- /* Test pop of single byte in last page */
- txmsg_start_pop = 1111;
- txmsg_pop = 1112;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+static void test_txmsg_apply(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_pass = 1;
+ txmsg_redir = 0;
+ txmsg_apply = 1;
+ txmsg_cork = 0;
+ test_send_one(opt, cgrp);
- /* Test start/end with end < start */
- txmsg_start = 1111;
- txmsg_end = 0;
- txmsg_start_push = 1111;
- txmsg_end_push = 0;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+ txmsg_pass = 0;
+ txmsg_redir = 1;
+ txmsg_apply = 1;
+ txmsg_cork = 0;
+ test_send_one(opt, cgrp);
- /* Test start/end with end > data */
- txmsg_start = 0;
- txmsg_end = 1601;
- txmsg_start_push = 0;
- txmsg_end_push = 1601;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+ txmsg_pass = 1;
+ txmsg_redir = 0;
+ txmsg_apply = 1024;
+ txmsg_cork = 0;
+ test_send_large(opt, cgrp);
- /* Test start/end with start > data */
- txmsg_start = 1601;
- txmsg_end = 1600;
- txmsg_start_push = 1601;
- txmsg_end_push = 1600;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+ txmsg_pass = 0;
+ txmsg_redir = 1;
+ txmsg_apply = 1024;
+ txmsg_cork = 0;
+ test_send_large(opt, cgrp);
+}
- /* Test pop with start > data */
- txmsg_start_pop = 1601;
- txmsg_pop = 1;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+static void test_txmsg_cork(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_pass = 1;
+ txmsg_redir = 0;
+ txmsg_apply = 0;
+ txmsg_cork = 1;
+ test_send(opt, cgrp);
- /* Test pop with pop range > data */
- txmsg_start_pop = 1599;
- txmsg_pop = 10;
- err = test_exec(cgrp, &opt);
-out:
- txmsg_start = 0;
- txmsg_end = 0;
- sched_yield();
- return err;
+ txmsg_pass = 1;
+ txmsg_redir = 0;
+ txmsg_apply = 1;
+ txmsg_cork = 1;
+ test_send(opt, cgrp);
}
char *map_names[] = {
@@ -1662,73 +1615,116 @@ static int populate_progs(char *bpf_file)
return 0;
}
-static int __test_suite(int cg_fd, char *bpf_file)
+struct _test test[] = {
+ {"txmsg test passthrough", test_txmsg_pass},
+ {"txmsg test redirect", test_txmsg_redir},
+ {"txmsg test drop", test_txmsg_drop},
+ {"txmsg test ingress redirect", test_txmsg_ingress_redir},
+ {"txmsg test apply", test_txmsg_apply},
+ {"txmsg test cork", test_txmsg_cork},
+ {"txmsg test hanging corks", test_txmsg_cork_hangs},
+ {"txmsg test push_data", test_txmsg_push},
+ {"txmsg test pull-data", test_txmsg_pull},
+ {"txmsg test pop-data", test_txmsg_pop},
+ {"txmsg test push/pop data", test_txmsg_push_pop},
+};
+
+static int check_whitelist(struct _test *t, struct sockmap_options *opt)
{
- int err, cleanup = cg_fd;
+ char *entry, *ptr;
+
+ if (!opt->whitelist)
+ return 0;
+ ptr = strdup(opt->whitelist);
+ if (!ptr)
+ return -ENOMEM;
+ entry = strtok(ptr, ",");
+ while (entry) {
+ if ((opt->prepend && strstr(opt->prepend, entry) != 0) ||
+ strstr(opt->map, entry) != 0 ||
+ strstr(t->title, entry) != 0)
+ return 0;
+ entry = strtok(NULL, ",");
+ }
+ return -EINVAL;
+}
- err = populate_progs(bpf_file);
+static int check_blacklist(struct _test *t, struct sockmap_options *opt)
+{
+ char *entry, *ptr;
+
+ if (!opt->blacklist)
+ return -EINVAL;
+ ptr = strdup(opt->blacklist);
+ if (!ptr)
+ return -ENOMEM;
+ entry = strtok(ptr, ",");
+ while (entry) {
+ if ((opt->prepend && strstr(opt->prepend, entry) != 0) ||
+ strstr(opt->map, entry) != 0 ||
+ strstr(t->title, entry) != 0)
+ return 0;
+ entry = strtok(NULL, ",");
+ }
+ return -EINVAL;
+}
+
+static int __test_selftests(int cg_fd, struct sockmap_options *opt)
+{
+ int i, err;
+
+ err = populate_progs(opt->map);
if (err < 0) {
fprintf(stderr, "ERROR: (%i) load bpf failed\n", err);
return err;
}
- if (cg_fd < 0) {
- if (setup_cgroup_environment()) {
- fprintf(stderr, "ERROR: cgroup env failed\n");
- return -EINVAL;
- }
+ /* Tests basic commands and APIs */
+ for (i = 0; i < sizeof(test)/sizeof(struct _test); i++) {
+ struct _test t = test[i];
- cg_fd = create_and_get_cgroup(CG_PATH);
- if (cg_fd < 0) {
- fprintf(stderr,
- "ERROR: (%i) open cg path failed: %s\n",
- cg_fd, optarg);
- return cg_fd;
- }
+ if (check_whitelist(&t, opt) != 0)
+ continue;
+ if (check_blacklist(&t, opt) == 0)
+ continue;
- if (join_cgroup(CG_PATH)) {
- fprintf(stderr, "ERROR: failed to join cgroup\n");
- return -EINVAL;
- }
+ test_start_subtest(&t, opt);
+ t.tester(cg_fd, opt);
+ test_end_subtest();
}
- /* Tests basic commands and APIs with range of iov values */
- txmsg_start = txmsg_end = txmsg_start_push = txmsg_end_push = 0;
- err = test_txmsg(cg_fd);
- if (err)
- goto out;
+ return err;
+}
- /* Tests interesting combinations of APIs used together */
- err = test_mixed(cg_fd);
- if (err)
- goto out;
+static void test_selftests_sockmap(int cg_fd, struct sockmap_options *opt)
+{
+ opt->map = BPF_SOCKMAP_FILENAME;
+ __test_selftests(cg_fd, opt);
+}
- /* Tests pull_data API using start/end API */
- err = test_start_end(cg_fd);
- if (err)
- goto out;
+static void test_selftests_sockhash(int cg_fd, struct sockmap_options *opt)
+{
+ opt->map = BPF_SOCKHASH_FILENAME;
+ __test_selftests(cg_fd, opt);
+}
-out:
- printf("Summary: %i PASSED %i FAILED\n", passed, failed);
- if (cleanup < 0) {
- cleanup_cgroup_environment();
- close(cg_fd);
- }
- return err;
+static void test_selftests_ktls(int cg_fd, struct sockmap_options *opt)
+{
+ opt->map = BPF_SOCKHASH_FILENAME;
+ opt->prepend = "ktls";
+ ktls = 1;
+ __test_selftests(cg_fd, opt);
+ ktls = 0;
}
-static int test_suite(int cg_fd)
+static int test_selftest(int cg_fd, struct sockmap_options *opt)
{
- int err;
- err = __test_suite(cg_fd, BPF_SOCKMAP_FILENAME);
- if (err)
- goto out;
- err = __test_suite(cg_fd, BPF_SOCKHASH_FILENAME);
-out:
- if (cg_fd > -1)
- close(cg_fd);
- return err;
+ test_selftests_sockmap(cg_fd, opt);
+ test_selftests_sockhash(cg_fd, opt);
+ test_selftests_ktls(cg_fd, opt);
+ test_print_results();
+ return 0;
}
int main(int argc, char **argv)
@@ -1737,12 +1733,10 @@ int main(int argc, char **argv)
struct sockmap_options options = {0};
int opt, longindex, err, cg_fd = 0;
char *bpf_file = BPF_SOCKMAP_FILENAME;
- int test = PING_PONG;
+ int test = SELFTESTS;
+ bool cg_created = 0;
- if (argc < 2)
- return test_suite(-1);
-
- while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:",
+ while ((opt = getopt_long(argc, argv, ":dhv:c:r:i:l:t:p:q:n:b:",
long_options, &longindex)) != -1) {
switch (opt) {
case 's':
@@ -1783,6 +1777,8 @@ int main(int argc, char **argv)
break;
case 'v':
options.verbose = 1;
+ if (optarg)
+ options.verbose = atoi(optarg);
break;
case 'i':
iov_count = atoi(optarg);
@@ -1809,6 +1805,15 @@ int main(int argc, char **argv)
return -1;
}
break;
+ case 'n':
+ options.whitelist = strdup(optarg);
+ if (!options.whitelist)
+ return -ENOMEM;
+ break;
+ case 'b':
+ options.blacklist = strdup(optarg);
+ if (!options.blacklist)
+ return -ENOMEM;
case 0:
break;
case 'h':
@@ -1818,13 +1823,30 @@ int main(int argc, char **argv)
}
}
- if (argc <= 3 && cg_fd)
- return test_suite(cg_fd);
-
if (!cg_fd) {
- fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
- argv[0]);
- return -1;
+ if (setup_cgroup_environment()) {
+ fprintf(stderr, "ERROR: cgroup env failed\n");
+ return -EINVAL;
+ }
+
+ cg_fd = create_and_get_cgroup(CG_PATH);
+ if (cg_fd < 0) {
+ fprintf(stderr,
+ "ERROR: (%i) open cg path failed: %s\n",
+ cg_fd, strerror(errno));
+ return cg_fd;
+ }
+
+ if (join_cgroup(CG_PATH)) {
+ fprintf(stderr, "ERROR: failed to join cgroup\n");
+ return -EINVAL;
+ }
+ cg_created = 1;
+ }
+
+ if (test == SELFTESTS) {
+ err = test_selftest(cg_fd, &options);
+ goto out;
}
err = populate_progs(bpf_file);
@@ -1843,6 +1865,13 @@ int main(int argc, char **argv)
options.rate = rate;
err = run_options(&options, cg_fd, test);
+out:
+ if (options.whitelist)
+ free(options.whitelist);
+ if (options.blacklist)
+ free(options.blacklist);
+ if (cg_created)
+ cleanup_cgroup_environment();
close(cg_fd);
return err;
}
diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c
index 604b46151736..056e0273bf12 100644
--- a/tools/testing/selftests/bpf/verifier/ref_tracking.c
+++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c
@@ -821,3 +821,36 @@
.result = REJECT,
.errstr = "invalid mem access",
},
+{
+ "reference tracking: branch tracking valid pointer null comparison",
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, 1),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "reference tracking: branch tracking valid pointer value comparison",
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 4),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 1234, 2),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "Unreleased reference",
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/value_or_null.c b/tools/testing/selftests/bpf/verifier/value_or_null.c
index 860d4a71cd83..3ecb70a3d939 100644
--- a/tools/testing/selftests/bpf/verifier/value_or_null.c
+++ b/tools/testing/selftests/bpf/verifier/value_or_null.c
@@ -150,3 +150,22 @@
.result_unpriv = REJECT,
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
},
+{
+ "map lookup and null branch prediction",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 10),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_10, 10),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 4 },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},