From 1b66d253610c7f8f257103808a9460223a087469 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2020 00:45:45 +0200 Subject: bpf: Add get{peer, sock}name attach types for sock_addr As stated in 983695fa6765 ("bpf: fix unconnected udp hooks"), the objective for the existing cgroup connect/sendmsg/recvmsg/bind BPF hooks is to be transparent to applications. In Cilium we make use of these hooks [0] in order to enable E-W load balancing for existing Kubernetes service types for all Cilium managed nodes in the cluster. Those backends can be local or remote. The main advantage of this approach is that it operates as close as possible to the socket, and therefore allows to avoid packet-based NAT given in connect/sendmsg/recvmsg hooks we only need to xlate sock addresses. This also allows to expose NodePort services on loopback addresses in the host namespace, for example. As another advantage, this also efficiently blocks bind requests for applications in the host namespace for exposed ports. However, one missing item is that we also need to perform reverse xlation for inet{,6}_getname() hooks such that we can return the service IP/port tuple back to the application instead of the remote peer address. The vast majority of applications does not bother about getpeername(), but in a few occasions we've seen breakage when validating the peer's address since it returns unexpectedly the backend tuple instead of the service one. Therefore, this trivial patch allows to customise and adds a getpeername() as well as getsockname() BPF cgroup hook for both IPv4 and IPv6 in order to address this situation. Simple example: # ./cilium/cilium service list ID Frontend Service Type Backend 1 1.2.3.4:80 ClusterIP 1 => 10.0.0.10:80 Before; curl's verbose output example, no getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (10.0.0.10) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] After; with getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (1.2.3.4) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] Originally, I had both under a BPF_CGROUP_INET{4,6}_GETNAME type and exposed peer to the context similar as in inet{,6}_getname() fashion, but API-wise this is suboptimal as it always enforces programs having to test for ctx->peer which can easily be missed, hence BPF_CGROUP_INET{4,6}_GET{PEER,SOCK}NAME split. Similarly, the checked return code is on tnum_range(1, 1), but if a use case comes up in future, it can easily be changed to return an error code instead. Helper and ctx member access is the same as with connect/sendmsg/etc hooks. [0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/61a479d759b2482ae3efb45546490bacd796a220.1589841594.git.daniel@iogearbox.net --- include/linux/bpf-cgroup.h | 1 + include/uapi/linux/bpf.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 272626cc3fc9..c66c545e161a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -396,6 +396,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, } #define cgroup_bpf_enabled (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9b8a0f63b91..97e1fd19ff58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 44ac082b30dc2a05a7e23ed7e17b5f9513873386 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:20:49 +0200 Subject: xsk: Fix xsk_umem_xdp_frame_sz() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calculating the "data_hard_end" for an XDP buffer coming from AF_XDP zero-copy mode, the return value of xsk_umem_xdp_frame_sz() is added to "data_hard_start". Currently, the chunk size of the UMEM is returned by xsk_umem_xdp_frame_sz(). This is not correct, if the fixed UMEM headroom is non-zero. Fix this by returning the chunk_size without the UMEM headroom. Fixes: 2a637c5b1aaf ("xdp: For Intel AF_XDP drivers add XDP frame_sz") Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-2-bjorn.topel@gmail.com --- include/net/xdp_sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index abd72de25fa4..6b1137ce1692 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -239,7 +239,7 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) { - return umem->chunk_size_nohr + umem->headroom; + return umem->chunk_size_nohr; } #else -- cgit v1.2.3 From d20a1676df7e4c3c23d73299159811a50e4854bc Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:20:50 +0200 Subject: xsk: Move xskmap.c to net/xdp/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XSKMAP is partly implemented by net/xdp/xsk.c. Move xskmap.c from kernel/bpf/ to net/xdp/, which is the logical place for AF_XDP related code. Also, move AF_XDP struct definitions, and function declarations only used by AF_XDP internals into net/xdp/xsk.h. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-3-bjorn.topel@gmail.com --- include/net/xdp_sock.h | 20 ---- kernel/bpf/Makefile | 3 - kernel/bpf/xskmap.c | 265 ------------------------------------------------ net/xdp/Makefile | 2 +- net/xdp/xsk.h | 16 +++ net/xdp/xskmap.c | 267 +++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 284 insertions(+), 289 deletions(-) delete mode 100644 kernel/bpf/xskmap.c create mode 100644 net/xdp/xskmap.c (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 6b1137ce1692..8f3f6f5b0dfe 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -65,22 +65,12 @@ struct xdp_umem { struct list_head xsk_tx_list; }; -/* Nodes are linked in the struct xdp_sock map_list field, and used to - * track which maps a certain socket reside in. - */ - struct xsk_map { struct bpf_map map; spinlock_t lock; /* Synchronize map updates */ struct xdp_sock *xsk_map[]; }; -struct xsk_map_node { - struct list_head node; - struct xsk_map *map; - struct xdp_sock **map_entry; -}; - struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; @@ -114,7 +104,6 @@ struct xdp_sock { struct xdp_buff; #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); -bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); /* Used from netdev driver */ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); @@ -133,10 +122,6 @@ void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); -void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry); -int xsk_map_inc(struct xsk_map *map); -void xsk_map_put(struct xsk_map *map); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); @@ -248,11 +233,6 @@ static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return -ENOTSUPP; } -static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) -{ - return false; -} - static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) { return false; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 37b2d8620153..375b933010dd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -12,9 +12,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -ifeq ($(CONFIG_XDP_SOCKETS),y) -obj-$(CONFIG_BPF_SYSCALL) += xskmap.o -endif obj-$(CONFIG_BPF_SYSCALL) += offload.o endif ifeq ($(CONFIG_PERF_EVENTS),y) diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c deleted file mode 100644 index 2cc5c8f4c800..000000000000 --- a/kernel/bpf/xskmap.c +++ /dev/null @@ -1,265 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* XSKMAP used for AF_XDP sockets - * Copyright(c) 2018 Intel Corporation. - */ - -#include -#include -#include -#include -#include - -int xsk_map_inc(struct xsk_map *map) -{ - bpf_map_inc(&map->map); - return 0; -} - -void xsk_map_put(struct xsk_map *map) -{ - bpf_map_put(&map->map); -} - -static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *node; - int err; - - node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); - if (!node) - return ERR_PTR(-ENOMEM); - - err = xsk_map_inc(map); - if (err) { - kfree(node); - return ERR_PTR(err); - } - - node->map = map; - node->map_entry = map_entry; - return node; -} - -static void xsk_map_node_free(struct xsk_map_node *node) -{ - xsk_map_put(node->map); - kfree(node); -} - -static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) -{ - spin_lock_bh(&xs->map_list_lock); - list_add_tail(&node->node, &xs->map_list); - spin_unlock_bh(&xs->map_list_lock); -} - -static void xsk_map_sock_delete(struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *n, *tmp; - - spin_lock_bh(&xs->map_list_lock); - list_for_each_entry_safe(n, tmp, &xs->map_list, node) { - if (map_entry == n->map_entry) { - list_del(&n->node); - xsk_map_node_free(n); - } - } - spin_unlock_bh(&xs->map_list_lock); -} - -static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) -{ - struct bpf_map_memory mem; - int err, numa_node; - struct xsk_map *m; - u64 size; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || - attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) - return ERR_PTR(-EINVAL); - - numa_node = bpf_map_attr_numa_node(attr); - size = struct_size(m, xsk_map, attr->max_entries); - - err = bpf_map_charge_init(&mem, size); - if (err < 0) - return ERR_PTR(err); - - m = bpf_map_area_alloc(size, numa_node); - if (!m) { - bpf_map_charge_finish(&mem); - return ERR_PTR(-ENOMEM); - } - - bpf_map_init_from_attr(&m->map, attr); - bpf_map_charge_move(&m->map.memory, &mem); - spin_lock_init(&m->lock); - - return &m->map; -} - -static void xsk_map_free(struct bpf_map *map) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - - bpf_clear_redirect_map(map); - synchronize_net(); - bpf_map_area_free(m); -} - -static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - u32 index = key ? *(u32 *)key : U32_MAX; - u32 *next = next_key; - - if (index >= m->map.max_entries) { - *next = 0; - return 0; - } - - if (index == m->map.max_entries - 1) - return -ENOENT; - *next = index + 1; - return 0; -} - -static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) -{ - const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; - struct bpf_insn *insn = insn_buf; - - *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); - *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); - *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); - *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); - *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *insn++ = BPF_MOV64_IMM(ret, 0); - return insn - insn_buf; -} - -static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return __xsk_map_lookup_elem(map, *(u32 *)key); -} - -static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs, *old_xs, **map_entry; - u32 i = *(u32 *)key, fd = *(u32 *)value; - struct xsk_map_node *node; - struct socket *sock; - int err; - - if (unlikely(map_flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(i >= m->map.max_entries)) - return -E2BIG; - - sock = sockfd_lookup(fd, &err); - if (!sock) - return err; - - if (sock->sk->sk_family != PF_XDP) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - xs = (struct xdp_sock *)sock->sk; - - if (!xsk_is_setup_for_bpf_map(xs)) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - map_entry = &m->xsk_map[i]; - node = xsk_map_node_alloc(m, map_entry); - if (IS_ERR(node)) { - sockfd_put(sock); - return PTR_ERR(node); - } - - spin_lock_bh(&m->lock); - old_xs = READ_ONCE(*map_entry); - if (old_xs == xs) { - err = 0; - goto out; - } else if (old_xs && map_flags == BPF_NOEXIST) { - err = -EEXIST; - goto out; - } else if (!old_xs && map_flags == BPF_EXIST) { - err = -ENOENT; - goto out; - } - xsk_map_sock_add(xs, node); - WRITE_ONCE(*map_entry, xs); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - sockfd_put(sock); - return 0; - -out: - spin_unlock_bh(&m->lock); - sockfd_put(sock); - xsk_map_node_free(node); - return err; -} - -static int xsk_map_delete_elem(struct bpf_map *map, void *key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs, **map_entry; - int k = *(u32 *)key; - - if (k >= map->max_entries) - return -EINVAL; - - spin_lock_bh(&m->lock); - map_entry = &m->xsk_map[k]; - old_xs = xchg(map_entry, NULL); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - - return 0; -} - -void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - spin_lock_bh(&map->lock); - if (READ_ONCE(*map_entry) == xs) { - WRITE_ONCE(*map_entry, NULL); - xsk_map_sock_delete(xs, map_entry); - } - spin_unlock_bh(&map->lock); -} - -const struct bpf_map_ops xsk_map_ops = { - .map_alloc = xsk_map_alloc, - .map_free = xsk_map_free, - .map_get_next_key = xsk_map_get_next_key, - .map_lookup_elem = xsk_map_lookup_elem, - .map_gen_lookup = xsk_map_gen_lookup, - .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, - .map_update_elem = xsk_map_update_elem, - .map_delete_elem = xsk_map_delete_elem, - .map_check_btf = map_check_no_btf, -}; diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 71e2bdafb2ce..90b5460d6166 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index 4cfd106bdb53..d6a0979050e6 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -17,9 +17,25 @@ struct xdp_mmap_offsets_v1 { struct xdp_ring_offset_v1 cr; }; +/* Nodes are linked in the struct xdp_sock map_list field, and used to + * track which maps a certain socket reside in. + */ + +struct xsk_map_node { + struct list_head node; + struct xsk_map *map; + struct xdp_sock **map_entry; +}; + static inline struct xdp_sock *xdp_sk(struct sock *sk) { return (struct xdp_sock *)sk; } +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock **map_entry); +int xsk_map_inc(struct xsk_map *map); +void xsk_map_put(struct xsk_map *map); + #endif /* XSK_H_ */ diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c new file mode 100644 index 000000000000..1dc7208c71ba --- /dev/null +++ b/net/xdp/xskmap.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XSKMAP used for AF_XDP sockets + * Copyright(c) 2018 Intel Corporation. + */ + +#include +#include +#include +#include +#include + +#include "xsk.h" + +int xsk_map_inc(struct xsk_map *map) +{ + bpf_map_inc(&map->map); + return 0; +} + +void xsk_map_put(struct xsk_map *map) +{ + bpf_map_put(&map->map); +} + +static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *node; + int err; + + node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return ERR_PTR(-ENOMEM); + + err = xsk_map_inc(map); + if (err) { + kfree(node); + return ERR_PTR(err); + } + + node->map = map; + node->map_entry = map_entry; + return node; +} + +static void xsk_map_node_free(struct xsk_map_node *node) +{ + xsk_map_put(node->map); + kfree(node); +} + +static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) +{ + spin_lock_bh(&xs->map_list_lock); + list_add_tail(&node->node, &xs->map_list); + spin_unlock_bh(&xs->map_list_lock); +} + +static void xsk_map_sock_delete(struct xdp_sock *xs, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *n, *tmp; + + spin_lock_bh(&xs->map_list_lock); + list_for_each_entry_safe(n, tmp, &xs->map_list, node) { + if (map_entry == n->map_entry) { + list_del(&n->node); + xsk_map_node_free(n); + } + } + spin_unlock_bh(&xs->map_list_lock); +} + +static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) +{ + struct bpf_map_memory mem; + int err, numa_node; + struct xsk_map *m; + u64 size; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) + return ERR_PTR(-EINVAL); + + numa_node = bpf_map_attr_numa_node(attr); + size = struct_size(m, xsk_map, attr->max_entries); + + err = bpf_map_charge_init(&mem, size); + if (err < 0) + return ERR_PTR(err); + + m = bpf_map_area_alloc(size, numa_node); + if (!m) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + + bpf_map_init_from_attr(&m->map, attr); + bpf_map_charge_move(&m->map.memory, &mem); + spin_lock_init(&m->lock); + + return &m->map; +} + +static void xsk_map_free(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + + bpf_clear_redirect_map(map); + synchronize_net(); + bpf_map_area_free(m); +} + +static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= m->map.max_entries) { + *next = 0; + return 0; + } + + if (index == m->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; + struct bpf_insn *insn = insn_buf; + + *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); + *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); + *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(ret, 0); + return insn - insn_buf; +} + +static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return __xsk_map_lookup_elem(map, *(u32 *)key); +} + +static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs, *old_xs, **map_entry; + u32 i = *(u32 *)key, fd = *(u32 *)value; + struct xsk_map_node *node; + struct socket *sock; + int err; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(i >= m->map.max_entries)) + return -E2BIG; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return err; + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + xs = (struct xdp_sock *)sock->sk; + + if (!xsk_is_setup_for_bpf_map(xs)) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + map_entry = &m->xsk_map[i]; + node = xsk_map_node_alloc(m, map_entry); + if (IS_ERR(node)) { + sockfd_put(sock); + return PTR_ERR(node); + } + + spin_lock_bh(&m->lock); + old_xs = READ_ONCE(*map_entry); + if (old_xs == xs) { + err = 0; + goto out; + } else if (old_xs && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto out; + } else if (!old_xs && map_flags == BPF_EXIST) { + err = -ENOENT; + goto out; + } + xsk_map_sock_add(xs, node); + WRITE_ONCE(*map_entry, xs); + if (old_xs) + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); + sockfd_put(sock); + return 0; + +out: + spin_unlock_bh(&m->lock); + sockfd_put(sock); + xsk_map_node_free(node); + return err; +} + +static int xsk_map_delete_elem(struct bpf_map *map, void *key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *old_xs, **map_entry; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + spin_lock_bh(&m->lock); + map_entry = &m->xsk_map[k]; + old_xs = xchg(map_entry, NULL); + if (old_xs) + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); + + return 0; +} + +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock **map_entry) +{ + spin_lock_bh(&map->lock); + if (READ_ONCE(*map_entry) == xs) { + WRITE_ONCE(*map_entry, NULL); + xsk_map_sock_delete(xs, map_entry); + } + spin_unlock_bh(&map->lock); +} + +const struct bpf_map_ops xsk_map_ops = { + .map_alloc = xsk_map_alloc, + .map_free = xsk_map_free, + .map_get_next_key = xsk_map_get_next_key, + .map_lookup_elem = xsk_map_lookup_elem, + .map_gen_lookup = xsk_map_gen_lookup, + .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, + .map_update_elem = xsk_map_update_elem, + .map_delete_elem = xsk_map_delete_elem, + .map_check_btf = map_check_no_btf, +}; -- cgit v1.2.3 From a71506a4fda92a39c8ece119876bc7ccde6d3c9d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 20 May 2020 21:20:51 +0200 Subject: xsk: Move driver interface to xdp_sock_drv.h Move the AF_XDP zero-copy driver interface to its own include file called xdp_sock_drv.h. This, hopefully, will make it more clear for NIC driver implementors to know what functions to use for zero-copy support. v4->v5: Fix -Wmissing-prototypes by include header file. (Jakub) Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-4-bjorn.topel@gmail.com --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2 +- drivers/net/ethernet/intel/ice/ice_xsk.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.h | 2 +- .../net/ethernet/mellanox/mlx5/core/en/xsk/tx.h | 2 +- .../net/ethernet/mellanox/mlx5/core/en/xsk/umem.c | 2 +- include/net/xdp_sock.h | 214 +------------------- include/net/xdp_sock_drv.h | 217 +++++++++++++++++++++ net/ethtool/channels.c | 2 +- net/ethtool/ioctl.c | 2 +- net/xdp/xdp_umem.h | 2 +- net/xdp/xsk.c | 2 +- net/xdp/xsk_queue.c | 1 + 15 files changed, 238 insertions(+), 218 deletions(-) create mode 100644 include/net/xdp_sock_drv.h (limited to 'include') diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 2a037ec244b9..d6b2db4f2c65 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11,7 +11,7 @@ #include "i40e_diag.h" #include "i40e_xsk.h" #include -#include +#include /* All i40e tracepoints are defined by the include below, which * must be included exactly once across the whole kernel with * CREATE_TRACE_POINTS defined diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 2b9184aead5f..d8b0be29099a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -2,7 +2,7 @@ /* Copyright(c) 2018 Intel Corporation. */ #include -#include +#include #include #include "i40e.h" diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 23e5515d4527..70e204307a93 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -2,7 +2,7 @@ /* Copyright (c) 2019, Intel Corporation. */ #include -#include +#include #include #include "ice.h" #include "ice_base.h" diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index a656ee9a1fae..82e4effae704 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -2,7 +2,7 @@ /* Copyright(c) 2018 Intel Corporation. */ #include -#include +#include #include #include "ixgbe.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 761c8979bd41..3507d23f0eb8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -31,7 +31,7 @@ */ #include -#include +#include #include "en/xdp.h" #include "en/params.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h index cab0e93497ae..a8e11adbf426 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h @@ -5,7 +5,7 @@ #define __MLX5_EN_XSK_RX_H__ #include "en.h" -#include +#include /* RX data path */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h index 79b487d89757..39fa0a705856 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h @@ -5,7 +5,7 @@ #define __MLX5_EN_XSK_TX_H__ #include "en.h" -#include +#include /* TX data path */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c index 4baaa5788320..5e49fdb564b3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2019 Mellanox Technologies. */ -#include +#include #include "umem.h" #include "setup.h" #include "en/params.h" diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 8f3f6f5b0dfe..6a986dcbc336 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -15,6 +15,7 @@ struct net_device; struct xsk_queue; +struct xdp_buff; /* Masks for xdp_umem_page flags. * The low 12-bits of the addr will be 0 since this is the page address, so we @@ -101,27 +102,9 @@ struct xdp_sock { spinlock_t map_list_lock; }; -struct xdp_buff; #ifdef CONFIG_XDP_SOCKETS -int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); -/* Used from netdev driver */ -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); -void xsk_umem_release_addr(struct xdp_umem *umem); -void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); -bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); -void xsk_umem_consume_tx_done(struct xdp_umem *umem); -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq); -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); -struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); -void xsk_set_rx_need_wakeup(struct xdp_umem *umem); -void xsk_set_tx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); -bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); @@ -153,131 +136,24 @@ static inline u64 xsk_umem_add_offset_to_addr(u64 addr) return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr); } -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - unsigned long page_addr; - - addr = xsk_umem_add_offset_to_addr(addr); - page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; - - return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - addr = xsk_umem_add_offset_to_addr(addr); - - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); -} - -/* Reuse-queue aware version of FILL queue helpers */ -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (rq->length >= cnt) - return true; - - return xsk_umem_has_addrs(umem, cnt - rq->length); -} - -static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - return xsk_umem_peek_addr(umem, addr); - - *addr = rq->handles[rq->length - 1]; - return addr; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - xsk_umem_release_addr(umem); - else - rq->length--; -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - rq->handles[rq->length++] = addr; -} - -/* Handle the offset appropriately depending on aligned or unaligned mode. - * For unaligned mode, we store the offset in the upper 16-bits of the address. - * For aligned mode, we simply add the offset to the address. - */ -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, - u64 offset) -{ - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) - return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); - else - return address + offset; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return umem->chunk_size_nohr; -} - #else + static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { return -ENOTSUPP; } -static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) -{ -} - -static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, - struct xdp_desc *desc) -{ - return false; -} - -static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) -{ -} - -static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) +static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { - return NULL; + return -EOPNOTSUPP; } -static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( - struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq) -{ - return NULL; -} -static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) +static inline void __xsk_map_flush(void) { } -static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, - u16 queue_id) +static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, + u32 key) { return NULL; } @@ -297,80 +173,6 @@ static inline u64 xsk_umem_add_offset_to_addr(u64 addr) return 0; } -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - return NULL; -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - return 0; -} - -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ -} - -static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) -{ - return false; -} - -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, - u64 offset) -{ - return 0; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return 0; -} - -static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) -{ - return -EOPNOTSUPP; -} - -static inline void __xsk_map_flush(void) -{ -} - -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h new file mode 100644 index 000000000000..d67f2361937a --- /dev/null +++ b/include/net/xdp_sock_drv.h @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Interface for implementing AF_XDP zero-copy support in drivers. + * Copyright(c) 2020 Intel Corporation. + */ + +#ifndef _LINUX_XDP_SOCK_DRV_H +#define _LINUX_XDP_SOCK_DRV_H + +#include + +#ifdef CONFIG_XDP_SOCKETS + +bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); +bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); +void xsk_umem_release_addr(struct xdp_umem *umem); +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); +bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); +void xsk_umem_consume_tx_done(struct xdp_umem *umem); +struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); +struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, + struct xdp_umem_fq_reuse *newq); +void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); +struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); +void xsk_set_rx_need_wakeup(struct xdp_umem *umem); +void xsk_set_tx_need_wakeup(struct xdp_umem *umem); +void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); +void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); +bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); + +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) +{ + unsigned long page_addr; + + addr = xsk_umem_add_offset_to_addr(addr); + page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; + + return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); +} + +static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) +{ + addr = xsk_umem_add_offset_to_addr(addr); + + return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); +} + +/* Reuse-queue aware version of FILL queue helpers */ +static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (rq->length >= cnt) + return true; + + return xsk_umem_has_addrs(umem, cnt - rq->length); +} + +static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (!rq->length) + return xsk_umem_peek_addr(umem, addr); + + *addr = rq->handles[rq->length - 1]; + return addr; +} + +static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (!rq->length) + xsk_umem_release_addr(umem); + else + rq->length--; +} + +static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + rq->handles[rq->length++] = addr; +} + +/* Handle the offset appropriately depending on aligned or unaligned mode. + * For unaligned mode, we store the offset in the upper 16-bits of the address. + * For aligned mode, we simply add the offset to the address. + */ +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, + u64 offset) +{ + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) + return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); + else + return address + offset; +} + +static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) +{ + return umem->chunk_size_nohr; +} + +#else + +static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) +{ + return false; +} + +static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +{ + return NULL; +} + +static inline void xsk_umem_release_addr(struct xdp_umem *umem) +{ +} + +static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +{ +} + +static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, + struct xdp_desc *desc) +{ + return false; +} + +static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) +{ +} + +static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) +{ + return NULL; +} + +static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( + struct xdp_umem *umem, struct xdp_umem_fq_reuse *newq) +{ + return NULL; +} + +static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) +{ +} + +static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, + u16 queue_id) +{ + return NULL; +} + +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) +{ + return NULL; +} + +static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) +{ + return 0; +} + +static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) +{ + return false; +} + +static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) +{ + return NULL; +} + +static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) +{ +} + +static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) +{ +} + +static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) +{ + return false; +} + +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, + u64 offset) +{ + return 0; +} + +static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) +{ + return 0; +} + +#endif /* CONFIG_XDP_SOCKETS */ + +#endif /* _LINUX_XDP_SOCK_DRV_H */ diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 389924b65d05..658a8580b464 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include "netlink.h" #include "common.h" diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 52102ab1709b..74892623bacd 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index a63a9fb251f5..32067fe98f65 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -6,7 +6,7 @@ #ifndef XDP_UMEM_H_ #define XDP_UMEM_H_ -#include +#include int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u16 queue_id, u16 flags); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 45ffd67b367d..8bda654e82ec 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include "xsk_queue.h" diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 57fb81bd593c..554b1ebb4d02 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "xsk_queue.h" -- cgit v1.2.3 From 89e4a376e3a3dab639a3947a6c7cf5d461d1aa4c Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:20:52 +0200 Subject: xsk: Move defines only used by AF_XDP internals to xsk.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the XSK_NEXT_PG_CONTIG_{MASK,SHIFT}, and XDP_UMEM_USES_NEED_WAKEUP defines from xdp_sock.h to the AF_XDP internal xsk.h file. Also, start using the BIT{,_ULL} macro instead of explicit shifts. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-5-bjorn.topel@gmail.com --- include/net/xdp_sock.h | 14 -------------- net/xdp/xsk.h | 14 ++++++++++++++ net/xdp/xsk_queue.h | 2 ++ 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 6a986dcbc336..fb7fe3060175 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -17,13 +17,6 @@ struct net_device; struct xsk_queue; struct xdp_buff; -/* Masks for xdp_umem_page flags. - * The low 12-bits of the addr will be 0 since this is the page address, so we - * can use them for flags. - */ -#define XSK_NEXT_PG_CONTIG_SHIFT 0 -#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT) - struct xdp_umem_page { void *addr; dma_addr_t dma; @@ -35,13 +28,6 @@ struct xdp_umem_fq_reuse { u64 handles[]; }; -/* Flags for the umem flags field. - * - * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public - * flags. See inlude/uapi/include/linux/if_xdp.h. - */ -#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1) - struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index d6a0979050e6..455ddd480f3d 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -4,6 +4,20 @@ #ifndef XSK_H_ #define XSK_H_ +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + +/* Flags for the umem flags field. + * + * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public + * flags. See inlude/uapi/include/linux/if_xdp.h. + */ +#define XDP_UMEM_USES_NEED_WAKEUP BIT(1) + struct xdp_ring_offset_v1 { __u64 producer; __u64 consumer; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 648733ec24ac..a322a7dac58c 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -10,6 +10,8 @@ #include #include +#include "xsk.h" + struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; -- cgit v1.2.3 From 2b43470add8c8ff1e1ee28dffc5c5df97e955d09 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:20:53 +0200 Subject: xsk: Introduce AF_XDP buffer allocation API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to simplify AF_XDP zero-copy enablement for NIC driver developers, a new AF_XDP buffer allocation API is added. The implementation is based on a single core (single producer/consumer) buffer pool for the AF_XDP UMEM. A buffer is allocated using the xsk_buff_alloc() function, and returned using xsk_buff_free(). If a buffer is disassociated with the pool, e.g. when a buffer is passed to an AF_XDP socket, a buffer is said to be released. Currently, the release function is only used by the AF_XDP internals and not visible to the driver. Drivers using this API should register the XDP memory model with the new MEM_TYPE_XSK_BUFF_POOL type. The API is defined in net/xdp_sock_drv.h. The buffer type is struct xdp_buff, and follows the lifetime of regular xdp_buffs, i.e. the lifetime of an xdp_buff is restricted to a NAPI context. In other words, the API is not replacing xdp_frames. In addition to introducing the API and implementations, the AF_XDP core is migrated to use the new APIs. rfc->v1: Fixed build errors/warnings for m68k and riscv. (kbuild test robot) Added headroom/chunk size getter. (Maxim/Björn) v1->v2: Swapped SoBs. (Maxim) v2->v3: Initialize struct xdp_buff member frame_sz. (Björn) Add API to query the DMA address of a frame. (Maxim) Do DMA sync for CPU till the end of the frame to handle possible growth (frame_sz). (Maxim) Signed-off-by: Björn Töpel Signed-off-by: Maxim Mikityanskiy Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-6-bjorn.topel@gmail.com --- include/net/xdp.h | 4 +- include/net/xdp_sock.h | 2 + include/net/xdp_sock_drv.h | 164 ++++++++++++++++ include/net/xsk_buff_pool.h | 56 ++++++ include/trace/events/xdp.h | 3 +- net/core/xdp.c | 14 +- net/xdp/Makefile | 1 + net/xdp/xdp_umem.c | 19 +- net/xdp/xsk.c | 147 +++++--------- net/xdp/xsk_buff_pool.c | 467 ++++++++++++++++++++++++++++++++++++++++++++ net/xdp/xsk_diag.c | 2 +- net/xdp/xsk_queue.h | 59 ++++-- 12 files changed, 819 insertions(+), 119 deletions(-) create mode 100644 include/net/xsk_buff_pool.h create mode 100644 net/xdp/xsk_buff_pool.c (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 3094fccf5a88..f432134c7c00 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -40,6 +40,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, MEM_TYPE_ZERO_COPY, + MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; @@ -119,7 +120,8 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; - if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) + if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || + xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Assure headroom is available for storing info */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index fb7fe3060175..6e7265f63c04 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -31,11 +31,13 @@ struct xdp_umem_fq_reuse { struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; + struct xsk_buff_pool *pool; struct xdp_umem_page *pages; u64 chunk_mask; u64 size; u32 headroom; u32 chunk_size_nohr; + u32 chunk_size; struct user_struct *user; refcount_t users; struct work_struct work; diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index d67f2361937a..7752c8663d1b 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -7,6 +7,7 @@ #define _LINUX_XDP_SOCK_DRV_H #include +#include #ifdef CONFIG_XDP_SOCKETS @@ -101,6 +102,94 @@ static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) return umem->chunk_size_nohr; } +static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +{ + return XDP_PACKET_HEADROOM + umem->headroom; +} + +static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +{ + return umem->chunk_size; +} + +static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +{ + return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem); +} + +static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, + struct xdp_rxq_info *rxq) +{ + xp_set_rxq_info(umem->pool, rxq); +} + +static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, + unsigned long attrs) +{ + xp_dma_unmap(umem->pool, attrs); +} + +static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, + unsigned long attrs) +{ + return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs); +} + +static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + return xp_get_dma(xskb); +} + +static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + return xp_get_frame_dma(xskb); +} + +static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +{ + return xp_alloc(umem->pool); +} + +static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +{ + return xp_can_alloc(umem->pool, count); +} + +static inline void xsk_buff_free(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + xp_free(xskb); +} + +static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +{ + return xp_raw_get_dma(umem->pool, addr); +} + +static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +{ + return xp_raw_get_data(umem->pool, addr); +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + xp_dma_sync_for_cpu(xskb); +} + +static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, + dma_addr_t dma, + size_t size) +{ + xp_dma_sync_for_device(umem->pool, dma, size); +} + #else static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) @@ -212,6 +301,81 @@ static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) return 0; } +static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +{ + return 0; +} + +static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +{ + return 0; +} + +static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +{ + return 0; +} + +static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, + struct xdp_rxq_info *rxq) +{ +} + +static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, + unsigned long attrs) +{ +} + +static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, + unsigned long attrs) +{ + return 0; +} + +static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) +{ + return 0; +} + +static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) +{ + return 0; +} + +static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +{ + return NULL; +} + +static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +{ + return false; +} + +static inline void xsk_buff_free(struct xdp_buff *xdp) +{ +} + +static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +{ + return 0; +} + +static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +{ + return NULL; +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ +} + +static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, + dma_addr_t dma, + size_t size) +{ +} + #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_DRV_H */ diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h new file mode 100644 index 000000000000..9f221b36e405 --- /dev/null +++ b/include/net/xsk_buff_pool.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. */ + +#ifndef XSK_BUFF_POOL_H_ +#define XSK_BUFF_POOL_H_ + +#include +#include +#include + +struct xsk_buff_pool; +struct xdp_rxq_info; +struct xsk_queue; +struct xdp_desc; +struct device; +struct page; + +struct xdp_buff_xsk { + struct xdp_buff xdp; + dma_addr_t dma; + dma_addr_t frame_dma; + struct xsk_buff_pool *pool; + bool unaligned; + u64 orig_addr; + struct list_head free_list_node; +}; + +/* AF_XDP core. */ +struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, + u32 chunk_size, u32 headroom, u64 size, + bool unaligned); +void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); +void xp_destroy(struct xsk_buff_pool *pool); +void xp_release(struct xdp_buff_xsk *xskb); +u64 xp_get_handle(struct xdp_buff_xsk *xskb); +bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); + +/* AF_XDP, and XDP core. */ +void xp_free(struct xdp_buff_xsk *xskb); + +/* AF_XDP ZC drivers, via xdp_sock_buff.h */ +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages); +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool); +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); +dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb); +dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb); +void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb); +void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size); + +#endif /* XSK_BUFF_POOL_H_ */ diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index b95d65e8c628..48547a12fa27 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -287,7 +287,8 @@ TRACE_EVENT(xdp_devmap_xmit, FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ - FN(ZERO_COPY) + FN(ZERO_COPY) \ + FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ TRACE_DEFINE_ENUM(MEM_TYPE_##x); diff --git a/net/core/xdp.c b/net/core/xdp.c index 490b8f5fa8ee..f0ce8b195193 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -17,6 +17,7 @@ #include #include /* struct xdp_mem_allocator */ #include +#include #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 @@ -361,7 +362,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * of xdp_frames/pages in those cases. */ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - unsigned long handle) + unsigned long handle, struct xdp_buff *xdp) { struct xdp_mem_allocator *xa; struct page *page; @@ -390,6 +391,11 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); xa->zc_alloc->free(xa->zc_alloc, handle); rcu_read_unlock(); + break; + case MEM_TYPE_XSK_BUFF_POOL: + /* NB! Only valid from an xdp_buff! */ + xsk_buff_free(xdp); + break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ break; @@ -398,19 +404,19 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, 0); + __xdp_return(xdpf->data, &xdpf->mem, false, 0, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, 0); + __xdp_return(xdpf->data, &xdpf->mem, true, 0, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 90b5460d6166..30cdc4315f42 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o +obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 37ace3bc0d48..7f04688045d5 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -245,7 +245,7 @@ static void xdp_umem_release(struct xdp_umem *umem) } xsk_reuseq_destroy(umem); - + xp_destroy(umem->pool); xdp_umem_unmap_pages(umem); xdp_umem_unpin_pages(umem); @@ -390,6 +390,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->size = size; umem->headroom = headroom; umem->chunk_size_nohr = chunk_size - headroom; + umem->chunk_size = chunk_size; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; @@ -415,11 +416,21 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) } err = xdp_umem_map_pages(umem); - if (!err) - return 0; + if (err) + goto out_pages; - kvfree(umem->pages); + umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size, + headroom, size, unaligned_chunks); + if (!umem->pool) { + err = -ENOMEM; + goto out_unmap; + } + return 0; +out_unmap: + xdp_umem_unmap_pages(umem); +out_pages: + kvfree(umem->pages); out_pin: xdp_umem_unpin_pages(umem); out_account: diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 8bda654e82ec..6933f0d494ba 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -117,76 +117,67 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); -/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for - * each page. This is only required in copy mode. - */ -static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, - u32 len, u32 metalen) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - void *to_buf = xdp_umem_get_data(umem, addr); - - addr = xsk_umem_add_offset_to_addr(addr); - if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) { - void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; - u64 page_start = addr & ~(PAGE_SIZE - 1); - u64 first_len = PAGE_SIZE - (addr - page_start); - - memcpy(to_buf, from_buf, first_len); - memcpy(next_pg_addr, from_buf + first_len, - len + metalen - first_len); + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + u64 addr; + int err; - return; + addr = xp_get_handle(xskb); + err = xskq_prod_reserve_desc(xs->rx, addr, len); + if (err) { + xs->rx_dropped++; + return err; } - memcpy(to_buf, from_buf, len + metalen); + xp_release(xskb); + return 0; } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) { - u64 offset = xs->umem->headroom; - u64 addr, memcpy_addr; - void *from_buf; + void *from_buf, *to_buf; u32 metalen; - int err; - - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { - xs->rx_dropped++; - return -ENOSPC; - } - if (unlikely(xdp_data_meta_unsupported(xdp))) { - from_buf = xdp->data; + if (unlikely(xdp_data_meta_unsupported(from))) { + from_buf = from->data; + to_buf = to->data; metalen = 0; } else { - from_buf = xdp->data_meta; - metalen = xdp->data - xdp->data_meta; + from_buf = from->data_meta; + metalen = from->data - from->data_meta; + to_buf = to->data - metalen; } - memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen); - - offset += metalen; - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (!err) { - xskq_cons_release(xs->umem->fq); - xdp_return_buff(xdp); - return 0; - } - - xs->rx_dropped++; - return err; + memcpy(to_buf, from_buf, len + metalen); } -static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, + bool explicit_free) { - int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len); + struct xdp_buff *xsk_xdp; + int err; - if (err) + if (len > xsk_umem_get_rx_frame_size(xs->umem)) { + xs->rx_dropped++; + return -ENOSPC; + } + + xsk_xdp = xsk_buff_alloc(xs->umem); + if (!xsk_xdp) { xs->rx_dropped++; + return -ENOSPC; + } - return err; + xsk_copy_xdp(xsk_xdp, xdp, len); + err = __xsk_rcv_zc(xs, xsk_xdp, len); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + if (explicit_free) + xdp_return_buff(xdp); + return 0; } static bool xsk_is_bound(struct xdp_sock *xs) @@ -199,7 +190,8 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, + bool explicit_free) { u32 len; @@ -211,8 +203,10 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) len = xdp->data_end - xdp->data; - return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? - __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); + return xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || + xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? + __xsk_rcv_zc(xs, xdp, len) : + __xsk_rcv(xs, xdp, len, explicit_free); } static void xsk_flush(struct xdp_sock *xs) @@ -224,46 +218,11 @@ static void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 metalen = xdp->data - xdp->data_meta; - u32 len = xdp->data_end - xdp->data; - u64 offset = xs->umem->headroom; - void *buffer; - u64 addr; int err; spin_lock_bh(&xs->rx_lock); - - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) { - err = -EINVAL; - goto out_unlock; - } - - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { - err = -ENOSPC; - goto out_drop; - } - - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - buffer = xdp_umem_get_data(xs->umem, addr); - memcpy(buffer, xdp->data_meta, len + metalen); - - addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (err) - goto out_drop; - - xskq_cons_release(xs->umem->fq); - xskq_prod_submit(xs->rx); - - spin_unlock_bh(&xs->rx_lock); - - xs->sk.sk_data_ready(&xs->sk); - return 0; - -out_drop: - xs->rx_dropped++; -out_unlock: + err = xsk_rcv(xs, xdp, false); + xsk_flush(xs); spin_unlock_bh(&xs->rx_lock); return err; } @@ -273,7 +232,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; - err = xsk_rcv(xs, xdp); + err = xsk_rcv(xs, xdp, true); if (err) return err; @@ -404,7 +363,7 @@ static int xsk_generic_xmit(struct sock *sk) skb_put(skb, len); addr = desc.addr; - buffer = xdp_umem_get_data(xs->umem, addr); + buffer = xsk_buff_raw_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed @@ -860,6 +819,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : &xs->umem->cq; err = xsk_init_queue(entries, q, true); + if (optname == XDP_UMEM_FILL_RING) + xp_set_fq(xs->umem->pool, *q); mutex_unlock(&xs->mutex); return err; } diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c new file mode 100644 index 000000000000..e214a5795a62 --- /dev/null +++ b/net/xdp/xsk_buff_pool.c @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "xsk_queue.h" + +struct xsk_buff_pool { + struct xsk_queue *fq; + struct list_head free_list; + dma_addr_t *dma_pages; + struct xdp_buff_xsk *heads; + u64 chunk_mask; + u64 addrs_cnt; + u32 free_list_cnt; + u32 dma_pages_cnt; + u32 heads_cnt; + u32 free_heads_cnt; + u32 headroom; + u32 chunk_size; + u32 frame_len; + bool cheap_dma; + bool unaligned; + void *addrs; + struct device *dev; + struct xdp_buff_xsk *free_heads[]; +}; + +static void xp_addr_unmap(struct xsk_buff_pool *pool) +{ + vunmap(pool->addrs); +} + +static int xp_addr_map(struct xsk_buff_pool *pool, + struct page **pages, u32 nr_pages) +{ + pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!pool->addrs) + return -ENOMEM; + return 0; +} + +void xp_destroy(struct xsk_buff_pool *pool) +{ + if (!pool) + return; + + xp_addr_unmap(pool); + kvfree(pool->heads); + kvfree(pool); +} + +struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, + u32 chunk_size, u32 headroom, u64 size, + bool unaligned) +{ + struct xsk_buff_pool *pool; + struct xdp_buff_xsk *xskb; + int err; + u32 i; + + pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL); + if (!pool) + goto out; + + pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL); + if (!pool->heads) + goto out; + + pool->chunk_mask = ~((u64)chunk_size - 1); + pool->addrs_cnt = size; + pool->heads_cnt = chunks; + pool->free_heads_cnt = chunks; + pool->headroom = headroom; + pool->chunk_size = chunk_size; + pool->cheap_dma = true; + pool->unaligned = unaligned; + pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM; + INIT_LIST_HEAD(&pool->free_list); + + for (i = 0; i < pool->free_heads_cnt; i++) { + xskb = &pool->heads[i]; + xskb->pool = pool; + xskb->xdp.frame_sz = chunk_size - headroom; + pool->free_heads[i] = xskb; + } + + err = xp_addr_map(pool, pages, nr_pages); + if (!err) + return pool; + +out: + xp_destroy(pool); + return NULL; +} + +void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq) +{ + pool->fq = fq; +} + +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) +{ + u32 i; + + for (i = 0; i < pool->heads_cnt; i++) + pool->heads[i].xdp.rxq = rxq; +} +EXPORT_SYMBOL(xp_set_rxq_info); + +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) +{ + dma_addr_t *dma; + u32 i; + + if (pool->dma_pages_cnt == 0) + return; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = &pool->dma_pages[i]; + if (*dma) { + dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + *dma = 0; + } + } + + kvfree(pool->dma_pages); + pool->dma_pages_cnt = 0; + pool->dev = NULL; +} +EXPORT_SYMBOL(xp_dma_unmap); + +static void xp_check_dma_contiguity(struct xsk_buff_pool *pool) +{ + u32 i; + + for (i = 0; i < pool->dma_pages_cnt - 1; i++) { + if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1]) + pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK; + else + pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK; + } +} + +static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_SWIOTLB) + phys_addr_t paddr; + u32 i; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + paddr = dma_to_phys(pool->dev, pool->dma_pages[i]); + if (is_swiotlb_buffer(paddr)) + return false; + } +#endif + return true; +} + +static bool xp_check_cheap_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_HAS_DMA) + const struct dma_map_ops *ops = get_dma_ops(pool->dev); + + if (ops) { + return !ops->sync_single_for_cpu && + !ops->sync_single_for_device; + } + + if (!dma_is_direct(ops)) + return false; + + if (!xp_check_swiotlb_dma(pool)) + return false; + + if (!dev_is_dma_coherent(pool->dev)) { +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) + return false; +#endif + } +#endif + return true; +} + +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages) +{ + dma_addr_t dma; + u32 i; + + pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages), + GFP_KERNEL); + if (!pool->dma_pages) + return -ENOMEM; + + pool->dev = dev; + pool->dma_pages_cnt = nr_pages; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + if (dma_mapping_error(dev, dma)) { + xp_dma_unmap(pool, attrs); + return -ENOMEM; + } + pool->dma_pages[i] = dma; + } + + if (pool->unaligned) + xp_check_dma_contiguity(pool); + + pool->dev = dev; + pool->cheap_dma = xp_check_cheap_dma(pool); + return 0; +} +EXPORT_SYMBOL(xp_dma_map); + +static bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr, u32 len) +{ + bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; + + if (pool->dma_pages_cnt && cross_pg) { + return !(pool->dma_pages[addr >> PAGE_SHIFT] & + XSK_NEXT_PG_CONTIG_MASK); + } + return false; +} + +static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr) +{ + return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size); +} + +void xp_release(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; +} + +static u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) +{ + return addr & pool->chunk_mask; +} + +static u64 xp_unaligned_extract_addr(u64 addr) +{ + return addr & XSK_UNALIGNED_BUF_ADDR_MASK; +} + +static u64 xp_unaligned_extract_offset(u64 addr) +{ + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; +} + +static u64 xp_unaligned_add_offset_to_addr(u64 addr) +{ + return xp_unaligned_extract_addr(addr) + + xp_unaligned_extract_offset(addr); +} + +static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_unaligned_extract_addr(*addr); + if (*addr >= pool->addrs_cnt || + *addr + pool->chunk_size > pool->addrs_cnt || + xp_addr_crosses_non_contig_pg(pool, *addr)) + return false; + return true; +} + +static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_aligned_extract_addr(pool, *addr); + return *addr < pool->addrs_cnt; +} + +static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + u64 addr; + bool ok; + + if (pool->free_heads_cnt == 0) + return NULL; + + xskb = pool->free_heads[--pool->free_heads_cnt]; + + for (;;) { + if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { + xp_release(xskb); + return NULL; + } + + ok = pool->unaligned ? xp_check_unaligned(pool, &addr) : + xp_check_aligned(pool, &addr); + if (!ok) { + pool->fq->invalid_descs++; + xskq_cons_release(pool->fq); + continue; + } + break; + } + xskq_cons_release(pool->fq); + + xskb->orig_addr = addr; + xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; + if (pool->dma_pages_cnt) { + xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); + xskb->dma = xskb->frame_dma + pool->headroom + + XDP_PACKET_HEADROOM; + } + return xskb; +} + +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + + if (!pool->free_list_cnt) { + xskb = __xp_alloc(pool); + if (!xskb) + return NULL; + } else { + pool->free_list_cnt--; + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, + free_list_node); + list_del(&xskb->free_list_node); + } + + xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; + xskb->xdp.data_meta = xskb->xdp.data; + + if (!pool->cheap_dma) { + dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, + pool->frame_len, + DMA_BIDIRECTIONAL); + } + return &xskb->xdp; +} +EXPORT_SYMBOL(xp_alloc); + +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) +{ + if (pool->free_list_cnt >= count) + return true; + return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt); +} +EXPORT_SYMBOL(xp_can_alloc); + +void xp_free(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_list_cnt++; + list_add(&xskb->free_list_node, &xskb->pool->free_list); +} +EXPORT_SYMBOL(xp_free); + +static bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 chunk, chunk_end; + + chunk = xp_aligned_extract_addr(pool, desc->addr); + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); + if (chunk != chunk_end) + return false; + + if (chunk >= pool->addrs_cnt) + return false; + + if (desc->options) + return false; + return true; +} + +static bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 addr, base_addr; + + base_addr = xp_unaligned_extract_addr(desc->addr); + addr = xp_unaligned_add_offset_to_addr(desc->addr); + + if (desc->len > pool->chunk_size) + return false; + + if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + return false; + + if (desc->options) + return false; + return true; +} + +bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) +{ + return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : + xp_aligned_validate_desc(pool, desc); +} + +u64 xp_get_handle(struct xdp_buff_xsk *xskb) +{ + u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + + offset += xskb->pool->headroom; + if (!xskb->pool->unaligned) + return xskb->orig_addr + offset; + return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); +} + +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; + return pool->addrs + addr; +} +EXPORT_SYMBOL(xp_raw_get_data); + +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; + return (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); +} +EXPORT_SYMBOL(xp_raw_get_dma); + +dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->dma; +} +EXPORT_SYMBOL(xp_get_dma); + +dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->frame_dma; +} +EXPORT_SYMBOL(xp_get_frame_dma); + +void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) +{ + if (xskb->pool->cheap_dma) + return; + + dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, + xskb->pool->frame_len, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_cpu); + +void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size) +{ + if (pool->cheap_dma) + return; + + dma_sync_single_range_for_device(pool->dev, dma, 0, + size, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_device); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index f59791ba43a0..0163b26aaf63 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) du.id = umem->id; du.size = umem->size; du.num_pages = umem->npgs; - du.chunk_size = umem->chunk_size_nohr + umem->headroom; + du.chunk_size = umem->chunk_size; du.headroom = umem->headroom; du.ifindex = umem->dev ? umem->dev->ifindex : 0; du.queue_id = umem->queue_id; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index a322a7dac58c..9151aef7dbca 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "xsk.h" @@ -172,31 +173,45 @@ out: return false; } -static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, - struct xdp_desc *d, - struct xdp_umem *umem) +static inline bool xskq_cons_read_addr_aligned(struct xsk_queue *q, u64 *addr) { - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem)) - return false; + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (d->len > umem->chunk_size_nohr || d->options) { - q->invalid_descs++; - return false; - } + while (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; + + *addr = ring->desc[idx]; + if (xskq_cons_is_valid_addr(q, *addr)) + return true; + q->cached_cons++; + } + + return false; +} + +static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + if (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; + + *addr = ring->desc[idx]; return true; } - if (!xskq_cons_is_valid_addr(q, d->addr)) - return false; + return false; +} - if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || - d->options) { +static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, + struct xdp_desc *d, + struct xdp_umem *umem) +{ + if (!xp_validate_desc(umem->pool, d)) { q->invalid_descs++; return false; } - return true; } @@ -260,6 +275,20 @@ static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, return xskq_cons_read_addr(q, addr, umem); } +static inline bool xskq_cons_peek_addr_aligned(struct xsk_queue *q, u64 *addr) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_addr_aligned(q, addr); +} + +static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_addr_unchecked(q, addr); +} + static inline bool xskq_cons_peek_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xdp_umem *umem) -- cgit v1.2.3 From 0807892ecb35734b7ce6f7c29b078f1b60151c94 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:21:00 +0200 Subject: xsk: Remove MEM_TYPE_ZERO_COPY and corresponding code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users of MEM_TYPE_ZERO_COPY. Remove all corresponding code, including the "handle" member of struct xdp_buff. rfc->v1: Fixed spelling in commit message. (Björn) Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-13-bjorn.topel@gmail.com --- drivers/net/hyperv/netvsc_bpf.c | 1 - include/net/xdp.h | 9 +-- include/net/xdp_sock.h | 45 ------------ include/net/xdp_sock_drv.h | 149 ---------------------------------------- include/trace/events/xdp.h | 1 - net/core/xdp.c | 42 ++--------- net/xdp/xdp_umem.c | 56 +-------------- net/xdp/xsk.c | 48 +------------ net/xdp/xsk_buff_pool.c | 7 ++ net/xdp/xsk_queue.c | 62 ----------------- net/xdp/xsk_queue.h | 105 ---------------------------- 11 files changed, 15 insertions(+), 510 deletions(-) (limited to 'include') diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 1e0c024b0a93..8e4141552423 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -50,7 +50,6 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, xdp->data_end = xdp->data + len; xdp->rxq = &nvchan->xdp_rxq; xdp->frame_sz = PAGE_SIZE; - xdp->handle = 0; memcpy(xdp->data, data, len); diff --git a/include/net/xdp.h b/include/net/xdp.h index f432134c7c00..90f11760bd12 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -39,7 +39,6 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, - MEM_TYPE_ZERO_COPY, MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; @@ -55,10 +54,6 @@ struct xdp_mem_info { struct page_pool; -struct zero_copy_allocator { - void (*free)(struct zero_copy_allocator *zca, unsigned long handle); -}; - struct xdp_rxq_info { struct net_device *dev; u32 queue_index; @@ -71,7 +66,6 @@ struct xdp_buff { void *data_end; void *data_meta; void *data_hard_start; - unsigned long handle; struct xdp_rxq_info *rxq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; @@ -120,8 +114,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; - if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || - xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Assure headroom is available for storing info */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 6e7265f63c04..96bfc5f5f24e 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -17,26 +17,12 @@ struct net_device; struct xsk_queue; struct xdp_buff; -struct xdp_umem_page { - void *addr; - dma_addr_t dma; -}; - -struct xdp_umem_fq_reuse { - u32 nentries; - u32 length; - u64 handles[]; -}; - struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; struct xsk_buff_pool *pool; - struct xdp_umem_page *pages; - u64 chunk_mask; u64 size; u32 headroom; - u32 chunk_size_nohr; u32 chunk_size; struct user_struct *user; refcount_t users; @@ -48,7 +34,6 @@ struct xdp_umem { u8 flags; int id; struct net_device *dev; - struct xdp_umem_fq_reuse *fq_reuse; bool zc; spinlock_t xsk_tx_list_lock; struct list_head xsk_tx_list; @@ -109,21 +94,6 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, return xs; } -static inline u64 xsk_umem_extract_addr(u64 addr) -{ - return addr & XSK_UNALIGNED_BUF_ADDR_MASK; -} - -static inline u64 xsk_umem_extract_offset(u64 addr) -{ - return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; -} - -static inline u64 xsk_umem_add_offset_to_addr(u64 addr) -{ - return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr); -} - #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) @@ -146,21 +116,6 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, return NULL; } -static inline u64 xsk_umem_extract_addr(u64 addr) -{ - return 0; -} - -static inline u64 xsk_umem_extract_offset(u64 addr) -{ - return 0; -} - -static inline u64 xsk_umem_add_offset_to_addr(u64 addr) -{ - return 0; -} - #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 7752c8663d1b..ccf848f7efa4 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -11,16 +11,9 @@ #ifdef CONFIG_XDP_SOCKETS -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); -void xsk_umem_release_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); void xsk_umem_consume_tx_done(struct xdp_umem *umem); -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq); -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); void xsk_set_rx_need_wakeup(struct xdp_umem *umem); void xsk_set_tx_need_wakeup(struct xdp_umem *umem); @@ -28,80 +21,6 @@ void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - unsigned long page_addr; - - addr = xsk_umem_add_offset_to_addr(addr); - page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; - - return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - addr = xsk_umem_add_offset_to_addr(addr); - - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); -} - -/* Reuse-queue aware version of FILL queue helpers */ -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (rq->length >= cnt) - return true; - - return xsk_umem_has_addrs(umem, cnt - rq->length); -} - -static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - return xsk_umem_peek_addr(umem, addr); - - *addr = rq->handles[rq->length - 1]; - return addr; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - xsk_umem_release_addr(umem); - else - rq->length--; -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - rq->handles[rq->length++] = addr; -} - -/* Handle the offset appropriately depending on aligned or unaligned mode. - * For unaligned mode, we store the offset in the upper 16-bits of the address. - * For aligned mode, we simply add the offset to the address. - */ -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, - u64 offset) -{ - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) - return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); - else - return address + offset; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return umem->chunk_size_nohr; -} - static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) { return XDP_PACKET_HEADROOM + umem->headroom; @@ -192,20 +111,6 @@ static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, #else -static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr(struct xdp_umem *umem) -{ -} - static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) { } @@ -220,55 +125,12 @@ static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) { } -static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) -{ - return NULL; -} - -static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( - struct xdp_umem *umem, struct xdp_umem_fq_reuse *newq) -{ - return NULL; -} - -static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) -{ -} - static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id) { return NULL; } -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - return NULL; -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - return 0; -} - -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ -} - static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { } @@ -290,17 +152,6 @@ static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) return false; } -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, - u64 offset) -{ - return 0; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return 0; -} - static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) { return 0; diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 48547a12fa27..b73d3e141323 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -287,7 +287,6 @@ TRACE_EVENT(xdp_devmap_xmit, FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ - FN(ZERO_COPY) \ FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ diff --git a/net/core/xdp.c b/net/core/xdp.c index f0ce8b195193..a8c2f243367d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -110,27 +110,6 @@ static void mem_allocator_disconnect(void *allocator) mutex_unlock(&mem_id_lock); } -static void mem_id_disconnect(int id) -{ - struct xdp_mem_allocator *xa; - - mutex_lock(&mem_id_lock); - - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - WARN(1, "Request remove non-existing id(%d), driver bug?", id); - return; - } - - trace_mem_disconnect(xa); - - if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); - - mutex_unlock(&mem_id_lock); -} - void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; @@ -144,9 +123,6 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; - if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY) - return mem_id_disconnect(id); - if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { rcu_read_lock(); xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); @@ -302,7 +278,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, xdp_rxq->mem.type = type; if (!allocator) { - if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY) + if (type == MEM_TYPE_PAGE_POOL) return -EINVAL; /* Setup time check page_pool req */ return 0; } @@ -362,7 +338,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * of xdp_frames/pages in those cases. */ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - unsigned long handle, struct xdp_buff *xdp) + struct xdp_buff *xdp) { struct xdp_mem_allocator *xa; struct page *page; @@ -384,14 +360,6 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; - case MEM_TYPE_ZERO_COPY: - /* NB! Only valid from an xdp_buff! */ - rcu_read_lock(); - /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - xa->zc_alloc->free(xa->zc_alloc, handle); - rcu_read_unlock(); - break; case MEM_TYPE_XSK_BUFF_POOL: /* NB! Only valid from an xdp_buff! */ xsk_buff_free(xdp); @@ -404,19 +372,19 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, 0, NULL); + __xdp_return(xdpf->data, &xdpf->mem, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, 0, NULL); + __xdp_return(xdpf->data, &xdpf->mem, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle, xdp); + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 7f04688045d5..19e59d1a5e9f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -179,37 +179,6 @@ void xdp_umem_clear_dev(struct xdp_umem *umem) umem->zc = false; } -static void xdp_umem_unmap_pages(struct xdp_umem *umem) -{ - unsigned int i; - - for (i = 0; i < umem->npgs; i++) - if (PageHighMem(umem->pgs[i])) - vunmap(umem->pages[i].addr); -} - -static int xdp_umem_map_pages(struct xdp_umem *umem) -{ - unsigned int i; - void *addr; - - for (i = 0; i < umem->npgs; i++) { - if (PageHighMem(umem->pgs[i])) - addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL); - else - addr = page_address(umem->pgs[i]); - - if (!addr) { - xdp_umem_unmap_pages(umem); - return -ENOMEM; - } - - umem->pages[i].addr = addr; - } - - return 0; -} - static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); @@ -244,14 +213,9 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } - xsk_reuseq_destroy(umem); xp_destroy(umem->pool); - xdp_umem_unmap_pages(umem); xdp_umem_unpin_pages(umem); - kvfree(umem->pages); - umem->pages = NULL; - xdp_umem_unaccount_pages(umem); kfree(umem); } @@ -385,11 +349,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; - umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK - : ~((u64)chunk_size - 1); umem->size = size; umem->headroom = headroom; - umem->chunk_size_nohr = chunk_size - headroom; umem->chunk_size = chunk_size; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; @@ -408,29 +369,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) goto out_account; - umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages), - GFP_KERNEL_ACCOUNT); - if (!umem->pages) { - err = -ENOMEM; - goto out_pin; - } - - err = xdp_umem_map_pages(umem); - if (err) - goto out_pages; - umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size, headroom, size, unaligned_chunks); if (!umem->pool) { err = -ENOMEM; - goto out_unmap; + goto out_pin; } return 0; -out_unmap: - xdp_umem_unmap_pages(umem); -out_pages: - kvfree(umem->pages); out_pin: xdp_umem_unpin_pages(umem); out_account: diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 6933f0d494ba..3f2ab732ab8b 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -39,24 +39,6 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) READ_ONCE(xs->umem->fq); } -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return xskq_cons_has_entries(umem->fq, cnt); -} -EXPORT_SYMBOL(xsk_umem_has_addrs); - -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return xskq_cons_peek_addr(umem->fq, addr, umem); -} -EXPORT_SYMBOL(xsk_umem_peek_addr); - -void xsk_umem_release_addr(struct xdp_umem *umem) -{ - xskq_cons_release(umem->fq); -} -EXPORT_SYMBOL(xsk_umem_release_addr); - void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { if (umem->need_wakeup & XDP_WAKEUP_RX) @@ -203,8 +185,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, len = xdp->data_end - xdp->data; - return xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || - xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? + return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len, explicit_free); } @@ -588,24 +569,6 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd) return sock; } -/* Check if umem pages are contiguous. - * If zero-copy mode, use the DMA address to do the page contiguity check - * For all other modes we use addr (kernel virtual address) - * Store the result in the low bits of addr. - */ -static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags) -{ - struct xdp_umem_page *pgs = umem->pages; - int i, is_contig; - - for (i = 0; i < umem->npgs - 1; i++) { - is_contig = (flags & XDP_ZEROCOPY) ? - (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) : - (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr); - pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT; - } -} - static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; @@ -688,23 +651,14 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } else { /* This xsk has its own umem. */ - xskq_set_umem(xs->umem->fq, xs->umem->size, - xs->umem->chunk_mask); - xskq_set_umem(xs->umem->cq, xs->umem->size, - xs->umem->chunk_mask); - err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); if (err) goto out_unlock; - - xsk_check_page_contiguity(xs->umem, flags); } xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; - xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask); - xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask); xdp_add_sk_umem(xs->umem, xs); out_unlock: diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index e214a5795a62..89dae78865e7 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -8,6 +8,13 @@ #include "xsk_queue.h" +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + struct xsk_buff_pool { struct xsk_queue *fq; struct list_head free_list; diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 554b1ebb4d02..6cf9586e5027 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -10,15 +10,6 @@ #include "xsk_queue.h" -void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask) -{ - if (!q) - return; - - q->umem_size = umem_size; - q->chunk_mask = chunk_mask; -} - static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { struct xdp_umem_ring *umem_ring; @@ -64,56 +55,3 @@ void xskq_destroy(struct xsk_queue *q) page_frag_free(q->ring); kfree(q); } - -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) -{ - struct xdp_umem_fq_reuse *newq; - - /* Check for overflow */ - if (nentries > (u32)roundup_pow_of_two(nentries)) - return NULL; - nentries = roundup_pow_of_two(nentries); - - newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL); - if (!newq) - return NULL; - memset(newq, 0, offsetof(typeof(*newq), handles)); - - newq->nentries = nentries; - return newq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_prepare); - -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq) -{ - struct xdp_umem_fq_reuse *oldq = umem->fq_reuse; - - if (!oldq) { - umem->fq_reuse = newq; - return NULL; - } - - if (newq->nentries < oldq->length) - return newq; - - memcpy(newq->handles, oldq->handles, - array_size(oldq->length, sizeof(u64))); - newq->length = oldq->length; - - umem->fq_reuse = newq; - return oldq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_swap); - -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) -{ - kvfree(rq); -} -EXPORT_SYMBOL_GPL(xsk_reuseq_free); - -void xsk_reuseq_destroy(struct xdp_umem *umem) -{ - xsk_reuseq_free(umem->fq_reuse); - umem->fq_reuse = NULL; -} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9151aef7dbca..16bf15864788 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -32,8 +32,6 @@ struct xdp_umem_ring { }; struct xsk_queue { - u64 chunk_mask; - u64 umem_size; u32 ring_mask; u32 nentries; u32 cached_prod; @@ -106,90 +104,6 @@ struct xsk_queue { /* Functions that read and validate content from consumer rings. */ -static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, - u64 addr, - u64 length) -{ - bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE; - bool next_pg_contig = - (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr & - XSK_NEXT_PG_CONTIG_MASK; - - return cross_pg && !next_pg_contig; -} - -static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, - u64 addr, - u64 length, - struct xdp_umem *umem) -{ - u64 base_addr = xsk_umem_extract_addr(addr); - - addr = xsk_umem_add_offset_to_addr(addr); - if (base_addr >= q->umem_size || addr >= q->umem_size || - xskq_cons_crosses_non_contig_pg(umem, addr, length)) { - q->invalid_descs++; - return false; - } - - return true; -} - -static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) -{ - if (addr >= q->umem_size) { - q->invalid_descs++; - return false; - } - - return true; -} - -static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - while (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; - - *addr = ring->desc[idx] & q->chunk_mask; - - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (xskq_cons_is_valid_unaligned(q, *addr, - umem->chunk_size_nohr, - umem)) - return true; - goto out; - } - - if (xskq_cons_is_valid_addr(q, *addr)) - return true; - -out: - q->cached_cons++; - } - - return false; -} - -static inline bool xskq_cons_read_addr_aligned(struct xsk_queue *q, u64 *addr) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - while (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; - - *addr = ring->desc[idx]; - if (xskq_cons_is_valid_addr(q, *addr)) - return true; - - q->cached_cons++; - } - - return false; -} - static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; @@ -267,21 +181,6 @@ static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) return entries >= cnt; } -static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) -{ - if (q->cached_prod == q->cached_cons) - xskq_cons_get_entries(q); - return xskq_cons_read_addr(q, addr, umem); -} - -static inline bool xskq_cons_peek_addr_aligned(struct xsk_queue *q, u64 *addr) -{ - if (q->cached_prod == q->cached_cons) - xskq_cons_get_entries(q); - return xskq_cons_read_addr_aligned(q, addr); -} - static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_prod == q->cached_cons) @@ -410,11 +309,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); -/* Executed by the core when the entire UMEM gets freed */ -void xsk_reuseq_destroy(struct xdp_umem *umem); - #endif /* _LINUX_XSK_QUEUE_H */ -- cgit v1.2.3 From 26062b185eee49142adc45f9aa187d909d02d961 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:21:02 +0200 Subject: xsk: Explicitly inline functions and move definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to reduce the number of function calls, the struct xsk_buff_pool definition is moved to xsk_buff_pool.h. The functions xp_get_dma(), xp_dma_sync_for_cpu(), xp_dma_sync_for_device(), xp_validate_desc() and various helper functions are explicitly inlined. Further, move xp_get_handle() and xp_release() to xsk.c, to allow for the compiler to perform inlining. rfc->v1: Make sure xp_validate_desc() is inlined for Tx perf. (Maxim) Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-15-bjorn.topel@gmail.com --- include/net/xsk_buff_pool.h | 98 ++++++++++++++++++++++++++--- net/xdp/xsk.c | 15 +++++ net/xdp/xsk_buff_pool.c | 148 ++------------------------------------------ net/xdp/xsk_queue.h | 45 ++++++++++++++ 4 files changed, 156 insertions(+), 150 deletions(-) (limited to 'include') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 9f221b36e405..a4ff226505c9 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -4,6 +4,7 @@ #ifndef XSK_BUFF_POOL_H_ #define XSK_BUFF_POOL_H_ +#include #include #include #include @@ -25,6 +26,27 @@ struct xdp_buff_xsk { struct list_head free_list_node; }; +struct xsk_buff_pool { + struct xsk_queue *fq; + struct list_head free_list; + dma_addr_t *dma_pages; + struct xdp_buff_xsk *heads; + u64 chunk_mask; + u64 addrs_cnt; + u32 free_list_cnt; + u32 dma_pages_cnt; + u32 heads_cnt; + u32 free_heads_cnt; + u32 headroom; + u32 chunk_size; + u32 frame_len; + bool cheap_dma; + bool unaligned; + void *addrs; + struct device *dev; + struct xdp_buff_xsk *free_heads[]; +}; + /* AF_XDP core. */ struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, u32 chunk_size, u32 headroom, u64 size, @@ -32,8 +54,6 @@ struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); void xp_destroy(struct xsk_buff_pool *pool); void xp_release(struct xdp_buff_xsk *xskb); -u64 xp_get_handle(struct xdp_buff_xsk *xskb); -bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); /* AF_XDP, and XDP core. */ void xp_free(struct xdp_buff_xsk *xskb); @@ -47,10 +67,74 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool); bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); -dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb); -dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb); -void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb); -void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, - size_t size); +static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->dma; +} + +static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->frame_dma; +} + +void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); +static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) +{ + if (xskb->pool->cheap_dma) + return; + + xp_dma_sync_for_cpu_slow(xskb); +} + +void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size); +static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, + dma_addr_t dma, size_t size) +{ + if (pool->cheap_dma) + return; + + xp_dma_sync_for_device_slow(pool, dma, size); +} + +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + +static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr, u32 len) +{ + bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; + + if (pool->dma_pages_cnt && cross_pg) { + return !(pool->dma_pages[addr >> PAGE_SHIFT] & + XSK_NEXT_PG_CONTIG_MASK); + } + return false; +} + +static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) +{ + return addr & pool->chunk_mask; +} + +static inline u64 xp_unaligned_extract_addr(u64 addr) +{ + return addr & XSK_UNALIGNED_BUF_ADDR_MASK; +} + +static inline u64 xp_unaligned_extract_offset(u64 addr) +{ + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; +} + +static inline u64 xp_unaligned_add_offset_to_addr(u64 addr) +{ + return xp_unaligned_extract_addr(addr) + + xp_unaligned_extract_offset(addr); +} #endif /* XSK_BUFF_POOL_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3f2ab732ab8b..b6c0f08bd80d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -99,6 +99,21 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); +void xp_release(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; +} + +static u64 xp_get_handle(struct xdp_buff_xsk *xskb) +{ + u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + + offset += xskb->pool->headroom; + if (!xskb->pool->unaligned) + return xskb->orig_addr + offset; + return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); +} + static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 89dae78865e7..540ed75e4482 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -8,34 +8,6 @@ #include "xsk_queue.h" -/* Masks for xdp_umem_page flags. - * The low 12-bits of the addr will be 0 since this is the page address, so we - * can use them for flags. - */ -#define XSK_NEXT_PG_CONTIG_SHIFT 0 -#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) - -struct xsk_buff_pool { - struct xsk_queue *fq; - struct list_head free_list; - dma_addr_t *dma_pages; - struct xdp_buff_xsk *heads; - u64 chunk_mask; - u64 addrs_cnt; - u32 free_list_cnt; - u32 dma_pages_cnt; - u32 heads_cnt; - u32 free_heads_cnt; - u32 headroom; - u32 chunk_size; - u32 frame_len; - bool cheap_dma; - bool unaligned; - void *addrs; - struct device *dev; - struct xdp_buff_xsk *free_heads[]; -}; - static void xp_addr_unmap(struct xsk_buff_pool *pool) { vunmap(pool->addrs); @@ -228,50 +200,12 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, } EXPORT_SYMBOL(xp_dma_map); -static bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, - u64 addr, u32 len) -{ - bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; - - if (pool->dma_pages_cnt && cross_pg) { - return !(pool->dma_pages[addr >> PAGE_SHIFT] & - XSK_NEXT_PG_CONTIG_MASK); - } - return false; -} - static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool, u64 addr) { return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size); } -void xp_release(struct xdp_buff_xsk *xskb) -{ - xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; -} - -static u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) -{ - return addr & pool->chunk_mask; -} - -static u64 xp_unaligned_extract_addr(u64 addr) -{ - return addr & XSK_UNALIGNED_BUF_ADDR_MASK; -} - -static u64 xp_unaligned_extract_offset(u64 addr) -{ - return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; -} - -static u64 xp_unaligned_add_offset_to_addr(u64 addr) -{ - return xp_unaligned_extract_addr(addr) + - xp_unaligned_extract_offset(addr); -} - static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr) { *addr = xp_unaligned_extract_addr(*addr); @@ -370,60 +304,6 @@ void xp_free(struct xdp_buff_xsk *xskb) } EXPORT_SYMBOL(xp_free); -static bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, - struct xdp_desc *desc) -{ - u64 chunk, chunk_end; - - chunk = xp_aligned_extract_addr(pool, desc->addr); - chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); - if (chunk != chunk_end) - return false; - - if (chunk >= pool->addrs_cnt) - return false; - - if (desc->options) - return false; - return true; -} - -static bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, - struct xdp_desc *desc) -{ - u64 addr, base_addr; - - base_addr = xp_unaligned_extract_addr(desc->addr); - addr = xp_unaligned_add_offset_to_addr(desc->addr); - - if (desc->len > pool->chunk_size) - return false; - - if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || - xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) - return false; - - if (desc->options) - return false; - return true; -} - -bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) -{ - return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : - xp_aligned_validate_desc(pool, desc); -} - -u64 xp_get_handle(struct xdp_buff_xsk *xskb) -{ - u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; - - offset += xskb->pool->headroom; - if (!xskb->pool->unaligned) - return xskb->orig_addr + offset; - return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); -} - void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; @@ -440,35 +320,17 @@ dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) } EXPORT_SYMBOL(xp_raw_get_dma); -dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) -{ - return xskb->dma; -} -EXPORT_SYMBOL(xp_get_dma); - -dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) +void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb) { - return xskb->frame_dma; -} -EXPORT_SYMBOL(xp_get_frame_dma); - -void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) -{ - if (xskb->pool->cheap_dma) - return; - dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, xskb->pool->frame_len, DMA_BIDIRECTIONAL); } -EXPORT_SYMBOL(xp_dma_sync_for_cpu); +EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow); -void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, - size_t size) +void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size) { - if (pool->cheap_dma) - return; - dma_sync_single_range_for_device(pool->dev, dma, 0, size, DMA_BIDIRECTIONAL); } -EXPORT_SYMBOL(xp_dma_sync_for_device); +EXPORT_SYMBOL(xp_dma_sync_for_device_slow); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 16bf15864788..5b5d24d2dd37 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -118,6 +118,51 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) return false; } +static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 chunk, chunk_end; + + chunk = xp_aligned_extract_addr(pool, desc->addr); + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); + if (chunk != chunk_end) + return false; + + if (chunk >= pool->addrs_cnt) + return false; + + if (desc->options) + return false; + return true; +} + +static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 addr, base_addr; + + base_addr = xp_unaligned_extract_addr(desc->addr); + addr = xp_unaligned_add_offset_to_addr(desc->addr); + + if (desc->len > pool->chunk_size) + return false; + + if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + return false; + + if (desc->options) + return false; + return true; +} + +static inline bool xp_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : + xp_aligned_validate_desc(pool, desc); +} + static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xdp_umem *umem) -- cgit v1.2.3