summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2023-07-20 01:02:17 +0300
committerJakub Kicinski <kuba@kernel.org>2023-07-20 01:02:18 +0300
commite93165d5e75d025280763ff01eece98575b32901 (patch)
treee74e36fe63ebafa383ac6f9914bfa611bd784319 /include
parent97083c21c5d3a1ab5e61a96bc6e76a62f7f23576 (diff)
parent6f5a630d7c57cd79b1f526a95e757311e32d41e5 (diff)
downloadlinux-e93165d5e75d025280763ff01eece98575b32901.tar.xz
Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says: ==================== pull-request: bpf-next 2023-07-19 We've added 45 non-merge commits during the last 3 day(s) which contain a total of 71 files changed, 7808 insertions(+), 592 deletions(-). The main changes are: 1) multi-buffer support in AF_XDP, from Maciej Fijalkowski, Magnus Karlsson, Tirthendu Sarkar. 2) BPF link support for tc BPF programs, from Daniel Borkmann. 3) Enable bpf_map_sum_elem_count kfunc for all program types, from Anton Protopopov. 4) Add 'owner' field to bpf_rb_node to fix races in shared ownership, Dave Marchevsky. 5) Prevent potential skb_header_pointer() misuse, from Alexei Starovoitov. * tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (45 commits) bpf, net: Introduce skb_pointer_if_linear(). bpf: sync tools/ uapi header with selftests/bpf: Add mprog API tests for BPF tcx links selftests/bpf: Add mprog API tests for BPF tcx opts bpftool: Extend net dump with tcx progs libbpf: Add helper macro to clear opts structs libbpf: Add link-based API for tcx libbpf: Add opts-based attach/detach/query API for tcx bpf: Add fd-based tcx multi-prog infra with link support bpf: Add generic attach/detach/query API for multi-progs selftests/xsk: reset NIC settings to default after running test suite selftests/xsk: add test for too many frags selftests/xsk: add metadata copy test for multi-buff selftests/xsk: add invalid descriptor test for multi-buffer selftests/xsk: add unaligned mode test for multi-buffer selftests/xsk: add basic multi-buffer test selftests/xsk: transmit and receive multi-buffer packets xsk: add multi-buffer documentation i40e: xsk: add TX multi-buffer support ice: xsk: Tx multi-buffer support ... ==================== Link: https://lore.kernel.org/r/20230719175424.75717-1-alexei.starovoitov@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/bpf.h12
-rw-r--r--include/linux/bpf_mprog.h327
-rw-r--r--include/linux/btf_ids.h1
-rw-r--r--include/linux/netdevice.h16
-rw-r--r--include/linux/skbuff.h14
-rw-r--r--include/net/sch_generic.h2
-rw-r--r--include/net/tcx.h206
-rw-r--r--include/net/xdp_sock.h7
-rw-r--r--include/net/xdp_sock_drv.h54
-rw-r--r--include/net/xsk_buff_pool.h7
-rw-r--r--include/uapi/linux/bpf.h72
-rw-r--r--include/uapi/linux/if_xdp.h13
-rw-r--r--include/uapi/linux/netdev.h1
13 files changed, 707 insertions, 25 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 360433f14496..ceaa8c23287f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -228,6 +228,18 @@ struct btf_record {
struct btf_field fields[];
};
+/* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h */
+struct bpf_rb_node_kern {
+ struct rb_node rb_node;
+ void *owner;
+} __attribute__((aligned(8)));
+
+/* Non-opaque version of bpf_list_node in uapi/linux/bpf.h */
+struct bpf_list_node_kern {
+ struct list_head list_head;
+ void *owner;
+} __attribute__((aligned(8)));
+
struct bpf_map {
/* The first two cachelines with read-mostly members of which some
* are also accessed in fast-path (e.g. ops, max_entries).
diff --git a/include/linux/bpf_mprog.h b/include/linux/bpf_mprog.h
new file mode 100644
index 000000000000..2b429488f840
--- /dev/null
+++ b/include/linux/bpf_mprog.h
@@ -0,0 +1,327 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2023 Isovalent */
+#ifndef __BPF_MPROG_H
+#define __BPF_MPROG_H
+
+#include <linux/bpf.h>
+
+/* bpf_mprog framework:
+ *
+ * bpf_mprog is a generic layer for multi-program attachment. In-kernel users
+ * of the bpf_mprog don't need to care about the dependency resolution
+ * internals, they can just consume it with few API calls. Currently available
+ * dependency directives are BPF_F_{BEFORE,AFTER} which enable insertion of
+ * a BPF program or BPF link relative to an existing BPF program or BPF link
+ * inside the multi-program array as well as prepend and append behavior if
+ * no relative object was specified, see corresponding selftests for concrete
+ * examples (e.g. tc_links and tc_opts test cases of test_progs).
+ *
+ * Usage of bpf_mprog_{attach,detach,query}() core APIs with pseudo code:
+ *
+ * Attach case:
+ *
+ * struct bpf_mprog_entry *entry, *entry_new;
+ * int ret;
+ *
+ * // bpf_mprog user-side lock
+ * // fetch active @entry from attach location
+ * [...]
+ * ret = bpf_mprog_attach(entry, &entry_new, [...]);
+ * if (!ret) {
+ * if (entry != entry_new) {
+ * // swap @entry to @entry_new at attach location
+ * // ensure there are no inflight users of @entry:
+ * synchronize_rcu();
+ * }
+ * bpf_mprog_commit(entry);
+ * } else {
+ * // error path, bail out, propagate @ret
+ * }
+ * // bpf_mprog user-side unlock
+ *
+ * Detach case:
+ *
+ * struct bpf_mprog_entry *entry, *entry_new;
+ * int ret;
+ *
+ * // bpf_mprog user-side lock
+ * // fetch active @entry from attach location
+ * [...]
+ * ret = bpf_mprog_detach(entry, &entry_new, [...]);
+ * if (!ret) {
+ * // all (*) marked is optional and depends on the use-case
+ * // whether bpf_mprog_bundle should be freed or not
+ * if (!bpf_mprog_total(entry_new)) (*)
+ * entry_new = NULL (*)
+ * // swap @entry to @entry_new at attach location
+ * // ensure there are no inflight users of @entry:
+ * synchronize_rcu();
+ * bpf_mprog_commit(entry);
+ * if (!entry_new) (*)
+ * // free bpf_mprog_bundle (*)
+ * } else {
+ * // error path, bail out, propagate @ret
+ * }
+ * // bpf_mprog user-side unlock
+ *
+ * Query case:
+ *
+ * struct bpf_mprog_entry *entry;
+ * int ret;
+ *
+ * // bpf_mprog user-side lock
+ * // fetch active @entry from attach location
+ * [...]
+ * ret = bpf_mprog_query(attr, uattr, entry);
+ * // bpf_mprog user-side unlock
+ *
+ * Data/fast path:
+ *
+ * struct bpf_mprog_entry *entry;
+ * struct bpf_mprog_fp *fp;
+ * struct bpf_prog *prog;
+ * int ret = [...];
+ *
+ * rcu_read_lock();
+ * // fetch active @entry from attach location
+ * [...]
+ * bpf_mprog_foreach_prog(entry, fp, prog) {
+ * ret = bpf_prog_run(prog, [...]);
+ * // process @ret from program
+ * }
+ * [...]
+ * rcu_read_unlock();
+ *
+ * bpf_mprog locking considerations:
+ *
+ * bpf_mprog_{attach,detach,query}() must be protected by an external lock
+ * (like RTNL in case of tcx).
+ *
+ * bpf_mprog_entry pointer can be an __rcu annotated pointer (in case of tcx
+ * the netdevice has tcx_ingress and tcx_egress __rcu pointer) which gets
+ * updated via rcu_assign_pointer() pointing to the active bpf_mprog_entry of
+ * the bpf_mprog_bundle.
+ *
+ * Fast path accesses the active bpf_mprog_entry within RCU critical section
+ * (in case of tcx it runs in NAPI which provides RCU protection there,
+ * other users might need explicit rcu_read_lock()). The bpf_mprog_commit()
+ * assumes that for the old bpf_mprog_entry there are no inflight users
+ * anymore.
+ *
+ * The READ_ONCE()/WRITE_ONCE() pairing for bpf_mprog_fp's prog access is for
+ * the replacement case where we don't swap the bpf_mprog_entry.
+ */
+
+#define bpf_mprog_foreach_tuple(entry, fp, cp, t) \
+ for (fp = &entry->fp_items[0], cp = &entry->parent->cp_items[0];\
+ ({ \
+ t.prog = READ_ONCE(fp->prog); \
+ t.link = cp->link; \
+ t.prog; \
+ }); \
+ fp++, cp++)
+
+#define bpf_mprog_foreach_prog(entry, fp, p) \
+ for (fp = &entry->fp_items[0]; \
+ (p = READ_ONCE(fp->prog)); \
+ fp++)
+
+#define BPF_MPROG_MAX 64
+
+struct bpf_mprog_fp {
+ struct bpf_prog *prog;
+};
+
+struct bpf_mprog_cp {
+ struct bpf_link *link;
+};
+
+struct bpf_mprog_entry {
+ struct bpf_mprog_fp fp_items[BPF_MPROG_MAX];
+ struct bpf_mprog_bundle *parent;
+};
+
+struct bpf_mprog_bundle {
+ struct bpf_mprog_entry a;
+ struct bpf_mprog_entry b;
+ struct bpf_mprog_cp cp_items[BPF_MPROG_MAX];
+ struct bpf_prog *ref;
+ atomic64_t revision;
+ u32 count;
+};
+
+struct bpf_tuple {
+ struct bpf_prog *prog;
+ struct bpf_link *link;
+};
+
+static inline struct bpf_mprog_entry *
+bpf_mprog_peer(const struct bpf_mprog_entry *entry)
+{
+ if (entry == &entry->parent->a)
+ return &entry->parent->b;
+ else
+ return &entry->parent->a;
+}
+
+static inline void bpf_mprog_bundle_init(struct bpf_mprog_bundle *bundle)
+{
+ BUILD_BUG_ON(sizeof(bundle->a.fp_items[0]) > sizeof(u64));
+ BUILD_BUG_ON(ARRAY_SIZE(bundle->a.fp_items) !=
+ ARRAY_SIZE(bundle->cp_items));
+
+ memset(bundle, 0, sizeof(*bundle));
+ atomic64_set(&bundle->revision, 1);
+ bundle->a.parent = bundle;
+ bundle->b.parent = bundle;
+}
+
+static inline void bpf_mprog_inc(struct bpf_mprog_entry *entry)
+{
+ entry->parent->count++;
+}
+
+static inline void bpf_mprog_dec(struct bpf_mprog_entry *entry)
+{
+ entry->parent->count--;
+}
+
+static inline int bpf_mprog_max(void)
+{
+ return ARRAY_SIZE(((struct bpf_mprog_entry *)NULL)->fp_items) - 1;
+}
+
+static inline int bpf_mprog_total(struct bpf_mprog_entry *entry)
+{
+ int total = entry->parent->count;
+
+ WARN_ON_ONCE(total > bpf_mprog_max());
+ return total;
+}
+
+static inline bool bpf_mprog_exists(struct bpf_mprog_entry *entry,
+ struct bpf_prog *prog)
+{
+ const struct bpf_mprog_fp *fp;
+ const struct bpf_prog *tmp;
+
+ bpf_mprog_foreach_prog(entry, fp, tmp) {
+ if (tmp == prog)
+ return true;
+ }
+ return false;
+}
+
+static inline void bpf_mprog_mark_for_release(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ WARN_ON_ONCE(entry->parent->ref);
+ if (!tuple->link)
+ entry->parent->ref = tuple->prog;
+}
+
+static inline void bpf_mprog_complete_release(struct bpf_mprog_entry *entry)
+{
+ /* In the non-link case prog deletions can only drop the reference
+ * to the prog after the bpf_mprog_entry got swapped and the
+ * bpf_mprog ensured that there are no inflight users anymore.
+ *
+ * Paired with bpf_mprog_mark_for_release().
+ */
+ if (entry->parent->ref) {
+ bpf_prog_put(entry->parent->ref);
+ entry->parent->ref = NULL;
+ }
+}
+
+static inline void bpf_mprog_revision_new(struct bpf_mprog_entry *entry)
+{
+ atomic64_inc(&entry->parent->revision);
+}
+
+static inline void bpf_mprog_commit(struct bpf_mprog_entry *entry)
+{
+ bpf_mprog_complete_release(entry);
+ bpf_mprog_revision_new(entry);
+}
+
+static inline u64 bpf_mprog_revision(struct bpf_mprog_entry *entry)
+{
+ return atomic64_read(&entry->parent->revision);
+}
+
+static inline void bpf_mprog_entry_copy(struct bpf_mprog_entry *dst,
+ struct bpf_mprog_entry *src)
+{
+ memcpy(dst->fp_items, src->fp_items, sizeof(src->fp_items));
+}
+
+static inline void bpf_mprog_entry_grow(struct bpf_mprog_entry *entry, int idx)
+{
+ int total = bpf_mprog_total(entry);
+
+ memmove(entry->fp_items + idx + 1,
+ entry->fp_items + idx,
+ (total - idx) * sizeof(struct bpf_mprog_fp));
+
+ memmove(entry->parent->cp_items + idx + 1,
+ entry->parent->cp_items + idx,
+ (total - idx) * sizeof(struct bpf_mprog_cp));
+}
+
+static inline void bpf_mprog_entry_shrink(struct bpf_mprog_entry *entry, int idx)
+{
+ /* Total array size is needed in this case to enure the NULL
+ * entry is copied at the end.
+ */
+ int total = ARRAY_SIZE(entry->fp_items);
+
+ memmove(entry->fp_items + idx,
+ entry->fp_items + idx + 1,
+ (total - idx - 1) * sizeof(struct bpf_mprog_fp));
+
+ memmove(entry->parent->cp_items + idx,
+ entry->parent->cp_items + idx + 1,
+ (total - idx - 1) * sizeof(struct bpf_mprog_cp));
+}
+
+static inline void bpf_mprog_read(struct bpf_mprog_entry *entry, u32 idx,
+ struct bpf_mprog_fp **fp,
+ struct bpf_mprog_cp **cp)
+{
+ *fp = &entry->fp_items[idx];
+ *cp = &entry->parent->cp_items[idx];
+}
+
+static inline void bpf_mprog_write(struct bpf_mprog_fp *fp,
+ struct bpf_mprog_cp *cp,
+ struct bpf_tuple *tuple)
+{
+ WRITE_ONCE(fp->prog, tuple->prog);
+ cp->link = tuple->link;
+}
+
+int bpf_mprog_attach(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_prog *prog_new, struct bpf_link *link,
+ struct bpf_prog *prog_old,
+ u32 flags, u32 id_or_fd, u64 revision);
+
+int bpf_mprog_detach(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_prog *prog, struct bpf_link *link,
+ u32 flags, u32 id_or_fd, u64 revision);
+
+int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr,
+ struct bpf_mprog_entry *entry);
+
+static inline bool bpf_mprog_supported(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_SCHED_CLS:
+ return true;
+ default:
+ return false;
+ }
+}
+#endif /* __BPF_MPROG_H */
diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 00950cc03bff..a3462a9b8e18 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -267,5 +267,6 @@ MAX_BTF_TRACING_TYPE,
extern u32 btf_tracing_ids[];
extern u32 bpf_cgroup_btf_id[];
extern u32 bpf_local_storage_map_btf_id[];
+extern u32 btf_bpf_map_id[];
#endif
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b828c7a75be2..3800d0479698 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1930,8 +1930,7 @@ enum netdev_ml_priv_type {
*
* @rx_handler: handler for received packets
* @rx_handler_data: XXX: need comments on this one
- * @miniq_ingress: ingress/clsact qdisc specific data for
- * ingress processing
+ * @tcx_ingress: BPF & clsact qdisc specific data for ingress processing
* @ingress_queue: XXX: need comments on this one
* @nf_hooks_ingress: netfilter hooks executed for ingress packets
* @broadcast: hw bcast address
@@ -1952,8 +1951,7 @@ enum netdev_ml_priv_type {
* @xps_maps: all CPUs/RXQs maps for XPS device
*
* @xps_maps: XXX: need comments on this one
- * @miniq_egress: clsact qdisc specific data for
- * egress processing
+ * @tcx_egress: BPF & clsact qdisc specific data for egress processing
* @nf_hooks_egress: netfilter hooks executed for egress packets
* @qdisc_hash: qdisc hash table
* @watchdog_timeo: Represents the timeout that is used by
@@ -2250,11 +2248,11 @@ struct net_device {
#define GRO_MAX_SIZE (8 * 65535u)
unsigned int gro_max_size;
unsigned int gro_ipv4_max_size;
+ unsigned int xdp_zc_max_segs;
rx_handler_func_t __rcu *rx_handler;
void __rcu *rx_handler_data;
-
-#ifdef CONFIG_NET_CLS_ACT
- struct mini_Qdisc __rcu *miniq_ingress;
+#ifdef CONFIG_NET_XGRESS
+ struct bpf_mprog_entry __rcu *tcx_ingress;
#endif
struct netdev_queue __rcu *ingress_queue;
#ifdef CONFIG_NETFILTER_INGRESS
@@ -2282,8 +2280,8 @@ struct net_device {
#ifdef CONFIG_XPS
struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
#endif
-#ifdef CONFIG_NET_CLS_ACT
- struct mini_Qdisc __rcu *miniq_egress;
+#ifdef CONFIG_NET_XGRESS
+ struct bpf_mprog_entry __rcu *tcx_egress;
#endif
#ifdef CONFIG_NETFILTER_EGRESS
struct nf_hook_entries __rcu *nf_hooks_egress;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 91ed66952580..faaba050f843 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -944,7 +944,7 @@ struct sk_buff {
__u8 __mono_tc_offset[0];
/* public: */
__u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_XGRESS
__u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */
__u8 tc_skip_classify:1;
#endif
@@ -993,7 +993,7 @@ struct sk_buff {
__u8 csum_not_inet:1;
#endif
-#ifdef CONFIG_NET_SCHED
+#if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
__u16 tc_index; /* traffic control index */
#endif
@@ -4023,7 +4023,7 @@ __skb_header_pointer(const struct sk_buff *skb, int offset, int len,
if (likely(hlen - offset >= len))
return (void *)data + offset;
- if (!skb || !buffer || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0))
+ if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0))
return NULL;
return buffer;
@@ -4036,6 +4036,14 @@ skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer)
skb_headlen(skb), buffer);
}
+static inline void * __must_check
+skb_pointer_if_linear(const struct sk_buff *skb, int offset, int len)
+{
+ if (likely(skb_headlen(skb) - offset >= len))
+ return skb->data + offset;
+ return NULL;
+}
+
/**
* skb_needs_linearize - check if we need to linearize a given skb
* depending on the given device features.
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e92f73bb3198..15be2d96b06d 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -703,7 +703,7 @@ int skb_do_redirect(struct sk_buff *);
static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
{
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_XGRESS
return skb->tc_at_ingress;
#else
return false;
diff --git a/include/net/tcx.h b/include/net/tcx.h
new file mode 100644
index 000000000000..264f147953ba
--- /dev/null
+++ b/include/net/tcx.h
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2023 Isovalent */
+#ifndef __NET_TCX_H
+#define __NET_TCX_H
+
+#include <linux/bpf.h>
+#include <linux/bpf_mprog.h>
+
+#include <net/sch_generic.h>
+
+struct mini_Qdisc;
+
+struct tcx_entry {
+ struct mini_Qdisc __rcu *miniq;
+ struct bpf_mprog_bundle bundle;
+ bool miniq_active;
+ struct rcu_head rcu;
+};
+
+struct tcx_link {
+ struct bpf_link link;
+ struct net_device *dev;
+ u32 location;
+};
+
+static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress)
+{
+#ifdef CONFIG_NET_XGRESS
+ skb->tc_at_ingress = ingress;
+#endif
+}
+
+#ifdef CONFIG_NET_XGRESS
+static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry)
+{
+ struct bpf_mprog_bundle *bundle = entry->parent;
+
+ return container_of(bundle, struct tcx_entry, bundle);
+}
+
+static inline struct tcx_link *tcx_link(struct bpf_link *link)
+{
+ return container_of(link, struct tcx_link, link);
+}
+
+static inline const struct tcx_link *tcx_link_const(const struct bpf_link *link)
+{
+ return tcx_link((struct bpf_link *)link);
+}
+
+void tcx_inc(void);
+void tcx_dec(void);
+
+static inline void tcx_entry_sync(void)
+{
+ /* bpf_mprog_entry got a/b swapped, therefore ensure that
+ * there are no inflight users on the old one anymore.
+ */
+ synchronize_rcu();
+}
+
+static inline void
+tcx_entry_update(struct net_device *dev, struct bpf_mprog_entry *entry,
+ bool ingress)
+{
+ ASSERT_RTNL();
+ if (ingress)
+ rcu_assign_pointer(dev->tcx_ingress, entry);
+ else
+ rcu_assign_pointer(dev->tcx_egress, entry);
+}
+
+static inline struct bpf_mprog_entry *
+tcx_entry_fetch(struct net_device *dev, bool ingress)
+{
+ ASSERT_RTNL();
+ if (ingress)
+ return rcu_dereference_rtnl(dev->tcx_ingress);
+ else
+ return rcu_dereference_rtnl(dev->tcx_egress);
+}
+
+static inline struct bpf_mprog_entry *tcx_entry_create(void)
+{
+ struct tcx_entry *tcx = kzalloc(sizeof(*tcx), GFP_KERNEL);
+
+ if (tcx) {
+ bpf_mprog_bundle_init(&tcx->bundle);
+ return &tcx->bundle.a;
+ }
+ return NULL;
+}
+
+static inline void tcx_entry_free(struct bpf_mprog_entry *entry)
+{
+ kfree_rcu(tcx_entry(entry), rcu);
+}
+
+static inline struct bpf_mprog_entry *
+tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created)
+{
+ struct bpf_mprog_entry *entry = tcx_entry_fetch(dev, ingress);
+
+ *created = false;
+ if (!entry) {
+ entry = tcx_entry_create();
+ if (!entry)
+ return NULL;
+ *created = true;
+ }
+ return entry;
+}
+
+static inline void tcx_skeys_inc(bool ingress)
+{
+ tcx_inc();
+ if (ingress)
+ net_inc_ingress_queue();
+ else
+ net_inc_egress_queue();
+}
+
+static inline void tcx_skeys_dec(bool ingress)
+{
+ if (ingress)
+ net_dec_ingress_queue();
+ else
+ net_dec_egress_queue();
+ tcx_dec();
+}
+
+static inline void tcx_miniq_set_active(struct bpf_mprog_entry *entry,
+ const bool active)
+{
+ ASSERT_RTNL();
+ tcx_entry(entry)->miniq_active = active;
+}
+
+static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry)
+{
+ ASSERT_RTNL();
+ return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active;
+}
+
+static inline enum tcx_action_base tcx_action_code(struct sk_buff *skb,
+ int code)
+{
+ switch (code) {
+ case TCX_PASS:
+ skb->tc_index = qdisc_skb_cb(skb)->tc_classid;
+ fallthrough;
+ case TCX_DROP:
+ case TCX_REDIRECT:
+ return code;
+ case TCX_NEXT:
+ default:
+ return TCX_NEXT;
+ }
+}
+#endif /* CONFIG_NET_XGRESS */
+
+#if defined(CONFIG_NET_XGRESS) && defined(CONFIG_BPF_SYSCALL)
+int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog);
+void tcx_uninstall(struct net_device *dev, bool ingress);
+
+int tcx_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr);
+
+static inline void dev_tcx_uninstall(struct net_device *dev)
+{
+ ASSERT_RTNL();
+ tcx_uninstall(dev, true);
+ tcx_uninstall(dev, false);
+}
+#else
+static inline int tcx_prog_attach(const union bpf_attr *attr,
+ struct bpf_prog *prog)
+{
+ return -EINVAL;
+}
+
+static inline int tcx_link_attach(const union bpf_attr *attr,
+ struct bpf_prog *prog)
+{
+ return -EINVAL;
+}
+
+static inline int tcx_prog_detach(const union bpf_attr *attr,
+ struct bpf_prog *prog)
+{
+ return -EINVAL;
+}
+
+static inline int tcx_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return -EINVAL;
+}
+
+static inline void dev_tcx_uninstall(struct net_device *dev)
+{
+}
+#endif /* CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL */
+#endif /* __NET_TCX_H */
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index e96a1151ec75..1617af380162 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -52,6 +52,7 @@ struct xdp_sock {
struct xsk_buff_pool *pool;
u16 queue_id;
bool zc;
+ bool sg;
enum {
XSK_READY = 0,
XSK_BOUND,
@@ -67,6 +68,12 @@ struct xdp_sock {
u64 rx_dropped;
u64 rx_queue_full;
+ /* When __xsk_generic_xmit() must return before it sees the EOP descriptor for the current
+ * packet, the partially built skb is saved here so that packet building can resume in next
+ * call of __xsk_generic_xmit().
+ */
+ struct sk_buff *skb;
+
struct list_head map_list;
/* Protects map_list */
spinlock_t map_list_lock;
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index c243f906ebed..1f6fc8c7a84c 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -89,6 +89,11 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool)
return xp_alloc(pool);
}
+static inline bool xsk_is_eop_desc(struct xdp_desc *desc)
+{
+ return !xp_mb_desc(desc);
+}
+
/* Returns as many entries as possible up to max. 0 <= N <= max. */
static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
{
@@ -103,10 +108,45 @@ static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count)
static inline void xsk_buff_free(struct xdp_buff *xdp)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+ struct list_head *xskb_list = &xskb->pool->xskb_list;
+ struct xdp_buff_xsk *pos, *tmp;
+
+ if (likely(!xdp_buff_has_frags(xdp)))
+ goto out;
+ list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
+ list_del(&pos->xskb_list_node);
+ xp_free(pos);
+ }
+
+ xdp_get_shared_info_from_buff(xdp)->nr_frags = 0;
+out:
xp_free(xskb);
}
+static inline void xsk_buff_add_frag(struct xdp_buff *xdp)
+{
+ struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+ list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list);
+}
+
+static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
+{
+ struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
+ struct xdp_buff *ret = NULL;
+ struct xdp_buff_xsk *frag;
+
+ frag = list_first_entry_or_null(&xskb->pool->xskb_list,
+ struct xdp_buff_xsk, xskb_list_node);
+ if (frag) {
+ list_del(&frag->xskb_list_node);
+ ret = &frag->xdp;
+ }
+
+ return ret;
+}
+
static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
{
xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
@@ -241,6 +281,11 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool)
return NULL;
}
+static inline bool xsk_is_eop_desc(struct xdp_desc *desc)
+{
+ return false;
+}
+
static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
{
return 0;
@@ -255,6 +300,15 @@ static inline void xsk_buff_free(struct xdp_buff *xdp)
{
}
+static inline void xsk_buff_add_frag(struct xdp_buff *xdp)
+{
+}
+
+static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
+{
+ return NULL;
+}
+
static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
{
}
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index a8d7b8a3688a..b0bdff26fc88 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -29,6 +29,7 @@ struct xdp_buff_xsk {
struct xsk_buff_pool *pool;
u64 orig_addr;
struct list_head free_list_node;
+ struct list_head xskb_list_node;
};
#define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb))
@@ -54,6 +55,7 @@ struct xsk_buff_pool {
struct xdp_umem *umem;
struct work_struct work;
struct list_head free_list;
+ struct list_head xskb_list;
u32 heads_cnt;
u16 queue_id;
@@ -184,6 +186,11 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
!(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK);
}
+static inline bool xp_mb_desc(struct xdp_desc *desc)
+{
+ return desc->options & XDP_PKT_CONTD;
+}
+
static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr)
{
return addr & pool->chunk_mask;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 600d0caebbd8..739c15906a65 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1036,6 +1036,8 @@ enum bpf_attach_type {
BPF_LSM_CGROUP,
BPF_STRUCT_OPS,
BPF_NETFILTER,
+ BPF_TCX_INGRESS,
+ BPF_TCX_EGRESS,
__MAX_BPF_ATTACH_TYPE
};
@@ -1053,7 +1055,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_KPROBE_MULTI = 8,
BPF_LINK_TYPE_STRUCT_OPS = 9,
BPF_LINK_TYPE_NETFILTER = 10,
-
+ BPF_LINK_TYPE_TCX = 11,
MAX_BPF_LINK_TYPE,
};
@@ -1113,7 +1115,12 @@ enum bpf_perf_event_type {
*/
#define BPF_F_ALLOW_OVERRIDE (1U << 0)
#define BPF_F_ALLOW_MULTI (1U << 1)
+/* Generic attachment flags. */
#define BPF_F_REPLACE (1U << 2)
+#define BPF_F_BEFORE (1U << 3)
+#define BPF_F_AFTER (1U << 4)
+#define BPF_F_ID (1U << 5)
+#define BPF_F_LINK BPF_F_LINK /* 1 << 13 */
/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
* verifier will perform strict alignment checking as if the kernel
@@ -1444,14 +1451,19 @@ union bpf_attr {
};
struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
- __u32 target_fd; /* container object to attach to */
- __u32 attach_bpf_fd; /* eBPF program to attach */
+ union {
+ __u32 target_fd; /* target object to attach to or ... */
+ __u32 target_ifindex; /* target ifindex */
+ };
+ __u32 attach_bpf_fd;
__u32 attach_type;
__u32 attach_flags;
- __u32 replace_bpf_fd; /* previously attached eBPF
- * program to replace if
- * BPF_F_REPLACE is used
- */
+ __u32 replace_bpf_fd;
+ union {
+ __u32 relative_fd;
+ __u32 relative_id;
+ };
+ __u64 expected_revision;
};
struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
@@ -1497,16 +1509,26 @@ union bpf_attr {
} info;
struct { /* anonymous struct used by BPF_PROG_QUERY command */
- __u32 target_fd; /* container object to query */
+ union {
+ __u32 target_fd; /* target object to query or ... */
+ __u32 target_ifindex; /* target ifindex */
+ };
__u32 attach_type;
__u32 query_flags;
__u32 attach_flags;
__aligned_u64 prog_ids;
- __u32 prog_cnt;
+ union {
+ __u32 prog_cnt;
+ __u32 count;
+ };
+ __u32 :32;
/* output: per-program attach_flags.
* not allowed to be set during effective query.
*/
__aligned_u64 prog_attach_flags;
+ __aligned_u64 link_ids;
+ __aligned_u64 link_attach_flags;
+ __u64 revision;
} query;
struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
@@ -1549,13 +1571,13 @@ union bpf_attr {
__u32 map_fd; /* struct_ops to attach */
};
union {
- __u32 target_fd; /* object to attach to */
- __u32 target_ifindex; /* target ifindex */
+ __u32 target_fd; /* target object to attach to or ... */
+ __u32 target_ifindex; /* target ifindex */
};
__u32 attach_type; /* attach type */
__u32 flags; /* extra flags */
union {
- __u32 target_btf_id; /* btf_id of target to attach to */
+ __u32 target_btf_id; /* btf_id of target to attach to */
struct {
__aligned_u64 iter_info; /* extra bpf_iter_link_info */
__u32 iter_info_len; /* iter_info length */
@@ -1589,6 +1611,13 @@ union bpf_attr {
__s32 priority;
__u32 flags;
} netfilter;
+ struct {
+ union {
+ __u32 relative_fd;
+ __u32 relative_id;
+ };
+ __u64 expected_revision;
+ } tcx;
};
} link_create;
@@ -6197,6 +6226,19 @@ struct bpf_sock_tuple {
};
};
+/* (Simplified) user return codes for tcx prog type.
+ * A valid tcx program must return one of these defined values. All other
+ * return codes are reserved for future use. Must remain compatible with
+ * their TC_ACT_* counter-parts. For compatibility in behavior, unknown
+ * return codes are mapped to TCX_NEXT.
+ */
+enum tcx_action_base {
+ TCX_NEXT = -1,
+ TCX_PASS = 0,
+ TCX_DROP = 2,
+ TCX_REDIRECT = 7,
+};
+
struct bpf_xdp_sock {
__u32 queue_id;
};
@@ -6479,6 +6521,10 @@ struct bpf_link_info {
} event; /* BPF_PERF_EVENT_EVENT */
};
} perf_event;
+ struct {
+ __u32 ifindex;
+ __u32 attach_type;
+ } tcx;
};
} __attribute__((aligned(8)));
@@ -7052,6 +7098,7 @@ struct bpf_list_head {
struct bpf_list_node {
__u64 :64;
__u64 :64;
+ __u64 :64;
} __attribute__((aligned(8)));
struct bpf_rb_root {
@@ -7063,6 +7110,7 @@ struct bpf_rb_node {
__u64 :64;
__u64 :64;
__u64 :64;
+ __u64 :64;
} __attribute__((aligned(8)));
struct bpf_refcount {
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index a78a8096f4ce..8d48863472b9 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -25,6 +25,12 @@
* application.
*/
#define XDP_USE_NEED_WAKEUP (1 << 3)
+/* By setting this option, userspace application indicates that it can
+ * handle multiple descriptors per packet thus enabling AF_XDP to split
+ * multi-buffer XDP frames into multiple Rx descriptors. Without this set
+ * such frames will be dropped.
+ */
+#define XDP_USE_SG (1 << 4)
/* Flags for xsk_umem_config flags */
#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
@@ -108,4 +114,11 @@ struct xdp_desc {
/* UMEM descriptor is __u64 */
+/* Flag indicating that the packet continues with the buffer pointed out by the
+ * next frame in the ring. The end of the packet is signalled by setting this
+ * bit to zero. For single buffer packets, every descriptor has 'options' set
+ * to 0 and this maintains backward compatibility.
+ */
+#define XDP_PKT_CONTD (1 << 0)
+
#endif /* _LINUX_IF_XDP_H */
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 639524b59930..bf71698a1e82 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -41,6 +41,7 @@ enum {
NETDEV_A_DEV_IFINDEX = 1,
NETDEV_A_DEV_PAD,
NETDEV_A_DEV_XDP_FEATURES,
+ NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
__NETDEV_A_DEV_MAX,
NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)