author    David S. Miller <davem@davemloft.net>  2021-06-29 01:28:03 +0300
committer David S. Miller <davem@davemloft.net>  2021-06-29 01:28:03 +0300
commit    e1289cfb634c19b5755452ba03c82aa76c0cfd7c (patch)
tree      fbe559fefa4b0141b31dc00bc3fbe962741a19f8 /net
parent    1fd07f33c3ea2b4aa77426f13e8cb91d4f55af8f (diff)
parent    a78cae2476812cecaa4a33d0086bbb53986906bc (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says:

====================
pull-request: bpf-next 2021-06-28

The following pull-request contains BPF updates for your *net-next* tree.

We've added 37 non-merge commits during the last 12 day(s) which contain
a total of 56 files changed, 394 insertions(+), 380 deletions(-).

The main changes are:

1) XDP driver RCU cleanups, from Toke Høiland-Jørgensen and Paul E. McKenney.
2) Fix bpf_skb_change_proto() IPv4/v6 GSO handling, from Maciej Żenczykowski.
3) Fix false positive kmemleak report for BPF ringbuf alloc, from Rustam Kovhaev.
4) Fix x86 JIT's extable offset calculation for PROBE_LDX NULL, from Ravi Bangoria.
5) Enable libbpf fallback probing with tracing under RHEL7, from Jonathan Edwards.
6) Clean up x86 JIT to remove unused cnt tracking from EMIT macro, from Jiri Olsa.
7) Netlink cleanups for libbpf to please Coverity, from Kumar Kartikeya Dwivedi.
8) Allow to retrieve ancestor cgroup id in tracing programs, from Namhyung Kim.
9) Fix lirc BPF program query to use user-provided prog_cnt, from Sean Young.
10) Add initial libbpf doc including generated kdoc for its API, from Grant Seltzer.
11) Make xdp_rxq_info_unreg_mem_model() more robust, from Jakub Kicinski.
12) Fix up bpfilter startup log-level to info level, from Gary Lin.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--  net/bpfilter/main.c |  2
-rw-r--r--  net/core/filter.c   | 72
-rw-r--r--  net/core/xdp.c      | 11
-rw-r--r--  net/sched/act_bpf.c |  2
-rw-r--r--  net/sched/cls_bpf.c |  3
-rw-r--r--  net/xdp/xsk.c       |  4
-rw-r--r--  net/xdp/xsk.h       |  4
-rw-r--r--  net/xdp/xskmap.c    | 29
8 files changed, 65 insertions(+), 62 deletions(-)
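
Before the per-file diffs, a quick illustration of the helper reworked by item 2) above: a minimal, hedged sketch of a tc BPF program calling bpf_skb_change_proto(). It is not part of this series; section name and program logic are illustrative only. After this series any non-zero flags value makes the helper return -EINVAL, so flags is simply 0.

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("tc")
int xlat_4_to_6(struct __sk_buff *skb)
{
	/* Grow the packet from an IPv4 to an IPv6 header. With this series
	 * the flags argument must be 0; the old BPF_F_ADJ_ROOM_FIXED_GSO
	 * handling inside the helper is gone (see net/core/filter.c below).
	 */
	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
		return TC_ACT_SHOT;

	/* The program is still responsible for writing the new IPv6 header. */
	return TC_ACT_OK;
}

char LICENSE[] SEC("license") = "GPL";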
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
index 05e1cfc1e5cd..291a92546246 100644
--- a/net/bpfilter/main.c
+++ b/net/bpfilter/main.c
@@ -57,7 +57,7 @@ int main(void)
{
debug_f = fopen("/dev/kmsg", "w");
setvbuf(debug_f, 0, _IOLBF, 0);
- fprintf(debug_f, "Started bpfilter\n");
+ fprintf(debug_f, "<5>Started bpfilter\n");
loop();
fclose(debug_f);
return 0;
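
For context on the one-character change above: writes to /dev/kmsg may carry a "<N>" prefix that selects the syslog level of that line; without it the kernel applies a default level. A hedged user-space sketch of the same pattern bpfilter uses, with a made-up message:

#include <stdio.h>

int main(void)
{
	FILE *kmsg = fopen("/dev/kmsg", "w");

	if (!kmsg)
		return 1;
	/* Line-buffered so each fprintf() becomes one kmsg record,
	 * mirroring the setvbuf() call in bpfilter's main().
	 */
	setvbuf(kmsg, NULL, _IOLBF, 0);
	fprintf(kmsg, "<5>example: started\n");	/* "<N>" sets the level */
	fclose(kmsg);
	return 0;
}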
diff --git a/net/core/filter.c b/net/core/filter.c
index 0b13d8157a8f..d22895caa164 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3235,15 +3235,12 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
return ret;
}
-static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags)
+static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
{
const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
u32 off = skb_mac_header_len(skb);
int ret;
- if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
- return -ENOTSUPP;
-
ret = skb_cow(skb, len_diff);
if (unlikely(ret < 0))
return ret;
@@ -3255,21 +3252,11 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags)
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
- /* SKB_GSO_TCPV4 needs to be changed into
- * SKB_GSO_TCPV6.
- */
+ /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
if (shinfo->gso_type & SKB_GSO_TCPV4) {
shinfo->gso_type &= ~SKB_GSO_TCPV4;
shinfo->gso_type |= SKB_GSO_TCPV6;
}
-
- /* Due to IPv6 header, MSS needs to be downgraded. */
- if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
- skb_decrease_gso_size(shinfo, len_diff);
-
- /* Header must be checked, and gso_segs recomputed. */
- shinfo->gso_type |= SKB_GSO_DODGY;
- shinfo->gso_segs = 0;
}
skb->protocol = htons(ETH_P_IPV6);
@@ -3278,15 +3265,12 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags)
return 0;
}
-static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags)
+static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
{
const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
u32 off = skb_mac_header_len(skb);
int ret;
- if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
- return -ENOTSUPP;
-
ret = skb_unclone(skb, GFP_ATOMIC);
if (unlikely(ret < 0))
return ret;
@@ -3298,21 +3282,11 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags)
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
- /* SKB_GSO_TCPV6 needs to be changed into
- * SKB_GSO_TCPV4.
- */
+ /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
if (shinfo->gso_type & SKB_GSO_TCPV6) {
shinfo->gso_type &= ~SKB_GSO_TCPV6;
shinfo->gso_type |= SKB_GSO_TCPV4;
}
-
- /* Due to IPv4 header, MSS can be upgraded. */
- if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
- skb_increase_gso_size(shinfo, len_diff);
-
- /* Header must be checked, and gso_segs recomputed. */
- shinfo->gso_type |= SKB_GSO_DODGY;
- shinfo->gso_segs = 0;
}
skb->protocol = htons(ETH_P_IP);
@@ -3321,17 +3295,17 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags)
return 0;
}
-static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto, u64 flags)
+static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
{
__be16 from_proto = skb->protocol;
if (from_proto == htons(ETH_P_IP) &&
to_proto == htons(ETH_P_IPV6))
- return bpf_skb_proto_4_to_6(skb, flags);
+ return bpf_skb_proto_4_to_6(skb);
if (from_proto == htons(ETH_P_IPV6) &&
to_proto == htons(ETH_P_IP))
- return bpf_skb_proto_6_to_4(skb, flags);
+ return bpf_skb_proto_6_to_4(skb);
return -ENOTSUPP;
}
@@ -3341,7 +3315,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
{
int ret;
- if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO)))
+ if (unlikely(flags))
return -EINVAL;
/* General idea is that this helper does the basic groundwork
@@ -3361,7 +3335,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
* that. For offloads, we mark packet as dodgy, so that headers
* need to be verified first.
*/
- ret = bpf_skb_proto_xlat(skb, proto, flags);
+ ret = bpf_skb_proto_xlat(skb, proto);
bpf_compute_data_pointers(skb);
return ret;
}
@@ -3923,6 +3897,34 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
.arg2_type = ARG_ANYTHING,
};
+/* XDP_REDIRECT works by a three-step process, implemented in the functions
+ * below:
+ *
+ * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
+ * of the redirect and store it (along with some other metadata) in a per-CPU
+ * struct bpf_redirect_info.
+ *
+ * 2. When the program returns the XDP_REDIRECT return code, the driver will
+ * call xdp_do_redirect() which will use the information in struct
+ * bpf_redirect_info to actually enqueue the frame into a map type-specific
+ * bulk queue structure.
+ *
+ * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(),
+ * which will flush all the different bulk queues, thus completing the
+ * redirect.
+ *
+ * Pointers to the map entries will be kept around for this whole sequence of
+ * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
+ * the core code; instead, the RCU protection relies on everything happening
+ * inside a single NAPI poll sequence, which means it's between a pair of calls
+ * to local_bh_disable()/local_bh_enable().
+ *
+ * The map entries are marked as __rcu and the map code makes sure to
+ * dereference those pointers with rcu_dereference_check() in a way that works
+ * for both sections that hold an rcu_read_lock() and sections that are
+ * called from NAPI without a separate rcu_read_lock(). The code below does not
+ * use RCU annotations, but relies on those in the map code.
+ */
void xdp_do_flush(void)
{
__dev_flush();
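
The comment added above lays out XDP_REDIRECT as a three-step sequence. A condensed, hedged sketch of what that looks like from a driver's NAPI poll function; all mydrv_* names and types are hypothetical, and only bpf_prog_run_xdp(), xdp_do_redirect() and xdp_do_flush() are real kernel interfaces.

static int mydrv_napi_poll(struct napi_struct *napi, int budget)
{
	struct mydrv_rx_ring *ring = container_of(napi, struct mydrv_rx_ring, napi);
	struct bpf_prog *prog = READ_ONCE(ring->xdp_prog);
	int done = 0;

	while (done < budget && mydrv_rx_pending(ring)) {
		struct xdp_buff xdp;

		mydrv_fill_xdp_buff(ring, &xdp);

		/* Step 1: if the program calls bpf_redirect()/bpf_redirect_map(),
		 * the target is stashed in the per-CPU bpf_redirect_info.
		 */
		switch (bpf_prog_run_xdp(prog, &xdp)) {
		case XDP_REDIRECT:
			/* Step 2: enqueue the frame into the map-specific bulk queue. */
			if (xdp_do_redirect(ring->netdev, &xdp, prog) < 0)
				mydrv_recycle(ring, &xdp);
			break;
		default:
			mydrv_recycle(ring, &xdp);
			break;
		}
		done++;
	}

	/* Step 3: flush the bulk queues before leaving NAPI, i.e. while still
	 * inside the local_bh_disable() section that provides the RCU
	 * protection described in the comment above.
	 */
	xdp_do_flush();
	return done;
}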
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 725d20f1b100..cc92ccb38432 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -113,8 +113,13 @@ static void mem_allocator_disconnect(void *allocator)
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
+ int type = xdp_rxq->mem.type;
int id = xdp_rxq->mem.id;
+ /* Reset mem info to defaults */
+ xdp_rxq->mem.id = 0;
+ xdp_rxq->mem.type = 0;
+
if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
WARN(1, "Missing register, driver bug");
return;
@@ -123,7 +128,7 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
if (id == 0)
return;
- if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
+ if (type == MEM_TYPE_PAGE_POOL) {
rcu_read_lock();
xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
page_pool_destroy(xa->page_pool);
@@ -144,10 +149,6 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
xdp_rxq->dev = NULL;
-
- /* Reset mem info to defaults */
- xdp_rxq->mem.id = 0;
- xdp_rxq->mem.type = 0;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
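
The xdp.c hunks above move the mem.type/mem.id reset into xdp_rxq_info_unreg_mem_model(), so a driver that only swaps memory models on a still-registered rxq is left with clean defaults. A hedged sketch of that pattern; the surrounding driver function is hypothetical and error handling is trimmed.

static int mydrv_switch_to_page_pool(struct xdp_rxq_info *rxq,
				     struct page_pool *pool)
{
	/* Drop the old memory model; with this change the rxq's mem info is
	 * reset to defaults here rather than only in xdp_rxq_info_unreg().
	 */
	xdp_rxq_info_unreg_mem_model(rxq);

	/* Register the new model against the same, still-registered rxq. */
	return xdp_rxq_info_reg_mem_model(rxq, MEM_TYPE_PAGE_POOL, pool);
}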
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index e48e980c3b93..e409a0005717 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -43,7 +43,6 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
tcf_lastuse_update(&prog->tcf_tm);
bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb);
- rcu_read_lock();
filter = rcu_dereference(prog->filter);
if (at_ingress) {
__skb_push(skb, skb->mac_len);
@@ -56,7 +55,6 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
}
if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK)
skb_orphan(skb);
- rcu_read_unlock();
/* A BPF program may overwrite the default action opcode.
* Similarly as in cls_bpf, if filter_res == -1 we use the
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 6e3e63db0e01..fa739efa59f4 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -85,8 +85,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct cls_bpf_prog *prog;
int ret = -1;
- /* Needed here for accessing maps. */
- rcu_read_lock();
list_for_each_entry_rcu(prog, &head->plist, link) {
int filter_res;
@@ -131,7 +129,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
break;
}
- rcu_read_unlock();
return ret;
}
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cd62d4ba87a9..996da915f520 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -749,7 +749,7 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
}
static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
- struct xdp_sock ***map_entry)
+ struct xdp_sock __rcu ***map_entry)
{
struct xsk_map *map = NULL;
struct xsk_map_node *node;
@@ -785,7 +785,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)
* might be updates to the map between
* xsk_get_map_list_entry() and xsk_map_try_sock_delete().
*/
- struct xdp_sock **map_entry = NULL;
+ struct xdp_sock __rcu **map_entry = NULL;
struct xsk_map *map;
while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index edcf249ad1f1..a4bc4749faac 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -31,7 +31,7 @@ struct xdp_mmap_offsets_v1 {
struct xsk_map_node {
struct list_head node;
struct xsk_map *map;
- struct xdp_sock **map_entry;
+ struct xdp_sock __rcu **map_entry;
};
static inline struct xdp_sock *xdp_sk(struct sock *sk)
@@ -40,7 +40,7 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk)
}
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
- struct xdp_sock **map_entry);
+ struct xdp_sock __rcu **map_entry);
void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id);
int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
u16 queue_id);
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 9df75ea4a567..2e48d0e094d9 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -12,7 +12,7 @@
#include "xsk.h"
static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
- struct xdp_sock **map_entry)
+ struct xdp_sock __rcu **map_entry)
{
struct xsk_map_node *node;
@@ -42,7 +42,7 @@ static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
}
static void xsk_map_sock_delete(struct xdp_sock *xs,
- struct xdp_sock **map_entry)
+ struct xdp_sock __rcu **map_entry)
{
struct xsk_map_node *n, *tmp;
@@ -124,6 +124,10 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
return insn - insn_buf;
}
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
@@ -131,12 +135,11 @@ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
if (key >= map->max_entries)
return NULL;
- return READ_ONCE(m->xsk_map[key]);
+ return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held());
}
static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
{
- WARN_ON_ONCE(!rcu_read_lock_held());
return __xsk_map_lookup_elem(map, *(u32 *)key);
}
@@ -149,7 +152,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct xdp_sock *xs, *old_xs, **map_entry;
+ struct xdp_sock __rcu **map_entry;
+ struct xdp_sock *xs, *old_xs;
u32 i = *(u32 *)key, fd = *(u32 *)value;
struct xsk_map_node *node;
struct socket *sock;
@@ -179,7 +183,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
}
spin_lock_bh(&m->lock);
- old_xs = READ_ONCE(*map_entry);
+ old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock));
if (old_xs == xs) {
err = 0;
goto out;
@@ -191,7 +195,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
goto out;
}
xsk_map_sock_add(xs, node);
- WRITE_ONCE(*map_entry, xs);
+ rcu_assign_pointer(*map_entry, xs);
if (old_xs)
xsk_map_sock_delete(old_xs, map_entry);
spin_unlock_bh(&m->lock);
@@ -208,7 +212,8 @@ out:
static int xsk_map_delete_elem(struct bpf_map *map, void *key)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct xdp_sock *old_xs, **map_entry;
+ struct xdp_sock __rcu **map_entry;
+ struct xdp_sock *old_xs;
int k = *(u32 *)key;
if (k >= map->max_entries)
@@ -216,7 +221,7 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
spin_lock_bh(&m->lock);
map_entry = &m->xsk_map[k];
- old_xs = xchg(map_entry, NULL);
+ old_xs = unrcu_pointer(xchg(map_entry, NULL));
if (old_xs)
xsk_map_sock_delete(old_xs, map_entry);
spin_unlock_bh(&m->lock);
@@ -231,11 +236,11 @@ static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
}
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
- struct xdp_sock **map_entry)
+ struct xdp_sock __rcu **map_entry)
{
spin_lock_bh(&map->lock);
- if (READ_ONCE(*map_entry) == xs) {
- WRITE_ONCE(*map_entry, NULL);
+ if (rcu_access_pointer(*map_entry) == xs) {
+ rcu_assign_pointer(*map_entry, NULL);
xsk_map_sock_delete(xs, map_entry);
}
spin_unlock_bh(&map->lock);
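
For completeness, the update/delete paths annotated above are what run when user space manages an XSKMAP. A hedged libbpf-side sketch; it assumes xsk_map_fd is an existing BPF_MAP_TYPE_XSKMAP fd and xsk_fd a bound AF_XDP socket fd.

#include <bpf/bpf.h>

/* Kernel side: xsk_map_update_elem() takes m->lock and publishes the socket
 * with rcu_assign_pointer(), as in the diff above.
 */
static int install_xsk(int xsk_map_fd, __u32 queue_id, int xsk_fd)
{
	return bpf_map_update_elem(xsk_map_fd, &queue_id, &xsk_fd, BPF_ANY);
}

/* Kernel side: xsk_map_delete_elem() xchg()s the slot to NULL under m->lock. */
static int remove_xsk(int xsk_map_fd, __u32 queue_id)
{
	return bpf_map_delete_elem(xsk_map_fd, &queue_id);
}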