From 052c82dcdcbb6eb89d0967c309c010cd293076d0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 28 Nov 2022 23:08:54 -0800 Subject: selftests/bpf: Use if_nametoindex instead of reading /sys/class/net/*/ifindex When switching netns, setns_by_fd() has to do a mount/umount dance with the /sys directories. One reason is that the tc_redirect.c test depends on /sys/class/net/*/ifindex instead of using if_nametoindex(). if_nametoindex() uses ioctl() to get the ifindex. This patch moves all /sys/class/net/*/ifindex usages to if_nametoindex(). The current code checks ifindex >= 0, which is incorrect; ifindex > 0 should be checked instead. This patch also stores ifindex_veth_src and ifindex_veth_dst since a later patch will need them. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20221129070900.3142427-2-martin.lau@linux.dev --- .../testing/selftests/bpf/prog_tests/tc_redirect.c | 46 +++++++++------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index cb6a53b3e023..2d85742efdd3 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -11,12 +11,12 @@ */ #include -#include #include #include #include #include #include +#include #include #include #include @@ -115,7 +115,9 @@ static void netns_setup_namespaces_nofail(const char *verb) } struct netns_setup_result { + int ifindex_veth_src; int ifindex_veth_src_fwd; + int ifindex_veth_dst; int ifindex_veth_dst_fwd; }; @@ -139,27 +141,6 @@ static int get_ifaddr(const char *name, char *ifaddr) return 0; } -static int get_ifindex(const char *name) -{ - char path[PATH_MAX]; - char buf[32]; - FILE *f; - int ret; - - snprintf(path, PATH_MAX, "/sys/class/net/%s/ifindex", name); - f = fopen(path, "r"); - if (!ASSERT_OK_PTR(f, path)) - return -1; - - ret = fread(buf, 1, sizeof(buf), f); - if (!ASSERT_GT(ret, 0, "fread ifindex")) { - fclose(f); - return -1; - } - fclose(f); - return atoi(buf); -} - #define SYS(fmt, ...) 
\ ({ \ char cmd[1024]; \ @@ -182,11 +163,20 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result) if (get_ifaddr("veth_src_fwd", veth_src_fwd_addr)) goto fail; - result->ifindex_veth_src_fwd = get_ifindex("veth_src_fwd"); - if (result->ifindex_veth_src_fwd < 0) + result->ifindex_veth_src = if_nametoindex("veth_src"); + if (!ASSERT_GT(result->ifindex_veth_src, 0, "ifindex_veth_src")) goto fail; - result->ifindex_veth_dst_fwd = get_ifindex("veth_dst_fwd"); - if (result->ifindex_veth_dst_fwd < 0) + + result->ifindex_veth_src_fwd = if_nametoindex("veth_src_fwd"); + if (!ASSERT_GT(result->ifindex_veth_src_fwd, 0, "ifindex_veth_src_fwd")) + goto fail; + + result->ifindex_veth_dst = if_nametoindex("veth_dst"); + if (!ASSERT_GT(result->ifindex_veth_dst, 0, "ifindex_veth_dst")) + goto fail; + + result->ifindex_veth_dst_fwd = if_nametoindex("veth_dst_fwd"); + if (!ASSERT_GT(result->ifindex_veth_dst_fwd, 0, "ifindex_veth_dst_fwd")) goto fail; SYS("ip link set veth_src netns " NS_SRC); @@ -1034,8 +1024,8 @@ static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) if (!ASSERT_OK_PTR(skel, "test_tc_peer__open")) goto fail; - ifindex = get_ifindex("tun_fwd"); - if (!ASSERT_GE(ifindex, 0, "get_ifindex tun_fwd")) + ifindex = if_nametoindex("tun_fwd"); + if (!ASSERT_GT(ifindex, 0, "if_indextoname tun_fwd")) goto fail; skel->rodata->IFINDEX_SRC = ifindex; -- cgit v1.2.3 From 57d0863f1d2812da543ef49172f60d6aa14bedcf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 28 Nov 2022 23:08:55 -0800 Subject: selftests/bpf: Avoid pinning bpf prog in the tc_redirect_dtime test This patch removes the need to pin prog in the tc_redirect_dtime test by directly using the bpf_tc_hook_create() and bpf_tc_attach(). The clsact qdisc will go away together with the test netns, so no need to do bpf_tc_hook_destroy(). Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20221129070900.3142427-3-martin.lau@linux.dev --- .../testing/selftests/bpf/prog_tests/tc_redirect.c | 149 ++++++++++++++------- 1 file changed, 100 insertions(+), 49 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 2d85742efdd3..690102f1ceda 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -250,6 +250,56 @@ fail: return -1; } +static int qdisc_clsact_create(struct bpf_tc_hook *qdisc_hook, int ifindex) +{ + char err_str[128], ifname[16]; + int err; + + qdisc_hook->ifindex = ifindex; + qdisc_hook->attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS; + err = bpf_tc_hook_create(qdisc_hook); + snprintf(err_str, sizeof(err_str), + "qdisc add dev %s clsact", + if_indextoname(qdisc_hook->ifindex, ifname) ? : ""); + err_str[sizeof(err_str) - 1] = 0; + ASSERT_OK(err, err_str); + + return err; +} + +static int xgress_filter_add(struct bpf_tc_hook *qdisc_hook, + enum bpf_tc_attach_point xgress, + const struct bpf_program *prog, int priority) +{ + LIBBPF_OPTS(bpf_tc_opts, tc_attach); + char err_str[128], ifname[16]; + int err; + + qdisc_hook->attach_point = xgress; + tc_attach.prog_fd = bpf_program__fd(prog); + tc_attach.priority = priority; + err = bpf_tc_attach(qdisc_hook, &tc_attach); + snprintf(err_str, sizeof(err_str), + "filter add dev %s %s prio %d bpf da %s", + if_indextoname(qdisc_hook->ifindex, ifname) ? : "", + xgress == BPF_TC_INGRESS ? 
"ingress" : "egress", + priority, bpf_program__name(prog)); + err_str[sizeof(err_str) - 1] = 0; + ASSERT_OK(err, err_str); + + return err; +} + +#define QDISC_CLSACT_CREATE(qdisc_hook, ifindex) ({ \ + if ((err = qdisc_clsact_create(qdisc_hook, ifindex))) \ + goto fail; \ +}) + +#define XGRESS_FILTER_ADD(qdisc_hook, xgress, prog, priority) ({ \ + if ((err = xgress_filter_add(qdisc_hook, xgress, prog, priority))) \ + goto fail; \ +}) + static int netns_load_bpf(void) { SYS("tc qdisc add dev veth_src_fwd clsact"); @@ -489,78 +539,79 @@ done: close(client_fd); } -static int netns_load_dtime_bpf(struct test_tc_dtime *skel) +static int netns_load_dtime_bpf(struct test_tc_dtime *skel, + const struct netns_setup_result *setup_result) { + LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src); + LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst); struct nstoken *nstoken; - -#define PIN_FNAME(__file) "/sys/fs/bpf/" #__file -#define PIN(__prog) ({ \ - int err = bpf_program__pin(skel->progs.__prog, PIN_FNAME(__prog)); \ - if (!ASSERT_OK(err, "pin " #__prog)) \ - goto fail; \ - }) + int err; /* setup ns_src tc progs */ nstoken = open_netns(NS_SRC); if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC)) return -1; - PIN(egress_host); - PIN(ingress_host); - SYS("tc qdisc add dev veth_src clsact"); - SYS("tc filter add dev veth_src ingress bpf da object-pinned " - PIN_FNAME(ingress_host)); - SYS("tc filter add dev veth_src egress bpf da object-pinned " - PIN_FNAME(egress_host)); + /* tc qdisc add dev veth_src clsact */ + QDISC_CLSACT_CREATE(&qdisc_veth_src, setup_result->ifindex_veth_src); + /* tc filter add dev veth_src ingress bpf da ingress_host */ + XGRESS_FILTER_ADD(&qdisc_veth_src, BPF_TC_INGRESS, skel->progs.ingress_host, 0); + /* tc filter add dev veth_src egress bpf da egress_host */ + XGRESS_FILTER_ADD(&qdisc_veth_src, BPF_TC_EGRESS, skel->progs.egress_host, 0); close_netns(nstoken); /* setup ns_dst tc progs */ nstoken = open_netns(NS_DST); if (!ASSERT_OK_PTR(nstoken, "setns " NS_DST)) return -1; - PIN(egress_host); - PIN(ingress_host); - SYS("tc qdisc add dev veth_dst clsact"); - SYS("tc filter add dev veth_dst ingress bpf da object-pinned " - PIN_FNAME(ingress_host)); - SYS("tc filter add dev veth_dst egress bpf da object-pinned " - PIN_FNAME(egress_host)); + /* tc qdisc add dev veth_dst clsact */ + QDISC_CLSACT_CREATE(&qdisc_veth_dst, setup_result->ifindex_veth_dst); + /* tc filter add dev veth_dst ingress bpf da ingress_host */ + XGRESS_FILTER_ADD(&qdisc_veth_dst, BPF_TC_INGRESS, skel->progs.ingress_host, 0); + /* tc filter add dev veth_dst egress bpf da egress_host */ + XGRESS_FILTER_ADD(&qdisc_veth_dst, BPF_TC_EGRESS, skel->progs.egress_host, 0); close_netns(nstoken); /* setup ns_fwd tc progs */ nstoken = open_netns(NS_FWD); if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD)) return -1; - PIN(ingress_fwdns_prio100); - PIN(egress_fwdns_prio100); - PIN(ingress_fwdns_prio101); - PIN(egress_fwdns_prio101); - SYS("tc qdisc add dev veth_dst_fwd clsact"); - SYS("tc filter add dev veth_dst_fwd ingress prio 100 bpf da object-pinned " - PIN_FNAME(ingress_fwdns_prio100)); - SYS("tc filter add dev veth_dst_fwd ingress prio 101 bpf da object-pinned " - PIN_FNAME(ingress_fwdns_prio101)); - SYS("tc filter add dev veth_dst_fwd egress prio 100 bpf da object-pinned " - PIN_FNAME(egress_fwdns_prio100)); - SYS("tc filter add dev veth_dst_fwd egress prio 101 bpf da object-pinned " - PIN_FNAME(egress_fwdns_prio101)); - SYS("tc qdisc add dev veth_src_fwd 
clsact"); - SYS("tc filter add dev veth_src_fwd ingress prio 100 bpf da object-pinned " - PIN_FNAME(ingress_fwdns_prio100)); - SYS("tc filter add dev veth_src_fwd ingress prio 101 bpf da object-pinned " - PIN_FNAME(ingress_fwdns_prio101)); - SYS("tc filter add dev veth_src_fwd egress prio 100 bpf da object-pinned " - PIN_FNAME(egress_fwdns_prio100)); - SYS("tc filter add dev veth_src_fwd egress prio 101 bpf da object-pinned " - PIN_FNAME(egress_fwdns_prio101)); + /* tc qdisc add dev veth_dst_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd); + /* tc filter add dev veth_dst_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ + XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, + skel->progs.ingress_fwdns_prio100, 100); + /* tc filter add dev veth_dst_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ + XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, + skel->progs.ingress_fwdns_prio101, 101); + /* tc filter add dev veth_dst_fwd egress prio 100 bpf da egress_fwdns_prio100 */ + XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, + skel->progs.egress_fwdns_prio100, 100); + /* tc filter add dev veth_dst_fwd egress prio 101 bpf da egress_fwdns_prio101 */ + XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, + skel->progs.egress_fwdns_prio101, 101); + + /* tc qdisc add dev veth_src_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_veth_src_fwd, setup_result->ifindex_veth_src_fwd); + /* tc filter add dev veth_src_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ + XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS, + skel->progs.ingress_fwdns_prio100, 100); + /* tc filter add dev veth_src_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ + XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS, + skel->progs.ingress_fwdns_prio101, 101); + /* tc filter add dev veth_src_fwd egress prio 100 bpf da egress_fwdns_prio100 */ + XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS, + skel->progs.egress_fwdns_prio100, 100); + /* tc filter add dev veth_src_fwd egress prio 101 bpf da egress_fwdns_prio101 */ + XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS, + skel->progs.egress_fwdns_prio101, 101); close_netns(nstoken); - -#undef PIN - return 0; fail: close_netns(nstoken); - return -1; + return err; } enum { @@ -736,7 +787,7 @@ static void test_tc_redirect_dtime(struct netns_setup_result *setup_result) if (!ASSERT_OK(err, "test_tc_dtime__load")) goto done; - if (netns_load_dtime_bpf(skel)) + if (netns_load_dtime_bpf(skel, setup_result)) goto done; nstoken = open_netns(NS_FWD); -- cgit v1.2.3 From f1b73577bb3c5b23872fde2907386635ea726d6d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 28 Nov 2022 23:08:56 -0800 Subject: selftests/bpf: Avoid pinning bpf prog in the tc_redirect_peer_l3 test This patch removes the need to pin prog in the tc_redirect_peer_l3 test by directly using the bpf_tc_hook_create() and bpf_tc_attach(). The clsact qdisc will go away together with the test netns, so no need to do bpf_tc_hook_destroy(). 
Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20221129070900.3142427-4-martin.lau@linux.dev --- .../testing/selftests/bpf/prog_tests/tc_redirect.c | 32 ++++++++-------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 690102f1ceda..cefde6491749 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -1032,6 +1032,8 @@ static int tun_relay_loop(int src_fd, int target_fd) static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) { + LIBBPF_OPTS(bpf_tc_hook, qdisc_tun_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd); struct test_tc_peer *skel = NULL; struct nstoken *nstoken = NULL; int err; @@ -1086,31 +1088,21 @@ static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) if (!ASSERT_OK(err, "test_tc_peer__load")) goto fail; - err = bpf_program__pin(skel->progs.tc_src_l3, SRC_PROG_PIN_FILE); - if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE)) - goto fail; - - err = bpf_program__pin(skel->progs.tc_dst_l3, DST_PROG_PIN_FILE); - if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE)) - goto fail; - - err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); - if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE)) - goto fail; - /* Load "tc_src_l3" to the tun_fwd interface to redirect packets * towards dst, and "tc_dst" to redirect packets * and "tc_chk" on veth_dst_fwd to drop non-redirected packets. */ - SYS("tc qdisc add dev tun_fwd clsact"); - SYS("tc filter add dev tun_fwd ingress bpf da object-pinned " - SRC_PROG_PIN_FILE); + /* tc qdisc add dev tun_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_tun_fwd, ifindex); + /* tc filter add dev tun_fwd ingress bpf da tc_src_l3 */ + XGRESS_FILTER_ADD(&qdisc_tun_fwd, BPF_TC_INGRESS, skel->progs.tc_src_l3, 0); - SYS("tc qdisc add dev veth_dst_fwd clsact"); - SYS("tc filter add dev veth_dst_fwd ingress bpf da object-pinned " - DST_PROG_PIN_FILE); - SYS("tc filter add dev veth_dst_fwd egress bpf da object-pinned " - CHK_PROG_PIN_FILE); + /* tc qdisc add dev veth_dst_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd); + /* tc filter add dev veth_dst_fwd ingress bpf da tc_dst_l3 */ + XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, skel->progs.tc_dst_l3, 0); + /* tc filter add dev veth_dst_fwd egress bpf da tc_chk */ + XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, skel->progs.tc_chk, 0); /* Setup route and neigh tables */ SYS("ip -netns " NS_SRC " addr add dev tun_src " IP4_TUN_SRC "/24"); -- cgit v1.2.3 From 5dc42a7fc2866a56bc5616babace0a252458fe01 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 28 Nov 2022 23:08:57 -0800 Subject: selftests/bpf: Avoid pinning bpf prog in the netns_load_bpf() callers This patch removes the need to pin prog in the remaining tests in tc_redirect.c by directly using the bpf_tc_hook_create() and bpf_tc_attach(). The clsact qdisc will go away together with the test netns, so no need to do bpf_tc_hook_destroy(). 
Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20221129070900.3142427-5-martin.lau@linux.dev --- .../testing/selftests/bpf/prog_tests/tc_redirect.c | 83 +++++++--------------- 1 file changed, 27 insertions(+), 56 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index cefde6491749..6f800381f924 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -59,10 +59,6 @@ #define IFADDR_STR_LEN 18 #define PING_ARGS "-i 0.2 -c 3 -w 10 -q" -#define SRC_PROG_PIN_FILE "/sys/fs/bpf/test_tc_src" -#define DST_PROG_PIN_FILE "/sys/fs/bpf/test_tc_dst" -#define CHK_PROG_PIN_FILE "/sys/fs/bpf/test_tc_chk" - #define TIMEOUT_MILLIS 10000 #define NSEC_PER_SEC 1000000000ULL @@ -300,19 +296,28 @@ static int xgress_filter_add(struct bpf_tc_hook *qdisc_hook, goto fail; \ }) -static int netns_load_bpf(void) +static int netns_load_bpf(const struct bpf_program *src_prog, + const struct bpf_program *dst_prog, + const struct bpf_program *chk_prog, + const struct netns_setup_result *setup_result) { - SYS("tc qdisc add dev veth_src_fwd clsact"); - SYS("tc filter add dev veth_src_fwd ingress bpf da object-pinned " - SRC_PROG_PIN_FILE); - SYS("tc filter add dev veth_src_fwd egress bpf da object-pinned " - CHK_PROG_PIN_FILE); - - SYS("tc qdisc add dev veth_dst_fwd clsact"); - SYS("tc filter add dev veth_dst_fwd ingress bpf da object-pinned " - DST_PROG_PIN_FILE); - SYS("tc filter add dev veth_dst_fwd egress bpf da object-pinned " - CHK_PROG_PIN_FILE); + LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd); + int err; + + /* tc qdisc add dev veth_src_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_veth_src_fwd, setup_result->ifindex_veth_src_fwd); + /* tc filter add dev veth_src_fwd ingress bpf da src_prog */ + XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS, src_prog, 0); + /* tc filter add dev veth_src_fwd egress bpf da chk_prog */ + XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS, chk_prog, 0); + + /* tc qdisc add dev veth_dst_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd); + /* tc filter add dev veth_dst_fwd ingress bpf da dst_prog */ + XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, dst_prog, 0); + /* tc filter add dev veth_dst_fwd egress bpf da chk_prog */ + XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, chk_prog, 0); return 0; fail: @@ -829,7 +834,6 @@ static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result) { struct nstoken *nstoken = NULL; struct test_tc_neigh_fib *skel = NULL; - int err; nstoken = open_netns(NS_FWD); if (!ASSERT_OK_PTR(nstoken, "setns fwd")) @@ -842,19 +846,8 @@ static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result) if (!ASSERT_OK(test_tc_neigh_fib__load(skel), "test_tc_neigh_fib__load")) goto done; - err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE); - if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE)) - goto done; - - err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); - if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE)) - goto done; - - err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE); - if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE)) - goto done; - - if (netns_load_bpf()) + if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst, + skel->progs.tc_chk, setup_result)) goto done; /* 
bpf_fib_lookup() checks if forwarding is enabled */ @@ -890,19 +883,8 @@ static void test_tc_redirect_neigh(struct netns_setup_result *setup_result) if (!ASSERT_OK(err, "test_tc_neigh__load")) goto done; - err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE); - if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE)) - goto done; - - err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); - if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE)) - goto done; - - err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE); - if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE)) - goto done; - - if (netns_load_bpf()) + if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst, + skel->progs.tc_chk, setup_result)) goto done; if (!ASSERT_OK(set_forwarding(false), "disable forwarding")) @@ -937,19 +919,8 @@ static void test_tc_redirect_peer(struct netns_setup_result *setup_result) if (!ASSERT_OK(err, "test_tc_peer__load")) goto done; - err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE); - if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE)) - goto done; - - err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); - if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE)) - goto done; - - err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE); - if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE)) - goto done; - - if (netns_load_bpf()) + if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst, + skel->progs.tc_chk, setup_result)) goto done; if (!ASSERT_OK(set_forwarding(false), "disable forwarding")) -- cgit v1.2.3 From 3084097c369c57c9bbd81e5bf47b36b6b27390fc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 28 Nov 2022 23:08:58 -0800 Subject: selftests/bpf: Remove the "/sys" mount and umount dance in {open,close}_netns The previous patches have removed the need to do the mount and umount dance when switching netns. In particular: * Avoid remounting /sys/fs/bpf to have a clean start * Avoid remounting /sys to get a ifindex of a particular netns This patch can finally remove the mount and umount dance in {open,close}_netns which is unnecessarily complicated and error-prone. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20221129070900.3142427-6-martin.lau@linux.dev --- tools/testing/selftests/bpf/network_helpers.c | 51 +++------------------------ 1 file changed, 5 insertions(+), 46 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 1f37adff7632..01de33191226 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -390,49 +390,6 @@ struct nstoken { int orig_netns_fd; }; -static int setns_by_fd(int nsfd) -{ - int err; - - err = setns(nsfd, CLONE_NEWNET); - close(nsfd); - - if (!ASSERT_OK(err, "setns")) - return err; - - /* Switch /sys to the new namespace so that e.g. /sys/class/net - * reflects the devices in the new namespace. - */ - err = unshare(CLONE_NEWNS); - if (!ASSERT_OK(err, "unshare")) - return err; - - /* Make our /sys mount private, so the following umount won't - * trigger the global umount in case it's shared. 
- */ - err = mount("none", "/sys", NULL, MS_PRIVATE, NULL); - if (!ASSERT_OK(err, "remount private /sys")) - return err; - - err = umount2("/sys", MNT_DETACH); - if (!ASSERT_OK(err, "umount2 /sys")) - return err; - - err = mount("sysfs", "/sys", "sysfs", 0, NULL); - if (!ASSERT_OK(err, "mount /sys")) - return err; - - err = mount("bpffs", "/sys/fs/bpf", "bpf", 0, NULL); - if (!ASSERT_OK(err, "mount /sys/fs/bpf")) - return err; - - err = mount("debugfs", "/sys/kernel/debug", "debugfs", 0, NULL); - if (!ASSERT_OK(err, "mount /sys/kernel/debug")) - return err; - - return 0; -} - struct nstoken *open_netns(const char *name) { int nsfd; @@ -453,8 +410,9 @@ struct nstoken *open_netns(const char *name) if (!ASSERT_GE(nsfd, 0, "open netns fd")) goto fail; - err = setns_by_fd(nsfd); - if (!ASSERT_OK(err, "setns_by_fd")) + err = setns(nsfd, CLONE_NEWNET); + close(nsfd); + if (!ASSERT_OK(err, "setns")) goto fail; return token; @@ -465,6 +423,7 @@ fail: void close_netns(struct nstoken *token) { - ASSERT_OK(setns_by_fd(token->orig_netns_fd), "setns_by_fd"); + ASSERT_OK(setns(token->orig_netns_fd, CLONE_NEWNET), "setns"); + close(token->orig_netns_fd); free(token); } -- cgit v1.2.3 From 9b6a7773973707bad13f6c8b2a27e93ef4d46182 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 28 Nov 2022 23:08:59 -0800 Subject: selftests/bpf: Remove serial from tests using {open,close}_netns After removing the mount/umount dance from {open,close}_netns() in the pervious patch, "serial_" can be removed from the tests using {open,close}_netns(). Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20221129070900.3142427-7-martin.lau@linux.dev --- tools/testing/selftests/bpf/prog_tests/empty_skb.c | 2 +- tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 2 +- tools/testing/selftests/bpf/prog_tests/test_tunnel.c | 2 +- tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c | 2 +- tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/empty_skb.c b/tools/testing/selftests/bpf/prog_tests/empty_skb.c index 0613f3bb8b5e..32dd731e9070 100644 --- a/tools/testing/selftests/bpf/prog_tests/empty_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/empty_skb.c @@ -9,7 +9,7 @@ goto out; \ }) -void serial_test_empty_skb(void) +void test_empty_skb(void) { LIBBPF_OPTS(bpf_test_run_opts, tattr); struct empty_skb *bpf_obj = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 6f800381f924..bca5e6839ac4 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -1138,7 +1138,7 @@ static void *test_tc_redirect_run_tests(void *arg) return NULL; } -void serial_test_tc_redirect(void) +void test_tc_redirect(void) { pthread_t test_thread; int err; diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index eea274110267..07ad457f3370 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -421,7 +421,7 @@ static void *test_tunnel_run_tests(void *arg) return NULL; } -void serial_test_tunnel(void) +void test_tunnel(void) { pthread_t test_thread; int err; diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c 
b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c index 9ac6f6a268db..a50971c6cf4a 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c @@ -85,7 +85,7 @@ static void test_max_pkt_size(int fd) } #define NUM_PKTS 10000 -void serial_test_xdp_do_redirect(void) +void test_xdp_do_redirect(void) { int err, xdp_prog_fd, tc_prog_fd, ifindex_src, ifindex_dst; char data[sizeof(pkt_udp) + sizeof(__u32)]; diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c index 13daa3746064..c72083885b6d 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c @@ -174,7 +174,7 @@ out: system("ip netns del synproxy"); } -void serial_test_xdp_synproxy(void) +void test_xdp_synproxy(void) { if (test__start_subtest("xdp")) test_synproxy(true); -- cgit v1.2.3 From 443f216448ab5ddd1b4d08ad6c9b69628ac25adf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 28 Nov 2022 23:09:00 -0800 Subject: selftests/bpf: Avoid pinning prog when attaching to tc ingress in btf_skc_cls_ingress This patch removes the need to pin prog when attaching to tc ingress in the btf_skc_cls_ingress test. Instead, directly use the bpf_tc_hook_create() and bpf_tc_attach(). The qdisc clsact will go away together with the netns, so no need to bpf_tc_hook_destroy(). Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20221129070900.3142427-8-martin.lau@linux.dev --- .../selftests/bpf/prog_tests/btf_skc_cls_ingress.c | 25 +++++++++------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c index 7a277035c275..ef4d6a3ae423 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -20,10 +21,12 @@ static struct test_btf_skc_cls_ingress *skel; static struct sockaddr_in6 srv_sa6; static __u32 duration; -#define PROG_PIN_FILE "/sys/fs/bpf/btf_skc_cls_ingress" - static int prepare_netns(void) { + LIBBPF_OPTS(bpf_tc_hook, qdisc_lo, .attach_point = BPF_TC_INGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_attach, + .prog_fd = bpf_program__fd(skel->progs.cls_ingress)); + if (CHECK(unshare(CLONE_NEWNET), "create netns", "unshare(CLONE_NEWNET): %s (%d)", strerror(errno), errno)) @@ -33,12 +36,12 @@ static int prepare_netns(void) "ip link set dev lo up", "failed\n")) return -1; - if (CHECK(system("tc qdisc add dev lo clsact"), - "tc qdisc add dev lo clsact", "failed\n")) + qdisc_lo.ifindex = if_nametoindex("lo"); + if (!ASSERT_OK(bpf_tc_hook_create(&qdisc_lo), "qdisc add dev lo clsact")) return -1; - if (CHECK(system("tc filter add dev lo ingress bpf direct-action object-pinned " PROG_PIN_FILE), - "install tc cls-prog at ingress", "failed\n")) + if (!ASSERT_OK(bpf_tc_attach(&qdisc_lo, &tc_attach), + "filter add dev lo ingress")) return -1; /* Ensure 20 bytes options (i.e. 
in total 40 bytes tcp header) for the @@ -195,19 +198,12 @@ static struct test tests[] = { void test_btf_skc_cls_ingress(void) { - int i, err; + int i; skel = test_btf_skc_cls_ingress__open_and_load(); if (CHECK(!skel, "test_btf_skc_cls_ingress__open_and_load", "failed\n")) return; - err = bpf_program__pin(skel->progs.cls_ingress, PROG_PIN_FILE); - if (CHECK(err, "bpf_program__pin", - "cannot pin bpf prog to %s. err:%d\n", PROG_PIN_FILE, err)) { - test_btf_skc_cls_ingress__destroy(skel); - return; - } - for (i = 0; i < ARRAY_SIZE(tests); i++) { if (!test__start_subtest(tests[i].desc)) continue; @@ -221,6 +217,5 @@ void test_btf_skc_cls_ingress(void) reset_test(); } - bpf_program__unpin(skel->progs.cls_ingress, PROG_PIN_FILE); test_btf_skc_cls_ingress__destroy(skel); } -- cgit v1.2.3 From b42693415b86f608049cf1b4870adc1dc65e58b0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 30 Nov 2022 12:00:12 -0800 Subject: libbpf: Avoid enum forward-declarations in public API in C++ mode C++ enum forward declarations are fundamentally not compatible with pure C enum definitions, and so libbpf's use of `enum bpf_stats_type;` forward declaration in libbpf/bpf.h public API header is causing C++ compilation issues. More details can be found in [0], but it comes down to C++ supporting enum forward declaration only with explicitly specified backing type: enum bpf_stats_type: int; In C (and I believe it's a GCC extension also), such forward declaration is simply: enum bpf_stats_type; Further, in Linux UAPI this enum is defined in pure C way: enum bpf_stats_type { BPF_STATS_RUN_TIME = 0; } And even though in both cases backing type is int, which can be confirmed by looking at DWARF information, for C++ compiler actual enum definition and forward declaration are incompatible. To eliminate this problem, for C++ mode define input argument as int, which makes enum unnecessary in libbpf public header. This solves the issue and as demonstrated by next patch doesn't cause any unwanted compiler warnings, at least with default warnings setting. 
[0] https://stackoverflow.com/questions/42766839/c11-enum-forward-causes-underlying-type-mismatch [1] Closes: https://github.com/libbpf/libbpf/issues/249 Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20221130200013.2997831-1-andrii@kernel.org --- tools/lib/bpf/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index a112e0ed1b19..7468978d3c27 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -409,8 +409,15 @@ LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, __u64 *probe_addr); +#ifdef __cplusplus +/* forward-declaring enums in C++ isn't compatible with pure C enums, so + * instead define bpf_enable_stats() as accepting int as an input + */ +LIBBPF_API int bpf_enable_stats(int type); +#else enum bpf_stats_type; /* defined in up-to-date linux/bpf.h */ LIBBPF_API int bpf_enable_stats(enum bpf_stats_type type); +#endif struct bpf_prog_bind_opts { size_t sz; /* size of this struct for forward/backward compatibility */ -- cgit v1.2.3 From f8186bf65ae6a4ce96d5cf52c2c9481c0e2193ce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 30 Nov 2022 12:00:13 -0800 Subject: selftests/bpf: Make sure enum-less bpf_enable_stats() API works in C++ mode Just a simple test to make sure we don't introduce unwanted compiler warnings and API still supports passing enums as input argument. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20221130200013.2997831-2-andrii@kernel.org --- tools/testing/selftests/bpf/test_cpp.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp index 19ad172036da..0bd9990e83fa 100644 --- a/tools/testing/selftests/bpf/test_cpp.cpp +++ b/tools/testing/selftests/bpf/test_cpp.cpp @@ -1,9 +1,9 @@ /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ #include -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include +#include +#include #include -#pragma GCC diagnostic pop #include #include #include "test_core_extern.skel.h" @@ -99,6 +99,7 @@ int main(int argc, char *argv[]) struct btf_dump_opts opts = { }; struct test_core_extern *skel; struct btf *btf; + int fd; try_skeleton_template(); @@ -117,6 +118,12 @@ int main(int argc, char *argv[]) skel = test_core_extern__open_and_load(); test_core_extern__destroy(skel); + fd = bpf_enable_stats(BPF_STATS_RUN_TIME); + if (fd < 0) + std::cout << "FAILED to enable stats: " << fd << std::endl; + else + ::close(fd); + std::cout << "DONE!" << std::endl; return 0; -- cgit v1.2.3 From 996c060e2bb90e5caef42849846b56da21ea88d9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Nov 2022 23:20:49 +0100 Subject: selftests/bpf: Add bench test to arm64 and s390x denylist BPF CI fails for arm64 and s390x each with the following result: [...] All error logs: serial_test_kprobe_multi_bench_attach:PASS:get_syms 0 nsec serial_test_kprobe_multi_bench_attach:PASS:kprobe_multi_empty__open_and_load 0 nsec libbpf: prog 'test_kprobe_empty': failed to attach: Operation not supported serial_test_kprobe_multi_bench_attach:FAIL:bpf_program__attach_kprobe_multi_opts unexpected error: -95 #92 kprobe_multi_bench_attach:FAIL [...] Add the test to the deny list. 
Fixes: 5b6c7e5c4434 ("selftests/bpf: Add attach bench test") Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 + tools/testing/selftests/bpf/DENYLIST.s390x | 1 + 2 files changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index 8e77515d56f6..99cc33c51eaa 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -28,6 +28,7 @@ kfree_skb # attach fentry unexpected erro kfunc_call/subprog # extern (var ksym) 'bpf_prog_active': not found in kernel BTF kfunc_call/subprog_lskel # skel unexpected error: -2 kfunc_dynptr_param/dynptr_data_null # libbpf: prog 'dynptr_data_null': failed to attach: ERROR: strerror_r(-524)=22 +kprobe_multi_bench_attach # bpf_program__attach_kprobe_multi_opts unexpected error: -95 kprobe_multi_test/attach_api_addrs # bpf_program__attach_kprobe_multi_opts unexpected error: -95 kprobe_multi_test/attach_api_pattern # bpf_program__attach_kprobe_multi_opts unexpected error: -95 kprobe_multi_test/attach_api_syms # bpf_program__attach_kprobe_multi_opts unexpected error: -95 diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 648a8a1b6b78..3481f3a5ea6f 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -29,6 +29,7 @@ htab_update # failed to attach: ERROR: strerror_r(- kfree_skb # attach fentry unexpected error: -524 (trampoline) kfunc_call # 'bpf_prog_active': not found in kernel BTF (?) kfunc_dynptr_param # JIT does not support calling kernel function (kfunc) +kprobe_multi_bench_attach # bpf_program__attach_kprobe_multi_opts unexpected error: -95 kprobe_multi_test # relies on fentry ksyms_module # test_ksyms_module__open_and_load unexpected error: -9 (?) ksyms_module_libbpf # JIT does not support calling kernel function (kfunc) -- cgit v1.2.3 From c67cae551f0df80421b5703ee56ff5e2fe9c4de6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 25 Nov 2022 14:06:17 -0800 Subject: bpf: Tighten ptr_to_btf_id checks. The networking programs typically don't require CAP_PERFMON, but through kfuncs like bpf_cast_to_kern_ctx() they can access memory through PTR_TO_BTF_ID. In such case enforce CAP_PERFMON. Also make sure that only GPL programs can access kernel data structures. All kfuncs require GPL already. Also remove allow_ptr_to_map_access. It's the same as allow_ptr_leaks and different name for the same check only causes confusion. 
Fixes: fd264ca02094 ("bpf: Add a kfunc to type cast from bpf uapi ctx to kernel ctx") Fixes: 50c6b8a9aea2 ("selftests/bpf: Add a test for btf_type_tag "percpu"") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221125220617.26846-1-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 5 ----- include/linux/bpf_verifier.h | 1 - kernel/bpf/verifier.c | 17 ++++++++++++++--- tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c | 1 + tools/testing/selftests/bpf/verifier/map_ptr.c | 8 ++++---- 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 67452103bb86..4920ac252754 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1909,11 +1909,6 @@ static inline bool bpf_allow_uninit_stack(void) return perfmon_capable(); } -static inline bool bpf_allow_ptr_to_map_access(void) -{ - return perfmon_capable(); -} - static inline bool bpf_bypass_spec_v1(void) { return perfmon_capable(); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c05aa6e1f6f5..b5090e89cb3f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -531,7 +531,6 @@ struct bpf_verifier_env { bool explore_alu_limits; bool allow_ptr_leaks; bool allow_uninit_stack; - bool allow_ptr_to_map_access; bool bpf_capable; bool bypass_spec_v1; bool bypass_spec_v4; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4e7f1d085e53..d0ecc0b18b20 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4703,6 +4703,18 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, u32 btf_id; int ret; + if (!env->allow_ptr_leaks) { + verbose(env, + "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + tname); + return -EPERM; + } + if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) { + verbose(env, + "Cannot access kernel 'struct %s' from non-GPL compatible program\n", + tname); + return -EINVAL; + } if (off < 0) { verbose(env, "R%d is ptr_%s invalid negative access: off=%d\n", @@ -4823,9 +4835,9 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id); tname = btf_name_by_offset(btf_vmlinux, t->name_off); - if (!env->allow_ptr_to_map_access) { + if (!env->allow_ptr_leaks) { verbose(env, - "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", tname); return -EPERM; } @@ -16679,7 +16691,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) env->allow_ptr_leaks = bpf_allow_ptr_leaks(); env->allow_uninit_stack = bpf_allow_uninit_stack(); - env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); diff --git a/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c b/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c index 8feddb8289cf..38f78d9345de 100644 --- a/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c +++ b/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c @@ -64,3 +64,4 @@ int BPF_PROG(test_percpu_helper, struct cgroup *cgrp, const char *path) return 0; } +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/verifier/map_ptr.c b/tools/testing/selftests/bpf/verifier/map_ptr.c index 1f82021429bf..17ee84dc7766 100644 --- 
a/tools/testing/selftests/bpf/verifier/map_ptr.c +++ b/tools/testing/selftests/bpf/verifier/map_ptr.c @@ -9,7 +9,7 @@ }, .fixup_map_array_48b = { 1 }, .result_unpriv = REJECT, - .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", + .errstr_unpriv = "access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", .result = REJECT, .errstr = "R1 is bpf_array invalid negative access: off=-8", }, @@ -26,7 +26,7 @@ }, .fixup_map_array_48b = { 3 }, .result_unpriv = REJECT, - .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", + .errstr_unpriv = "access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", .result = REJECT, .errstr = "only read from bpf_array is supported", }, @@ -41,7 +41,7 @@ }, .fixup_map_array_48b = { 1 }, .result_unpriv = REJECT, - .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", + .errstr_unpriv = "access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", .result = REJECT, .errstr = "cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4", .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -57,7 +57,7 @@ }, .fixup_map_array_48b = { 1 }, .result_unpriv = REJECT, - .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", + .errstr_unpriv = "access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", .result = ACCEPT, .retval = 1, }, -- cgit v1.2.3 From 7a9841ca025275b5b0edfb0b618934abb6ceec15 Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 29 Nov 2022 18:40:38 +0800 Subject: bpf, sockmap: Fix repeated calls to sock_put() when msg has more_data In tcp_bpf_send_verdict() redirection, the eval variable is assigned to __SK_REDIRECT after the apply_bytes data is sent, if msg has more_data, sock_put() will be called multiple times. We should reset the eval variable to __SK_NONE every time more_data starts. This causes: IPv4: Attempt to release TCP socket in state 1 00000000b4c925d7 ------------[ cut here ]------------ refcount_t: addition on 0; use-after-free. WARNING: CPU: 5 PID: 4482 at lib/refcount.c:25 refcount_warn_saturate+0x7d/0x110 Modules linked in: CPU: 5 PID: 4482 Comm: sockhash_bypass Kdump: loaded Not tainted 6.0.0 #1 Hardware name: Red Hat KVM, BIOS 1.11.0-2.el7 04/01/2014 Call Trace: __tcp_transmit_skb+0xa1b/0xb90 ? __alloc_skb+0x8c/0x1a0 ? __kmalloc_node_track_caller+0x184/0x320 tcp_write_xmit+0x22a/0x1110 __tcp_push_pending_frames+0x32/0xf0 do_tcp_sendpages+0x62d/0x640 tcp_bpf_push+0xae/0x2c0 tcp_bpf_sendmsg_redir+0x260/0x410 ? preempt_count_add+0x70/0xa0 tcp_bpf_send_verdict+0x386/0x4b0 tcp_bpf_sendmsg+0x21b/0x3b0 sock_sendmsg+0x58/0x70 __sys_sendto+0xfa/0x170 ? xfd_validate_state+0x1d/0x80 ? 
switch_fpu_return+0x59/0xe0 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x37/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: cd9733f5d75c ("tcp_bpf: Fix one concurrency problem in the tcp_bpf_send_verdict function") Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-2-git-send-email-yangpc@wangsu.com --- net/ipv4/tcp_bpf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index cf9c3e8f7ccb..f3e868f4cd9e 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -279,7 +279,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, bool cork = false, enospc = sk_msg_full(msg); struct sock *sk_redir; u32 tosend, origsize, sent, delta = 0; - u32 eval = __SK_NONE; + u32 eval; int ret; more_data: @@ -310,6 +310,7 @@ more_data: tosend = msg->sg.size; if (psock->apply_bytes && psock->apply_bytes < tosend) tosend = psock->apply_bytes; + eval = __SK_NONE; switch (psock->eval) { case __SK_PASS: -- cgit v1.2.3 From a351d6087bf7d3d8440d58d3bf244ec64b89394a Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 29 Nov 2022 18:40:39 +0800 Subject: bpf, sockmap: Fix missing BPF_F_INGRESS flag when using apply_bytes When redirecting, we use sk_msg_to_ingress() to get the BPF_F_INGRESS flag from the msg->flags. If apply_bytes is used and it is larger than the current data being processed, sk_psock_msg_verdict() will not be called when sendmsg() is called again. At this time, the msg->flags is 0, and we lost the BPF_F_INGRESS flag. So we need to save the BPF_F_INGRESS flag in sk_psock and use it when redirection. Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-3-git-send-email-yangpc@wangsu.com --- include/linux/skmsg.h | 1 + include/net/tcp.h | 4 ++-- net/core/skmsg.c | 9 ++++++--- net/ipv4/tcp_bpf.c | 11 ++++++----- net/tls/tls_sw.c | 6 ++++-- 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 70d6cb94e580..84f787416a54 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -82,6 +82,7 @@ struct sk_psock { u32 apply_bytes; u32 cork_bytes; u32 eval; + bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6b814e788f00..b87e7381bddf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2319,8 +2319,8 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, - int flags); +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, + struct sk_msg *msg, u32 bytes, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e6b9ced3eda8..53d0251788aa 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -886,13 +886,16 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { - if (psock->sk_redir) + if (psock->sk_redir) { sock_put(psock->sk_redir); - 
psock->sk_redir = msg->sk_redir; - if (!psock->sk_redir) { + psock->sk_redir = NULL; + } + if (!msg->sk_redir) { ret = __SK_DROP; goto out; } + psock->redir_ingress = sk_msg_to_ingress(msg); + psock->sk_redir = msg->sk_redir; sock_hold(psock->sk_redir); } out: diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index f3e868f4cd9e..275c5ca9e04d 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -131,10 +131,9 @@ static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, return ret; } -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, - u32 bytes, int flags) +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, + struct sk_msg *msg, u32 bytes, int flags) { - bool ingress = sk_msg_to_ingress(msg); struct sk_psock *psock = sk_psock_get(sk); int ret; @@ -276,7 +275,7 @@ msg_bytes_ready: static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { - bool cork = false, enospc = sk_msg_full(msg); + bool cork = false, enospc = sk_msg_full(msg), redir_ingress; struct sock *sk_redir; u32 tosend, origsize, sent, delta = 0; u32 eval; @@ -322,6 +321,7 @@ more_data: sk_msg_apply_bytes(psock, tosend); break; case __SK_REDIRECT: + redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); if (!psock->apply_bytes) { @@ -338,7 +338,8 @@ more_data: release_sock(sk); origsize = msg->sg.size; - ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, + msg, tosend, flags); sent = origsize - msg->sg.size; if (eval == __SK_REDIRECT) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 264cf367e265..9ed978634125 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -792,7 +792,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, struct sk_psock *psock; struct sock *sk_redir; struct tls_rec *rec; - bool enospc, policy; + bool enospc, policy, redir_ingress; int err = 0, send; u32 delta = 0; @@ -837,6 +837,7 @@ more_data: } break; case __SK_REDIRECT: + redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; memcpy(&msg_redir, msg, sizeof(*msg)); if (msg->apply_bytes < send) @@ -846,7 +847,8 @@ more_data: sk_msg_return_zero(sk, msg, send); msg->sg.size -= send; release_sock(sk); - err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags); + err = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, + &msg_redir, send, flags); lock_sock(sk); if (err < 0) { *copied -= sk_msg_free_nocharge(sk, &msg_redir); -- cgit v1.2.3 From 9072931f020bfd907d6d89ee21ff1481cd78b407 Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 29 Nov 2022 18:40:40 +0800 Subject: bpf, sockmap: Fix data loss caused by using apply_bytes on ingress redirect Use apply_bytes on ingress redirect, when apply_bytes is less than the length of msg data, some data may be skipped and lost in bpf_tcp_ingress(). If there is still data in the scatterlist that has not been consumed, we cannot move the msg iter. 
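For orientation, the user-visible setup that exercises this path is an sk_msg verdict program combining bpf_msg_apply_bytes() with an ingress redirect, roughly as in the sketch below (illustrative only; the map name and the 1024-byte figure are made up, not taken from the selftests):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_SOCKMAP);
            __uint(max_entries, 2);
            __type(key, __u32);
            __type(value, __u64);
    } sock_map SEC(".maps");

    SEC("sk_msg")
    int msg_verdict(struct sk_msg_md *msg)
    {
            /* Apply the verdict below to the first 1024 bytes only ... */
            bpf_msg_apply_bytes(msg, 1024);
            /* ... redirecting them to the ingress queue of the socket at
             * key 0; the rest of the msg is handled on a later pass.
             */
            return bpf_msg_redirect_map(msg, &sock_map, 0, BPF_F_INGRESS);
    }

    char _license[] SEC("license") = "GPL";

With apply_bytes smaller than the msg, the sk_msg_iter_var_prev() adjustment in the diff below keeps the msg iterator on a scatterlist entry that still holds unconsumed data instead of stepping past it.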
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-4-git-send-email-yangpc@wangsu.com --- net/ipv4/tcp_bpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 275c5ca9e04d..94aad3870c5f 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -45,8 +45,11 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, tmp->sg.end = i; if (apply) { apply_bytes -= size; - if (!apply_bytes) + if (!apply_bytes) { + if (sge->length) + sk_msg_iter_var_prev(i); break; + } } } while (i != msg->sg.end); -- cgit v1.2.3 From 89903dcb3c2e134fb101de7921a19dd9f8418b4c Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 29 Nov 2022 18:40:41 +0800 Subject: selftests/bpf: Add ingress tests for txmsg with apply_bytes Currently, the ingress redirect is not covered in "txmsg test apply". Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-5-git-send-email-yangpc@wangsu.com --- tools/testing/selftests/bpf/test_sockmap.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index e768181a1bd7..024a0faafb3b 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1690,24 +1690,42 @@ static void test_txmsg_apply(int cgrp, struct sockmap_options *opt) { txmsg_pass = 1; txmsg_redir = 0; + txmsg_ingress = 0; txmsg_apply = 1; txmsg_cork = 0; test_send_one(opt, cgrp); txmsg_pass = 0; txmsg_redir = 1; + txmsg_ingress = 0; + txmsg_apply = 1; + txmsg_cork = 0; + test_send_one(opt, cgrp); + + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_ingress = 1; txmsg_apply = 1; txmsg_cork = 0; test_send_one(opt, cgrp); txmsg_pass = 1; txmsg_redir = 0; + txmsg_ingress = 0; + txmsg_apply = 1024; + txmsg_cork = 0; + test_send_large(opt, cgrp); + + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_ingress = 0; txmsg_apply = 1024; txmsg_cork = 0; test_send_large(opt, cgrp); txmsg_pass = 0; txmsg_redir = 1; + txmsg_ingress = 1; txmsg_apply = 1024; txmsg_cork = 0; test_send_large(opt, cgrp); -- cgit v1.2.3 From 3144bfa5078e0df7507a4de72061501e6a0e56be Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 29 Nov 2022 21:21:47 -0800 Subject: bpf: Fix a compilation failure with clang lto build When building the kernel with clang lto (CONFIG_LTO_CLANG_FULL=y), the following compilation error will appear: $ make LLVM=1 LLVM_IAS=1 -j ... ld.lld: error: ld-temp.o :26889:1: symbol 'cgroup_storage_map_btf_ids' is already defined cgroup_storage_map_btf_ids:; ^ make[1]: *** [/.../bpf-next/scripts/Makefile.vmlinux_o:61: vmlinux.o] Error 1 In local_storage.c, we have BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_local_storage_map) Commit c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") added the above identical BTF_ID_LIST_SINGLE definition in bpf_cgrp_storage.c. With duplicated definitions, llvm linker complains with lto build. Also, extracting btf_id of 'struct bpf_local_storage_map' is defined four times for sk, inode, task and cgrp local storages. Let us define a single global one with a different name than cgroup_storage_map_btf_ids, which also fixed the lto compilation error. 
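As background on why this only shows up under full LTO: BTF_ID_LIST_SINGLE() emits its list as a local symbol via module-level inline asm, so per-object duplicates are normally harmless, but full LTO merges every translation unit into one IR module and the duplicate label collides there. A deliberately contrived illustration of the mechanism (not kernel code), assuming a clang -flto=full build:

    /* a.c */
    __asm__(".pushsection .data\nmy_ids:\n.long 0\n.popsection");

    /* b.c */
    __asm__(".pushsection .data\nmy_ids:\n.long 0\n.popsection");

    /* Compiled and linked normally, each object keeps its own local
     * "my_ids" label and the link succeeds.  With full LTO both
     * module-level asm blocks land in the single merged module and the
     * build fails with "symbol 'my_ids' is already defined" -- the same
     * failure mode as the duplicated cgroup_storage_map_btf_ids above.
     */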
Fixes: c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221130052147.1591625-1-yhs@fb.com --- include/linux/btf_ids.h | 1 + kernel/bpf/bpf_cgrp_storage.c | 3 +-- kernel/bpf/bpf_inode_storage.c | 4 +--- kernel/bpf/bpf_task_storage.c | 4 ++-- net/core/bpf_sk_storage.c | 3 +-- 5 files changed, 6 insertions(+), 9 deletions(-) diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 93397711a68c..3a4f7cd882ca 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -266,5 +266,6 @@ MAX_BTF_TRACING_TYPE, extern u32 btf_tracing_ids[]; extern u32 bpf_cgroup_btf_id[]; +extern u32 bpf_local_storage_map_btf_id[]; #endif diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 309403800f82..6cdf6d9ed91d 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -211,7 +211,6 @@ BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgro return ret; } -BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops cgrp_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -222,7 +221,7 @@ const struct bpf_map_ops cgrp_storage_map_ops = { .map_update_elem = bpf_cgrp_storage_update_elem, .map_delete_elem = bpf_cgrp_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &cgroup_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = cgroup_storage_ptr, }; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 6a1d4d22816a..05f4c66c9089 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -213,8 +213,6 @@ static void inode_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(map, &inode_cache, NULL); } -BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct, - bpf_local_storage_map) const struct bpf_map_ops inode_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -225,7 +223,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_update_elem = bpf_fd_inode_storage_update_elem, .map_delete_elem = bpf_fd_inode_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &inode_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = inode_storage_ptr, }; diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 8e832db8151a..1e486055a523 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -324,7 +324,7 @@ static void task_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); } -BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map) +BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) const struct bpf_map_ops task_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -335,7 +335,7 @@ const struct bpf_map_ops task_storage_map_ops = { .map_update_elem = bpf_pid_task_storage_update_elem, .map_delete_elem = bpf_pid_task_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &task_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], 
.map_owner_storage_ptr = task_storage_ptr, }; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 9d2288c0736e..bb378c33f542 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -310,7 +310,6 @@ bpf_sk_storage_ptr(void *owner) return &sk->sk_bpf_storage; } -BTF_ID_LIST_SINGLE(sk_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops sk_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -321,7 +320,7 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &sk_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_local_storage_charge = bpf_sk_storage_charge, .map_local_storage_uncharge = bpf_sk_storage_uncharge, .map_owner_storage_ptr = bpf_sk_storage_ptr, -- cgit v1.2.3 From 1f82dffc10ff8e44bd0c2c85ba6e21189b4a5695 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 1 Dec 2022 10:34:05 -0800 Subject: bpf: Fix release_on_unlock release logic for multiple refs Consider a verifier state with three acquired references, all with release_on_unlock = true: idx 0 1 2 state->refs = [2 4 6] (with 2, 4, and 6 being the ref ids). When bpf_spin_unlock is called, process_spin_lock will loop through all acquired_refs and, for each ref, if it's release_on_unlock, calls release_reference on it. That function in turn calls release_reference_state, which removes the reference from state->refs by swapping the reference state with the last reference state in refs array and decrements acquired_refs count. process_spin_lock's loop logic, which is essentially: for (i = 0; i < state->acquired_refs; i++) { if (!state->refs[i].release_on_unlock) continue; release_reference(state->refs[i].id); } will fail to release release_on_unlock references which are swapped from the end. Running this logic on our example demonstrates: state->refs = [2 4 6] (start of idx=0 iter) release state->refs[0] by swapping w/ state->refs[2] state->refs = [6 4] (start of idx=1) release state->refs[1], no need to swap as it's the last idx state->refs = [6] (start of idx=2, loop terminates) ref_id 6 should have been removed but was skipped. Fix this by looping from back-to-front, which results in refs that are candidates for removal being swapped with refs which have already been examined and kept. 
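The effect of the loop direction can be reproduced with a standalone toy (ordinary userspace C, not kernel code) that mimics release_reference_state()'s swap-with-last removal:

    #include <stdio.h>

    static int refs[3];
    static int nr;

    static void reset(void) { refs[0] = 2; refs[1] = 4; refs[2] = 6; nr = 3; }

    /* mimic release_reference_state(): swap entry i with the last one, shrink */
    static void release_idx(int i)
    {
            refs[i] = refs[nr - 1];
            nr--;
    }

    int main(void)
    {
            int i;

            reset();
            for (i = 0; i < nr; i++)        /* old, buggy direction */
                    release_idx(i);
            printf("forward loop leaves %d ref(s)\n", nr);   /* 1: ref 6 skipped */

            reset();
            for (i = nr - 1; i >= 0; i--)   /* fixed direction */
                    release_idx(i);
            printf("backward loop leaves %d ref(s)\n", nr);  /* 0: all released */
            return 0;
    }
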
If we modify our initial example such that ref 6 is replaced with ref 7, which is _not_ release_on_unlock, and loop from the back, we'd see: state->refs = [2 4 7] (start of idx=2) state->refs = [2 4 7] (start of idx=1) state->refs = [2 7] (start of idx=0, refs 7 and 4 swapped) state->refs = [7] (after idx=0, 7 and 2 swapped, loop terminates) Signed-off-by: Dave Marchevsky Acked-by: Yonghong Song cc: Kumar Kartikeya Dwivedi Fixes: 534e86bc6c66 ("bpf: Add 'release on unlock' logic for bpf_list_push_{front,back}") Link: https://lore.kernel.org/r/20221201183406.1203621-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d0ecc0b18b20..b0db9c10567b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5738,7 +5738,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, cur->active_lock.ptr = NULL; cur->active_lock.id = 0; - for (i = 0; i < fstate->acquired_refs; i++) { + for (i = fstate->acquired_refs - 1; i >= 0; i--) { int err; /* Complain on error because this reference state cannot -- cgit v1.2.3 From 78b037bd402df8eca0f45ef003c6d0ab25a26ecc Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 1 Dec 2022 10:34:06 -0800 Subject: selftests/bpf: Validate multiple ref release_on_unlock logic Modify list_push_pop_multiple to alloc and insert nodes 2-at-a-time. Without the previous patch's fix, this block of code: bpf_spin_lock(lock); bpf_list_push_front(head, &f[i]->node); bpf_list_push_front(head, &f[i + 1]->node); bpf_spin_unlock(lock); would fail check_reference_leak check as release_on_unlock logic would miss a ref that should've been released. Signed-off-by: Dave Marchevsky cc: Kumar Kartikeya Dwivedi Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20221201183406.1203621-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/linked_list.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/linked_list.c b/tools/testing/selftests/bpf/progs/linked_list.c index 2c7b615c6d41..4ad88da5cda2 100644 --- a/tools/testing/selftests/bpf/progs/linked_list.c +++ b/tools/testing/selftests/bpf/progs/linked_list.c @@ -99,13 +99,28 @@ int list_push_pop_multiple(struct bpf_spin_lock *lock, struct bpf_list_head *hea struct foo *f[8], *pf; int i; - for (i = 0; i < ARRAY_SIZE(f); i++) { + /* Loop following this check adds nodes 2-at-a-time in order to + * validate multiple release_on_unlock release logic + */ + if (ARRAY_SIZE(f) % 2) + return 10; + + for (i = 0; i < ARRAY_SIZE(f); i += 2) { f[i] = bpf_obj_new(typeof(**f)); if (!f[i]) return 2; f[i]->data = i; + + f[i + 1] = bpf_obj_new(typeof(**f)); + if (!f[i + 1]) { + bpf_obj_drop(f[i]); + return 9; + } + f[i + 1]->data = i + 1; + bpf_spin_lock(lock); bpf_list_push_front(head, &f[i]->node); + bpf_list_push_front(head, &f[i + 1]->node); bpf_spin_unlock(lock); } -- cgit v1.2.3 From bc067cacb69c6ee0da10d357458fa7f1fe54e472 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Tue, 29 Nov 2022 21:45:58 +0800 Subject: bpf, docs: Correct the example of BPF_XOR Refer to description of BPF_XOR, dst_reg should be used but not src_reg in the examples. 
Fixes: be3193cded9d ("bpf, docs: Add subsections for ALU and JMP instructions") Signed-off-by: Zheng Yejian Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20221129134558.2757043-1-zhengyejian1@huawei.com --- Documentation/bpf/instruction-set.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 5d798437dad4..e672d5ec6cc7 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -122,11 +122,11 @@ BPF_END 0xd0 byte swap operations (see `Byte swap instructions`_ below) ``BPF_XOR | BPF_K | BPF_ALU`` means:: - src_reg = (u32) src_reg ^ (u32) imm32 + dst_reg = (u32) dst_reg ^ (u32) imm32 ``BPF_XOR | BPF_K | BPF_ALU64`` means:: - src_reg = src_reg ^ imm32 + dst_reg = dst_reg ^ imm32 Byte swap instructions -- cgit v1.2.3 From f16a7aa5c2be3134d3b72078b94f36d639552888 Mon Sep 17 00:00:00 2001 From: James Hilliard Date: Thu, 1 Dec 2022 12:09:39 -0700 Subject: selftests/bpf: Add GCC compatible builtins to bpf_legacy.h The bpf_legacy.h header uses llvm specific load functions, add GCC compatible variants as well to fix tests using these functions under GCC. Signed-off-by: James Hilliard Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221201190939.3230513-1-james.hilliard1@gmail.com --- tools/testing/selftests/bpf/bpf_legacy.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_legacy.h b/tools/testing/selftests/bpf/bpf_legacy.h index 845209581440..bc4555a003a7 100644 --- a/tools/testing/selftests/bpf/bpf_legacy.h +++ b/tools/testing/selftests/bpf/bpf_legacy.h @@ -2,15 +2,22 @@ #ifndef __BPF_LEGACY__ #define __BPF_LEGACY__ +#if __GNUC__ && !__clang__ +/* Functions to emit BPF_LD_ABS and BPF_LD_IND instructions. We + * provide the "standard" names as synonyms of the corresponding GCC + * builtins. Note how the SKB argument is ignored. + */ +#define load_byte(skb, off) __builtin_bpf_load_byte(off) +#define load_half(skb, off) __builtin_bpf_load_half(off) +#define load_word(skb, off) __builtin_bpf_load_word(off) +#else /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions */ -unsigned long long load_byte(void *skb, - unsigned long long off) asm("llvm.bpf.load.byte"); -unsigned long long load_half(void *skb, - unsigned long long off) asm("llvm.bpf.load.half"); -unsigned long long load_word(void *skb, - unsigned long long off) asm("llvm.bpf.load.word"); +unsigned long long load_byte(void *skb, unsigned long long off) asm("llvm.bpf.load.byte"); +unsigned long long load_half(void *skb, unsigned long long off) asm("llvm.bpf.load.half"); +unsigned long long load_word(void *skb, unsigned long long off) asm("llvm.bpf.load.word"); +#endif #endif -- cgit v1.2.3 From 706819495921ddad6b3780140b9d9e9293b6dedc Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Fri, 2 Dec 2022 16:17:38 +0800 Subject: libbpf: Improve usability of libbpf Makefile Current libbpf Makefile does not contain the help command, which is inconvenient to use. Similar to the Makefile help command of the perf, a help command is provided to list the commands supported by libbpf make and the functions of the commands. 
Signed-off-by: Xin Liu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221202081738.128513-1-liuxin350@huawei.com --- tools/lib/bpf/Makefile | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 4c904ef0b47e..477666f3d496 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -286,3 +286,20 @@ tags: # Delete partially updated (corrupted) files on error .DELETE_ON_ERROR: + +help: + @echo 'libbpf common targets:' + @echo ' HINT: use "V=1" to enable verbose build' + @echo ' all - build libraries and pkgconfig' + @echo ' clean - remove all generated files' + @echo ' check - check abi and version info' + @echo '' + @echo 'libbpf install targets:' + @echo ' HINT: use "prefix"(defaults to "/usr/local") or "DESTDIR" (defaults to "/")' + @echo ' to adjust target desitantion, e.g. "make prefix=/usr/local install"' + @echo ' install - build and install all headers, libraries and pkgconfig' + @echo ' install_headers - install only headers to include/bpf' + @echo '' + @echo 'libbpf make targets:' + @echo ' tags - use ctags to make tag information for source code browsing' + @echo ' cscope - use cscope to make interactive source code browsing database' -- cgit v1.2.3 From fca1aa75518c03b04c3c249e9a9134faf9ca18c5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 3 Dec 2022 10:46:02 -0800 Subject: bpf: Handle MEM_RCU type properly Commit 9bb00b2895cb ("bpf: Add kfunc bpf_rcu_read_lock/unlock()") introduced MEM_RCU and bpf_rcu_read_lock/unlock() support. In that commit, a rcu pointer is tagged with both MEM_RCU and PTR_TRUSTED so that it can be passed into kfuncs or helpers as an argument. Martin raised a good question in [1] such that the rcu pointer, although being able to accessing the object, might have reference count of 0. This might cause a problem if the rcu pointer is passed to a kfunc which expects trusted arguments where ref count should be greater than 0. This patch makes the following changes related to MEM_RCU pointer: - MEM_RCU pointer might be NULL (PTR_MAYBE_NULL). - Introduce KF_RCU so MEM_RCU ptr can be acquired with a KF_RCU tagged kfunc which assumes ref count of rcu ptr could be zero. - For mem access 'b = ptr->a', say 'ptr' is a MEM_RCU ptr, and 'a' is tagged with __rcu as well. Let us mark 'b' as MEM_RCU | PTR_MAYBE_NULL. 
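A minimal sketch of the resulting programming model, modelled on the rcu_read_lock selftest updated later in this series (it assumes that selftest's includes, __ksym declarations and task-storage map named map_a): a pointer loaded under bpf_rcu_read_lock() is MEM_RCU | PTR_MAYBE_NULL and must be checked before use:

    SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
    int rcu_deref_sketch(void *ctx)
    {
            struct task_struct *task, *real_parent;

            task = bpf_get_current_task_btf();
            bpf_rcu_read_lock();
            real_parent = task->real_parent;    /* marked MEM_RCU | PTR_MAYBE_NULL */
            if (!real_parent)                   /* NULL check is now mandatory */
                    goto out;
            (void)bpf_task_storage_get(&map_a, real_parent, 0, 0);
    out:
            bpf_rcu_read_unlock();
            return 0;
    }
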
[1] https://lore.kernel.org/bpf/ac70f574-4023-664e-b711-e0d3b18117fd@linux.dev/ Fixes: 9bb00b2895cb ("bpf: Add kfunc bpf_rcu_read_lock/unlock()") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221203184602.477272-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- include/linux/btf.h | 1 + kernel/bpf/helpers.c | 14 ++++++++++++++ kernel/bpf/verifier.c | 45 +++++++++++++++++++++++++++++++------------- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b5090e89cb3f..c07b351a5bc7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -682,7 +682,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } -#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | MEM_RCU | PTR_TRUSTED) +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { diff --git a/include/linux/btf.h b/include/linux/btf.h index 9ed00077db6e..cbd6e4096f8c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -70,6 +70,7 @@ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ +#define KF_RCU (1 << 7) /* kfunc only takes rcu pointer arguments */ /* * Return the name of the passed struct, if exists, or halt the build if for diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a5a511430f2a..cca642358e80 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1837,6 +1837,19 @@ struct task_struct *bpf_task_acquire(struct task_struct *p) return p; } +/** + * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task + * acquired by this kfunc which is not stored in a map as a kptr, must be + * released by calling bpf_task_release(). + * @p: The task on which a reference is being acquired. + */ +struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) +{ + if (!refcount_inc_not_zero(&p->rcu_users)) + return NULL; + return p; +} + /** * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task * kptr acquired by this kfunc which is not subsequently stored in a map, must @@ -2013,6 +2026,7 @@ BTF_ID_FLAGS(func, bpf_list_push_back) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) #ifdef CONFIG_CGROUPS diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b0db9c10567b..66b82f52b5bc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4275,7 +4275,7 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg) return true; /* If a register is not referenced, it is trusted if it has the - * MEM_ALLOC, MEM_RCU or PTR_TRUSTED type modifiers, and no others. Some of the + * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the * other type modifiers may be safe, but we elect to take an opt-in * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are * not. 
@@ -4287,6 +4287,11 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg) !bpf_type_has_unsafe_modifiers(reg->type); } +static bool is_rcu_reg(const struct bpf_reg_state *reg) +{ + return reg->type & MEM_RCU; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -4785,14 +4790,16 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (flag & MEM_RCU) { /* Mark value register as MEM_RCU only if it is protected by - * bpf_rcu_read_lock() and the ptr reg is trusted. MEM_RCU + * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU * itself can already indicate trustedness inside the rcu - * read lock region. Also mark it as PTR_TRUSTED. + * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since + * it could be null in some cases. */ - if (!env->cur_state->active_rcu_lock || !is_trusted_reg(reg)) + if (!env->cur_state->active_rcu_lock || + !(is_trusted_reg(reg) || is_rcu_reg(reg))) flag &= ~MEM_RCU; else - flag |= PTR_TRUSTED; + flag |= PTR_MAYBE_NULL; } else if (reg->type & MEM_RCU) { /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. @@ -5957,7 +5964,7 @@ static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID, PTR_TO_BTF_ID | PTR_TRUSTED, - PTR_TO_BTF_ID | MEM_RCU | PTR_TRUSTED, + PTR_TO_BTF_ID | MEM_RCU, }, }; static const struct bpf_reg_types percpu_btf_ptr_types = { @@ -6136,7 +6143,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | MEM_ALLOC: case PTR_TO_BTF_ID | PTR_TRUSTED: - case PTR_TO_BTF_ID | MEM_RCU | PTR_TRUSTED: + case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: /* When referenced PTR_TO_BTF_ID is passed to release function, * it's fixed offset must be 0. 
In the other cases, fixed offset @@ -8038,6 +8045,11 @@ static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_DESTRUCTIVE; } +static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_RCU; +} + static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg) { return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET); @@ -8722,13 +8734,20 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ switch (kf_arg_type) { case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: - if (!is_kfunc_trusted_args(meta)) + if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) break; if (!is_trusted_reg(reg)) { - verbose(env, "R%d must be referenced or trusted\n", regno); - return -EINVAL; + if (!is_kfunc_rcu(meta)) { + verbose(env, "R%d must be referenced or trusted\n", regno); + return -EINVAL; + } + if (!is_rcu_reg(reg)) { + verbose(env, "R%d must be a rcu pointer\n", regno); + return -EINVAL; + } } + fallthrough; case KF_ARG_PTR_TO_CTX: /* Trusted arguments have the same offset checks as release arguments */ @@ -8839,7 +8858,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_BTF_ID: /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID || - bpf_type_has_unsafe_modifiers(reg->type)) && + (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && !reg2btf_ids[base_type(reg->type)]) { verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type)); verbose(env, "expected %s or socket\n", @@ -8954,7 +8973,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } else if (rcu_unlock) { bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ if (reg->type & MEM_RCU) { - reg->type &= ~(MEM_RCU | PTR_TRUSTED); + reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); reg->type |= PTR_UNTRUSTED; } })); @@ -11294,7 +11313,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, bool is_null) { if (type_may_be_null(reg->type) && reg->id == id && - !WARN_ON_ONCE(!reg->id)) { + (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) { /* Old offset (both fixed and variable parts) should have been * known-zero, because we don't allow pointer arithmetic on * pointers that might be NULL. If we see this happening, don't -- cgit v1.2.3 From 8723ec22a31db3f400fce440c235ada0fff95657 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 3 Dec 2022 10:46:07 -0800 Subject: selftests/bpf: Fix rcu_read_lock test with new MEM_RCU semantics Add MEM_RCU pointer null checking for related tests. Also modified task_acquire test so it takes a rcu ptr 'ptr' where 'ptr = rcu_ptr->rcu_field'. 
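Condensed, the reworked task_acquire flow looks as follows (the full version is in the diff below; this sketch assumes the selftest's map_a and __ksym declarations):

    SEC("?fentry.s/" SYS_PREFIX "sys_nanosleep")
    int task_acquire_sketch(void *ctx)
    {
            struct task_struct *task, *real_parent, *gparent;

            task = bpf_get_current_task_btf();
            bpf_rcu_read_lock();
            real_parent = task->real_parent;            /* MEM_RCU, may be NULL */
            if (!real_parent)
                    goto out;
            gparent = real_parent->real_parent;         /* rcu_ptr->rcu_field */
            if (!gparent)
                    goto out;
            /* KF_RCU | KF_RET_NULL kfunc: the rcu ptr may have refcount 0 */
            gparent = bpf_task_acquire_not_zero(gparent);
            if (!gparent)
                    goto out;
            (void)bpf_task_storage_get(&map_a, gparent, 0, 0);
            bpf_task_release(gparent);
    out:
            bpf_rcu_read_unlock();
            return 0;
    }
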
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221203184607.478314-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/rcu_read_lock.c | 55 ++++++++++++++++++----- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index 94a970076b98..cf06a34fcb02 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -23,13 +23,14 @@ struct bpf_key *bpf_lookup_user_key(__u32 serial, __u64 flags) __ksym; void bpf_key_put(struct bpf_key *key) __ksym; void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; -struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; +struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) __ksym; void bpf_task_release(struct task_struct *p) __ksym; SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") int get_cgroup_id(void *ctx) { struct task_struct *task; + struct css_set *cgroups; task = bpf_get_current_task_btf(); if (task->pid != target_pid) @@ -37,7 +38,11 @@ int get_cgroup_id(void *ctx) /* simulate bpf_get_current_cgroup_id() helper */ bpf_rcu_read_lock(); - cgroup_id = task->cgroups->dfl_cgrp->kn->id; + cgroups = task->cgroups; + if (!cgroups) + goto unlock; + cgroup_id = cgroups->dfl_cgrp->kn->id; +unlock: bpf_rcu_read_unlock(); return 0; } @@ -56,6 +61,8 @@ int task_succ(void *ctx) bpf_rcu_read_lock(); /* region including helper using rcu ptr real_parent */ real_parent = task->real_parent; + if (!real_parent) + goto out; ptr = bpf_task_storage_get(&map_a, real_parent, &init_val, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!ptr) @@ -92,7 +99,10 @@ int two_regions(void *ctx) bpf_rcu_read_unlock(); bpf_rcu_read_lock(); real_parent = task->real_parent; + if (!real_parent) + goto out; (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); +out: bpf_rcu_read_unlock(); return 0; } @@ -105,7 +115,10 @@ int non_sleepable_1(void *ctx) task = bpf_get_current_task_btf(); bpf_rcu_read_lock(); real_parent = task->real_parent; + if (!real_parent) + goto out; (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); +out: bpf_rcu_read_unlock(); return 0; } @@ -121,7 +134,10 @@ int non_sleepable_2(void *ctx) bpf_rcu_read_lock(); real_parent = task->real_parent; + if (!real_parent) + goto out; (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); +out: bpf_rcu_read_unlock(); return 0; } @@ -129,16 +145,28 @@ int non_sleepable_2(void *ctx) SEC("?fentry.s/" SYS_PREFIX "sys_nanosleep") int task_acquire(void *ctx) { - struct task_struct *task, *real_parent; + struct task_struct *task, *real_parent, *gparent; task = bpf_get_current_task_btf(); bpf_rcu_read_lock(); real_parent = task->real_parent; + if (!real_parent) + goto out; + + /* rcu_ptr->rcu_field */ + gparent = real_parent->real_parent; + if (!gparent) + goto out; + /* acquire a reference which can be used outside rcu read lock region */ - real_parent = bpf_task_acquire(real_parent); + gparent = bpf_task_acquire_not_zero(gparent); + if (!gparent) + goto out; + + (void)bpf_task_storage_get(&map_a, gparent, 0, 0); + bpf_task_release(gparent); +out: bpf_rcu_read_unlock(); - (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); - bpf_task_release(real_parent); return 0; } @@ -181,9 +209,12 @@ int non_sleepable_rcu_mismatch(void *ctx) /* non-sleepable: missing bpf_rcu_read_unlock() in one path */ bpf_rcu_read_lock(); real_parent = task->real_parent; + if (!real_parent) + goto out; 
(void)bpf_task_storage_get(&map_a, real_parent, 0, 0); if (real_parent) bpf_rcu_read_unlock(); +out: return 0; } @@ -199,16 +230,17 @@ int inproper_sleepable_helper(void *ctx) /* sleepable helper in rcu read lock region */ bpf_rcu_read_lock(); real_parent = task->real_parent; + if (!real_parent) + goto out; regs = (struct pt_regs *)bpf_task_pt_regs(real_parent); - if (!regs) { - bpf_rcu_read_unlock(); - return 0; - } + if (!regs) + goto out; ptr = (void *)PT_REGS_IP(regs); (void)bpf_copy_from_user_task(&value, sizeof(uint32_t), ptr, task, 0); user_data = value; (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); +out: bpf_rcu_read_unlock(); return 0; } @@ -239,7 +271,10 @@ int nested_rcu_region(void *ctx) bpf_rcu_read_lock(); bpf_rcu_read_lock(); real_parent = task->real_parent; + if (!real_parent) + goto out; (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); +out: bpf_rcu_read_unlock(); bpf_rcu_read_unlock(); return 0; -- cgit v1.2.3 From f53625649888c6ec9eeca151f802e905482c6698 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 3 Dec 2022 10:46:13 -0800 Subject: docs/bpf: Add KF_RCU documentation Add proper KF_RCU documentation in kfuncs.rst. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221203184613.478967-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 90774479ab7a..b027fe16ee66 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -191,6 +191,15 @@ rebooting or panicking. Due to this additional restrictions apply to these calls. At the moment they only require CAP_SYS_BOOT capability, but more can be added later. +2.4.8 KF_RCU flag +----------------- + +The KF_RCU flag is used for kfuncs which have a rcu ptr as its argument. +When used together with KF_ACQUIRE, it indicates the kfunc should have a +single argument which must be a trusted argument or a MEM_RCU pointer. +The argument may have reference count of 0 and the kfunc must take this +into consideration. + 2.5 Registering the kfuncs -------------------------- -- cgit v1.2.3 From c0c852dd1876dc1db4600ce951a92aadd3073b1c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 3 Dec 2022 12:49:54 -0800 Subject: bpf: Do not mark certain LSM hook arguments as trusted Martin mentioned that the verifier cannot assume arguments from LSM hook sk_alloc_security being trusted since after the hook is called, the sk ref_count is set to 1. This will overwrite the ref_count changed by the bpf program and may cause ref_count underflow later on. I then further checked some other hooks. For example, for bpf_lsm_file_alloc() hook in fs/file_table.c, f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { file_free_rcu(&f->f_rcuhead); return ERR_PTR(error); } atomic_long_set(&f->f_count, 1); The input parameter 'f' to security_file_alloc() cannot be trusted as well. Specifically, I investiaged bpf_map/bpf_prog/file/sk/task alloc/free lsm hooks. Except bpf_map_alloc and task_alloc, arguments for all other hooks should not be considered as trusted. This may not be a complete list, but it covers common usage for sk and task. 
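The underflow can be illustrated with a userspace toy (made-up names, plain C, not kernel code) that follows the ordering described above, where the hook runs before the owner initializes the reference count:

    #include <stdio.h>

    struct obj { int refcnt; };

    /* stands in for an LSM hook whose argument a BPF program wrongly trusts */
    static void lsm_hook(struct obj *o)
    {
            o->refcnt++;                    /* prog "acquires" the object */
    }

    int main(void)
    {
            struct obj o = { .refcnt = 0 };

            lsm_hook(&o);                   /* refcnt = 1 (prog's reference) */
            o.refcnt = 1;                   /* owner then does refcount_set(.., 1),
                                             * silently discarding the prog's ref */
            o.refcnt--;                     /* prog releases: refcnt = 0 */
            o.refcnt--;                     /* owner's final put: underflow */
            printf("refcnt = %d\n", o.refcnt);   /* prints -1 */
            return 0;
    }
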
Fixes: 3f00c5239344 ("bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221203204954.2043348-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_lsm.h | 6 ++++++ kernel/bpf/bpf_lsm.c | 16 ++++++++++++++++ kernel/bpf/btf.c | 2 ++ tools/testing/selftests/bpf/prog_tests/task_kfunc.c | 1 + tools/testing/selftests/bpf/progs/task_kfunc_failure.c | 11 +++++++++++ 5 files changed, 36 insertions(+) diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 4bcf76a9bb06..1de7ece5d36d 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -28,6 +28,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); bool bpf_lsm_is_sleepable_hook(u32 btf_id); +bool bpf_lsm_is_trusted(const struct bpf_prog *prog); static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) @@ -51,6 +52,11 @@ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) return false; } +static inline bool bpf_lsm_is_trusted(const struct bpf_prog *prog) +{ + return false; +} + static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index ae0267f150b5..9ea42a45da47 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -345,11 +345,27 @@ BTF_ID(func, bpf_lsm_task_to_inode) BTF_ID(func, bpf_lsm_userns_create) BTF_SET_END(sleepable_lsm_hooks) +BTF_SET_START(untrusted_lsm_hooks) +BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_prog_alloc_security) +BTF_ID(func, bpf_lsm_bpf_prog_free_security) +BTF_ID(func, bpf_lsm_file_alloc_security) +BTF_ID(func, bpf_lsm_file_free_security) +BTF_ID(func, bpf_lsm_sk_alloc_security) +BTF_ID(func, bpf_lsm_sk_free_security) +BTF_ID(func, bpf_lsm_task_free) +BTF_SET_END(untrusted_lsm_hooks) + bool bpf_lsm_is_sleepable_hook(u32 btf_id) { return btf_id_set_contains(&sleepable_lsm_hooks, btf_id); } +bool bpf_lsm_is_trusted(const struct bpf_prog *prog) +{ + return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id); +} + const struct bpf_prog_ops lsm_prog_ops = { }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d11cbf8cece7..c80bd8709e69 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -5829,6 +5830,7 @@ static bool prog_args_trusted(const struct bpf_prog *prog) case BPF_PROG_TYPE_TRACING: return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER; case BPF_PROG_TYPE_LSM: + return bpf_lsm_is_trusted(prog); case BPF_PROG_TYPE_STRUCT_OPS: return true; default: diff --git a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c index ffd8ef4303c8..18848c31e36f 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c +++ b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c @@ -103,6 +103,7 @@ static struct { {"task_kfunc_release_null", "arg#0 is ptr_or_null_ expected ptr_ or socket"}, {"task_kfunc_release_unacquired", "release kernel function bpf_task_release expects"}, {"task_kfunc_from_pid_no_null_check", "arg#0 is ptr_or_null_ expected ptr_ or socket"}, + {"task_kfunc_from_lsm_task_free", "reg type unsupported for arg#0 function"}, }; static void verify_fail(const char *prog_name, const char *expected_err_msg) diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c index 
e310473190d5..87fa1db9d9b5 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c @@ -271,3 +271,14 @@ int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 cl return 0; } + +SEC("lsm/task_free") +int BPF_PROG(task_kfunc_from_lsm_task_free, struct task_struct *task) +{ + struct task_struct *acquired; + + /* the argument of lsm task_free hook is untrusted. */ + acquired = bpf_task_acquire(task); + bpf_task_release(acquired); + return 0; +} -- cgit v1.2.3 From 8972e18a439d6c47e838b600d71ff0a4f102f0e0 Mon Sep 17 00:00:00 2001 From: Sreevani Sreejith Date: Fri, 2 Dec 2022 14:17:10 -0800 Subject: bpf, docs: BPF Iterator Document Document that describes how BPF iterators work, how to use iterators, and how to pass parameters in BPF iterators. Acked-by: David Vernet Signed-off-by: Sreevani Sreejith Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20221202221710.320810-2-ssreevani@meta.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/bpf_iterators.rst | 485 ++++++++++++++++++++++++++++++++++++ Documentation/bpf/index.rst | 1 + 2 files changed, 486 insertions(+) create mode 100644 Documentation/bpf/bpf_iterators.rst diff --git a/Documentation/bpf/bpf_iterators.rst b/Documentation/bpf/bpf_iterators.rst new file mode 100644 index 000000000000..6d7770793fab --- /dev/null +++ b/Documentation/bpf/bpf_iterators.rst @@ -0,0 +1,485 @@ +============= +BPF Iterators +============= + + +---------- +Motivation +---------- + +There are a few existing ways to dump kernel data into user space. The most +popular one is the ``/proc`` system. For example, ``cat /proc/net/tcp6`` dumps +all tcp6 sockets in the system, and ``cat /proc/net/netlink`` dumps all netlink +sockets in the system. However, their output format tends to be fixed, and if +users want more information about these sockets, they have to patch the kernel, +which often takes time to publish upstream and release. The same is true for popular +tools like `ss `_ where any +additional information needs a kernel patch. + +To solve this problem, the `drgn +`_ tool is often used to +dig out the kernel data with no kernel change. However, the main drawback for +drgn is performance, as it cannot do pointer tracing inside the kernel. In +addition, drgn cannot validate a pointer value and may read invalid data if the +pointer becomes invalid inside the kernel. + +The BPF iterator solves the above problem by providing flexibility on what data +(e.g., tasks, bpf_maps, etc.) to collect by calling BPF programs for each kernel +data object. + +---------------------- +How BPF Iterators Work +---------------------- + +A BPF iterator is a type of BPF program that allows users to iterate over +specific types of kernel objects. Unlike traditional BPF tracing programs that +allow users to define callbacks that are invoked at particular points of +execution in the kernel, BPF iterators allow users to define callbacks that +should be executed for every entry in a variety of kernel data structures. + +For example, users can define a BPF iterator that iterates over every task on +the system and dumps the total amount of CPU runtime currently used by each of +them. Another BPF task iterator may instead dump the cgroup information for each +task. Such flexibility is the core value of BPF iterators. + +A BPF program is always loaded into the kernel at the behest of a user space +process. 
A user space process loads a BPF program by opening and initializing +the program skeleton as required and then invoking a syscall to have the BPF +program verified and loaded by the kernel. + +In traditional tracing programs, a program is activated by having user space +obtain a ``bpf_link`` to the program with ``bpf_program__attach()``. Once +activated, the program callback will be invoked whenever the tracepoint is +triggered in the main kernel. For BPF iterator programs, a ``bpf_link`` to the +program is obtained using ``bpf_link_create()``, and the program callback is +invoked by issuing system calls from user space. + +Next, let us see how you can use the iterators to iterate on kernel objects and +read data. + +------------------------ +How to Use BPF iterators +------------------------ + +BPF selftests are a great resource to illustrate how to use the iterators. In +this section, we’ll walk through a BPF selftest which shows how to load and use +a BPF iterator program. To begin, we’ll look at `bpf_iter.c +`_, +which illustrates how to load and trigger BPF iterators on the user space side. +Later, we’ll look at a BPF program that runs in kernel space. + +Loading a BPF iterator in the kernel from user space typically involves the +following steps: + +* The BPF program is loaded into the kernel through ``libbpf``. Once the kernel + has verified and loaded the program, it returns a file descriptor (fd) to user + space. +* Obtain a ``link_fd`` to the BPF program by calling the ``bpf_link_create()`` + specified with the BPF program file descriptor received from the kernel. +* Next, obtain a BPF iterator file descriptor (``bpf_iter_fd``) by calling the + ``bpf_iter_create()`` specified with the ``bpf_link`` received from Step 2. +* Trigger the iteration by calling ``read(bpf_iter_fd)`` until no data is + available. +* Close the iterator fd using ``close(bpf_iter_fd)``. +* If needed to reread the data, get a new ``bpf_iter_fd`` and do the read again. + +The following are a few examples of selftest BPF iterator programs: + +* `bpf_iter_tcp4.c `_ +* `bpf_iter_task_vma.c `_ +* `bpf_iter_task_file.c `_ + +Let us look at ``bpf_iter_task_file.c``, which runs in kernel space: + +Here is the definition of ``bpf_iter__task_file`` in `vmlinux.h +`_. +Any struct name in ``vmlinux.h`` in the format ``bpf_iter__`` +represents a BPF iterator. The suffix ```` represents the type of +iterator. + +:: + + struct bpf_iter__task_file { + union { + struct bpf_iter_meta *meta; + }; + union { + struct task_struct *task; + }; + u32 fd; + union { + struct file *file; + }; + }; + +In the above code, the field 'meta' contains the metadata, which is the same for +all BPF iterator programs. The rest of the fields are specific to different +iterators. For example, for task_file iterators, the kernel layer provides the +'task', 'fd' and 'file' field values. The 'task' and 'file' are `reference +counted +`_, +so they won't go away when the BPF program runs. 
+ +Here is a snippet from the ``bpf_iter_task_file.c`` file: + +:: + + SEC("iter/task_file") + int dump_task_file(struct bpf_iter__task_file *ctx) + { + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + struct file *file = ctx->file; + __u32 fd = ctx->fd; + + if (task == NULL || file == NULL) + return 0; + + if (ctx->meta->seq_num == 0) { + count = 0; + BPF_SEQ_PRINTF(seq, " tgid gid fd file\n"); + } + + if (tgid == task->tgid && task->tgid != task->pid) + count++; + + if (last_tgid != task->tgid) { + last_tgid = task->tgid; + unique_tgid_count++; + } + + BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd, + (long)file->f_op); + return 0; + } + +In the above example, the section name ``SEC(iter/task_file)``, indicates that +the program is a BPF iterator program to iterate all files from all tasks. The +context of the program is ``bpf_iter__task_file`` struct. + +The user space program invokes the BPF iterator program running in the kernel +by issuing a ``read()`` syscall. Once invoked, the BPF +program can export data to user space using a variety of BPF helper functions. +You can use either ``bpf_seq_printf()`` (and BPF_SEQ_PRINTF helper macro) or +``bpf_seq_write()`` function based on whether you need formatted output or just +binary data, respectively. For binary-encoded data, the user space applications +can process the data from ``bpf_seq_write()`` as needed. For the formatted data, +you can use ``cat `` to print the results similar to ``cat +/proc/net/netlink`` after pinning the BPF iterator to the bpffs mount. Later, +use ``rm -f `` to remove the pinned iterator. + +For example, you can use the following command to create a BPF iterator from the +``bpf_iter_ipv6_route.o`` object file and pin it to the ``/sys/fs/bpf/my_route`` +path: + +:: + + $ bpftool iter pin ./bpf_iter_ipv6_route.o /sys/fs/bpf/my_route + +And then print out the results using the following command: + +:: + + $ cat /sys/fs/bpf/my_route + + +------------------------------------------------------- +Implement Kernel Support for BPF Iterator Program Types +------------------------------------------------------- + +To implement a BPF iterator in the kernel, the developer must make a one-time +change to the following key data structure defined in the `bpf.h +`_ +file. + +:: + + struct bpf_iter_reg { + const char *target; + bpf_iter_attach_target_t attach_target; + bpf_iter_detach_target_t detach_target; + bpf_iter_show_fdinfo_t show_fdinfo; + bpf_iter_fill_link_info_t fill_link_info; + bpf_iter_get_func_proto_t get_func_proto; + u32 ctx_arg_info_size; + u32 feature; + struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; + const struct bpf_iter_seq_info *seq_info; + }; + +After filling the data structure fields, call ``bpf_iter_reg_target()`` to +register the iterator to the main BPF iterator subsystem. + +The following is the breakdown for each field in struct ``bpf_iter_reg``. + +.. list-table:: + :widths: 25 50 + :header-rows: 1 + + * - Fields + - Description + * - target + - Specifies the name of the BPF iterator. For example: ``bpf_map``, + ``bpf_map_elem``. The name should be different from other ``bpf_iter`` target names in the kernel. + * - attach_target and detach_target + - Allows for target specific ``link_create`` action since some targets + may need special processing. Called during the user space link_create stage. 
+ * - show_fdinfo and fill_link_info + - Called to fill target specific information when user tries to get link + info associated with the iterator. + * - get_func_proto + - Permits a BPF iterator to access BPF helpers specific to the iterator. + * - ctx_arg_info_size and ctx_arg_info + - Specifies the verifier states for BPF program arguments associated with + the bpf iterator. + * - feature + - Specifies certain action requests in the kernel BPF iterator + infrastructure. Currently, only BPF_ITER_RESCHED is supported. This means + that the kernel function cond_resched() is called to avoid other kernel + subsystem (e.g., rcu) misbehaving. + * - seq_info + - Specifies certain action requests in the kernel BPF iterator + infrastructure. Currently, only BPF_ITER_RESCHED is supported. This means + that the kernel function cond_resched() is called to avoid other kernel + subsystem (e.g., rcu) misbehaving. + + +`Click here +`_ +to see an implementation of the ``task_vma`` BPF iterator in the kernel. + +--------------------------------- +Parameterizing BPF Task Iterators +--------------------------------- + +By default, BPF iterators walk through all the objects of the specified types +(processes, cgroups, maps, etc.) across the entire system to read relevant +kernel data. But often, there are cases where we only care about a much smaller +subset of iterable kernel objects, such as only iterating tasks within a +specific process. Therefore, BPF iterator programs support filtering out objects +from iteration by allowing user space to configure the iterator program when it +is attached. + +-------------------------- +BPF Task Iterator Program +-------------------------- + +The following code is a BPF iterator program to print files and task information +through the ``seq_file`` of the iterator. It is a standard BPF iterator program +that visits every file of an iterator. We will use this BPF program in our +example later. + +:: + + #include + #include + + char _license[] SEC("license") = "GPL"; + + SEC("iter/task_file") + int dump_task_file(struct bpf_iter__task_file *ctx) + { + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + struct file *file = ctx->file; + __u32 fd = ctx->fd; + if (task == NULL || file == NULL) + return 0; + if (ctx->meta->seq_num == 0) { + BPF_SEQ_PRINTF(seq, " tgid pid fd file\n"); + } + BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd, + (long)file->f_op); + return 0; + } + +---------------------------------------- +Creating a File Iterator with Parameters +---------------------------------------- + +Now, let us look at how to create an iterator that includes only files of a +process. + +First, fill the ``bpf_iter_attach_opts`` struct as shown below: + +:: + + LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + memset(&linfo, 0, sizeof(linfo)); + linfo.task.pid = getpid(); + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + +``linfo.task.pid``, if it is non-zero, directs the kernel to create an iterator +that only includes opened files for the process with the specified ``pid``. In +this example, we will only be iterating files for our process. If +``linfo.task.pid`` is zero, the iterator will visit every opened file of every +process. Similarly, ``linfo.task.tid`` directs the kernel to create an iterator +that visits opened files of a specific thread, not a process. 
In this example, +``linfo.task.tid`` is different from ``linfo.task.pid`` only if the thread has a +separate file descriptor table. In most circumstances, all process threads share +a single file descriptor table. + +Now, in the userspace program, pass the pointer of struct to the +``bpf_program__attach_iter()``. + +:: + + link = bpf_program__attach_iter(prog, &opts); iter_fd = + bpf_iter_create(bpf_link__fd(link)); + +If both *tid* and *pid* are zero, an iterator created from this struct +``bpf_iter_attach_opts`` will include every opened file of every task in the +system (in the namespace, actually.) It is the same as passing a NULL as the +second argument to ``bpf_program__attach_iter()``. + +The whole program looks like the following code: + +:: + + #include + #include + #include + #include + #include "bpf_iter_task_ex.skel.h" + + static int do_read_opts(struct bpf_program *prog, struct bpf_iter_attach_opts *opts) + { + struct bpf_link *link; + char buf[16] = {}; + int iter_fd = -1, len; + int ret = 0; + + link = bpf_program__attach_iter(prog, opts); + if (!link) { + fprintf(stderr, "bpf_program__attach_iter() fails\n"); + return -1; + } + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (iter_fd < 0) { + fprintf(stderr, "bpf_iter_create() fails\n"); + ret = -1; + goto free_link; + } + /* not check contents, but ensure read() ends without error */ + while ((len = read(iter_fd, buf, sizeof(buf) - 1)) > 0) { + buf[len] = 0; + printf("%s", buf); + } + printf("\n"); + free_link: + if (iter_fd >= 0) + close(iter_fd); + bpf_link__destroy(link); + return 0; + } + + static void test_task_file(void) + { + LIBBPF_OPTS(bpf_iter_attach_opts, opts); + struct bpf_iter_task_ex *skel; + union bpf_iter_link_info linfo; + skel = bpf_iter_task_ex__open_and_load(); + if (skel == NULL) + return; + memset(&linfo, 0, sizeof(linfo)); + linfo.task.pid = getpid(); + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + printf("PID %d\n", getpid()); + do_read_opts(skel->progs.dump_task_file, &opts); + bpf_iter_task_ex__destroy(skel); + } + + int main(int argc, const char * const * argv) + { + test_task_file(); + return 0; + } + +The following lines are the output of the program. +:: + + PID 1859 + + tgid pid fd file + 1859 1859 0 ffffffff82270aa0 + 1859 1859 1 ffffffff82270aa0 + 1859 1859 2 ffffffff82270aa0 + 1859 1859 3 ffffffff82272980 + 1859 1859 4 ffffffff8225e120 + 1859 1859 5 ffffffff82255120 + 1859 1859 6 ffffffff82254f00 + 1859 1859 7 ffffffff82254d80 + 1859 1859 8 ffffffff8225abe0 + +------------------ +Without Parameters +------------------ + +Let us look at how a BPF iterator without parameters skips files of other +processes in the system. In this case, the BPF program has to check the pid or +the tid of tasks, or it will receive every opened file in the system (in the +current *pid* namespace, actually). So, we usually add a global variable in the +BPF program to pass a *pid* to the BPF program. + +The BPF program would look like the following block. + + :: + + ...... + int target_pid = 0; + + SEC("iter/task_file") + int dump_task_file(struct bpf_iter__task_file *ctx) + { + ...... + if (task->tgid != target_pid) /* Check task->pid instead to check thread IDs */ + return 0; + BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd, + (long)file->f_op); + return 0; + } + +The user space program would look like the following block: + + :: + + ...... + static void test_task_file(void) + { + ...... 
+ skel = bpf_iter_task_ex__open_and_load(); + if (skel == NULL) + return; + skel->bss->target_pid = getpid(); /* process ID. For thread id, use gettid() */ + memset(&linfo, 0, sizeof(linfo)); + linfo.task.pid = getpid(); + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + ...... + } + +``target_pid`` is a global variable in the BPF program. The user space program +should initialize the variable with a process ID to skip opened files of other +processes in the BPF program. When you parametrize a BPF iterator, the iterator +calls the BPF program fewer times which can save significant resources. + +--------------------------- +Parametrizing VMA Iterators +--------------------------- + +By default, a BPF VMA iterator includes every VMA in every process. However, +you can still specify a process or a thread to include only its VMAs. Unlike +files, a thread can not have a separate address space (since Linux 2.6.0-test6). +Here, using *tid* makes no difference from using *pid*. + +---------------------------- +Parametrizing Task Iterators +---------------------------- + +A BPF task iterator with *pid* includes all tasks (threads) of a process. The +BPF program receives these tasks one after another. You can specify a BPF task +iterator with *tid* parameter to include only the tasks that match the given +*tid*. diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 1088d44634d6..b81533d8b061 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -24,6 +24,7 @@ that goes into great technical depth about the BPF Architecture. maps bpf_prog_run classic_vs_extended.rst + bpf_iterators bpf_licensing test_debug clang-notes -- cgit v1.2.3 From 2c40d97da1a25be815c91d6ebcd734d96d5732e8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 30 Nov 2022 21:04:44 -0800 Subject: bpf: Enable sleeptable support for cgrp local storage Similar to sk/inode/task local storage, enable sleepable support for cgrp local storage. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221201050444.2785007-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 66b82f52b5bc..1d51bd9596da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14154,10 +14154,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: + case BPF_MAP_TYPE_CGRP_STORAGE: break; default: verbose(env, - "Sleepable programs can only use array, hash, and ringbuf maps\n"); + "Sleepable programs can only use array, hash, ringbuf and local storage maps\n"); return -EINVAL; } -- cgit v1.2.3 From 41d76c721c5c743470078d30e9bb8df08c489b1c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 30 Nov 2022 21:04:49 -0800 Subject: bpf: Add sleepable prog tests for cgrp local storage Add three tests for cgrp local storage support for sleepable progs. Two tests can load and run properly, one for cgroup_iter, another for passing current->cgroups->dfl_cgrp to bpf_cgrp_storage_get() helper. One test has bpf_rcu_read_lock() and failed to load. 
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221201050449.2785613-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/cgrp_local_storage.c | 94 ++++++++++++++++++++++ .../selftests/bpf/progs/cgrp_ls_sleepable.c | 80 ++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c index 1c30412ba132..33a2776737e7 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c @@ -10,7 +10,9 @@ #include "cgrp_ls_recursion.skel.h" #include "cgrp_ls_attach_cgroup.skel.h" #include "cgrp_ls_negative.skel.h" +#include "cgrp_ls_sleepable.skel.h" #include "network_helpers.h" +#include "cgroup_helpers.h" struct socket_cookie { __u64 cookie_key; @@ -150,14 +152,100 @@ static void test_negative(void) } } +static void test_cgroup_iter_sleepable(int cgroup_fd, __u64 cgroup_id) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + struct cgrp_ls_sleepable *skel; + struct bpf_link *link; + int err, iter_fd; + char buf[16]; + + skel = cgrp_ls_sleepable__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + bpf_program__set_autoload(skel->progs.cgroup_iter, true); + err = cgrp_ls_sleepable__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto out; + + memset(&linfo, 0, sizeof(linfo)); + linfo.cgroup.cgroup_fd = cgroup_fd; + linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + link = bpf_program__attach_iter(skel->progs.cgroup_iter, &opts); + if (!ASSERT_OK_PTR(link, "attach_iter")) + goto out; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_GE(iter_fd, 0, "iter_create")) + goto out; + + /* trigger the program run */ + (void)read(iter_fd, buf, sizeof(buf)); + + ASSERT_EQ(skel->bss->cgroup_id, cgroup_id, "cgroup_id"); + + close(iter_fd); +out: + cgrp_ls_sleepable__destroy(skel); +} + +static void test_no_rcu_lock(__u64 cgroup_id) +{ + struct cgrp_ls_sleepable *skel; + int err; + + skel = cgrp_ls_sleepable__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->bss->target_pid = syscall(SYS_gettid); + + bpf_program__set_autoload(skel->progs.no_rcu_lock, true); + err = cgrp_ls_sleepable__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto out; + + err = cgrp_ls_sleepable__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->cgroup_id, cgroup_id, "cgroup_id"); +out: + cgrp_ls_sleepable__destroy(skel); +} + +static void test_rcu_lock(void) +{ + struct cgrp_ls_sleepable *skel; + int err; + + skel = cgrp_ls_sleepable__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + bpf_program__set_autoload(skel->progs.yes_rcu_lock, true); + err = cgrp_ls_sleepable__load(skel); + ASSERT_ERR(err, "skel_load"); + + cgrp_ls_sleepable__destroy(skel); +} + void test_cgrp_local_storage(void) { + __u64 cgroup_id; int cgroup_fd; cgroup_fd = test__join_cgroup("/cgrp_local_storage"); if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /cgrp_local_storage")) return; + cgroup_id = get_cgroup_id("/cgrp_local_storage"); if (test__start_subtest("tp_btf")) test_tp_btf(cgroup_fd); if (test__start_subtest("attach_cgroup")) @@ -166,6 +254,12 @@ void test_cgrp_local_storage(void) test_recursion(cgroup_fd); if 
(test__start_subtest("negative")) test_negative(); + if (test__start_subtest("cgroup_iter_sleepable")) + test_cgroup_iter_sleepable(cgroup_fd, cgroup_id); + if (test__start_subtest("no_rcu_lock")) + test_no_rcu_lock(cgroup_id); + if (test__start_subtest("rcu_lock")) + test_rcu_lock(); close(cgroup_fd); } diff --git a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c new file mode 100644 index 000000000000..2d11ed528b6f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include "bpf_iter.h" +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} map_a SEC(".maps"); + +__u32 target_pid; +__u64 cgroup_id; + +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + +SEC("?iter.s/cgroup") +int cgroup_iter(struct bpf_iter__cgroup *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct cgroup *cgrp = ctx->cgroup; + long *ptr; + + if (cgrp == NULL) + return 0; + + ptr = bpf_cgrp_storage_get(&map_a, cgrp, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + cgroup_id = cgrp->kn->id; + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int no_rcu_lock(void *ctx) +{ + struct task_struct *task; + struct cgroup *cgrp; + long *ptr; + + task = bpf_get_current_task_btf(); + if (task->pid != target_pid) + return 0; + + /* ptr_to_btf_id semantics. should work. */ + cgrp = task->cgroups->dfl_cgrp; + ptr = bpf_cgrp_storage_get(&map_a, cgrp, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + cgroup_id = cgrp->kn->id; + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int yes_rcu_lock(void *ctx) +{ + struct task_struct *task; + struct cgroup *cgrp; + long *ptr; + + task = bpf_get_current_task_btf(); + if (task->pid != target_pid) + return 0; + + bpf_rcu_read_lock(); + cgrp = task->cgroups->dfl_cgrp; + /* cgrp is untrusted and cannot pass to bpf_cgrp_storage_get() helper. */ + ptr = bpf_cgrp_storage_get(&map_a, cgrp, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + cgroup_id = cgrp->kn->id; + bpf_rcu_read_unlock(); + return 0; +} -- cgit v1.2.3 From 578ce69ffda49d6c1a252490553290d1f27199f0 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 1 Dec 2022 13:39:39 +0100 Subject: bpf: Add dummy type reference to nf_conn___init to fix type deduplication MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_ct_set_nat_info() kfunc is defined in the nf_nat.ko module, and takes as a parameter the nf_conn___init struct, which is allocated through the bpf_xdp_ct_alloc() helper defined in the nf_conntrack.ko module. However, because kernel modules can't deduplicate BTF types between each other, and the nf_conn___init struct is not referenced anywhere in vmlinux BTF, this leads to two distinct BTF IDs for the same type (one in each module). This confuses the verifier, as described here: https://lore.kernel.org/all/87leoh372s.fsf@toke.dk/ As a workaround, add an explicit BTF_TYPE_EMIT for the type in net/filter.c, so the type definition gets included in vmlinux BTF. This way, both modules can refer to the same type ID (as they both build on top of vmlinux BTF), and the verifier is no longer confused. 
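For context on the macro used below: BTF_TYPE_EMIT() materializes a throwaway, compile-time-only reference to a type so that its definition is recorded in the object's BTF even though nothing else in vmlinux uses it. In simplified form (the kernel's definition in include/linux/btf.h may differ in detail):

    #define BTF_TYPE_EMIT(type) ((void)(type *)0)
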
v2: - Use BTF_TYPE_EMIT (which is a statement so it has to be inside a function definition; use xdp_func_proto() for this, since this is mostly xdp-related). Fixes: 820dc0523e05 ("net: netfilter: move bpf_ct_set_nat_info kfunc in nf_nat_bpf.c") Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221201123939.696558-1-toke@redhat.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 37baaa6b8fc3..8607136b6e2c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -80,6 +80,7 @@ #include #include #include +#include static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -7992,6 +7993,19 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) default: return bpf_sk_base_func_proto(func_id); } + +#if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) + /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The + * kfuncs are defined in two different modules, and we want to be able + * to use them interchangably with the same BTF type ID. Because modules + * can't de-duplicate BTF IDs between each other, we need the type to be + * referenced in the vmlinux BTF or the verifier will get confused about + * the different types. So we add this dummy type reference which will + * be included in vmlinux BTF, allowing both modules to refer to the + * same type ID. + */ + BTF_TYPE_EMIT(struct nf_conn___init); +#endif } const struct bpf_func_proto bpf_sock_map_update_proto __weak; -- cgit v1.2.3 From 0a182f8d607464911756b4dbef5d6cad8de22469 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Dec 2022 11:16:40 +0000 Subject: bpf, sockmap: fix race in sock_map_free() sock_map_free() calls release_sock(sk) without owning a reference on the socket. This can cause use-after-free as syzbot found [1] Jakub Sitnicki already took care of a similar issue in sock_hash_free() in commit 75e68e5bf2c7 ("bpf, sockhash: Synchronize delete from bucket list on map free") [1] refcount_t: decrement hit 0; leaking memory. 
WARNING: CPU: 0 PID: 3785 at lib/refcount.c:31 refcount_warn_saturate+0x17c/0x1a0 lib/refcount.c:31 Modules linked in: CPU: 0 PID: 3785 Comm: kworker/u4:6 Not tainted 6.1.0-rc7-syzkaller-00103-gef4d3ea40565 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: events_unbound bpf_map_free_deferred RIP: 0010:refcount_warn_saturate+0x17c/0x1a0 lib/refcount.c:31 Code: 68 8b 31 c0 e8 75 71 15 fd 0f 0b e9 64 ff ff ff e8 d9 6e 4e fd c6 05 62 9c 3d 0a 01 48 c7 c7 80 bb 68 8b 31 c0 e8 54 71 15 fd <0f> 0b e9 43 ff ff ff 89 d9 80 e1 07 80 c1 03 38 c1 0f 8c a2 fe ff RSP: 0018:ffffc9000456fb60 EFLAGS: 00010246 RAX: eae59bab72dcd700 RBX: 0000000000000004 RCX: ffff8880207057c0 RDX: 0000000000000000 RSI: 0000000000000201 RDI: 0000000000000000 RBP: 0000000000000004 R08: ffffffff816fdabd R09: fffff520008adee5 R10: fffff520008adee5 R11: 1ffff920008adee4 R12: 0000000000000004 R13: dffffc0000000000 R14: ffff88807b1c6c00 R15: 1ffff1100f638dcf FS: 0000000000000000(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b30c30000 CR3: 000000000d08e000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __refcount_dec include/linux/refcount.h:344 [inline] refcount_dec include/linux/refcount.h:359 [inline] __sock_put include/net/sock.h:779 [inline] tcp_release_cb+0x2d0/0x360 net/ipv4/tcp_output.c:1092 release_sock+0xaf/0x1c0 net/core/sock.c:3468 sock_map_free+0x219/0x2c0 net/core/sock_map.c:356 process_one_work+0x81c/0xd10 kernel/workqueue.c:2289 worker_thread+0xb14/0x1330 kernel/workqueue.c:2436 kthread+0x266/0x300 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jakub Sitnicki Cc: John Fastabend Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/r/20221202111640.2745533-1-edumazet@google.com Signed-off-by: Alexei Starovoitov --- net/core/sock_map.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 81beb16ab1eb..22fa2c5bc6ec 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -349,11 +349,13 @@ static void sock_map_free(struct bpf_map *map) sk = xchg(psk, NULL); if (sk) { + sock_hold(sk); lock_sock(sk); rcu_read_lock(); sock_map_unref(sk, psk); rcu_read_unlock(); release_sock(sk); + sock_put(sk); } } -- cgit v1.2.3 From ab0350c743d5c93fd88742f02b3dff12168ab435 Mon Sep 17 00:00:00 2001 From: James Hilliard Date: Fri, 2 Dec 2022 18:08:44 -0700 Subject: selftests/bpf: Fix conflicts with built-in functions in bpf_iter_ksym Both tolower and toupper are built in c functions, we should not redefine them as this can result in a build error. 
Fixes the following errors: progs/bpf_iter_ksym.c:10:20: error: conflicting types for built-in function 'tolower'; expected 'int(int)' [-Werror=builtin-declaration-mismatch] 10 | static inline char tolower(char c) | ^~~~~~~ progs/bpf_iter_ksym.c:5:1: note: 'tolower' is declared in header '' 4 | #include +++ |+#include 5 | progs/bpf_iter_ksym.c:17:20: error: conflicting types for built-in function 'toupper'; expected 'int(int)' [-Werror=builtin-declaration-mismatch] 17 | static inline char toupper(char c) | ^~~~~~~ progs/bpf_iter_ksym.c:17:20: note: 'toupper' is declared in header '' See background on this sort of issue: https://stackoverflow.com/a/20582607 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=12213 (C99, 7.1.3p1) "All identifiers with external linkage in any of the following subclauses (including the future library directions) are always reserved for use as identifiers with external linkage." This is documented behavior in GCC: https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html#index-std-2 Signed-off-by: James Hilliard Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221203010847.2191265-1-james.hilliard1@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_iter_ksym.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ksym.c b/tools/testing/selftests/bpf/progs/bpf_iter_ksym.c index 285c008cbf9c..9ba14c37bbcc 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_ksym.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_ksym.c @@ -7,14 +7,14 @@ char _license[] SEC("license") = "GPL"; unsigned long last_sym_value = 0; -static inline char tolower(char c) +static inline char to_lower(char c) { if (c >= 'A' && c <= 'Z') c += ('a' - 'A'); return c; } -static inline char toupper(char c) +static inline char to_upper(char c) { if (c >= 'a' && c <= 'z') c -= ('a' - 'A'); @@ -54,7 +54,7 @@ int dump_ksym(struct bpf_iter__ksym *ctx) type = iter->type; if (iter->module_name[0]) { - type = iter->exported ? toupper(type) : tolower(type); + type = iter->exported ? to_upper(type) : to_lower(type); BPF_SEQ_PRINTF(seq, "0x%llx %c %s [ %s ] ", value, type, iter->name, iter->module_name); } else { -- cgit v1.2.3 From ee9a113ab63468137802898bcd2c598998c96938 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sat, 3 Dec 2022 10:46:56 +0200 Subject: xfrm: interface: rename xfrm_interface.c to xfrm_interface_core.c This change allows adding additional files to the xfrm_interface module. Signed-off-by: Eyal Birger Link: https://lore.kernel.org/r/20221203084659.1837829-2-eyal.birger@gmail.com Signed-off-by: Martin KaFai Lau --- net/xfrm/Makefile | 2 + net/xfrm/xfrm_interface.c | 1198 ---------------------------------------- net/xfrm/xfrm_interface_core.c | 1198 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1200 insertions(+), 1198 deletions(-) delete mode 100644 net/xfrm/xfrm_interface.c create mode 100644 net/xfrm/xfrm_interface_core.c diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 494aa744bfb9..08a2870fdd36 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -3,6 +3,8 @@ # Makefile for the XFRM subsystem. 
# +xfrm_interface-$(CONFIG_XFRM_INTERFACE) += xfrm_interface_core.o + obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ xfrm_input.o xfrm_output.o \ xfrm_sysctl.o xfrm_replay.o xfrm_device.o diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c deleted file mode 100644 index 5a67b120c4db..000000000000 --- a/net/xfrm/xfrm_interface.c +++ /dev/null @@ -1,1198 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * XFRM virtual interface - * - * Copyright (C) 2018 secunet Security Networks AG - * - * Author: - * Steffen Klassert - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int xfrmi_dev_init(struct net_device *dev); -static void xfrmi_dev_setup(struct net_device *dev); -static struct rtnl_link_ops xfrmi_link_ops __read_mostly; -static unsigned int xfrmi_net_id __read_mostly; -static const struct net_device_ops xfrmi_netdev_ops; - -#define XFRMI_HASH_BITS 8 -#define XFRMI_HASH_SIZE BIT(XFRMI_HASH_BITS) - -struct xfrmi_net { - /* lists for storing interfaces in use */ - struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; - struct xfrm_if __rcu *collect_md_xfrmi; -}; - -static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = { - [LWT_XFRM_IF_ID] = NLA_POLICY_MIN(NLA_U32, 1), - [LWT_XFRM_LINK] = NLA_POLICY_MIN(NLA_U32, 1), -}; - -static void xfrmi_destroy_state(struct lwtunnel_state *lwt) -{ -} - -static int xfrmi_build_state(struct net *net, struct nlattr *nla, - unsigned int family, const void *cfg, - struct lwtunnel_state **ts, - struct netlink_ext_ack *extack) -{ - struct nlattr *tb[LWT_XFRM_MAX + 1]; - struct lwtunnel_state *new_state; - struct xfrm_md_info *info; - int ret; - - ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); - if (ret < 0) - return ret; - - if (!tb[LWT_XFRM_IF_ID]) { - NL_SET_ERR_MSG(extack, "if_id must be set"); - return -EINVAL; - } - - new_state = lwtunnel_state_alloc(sizeof(*info)); - if (!new_state) { - NL_SET_ERR_MSG(extack, "failed to create encap info"); - return -ENOMEM; - } - - new_state->type = LWTUNNEL_ENCAP_XFRM; - - info = lwt_xfrm_info(new_state); - - info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); - - if (tb[LWT_XFRM_LINK]) - info->link = nla_get_u32(tb[LWT_XFRM_LINK]); - - *ts = new_state; - return 0; -} - -static int xfrmi_fill_encap_info(struct sk_buff *skb, - struct lwtunnel_state *lwt) -{ - struct xfrm_md_info *info = lwt_xfrm_info(lwt); - - if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || - (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) - return -EMSGSIZE; - - return 0; -} - -static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) -{ - return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ - nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ -} - -static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) -{ - struct xfrm_md_info *a_info = lwt_xfrm_info(a); - struct xfrm_md_info *b_info = lwt_xfrm_info(b); - - return memcmp(a_info, b_info, sizeof(*a_info)); -} - -static const struct lwtunnel_encap_ops xfrmi_encap_ops = { - .build_state = xfrmi_build_state, - .destroy_state = xfrmi_destroy_state, - .fill_encap = xfrmi_fill_encap_info, - .get_encap_size = xfrmi_encap_nlsize, - .cmp_encap = xfrmi_encap_cmp, - .owner = THIS_MODULE, -}; - -#define 
for_each_xfrmi_rcu(start, xi) \ - for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) - -static u32 xfrmi_hash(u32 if_id) -{ - return hash_32(if_id, XFRMI_HASH_BITS); -} - -static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) -{ - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - struct xfrm_if *xi; - - for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { - if (x->if_id == xi->p.if_id && - (xi->dev->flags & IFF_UP)) - return xi; - } - - xi = rcu_dereference(xfrmn->collect_md_xfrmi); - if (xi && (xi->dev->flags & IFF_UP)) - return xi; - - return NULL; -} - -static bool xfrmi_decode_session(struct sk_buff *skb, - unsigned short family, - struct xfrm_if_decode_session_result *res) -{ - struct net_device *dev; - struct xfrm_if *xi; - int ifindex = 0; - - if (!secpath_exists(skb) || !skb->dev) - return false; - - switch (family) { - case AF_INET6: - ifindex = inet6_sdif(skb); - break; - case AF_INET: - ifindex = inet_sdif(skb); - break; - } - - if (ifindex) { - struct net *net = xs_net(xfrm_input_state(skb)); - - dev = dev_get_by_index_rcu(net, ifindex); - } else { - dev = skb->dev; - } - - if (!dev || !(dev->flags & IFF_UP)) - return false; - if (dev->netdev_ops != &xfrmi_netdev_ops) - return false; - - xi = netdev_priv(dev); - res->net = xi->net; - - if (xi->p.collect_md) - res->if_id = xfrm_input_state(skb)->if_id; - else - res->if_id = xi->p.if_id; - return true; -} - -static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) -{ - struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; - - rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); - rcu_assign_pointer(*xip, xi); -} - -static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) -{ - struct xfrm_if __rcu **xip; - struct xfrm_if *iter; - - for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; - (iter = rtnl_dereference(*xip)) != NULL; - xip = &iter->next) { - if (xi == iter) { - rcu_assign_pointer(*xip, xi->next); - break; - } - } -} - -static void xfrmi_dev_free(struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - - gro_cells_destroy(&xi->gro_cells); - free_percpu(dev->tstats); -} - -static int xfrmi_create(struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct net *net = dev_net(dev); - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - int err; - - dev->rtnl_link_ops = &xfrmi_link_ops; - err = register_netdevice(dev); - if (err < 0) - goto out; - - if (xi->p.collect_md) - rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); - else - xfrmi_link(xfrmn, xi); - - return 0; - -out: - return err; -} - -static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) -{ - struct xfrm_if __rcu **xip; - struct xfrm_if *xi; - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - - for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; - (xi = rtnl_dereference(*xip)) != NULL; - xip = &xi->next) - if (xi->p.if_id == p->if_id) - return xi; - - return NULL; -} - -static void xfrmi_dev_uninit(struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); - - if (xi->p.collect_md) - RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); - else - xfrmi_unlink(xfrmn, xi); -} - -static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) -{ - skb_clear_tstamp(skb); - skb->pkt_type = PACKET_HOST; - skb->skb_iif = 0; - skb->ignore_df = 0; - skb_dst_drop(skb); - nf_reset_ct(skb); - nf_reset_trace(skb); - - if (!xnet) - return; - - 
ipvs_reset(skb); - secpath_reset(skb); - skb_orphan(skb); - skb->mark = 0; -} - -static int xfrmi_rcv_cb(struct sk_buff *skb, int err) -{ - const struct xfrm_mode *inner_mode; - struct net_device *dev; - struct xfrm_state *x; - struct xfrm_if *xi; - bool xnet; - int link; - - if (err && !secpath_exists(skb)) - return 0; - - x = xfrm_input_state(skb); - - xi = xfrmi_lookup(xs_net(x), x); - if (!xi) - return 1; - - link = skb->dev->ifindex; - dev = xi->dev; - skb->dev = dev; - - if (err) { - dev->stats.rx_errors++; - dev->stats.rx_dropped++; - - return 0; - } - - xnet = !net_eq(xi->net, dev_net(skb->dev)); - - if (xnet) { - inner_mode = &x->inner_mode; - - if (x->sel.family == AF_UNSPEC) { - inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); - if (inner_mode == NULL) { - XFRM_INC_STATS(dev_net(skb->dev), - LINUX_MIB_XFRMINSTATEMODEERROR); - return -EINVAL; - } - } - - if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, - inner_mode->family)) - return -EPERM; - } - - xfrmi_scrub_packet(skb, xnet); - if (xi->p.collect_md) { - struct metadata_dst *md_dst; - - md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); - if (!md_dst) - return -ENOMEM; - - md_dst->u.xfrm_info.if_id = x->if_id; - md_dst->u.xfrm_info.link = link; - skb_dst_set(skb, (struct dst_entry *)md_dst); - } - dev_sw_netstats_rx_add(dev, skb->len); - - return 0; -} - -static int -xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct net_device_stats *stats = &xi->dev->stats; - struct dst_entry *dst = skb_dst(skb); - unsigned int length = skb->len; - struct net_device *tdev; - struct xfrm_state *x; - int err = -1; - u32 if_id; - int mtu; - - if (xi->p.collect_md) { - struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); - - if (unlikely(!md_info)) - return -EINVAL; - - if_id = md_info->if_id; - fl->flowi_oif = md_info->link; - } else { - if_id = xi->p.if_id; - } - - dst_hold(dst); - dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - dst = NULL; - goto tx_err_link_failure; - } - - x = dst->xfrm; - if (!x) - goto tx_err_link_failure; - - if (x->if_id != if_id) - goto tx_err_link_failure; - - tdev = dst->dev; - - if (tdev == dev) { - stats->collisions++; - net_warn_ratelimited("%s: Local routing loop detected!\n", - dev->name); - goto tx_err_dst_release; - } - - mtu = dst_mtu(dst); - if ((!skb_is_gso(skb) && skb->len > mtu) || - (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { - skb_dst_update_pmtu_no_confirm(skb, mtu); - - if (skb->protocol == htons(ETH_P_IPV6)) { - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - - if (skb->len > 1280) - icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - else - goto xmit; - } else { - if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) - goto xmit; - icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - } - - dst_release(dst); - return -EMSGSIZE; - } - -xmit: - xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); - skb_dst_set(skb, dst); - skb->dev = tdev; - - err = dst_output(xi->net, skb->sk, skb); - if (net_xmit_eval(err) == 0) { - dev_sw_netstats_tx_add(dev, 1, length); - } else { - stats->tx_errors++; - stats->tx_aborted_errors++; - } - - return 0; -tx_err_link_failure: - stats->tx_carrier_errors++; - dst_link_failure(skb); -tx_err_dst_release: - dst_release(dst); - return err; -} - -static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct 
net_device_stats *stats = &xi->dev->stats; - struct dst_entry *dst = skb_dst(skb); - struct flowi fl; - int ret; - - memset(&fl, 0, sizeof(fl)); - - switch (skb->protocol) { - case htons(ETH_P_IPV6): - xfrm_decode_session(skb, &fl, AF_INET6); - memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); - if (!dst) { - fl.u.ip6.flowi6_oif = dev->ifindex; - fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; - dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); - if (dst->error) { - dst_release(dst); - stats->tx_carrier_errors++; - goto tx_err; - } - skb_dst_set(skb, dst); - } - break; - case htons(ETH_P_IP): - xfrm_decode_session(skb, &fl, AF_INET); - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - if (!dst) { - struct rtable *rt; - - fl.u.ip4.flowi4_oif = dev->ifindex; - fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; - rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); - if (IS_ERR(rt)) { - stats->tx_carrier_errors++; - goto tx_err; - } - skb_dst_set(skb, &rt->dst); - } - break; - default: - goto tx_err; - } - - fl.flowi_oif = xi->p.link; - - ret = xfrmi_xmit2(skb, dev, &fl); - if (ret < 0) - goto tx_err; - - return NETDEV_TX_OK; - -tx_err: - stats->tx_errors++; - stats->tx_dropped++; - kfree_skb(skb); - return NETDEV_TX_OK; -} - -static int xfrmi4_err(struct sk_buff *skb, u32 info) -{ - const struct iphdr *iph = (const struct iphdr *)skb->data; - struct net *net = dev_net(skb->dev); - int protocol = iph->protocol; - struct ip_comp_hdr *ipch; - struct ip_esp_hdr *esph; - struct ip_auth_hdr *ah ; - struct xfrm_state *x; - struct xfrm_if *xi; - __be32 spi; - - switch (protocol) { - case IPPROTO_ESP: - esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); - spi = esph->spi; - break; - case IPPROTO_AH: - ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); - spi = ah->spi; - break; - case IPPROTO_COMP: - ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); - spi = htonl(ntohs(ipch->cpi)); - break; - default: - return 0; - } - - switch (icmp_hdr(skb)->type) { - case ICMP_DEST_UNREACH: - if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return 0; - break; - case ICMP_REDIRECT: - break; - default: - return 0; - } - - x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, - spi, protocol, AF_INET); - if (!x) - return 0; - - xi = xfrmi_lookup(net, x); - if (!xi) { - xfrm_state_put(x); - return -1; - } - - if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) - ipv4_update_pmtu(skb, net, info, 0, protocol); - else - ipv4_redirect(skb, net, 0, protocol); - xfrm_state_put(x); - - return 0; -} - -static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) -{ - const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; - struct net *net = dev_net(skb->dev); - int protocol = iph->nexthdr; - struct ip_comp_hdr *ipch; - struct ip_esp_hdr *esph; - struct ip_auth_hdr *ah; - struct xfrm_state *x; - struct xfrm_if *xi; - __be32 spi; - - switch (protocol) { - case IPPROTO_ESP: - esph = (struct ip_esp_hdr *)(skb->data + offset); - spi = esph->spi; - break; - case IPPROTO_AH: - ah = (struct ip_auth_hdr *)(skb->data + offset); - spi = ah->spi; - break; - case IPPROTO_COMP: - ipch = (struct ip_comp_hdr *)(skb->data + offset); - spi = htonl(ntohs(ipch->cpi)); - break; - default: - return 0; - } - - if (type != ICMPV6_PKT_TOOBIG && - type != NDISC_REDIRECT) - return 0; - - x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, - spi, protocol, AF_INET6); - if (!x) - return 0; - - xi = xfrmi_lookup(net, x); - if (!xi) { - xfrm_state_put(x); - return 
-1; - } - - if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, skb->dev->ifindex, 0, - sock_net_uid(net, NULL)); - else - ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); - xfrm_state_put(x); - - return 0; -} - -static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) -{ - if (xi->p.link != p->link) - return -EINVAL; - - xi->p.if_id = p->if_id; - - return 0; -} - -static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) -{ - struct net *net = xi->net; - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - int err; - - xfrmi_unlink(xfrmn, xi); - synchronize_net(); - err = xfrmi_change(xi, p); - xfrmi_link(xfrmn, xi); - netdev_state_change(xi->dev); - return err; -} - -static int xfrmi_get_iflink(const struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - - return xi->p.link; -} - -static const struct net_device_ops xfrmi_netdev_ops = { - .ndo_init = xfrmi_dev_init, - .ndo_uninit = xfrmi_dev_uninit, - .ndo_start_xmit = xfrmi_xmit, - .ndo_get_stats64 = dev_get_tstats64, - .ndo_get_iflink = xfrmi_get_iflink, -}; - -static void xfrmi_dev_setup(struct net_device *dev) -{ - dev->netdev_ops = &xfrmi_netdev_ops; - dev->header_ops = &ip_tunnel_header_ops; - dev->type = ARPHRD_NONE; - dev->mtu = ETH_DATA_LEN; - dev->min_mtu = ETH_MIN_MTU; - dev->max_mtu = IP_MAX_MTU; - dev->flags = IFF_NOARP; - dev->needs_free_netdev = true; - dev->priv_destructor = xfrmi_dev_free; - netif_keep_dst(dev); - - eth_broadcast_addr(dev->broadcast); -} - -#define XFRMI_FEATURES (NETIF_F_SG | \ - NETIF_F_FRAGLIST | \ - NETIF_F_GSO_SOFTWARE | \ - NETIF_F_HW_CSUM) - -static int xfrmi_dev_init(struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); - int err; - - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - - err = gro_cells_init(&xi->gro_cells, dev); - if (err) { - free_percpu(dev->tstats); - return err; - } - - dev->features |= NETIF_F_LLTX; - dev->features |= XFRMI_FEATURES; - dev->hw_features |= XFRMI_FEATURES; - - if (phydev) { - dev->needed_headroom = phydev->needed_headroom; - dev->needed_tailroom = phydev->needed_tailroom; - - if (is_zero_ether_addr(dev->dev_addr)) - eth_hw_addr_inherit(dev, phydev); - if (is_zero_ether_addr(dev->broadcast)) - memcpy(dev->broadcast, phydev->broadcast, - dev->addr_len); - } else { - eth_hw_addr_random(dev); - eth_broadcast_addr(dev->broadcast); - } - - return 0; -} - -static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -{ - return 0; -} - -static void xfrmi_netlink_parms(struct nlattr *data[], - struct xfrm_if_parms *parms) -{ - memset(parms, 0, sizeof(*parms)); - - if (!data) - return; - - if (data[IFLA_XFRM_LINK]) - parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); - - if (data[IFLA_XFRM_IF_ID]) - parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); - - if (data[IFLA_XFRM_COLLECT_METADATA]) - parms->collect_md = true; -} - -static int xfrmi_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -{ - struct net *net = dev_net(dev); - struct xfrm_if_parms p = {}; - struct xfrm_if *xi; - int err; - - xfrmi_netlink_parms(data, &p); - if (p.collect_md) { - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - - if (p.link || p.if_id) { - NL_SET_ERR_MSG(extack, "link and if_id must be zero"); - return -EINVAL; - } - - if 
(rtnl_dereference(xfrmn->collect_md_xfrmi)) - return -EEXIST; - - } else { - if (!p.if_id) { - NL_SET_ERR_MSG(extack, "if_id must be non zero"); - return -EINVAL; - } - - xi = xfrmi_locate(net, &p); - if (xi) - return -EEXIST; - } - - xi = netdev_priv(dev); - xi->p = p; - xi->net = net; - xi->dev = dev; - - err = xfrmi_create(dev); - return err; -} - -static void xfrmi_dellink(struct net_device *dev, struct list_head *head) -{ - unregister_netdevice_queue(dev, head); -} - -static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[], - struct netlink_ext_ack *extack) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct net *net = xi->net; - struct xfrm_if_parms p = {}; - - xfrmi_netlink_parms(data, &p); - if (!p.if_id) { - NL_SET_ERR_MSG(extack, "if_id must be non zero"); - return -EINVAL; - } - - if (p.collect_md) { - NL_SET_ERR_MSG(extack, "collect_md can't be changed"); - return -EINVAL; - } - - xi = xfrmi_locate(net, &p); - if (!xi) { - xi = netdev_priv(dev); - } else { - if (xi->dev != dev) - return -EEXIST; - if (xi->p.collect_md) { - NL_SET_ERR_MSG(extack, - "device can't be changed to collect_md"); - return -EINVAL; - } - } - - return xfrmi_update(xi, &p); -} - -static size_t xfrmi_get_size(const struct net_device *dev) -{ - return - /* IFLA_XFRM_LINK */ - nla_total_size(4) + - /* IFLA_XFRM_IF_ID */ - nla_total_size(4) + - /* IFLA_XFRM_COLLECT_METADATA */ - nla_total_size(0) + - 0; -} - -static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct xfrm_if_parms *parm = &xi->p; - - if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || - nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || - (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -static struct net *xfrmi_get_link_net(const struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - - return xi->net; -} - -static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { - [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, - [IFLA_XFRM_LINK] = { .type = NLA_U32 }, - [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, - [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, -}; - -static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { - .kind = "xfrm", - .maxtype = IFLA_XFRM_MAX, - .policy = xfrmi_policy, - .priv_size = sizeof(struct xfrm_if), - .setup = xfrmi_dev_setup, - .validate = xfrmi_validate, - .newlink = xfrmi_newlink, - .dellink = xfrmi_dellink, - .changelink = xfrmi_changelink, - .get_size = xfrmi_get_size, - .fill_info = xfrmi_fill_info, - .get_link_net = xfrmi_get_link_net, -}; - -static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) -{ - struct net *net; - LIST_HEAD(list); - - rtnl_lock(); - list_for_each_entry(net, net_exit_list, exit_list) { - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - struct xfrm_if __rcu **xip; - struct xfrm_if *xi; - int i; - - for (i = 0; i < XFRMI_HASH_SIZE; i++) { - for (xip = &xfrmn->xfrmi[i]; - (xi = rtnl_dereference(*xip)) != NULL; - xip = &xi->next) - unregister_netdevice_queue(xi->dev, &list); - } - xi = rtnl_dereference(xfrmn->collect_md_xfrmi); - if (xi) - unregister_netdevice_queue(xi->dev, &list); - } - unregister_netdevice_many(&list); - rtnl_unlock(); -} - -static struct pernet_operations xfrmi_net_ops = { - .exit_batch = xfrmi_exit_batch_net, - .id = &xfrmi_net_id, - .size = sizeof(struct xfrmi_net), -}; - 
-static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi6_err, - .priority = 10, -}; - -static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi6_err, - .priority = 10, -}; - -static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi6_err, - .priority = 10, -}; - -#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) -static int xfrmi6_rcv_tunnel(struct sk_buff *skb) -{ - const xfrm_address_t *saddr; - __be32 spi; - - saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; - spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); - - return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); -} - -static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { - .handler = xfrmi6_rcv_tunnel, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi6_err, - .priority = 2, -}; - -static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { - .handler = xfrmi6_rcv_tunnel, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi6_err, - .priority = 2, -}; -#endif - -static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi4_err, - .priority = 10, -}; - -static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi4_err, - .priority = 10, -}; - -static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi4_err, - .priority = 10, -}; - -#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) -static int xfrmi4_rcv_tunnel(struct sk_buff *skb) -{ - return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); -} - -static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { - .handler = xfrmi4_rcv_tunnel, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi4_err, - .priority = 3, -}; - -static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { - .handler = xfrmi4_rcv_tunnel, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi4_err, - .priority = 2, -}; -#endif - -static int __init xfrmi4_init(void) -{ - int err; - - err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); - if (err < 0) - goto xfrm_proto_esp_failed; - err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); - if (err < 0) - goto xfrm_proto_ah_failed; - err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); - if (err < 0) - goto xfrm_proto_comp_failed; -#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) - err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); - if (err < 0) - goto xfrm_tunnel_ipip_failed; - err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); - if (err < 0) - goto xfrm_tunnel_ipip6_failed; -#endif - - return 0; - -#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) -xfrm_tunnel_ipip6_failed: - xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); -xfrm_tunnel_ipip_failed: - xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); -#endif -xfrm_proto_comp_failed: - xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); -xfrm_proto_ah_failed: - xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); 
-xfrm_proto_esp_failed: - return err; -} - -static void xfrmi4_fini(void) -{ -#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) - xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); - xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); -#endif - xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); - xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); - xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); -} - -static int __init xfrmi6_init(void) -{ - int err; - - err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); - if (err < 0) - goto xfrm_proto_esp_failed; - err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); - if (err < 0) - goto xfrm_proto_ah_failed; - err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); - if (err < 0) - goto xfrm_proto_comp_failed; -#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) - err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); - if (err < 0) - goto xfrm_tunnel_ipv6_failed; - err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); - if (err < 0) - goto xfrm_tunnel_ip6ip_failed; -#endif - - return 0; - -#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) -xfrm_tunnel_ip6ip_failed: - xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); -xfrm_tunnel_ipv6_failed: - xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); -#endif -xfrm_proto_comp_failed: - xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); -xfrm_proto_ah_failed: - xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); -xfrm_proto_esp_failed: - return err; -} - -static void xfrmi6_fini(void) -{ -#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) - xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); - xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); -#endif - xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); - xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); - xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); -} - -static const struct xfrm_if_cb xfrm_if_cb = { - .decode_session = xfrmi_decode_session, -}; - -static int __init xfrmi_init(void) -{ - const char *msg; - int err; - - pr_info("IPsec XFRM device driver\n"); - - msg = "tunnel device"; - err = register_pernet_device(&xfrmi_net_ops); - if (err < 0) - goto pernet_dev_failed; - - msg = "xfrm4 protocols"; - err = xfrmi4_init(); - if (err < 0) - goto xfrmi4_failed; - - msg = "xfrm6 protocols"; - err = xfrmi6_init(); - if (err < 0) - goto xfrmi6_failed; - - - msg = "netlink interface"; - err = rtnl_link_register(&xfrmi_link_ops); - if (err < 0) - goto rtnl_link_failed; - - lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); - - xfrm_if_register_cb(&xfrm_if_cb); - - return err; - -rtnl_link_failed: - xfrmi6_fini(); -xfrmi6_failed: - xfrmi4_fini(); -xfrmi4_failed: - unregister_pernet_device(&xfrmi_net_ops); -pernet_dev_failed: - pr_err("xfrmi init: failed to register %s\n", msg); - return err; -} - -static void __exit xfrmi_fini(void) -{ - xfrm_if_unregister_cb(); - lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); - rtnl_link_unregister(&xfrmi_link_ops); - xfrmi4_fini(); - xfrmi6_fini(); - unregister_pernet_device(&xfrmi_net_ops); -} - -module_init(xfrmi_init); -module_exit(xfrmi_fini); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_RTNL_LINK("xfrm"); -MODULE_ALIAS_NETDEV("xfrm0"); -MODULE_AUTHOR("Steffen Klassert"); -MODULE_DESCRIPTION("XFRM virtual interface"); diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c new file mode 100644 index 000000000000..5a67b120c4db --- 
/dev/null +++ b/net/xfrm/xfrm_interface_core.c @@ -0,0 +1,1198 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * XFRM virtual interface + * + * Copyright (C) 2018 secunet Security Networks AG + * + * Author: + * Steffen Klassert + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int xfrmi_dev_init(struct net_device *dev); +static void xfrmi_dev_setup(struct net_device *dev); +static struct rtnl_link_ops xfrmi_link_ops __read_mostly; +static unsigned int xfrmi_net_id __read_mostly; +static const struct net_device_ops xfrmi_netdev_ops; + +#define XFRMI_HASH_BITS 8 +#define XFRMI_HASH_SIZE BIT(XFRMI_HASH_BITS) + +struct xfrmi_net { + /* lists for storing interfaces in use */ + struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; + struct xfrm_if __rcu *collect_md_xfrmi; +}; + +static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = { + [LWT_XFRM_IF_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [LWT_XFRM_LINK] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +static void xfrmi_destroy_state(struct lwtunnel_state *lwt) +{ +} + +static int xfrmi_build_state(struct net *net, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWT_XFRM_MAX + 1]; + struct lwtunnel_state *new_state; + struct xfrm_md_info *info; + int ret; + + ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); + if (ret < 0) + return ret; + + if (!tb[LWT_XFRM_IF_ID]) { + NL_SET_ERR_MSG(extack, "if_id must be set"); + return -EINVAL; + } + + new_state = lwtunnel_state_alloc(sizeof(*info)); + if (!new_state) { + NL_SET_ERR_MSG(extack, "failed to create encap info"); + return -ENOMEM; + } + + new_state->type = LWTUNNEL_ENCAP_XFRM; + + info = lwt_xfrm_info(new_state); + + info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); + + if (tb[LWT_XFRM_LINK]) + info->link = nla_get_u32(tb[LWT_XFRM_LINK]); + + *ts = new_state; + return 0; +} + +static int xfrmi_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwt) +{ + struct xfrm_md_info *info = lwt_xfrm_info(lwt); + + if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || + (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) + return -EMSGSIZE; + + return 0; +} + +static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ + nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ +} + +static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct xfrm_md_info *a_info = lwt_xfrm_info(a); + struct xfrm_md_info *b_info = lwt_xfrm_info(b); + + return memcmp(a_info, b_info, sizeof(*a_info)); +} + +static const struct lwtunnel_encap_ops xfrmi_encap_ops = { + .build_state = xfrmi_build_state, + .destroy_state = xfrmi_destroy_state, + .fill_encap = xfrmi_fill_encap_info, + .get_encap_size = xfrmi_encap_nlsize, + .cmp_encap = xfrmi_encap_cmp, + .owner = THIS_MODULE, +}; + +#define for_each_xfrmi_rcu(start, xi) \ + for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) + +static u32 xfrmi_hash(u32 if_id) +{ + return hash_32(if_id, XFRMI_HASH_BITS); +} + +static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) +{ + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + 
struct xfrm_if *xi; + + for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { + if (x->if_id == xi->p.if_id && + (xi->dev->flags & IFF_UP)) + return xi; + } + + xi = rcu_dereference(xfrmn->collect_md_xfrmi); + if (xi && (xi->dev->flags & IFF_UP)) + return xi; + + return NULL; +} + +static bool xfrmi_decode_session(struct sk_buff *skb, + unsigned short family, + struct xfrm_if_decode_session_result *res) +{ + struct net_device *dev; + struct xfrm_if *xi; + int ifindex = 0; + + if (!secpath_exists(skb) || !skb->dev) + return false; + + switch (family) { + case AF_INET6: + ifindex = inet6_sdif(skb); + break; + case AF_INET: + ifindex = inet_sdif(skb); + break; + } + + if (ifindex) { + struct net *net = xs_net(xfrm_input_state(skb)); + + dev = dev_get_by_index_rcu(net, ifindex); + } else { + dev = skb->dev; + } + + if (!dev || !(dev->flags & IFF_UP)) + return false; + if (dev->netdev_ops != &xfrmi_netdev_ops) + return false; + + xi = netdev_priv(dev); + res->net = xi->net; + + if (xi->p.collect_md) + res->if_id = xfrm_input_state(skb)->if_id; + else + res->if_id = xi->p.if_id; + return true; +} + +static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) +{ + struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; + + rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); + rcu_assign_pointer(*xip, xi); +} + +static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) +{ + struct xfrm_if __rcu **xip; + struct xfrm_if *iter; + + for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; + (iter = rtnl_dereference(*xip)) != NULL; + xip = &iter->next) { + if (xi == iter) { + rcu_assign_pointer(*xip, xi->next); + break; + } + } +} + +static void xfrmi_dev_free(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + + gro_cells_destroy(&xi->gro_cells); + free_percpu(dev->tstats); +} + +static int xfrmi_create(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net *net = dev_net(dev); + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + int err; + + dev->rtnl_link_ops = &xfrmi_link_ops; + err = register_netdevice(dev); + if (err < 0) + goto out; + + if (xi->p.collect_md) + rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); + else + xfrmi_link(xfrmn, xi); + + return 0; + +out: + return err; +} + +static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) +{ + struct xfrm_if __rcu **xip; + struct xfrm_if *xi; + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + + for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; + (xi = rtnl_dereference(*xip)) != NULL; + xip = &xi->next) + if (xi->p.if_id == p->if_id) + return xi; + + return NULL; +} + +static void xfrmi_dev_uninit(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); + + if (xi->p.collect_md) + RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); + else + xfrmi_unlink(xfrmn, xi); +} + +static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) +{ + skb_clear_tstamp(skb); + skb->pkt_type = PACKET_HOST; + skb->skb_iif = 0; + skb->ignore_df = 0; + skb_dst_drop(skb); + nf_reset_ct(skb); + nf_reset_trace(skb); + + if (!xnet) + return; + + ipvs_reset(skb); + secpath_reset(skb); + skb_orphan(skb); + skb->mark = 0; +} + +static int xfrmi_rcv_cb(struct sk_buff *skb, int err) +{ + const struct xfrm_mode *inner_mode; + struct net_device *dev; + struct xfrm_state *x; + struct xfrm_if *xi; + bool xnet; + int link; + + if (err && !secpath_exists(skb)) + return 0; + + x = 
xfrm_input_state(skb); + + xi = xfrmi_lookup(xs_net(x), x); + if (!xi) + return 1; + + link = skb->dev->ifindex; + dev = xi->dev; + skb->dev = dev; + + if (err) { + dev->stats.rx_errors++; + dev->stats.rx_dropped++; + + return 0; + } + + xnet = !net_eq(xi->net, dev_net(skb->dev)); + + if (xnet) { + inner_mode = &x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, + inner_mode->family)) + return -EPERM; + } + + xfrmi_scrub_packet(skb, xnet); + if (xi->p.collect_md) { + struct metadata_dst *md_dst; + + md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); + if (!md_dst) + return -ENOMEM; + + md_dst->u.xfrm_info.if_id = x->if_id; + md_dst->u.xfrm_info.link = link; + skb_dst_set(skb, (struct dst_entry *)md_dst); + } + dev_sw_netstats_rx_add(dev, skb->len); + + return 0; +} + +static int +xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device_stats *stats = &xi->dev->stats; + struct dst_entry *dst = skb_dst(skb); + unsigned int length = skb->len; + struct net_device *tdev; + struct xfrm_state *x; + int err = -1; + u32 if_id; + int mtu; + + if (xi->p.collect_md) { + struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); + + if (unlikely(!md_info)) + return -EINVAL; + + if_id = md_info->if_id; + fl->flowi_oif = md_info->link; + } else { + if_id = xi->p.if_id; + } + + dst_hold(dst); + dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto tx_err_link_failure; + } + + x = dst->xfrm; + if (!x) + goto tx_err_link_failure; + + if (x->if_id != if_id) + goto tx_err_link_failure; + + tdev = dst->dev; + + if (tdev == dev) { + stats->collisions++; + net_warn_ratelimited("%s: Local routing loop detected!\n", + dev->name); + goto tx_err_dst_release; + } + + mtu = dst_mtu(dst); + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { + skb_dst_update_pmtu_no_confirm(skb, mtu); + + if (skb->protocol == htons(ETH_P_IPV6)) { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if (skb->len > 1280) + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + else + goto xmit; + } else { + if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) + goto xmit; + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + } + + dst_release(dst); + return -EMSGSIZE; + } + +xmit: + xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); + skb_dst_set(skb, dst); + skb->dev = tdev; + + err = dst_output(xi->net, skb->sk, skb); + if (net_xmit_eval(err) == 0) { + dev_sw_netstats_tx_add(dev, 1, length); + } else { + stats->tx_errors++; + stats->tx_aborted_errors++; + } + + return 0; +tx_err_link_failure: + stats->tx_carrier_errors++; + dst_link_failure(skb); +tx_err_dst_release: + dst_release(dst); + return err; +} + +static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device_stats *stats = &xi->dev->stats; + struct dst_entry *dst = skb_dst(skb); + struct flowi fl; + int ret; + + memset(&fl, 0, sizeof(fl)); + + switch (skb->protocol) { + case htons(ETH_P_IPV6): + xfrm_decode_session(skb, &fl, AF_INET6); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + if (!dst) { + fl.u.ip6.flowi6_oif = 
dev->ifindex; + fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); + if (dst->error) { + dst_release(dst); + stats->tx_carrier_errors++; + goto tx_err; + } + skb_dst_set(skb, dst); + } + break; + case htons(ETH_P_IP): + xfrm_decode_session(skb, &fl, AF_INET); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + if (!dst) { + struct rtable *rt; + + fl.u.ip4.flowi4_oif = dev->ifindex; + fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); + if (IS_ERR(rt)) { + stats->tx_carrier_errors++; + goto tx_err; + } + skb_dst_set(skb, &rt->dst); + } + break; + default: + goto tx_err; + } + + fl.flowi_oif = xi->p.link; + + ret = xfrmi_xmit2(skb, dev, &fl); + if (ret < 0) + goto tx_err; + + return NETDEV_TX_OK; + +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int xfrmi4_err(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct net *net = dev_net(skb->dev); + int protocol = iph->protocol; + struct ip_comp_hdr *ipch; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah ; + struct xfrm_state *x; + struct xfrm_if *xi; + __be32 spi; + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return 0; + break; + case ICMP_REDIRECT: + break; + default: + return 0; + } + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET); + if (!x) + return 0; + + xi = xfrmi_lookup(net, x); + if (!xi) { + xfrm_state_put(x); + return -1; + } + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, protocol); + else + ipv4_redirect(skb, net, 0, protocol); + xfrm_state_put(x); + + return 0; +} + +static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; + struct net *net = dev_net(skb->dev); + int protocol = iph->nexthdr; + struct ip_comp_hdr *ipch; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah; + struct xfrm_state *x; + struct xfrm_if *xi; + __be32 spi; + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data + offset); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data + offset); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data + offset); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET6); + if (!x) + return 0; + + xi = xfrmi_lookup(net, x); + if (!xi) { + xfrm_state_put(x); + return -1; + } + + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); + else + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); + xfrm_state_put(x); + + return 0; +} + +static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) +{ + if (xi->p.link != 
p->link) + return -EINVAL; + + xi->p.if_id = p->if_id; + + return 0; +} + +static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) +{ + struct net *net = xi->net; + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + int err; + + xfrmi_unlink(xfrmn, xi); + synchronize_net(); + err = xfrmi_change(xi, p); + xfrmi_link(xfrmn, xi); + netdev_state_change(xi->dev); + return err; +} + +static int xfrmi_get_iflink(const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + + return xi->p.link; +} + +static const struct net_device_ops xfrmi_netdev_ops = { + .ndo_init = xfrmi_dev_init, + .ndo_uninit = xfrmi_dev_uninit, + .ndo_start_xmit = xfrmi_xmit, + .ndo_get_stats64 = dev_get_tstats64, + .ndo_get_iflink = xfrmi_get_iflink, +}; + +static void xfrmi_dev_setup(struct net_device *dev) +{ + dev->netdev_ops = &xfrmi_netdev_ops; + dev->header_ops = &ip_tunnel_header_ops; + dev->type = ARPHRD_NONE; + dev->mtu = ETH_DATA_LEN; + dev->min_mtu = ETH_MIN_MTU; + dev->max_mtu = IP_MAX_MTU; + dev->flags = IFF_NOARP; + dev->needs_free_netdev = true; + dev->priv_destructor = xfrmi_dev_free; + netif_keep_dst(dev); + + eth_broadcast_addr(dev->broadcast); +} + +#define XFRMI_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_GSO_SOFTWARE | \ + NETIF_F_HW_CSUM) + +static int xfrmi_dev_init(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); + int err; + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + err = gro_cells_init(&xi->gro_cells, dev); + if (err) { + free_percpu(dev->tstats); + return err; + } + + dev->features |= NETIF_F_LLTX; + dev->features |= XFRMI_FEATURES; + dev->hw_features |= XFRMI_FEATURES; + + if (phydev) { + dev->needed_headroom = phydev->needed_headroom; + dev->needed_tailroom = phydev->needed_tailroom; + + if (is_zero_ether_addr(dev->dev_addr)) + eth_hw_addr_inherit(dev, phydev); + if (is_zero_ether_addr(dev->broadcast)) + memcpy(dev->broadcast, phydev->broadcast, + dev->addr_len); + } else { + eth_hw_addr_random(dev); + eth_broadcast_addr(dev->broadcast); + } + + return 0; +} + +static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + return 0; +} + +static void xfrmi_netlink_parms(struct nlattr *data[], + struct xfrm_if_parms *parms) +{ + memset(parms, 0, sizeof(*parms)); + + if (!data) + return; + + if (data[IFLA_XFRM_LINK]) + parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); + + if (data[IFLA_XFRM_IF_ID]) + parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); + + if (data[IFLA_XFRM_COLLECT_METADATA]) + parms->collect_md = true; +} + +static int xfrmi_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct net *net = dev_net(dev); + struct xfrm_if_parms p = {}; + struct xfrm_if *xi; + int err; + + xfrmi_netlink_parms(data, &p); + if (p.collect_md) { + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + + if (p.link || p.if_id) { + NL_SET_ERR_MSG(extack, "link and if_id must be zero"); + return -EINVAL; + } + + if (rtnl_dereference(xfrmn->collect_md_xfrmi)) + return -EEXIST; + + } else { + if (!p.if_id) { + NL_SET_ERR_MSG(extack, "if_id must be non zero"); + return -EINVAL; + } + + xi = xfrmi_locate(net, &p); + if (xi) + return -EEXIST; + } + + xi = netdev_priv(dev); + xi->p = p; + xi->net = net; + xi->dev = dev; + + err = xfrmi_create(dev); + return err; +} + 
+static void xfrmi_dellink(struct net_device *dev, struct list_head *head) +{ + unregister_netdevice_queue(dev, head); +} + +static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net *net = xi->net; + struct xfrm_if_parms p = {}; + + xfrmi_netlink_parms(data, &p); + if (!p.if_id) { + NL_SET_ERR_MSG(extack, "if_id must be non zero"); + return -EINVAL; + } + + if (p.collect_md) { + NL_SET_ERR_MSG(extack, "collect_md can't be changed"); + return -EINVAL; + } + + xi = xfrmi_locate(net, &p); + if (!xi) { + xi = netdev_priv(dev); + } else { + if (xi->dev != dev) + return -EEXIST; + if (xi->p.collect_md) { + NL_SET_ERR_MSG(extack, + "device can't be changed to collect_md"); + return -EINVAL; + } + } + + return xfrmi_update(xi, &p); +} + +static size_t xfrmi_get_size(const struct net_device *dev) +{ + return + /* IFLA_XFRM_LINK */ + nla_total_size(4) + + /* IFLA_XFRM_IF_ID */ + nla_total_size(4) + + /* IFLA_XFRM_COLLECT_METADATA */ + nla_total_size(0) + + 0; +} + +static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct xfrm_if_parms *parm = &xi->p; + + if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || + nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || + (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static struct net *xfrmi_get_link_net(const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + + return xi->net; +} + +static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { + [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, + [IFLA_XFRM_LINK] = { .type = NLA_U32 }, + [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, + [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, +}; + +static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { + .kind = "xfrm", + .maxtype = IFLA_XFRM_MAX, + .policy = xfrmi_policy, + .priv_size = sizeof(struct xfrm_if), + .setup = xfrmi_dev_setup, + .validate = xfrmi_validate, + .newlink = xfrmi_newlink, + .dellink = xfrmi_dellink, + .changelink = xfrmi_changelink, + .get_size = xfrmi_get_size, + .fill_info = xfrmi_fill_info, + .get_link_net = xfrmi_get_link_net, +}; + +static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) +{ + struct net *net; + LIST_HEAD(list); + + rtnl_lock(); + list_for_each_entry(net, net_exit_list, exit_list) { + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + struct xfrm_if __rcu **xip; + struct xfrm_if *xi; + int i; + + for (i = 0; i < XFRMI_HASH_SIZE; i++) { + for (xip = &xfrmn->xfrmi[i]; + (xi = rtnl_dereference(*xip)) != NULL; + xip = &xi->next) + unregister_netdevice_queue(xi->dev, &list); + } + xi = rtnl_dereference(xfrmn->collect_md_xfrmi); + if (xi) + unregister_netdevice_queue(xi->dev, &list); + } + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +static struct pernet_operations xfrmi_net_ops = { + .exit_batch = xfrmi_exit_batch_net, + .id = &xfrmi_net_id, + .size = sizeof(struct xfrmi_net), +}; + +static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, 
+ .err_handler = xfrmi6_err, + .priority = 10, +}; + +static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) +static int xfrmi6_rcv_tunnel(struct sk_buff *skb) +{ + const xfrm_address_t *saddr; + __be32 spi; + + saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; + spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); + + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); +} + +static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { + .handler = xfrmi6_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 2, +}; + +static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { + .handler = xfrmi6_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 2, +}; +#endif + +static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) +static int xfrmi4_rcv_tunnel(struct sk_buff *skb) +{ + return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); +} + +static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { + .handler = xfrmi4_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 3, +}; + +static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { + .handler = xfrmi4_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 2, +}; +#endif + +static int __init xfrmi4_init(void) +{ + int err; + + err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); + if (err < 0) + goto xfrm_proto_esp_failed; + err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); + if (err < 0) + goto xfrm_proto_ah_failed; + err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); + if (err < 0) + goto xfrm_proto_comp_failed; +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) + err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); + if (err < 0) + goto xfrm_tunnel_ipip_failed; + err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); + if (err < 0) + goto xfrm_tunnel_ipip6_failed; +#endif + + return 0; + +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) +xfrm_tunnel_ipip6_failed: + xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); +xfrm_tunnel_ipip_failed: + xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); +#endif +xfrm_proto_comp_failed: + xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: + xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); +xfrm_proto_esp_failed: + return err; +} + +static void xfrmi4_fini(void) +{ +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) + xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); + xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); +#endif + xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); + xfrm4_protocol_deregister(&xfrmi_ah4_protocol, 
IPPROTO_AH); + xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); +} + +static int __init xfrmi6_init(void) +{ + int err; + + err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); + if (err < 0) + goto xfrm_proto_esp_failed; + err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); + if (err < 0) + goto xfrm_proto_ah_failed; + err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); + if (err < 0) + goto xfrm_proto_comp_failed; +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); + if (err < 0) + goto xfrm_tunnel_ipv6_failed; + err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); + if (err < 0) + goto xfrm_tunnel_ip6ip_failed; +#endif + + return 0; + +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) +xfrm_tunnel_ip6ip_failed: + xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); +xfrm_tunnel_ipv6_failed: + xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); +#endif +xfrm_proto_comp_failed: + xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: + xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); +xfrm_proto_esp_failed: + return err; +} + +static void xfrmi6_fini(void) +{ +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); + xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); +#endif + xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); + xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); + xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); +} + +static const struct xfrm_if_cb xfrm_if_cb = { + .decode_session = xfrmi_decode_session, +}; + +static int __init xfrmi_init(void) +{ + const char *msg; + int err; + + pr_info("IPsec XFRM device driver\n"); + + msg = "tunnel device"; + err = register_pernet_device(&xfrmi_net_ops); + if (err < 0) + goto pernet_dev_failed; + + msg = "xfrm4 protocols"; + err = xfrmi4_init(); + if (err < 0) + goto xfrmi4_failed; + + msg = "xfrm6 protocols"; + err = xfrmi6_init(); + if (err < 0) + goto xfrmi6_failed; + + + msg = "netlink interface"; + err = rtnl_link_register(&xfrmi_link_ops); + if (err < 0) + goto rtnl_link_failed; + + lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); + + xfrm_if_register_cb(&xfrm_if_cb); + + return err; + +rtnl_link_failed: + xfrmi6_fini(); +xfrmi6_failed: + xfrmi4_fini(); +xfrmi4_failed: + unregister_pernet_device(&xfrmi_net_ops); +pernet_dev_failed: + pr_err("xfrmi init: failed to register %s\n", msg); + return err; +} + +static void __exit xfrmi_fini(void) +{ + xfrm_if_unregister_cb(); + lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); + rtnl_link_unregister(&xfrmi_link_ops); + xfrmi4_fini(); + xfrmi6_fini(); + unregister_pernet_device(&xfrmi_net_ops); +} + +module_init(xfrmi_init); +module_exit(xfrmi_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("xfrm"); +MODULE_ALIAS_NETDEV("xfrm0"); +MODULE_AUTHOR("Steffen Klassert"); +MODULE_DESCRIPTION("XFRM virtual interface"); -- cgit v1.2.3 From 94151f5aa9667c562281abeaaa5e89b9d5c17729 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sat, 3 Dec 2022 10:46:57 +0200 Subject: xfrm: interface: Add unstable helpers for setting/getting XFRM metadata from TC-BPF This change adds xfrm metadata helpers using the unstable kfunc call interface for the TC-BPF hooks. This allows steering traffic towards different IPsec connections based on logic implemented in bpf programs. 
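As a rough sketch of the intended TC-BPF usage (illustrative only: the program name and the
hard-coded if_id are made up for this example; the kfunc signature and the two-field metadata
struct are the ones introduced by this patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define TC_ACT_UNSPEC	(-1)	/* values as in uapi linux/pkt_cls.h */
#define TC_ACT_SHOT	2

/* Local mirror of the kernel-side metadata struct added by this patch. */
struct bpf_xfrm_info {
	__u32 if_id;
	int link;
};

int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
			  const struct bpf_xfrm_info *from) __ksym;

SEC("tc")
int steer_ipsec(struct __sk_buff *skb)
{
	/* Pick a tunnel: a fixed if_id here purely for illustration; any bpf
	 * logic (map lookup, header inspection, ...) can choose it per packet.
	 */
	struct bpf_xfrm_info info = { .if_id = 1 };

	if (bpf_skb_set_xfrm_info(skb, &info))
		return TC_ACT_SHOT;	/* metadata dst could not be attached */
	return TC_ACT_UNSPEC;		/* xfrm output will use info.if_id for its lookups */
}

char _license[] SEC("license") = "GPL";

The receive side is symmetric: bpf_skb_get_xfrm_info() exposes the if_id of the state that
matched the incoming packet.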
This object is built based on the availability of BTF debug info. When setting the xfrm metadata, percpu metadata dsts are used in order to avoid allocating a metadata dst per packet. In order to guarantee safe module unload, the percpu dsts are allocated on first use and never freed. The percpu pointer is stored in net/core/filter.c so that it can be reused on module reload. The metadata percpu dsts take ownership of the original skb dsts so that they may be used as part of the xfrm transmission logic - e.g. for MTU calculations. Signed-off-by: Eyal Birger Link: https://lore.kernel.org/r/20221203084659.1837829-3-eyal.birger@gmail.com Signed-off-by: Martin KaFai Lau --- include/net/dst_metadata.h | 1 + include/net/xfrm.h | 17 ++++++ net/core/dst.c | 8 ++- net/core/filter.c | 9 ++++ net/xfrm/Makefile | 6 +++ net/xfrm/xfrm_interface_bpf.c | 115 +++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_interface_core.c | 14 +++++ 7 files changed, 168 insertions(+), 2 deletions(-) create mode 100644 net/xfrm/xfrm_interface_bpf.c diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index a454cf4327fe..1b7fae4c6b24 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -26,6 +26,7 @@ struct macsec_info { struct xfrm_md_info { u32 if_id; int link; + struct dst_entry *dst_orig; }; struct metadata_dst { diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e0cc6791c001..3707e6b34e67 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2086,4 +2086,21 @@ static inline bool xfrm6_local_dontfrag(const struct sock *sk) return false; } #endif + +#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +extern struct metadata_dst __percpu *xfrm_bpf_md_dst; + +int register_xfrm_interface_bpf(void); + +#else + +static inline int register_xfrm_interface_bpf(void) +{ + return 0; +} + +#endif + #endif /* _NET_XFRM_H */ diff --git a/net/core/dst.c b/net/core/dst.c index bc9c9be4e080..bb14a0392388 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -316,6 +316,8 @@ void metadata_dst_free(struct metadata_dst *md_dst) if (md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&md_dst->u.tun_info.dst_cache); #endif + if (md_dst->type == METADATA_XFRM) + dst_release(md_dst->u.xfrm_info.dst_orig); kfree(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free); @@ -340,16 +342,18 @@ EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) { -#ifdef CONFIG_DST_CACHE int cpu; for_each_possible_cpu(cpu) { struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); +#ifdef CONFIG_DST_CACHE if (one_md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); - } #endif + if (one_md_dst->type == METADATA_XFRM) + dst_release(one_md_dst->u.xfrm_info.dst_orig); + } free_percpu(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free_percpu); diff --git a/net/core/filter.c b/net/core/filter.c index 8607136b6e2c..929358677183 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5631,6 +5631,15 @@ static const struct bpf_func_proto bpf_bind_proto = { }; #ifdef CONFIG_XFRM + +#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +struct metadata_dst __percpu *xfrm_bpf_md_dst; +EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst); + +#endif + BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, 
struct bpf_xfrm_state *, to, u32, size, u64, flags) { diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 08a2870fdd36..cd47f88921f5 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -5,6 +5,12 @@ xfrm_interface-$(CONFIG_XFRM_INTERFACE) += xfrm_interface_core.o +ifeq ($(CONFIG_XFRM_INTERFACE),m) +xfrm_interface-$(CONFIG_DEBUG_INFO_BTF_MODULES) += xfrm_interface_bpf.o +else ifeq ($(CONFIG_XFRM_INTERFACE),y) +xfrm_interface-$(CONFIG_DEBUG_INFO_BTF) += xfrm_interface_bpf.o +endif + obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ xfrm_input.o xfrm_output.o \ xfrm_sysctl.o xfrm_replay.o xfrm_device.o diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c new file mode 100644 index 000000000000..1ef2162cebcf --- /dev/null +++ b/net/xfrm/xfrm_interface_bpf.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Unstable XFRM Helpers for TC-BPF hook + * + * These are called from SCHED_CLS BPF programs. Note that it is + * allowed to break compatibility for these functions since the interface they + * are exposed through to BPF programs is explicitly unstable. + */ + +#include +#include + +#include +#include + +/* bpf_xfrm_info - XFRM metadata information + * + * Members: + * @if_id - XFRM if_id: + * Transmit: if_id to be used in policy and state lookups + * Receive: if_id of the state matched for the incoming packet + * @link - Underlying device ifindex: + * Transmit: used as the underlying device in VRF routing + * Receive: the device on which the packet had been received + */ +struct bpf_xfrm_info { + u32 if_id; + int link; +}; + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in xfrm_interface BTF"); + +/* bpf_skb_get_xfrm_info - Get XFRM metadata + * + * Parameters: + * @skb_ctx - Pointer to ctx (__sk_buff) in TC program + * Cannot be NULL + * @to - Pointer to memory to which the metadata will be copied + * Cannot be NULL + */ +__used noinline +int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to) +{ + struct sk_buff *skb = (struct sk_buff *)skb_ctx; + struct xfrm_md_info *info; + + info = skb_xfrm_md_info(skb); + if (!info) + return -EINVAL; + + to->if_id = info->if_id; + to->link = info->link; + return 0; +} + +/* bpf_skb_get_xfrm_info - Set XFRM metadata + * + * Parameters: + * @skb_ctx - Pointer to ctx (__sk_buff) in TC program + * Cannot be NULL + * @from - Pointer to memory from which the metadata will be copied + * Cannot be NULL + */ +__used noinline +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, + const struct bpf_xfrm_info *from) +{ + struct sk_buff *skb = (struct sk_buff *)skb_ctx; + struct metadata_dst *md_dst; + struct xfrm_md_info *info; + + if (unlikely(skb_metadata_dst(skb))) + return -EINVAL; + + if (!xfrm_bpf_md_dst) { + struct metadata_dst __percpu *tmp; + + tmp = metadata_dst_alloc_percpu(0, METADATA_XFRM, GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + if (cmpxchg(&xfrm_bpf_md_dst, NULL, tmp)) + metadata_dst_free_percpu(tmp); + } + md_dst = this_cpu_ptr(xfrm_bpf_md_dst); + + info = &md_dst->u.xfrm_info; + + info->if_id = from->if_id; + info->link = from->link; + skb_dst_force(skb); + info->dst_orig = skb_dst(skb); + + dst_hold((struct dst_entry *)md_dst); + skb_dst_set(skb, (struct dst_entry *)md_dst); + return 0; +} + +__diag_pop() + +BTF_SET8_START(xfrm_ifc_kfunc_set) +BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info) +BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info) +BTF_SET8_END(xfrm_ifc_kfunc_set) + +static const struct btf_kfunc_id_set 
xfrm_interface_kfunc_set = { + .owner = THIS_MODULE, + .set = &xfrm_ifc_kfunc_set, +}; + +int __init register_xfrm_interface_bpf(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, + &xfrm_interface_kfunc_set); +} diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 5a67b120c4db..1f99dc469027 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -396,6 +396,14 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if_id = md_info->if_id; fl->flowi_oif = md_info->link; + if (md_info->dst_orig) { + struct dst_entry *tmp_dst = dst; + + dst = md_info->dst_orig; + skb_dst_set(skb, dst); + md_info->dst_orig = NULL; + dst_release(tmp_dst); + } } else { if_id = xi->p.if_id; } @@ -1162,12 +1170,18 @@ static int __init xfrmi_init(void) if (err < 0) goto rtnl_link_failed; + err = register_xfrm_interface_bpf(); + if (err < 0) + goto kfunc_failed; + lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); xfrm_if_register_cb(&xfrm_if_cb); return err; +kfunc_failed: + rtnl_link_unregister(&xfrmi_link_ops); rtnl_link_failed: xfrmi6_fini(); xfrmi6_failed: -- cgit v1.2.3 From 4f4ac4d9106efeec9c84469725c04c4237c7fb6c Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sat, 3 Dec 2022 10:46:58 +0200 Subject: tools: add IFLA_XFRM_COLLECT_METADATA to uapi/linux/if_link.h Needed for XFRM metadata tests. Signed-off-by: Eyal Birger Link: https://lore.kernel.org/r/20221203084659.1837829-4-eyal.birger@gmail.com Signed-off-by: Martin KaFai Lau --- tools/include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 0242f31e339c..901d98b865a1 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -673,6 +673,7 @@ enum { IFLA_XFRM_UNSPEC, IFLA_XFRM_LINK, IFLA_XFRM_IF_ID, + IFLA_XFRM_COLLECT_METADATA, __IFLA_XFRM_MAX }; -- cgit v1.2.3 From 90a3a05eb33ff7a263f1e93e444d40e427cddf1a Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sat, 3 Dec 2022 10:46:59 +0200 Subject: selftests/bpf: add xfrm_info tests Test the xfrm_info kfunc helpers. The test setup creates three name spaces - NS0, NS1, NS2. XFRM tunnels are setup between NS0 and the two other NSs. The kfunc helpers are used to steer traffic from NS0 to the other NSs based on a userspace populated bpf global variable and validate that the return traffic had arrived from the desired NS. 
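Condensed, the per-tunnel check the test performs looks roughly like this (a sketch, not the
literal selftest code added below; it assumes the generated xfrm_info skeleton and that the
caller has already entered the NS0 network namespace, as the test does):

#include <stdlib.h>
#include "xfrm_info.skel.h"	/* generated skeleton, as used by the test below */

static int check_tunnel(struct xfrm_info *skel, unsigned int want_if_id)
{
	/* Steer by writing the bpf global variable read on egress. */
	skel->bss->req_if_id = want_if_id;

	/* Ping the shared overlay address; either tunnel can carry it. */
	if (system("ping -c 1 -q 192.168.1.200 > /dev/null"))
		return -1;

	/* The ingress program recorded the if_id of the reply's matched state. */
	return skel->bss->resp_if_id == want_if_id ? 0 : -1;
}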
Signed-off-by: Eyal Birger Link: https://lore.kernel.org/r/20221203084659.1837829-5-eyal.birger@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + tools/testing/selftests/bpf/config | 2 + tools/testing/selftests/bpf/prog_tests/xfrm_info.c | 362 +++++++++++++++++++++ .../testing/selftests/bpf/progs/bpf_tracing_net.h | 3 + tools/testing/selftests/bpf/progs/xfrm_info.c | 35 ++ 5 files changed, 403 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/xfrm_info.c create mode 100644 tools/testing/selftests/bpf/progs/xfrm_info.c diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 3481f3a5ea6f..585fcf73c731 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -85,3 +85,4 @@ xdp_bonding # failed to auto-attach program 'trace_ xdp_bpf2bpf # failed to auto-attach program 'trace_on_entry': -524 (trampoline) xdp_do_redirect # prog_run_max_size unexpected error: -22 (errno 22) xdp_synproxy # JIT does not support calling kernel function (kfunc) +xfrm_info # JIT does not support calling kernel function (kfunc) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index f9034ea00bc9..3543c76cef56 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -23,6 +23,7 @@ CONFIG_IKCONFIG_PROC=y CONFIG_IMA=y CONFIG_IMA_READ_POLICY=y CONFIG_IMA_WRITE_POLICY=y +CONFIG_INET_ESP=y CONFIG_IP_NF_FILTER=y CONFIG_IP_NF_RAW=y CONFIG_IP_NF_TARGET_SYNPROXY=y @@ -74,3 +75,4 @@ CONFIG_TEST_BPF=y CONFIG_USERFAULTFD=y CONFIG_VXLAN=y CONFIG_XDP_SOCKETS=y +CONFIG_XFRM_INTERFACE=y diff --git a/tools/testing/selftests/bpf/prog_tests/xfrm_info.c b/tools/testing/selftests/bpf/prog_tests/xfrm_info.c new file mode 100644 index 000000000000..8b03c9bb4862 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xfrm_info.c @@ -0,0 +1,362 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* + * Topology: + * --------- + * NS0 namespace | NS1 namespace | NS2 namespace + * | | + * +---------------+ | +---------------+ | + * | ipsec0 |---------| ipsec0 | | + * | 192.168.1.100 | | | 192.168.1.200 | | + * | if_id: bpf | | +---------------+ | + * +---------------+ | | + * | | | +---------------+ + * | | | | ipsec0 | + * \------------------------------------------| 192.168.1.200 | + * | | +---------------+ + * | | + * | | (overlay network) + * ------------------------------------------------------ + * | | (underlay network) + * +--------------+ | +--------------+ | + * | veth01 |----------| veth10 | | + * | 172.16.1.100 | | | 172.16.1.200 | | + * ---------------+ | +--------------+ | + * | | + * +--------------+ | | +--------------+ + * | veth02 |-----------------------------------| veth20 | + * | 172.16.2.100 | | | | 172.16.2.200 | + * +--------------+ | | +--------------+ + * + * + * Test Packet flow + * ----------- + * The tests perform 'ping 192.168.1.200' from the NS0 namespace: + * 1) request is routed to NS0 ipsec0 + * 2) NS0 ipsec0 tc egress BPF program is triggered and sets the if_id based + * on the requested value. 
This makes the ipsec0 device in external mode + * select the destination tunnel + * 3) ping reaches the other namespace (NS1 or NS2 based on which if_id was + * used) and response is sent + * 4) response is received on NS0 ipsec0, tc ingress program is triggered and + * records the response if_id + * 5) requested if_id is compared with received if_id + */ + +#include +#include +#include + +#include "test_progs.h" +#include "network_helpers.h" +#include "xfrm_info.skel.h" + +#define NS0 "xfrm_test_ns0" +#define NS1 "xfrm_test_ns1" +#define NS2 "xfrm_test_ns2" + +#define IF_ID_0_TO_1 1 +#define IF_ID_0_TO_2 2 +#define IF_ID_1 3 +#define IF_ID_2 4 + +#define IP4_ADDR_VETH01 "172.16.1.100" +#define IP4_ADDR_VETH10 "172.16.1.200" +#define IP4_ADDR_VETH02 "172.16.2.100" +#define IP4_ADDR_VETH20 "172.16.2.200" + +#define ESP_DUMMY_PARAMS \ + "proto esp aead 'rfc4106(gcm(aes))' " \ + "0xe4d8f4b4da1df18a3510b3781496daa82488b713 128 mode tunnel " + +#define SYS(fmt, ...) \ + ({ \ + char cmd[1024]; \ + snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \ + if (!ASSERT_OK(system(cmd), cmd)) \ + goto fail; \ + }) + +#define SYS_NOFAIL(fmt, ...) \ + ({ \ + char cmd[1024]; \ + snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \ + system(cmd); \ + }) + +static int attach_tc_prog(struct bpf_tc_hook *hook, int igr_fd, int egr_fd) +{ + LIBBPF_OPTS(bpf_tc_opts, opts1, .handle = 1, .priority = 1, + .prog_fd = igr_fd); + LIBBPF_OPTS(bpf_tc_opts, opts2, .handle = 1, .priority = 1, + .prog_fd = egr_fd); + int ret; + + ret = bpf_tc_hook_create(hook); + if (!ASSERT_OK(ret, "create tc hook")) + return ret; + + if (igr_fd >= 0) { + hook->attach_point = BPF_TC_INGRESS; + ret = bpf_tc_attach(hook, &opts1); + if (!ASSERT_OK(ret, "bpf_tc_attach")) { + bpf_tc_hook_destroy(hook); + return ret; + } + } + + if (egr_fd >= 0) { + hook->attach_point = BPF_TC_EGRESS; + ret = bpf_tc_attach(hook, &opts2); + if (!ASSERT_OK(ret, "bpf_tc_attach")) { + bpf_tc_hook_destroy(hook); + return ret; + } + } + + return 0; +} + +static void cleanup(void) +{ + SYS_NOFAIL("test -f /var/run/netns/" NS0 " && ip netns delete " NS0); + SYS_NOFAIL("test -f /var/run/netns/" NS1 " && ip netns delete " NS1); + SYS_NOFAIL("test -f /var/run/netns/" NS2 " && ip netns delete " NS2); +} + +static int config_underlay(void) +{ + SYS("ip netns add " NS0); + SYS("ip netns add " NS1); + SYS("ip netns add " NS2); + + /* NS0 <-> NS1 [veth01 <-> veth10] */ + SYS("ip link add veth01 netns " NS0 " type veth peer name veth10 netns " NS1); + SYS("ip -net " NS0 " addr add " IP4_ADDR_VETH01 "/24 dev veth01"); + SYS("ip -net " NS0 " link set dev veth01 up"); + SYS("ip -net " NS1 " addr add " IP4_ADDR_VETH10 "/24 dev veth10"); + SYS("ip -net " NS1 " link set dev veth10 up"); + + /* NS0 <-> NS2 [veth02 <-> veth20] */ + SYS("ip link add veth02 netns " NS0 " type veth peer name veth20 netns " NS2); + SYS("ip -net " NS0 " addr add " IP4_ADDR_VETH02 "/24 dev veth02"); + SYS("ip -net " NS0 " link set dev veth02 up"); + SYS("ip -net " NS2 " addr add " IP4_ADDR_VETH20 "/24 dev veth20"); + SYS("ip -net " NS2 " link set dev veth20 up"); + + return 0; +fail: + return -1; +} + +static int setup_xfrm_tunnel_ns(const char *ns, const char *ipv4_local, + const char *ipv4_remote, int if_id) +{ + /* State: local -> remote */ + SYS("ip -net %s xfrm state add src %s dst %s spi 1 " + ESP_DUMMY_PARAMS "if_id %d", ns, ipv4_local, ipv4_remote, if_id); + + /* State: local <- remote */ + SYS("ip -net %s xfrm state add src %s dst %s spi 1 " + ESP_DUMMY_PARAMS "if_id %d", ns, ipv4_remote, ipv4_local, 
if_id); + + /* Policy: local -> remote */ + SYS("ip -net %s xfrm policy add dir out src 0.0.0.0/0 dst 0.0.0.0/0 " + "if_id %d tmpl src %s dst %s proto esp mode tunnel if_id %d", ns, + if_id, ipv4_local, ipv4_remote, if_id); + + /* Policy: local <- remote */ + SYS("ip -net %s xfrm policy add dir in src 0.0.0.0/0 dst 0.0.0.0/0 " + "if_id %d tmpl src %s dst %s proto esp mode tunnel if_id %d", ns, + if_id, ipv4_remote, ipv4_local, if_id); + + return 0; +fail: + return -1; +} + +static int setup_xfrm_tunnel(const char *ns_a, const char *ns_b, + const char *ipv4_a, const char *ipv4_b, + int if_id_a, int if_id_b) +{ + return setup_xfrm_tunnel_ns(ns_a, ipv4_a, ipv4_b, if_id_a) || + setup_xfrm_tunnel_ns(ns_b, ipv4_b, ipv4_a, if_id_b); +} + +static struct rtattr *rtattr_add(struct nlmsghdr *nh, unsigned short type, + unsigned short len) +{ + struct rtattr *rta = + (struct rtattr *)((uint8_t *)nh + RTA_ALIGN(nh->nlmsg_len)); + rta->rta_type = type; + rta->rta_len = RTA_LENGTH(len); + nh->nlmsg_len = RTA_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len); + return rta; +} + +static struct rtattr *rtattr_add_str(struct nlmsghdr *nh, unsigned short type, + const char *s) +{ + struct rtattr *rta = rtattr_add(nh, type, strlen(s)); + + memcpy(RTA_DATA(rta), s, strlen(s)); + return rta; +} + +static struct rtattr *rtattr_begin(struct nlmsghdr *nh, unsigned short type) +{ + return rtattr_add(nh, type, 0); +} + +static void rtattr_end(struct nlmsghdr *nh, struct rtattr *attr) +{ + uint8_t *end = (uint8_t *)nh + nh->nlmsg_len; + + attr->rta_len = end - (uint8_t *)attr; +} + +static int setup_xfrmi_external_dev(const char *ns) +{ + struct { + struct nlmsghdr nh; + struct ifinfomsg info; + unsigned char data[128]; + } req; + struct rtattr *link_info, *info_data; + struct nstoken *nstoken; + int ret = -1, sock = -1; + struct nlmsghdr *nh; + + memset(&req, 0, sizeof(req)); + nh = &req.nh; + nh->nlmsg_len = NLMSG_LENGTH(sizeof(req.info)); + nh->nlmsg_type = RTM_NEWLINK; + nh->nlmsg_flags |= NLM_F_CREATE | NLM_F_REQUEST; + + rtattr_add_str(nh, IFLA_IFNAME, "ipsec0"); + link_info = rtattr_begin(nh, IFLA_LINKINFO); + rtattr_add_str(nh, IFLA_INFO_KIND, "xfrm"); + info_data = rtattr_begin(nh, IFLA_INFO_DATA); + rtattr_add(nh, IFLA_XFRM_COLLECT_METADATA, 0); + rtattr_end(nh, info_data); + rtattr_end(nh, link_info); + + nstoken = open_netns(ns); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto done; + + sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); + if (!ASSERT_GE(sock, 0, "netlink socket")) + goto done; + ret = send(sock, nh, nh->nlmsg_len, 0); + if (!ASSERT_EQ(ret, nh->nlmsg_len, "netlink send length")) + goto done; + + ret = 0; +done: + if (sock != -1) + close(sock); + if (nstoken) + close_netns(nstoken); + return ret; +} + +static int config_overlay(void) +{ + if (setup_xfrm_tunnel(NS0, NS1, IP4_ADDR_VETH01, IP4_ADDR_VETH10, + IF_ID_0_TO_1, IF_ID_1)) + goto fail; + if (setup_xfrm_tunnel(NS0, NS2, IP4_ADDR_VETH02, IP4_ADDR_VETH20, + IF_ID_0_TO_2, IF_ID_2)) + goto fail; + + /* Older iproute2 doesn't support this option */ + if (!ASSERT_OK(setup_xfrmi_external_dev(NS0), "xfrmi")) + goto fail; + + SYS("ip -net " NS0 " addr add 192.168.1.100/24 dev ipsec0"); + SYS("ip -net " NS0 " link set dev ipsec0 up"); + + SYS("ip -net " NS1 " link add ipsec0 type xfrm if_id %d", IF_ID_1); + SYS("ip -net " NS1 " addr add 192.168.1.200/24 dev ipsec0"); + SYS("ip -net " NS1 " link set dev ipsec0 up"); + + SYS("ip -net " NS2 " link add ipsec0 type xfrm if_id %d", IF_ID_2); + SYS("ip -net " NS2 " addr add 192.168.1.200/24 
dev ipsec0"); + SYS("ip -net " NS2 " link set dev ipsec0 up"); + + return 0; +fail: + return -1; +} + +static int test_xfrm_ping(struct xfrm_info *skel, u32 if_id) +{ + skel->bss->req_if_id = if_id; + + SYS("ping -i 0.01 -c 3 -w 10 -q 192.168.1.200 > /dev/null"); + + if (!ASSERT_EQ(skel->bss->resp_if_id, if_id, "if_id")) + goto fail; + + return 0; +fail: + return -1; +} + +static void _test_xfrm_info(void) +{ + LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); + int get_xfrm_info_prog_fd, set_xfrm_info_prog_fd; + struct nstoken *nstoken = NULL; + struct xfrm_info *skel; + int ifindex; + + /* load and attach bpf progs to ipsec dev tc hook point */ + skel = xfrm_info__open_and_load(); + if (!ASSERT_OK_PTR(skel, "xfrm_info__open_and_load")) + goto done; + nstoken = open_netns(NS0); + if (!ASSERT_OK_PTR(nstoken, "setns " NS0)) + goto done; + ifindex = if_nametoindex("ipsec0"); + if (!ASSERT_NEQ(ifindex, 0, "ipsec0 ifindex")) + goto done; + tc_hook.ifindex = ifindex; + set_xfrm_info_prog_fd = bpf_program__fd(skel->progs.set_xfrm_info); + get_xfrm_info_prog_fd = bpf_program__fd(skel->progs.get_xfrm_info); + if (!ASSERT_GE(set_xfrm_info_prog_fd, 0, "bpf_program__fd")) + goto done; + if (!ASSERT_GE(get_xfrm_info_prog_fd, 0, "bpf_program__fd")) + goto done; + if (attach_tc_prog(&tc_hook, get_xfrm_info_prog_fd, + set_xfrm_info_prog_fd)) + goto done; + + /* perform test */ + if (!ASSERT_EQ(test_xfrm_ping(skel, IF_ID_0_TO_1), 0, "ping " NS1)) + goto done; + if (!ASSERT_EQ(test_xfrm_ping(skel, IF_ID_0_TO_2), 0, "ping " NS2)) + goto done; + +done: + if (nstoken) + close_netns(nstoken); + xfrm_info__destroy(skel); +} + +void test_xfrm_info(void) +{ + cleanup(); + + if (!ASSERT_OK(config_underlay(), "config_underlay")) + goto done; + if (!ASSERT_OK(config_overlay(), "config_overlay")) + goto done; + + if (test__start_subtest("xfrm_info")) + _test_xfrm_info(); + +done: + cleanup(); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index adb087aecc9e..b394817126cf 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -25,6 +25,9 @@ #define IPV6_TCLASS 67 #define IPV6_AUTOFLOWLABEL 70 +#define TC_ACT_UNSPEC (-1) +#define TC_ACT_SHOT 2 + #define SOL_TCP 6 #define TCP_NODELAY 1 #define TCP_MAXSEG 2 diff --git a/tools/testing/selftests/bpf/progs/xfrm_info.c b/tools/testing/selftests/bpf/progs/xfrm_info.c new file mode 100644 index 000000000000..3acedcdd962d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/xfrm_info.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include + +__u32 req_if_id; +__u32 resp_if_id; + +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, + const struct bpf_xfrm_info *from) __ksym; +int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, + struct bpf_xfrm_info *to) __ksym; + +SEC("tc") +int set_xfrm_info(struct __sk_buff *skb) +{ + struct bpf_xfrm_info info = { .if_id = req_if_id }; + + return bpf_skb_set_xfrm_info(skb, &info) ? 
TC_ACT_SHOT : TC_ACT_UNSPEC; +} + +SEC("tc") +int get_xfrm_info(struct __sk_buff *skb) +{ + struct bpf_xfrm_info info = {}; + + if (bpf_skb_get_xfrm_info(skb, &info) < 0) + return TC_ACT_SHOT; + + resp_if_id = info.if_id; + + return TC_ACT_UNSPEC; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From b54b6003612a376e7be32cbc5c1af3754bbbbb3d Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Tue, 6 Dec 2022 17:14:10 +0800 Subject: riscv, bpf: Emit fixed-length instructions for BPF_PSEUDO_FUNC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For BPF_PSEUDO_FUNC instruction, verifier will refill imm with correct addresses of bpf_calls and then run last pass of JIT. Since the emit_imm of RV64 is variable-length, which will emit appropriate length instructions accorroding to the imm, it may broke ctx->offset, and lead to unpredictable problem, such as inaccurate jump. So let's fix it with fixed-length instructions. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Suggested-by: Björn Töpel Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Reviewed-by: Björn Töpel Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20221206091410.1584784-1-pulehui@huaweicloud.com --- arch/riscv/net/bpf_jit_comp64.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 00df3a8f92ac..f2417ac54edd 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -136,6 +136,25 @@ static bool in_auipc_jalr_range(s64 val) val < ((1L << 31) - (1L << 11)); } +/* Emit fixed-length instructions for address */ +static int emit_addr(u8 rd, u64 addr, bool extra_pass, struct rv_jit_context *ctx) +{ + u64 ip = (u64)(ctx->insns + ctx->ninsns); + s64 off = addr - ip; + s64 upper = (off + (1 << 11)) >> 12; + s64 lower = off & 0xfff; + + if (extra_pass && !in_auipc_jalr_range(off)) { + pr_err("bpf-jit: target offset 0x%llx is out of range\n", off); + return -ERANGE; + } + + emit(rv_auipc(rd, upper), ctx); + emit(rv_addi(rd, rd, lower), ctx); + return 0; +} + +/* Emit variable-length instructions for 32-bit and 64-bit imm */ static void emit_imm(u8 rd, s64 val, struct rv_jit_context *ctx) { /* Note that the immediate from the add is sign-extended, @@ -1050,7 +1069,15 @@ out_be: u64 imm64; imm64 = (u64)insn1.imm << 32 | (u32)imm; - emit_imm(rd, imm64, ctx); + if (bpf_pseudo_func(insn)) { + /* fixed-length insns for extra jit pass */ + ret = emit_addr(rd, imm64, extra_pass, ctx); + if (ret) + return ret; + } else { + emit_imm(rd, imm64, ctx); + } + return 1; } -- cgit v1.2.3 From fa55ef14ef4fe06198c0ce811b603aec24134bc2 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 6 Dec 2022 11:19:06 +0400 Subject: bpftool: Fix memory leak in do_build_table_cb strdup() allocates memory for path. We need to release the memory in the following error path. Add free() to avoid memory leak. 
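The general shape of the bug and of the fix, as a self-contained sketch (store_path() is a
hypothetical stand-in for the hashmap append that takes ownership of the string on success;
it is stubbed out here only so the example compiles):

#include <errno.h>
#include <stdlib.h>
#include <string.h>

static int store_path(char *path)
{
	(void)path;
	return -EINVAL;		/* simulate the failing append for illustration */
}

static int record_pinned_path(const char *fpath)
{
	char *path = strdup(fpath);
	int err;

	if (!path)
		return -ENOMEM;

	err = store_path(path);
	if (err) {
		free(path);	/* the missing release: without it, 'path' leaks on error */
		return err;
	}

	return 0;		/* success: the consumer keeps the duplicate */
}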
Fixes: 8f184732b60b ("bpftool: Switch to libbpf's hashmap for pinned paths of BPF objects") Signed-off-by: Miaoqian Lin Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20221206071906.806384-1-linmq006@gmail.com --- tools/bpf/bpftool/common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index c90b756945e3..620032042576 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -501,6 +501,7 @@ static int do_build_table_cb(const char *fpath, const struct stat *sb, if (err) { p_err("failed to append entry to hashmap for ID %u, path '%s': %s", pinned_info.id, path, strerror(errno)); + free(path); goto out_close; } -- cgit v1.2.3 From aa67961f3243dfff26c47769f87b4d94b07ec71f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 6 Dec 2022 11:35:54 -0800 Subject: selftests/bpf: Allow building bpf tests with CONFIG_XFRM_INTERFACE=[m|n] It is useful to use vmlinux.h in the xfrm_info test like other kfunc tests do. In particular, it is common for kfunc bpf prog that requires to use other core kernel structures in vmlinux.h Although vmlinux.h is preferred, it needs a ___local flavor of struct bpf_xfrm_info in order to build the bpf selftests when CONFIG_XFRM_INTERFACE=[m|n]. Cc: Eyal Birger Fixes: 90a3a05eb33f ("selftests/bpf: add xfrm_info tests") Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221206193554.1059757-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/xfrm_info.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/xfrm_info.c b/tools/testing/selftests/bpf/progs/xfrm_info.c index 3acedcdd962d..f6a501fbba2b 100644 --- a/tools/testing/selftests/bpf/progs/xfrm_info.c +++ b/tools/testing/selftests/bpf/progs/xfrm_info.c @@ -3,18 +3,23 @@ #include "bpf_tracing_net.h" #include +struct bpf_xfrm_info___local { + u32 if_id; + int link; +} __attribute__((preserve_access_index)); + __u32 req_if_id; __u32 resp_if_id; int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, - const struct bpf_xfrm_info *from) __ksym; + const struct bpf_xfrm_info___local *from) __ksym; int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, - struct bpf_xfrm_info *to) __ksym; + struct bpf_xfrm_info___local *to) __ksym; SEC("tc") int set_xfrm_info(struct __sk_buff *skb) { - struct bpf_xfrm_info info = { .if_id = req_if_id }; + struct bpf_xfrm_info___local info = { .if_id = req_if_id }; return bpf_skb_set_xfrm_info(skb, &info) ? TC_ACT_SHOT : TC_ACT_UNSPEC; } @@ -22,7 +27,7 @@ int set_xfrm_info(struct __sk_buff *skb) SEC("tc") int get_xfrm_info(struct __sk_buff *skb) { - struct bpf_xfrm_info info = {}; + struct bpf_xfrm_info___local info = {}; if (bpf_skb_get_xfrm_info(skb, &info) < 0) return TC_ACT_SHOT; -- cgit v1.2.3 From c21dc529baba16d6698a396eb997fee318300ee1 Mon Sep 17 00:00:00 2001 From: Timo Hunziker Date: Sat, 3 Dec 2022 12:37:46 +0000 Subject: libbpf: Parse usdt args without offset on x86 (e.g. 8@(%rsp)) Parse USDT arguments like "8@(%rsp)" on x86. These are emmited by SystemTap. The argument syntax is similar to the existing "memory dereference case" but the offset left out as it's zero (i.e. read the value from the address in the register). We treat it the same as the the "memory dereference case", but set the offset to 0. I've tested that this fixes the "unrecognized arg #N spec: 8@(%rsp).." error I've run into when attaching to a probe with such an argument. 
Attaching and reading the correct argument values works. Something similar might be needed for the other supported architectures. [0] Closes: https://github.com/libbpf/libbpf/issues/559 Signed-off-by: Timo Hunziker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221203123746.2160-1-timo.hunziker@eclipso.ch --- tools/lib/bpf/usdt.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index b8daae265f99..75b411fc2c77 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -1233,6 +1233,14 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec if (reg_off < 0) return reg_off; arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ ( %%%15[^)] ) %n", &arg_sz, reg_name, &len) == 2) { + /* Memory dereference case without offset, e.g., 8@(%rsp) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; } else if (sscanf(arg_str, " %d @ %%%15s %n", &arg_sz, reg_name, &len) == 2) { /* Register read case, e.g., -4@%eax */ arg->arg_type = USDT_ARG_REG; -- cgit v1.2.3 From d68ae4982cb773f3d738b5dc25f77f5c7550548a Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 5 Dec 2022 14:16:16 +0100 Subject: selftests/bpf: Install all required files to run selftests When installing the selftests using "make -C tools/testing/selftests install", we need to make sure all the required files to run the selftests are installed. Let's make sure this is the case. Signed-off-by: Daan De Meyer Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221205131618.1524337-2-daan.j.demeyer@gmail.com --- tools/testing/selftests/bpf/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6a0f043dc410..997b3bd10cbf 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -532,8 +532,10 @@ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ $(OUTPUT)/liburandom_read.so \ $(OUTPUT)/xdp_synproxy \ $(OUTPUT)/sign-file \ - ima_setup.sh verify_sig_setup.sh \ - $(wildcard progs/btf_dump_test_case_*.c) + ima_setup.sh \ + verify_sig_setup.sh \ + $(wildcard progs/btf_dump_test_case_*.c) \ + $(wildcard progs/*.bpf.o) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) -DENABLE_ATOMICS_TESTS $(eval $(call DEFINE_TEST_RUNNER,test_progs)) -- cgit v1.2.3 From efe7fadbd59ec4513272c722403226e47321311b Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 5 Dec 2022 14:16:17 +0100 Subject: selftests/bpf: Use "is not set" instead of "=n" "=n" is not valid kconfig syntax. Use "is not set" instead to indicate the option should be disabled. 
Signed-off-by: Daan De Meyer Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221205131618.1524337-3-daan.j.demeyer@gmail.com --- tools/testing/selftests/bpf/config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 3543c76cef56..be892ea76c32 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -8,7 +8,7 @@ CONFIG_BPF_LIRC_MODE2=y CONFIG_BPF_LSM=y CONFIG_BPF_STREAM_PARSER=y CONFIG_BPF_SYSCALL=y -CONFIG_BPF_UNPRIV_DEFAULT_OFF=n +# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set CONFIG_CGROUP_BPF=y CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_SHA256=y -- cgit v1.2.3 From d0c0b48c87274b7735f8c930a7c0d783fb46cfe9 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 5 Dec 2022 14:16:18 +0100 Subject: selftests/bpf: Use CONFIG_TEST_BPF=m instead of CONFIG_TEST_BPF=y CONFIG_TEST_BPF can only be a module, so let's indicate it as such in the selftests config. Signed-off-by: Daan De Meyer Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221205131618.1524337-4-daan.j.demeyer@gmail.com --- tools/testing/selftests/bpf/config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index be892ea76c32..612f699dc4f7 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -71,7 +71,7 @@ CONFIG_NF_NAT=y CONFIG_RC_CORE=y CONFIG_SECURITY=y CONFIG_SECURITYFS=y -CONFIG_TEST_BPF=y +CONFIG_TEST_BPF=m CONFIG_USERFAULTFD=y CONFIG_VXLAN=y CONFIG_XDP_SOCKETS=y -- cgit v1.2.3 From 156ed20d22ee68d470232d26ae6df2cefacac4a0 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 6 Dec 2022 15:05:38 -0600 Subject: bpf: Don't use rcu_users to refcount in task kfuncs A series of prior patches added some kfuncs that allow struct task_struct * objects to be used as kptrs. These kfuncs leveraged the 'refcount_t rcu_users' field of the task for performing refcounting. This field was used instead of 'refcount_t usage', as we wanted to leverage the safety provided by RCU for ensuring a task's lifetime. A struct task_struct is refcounted by two different refcount_t fields: 1. p->usage: The "true" refcount field which task lifetime. The task is freed as soon as this refcount drops to 0. 2. p->rcu_users: An "RCU users" refcount field which is statically initialized to 2, and is co-located in a union with a struct rcu_head field (p->rcu). p->rcu_users essentially encapsulates a single p->usage refcount, and when p->rcu_users goes to 0, an RCU callback is scheduled on the struct rcu_head which decrements the p->usage refcount. Our logic was that by using p->rcu_users, we would be able to use RCU to safely issue refcount_inc_not_zero() a task's rcu_users field to determine if a task could still be acquired, or was exiting. Unfortunately, this does not work due to p->rcu_users and p->rcu sharing a union. When p->rcu_users goes to 0, an RCU callback is scheduled to drop a single p->usage refcount, and because the fields share a union, the refcount immediately becomes nonzero again after the callback is scheduled. If we were to split the fields out of the union, this wouldn't be a problem. Doing so should also be rather non-controversial, as there are a number of places in struct task_struct that have padding which we could use to avoid growing the structure by splitting up the fields. 
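A minimal model of the layout being described (hypothetical stand-in types, not the real
struct task_struct; it only illustrates why the two counters interact badly):

struct rcu_head_model {
	void *next;
	void (*func)(struct rcu_head_model *head);
};

struct task_model {
	int usage;				/* 1. the "true" lifetime refcount */
	union {
		int rcu_users;			/* 2. starts at 2; the last put schedules the RCU callback */
		struct rcu_head_model rcu;	/*    ...which reuses these same bytes as callback linkage, */
	};					/*    so rcu_users can read as non-zero again after hitting 0 */
};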
For now, so as to fix the kfuncs to be correct, this patch instead updates bpf_task_acquire() and bpf_task_release() to use the p->usage field for refcounting via the get_task_struct() and put_task_struct() functions. Because we can no longer rely on RCU, the change also guts the bpf_task_acquire_not_zero() and bpf_task_kptr_get() functions pending a resolution on the above problem. In addition, the task fixes the kfunc and rcu_read_lock selftests to expect this new behavior. Fixes: 90660309b0c7 ("bpf: Add kfuncs for storing struct task_struct * as a kptr") Fixes: fca1aa75518c ("bpf: Handle MEM_RCU type properly") Reported-by: Matus Jokay Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20221206210538.597606-1-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 76 ++++++++++++++-------- tools/testing/selftests/bpf/progs/rcu_read_lock.c | 5 ++ .../selftests/bpf/progs/task_kfunc_success.c | 9 ++- 3 files changed, 60 insertions(+), 30 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index cca642358e80..284b3ffdbe48 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1833,8 +1833,7 @@ struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) */ struct task_struct *bpf_task_acquire(struct task_struct *p) { - refcount_inc(&p->rcu_users); - return p; + return get_task_struct(p); } /** @@ -1845,9 +1844,48 @@ struct task_struct *bpf_task_acquire(struct task_struct *p) */ struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) { - if (!refcount_inc_not_zero(&p->rcu_users)) - return NULL; - return p; + /* For the time being this function returns NULL, as it's not currently + * possible to safely acquire a reference to a task with RCU protection + * using get_task_struct() and put_task_struct(). This is due to the + * slightly odd mechanics of p->rcu_users, and how task RCU protection + * works. + * + * A struct task_struct is refcounted by two different refcount_t + * fields: + * + * 1. p->usage: The "true" refcount field which tracks a task's + * lifetime. The task is freed as soon as this + * refcount drops to 0. + * + * 2. p->rcu_users: An "RCU users" refcount field which is statically + * initialized to 2, and is co-located in a union with + * a struct rcu_head field (p->rcu). p->rcu_users + * essentially encapsulates a single p->usage + * refcount, and when p->rcu_users goes to 0, an RCU + * callback is scheduled on the struct rcu_head which + * decrements the p->usage refcount. + * + * There are two important implications to this task refcounting logic + * described above. The first is that + * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as + * after the refcount goes to 0, the RCU callback being scheduled will + * cause the memory backing the refcount to again be nonzero due to the + * fields sharing a union. The other is that we can't rely on RCU to + * guarantee that a task is valid in a BPF program. This is because a + * task could have already transitioned to being in the TASK_DEAD + * state, had its rcu_users refcount go to 0, and its rcu callback + * invoked in which it drops its single p->usage reference. At this + * point the task will be freed as soon as the last p->usage reference + * goes to 0, without waiting for another RCU gp to elapse. The only + * way that a BPF program can guarantee that a task is valid is in this + * scenario is to hold a p->usage refcount itself. 
+ * + * Until we're able to resolve this issue, either by pulling + * p->rcu_users and p->rcu out of the union, or by getting rid of + * p->usage and just using p->rcu_users for refcounting, we'll just + * return NULL here. + */ + return NULL; } /** @@ -1858,33 +1896,15 @@ struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) */ struct task_struct *bpf_task_kptr_get(struct task_struct **pp) { - struct task_struct *p; - - rcu_read_lock(); - p = READ_ONCE(*pp); - - /* Another context could remove the task from the map and release it at - * any time, including after we've done the lookup above. This is safe - * because we're in an RCU read region, so the task is guaranteed to - * remain valid until at least the rcu_read_unlock() below. + /* We must return NULL here until we have clarity on how to properly + * leverage RCU for ensuring a task's lifetime. See the comment above + * in bpf_task_acquire_not_zero() for more details. */ - if (p && !refcount_inc_not_zero(&p->rcu_users)) - /* If the task had been removed from the map and freed as - * described above, refcount_inc_not_zero() will return false. - * The task will be freed at some point after the current RCU - * gp has ended, so just return NULL to the user. - */ - p = NULL; - rcu_read_unlock(); - - return p; + return NULL; } /** * bpf_task_release - Release the reference acquired on a struct task_struct *. - * If this kfunc is invoked in an RCU read region, the task_struct is - * guaranteed to not be freed until the current grace period has ended, even if - * its refcount drops to 0. * @p: The task on which a reference is being released. */ void bpf_task_release(struct task_struct *p) @@ -1892,7 +1912,7 @@ void bpf_task_release(struct task_struct *p) if (!p) return; - put_task_struct_rcu_user(p); + put_task_struct(p); } #ifdef CONFIG_CGROUPS diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index cf06a34fcb02..125f908024d3 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -161,6 +161,11 @@ int task_acquire(void *ctx) /* acquire a reference which can be used outside rcu read lock region */ gparent = bpf_task_acquire_not_zero(gparent); if (!gparent) + /* Until we resolve the issues with using task->rcu_users, we + * expect bpf_task_acquire_not_zero() to return a NULL task. + * See the comment at the definition of + * bpf_task_acquire_not_zero() for more details. + */ goto out; (void)bpf_task_storage_get(&map_a, gparent, 0, 0); diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c index 60c7ead41cfc..9f359cfd29e7 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c @@ -123,12 +123,17 @@ int BPF_PROG(test_task_get_release, struct task_struct *task, u64 clone_flags) } kptr = bpf_task_kptr_get(&v->task); - if (!kptr) { + if (kptr) { + /* Until we resolve the issues with using task->rcu_users, we + * expect bpf_task_kptr_get() to return a NULL task. See the + * comment at the definition of bpf_task_acquire_not_zero() for + * more details. 
+ */ + bpf_task_release(kptr); err = 3; return 0; } - bpf_task_release(kptr); return 0; } -- cgit v1.2.3 From d8939cb0a03ce7e4e69f65bbd31b79fe42f7d5e6 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 6 Dec 2022 15:09:48 -0800 Subject: bpf: Loosen alloc obj test in verifier's reg_btf_record btf->struct_meta_tab is populated by btf_parse_struct_metas in btf.c. There, a BTF record is created for any type containing a spin_lock or any next-gen datastructure node/head. Currently, for non-MAP_VALUE types, reg_btf_record will only search for a record using struct_meta_tab if the reg->type exactly matches (PTR_TO_BTF_ID | MEM_ALLOC). This exact match is too strict: an "allocated obj" type - returned from bpf_obj_new - might pick up other flags while working its way through the program. Loosen the check to be exact for base_type and just use MEM_ALLOC mask for type_flag. This patch is marked Fixes as the original intent of reg_btf_record was unlikely to have been to fail finding btf_record for valid alloc obj types with additional flags, some of which (e.g. PTR_UNTRUSTED) are valid register type states for alloc obj independent of this series. However, I didn't find a specific broken repro case outside of this series' added functionality, so it's possible that nothing was triggering this logic error before. Signed-off-by: Dave Marchevsky cc: Kumar Kartikeya Dwivedi Fixes: 4e814da0d599 ("bpf: Allow locking bpf_spin_lock in allocated objects") Link: https://lore.kernel.org/r/20221206231000.3180914-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1d51bd9596da..67a13110bc22 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -451,6 +451,11 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON; } +static bool type_is_ptr_alloc_obj(u32 type) +{ + return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; +} + static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) { struct btf_record *rec = NULL; @@ -458,7 +463,7 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) if (reg->type == PTR_TO_MAP_VALUE) { rec = reg->map_ptr->record; - } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) { + } else if (type_is_ptr_alloc_obj(reg->type)) { meta = btf_find_struct_meta(reg->btf, reg->btf_id); if (meta) rec = meta->record; -- cgit v1.2.3 From bffdeaa8a5af7200b0e74c9d5a41167f86626a36 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 6 Dec 2022 15:33:43 -0800 Subject: bpf: decouple prune and jump points BPF verifier marks some instructions as prune points. Currently these prune points serve two purposes. It's a point where verifier tries to find previously verified state and check current state's equivalence to short circuit verification for current code path. But also currently it's a point where jump history, used for precision backtracking, is updated. This is done so that non-linear flow of execution could be properly backtracked. Such coupling is coincidental and unnecessary. Some prune points are not part of some non-linear jump path, so don't need update of jump history. On the other hand, not all instructions which have to be recorded in jump history necessarily are good prune points. This patch splits prune and jump points into independent flags. 
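In other words, after this patch each instruction carries two independent marks. A simplified
stand-in for the two new bits is shown here (named differently to make clear it is only a
model; the real flags live in bpf_insn_aux_data, as the diff below shows):

#include <stdbool.h>

struct insn_marks {
	bool prune_point;	/* run state-equivalence (pruning) checks at this insn */
	bool jmp_point;		/* push a jmp_history entry here for precision backtracking */
};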
Currently all prune points are marked as jump points to minimize amount of changes in this patch, but next patch will perform some optimization of prune vs jmp point placement. No functional changes are intended. Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221206233345.438540-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++++++++++----------- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c07b351a5bc7..70d06a99f0b8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -452,6 +452,7 @@ struct bpf_insn_aux_data { /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ bool prune_point; + bool jmp_point; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 67a13110bc22..cf580e4c00f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2530,6 +2530,16 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return 0; } +static void mark_jmp_point(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].jmp_point = true; +} + +static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].jmp_point; +} + /* for any branch, call, exit record the history of jmps in the given state */ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) @@ -2538,6 +2548,9 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_idx_pair *p; size_t alloc_size; + if (!is_jmp_point(env, env->insn_idx)) + return 0; + cnt++; alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); p = krealloc(cur->jmp_history, alloc_size, GFP_USER); @@ -12140,11 +12153,16 @@ static struct bpf_verifier_state_list **explored_state( return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; } -static void init_explored_state(struct bpf_verifier_env *env, int idx) +static void mark_prune_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].prune_point = true; } +static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].prune_point; +} + enum { DONE_EXPLORING = 0, KEEP_EXPLORING = 1, @@ -12173,9 +12191,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, return -EINVAL; } - if (e == BRANCH) + if (e == BRANCH) { /* mark branch target for state pruning */ - init_explored_state(env, w); + mark_prune_point(env, w); + mark_jmp_point(env, w); + } if (insn_state[w] == 0) { /* tree-edge */ @@ -12213,10 +12233,13 @@ static int visit_func_call_insn(int t, int insn_cnt, if (ret) return ret; - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); + if (t + 1 < insn_cnt) { + mark_prune_point(env, t + 1); + mark_jmp_point(env, t + 1); + } if (visit_callee) { - init_explored_state(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, /* It's ok to allow recursion from CFG point of * view. 
__check_func_call() will do the actual @@ -12250,13 +12273,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: - if (insns[t].imm == BPF_FUNC_timer_set_callback) + if (insns[t].imm == BPF_FUNC_timer_set_callback) { /* Mark this call insn to trigger is_state_visited() check * before call itself is processed by __check_func_call(). * Otherwise new async state will be pushed for further * exploration. */ - init_explored_state(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); + } return visit_func_call_insn(t, insn_cnt, insns, env, insns[t].src_reg == BPF_PSEUDO_CALL); @@ -12274,18 +12299,22 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) * but it's marked, since backtracking needs * to record jmp history in is_state_visited(). */ - init_explored_state(env, t + insns[t].off + 1); + mark_prune_point(env, t + insns[t].off + 1); + mark_jmp_point(env, t + insns[t].off + 1); /* tell verifier to check for equivalent states * after every call and jump */ - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); + if (t + 1 < insn_cnt) { + mark_prune_point(env, t + 1); + mark_jmp_point(env, t + 1); + } return ret; default: /* conditional jump with two edges */ - init_explored_state(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); ret = push_insn(t, t + 1, FALLTHROUGH, env, true); if (ret) return ret; @@ -13320,11 +13349,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) bool add_new_state = env->test_state_freq ? true : false; cur->last_insn_idx = env->prev_insn_idx; - if (!env->insn_aux_data[insn_idx].prune_point) + if (!is_prune_point(env, insn_idx)) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here */ - return 0; + return push_jmp_history(env, cur); /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 -- cgit v1.2.3 From a095f421057e22853022cb644b1589d0dd0e756e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 6 Dec 2022 15:33:44 -0800 Subject: bpf: mostly decouple jump history management from is_state_visited() Jump history updating and state equivalence checks are conceptually independent, so move push_jmp_history() out of is_state_visited(). Also make a decision whether to perform state equivalence checks or not one layer higher in do_check(), keeping is_state_visited() unconditionally performing state checks. push_jmp_history() should be performed after state checks. There is just one small non-uniformity. When is_state_visited() finds already validated equivalent state, it propagates precision marks to current state's parent chain. For this to work correctly, jump history has to be updated, so is_state_visited() is doing that internally. But if no equivalent verified state is found, jump history has to be updated in a newly cloned child state, so is_jmp_point() + push_jmp_history() is performed after is_state_visited() exited with zero result, which means "proceed with validation". This change has no functional changes. It's not strictly necessary, but feels right to decouple these two processes. 
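Condensed, the resulting order of operations in the main loop is the following (a sketch of
the do_check() hunk below, with logging omitted; not a literal copy):

	state->last_insn_idx = env->prev_insn_idx;

	if (is_prune_point(env, env->insn_idx)) {
		err = is_state_visited(env, env->insn_idx);
		if (err < 0)
			return err;
		if (err == 1)
			goto process_bpf_exit;	/* equivalent state found: prune this path */
	}

	if (is_jmp_point(env, env->insn_idx)) {
		err = push_jmp_history(env, state);	/* recorded after the state checks */
		if (err)
			return err;
	}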
Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221206233345.438540-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cf580e4c00f5..8d8315d9b18b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13348,13 +13348,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) int i, j, err, states_cnt = 0; bool add_new_state = env->test_state_freq ? true : false; - cur->last_insn_idx = env->prev_insn_idx; - if (!is_prune_point(env, insn_idx)) - /* this 'insn_idx' instruction wasn't marked, so we will not - * be doing state search here - */ - return push_jmp_history(env, cur); - /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 * Do not add new state for future pruning if the verifier hasn't seen @@ -13489,10 +13482,10 @@ next: env->max_states_per_insn = states_cnt; if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) - return push_jmp_history(env, cur); + return 0; if (!add_new_state) - return push_jmp_history(env, cur); + return 0; /* There were no equivalent states, remember the current one. * Technically the current state is not proven to be safe yet, @@ -13632,21 +13625,31 @@ static int do_check(struct bpf_verifier_env *env) return -E2BIG; } - err = is_state_visited(env, env->insn_idx); - if (err < 0) - return err; - if (err == 1) { - /* found equivalent state, can prune the search */ - if (env->log.level & BPF_LOG_LEVEL) { - if (do_print_state) - verbose(env, "\nfrom %d to %d%s: safe\n", - env->prev_insn_idx, env->insn_idx, - env->cur_state->speculative ? - " (speculative execution)" : ""); - else - verbose(env, "%d: safe\n", env->insn_idx); + state->last_insn_idx = env->prev_insn_idx; + + if (is_prune_point(env, env->insn_idx)) { + err = is_state_visited(env, env->insn_idx); + if (err < 0) + return err; + if (err == 1) { + /* found equivalent state, can prune the search */ + if (env->log.level & BPF_LOG_LEVEL) { + if (do_print_state) + verbose(env, "\nfrom %d to %d%s: safe\n", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); + else + verbose(env, "%d: safe\n", env->insn_idx); + } + goto process_bpf_exit; } - goto process_bpf_exit; + } + + if (is_jmp_point(env, env->insn_idx)) { + err = push_jmp_history(env, state); + if (err) + return err; } if (signal_pending(current)) -- cgit v1.2.3 From 618945fbed501b6e5865042068a51edfb2dda948 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 6 Dec 2022 15:33:45 -0800 Subject: bpf: remove unnecessary prune and jump points Don't mark some instructions as jump points when there are actually no jumps and instructions are just processed sequentially. Such case is handled naturally by precision backtracking logic without the need to update jump history. See get_prev_insn_idx(). It goes back linearly by one instruction, unless current top of jmp_history is pointing to current instruction. In such case we use `st->jmp_history[cnt - 1].prev_idx` to find instruction from which we jumped to the current instruction non-linearly. 
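For reference, get_prev_insn_idx() (which already exists in kernel/bpf/verifier.c and is not touched by this patch) works roughly like the following simplified sketch:

  static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, u32 *history)
  {
          u32 cnt = *history;

          /* if the top of jmp_history records that insn 'i' was reached via a
           * jump, follow the recorded prev_idx and pop that history entry ...
           */
          if (cnt && st->jmp_history[cnt - 1].idx == i) {
                  i = st->jmp_history[cnt - 1].prev_idx;
                  (*history)--;
          } else {
                  /* ... otherwise just step back linearly by one instruction */
                  i--;
          }
          return i;
  }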
Also remove both jump and prune point marking for the instruction right after an unconditional jump, as program flow can get to the instruction right after an unconditional jump only if there is a jump to that instruction from somewhere else in the program. In that case we'll mark that instruction as a prune/jump point because it is the destination of a jump. This change results in no difference in the number of instructions or states processed across Cilium and selftests programs. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20221206233345.438540-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d8315d9b18b..8c5f0adbbde3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12233,13 +12233,12 @@ static int visit_func_call_insn(int t, int insn_cnt, if (ret) return ret; - if (t + 1 < insn_cnt) { - mark_prune_point(env, t + 1); - mark_jmp_point(env, t + 1); - } + mark_prune_point(env, t + 1); + /* when we exit from subprog, we need to record non-linear history */ + mark_jmp_point(env, t + 1); + if (visit_callee) { mark_prune_point(env, t); - mark_jmp_point(env, t); ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, /* It's ok to allow recursion from CFG point of * view. __check_func_call() will do the actual @@ -12273,15 +12272,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: - if (insns[t].imm == BPF_FUNC_timer_set_callback) { - /* Mark this call insn to trigger is_state_visited() check - * before call itself is processed by __check_func_call(). - * Otherwise new async state will be pushed for further - * exploration. + if (insns[t].imm == BPF_FUNC_timer_set_callback) + /* Mark this call insn as a prune point to trigger + * is_state_visited() check before call itself is + * processed by __check_func_call(). Otherwise new + * async state will be pushed for further exploration. */ mark_prune_point(env, t); - mark_jmp_point(env, t); - } return visit_func_call_insn(t, insn_cnt, insns, env, insns[t].src_reg == BPF_PSEUDO_CALL); @@ -12295,26 +12292,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) if (ret) return ret; - /* unconditional jmp is not a good pruning point, - * but it's marked, since backtracking needs - * to record jmp history in is_state_visited(). - */ mark_prune_point(env, t + insns[t].off + 1); mark_jmp_point(env, t + insns[t].off + 1); - /* tell verifier to check for equivalent states - * after every call and jump - */ - if (t + 1 < insn_cnt) { - mark_prune_point(env, t + 1); - mark_jmp_point(env, t + 1); - } return ret; default: /* conditional jump with two edges */ mark_prune_point(env, t); - mark_jmp_point(env, t); + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); if (ret) return ret; -- cgit v1.2.3 From e9b4aeed56699b469206d05e706ddf2db95700a9 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 3 Dec 2022 17:51:04 +0100 Subject: net: xsk: Don't include There is no need to include here. Prefer the less invasive which is needed for 'hlist_head'.
Signed-off-by: Christophe JAILLET Acked-by: John Fastabend Link: https://lore.kernel.org/r/88d6a1d88764cca328610854f890a9ca1f4b029e.1670086246.git.christophe.jaillet@wanadoo.fr Signed-off-by: Alexei Starovoitov --- include/net/netns/xdp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h index e5734261ba0a..21a4f25a187a 100644 --- a/include/net/netns/xdp.h +++ b/include/net/netns/xdp.h @@ -2,8 +2,8 @@ #ifndef __NETNS_XDP_H__ #define __NETNS_XDP_H__ -#include #include +#include struct netns_xdp { struct mutex lock; -- cgit v1.2.3 From 5b481acab4ce017fda8166fa9428511da41109e5 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Dec 2022 15:59:32 +0100 Subject: bpf: do not rely on ALLOW_ERROR_INJECTION for fmod_ret The current way of expressing that a non-bpf kernel component is willing to accept that bpf programs can be attached to it and that they can change the return value is to abuse ALLOW_ERROR_INJECTION. This is debated in the link below, and the result is that it is not a reasonable thing to do. Reuse the kfunc declaration structure to also tag the kernel functions we want to be fmodret. This way we can control from any subsystem which functions are being modified by bpf without touching the verifier. Link: https://lore.kernel.org/all/20221121104403.1545f9b5@gandalf.local.home/ Suggested-by: Alexei Starovoitov Signed-off-by: Benjamin Tissoires Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20221206145936.922196-2-benjamin.tissoires@redhat.com --- include/linux/btf.h | 2 ++ kernel/bpf/btf.c | 30 +++++++++++++++++++++++++----- kernel/bpf/verifier.c | 17 +++++++++++++++-- net/bpf/test_run.c | 14 +++++++++++--- 4 files changed, 53 insertions(+), 10 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index f9aababc5d78..6a0808e7845c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -412,8 +412,10 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); u32 *btf_kfunc_id_set_contains(const struct btf *btf, enum bpf_prog_type prog_type, u32 kfunc_btf_id); +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s); +int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset); s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 35c07afac924..c1df506293e4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -204,6 +204,7 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_STRUCT_OPS, BTF_KFUNC_HOOK_TRACING, BTF_KFUNC_HOOK_SYSCALL, + BTF_KFUNC_HOOK_FMODRET, BTF_KFUNC_HOOK_MAX, }; @@ -7448,11 +7449,14 @@ u32 *btf_kfunc_id_set_contains(const struct btf *btf, return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } -/* This function must be invoked only from initcalls/module init functions */ -int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, - const struct btf_kfunc_id_set *kset) +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id) +{ + return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id); +} + +static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, + const struct btf_kfunc_id_set *kset) { - enum btf_kfunc_hook hook; struct btf *btf; int ret; @@ -7471,13 +7475,29 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, if 
(IS_ERR(btf)) return PTR_ERR(btf); - hook = bpf_prog_type_to_kfunc_hook(prog_type); ret = btf_populate_kfunc_set(btf, hook, kset->set); btf_put(btf); return ret; } + +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *kset) +{ + enum btf_kfunc_hook hook; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + return __register_btf_kfunc_id_set(hook, kset); +} EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset) +{ + return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset); +} +EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set); + s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) { struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 225666307bba..ec2e6ec7309e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15021,12 +15021,22 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, ret = -EINVAL; switch (prog->type) { case BPF_PROG_TYPE_TRACING: - /* fentry/fexit/fmod_ret progs can be sleepable only if they are + + /* fentry/fexit/fmod_ret progs can be sleepable if they are * attached to ALLOW_ERROR_INJECTION and are not in denylist. */ if (!check_non_sleepable_error_inject(btf_id) && within_error_injection_list(addr)) ret = 0; + /* fentry/fexit/fmod_ret progs can also be sleepable if they are + * in the fmodret id set with the KF_SLEEPABLE flag. + */ + else { + u32 *flags = btf_kfunc_is_modify_return(btf, btf_id); + + if (flags && (*flags & KF_SLEEPABLE)) + ret = 0; + } break; case BPF_PROG_TYPE_LSM: /* LSM progs check that they are attached to bpf_lsm_*() funcs. 
@@ -15047,7 +15057,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "can't modify return codes of BPF programs\n"); return -EINVAL; } - ret = check_attach_modify_return(addr, tname); + ret = -EINVAL; + if (btf_kfunc_is_modify_return(btf, btf_id) || + !check_attach_modify_return(addr, tname)) + ret = 0; if (ret) { bpf_log(log, "%s() is not modifiable\n", tname); return ret; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 13d578ce2a09..5079fb089b5f 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -489,7 +489,6 @@ int noinline bpf_fentry_test1(int a) return a + 1; } EXPORT_SYMBOL_GPL(bpf_fentry_test1); -ALLOW_ERROR_INJECTION(bpf_fentry_test1, ERRNO); int noinline bpf_fentry_test2(int a, u64 b) { @@ -733,7 +732,15 @@ noinline void bpf_kfunc_call_test_destructive(void) __diag_pop(); -ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); +BTF_SET8_START(bpf_test_modify_return_ids) +BTF_ID_FLAGS(func, bpf_modify_return_test) +BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) +BTF_SET8_END(bpf_test_modify_return_ids) + +static const struct btf_kfunc_id_set bpf_test_modify_return_set = { + .owner = THIS_MODULE, + .set = &bpf_test_modify_return_ids, +}; BTF_SET8_START(test_sk_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -1668,7 +1675,8 @@ static int __init bpf_prog_test_run_init(void) }; int ret; - ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); + ret = register_btf_fmodret_id_set(&bpf_test_modify_return_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc, -- cgit v1.2.3 From dcb2288b1fd9a8cdf2f3b8c0c7b3763346ef515f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 7 Dec 2022 11:55:34 -0800 Subject: bpf: Remove unused insn_cnt argument from visit_[func_call_]insn() Number of total instructions in BPF program (including subprogs) can and is accessed from env->prog->len. visit_func_call_insn() doesn't do any checks against insn_cnt anymore, relying on push_insn() to do this check internally. So remove unnecessary insn_cnt input argument from visit_func_call_insn() and visit_insn() functions. 
Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20221207195534.2866030-1-andrii@kernel.org --- kernel/bpf/verifier.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1f151f996203..29e09ef15188 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12222,8 +12222,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, return DONE_EXPLORING; } -static int visit_func_call_insn(int t, int insn_cnt, - struct bpf_insn *insns, +static int visit_func_call_insn(int t, struct bpf_insn *insns, struct bpf_verifier_env *env, bool visit_callee) { @@ -12254,13 +12253,13 @@ static int visit_func_call_insn(int t, int insn_cnt, * DONE_EXPLORING - the instruction was fully explored * KEEP_EXPLORING - there is still work to be done before it is fully explored */ -static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) +static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int ret; if (bpf_pseudo_func(insns + t)) - return visit_func_call_insn(t, insn_cnt, insns, env, true); + return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insns[t].code) != BPF_JMP && @@ -12279,7 +12278,7 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) * async state will be pushed for further exploration. */ mark_prune_point(env, t); - return visit_func_call_insn(t, insn_cnt, insns, env, + return visit_func_call_insn(t, insns, env, insns[t].src_reg == BPF_PSEUDO_CALL); case BPF_JA: @@ -12336,7 +12335,7 @@ static int check_cfg(struct bpf_verifier_env *env) while (env->cfg.cur_stack > 0) { int t = insn_stack[env->cfg.cur_stack - 1]; - ret = visit_insn(t, insn_cnt, env); + ret = visit_insn(t, env); switch (ret) { case DONE_EXPLORING: insn_state[t] = EXPLORED; -- cgit v1.2.3 From 537c3f66eac137a02ec50a40219d2da6597e5dc9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 7 Dec 2022 12:16:47 -0800 Subject: selftests/bpf: add generic BPF program tester-loader It's become a common pattern to have a collection of small BPF programs in one BPF object file, each representing one test case. On the user-space side of such tests we maintain a table of program names and expected failure or success, along with an optional expected verifier log message. This works, but each set of tests reimplements this mundane code over and over again, which is a waste of time for anyone trying to add a new set of tests. Furthermore, it's quite error-prone as it's way too easy to miss some entries in these manually maintained test tables (as evidenced by the dynptr_fail tests, in which the ringbuf_release_uninit_dynptr subtest was accidentally missed; this is fixed in the next patch). So this patch implements a generic test_loader, which accepts a skeleton name and handles the rest of the details: it opens and loads the BPF object file, making sure each program is tested in isolation. Optionally, each test case can specify an expected BPF verifier log message. In case of failure, the tester makes sure to report the verifier log; in verbose mode, the verifier log is reported unconditionally.
Now, the interesting deviation from existing custom implementations is the use of the btf_decl_tag attribute to specify expected-to-fail vs expected-to-succeed markers and, optionally, the expected log message directly next to the BPF program source code, eliminating the need to manually create and update a table of tests. We define a few macros wrapping btf_decl_tag, with the convention that all btf_decl_tag values start with a "comment:" prefix, and then use a very simple "just_some_text_tag" or "some_key_name=" pattern to define things like expected success/failure, the expected verifier message, and an extra verifier log level (if necessary). This approach is demonstrated by the next patch, in which two existing sets of failure tests are converted. The tester supports both expected-to-fail and expected-to-succeed programs, though this patch set didn't convert any existing expected-to-succeed programs yet, as existing tests couple BPF program loading with their further execution through attach or test_prog_run. One way to allow testing scenarios like this would be the ability to specify a custom callback, executed for each successfully loaded BPF program. This is left for follow-up patches, after some more analysis of existing test cases. This test_loader is, hopefully, the start of a test_verifier-like runner integrated into the test_progs infrastructure. It will allow a much better "user experience" of defining low-level verification tests that can take advantage of all the libbpf-provided niceties on the BPF side: global variables, declarative maps, etc. All while having a choice of defining tests in C or as BPF assembly (through __attribute__((naked)) functions and embedded asm), depending on what makes most sense in each particular case. This will be explored in follow-up patches as well.
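To make the convention concrete, a converted test could look roughly like the sketch below (file, map, and program names are hypothetical, and the expected-message string is only illustrative; the __failure/__success/__msg annotations and RUN_TESTS() are the ones introduced by this patch):

  /* progs/loader_demo.c (hypothetical) */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>
  #include "bpf_misc.h"

  char _license[] SEC("license") = "GPL";

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, 1);
          __type(key, __u32);
          __type(value, __u64);
  } demo_map SEC(".maps");

  /* Expected to be rejected: the map lookup result is dereferenced without a
   * NULL check; the quoted substring must show up in the verifier log.
   */
  SEC("?raw_tp")
  __failure __msg("invalid mem access 'map_value_or_null'")
  int demo_missing_null_check(void *ctx)
  {
          __u32 key = 0;
          __u64 *val = bpf_map_lookup_elem(&demo_map, &key);

          return *val;
  }

  /* Expected to pass verification. */
  SEC("?raw_tp")
  __success
  int demo_ok(void *ctx)
  {
          return 0;
  }

On the user-space side, the hand-maintained table then collapses to a single RUN_TESTS(loader_demo); call in the corresponding prog_tests/ file, which opens the skeleton, loads each annotated program in isolation, and checks the expected outcome and log message.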
Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221207201648.2990661-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/progs/bpf_misc.h | 5 + tools/testing/selftests/bpf/test_loader.c | 233 +++++++++++++++++++++++++++ tools/testing/selftests/bpf/test_progs.h | 33 ++++ 4 files changed, 272 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/test_loader.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 997b3bd10cbf..c22c43bbee19 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -527,7 +527,7 @@ TRUNNER_BPF_PROGS_DIR := progs TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ network_helpers.c testing_helpers.c \ btf_helpers.c flow_dissector_load.h \ - cap_helpers.c + cap_helpers.c test_loader.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ $(OUTPUT)/liburandom_read.so \ $(OUTPUT)/xdp_synproxy \ diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 5bb11fe595a4..4a01ea9113bf 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -2,6 +2,11 @@ #ifndef __BPF_MISC_H__ #define __BPF_MISC_H__ +#define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" msg))) +#define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) +#define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) +#define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level="#lvl))) + #if defined(__TARGET_ARCH_x86) #define SYSCALL_WRAPPER 1 #define SYS_PREFIX "__x64_" diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c new file mode 100644 index 000000000000..679efb3aa785 --- /dev/null +++ b/tools/testing/selftests/bpf/test_loader.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include + +#define str_has_pfx(str, pfx) \ + (strncmp(str, pfx, __builtin_constant_p(pfx) ? 
sizeof(pfx) - 1 : strlen(pfx)) == 0) + +#define TEST_LOADER_LOG_BUF_SZ 1048576 + +#define TEST_TAG_EXPECT_FAILURE "comment:test_expect_failure" +#define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success" +#define TEST_TAG_EXPECT_MSG_PFX "comment:test_expect_msg=" +#define TEST_TAG_LOG_LEVEL_PFX "comment:test_log_level=" + +struct test_spec { + const char *name; + bool expect_failure; + const char *expect_msg; + int log_level; +}; + +static int tester_init(struct test_loader *tester) +{ + if (!tester->log_buf) { + tester->log_buf_sz = TEST_LOADER_LOG_BUF_SZ; + tester->log_buf = malloc(tester->log_buf_sz); + if (!ASSERT_OK_PTR(tester->log_buf, "tester_log_buf")) + return -ENOMEM; + } + + return 0; +} + +void test_loader_fini(struct test_loader *tester) +{ + if (!tester) + return; + + free(tester->log_buf); +} + +static int parse_test_spec(struct test_loader *tester, + struct bpf_object *obj, + struct bpf_program *prog, + struct test_spec *spec) +{ + struct btf *btf; + int func_id, i; + + memset(spec, 0, sizeof(*spec)); + + spec->name = bpf_program__name(prog); + + btf = bpf_object__btf(obj); + if (!btf) { + ASSERT_FAIL("BPF object has no BTF"); + return -EINVAL; + } + + func_id = btf__find_by_name_kind(btf, spec->name, BTF_KIND_FUNC); + if (func_id < 0) { + ASSERT_FAIL("failed to find FUNC BTF type for '%s'", spec->name); + return -EINVAL; + } + + for (i = 1; i < btf__type_cnt(btf); i++) { + const struct btf_type *t; + const char *s; + + t = btf__type_by_id(btf, i); + if (!btf_is_decl_tag(t)) + continue; + + if (t->type != func_id || btf_decl_tag(t)->component_idx != -1) + continue; + + s = btf__str_by_offset(btf, t->name_off); + if (strcmp(s, TEST_TAG_EXPECT_FAILURE) == 0) { + spec->expect_failure = true; + } else if (strcmp(s, TEST_TAG_EXPECT_SUCCESS) == 0) { + spec->expect_failure = false; + } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX)) { + spec->expect_msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX) - 1; + } else if (str_has_pfx(s, TEST_TAG_LOG_LEVEL_PFX)) { + errno = 0; + spec->log_level = strtol(s + sizeof(TEST_TAG_LOG_LEVEL_PFX) - 1, NULL, 0); + if (errno) { + ASSERT_FAIL("failed to parse test log level from '%s'", s); + return -EINVAL; + } + } + } + + return 0; +} + +static void prepare_case(struct test_loader *tester, + struct test_spec *spec, + struct bpf_object *obj, + struct bpf_program *prog) +{ + int min_log_level = 0; + + if (env.verbosity > VERBOSE_NONE) + min_log_level = 1; + if (env.verbosity > VERBOSE_VERY) + min_log_level = 2; + + bpf_program__set_log_buf(prog, tester->log_buf, tester->log_buf_sz); + + /* Make sure we set at least minimal log level, unless test requirest + * even higher level already. Make sure to preserve independent log + * level 4 (verifier stats), though. 
+ */ + if ((spec->log_level & 3) < min_log_level) + bpf_program__set_log_level(prog, (spec->log_level & 4) | min_log_level); + else + bpf_program__set_log_level(prog, spec->log_level); + + tester->log_buf[0] = '\0'; +} + +static void emit_verifier_log(const char *log_buf, bool force) +{ + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log_buf); +} + +static void validate_case(struct test_loader *tester, + struct test_spec *spec, + struct bpf_object *obj, + struct bpf_program *prog, + int load_err) +{ + if (spec->expect_msg) { + char *match; + + match = strstr(tester->log_buf, spec->expect_msg); + if (!ASSERT_OK_PTR(match, "expect_msg")) { + /* if we are in verbose mode, we've already emitted log */ + if (env.verbosity == VERBOSE_NONE) + emit_verifier_log(tester->log_buf, true /*force*/); + fprintf(stderr, "EXPECTED MSG: '%s'\n", spec->expect_msg); + return; + } + } +} + +/* this function is forced noinline and has short generic name to look better + * in test_progs output (in case of a failure) + */ +static noinline +void run_subtest(struct test_loader *tester, + const char *skel_name, + skel_elf_bytes_fn elf_bytes_factory) +{ + LIBBPF_OPTS(bpf_object_open_opts, open_opts, .object_name = skel_name); + struct bpf_object *obj = NULL, *tobj; + struct bpf_program *prog, *tprog; + const void *obj_bytes; + size_t obj_byte_cnt; + int err; + + if (tester_init(tester) < 0) + return; /* failed to initialize tester */ + + obj_bytes = elf_bytes_factory(&obj_byte_cnt); + obj = bpf_object__open_mem(obj_bytes, obj_byte_cnt, &open_opts); + if (!ASSERT_OK_PTR(obj, "obj_open_mem")) + return; + + bpf_object__for_each_program(prog, obj) { + const char *prog_name = bpf_program__name(prog); + struct test_spec spec; + + if (!test__start_subtest(prog_name)) + continue; + + /* if we can't derive test specification, go to the next test */ + err = parse_test_spec(tester, obj, prog, &spec); + if (!ASSERT_OK(err, "parse_test_spec")) + continue; + + tobj = bpf_object__open_mem(obj_bytes, obj_byte_cnt, &open_opts); + if (!ASSERT_OK_PTR(tobj, "obj_open_mem")) /* shouldn't happen */ + continue; + + bpf_object__for_each_program(tprog, tobj) + bpf_program__set_autoload(tprog, false); + + bpf_object__for_each_program(tprog, tobj) { + /* only load specified program */ + if (strcmp(bpf_program__name(tprog), prog_name) == 0) { + bpf_program__set_autoload(tprog, true); + break; + } + } + + prepare_case(tester, &spec, tobj, tprog); + + err = bpf_object__load(tobj); + if (spec.expect_failure) { + if (!ASSERT_ERR(err, "unexpected_load_success")) { + emit_verifier_log(tester->log_buf, false /*force*/); + goto tobj_cleanup; + } + } else { + if (!ASSERT_OK(err, "unexpected_load_failure")) { + emit_verifier_log(tester->log_buf, true /*force*/); + goto tobj_cleanup; + } + } + + emit_verifier_log(tester->log_buf, false /*force*/); + validate_case(tester, &spec, tobj, tprog, err); + +tobj_cleanup: + bpf_object__close(tobj); + } + + bpf_object__close(obj); +} + +void test_loader__run_subtests(struct test_loader *tester, + const char *skel_name, + skel_elf_bytes_fn elf_bytes_factory) +{ + /* see comment in run_subtest() for why we do this function nesting */ + run_subtest(tester, skel_name, elf_bytes_factory); +} diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index b090996daee5..3f058dfadbaf 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -1,4 +1,7 @@ /* 
SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TEST_PROGS_H +#define __TEST_PROGS_H + #include #include #include @@ -210,6 +213,12 @@ int test__join_cgroup(const char *path); #define CHECK_ATTR(condition, tag, format...) \ _CHECK(condition, tag, tattr.duration, format) +#define ASSERT_FAIL(fmt, args...) ({ \ + static int duration = 0; \ + CHECK(false, "", fmt"\n", ##args); \ + false; \ +}) + #define ASSERT_TRUE(actual, name) ({ \ static int duration = 0; \ bool ___ok = (actual); \ @@ -397,3 +406,27 @@ int write_sysctl(const char *sysctl, const char *value); #endif #define BPF_TESTMOD_TEST_FILE "/sys/kernel/bpf_testmod" + +struct test_loader { + char *log_buf; + size_t log_buf_sz; + + struct bpf_object *obj; +}; + +typedef const void *(*skel_elf_bytes_fn)(size_t *sz); + +extern void test_loader__run_subtests(struct test_loader *tester, + const char *skel_name, + skel_elf_bytes_fn elf_bytes_factory); + +extern void test_loader_fini(struct test_loader *tester); + +#define RUN_TESTS(skel) ({ \ + struct test_loader tester = {}; \ + \ + test_loader__run_subtests(&tester, #skel, skel##__elf_bytes); \ + test_loader_fini(&tester); \ +}) + +#endif /* __TEST_PROGS_H */ -- cgit v1.2.3 From 26c386ecf0212affb50f02dabcb0152995b99b07 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 7 Dec 2022 12:16:48 -0800 Subject: selftests/bpf: convert dynptr_fail and map_kptr_fail subtests to generic tester Convert big chunks of dynptr and map_kptr subtests to use generic verification_tester. They are switched from using manually maintained tables of test cases, specifying program name and expected error verifier message, to btf_decl_tag-based annotations directly on corresponding BPF programs: __failure to specify that BPF program is expected to fail verification, and __msg() to specify expected log message. 
Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207201648.2990661-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/dynptr.c | 80 +--------------------- tools/testing/selftests/bpf/prog_tests/map_kptr.c | 80 +--------------------- tools/testing/selftests/bpf/progs/dynptr_fail.c | 31 +++++++++ tools/testing/selftests/bpf/progs/dynptr_success.c | 1 + tools/testing/selftests/bpf/progs/map_kptr_fail.c | 27 ++++++++ 5 files changed, 64 insertions(+), 155 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index b0c06f821cb8..7faaf6d9e0d4 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -5,86 +5,16 @@ #include "dynptr_fail.skel.h" #include "dynptr_success.skel.h" -static size_t log_buf_sz = 1048576; /* 1 MB */ -static char obj_log_buf[1048576]; - static struct { const char *prog_name; const char *expected_err_msg; } dynptr_tests[] = { - /* failure cases */ - {"ringbuf_missing_release1", "Unreleased reference id=1"}, - {"ringbuf_missing_release2", "Unreleased reference id=2"}, - {"ringbuf_missing_release_callback", "Unreleased reference id"}, - {"use_after_invalid", "Expected an initialized dynptr as arg #3"}, - {"ringbuf_invalid_api", "type=mem expected=ringbuf_mem"}, - {"add_dynptr_to_map1", "invalid indirect read from stack"}, - {"add_dynptr_to_map2", "invalid indirect read from stack"}, - {"data_slice_out_of_bounds_ringbuf", "value is outside of the allowed memory range"}, - {"data_slice_out_of_bounds_map_value", "value is outside of the allowed memory range"}, - {"data_slice_use_after_release1", "invalid mem access 'scalar'"}, - {"data_slice_use_after_release2", "invalid mem access 'scalar'"}, - {"data_slice_missing_null_check1", "invalid mem access 'mem_or_null'"}, - {"data_slice_missing_null_check2", "invalid mem access 'mem_or_null'"}, - {"invalid_helper1", "invalid indirect read from stack"}, - {"invalid_helper2", "Expected an initialized dynptr as arg #3"}, - {"invalid_write1", "Expected an initialized dynptr as arg #1"}, - {"invalid_write2", "Expected an initialized dynptr as arg #3"}, - {"invalid_write3", "Expected an initialized dynptr as arg #1"}, - {"invalid_write4", "arg 1 is an unacquired reference"}, - {"invalid_read1", "invalid read from stack"}, - {"invalid_read2", "cannot pass in dynptr at an offset"}, - {"invalid_read3", "invalid read from stack"}, - {"invalid_read4", "invalid read from stack"}, - {"invalid_offset", "invalid write to stack"}, - {"global", "type=map_value expected=fp"}, - {"release_twice", "arg 1 is an unacquired reference"}, - {"release_twice_callback", "arg 1 is an unacquired reference"}, - {"dynptr_from_mem_invalid_api", - "Unsupported reg type fp for bpf_dynptr_from_mem data"}, - /* success cases */ {"test_read_write", NULL}, {"test_data_slice", NULL}, {"test_ringbuf", NULL}, }; -static void verify_fail(const char *prog_name, const char *expected_err_msg) -{ - LIBBPF_OPTS(bpf_object_open_opts, opts); - struct bpf_program *prog; - struct dynptr_fail *skel; - int err; - - opts.kernel_log_buf = obj_log_buf; - opts.kernel_log_size = log_buf_sz; - opts.kernel_log_level = 1; - - skel = dynptr_fail__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "dynptr_fail__open_opts")) - goto cleanup; - - prog = bpf_object__find_program_by_name(skel->obj, prog_name); - if (!ASSERT_OK_PTR(prog, 
"bpf_object__find_program_by_name")) - goto cleanup; - - bpf_program__set_autoload(prog, true); - - bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize()); - - err = dynptr_fail__load(skel); - if (!ASSERT_ERR(err, "unexpected load success")) - goto cleanup; - - if (!ASSERT_OK_PTR(strstr(obj_log_buf, expected_err_msg), "expected_err_msg")) { - fprintf(stderr, "Expected err_msg: %s\n", expected_err_msg); - fprintf(stderr, "Verifier output: %s\n", obj_log_buf); - } - -cleanup: - dynptr_fail__destroy(skel); -} - static void verify_success(const char *prog_name) { struct dynptr_success *skel; @@ -97,8 +27,6 @@ static void verify_success(const char *prog_name) skel->bss->pid = getpid(); - bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize()); - dynptr_success__load(skel); if (!ASSERT_OK_PTR(skel, "dynptr_success__load")) goto cleanup; @@ -129,10 +57,8 @@ void test_dynptr(void) if (!test__start_subtest(dynptr_tests[i].prog_name)) continue; - if (dynptr_tests[i].expected_err_msg) - verify_fail(dynptr_tests[i].prog_name, - dynptr_tests[i].expected_err_msg); - else - verify_success(dynptr_tests[i].prog_name); + verify_success(dynptr_tests[i].prog_name); } + + RUN_TESTS(dynptr_fail); } diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c index 0d66b1524208..3533a4ecad01 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c @@ -5,83 +5,6 @@ #include "map_kptr.skel.h" #include "map_kptr_fail.skel.h" -static char log_buf[1024 * 1024]; - -struct { - const char *prog_name; - const char *err_msg; -} map_kptr_fail_tests[] = { - { "size_not_bpf_dw", "kptr access size must be BPF_DW" }, - { "non_const_var_off", "kptr access cannot have variable offset" }, - { "non_const_var_off_kptr_xchg", "R1 doesn't have constant offset. 
kptr has to be" }, - { "misaligned_access_write", "kptr access misaligned expected=8 off=7" }, - { "misaligned_access_read", "kptr access misaligned expected=8 off=1" }, - { "reject_var_off_store", "variable untrusted_ptr_ access var_off=(0x0; 0x1e0)" }, - { "reject_bad_type_match", "invalid kptr access, R1 type=untrusted_ptr_prog_test_ref_kfunc" }, - { "marked_as_untrusted_or_null", "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_" }, - { "correct_btf_id_check_size", "access beyond struct prog_test_ref_kfunc at off 32 size 4" }, - { "inherit_untrusted_on_walk", "R1 type=untrusted_ptr_ expected=percpu_ptr_" }, - { "reject_kptr_xchg_on_unref", "off=8 kptr isn't referenced kptr" }, - { "reject_kptr_get_no_map_val", "arg#0 expected pointer to map value" }, - { "reject_kptr_get_no_null_map_val", "arg#0 expected pointer to map value" }, - { "reject_kptr_get_no_kptr", "arg#0 no referenced kptr at map value offset=0" }, - { "reject_kptr_get_on_unref", "arg#0 no referenced kptr at map value offset=8" }, - { "reject_kptr_get_bad_type_match", "kernel function bpf_kfunc_call_test_kptr_get args#0" }, - { "mark_ref_as_untrusted_or_null", "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_" }, - { "reject_untrusted_store_to_ref", "store to referenced kptr disallowed" }, - { "reject_bad_type_xchg", "invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member" }, - { "reject_untrusted_xchg", "R2 type=untrusted_ptr_ expected=ptr_" }, - { "reject_member_of_ref_xchg", "invalid kptr access, R2 type=ptr_prog_test_ref_kfunc" }, - { "reject_indirect_helper_access", "kptr cannot be accessed indirectly by helper" }, - { "reject_indirect_global_func_access", "kptr cannot be accessed indirectly by helper" }, - { "kptr_xchg_ref_state", "Unreleased reference id=5 alloc_insn=" }, - { "kptr_get_ref_state", "Unreleased reference id=3 alloc_insn=" }, -}; - -static void test_map_kptr_fail_prog(const char *prog_name, const char *err_msg) -{ - LIBBPF_OPTS(bpf_object_open_opts, opts, .kernel_log_buf = log_buf, - .kernel_log_size = sizeof(log_buf), - .kernel_log_level = 1); - struct map_kptr_fail *skel; - struct bpf_program *prog; - int ret; - - skel = map_kptr_fail__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "map_kptr_fail__open_opts")) - return; - - prog = bpf_object__find_program_by_name(skel->obj, prog_name); - if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) - goto end; - - bpf_program__set_autoload(prog, true); - - ret = map_kptr_fail__load(skel); - if (!ASSERT_ERR(ret, "map_kptr__load must fail")) - goto end; - - if (!ASSERT_OK_PTR(strstr(log_buf, err_msg), "expected error message")) { - fprintf(stderr, "Expected: %s\n", err_msg); - fprintf(stderr, "Verifier: %s\n", log_buf); - } - -end: - map_kptr_fail__destroy(skel); -} - -static void test_map_kptr_fail(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(map_kptr_fail_tests); i++) { - if (!test__start_subtest(map_kptr_fail_tests[i].prog_name)) - continue; - test_map_kptr_fail_prog(map_kptr_fail_tests[i].prog_name, - map_kptr_fail_tests[i].err_msg); - } -} - static void test_map_kptr_success(bool test_run) { LIBBPF_OPTS(bpf_test_run_opts, opts, @@ -145,5 +68,6 @@ void test_map_kptr(void) */ test_map_kptr_success(true); } - test_map_kptr_fail(); + + RUN_TESTS(map_kptr_fail); } diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index b0f08ff024fb..78debc1b3820 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c 
@@ -43,6 +43,7 @@ struct sample { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 4096); } ringbuf SEC(".maps"); int err, val; @@ -66,6 +67,7 @@ static int get_map_val_dynptr(struct bpf_dynptr *ptr) * bpf_ringbuf_submit/discard_dynptr call */ SEC("?raw_tp") +__failure __msg("Unreleased reference id=1") int ringbuf_missing_release1(void *ctx) { struct bpf_dynptr ptr; @@ -78,6 +80,7 @@ int ringbuf_missing_release1(void *ctx) } SEC("?raw_tp") +__failure __msg("Unreleased reference id=2") int ringbuf_missing_release2(void *ctx) { struct bpf_dynptr ptr1, ptr2; @@ -113,6 +116,7 @@ static int missing_release_callback_fn(__u32 index, void *data) /* Any dynptr initialized within a callback must have bpf_dynptr_put called */ SEC("?raw_tp") +__failure __msg("Unreleased reference id") int ringbuf_missing_release_callback(void *ctx) { bpf_loop(10, missing_release_callback_fn, NULL, 0); @@ -121,6 +125,7 @@ int ringbuf_missing_release_callback(void *ctx) /* Can't call bpf_ringbuf_submit/discard_dynptr on a non-initialized dynptr */ SEC("?raw_tp") +__failure __msg("arg 1 is an unacquired reference") int ringbuf_release_uninit_dynptr(void *ctx) { struct bpf_dynptr ptr; @@ -133,6 +138,7 @@ int ringbuf_release_uninit_dynptr(void *ctx) /* A dynptr can't be used after it has been invalidated */ SEC("?raw_tp") +__failure __msg("Expected an initialized dynptr as arg #3") int use_after_invalid(void *ctx) { struct bpf_dynptr ptr; @@ -152,6 +158,7 @@ int use_after_invalid(void *ctx) /* Can't call non-dynptr ringbuf APIs on a dynptr ringbuf sample */ SEC("?raw_tp") +__failure __msg("type=mem expected=ringbuf_mem") int ringbuf_invalid_api(void *ctx) { struct bpf_dynptr ptr; @@ -174,6 +181,7 @@ done: /* Can't add a dynptr to a map */ SEC("?raw_tp") +__failure __msg("invalid indirect read from stack") int add_dynptr_to_map1(void *ctx) { struct bpf_dynptr ptr; @@ -191,6 +199,7 @@ int add_dynptr_to_map1(void *ctx) /* Can't add a struct with an embedded dynptr to a map */ SEC("?raw_tp") +__failure __msg("invalid indirect read from stack") int add_dynptr_to_map2(void *ctx) { struct test_info x; @@ -208,6 +217,7 @@ int add_dynptr_to_map2(void *ctx) /* A data slice can't be accessed out of bounds */ SEC("?raw_tp") +__failure __msg("value is outside of the allowed memory range") int data_slice_out_of_bounds_ringbuf(void *ctx) { struct bpf_dynptr ptr; @@ -228,6 +238,7 @@ done: } SEC("?raw_tp") +__failure __msg("value is outside of the allowed memory range") int data_slice_out_of_bounds_map_value(void *ctx) { __u32 key = 0, map_val; @@ -248,6 +259,7 @@ int data_slice_out_of_bounds_map_value(void *ctx) /* A data slice can't be used after it has been released */ SEC("?raw_tp") +__failure __msg("invalid mem access 'scalar'") int data_slice_use_after_release1(void *ctx) { struct bpf_dynptr ptr; @@ -279,6 +291,7 @@ done: * ptr2 is at fp - 16). 
*/ SEC("?raw_tp") +__failure __msg("invalid mem access 'scalar'") int data_slice_use_after_release2(void *ctx) { struct bpf_dynptr ptr1, ptr2; @@ -310,6 +323,7 @@ done: /* A data slice must be first checked for NULL */ SEC("?raw_tp") +__failure __msg("invalid mem access 'mem_or_null'") int data_slice_missing_null_check1(void *ctx) { struct bpf_dynptr ptr; @@ -330,6 +344,7 @@ int data_slice_missing_null_check1(void *ctx) /* A data slice can't be dereferenced if it wasn't checked for null */ SEC("?raw_tp") +__failure __msg("invalid mem access 'mem_or_null'") int data_slice_missing_null_check2(void *ctx) { struct bpf_dynptr ptr; @@ -352,6 +367,7 @@ done: * dynptr argument */ SEC("?raw_tp") +__failure __msg("invalid indirect read from stack") int invalid_helper1(void *ctx) { struct bpf_dynptr ptr; @@ -366,6 +382,7 @@ int invalid_helper1(void *ctx) /* A dynptr can't be passed into a helper function at a non-zero offset */ SEC("?raw_tp") +__failure __msg("Expected an initialized dynptr as arg #3") int invalid_helper2(void *ctx) { struct bpf_dynptr ptr; @@ -381,6 +398,7 @@ int invalid_helper2(void *ctx) /* A bpf_dynptr is invalidated if it's been written into */ SEC("?raw_tp") +__failure __msg("Expected an initialized dynptr as arg #1") int invalid_write1(void *ctx) { struct bpf_dynptr ptr; @@ -402,6 +420,7 @@ int invalid_write1(void *ctx) * offset */ SEC("?raw_tp") +__failure __msg("Expected an initialized dynptr as arg #3") int invalid_write2(void *ctx) { struct bpf_dynptr ptr; @@ -425,6 +444,7 @@ int invalid_write2(void *ctx) * non-const offset */ SEC("?raw_tp") +__failure __msg("Expected an initialized dynptr as arg #1") int invalid_write3(void *ctx) { struct bpf_dynptr ptr; @@ -456,6 +476,7 @@ static int invalid_write4_callback(__u32 index, void *data) * be invalidated as a dynptr */ SEC("?raw_tp") +__failure __msg("arg 1 is an unacquired reference") int invalid_write4(void *ctx) { struct bpf_dynptr ptr; @@ -472,7 +493,9 @@ int invalid_write4(void *ctx) /* A globally-defined bpf_dynptr can't be used (it must reside as a stack frame) */ struct bpf_dynptr global_dynptr; + SEC("?raw_tp") +__failure __msg("type=map_value expected=fp") int global(void *ctx) { /* this should fail */ @@ -485,6 +508,7 @@ int global(void *ctx) /* A direct read should fail */ SEC("?raw_tp") +__failure __msg("invalid read from stack") int invalid_read1(void *ctx) { struct bpf_dynptr ptr; @@ -501,6 +525,7 @@ int invalid_read1(void *ctx) /* A direct read at an offset should fail */ SEC("?raw_tp") +__failure __msg("cannot pass in dynptr at an offset") int invalid_read2(void *ctx) { struct bpf_dynptr ptr; @@ -516,6 +541,7 @@ int invalid_read2(void *ctx) /* A direct read at an offset into the lower stack slot should fail */ SEC("?raw_tp") +__failure __msg("invalid read from stack") int invalid_read3(void *ctx) { struct bpf_dynptr ptr1, ptr2; @@ -542,6 +568,7 @@ static int invalid_read4_callback(__u32 index, void *data) /* A direct read within a callback function should fail */ SEC("?raw_tp") +__failure __msg("invalid read from stack") int invalid_read4(void *ctx) { struct bpf_dynptr ptr; @@ -557,6 +584,7 @@ int invalid_read4(void *ctx) /* Initializing a dynptr on an offset should fail */ SEC("?raw_tp") +__failure __msg("invalid write to stack") int invalid_offset(void *ctx) { struct bpf_dynptr ptr; @@ -571,6 +599,7 @@ int invalid_offset(void *ctx) /* Can't release a dynptr twice */ SEC("?raw_tp") +__failure __msg("arg 1 is an unacquired reference") int release_twice(void *ctx) { struct bpf_dynptr ptr; @@ -597,6 +626,7 @@ 
static int release_twice_callback_fn(__u32 index, void *data) * within a calback function, fails */ SEC("?raw_tp") +__failure __msg("arg 1 is an unacquired reference") int release_twice_callback(void *ctx) { struct bpf_dynptr ptr; @@ -612,6 +642,7 @@ int release_twice_callback(void *ctx) /* Reject unsupported local mem types for dynptr_from_mem API */ SEC("?raw_tp") +__failure __msg("Unsupported reg type fp for bpf_dynptr_from_mem data") int dynptr_from_mem_invalid_api(void *ctx) { struct bpf_dynptr ptr; diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index a3a6103c8569..35db7c6c1fc7 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -20,6 +20,7 @@ struct sample { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 4096); } ringbuf SEC(".maps"); struct { diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index 05e209b1b12a..760e41e1a632 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -3,6 +3,7 @@ #include #include #include +#include "bpf_misc.h" struct map_value { char buf[8]; @@ -23,6 +24,7 @@ extern struct prog_test_ref_kfunc * bpf_kfunc_call_test_kptr_get(struct prog_test_ref_kfunc **p, int a, int b) __ksym; SEC("?tc") +__failure __msg("kptr access size must be BPF_DW") int size_not_bpf_dw(struct __sk_buff *ctx) { struct map_value *v; @@ -37,6 +39,7 @@ int size_not_bpf_dw(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("kptr access cannot have variable offset") int non_const_var_off(struct __sk_buff *ctx) { struct map_value *v; @@ -55,6 +58,7 @@ int non_const_var_off(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R1 doesn't have constant offset. 
kptr has to be") int non_const_var_off_kptr_xchg(struct __sk_buff *ctx) { struct map_value *v; @@ -73,6 +77,7 @@ int non_const_var_off_kptr_xchg(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("kptr access misaligned expected=8 off=7") int misaligned_access_write(struct __sk_buff *ctx) { struct map_value *v; @@ -88,6 +93,7 @@ int misaligned_access_write(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("kptr access misaligned expected=8 off=1") int misaligned_access_read(struct __sk_buff *ctx) { struct map_value *v; @@ -101,6 +107,7 @@ int misaligned_access_read(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("variable untrusted_ptr_ access var_off=(0x0; 0x1e0)") int reject_var_off_store(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *unref_ptr; @@ -124,6 +131,7 @@ int reject_var_off_store(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("invalid kptr access, R1 type=untrusted_ptr_prog_test_ref_kfunc") int reject_bad_type_match(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *unref_ptr; @@ -144,6 +152,7 @@ int reject_bad_type_match(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_") int marked_as_untrusted_or_null(struct __sk_buff *ctx) { struct map_value *v; @@ -158,6 +167,7 @@ int marked_as_untrusted_or_null(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("access beyond struct prog_test_ref_kfunc at off 32 size 4") int correct_btf_id_check_size(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; @@ -175,6 +185,7 @@ int correct_btf_id_check_size(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R1 type=untrusted_ptr_ expected=percpu_ptr_") int inherit_untrusted_on_walk(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *unref_ptr; @@ -194,6 +205,7 @@ int inherit_untrusted_on_walk(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("off=8 kptr isn't referenced kptr") int reject_kptr_xchg_on_unref(struct __sk_buff *ctx) { struct map_value *v; @@ -208,6 +220,7 @@ int reject_kptr_xchg_on_unref(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("arg#0 expected pointer to map value") int reject_kptr_get_no_map_val(struct __sk_buff *ctx) { bpf_kfunc_call_test_kptr_get((void *)&ctx, 0, 0); @@ -215,6 +228,7 @@ int reject_kptr_get_no_map_val(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("arg#0 expected pointer to map value") int reject_kptr_get_no_null_map_val(struct __sk_buff *ctx) { bpf_kfunc_call_test_kptr_get(bpf_map_lookup_elem(&array_map, &(int){0}), 0, 0); @@ -222,6 +236,7 @@ int reject_kptr_get_no_null_map_val(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("arg#0 no referenced kptr at map value offset=0") int reject_kptr_get_no_kptr(struct __sk_buff *ctx) { struct map_value *v; @@ -236,6 +251,7 @@ int reject_kptr_get_no_kptr(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("arg#0 no referenced kptr at map value offset=8") int reject_kptr_get_on_unref(struct __sk_buff *ctx) { struct map_value *v; @@ -250,6 +266,7 @@ int reject_kptr_get_on_unref(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("kernel function bpf_kfunc_call_test_kptr_get args#0") int reject_kptr_get_bad_type_match(struct __sk_buff *ctx) { struct map_value *v; @@ -264,6 +281,7 @@ int reject_kptr_get_bad_type_match(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_") int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx) { struct map_value *v; @@ -278,6 +296,7 @@ int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx) } SEC("?tc") +__failure 
__msg("store to referenced kptr disallowed") int reject_untrusted_store_to_ref(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; @@ -297,6 +316,7 @@ int reject_untrusted_store_to_ref(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("R2 type=untrusted_ptr_ expected=ptr_") int reject_untrusted_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; @@ -315,6 +335,8 @@ int reject_untrusted_xchg(struct __sk_buff *ctx) } SEC("?tc") +__failure +__msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member") int reject_bad_type_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; @@ -333,6 +355,7 @@ int reject_bad_type_xchg(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc") int reject_member_of_ref_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; @@ -351,6 +374,7 @@ int reject_member_of_ref_xchg(struct __sk_buff *ctx) } SEC("?syscall") +__failure __msg("kptr cannot be accessed indirectly by helper") int reject_indirect_helper_access(struct __sk_buff *ctx) { struct map_value *v; @@ -371,6 +395,7 @@ int write_func(int *p) } SEC("?tc") +__failure __msg("kptr cannot be accessed indirectly by helper") int reject_indirect_global_func_access(struct __sk_buff *ctx) { struct map_value *v; @@ -384,6 +409,7 @@ int reject_indirect_global_func_access(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("Unreleased reference id=5 alloc_insn=") int kptr_xchg_ref_state(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; @@ -402,6 +428,7 @@ int kptr_xchg_ref_state(struct __sk_buff *ctx) } SEC("?tc") +__failure __msg("Unreleased reference id=3 alloc_insn=") int kptr_get_ref_state(struct __sk_buff *ctx) { struct map_value *v; -- cgit v1.2.3 From 25c5e92d197bd721e706444c5910fd386c330456 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 7 Dec 2022 14:49:10 -0600 Subject: bpf/docs: Document struct task_struct * kfuncs bpf_task_acquire(), bpf_task_release(), and bpf_task_from_pid() are kfuncs that were recently added to kernel/bpf/helpers.c. These are "core" kfuncs in that they're available for use for any tracepoint or struct_ops BPF program. Though they have no ABI stability guarantees, we should still document them. This patch adds a new Core kfuncs section to the BPF kfuncs doc, and adds entries for all of these task kfuncs. Note that bpf_task_kptr_get() is not documented, as it still returns NULL while we're working to resolve how it can use RCU to ensure struct task_struct * lifetime. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20221207204911.873646-2-void@manifault.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 83 ++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/helpers.c | 2 +- 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index b027fe16ee66..24ed109afc98 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -222,3 +222,86 @@ type. An example is shown below:: return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_task_kfunc_set); } late_initcall(init_subsystem); + +3. Core kfuncs +============== + +The BPF subsystem provides a number of "core" kfuncs that are potentially +applicable to a wide variety of different possible use cases and programs. +Those kfuncs are documented here. 
+ +3.1 struct task_struct * kfuncs +------------------------------- + +There are a number of kfuncs that allow ``struct task_struct *`` objects to be +used as kptrs: + +.. kernel-doc:: kernel/bpf/helpers.c + :identifiers: bpf_task_acquire bpf_task_release + +These kfuncs are useful when you want to acquire or release a reference to a +``struct task_struct *`` that was passed as e.g. a tracepoint arg, or a +struct_ops callback arg. For example: + +.. code-block:: c + + /** + * A trivial example tracepoint program that shows how to + * acquire and release a struct task_struct * pointer. + */ + SEC("tp_btf/task_newtask") + int BPF_PROG(task_acquire_release_example, struct task_struct *task, u64 clone_flags) + { + struct task_struct *acquired; + + acquired = bpf_task_acquire(task); + + /* + * In a typical program you'd do something like store + * the task in a map, and the map will automatically + * release it later. Here, we release it manually. + */ + bpf_task_release(acquired); + return 0; + } + +---- + +A BPF program can also look up a task from a pid. This can be useful if the +caller doesn't have a trusted pointer to a ``struct task_struct *`` object that +it can acquire a reference on with bpf_task_acquire(). + +.. kernel-doc:: kernel/bpf/helpers.c + :identifiers: bpf_task_from_pid + +Here is an example of it being used: + +.. code-block:: c + + SEC("tp_btf/task_newtask") + int BPF_PROG(task_get_pid_example, struct task_struct *task, u64 clone_flags) + { + struct task_struct *lookup; + + lookup = bpf_task_from_pid(task->pid); + if (!lookup) + /* A task should always be found, as %task is a tracepoint arg. */ + return -ENOENT; + + if (lookup->pid != task->pid) { + /* bpf_task_from_pid() looks up the task via its + * globally-unique pid from the init_pid_ns. Thus, + * the pid of the lookup task should always be the + * same as the input task. + */ + bpf_task_release(lookup); + return -EINVAL; + } + + /* bpf_task_from_pid() returns an acquired reference, + * so it must be dropped before returning from the + * tracepoint handler. + */ + bpf_task_release(lookup); + return 0; + } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 284b3ffdbe48..1e4bf466b08f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1904,7 +1904,7 @@ struct task_struct *bpf_task_kptr_get(struct task_struct **pp) } /** - * bpf_task_release - Release the reference acquired on a struct task_struct *. + * bpf_task_release - Release the reference acquired on a task. * @p: The task on which a reference is being released. */ void bpf_task_release(struct task_struct *p) -- cgit v1.2.3 From 36aa10ffd6480b93e32611411be4a8fc49804aba Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 7 Dec 2022 14:49:11 -0600 Subject: bpf/docs: Document struct cgroup * kfuncs bpf_cgroup_acquire(), bpf_cgroup_release(), bpf_cgroup_kptr_get(), and bpf_cgroup_ancestor(), are kfuncs that were recently added to kernel/bpf/helpers.c. These are "core" kfuncs in that they're available for use in any tracepoint or struct_ops BPF program. Though they have no ABI stability guarantees, we should still document them. This patch adds a struct cgroup * subsection to the Core kfuncs section which describes each of these kfuncs. 
Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20221207204911.873646-3-void@manifault.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 115 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/helpers.c | 2 +- 2 files changed, 116 insertions(+), 1 deletion(-) diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 24ed109afc98..9fd7fb539f85 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -305,3 +305,118 @@ Here is an example of it being used: bpf_task_release(lookup); return 0; } + +3.2 struct cgroup * kfuncs +-------------------------- + +``struct cgroup *`` objects also have acquire and release functions: + +.. kernel-doc:: kernel/bpf/helpers.c + :identifiers: bpf_cgroup_acquire bpf_cgroup_release + +These kfuncs are used in exactly the same manner as bpf_task_acquire() and +bpf_task_release() respectively, so we won't provide examples for them. + +---- + +You may also acquire a reference to a ``struct cgroup`` kptr that's already +stored in a map using bpf_cgroup_kptr_get(): + +.. kernel-doc:: kernel/bpf/helpers.c + :identifiers: bpf_cgroup_kptr_get + +Here's an example of how it can be used: + +.. code-block:: c + + /* struct containing the struct task_struct kptr which is actually stored in the map. */ + struct __cgroups_kfunc_map_value { + struct cgroup __kptr_ref * cgroup; + }; + + /* The map containing struct __cgroups_kfunc_map_value entries. */ + struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, struct __cgroups_kfunc_map_value); + __uint(max_entries, 1); + } __cgroups_kfunc_map SEC(".maps"); + + /* ... */ + + /** + * A simple example tracepoint program showing how a + * struct cgroup kptr that is stored in a map can + * be acquired using the bpf_cgroup_kptr_get() kfunc. + */ + SEC("tp_btf/cgroup_mkdir") + int BPF_PROG(cgroup_kptr_get_example, struct cgroup *cgrp, const char *path) + { + struct cgroup *kptr; + struct __cgroups_kfunc_map_value *v; + s32 id = cgrp->self.id; + + /* Assume a cgroup kptr was previously stored in the map. */ + v = bpf_map_lookup_elem(&__cgroups_kfunc_map, &id); + if (!v) + return -ENOENT; + + /* Acquire a reference to the cgroup kptr that's already stored in the map. */ + kptr = bpf_cgroup_kptr_get(&v->cgroup); + if (!kptr) + /* If no cgroup was present in the map, it's because + * we're racing with another CPU that removed it with + * bpf_kptr_xchg() between the bpf_map_lookup_elem() + * above, and our call to bpf_cgroup_kptr_get(). + * bpf_cgroup_kptr_get() internally safely handles this + * race, and will return NULL if the task is no longer + * present in the map by the time we invoke the kfunc. + */ + return -EBUSY; + + /* Free the reference we just took above. Note that the + * original struct cgroup kptr is still in the map. It will + * be freed either at a later time if another context deletes + * it from the map, or automatically by the BPF subsystem if + * it's still present when the map is destroyed. + */ + bpf_cgroup_release(kptr); + + return 0; + } + +---- + +Another kfunc available for interacting with ``struct cgroup *`` objects is +bpf_cgroup_ancestor(). This allows callers to access the ancestor of a cgroup, +and return it as a cgroup kptr. + +.. kernel-doc:: kernel/bpf/helpers.c + :identifiers: bpf_cgroup_ancestor + +Eventually, BPF should be updated to allow this to happen with a normal memory +load in the program itself. This is currently not possible without more work in +the verifier. 
bpf_cgroup_ancestor() can be used as follows: + +.. code-block:: c + + /** + * Simple tracepoint example that illustrates how a cgroup's + * ancestor can be accessed using bpf_cgroup_ancestor(). + */ + SEC("tp_btf/cgroup_mkdir") + int BPF_PROG(cgrp_ancestor_example, struct cgroup *cgrp, const char *path) + { + struct cgroup *parent; + + /* The parent cgroup resides at the level before the current cgroup's level. */ + parent = bpf_cgroup_ancestor(cgrp, cgrp->level - 1); + if (!parent) + return -ENOENT; + + bpf_printk("Parent id is %d", parent->self.id); + + /* Return the parent cgroup that was acquired above. */ + bpf_cgroup_release(parent); + return 0; + } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1e4bf466b08f..6ed5875b97a3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1960,7 +1960,7 @@ struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp) } /** - * bpf_cgroup_release - Release the reference acquired on a struct cgroup *. + * bpf_cgroup_release - Release the reference acquired on a cgroup. * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to * not be freed until the current grace period has ended, even if its refcount * drops to 0. -- cgit v1.2.3 From d35af0a7feb077c43ff0233bba5a8c6e75b73e35 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 7 Dec 2022 11:35:40 +0100 Subject: bpf: Do not zero-extend kfunc return values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In BPF all global functions, and BPF helpers return a 64-bit value. For kfunc calls, this is not the case, and they can return e.g. 32-bit values. The return register R0 for kfuncs calls can therefore be marked as subreg_def != DEF_NOT_SUBREG. In general, if a register is marked with subreg_def != DEF_NOT_SUBREG, some archs (where bpf_jit_needs_zext() returns true) require the verifier to insert explicit zero-extension instructions. For kfuncs calls, however, the caller should do sign/zero extension for return values. In other words, the compiler is responsible to insert proper instructions, not the verifier. An example, provided by Yonghong Song: $ cat t.c extern unsigned foo(void); unsigned bar1(void) { return foo(); } unsigned bar2(void) { if (foo()) return 10; else return 20; } $ clang -target bpf -mcpu=v3 -O2 -c t.c && llvm-objdump -d t.o t.o: file format elf64-bpf Disassembly of section .text: 0000000000000000 : 0: 85 10 00 00 ff ff ff ff call -0x1 1: 95 00 00 00 00 00 00 00 exit 0000000000000010 : 2: 85 10 00 00 ff ff ff ff call -0x1 3: bc 01 00 00 00 00 00 00 w1 = w0 4: b4 00 00 00 14 00 00 00 w0 = 0x14 5: 16 01 01 00 00 00 00 00 if w1 == 0x0 goto +0x1 6: b4 00 00 00 0a 00 00 00 w0 = 0xa 0000000000000038 : 7: 95 00 00 00 00 00 00 00 exit If the return value of 'foo()' is used in the BPF program, the proper zero-extension will be done. Currently, the verifier correctly marks, say, a 32-bit return value as subreg_def != DEF_NOT_SUBREG, but will fail performing the actual zero-extension, due to a verifier bug in opt_subreg_zext_lo32_rnd_hi32(). load_reg is not properly set to R0, and the following path will be taken: if (WARN_ON(load_reg == -1)) { verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n"); return -EFAULT; } A longer discussion from v1 can be found in the link below. Correct the verifier by avoiding doing explicit zero-extension of R0 for kfunc calls. Note that R0 will still be marked as a sub-register for return values smaller than 64-bit. 
Fixes: 83a2881903f3 ("bpf: Account for BPF_FETCH in insn_has_def32()") Link: https://lore.kernel.org/bpf/20221202103620.1915679-1-bjorn@kernel.org/ Suggested-by: Yonghong Song Signed-off-by: Björn Töpel Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20221207103540.396496-1-bjorn@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 29e09ef15188..1085dd185c95 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14836,6 +14836,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn)) continue; + /* Zero-extension is done by the caller. */ + if (bpf_pseudo_kfunc_call(&insn)) + continue; + if (WARN_ON(load_reg == -1)) { verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n"); return -EFAULT; -- cgit v1.2.3 From c2cc0ce72a5ed3e01705e14221d97e96ed7a37b8 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Thu, 8 Dec 2022 09:37:24 +0800 Subject: bpf: Fix comment error in fixup_kfunc_call function insn->imm for kfunc is the relative address of __bpf_call_base, instead of __bpf_base_call, Fix the comment error. Signed-off-by: Yang Jihong Link: https://lore.kernel.org/r/20221208013724.257848-1-yangjihong1@huawei.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1085dd185c95..3194e9d9e4e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15350,7 +15350,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } /* insn->imm has the btf func_id. Replace it with - * an address (relative to __bpf_base_call). + * an address (relative to __bpf_call_base). */ desc = find_kfunc_desc(env->prog, insn->imm, insn->off); if (!desc) { -- cgit v1.2.3 From e60db051a4a70bff151eb59774c64af3a0266794 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 6 Dec 2022 15:27:39 -0800 Subject: selftests/bpf: Bring test_offload.py back to life Bpftool has new extra libbpf_det_bind probing map we need to exclude. Also skip trying to load netdevsim modules if it's already loaded (builtin). 
v2: - drop iproute2->bpftool changes (Toke) Signed-off-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221206232739.2504890-1-sdf@google.com --- tools/testing/selftests/bpf/test_offload.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index 7fc15e0d24a9..7cb1bc05e5cf 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -769,12 +769,14 @@ skip(ret != 0, "bpftool not installed") base_progs = progs _, base_maps = bpftool("map") base_map_names = [ - 'pid_iter.rodata' # created on each bpftool invocation + 'pid_iter.rodata', # created on each bpftool invocation + 'libbpf_det_bind', # created on each bpftool invocation ] # Check netdevsim -ret, out = cmd("modprobe netdevsim", fail=False) -skip(ret != 0, "netdevsim module could not be loaded") +if not os.path.isdir("/sys/bus/netdevsim/"): + ret, out = cmd("modprobe netdevsim", fail=False) + skip(ret != 0, "netdevsim module could not be loaded") # Check debugfs _, out = cmd("mount") -- cgit v1.2.3 From 0893d6007db5cf397f3fc92b2a6935c3ed0c6f00 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 9 Dec 2022 09:09:46 +0800 Subject: bpf: Reuse freed element in free_by_rcu during allocation When there are batched freeing operations on a specific CPU, part of the freed elements ((high_watermark - lower_watermark) / 2 + 1) will be indirectly moved into waiting_for_gp list through free_by_rcu list. After call_rcu_in_progress becomes false again, the remaining elements in free_by_rcu list will be moved to waiting_for_gp list by the next invocation of free_bulk(). However if the expiration of RCU tasks trace grace period is relatively slow, none element in free_by_rcu list will be moved. So instead of invoking __alloc_percpu_gfp() or kmalloc_node() to allocate a new object, in alloc_bulk() just check whether or not there is freed element in free_by_rcu list and reuse it if available. Acked-by: Yonghong Song Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20221209010947.3130477-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 8f0d65f2474a..04d96d1b98a3 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -171,9 +171,24 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) memcg = get_memcg(c); old_memcg = set_active_memcg(memcg); for (i = 0; i < cnt; i++) { - obj = __alloc(c, node); - if (!obj) - break; + /* + * free_by_rcu is only manipulated by irq work refill_work(). + * IRQ works on the same CPU are called sequentially, so it is + * safe to use __llist_del_first() here. If alloc_bulk() is + * invoked by the initial prefill, there will be no running + * refill_work(), so __llist_del_first() is fine as well. + * + * In most cases, objects on free_by_rcu are from the same CPU. + * If some objects come from other CPUs, it doesn't incur any + * harm because NUMA_NO_NODE means the preference for current + * numa node and it is not a guarantee. 
+ */ + obj = __llist_del_first(&c->free_by_rcu); + if (!obj) { + obj = __alloc(c, node); + if (!obj) + break; + } if (IS_ENABLED(CONFIG_PREEMPT_RT)) /* In RT irq_work runs in per-cpu kthread, so disable * interrupts to avoid preemption and interrupts and -- cgit v1.2.3 From 822ed78fab13d5a54f8b8c030e8c5dc0fcd2cdae Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 9 Dec 2022 09:09:47 +0800 Subject: bpf: Skip rcu_barrier() if rcu_trace_implies_rcu_gp() is true If there are pending rcu callback, free_mem_alloc() will use rcu_barrier_tasks_trace() and rcu_barrier() to wait for the pending __free_rcu_tasks_trace() and __free_rcu() callback. If rcu_trace_implies_rcu_gp() is true, there will be no pending __free_rcu(), so it will be OK to skip rcu_barrier() as well. Acked-by: Yonghong Song Acked-by: Paul E. McKenney Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20221209010947.3130477-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 04d96d1b98a3..ebcc3dd0fa19 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -464,9 +464,17 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma) { /* waiting_for_gp lists was drained, but __free_rcu might * still execute. Wait for it now before we freeing percpu caches. + * + * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), + * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used + * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(), + * so if call_rcu(head, __free_rcu) is skipped due to + * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by + * using rcu_trace_implies_rcu_gp() as well. */ rcu_barrier_tasks_trace(); - rcu_barrier(); + if (!rcu_trace_implies_rcu_gp()) + rcu_barrier(); free_mem_alloc_no_barrier(ma); } -- cgit v1.2.3 From 6b75bd3d036745b9be30917909f03602099adbdb Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:35 +0530 Subject: bpf: Refactor ARG_PTR_TO_DYNPTR checks into process_dynptr_func ARG_PTR_TO_DYNPTR is akin to ARG_PTR_TO_TIMER, ARG_PTR_TO_KPTR, where the underlying register type is subjected to more special checks to determine the type of object represented by the pointer and its state consistency. Move dynptr checks to their own 'process_dynptr_func' function so that is consistent and in-line with existing code. This also makes it easier to reuse this code for kfunc handling. Then, reuse this consolidated function in kfunc dynptr handling too. Note that for kfuncs, the arg_type constraint of DYNPTR_TYPE_LOCAL has been lifted. 
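To illustrate what lifting the DYNPTR_TYPE_LOCAL constraint means for kfunc callers, here is a hedged sketch (it mirrors the dynptr_type_not_supp selftest that this patch removes further down; the ringbuf map is assumed to be declared elsewhere in the selftest):

    SEC("?lsm.s/bpf")
    int BPF_PROG(dynptr_from_ringbuf, int cmd, union bpf_attr *attr, unsigned int size)
    {
            struct bpf_dynptr ptr;

            /* A ringbuf-backed (non-local) dynptr can now pass the kfunc
             * dynptr argument check; previously only DYNPTR_TYPE_LOCAL
             * was accepted for kfunc arguments.
             */
            bpf_ringbuf_reserve_dynptr(&ringbuf, 64, 0, &ptr);
            bpf_verify_pkcs7_signature(&ptr, &ptr, NULL);

            bpf_ringbuf_discard_dynptr(&ptr, 0);
            return 0;
    }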
Acked-by: David Vernet Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 8 +- kernel/bpf/verifier.c | 134 +++++++++++---------- .../selftests/bpf/prog_tests/kfunc_dynptr_param.c | 7 +- .../selftests/bpf/progs/test_kfunc_dynptr_param.c | 12 -- 4 files changed, 75 insertions(+), 86 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 70d06a99f0b8..df0cb825e0e3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -615,11 +615,9 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, enum bpf_arg_type arg_type); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg); -bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type); +struct bpf_call_arg_meta; +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3194e9d9e4e4..fcd8a71035aa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -810,8 +810,7 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ return true; } -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); int spi = get_spi(reg->off); @@ -830,9 +829,8 @@ bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, return true; } -bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type) +static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type dynptr_type; @@ -5859,6 +5857,65 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + + /* We only need to check for initialized / uninitialized helper + * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the + * assumption is that if it is, that a helper function + * initialized the dynptr on behalf of the BPF program. + */ + if (base_type(reg->type) == PTR_TO_DYNPTR) + return 0; + if (arg_type & MEM_UNINIT) { + if (!is_dynptr_reg_valid_uninit(env, reg)) { + verbose(env, "Dynptr has to be an uninitialized dynptr\n"); + return -EINVAL; + } + + /* We only support one dynptr being uninitialized at the moment, + * which is sufficient for the helper functions we have right now. 
+ */ + if (meta->uninit_dynptr_regno) { + verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); + return -EFAULT; + } + + meta->uninit_dynptr_regno = regno; + } else { + if (!is_dynptr_reg_valid_init(env, reg)) { + verbose(env, + "Expected an initialized dynptr as arg #%d\n", + regno); + return -EINVAL; + } + + if (!is_dynptr_type_expected(env, reg, arg_type)) { + const char *err_extra = ""; + + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + err_extra = "local"; + break; + case DYNPTR_TYPE_RINGBUF: + err_extra = "ringbuf"; + break; + default: + err_extra = ""; + break; + } + verbose(env, + "Expected a dynptr of type %s as arg #%d\n", + err_extra, regno); + return -EINVAL; + } + } + return 0; +} + static bool arg_type_is_mem_size(enum bpf_arg_type type) { return type == ARG_CONST_SIZE || @@ -6390,52 +6447,8 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: - /* We only need to check for initialized / uninitialized helper - * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the - * assumption is that if it is, that a helper function - * initialized the dynptr on behalf of the BPF program. - */ - if (base_type(reg->type) == PTR_TO_DYNPTR) - break; - if (arg_type & MEM_UNINIT) { - if (!is_dynptr_reg_valid_uninit(env, reg)) { - verbose(env, "Dynptr has to be an uninitialized dynptr\n"); - return -EINVAL; - } - - /* We only support one dynptr being uninitialized at the moment, - * which is sufficient for the helper functions we have right now. - */ - if (meta->uninit_dynptr_regno) { - verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); - return -EFAULT; - } - - meta->uninit_dynptr_regno = regno; - } else if (!is_dynptr_reg_valid_init(env, reg)) { - verbose(env, - "Expected an initialized dynptr as arg #%d\n", - arg + 1); - return -EINVAL; - } else if (!is_dynptr_type_expected(env, reg, arg_type)) { - const char *err_extra = ""; - - switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { - case DYNPTR_TYPE_LOCAL: - err_extra = "local"; - break; - case DYNPTR_TYPE_RINGBUF: - err_extra = "ringbuf"; - break; - default: - err_extra = ""; - break; - } - verbose(env, - "Expected a dynptr of type %s as arg #%d\n", - err_extra, arg + 1); - return -EINVAL; - } + if (process_dynptr_func(env, regno, arg_type, meta)) + return -EACCES; break; case ARG_CONST_ALLOC_SIZE_OR_ZERO: if (!tnum_is_const(reg->var_off)) { @@ -8829,22 +8842,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_DYNPTR: - if (reg->type != PTR_TO_STACK) { - verbose(env, "arg#%d expected pointer to stack\n", i); - return -EINVAL; - } - - if (!is_dynptr_reg_valid_init(env, reg)) { - verbose(env, "arg#%d pointer type %s %s must be valid and initialized\n", - i, btf_type_str(ref_t), ref_tname); + if (reg->type != PTR_TO_STACK && + reg->type != PTR_TO_DYNPTR) { + verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i); return -EINVAL; } - if (!is_dynptr_type_expected(env, reg, ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) { - verbose(env, "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n", - i, btf_type_str(ref_t), ref_tname); - return -EINVAL; - } + ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR, NULL); + if (ret < 0) + return ret; break; case KF_ARG_PTR_TO_LIST_HEAD: if (reg->type != PTR_TO_MAP_VALUE && diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c 
b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c index 55d641c1f126..a9229260a6ce 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c @@ -18,11 +18,8 @@ static struct { const char *expected_verifier_err_msg; int expected_runtime_err; } kfunc_dynptr_tests[] = { - {"dynptr_type_not_supp", - "arg#0 pointer type STRUCT bpf_dynptr_kern points to unsupported dynamic pointer type", 0}, - {"not_valid_dynptr", - "arg#0 pointer type STRUCT bpf_dynptr_kern must be valid and initialized", 0}, - {"not_ptr_to_stack", "arg#0 expected pointer to stack", 0}, + {"not_valid_dynptr", "Expected an initialized dynptr as arg #1", 0}, + {"not_ptr_to_stack", "arg#0 expected pointer to stack or dynptr_ptr", 0}, {"dynptr_data_null", NULL, -EBADMSG}, }; diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c index ce39d096bba3..f4a8250329b2 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -32,18 +32,6 @@ int err, pid; char _license[] SEC("license") = "GPL"; -SEC("?lsm.s/bpf") -int BPF_PROG(dynptr_type_not_supp, int cmd, union bpf_attr *attr, - unsigned int size) -{ - char write_data[64] = "hello there, world!!"; - struct bpf_dynptr ptr; - - bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(write_data), 0, &ptr); - - return bpf_verify_pkcs7_signature(&ptr, &ptr, NULL); -} - SEC("?lsm.s/bpf") int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size) { -- cgit v1.2.3 From ac50fe51ce873f4299928e312ce2042e35ab5c08 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:36 +0530 Subject: bpf: Propagate errors from process_* checks in check_func_arg Currently, we simply ignore the errors in process_spin_lock, process_timer_func, process_kptr_func, process_dynptr_func. Instead, bubble up the error by storing and checking err variable. 
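Schematically, the change replaces the error-masking pattern with direct propagation; the concrete call sites are in the diff below, and this fragment only shows the shape of the transformation:

    /* before: the callee's errno is lost and -EACCES is returned instead */
    if (process_timer_func(env, regno, meta))
            return -EACCES;

    /* after: store and bubble up whatever error the process_* helper returned */
    err = process_timer_func(env, regno, meta);
    if (err)
            return err;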
Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fcd8a71035aa..eb742ac75844 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6412,19 +6412,22 @@ skip_type_check: break; case ARG_PTR_TO_SPIN_LOCK: if (meta->func_id == BPF_FUNC_spin_lock) { - if (process_spin_lock(env, regno, true)) - return -EACCES; + err = process_spin_lock(env, regno, true); + if (err) + return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - if (process_spin_lock(env, regno, false)) - return -EACCES; + err = process_spin_lock(env, regno, false); + if (err) + return err; } else { verbose(env, "verifier internal error\n"); return -EFAULT; } break; case ARG_PTR_TO_TIMER: - if (process_timer_func(env, regno, meta)) - return -EACCES; + err = process_timer_func(env, regno, meta); + if (err) + return err; break; case ARG_PTR_TO_FUNC: meta->subprogno = reg->subprogno; @@ -6447,8 +6450,9 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: - if (process_dynptr_func(env, regno, arg_type, meta)) - return -EACCES; + err = process_dynptr_func(env, regno, arg_type, meta); + if (err) + return err; break; case ARG_CONST_ALLOC_SIZE_OR_ZERO: if (!tnum_is_const(reg->var_off)) { @@ -6515,8 +6519,9 @@ skip_type_check: break; } case ARG_PTR_TO_KPTR: - if (process_kptr_func(env, regno, meta)) - return -EACCES; + err = process_kptr_func(env, regno, meta); + if (err) + return err; break; } -- cgit v1.2.3 From 270605317366e4535d8d9fc3d9da1ad0fb3c9d45 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:37 +0530 Subject: bpf: Rework process_dynptr_func Recently, user ringbuf support introduced a PTR_TO_DYNPTR register type for use in callback state, because in case of user ringbuf helpers, there is no dynptr on the stack that is passed into the callback. To reflect such a state, a special register type was created. However, some checks have been bypassed incorrectly during the addition of this feature. First, for arg_type with MEM_UNINIT flag which initialize a dynptr, they must be rejected for such register type. Secondly, in the future, there are plans to add dynptr helpers that operate on the dynptr itself and may change its offset and other properties. In all of these cases, PTR_TO_DYNPTR shouldn't be allowed to be passed to such helpers, however the current code simply returns 0. The rejection for helpers that release the dynptr is already handled. For fixing this, we take a step back and rework existing code in a way that will allow fitting in all classes of helpers and have a coherent model for dealing with the variety of use cases in which dynptr is used. First, for ARG_PTR_TO_DYNPTR, it can either be set alone or together with a DYNPTR_TYPE_* constant that denotes the only type it accepts. Next, helpers which initialize a dynptr use MEM_UNINIT to indicate this fact. To make the distinction clear, use MEM_RDONLY flag to indicate that the helper only operates on the memory pointed to by the dynptr, not the dynptr itself. In C parlance, it would be equivalent to taking the dynptr as a point to const argument. When either of these flags are not present, the helper is allowed to mutate both the dynptr itself and also the memory it points to. 
Currently, the read only status of the memory is not tracked in the dynptr, but it would be trivial to add this support inside dynptr state of the register. With these changes and renaming PTR_TO_DYNPTR to CONST_PTR_TO_DYNPTR to better reflect its usage, it can no longer be passed to helpers that initialize a dynptr, i.e. bpf_dynptr_from_mem, bpf_ringbuf_reserve_dynptr. A note to reviewers is that in code that does mark_stack_slots_dynptr, and unmark_stack_slots_dynptr, we implicitly rely on the fact that PTR_TO_STACK reg is the only case that can reach that code path, as one cannot pass CONST_PTR_TO_DYNPTR to helpers that don't set MEM_RDONLY. In both cases such helpers won't be setting that flag. The next patch will add a couple of selftest cases to make sure this doesn't break. Fixes: 205715673844 ("bpf: Add bpf_user_ringbuf_drain() helper") Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 +- include/uapi/linux/bpf.h | 8 +- kernel/bpf/helpers.c | 18 +- kernel/bpf/verifier.c | 227 +++++++++++++++------ scripts/bpf_doc.py | 1 + tools/include/uapi/linux/bpf.h | 8 +- .../selftests/bpf/prog_tests/user_ringbuf.c | 4 +- 7 files changed, 191 insertions(+), 79 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4920ac252754..3de24cfb7a3d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -775,7 +775,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ - PTR_TO_DYNPTR, /* reg points to a dynptr */ + CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -2828,7 +2828,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f89de51a45db..464ca3f01fe7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5293,7 +5293,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5303,7 +5303,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5313,7 +5313,7 @@ union bpf_attr { * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr or if *flags* is not 0. 
* - * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description * Get a pointer to the underlying dynptr data. * @@ -5414,7 +5414,7 @@ union bpf_attr { * Drain samples from the specified user ring buffer, and invoke * the provided callback for each such sample: * - * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); * * If **callback_fn** returns 0, the helper will continue to try * and drain the next sample, up to a maximum of diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6ed5875b97a3..fa1093d4ad6b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1404,7 +1404,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } @@ -1414,7 +1414,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } @@ -1438,7 +1438,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) { u32 size = bpf_dynptr_get_size(ptr); @@ -1483,7 +1483,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, u32, offset, u64, flags) { int err; @@ -1506,12 +1506,12 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_DYNPTR, + .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len, u64, flags) { int err; @@ -1532,14 +1532,14 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) { int err; @@ -1560,7 +1560,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c 
index eb742ac75844..a880776bd999 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -592,7 +592,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", - [PTR_TO_DYNPTR] = "dynptr_ptr", + [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -725,6 +725,28 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type) return type == BPF_DYNPTR_TYPE_RINGBUF; } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type, + bool first_slot); + +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); + +static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1, + struct bpf_reg_state *sreg2, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(sreg1, type, true); + __mark_dynptr_reg(sreg2, type, false); +} + +static void mark_dynptr_cb_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(reg, type, true); +} + + static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx) { @@ -746,9 +768,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - state->stack[spi].spilled_ptr.dynptr.first_slot = true; - state->stack[spi].spilled_ptr.dynptr.type = type; - state->stack[spi - 1].spilled_ptr.dynptr.type = type; + mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr, + &state->stack[spi - 1].spilled_ptr, type); if (dynptr_type_refcounted(type)) { /* The id is used to track proper releasing */ @@ -756,8 +777,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (id < 0) return id; - state->stack[spi].spilled_ptr.id = id; - state->stack[spi - 1].spilled_ptr.id = id; + state->stack[spi].spilled_ptr.ref_obj_id = id; + state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } return 0; @@ -779,25 +800,23 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re } /* Invalidate any slices associated with this dynptr */ - if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - release_reference(env, state->stack[spi].spilled_ptr.id); - state->stack[spi].spilled_ptr.id = 0; - state->stack[spi - 1].spilled_ptr.id = 0; - } - - state->stack[spi].spilled_ptr.dynptr.first_slot = false; - state->stack[spi].spilled_ptr.dynptr.type = 0; - state->stack[spi - 1].spilled_ptr.dynptr.type = 0; + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) + WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id)); + __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); return 0; } static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); - int i; + int spi, i; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return false; + spi = get_spi(reg->off); if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) return true; @@ -813,9 +832,14 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; int i; + /* This already represents first slot of initialized 
bpf_dynptr */ + if (reg->type == CONST_PTR_TO_DYNPTR) + return true; + + spi = get_spi(reg->off); if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || !state->stack[spi].spilled_ptr.dynptr.first_slot) return false; @@ -834,15 +858,19 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type dynptr_type; - int spi = get_spi(reg->off); + int spi; /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ if (arg_type == ARG_PTR_TO_DYNPTR) return true; dynptr_type = arg_to_dynptr_type(arg_type); - - return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + if (reg->type == CONST_PTR_TO_DYNPTR) { + return reg->dynptr.type == dynptr_type; + } else { + spi = get_spi(reg->off); + return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + } } /* The reg state of a pointer or a bounded scalar was saved when @@ -1354,9 +1382,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void __mark_reg_not_init(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg); - /* This helper doesn't clear reg->id */ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { @@ -1419,6 +1444,19 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, + bool first_slot) +{ + /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for + * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply + * set it unconditionally as it is ignored for STACK_DYNPTR anyway. + */ + __mark_reg_known_zero(reg); + reg->type = CONST_PTR_TO_DYNPTR; + reg->dynptr.type = type; + reg->dynptr.first_slot = first_slot; +} + static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) { if (base_type(reg->type) == PTR_TO_MAP_VALUE) { @@ -5857,19 +5895,58 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } +/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK + * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. + * + * In both cases we deal with the first 8 bytes, but need to mark the next 8 + * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of + * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. + * + * Mutability of bpf_dynptr is at two levels, one is at the level of struct + * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct + * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can + * mutate the view of the dynptr and also possibly destroy it. In the latter + * case, it cannot mutate the bpf_dynptr itself but it can still mutate the + * memory that dynptr points to. + * + * The verifier will keep track both levels of mutation (bpf_dynptr's in + * reg->type and the memory's in reg->dynptr.type), but there is no support for + * readonly dynptr view yet, hence only the first case is tracked and checked. + * + * This is consistent with how C applies the const modifier to a struct object, + * where the pointer itself inside bpf_dynptr becomes const but not what it + * points to. + * + * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument + * type, and declare it as 'const struct bpf_dynptr *' in their prototype. 
+ */ int process_dynptr_func(struct bpf_verifier_env *env, int regno, - enum bpf_arg_type arg_type, - struct bpf_call_arg_meta *meta) + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - /* We only need to check for initialized / uninitialized helper - * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the - * assumption is that if it is, that a helper function - * initialized the dynptr on behalf of the BPF program. + /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an + * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): + */ + if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { + verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); + return -EFAULT; + } + /* MEM_UNINIT - Points to memory that is an appropriate candidate for + * constructing a mutable bpf_dynptr object. + * + * Currently, this is only possible with PTR_TO_STACK + * pointing to a region of at least 16 bytes which doesn't + * contain an existing bpf_dynptr. + * + * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be + * mutated or destroyed. However, the memory it points to + * may be mutated. + * + * None - Points to a initialized dynptr that can be mutated and + * destroyed, including mutation of the memory it points + * to. */ - if (base_type(reg->type) == PTR_TO_DYNPTR) - return 0; if (arg_type & MEM_UNINIT) { if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); @@ -5885,7 +5962,13 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, } meta->uninit_dynptr_regno = regno; - } else { + } else /* MEM_RDONLY and None case from above */ { + /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ + if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { + verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); + return -EINVAL; + } + if (!is_dynptr_reg_valid_init(env, reg)) { verbose(env, "Expected an initialized dynptr as arg #%d\n", @@ -5893,7 +5976,8 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } - if (!is_dynptr_type_expected(env, reg, arg_type)) { + /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ + if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { const char *err_extra = ""; switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { @@ -6056,7 +6140,7 @@ static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } } static const struct bpf_reg_types dynptr_types = { .types = { PTR_TO_STACK, - PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL, + CONST_PTR_TO_DYNPTR, } }; @@ -6241,12 +6325,16 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return reg->ref_obj_id; - return state->stack[spi].spilled_ptr.id; + spi = get_spi(reg->off); + return state->stack[spi].spilled_ptr.ref_obj_id; } static int check_func_arg(struct bpf_verifier_env *env, u32 arg, @@ -6311,11 +6399,22 @@ skip_type_check: if (arg_type_is_release(arg_type)) { if (arg_type_is_dynptr(arg_type)) { struct bpf_func_state *state = func(env, 
reg); - int spi = get_spi(reg->off); + int spi; - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || - !state->stack[spi].spilled_ptr.id) { - verbose(env, "arg %d is an unacquired reference\n", regno); + /* Only dynptr created on stack can be released, thus + * the get_spi and stack state checks for spilled_ptr + * should only be done before process_dynptr_func for + * PTR_TO_STACK. + */ + if (reg->type == PTR_TO_STACK) { + spi = get_spi(reg->off); + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.ref_obj_id) { + verbose(env, "arg %d is an unacquired reference\n", regno); + return -EINVAL; + } + } else { + verbose(env, "cannot release unowned const bpf_dynptr\n"); return -EINVAL; } } else if (!reg->ref_obj_id && !register_is_null(reg)) { @@ -7289,11 +7388,10 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, { /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void * callback_ctx, u64 flags); - * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx); + * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx); */ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); - callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL; - __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ @@ -7687,7 +7785,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr + * is safe to do directly. + */ if (meta.uninit_dynptr_regno) { + if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n"); + return -EFAULT; + } /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, @@ -7705,15 +7811,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.release_regno) { err = -EINVAL; - if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr + * is safe to do directly. + */ + if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { + if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n"); + return -EFAULT; + } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); - else if (meta.ref_obj_id) + } else if (meta.ref_obj_id) { err = release_reference(env, meta.ref_obj_id); - /* meta.ref_obj_id can only be 0 if register that is meant to be - * released is NULL, which must be > R0. - */ - else if (register_is_null(®s[meta.release_regno])) + } else if (register_is_null(®s[meta.release_regno])) { + /* meta.ref_obj_id can only be 0 if register that is meant to be + * released is NULL, which must be > R0. 
+ */ err = 0; + } if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", func_id_name(func_id), func_id); @@ -7787,11 +7902,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } - if (base_type(reg->type) != PTR_TO_DYNPTR) - /* Find the id of the dynptr we're - * tracking the reference of - */ - meta.ref_obj_id = stack_slot_get_id(env, reg); + meta.ref_obj_id = dynptr_ref_obj_id(env, reg); break; } } @@ -8848,12 +8959,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_DYNPTR: if (reg->type != PTR_TO_STACK && - reg->type != PTR_TO_DYNPTR) { + reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i); return -EINVAL; } - ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR, NULL); + ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL); if (ret < 0) return ret; break; diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index fdb0aff8cb5a..e8d90829f23e 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -752,6 +752,7 @@ class PrinterHelpers(Printer): 'struct bpf_timer', 'struct mptcp_sock', 'struct bpf_dynptr', + 'const struct bpf_dynptr', 'struct iphdr', 'struct ipv6hdr', } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f89de51a45db..464ca3f01fe7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5293,7 +5293,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5303,7 +5303,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5313,7 +5313,7 @@ union bpf_attr { * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr or if *flags* is not 0. * - * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description * Get a pointer to the underlying dynptr data. 
* @@ -5414,7 +5414,7 @@ union bpf_attr { * Drain samples from the specified user ring buffer, and invoke * the provided callback for each such sample: * - * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); * * If **callback_fn** returns 0, the helper will continue to try * and drain the next sample, up to a maximum of diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c index 02b18d018b36..aefa0a474e58 100644 --- a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c @@ -673,8 +673,8 @@ static struct { {"user_ringbuf_callback_write_forbidden", "invalid mem access 'dynptr_ptr'"}, {"user_ringbuf_callback_null_context_write", "invalid mem access 'scalar'"}, {"user_ringbuf_callback_null_context_read", "invalid mem access 'scalar'"}, - {"user_ringbuf_callback_discard_dynptr", "arg 1 is an unacquired reference"}, - {"user_ringbuf_callback_submit_dynptr", "arg 1 is an unacquired reference"}, + {"user_ringbuf_callback_discard_dynptr", "cannot release unowned const bpf_dynptr"}, + {"user_ringbuf_callback_submit_dynptr", "cannot release unowned const bpf_dynptr"}, {"user_ringbuf_callback_invalid_return", "At callback return the register R0 has value"}, }; -- cgit v1.2.3 From 184c9bdb8f65d9f909b3a089a83bc0b0f1e1ea4c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:38 +0530 Subject: bpf: Rework check_func_arg_reg_off While check_func_arg_reg_off is the place which performs generic checks needed by various candidates of reg->type, there is some handling for special cases, like ARG_PTR_TO_DYNPTR, OBJ_RELEASE, and ARG_PTR_TO_RINGBUF_MEM. This commit aims to streamline these special cases and instead leave other things up to argument type specific code to handle. The function will be restrictive by default, and cover all possible cases when OBJ_RELEASE is set, without having to update the function again (and missing to do that being a bug). This is done primarily for two reasons: associating back reg->type to its argument leaves room for the list getting out of sync when a new reg->type is supported by an arg_type. The other case is ARG_PTR_TO_RINGBUF_MEM. The problem there is something we already handle, whenever a release argument is expected, it should be passed as the pointer that was received from the acquire function. Hence zero fixed and variable offset. There is nothing special about ARG_PTR_TO_RINGBUF_MEM, where technically its target register type PTR_TO_MEM | MEM_RINGBUF can already be passed with non-zero offset to other helper functions, which makes sense. Hence, lift the arg_type_is_release check for reg->off and cover all possible register types, instead of duplicating the same kind of check twice for current OBJ_RELEASE arg_types (alloc_mem and ptr_to_btf_id). For the release argument, arg_type_is_dynptr is the special case, where we go to actual object being freed through the dynptr, so the offset of the pointer still needs to allow fixed and variable offset and process_dynptr_func will verify them later for the release argument case as well. This is not specific to ARG_PTR_TO_DYNPTR though, we will need to make this exception for any future object on the stack that needs to be released. 
In this sense, PTR_TO_STACK as a candidate for object on stack argument is a special case for release offset checks, and they need to be done by the helper releasing the object on stack. Since the check has been lifted above all register type checks, remove the duplicated check that is being done for PTR_TO_BTF_ID. Acked-by: Joanne Koong Acked-by: David Vernet Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 63 ++++++++++++++++---------- tools/testing/selftests/bpf/verifier/calls.c | 2 +- tools/testing/selftests/bpf/verifier/ringbuf.c | 2 +- 3 files changed, 40 insertions(+), 27 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a880776bd999..8a7e964bcc31 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6269,11 +6269,37 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, enum bpf_arg_type arg_type) { - enum bpf_reg_type type = reg->type; - bool fixed_off_ok = false; + u32 type = reg->type; + + /* When referenced register is passed to release function, its fixed + * offset must be 0. + * + * We will check arg_type_is_release reg has ref_obj_id when storing + * meta->release_regno. + */ + if (arg_type_is_release(arg_type)) { + /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it + * may not directly point to the object being released, but to + * dynptr pointing to such object, which might be at some offset + * on the stack. In that case, we simply to fallback to the + * default handling. + */ + if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK) + return 0; + /* Doing check_ptr_off_reg check for the offset will catch this + * because fixed_off_ok is false, but checking here allows us + * to give the user a better error message. + */ + if (reg->off) { + verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", + regno); + return -EINVAL; + } + return __check_ptr_off_reg(env, reg, regno, false); + } - switch ((u32)type) { - /* Pointer types where reg offset is explicitly allowed: */ + switch (type) { + /* Pointer types where both fixed and variable offset is explicitly allowed: */ case PTR_TO_STACK: if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) { verbose(env, "cannot pass in dynptr at an offset\n"); @@ -6290,12 +6316,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case SCALAR_VALUE: - /* Some of the argument types nevertheless require a - * zero register offset. - */ - if (base_type(arg_type) != ARG_PTR_TO_RINGBUF_MEM) - return 0; - break; + return 0; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows * fixed offset. */ @@ -6305,24 +6326,16 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: /* When referenced PTR_TO_BTF_ID is passed to release function, - * it's fixed offset must be 0. In the other cases, fixed offset - * can be non-zero. + * its fixed offset must be 0. In the other cases, fixed offset + * can be non-zero. This was already checked above. So pass + * fixed_off_ok as true to allow fixed offset for all other + * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we + * still need to do checks instead of returning. 
*/ - if (arg_type_is_release(arg_type) && reg->off) { - verbose(env, "R%d must have zero offset when passed to release func\n", - regno); - return -EINVAL; - } - /* For arg is release pointer, fixed_off_ok must be false, but - * we already checked and rejected reg->off != 0 above, so set - * to true to allow fixed offset for all other cases. - */ - fixed_off_ok = true; - break; + return __check_ptr_off_reg(env, reg, regno, true); default: - break; + return __check_ptr_off_reg(env, reg, regno, false); } - return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 3193915c5ee6..babcec123251 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -76,7 +76,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 expected pointer to ctx, but got PTR", + .errstr = "R1 must have zero offset when passed to release func or trusted arg to kfunc", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_pass_ctx", 2 }, }, diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c index 84838feba47f..92e3f6a61a79 100644 --- a/tools/testing/selftests/bpf/verifier/ringbuf.c +++ b/tools/testing/selftests/bpf/verifier/ringbuf.c @@ -28,7 +28,7 @@ }, .fixup_map_ringbuf = { 1 }, .result = REJECT, - .errstr = "dereference of modified ringbuf_mem ptr R1", + .errstr = "R1 must have zero offset when passed to release func", }, { "ringbuf: invalid reservation offset 2", -- cgit v1.2.3 From f6ee298fa140f6b5bae3acf6d6108741278bcced Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:39 +0530 Subject: bpf: Move PTR_TO_STACK alignment check to process_dynptr_func After previous commit, we are minimizing helper specific assumptions from check_func_arg_reg_off, making it generic, and offloading checks for a specific argument type to their respective functions called after check_func_arg_reg_off has been called. This allows relying on a consistent set of guarantees after that call and then relying on them in code that deals with registers for each argument type later. This is in line with how process_spin_lock, process_timer_func, process_kptr_func check reg->var_off to be constant. The same reasoning is used here to move the alignment check into process_dynptr_func. Note that it also needs to check for constant var_off, and accumulate the constant var_off when computing the spi in get_spi, but that fix will come in later changes. Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8a7e964bcc31..9791788071d5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5932,6 +5932,14 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); return -EFAULT; } + /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to + * check_func_arg_reg_off's logic. We only need to check offset + * alignment for PTR_TO_STACK. 
+ */ + if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) { + verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off); + return -EINVAL; + } /* MEM_UNINIT - Points to memory that is an appropriate candidate for * constructing a mutable bpf_dynptr object. * @@ -6301,11 +6309,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, switch (type) { /* Pointer types where both fixed and variable offset is explicitly allowed: */ case PTR_TO_STACK: - if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) { - verbose(env, "cannot pass in dynptr at an offset\n"); - return -EINVAL; - } - fallthrough; case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_MAP_KEY: -- cgit v1.2.3 From 76d16077bef0954528ec3896710f9eda8b2b4db1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:40 +0530 Subject: bpf: Use memmove for bpf_dynptr_{read,write} It may happen that destination buffer memory overlaps with memory dynptr points to. Hence, we must use memmove to correctly copy from dynptr to destination buffer, or source buffer to dynptr. This actually isn't a problem right now, as memcpy implementation falls back to memmove on detecting overlap and warns about it, but we shouldn't be relying on that. Acked-by: Joanne Koong Acked-by: David Vernet Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index fa1093d4ad6b..af30c6cbd65d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1495,7 +1495,11 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern if (err) return err; - memcpy(dst, src->data + src->offset + offset, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst, src->data + src->offset + offset, len); return 0; } @@ -1523,7 +1527,11 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v if (err) return err; - memcpy(dst->data + dst->offset + offset, src, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst->data + dst->offset + offset, src, len); return 0; } -- cgit v1.2.3 From 292064cce7969787f13ed9988733aadc12fe0ca2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:41 +0530 Subject: selftests/bpf: Add test for dynptr reinit in user_ringbuf callback The original support for bpf_user_ringbuf_drain callbacks simply short-circuited checks for the dynptr state, allowing users to pass PTR_TO_DYNPTR (now CONST_PTR_TO_DYNPTR) to helpers that initialize a dynptr. This bug would have also surfaced with other dynptr helpers in the future that changed dynptr view or modified it in some way. Include test cases for all cases, i.e. both bpf_dynptr_from_mem and bpf_ringbuf_reserve_dynptr, and ensure verifier rejects both of them. Without the fix, both of these programs load and pass verification. While at it, remove sys_nanosleep target from failure cases' SEC definition, as there is no such tracepoint. 
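For contrast, a callback that only reads through the dynptr it receives stays valid; a minimal sketch of such a callback (illustrative, assuming the same user_ringbuf map that the failure programs below declare):

	static long read_only_cb(struct bpf_dynptr *dynptr, void *context)
	{
		__u64 val = 0;

		/* Reading through the (const) dynptr is fine; only helpers that
		 * would re-initialize, submit or discard it are rejected.
		 */
		bpf_dynptr_read(&val, sizeof(val), dynptr, 0, 0);
		return 0;	/* 0: keep draining further samples */
	}

	SEC("?raw_tp/")
	int user_ringbuf_read_only(void *ctx)
	{
		bpf_user_ringbuf_drain(&user_ringbuf, read_only_cb, NULL, 0);
		return 0;
	}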
Acked-by: David Vernet Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/user_ringbuf.c | 2 + .../selftests/bpf/progs/user_ringbuf_fail.c | 51 ++++++++++++++++++---- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c index aefa0a474e58..dae68de285b9 100644 --- a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c @@ -676,6 +676,8 @@ static struct { {"user_ringbuf_callback_discard_dynptr", "cannot release unowned const bpf_dynptr"}, {"user_ringbuf_callback_submit_dynptr", "cannot release unowned const bpf_dynptr"}, {"user_ringbuf_callback_invalid_return", "At callback return the register R0 has value"}, + {"user_ringbuf_callback_reinit_dynptr_mem", "Dynptr has to be an uninitialized dynptr"}, + {"user_ringbuf_callback_reinit_dynptr_ringbuf", "Dynptr has to be an uninitialized dynptr"}, }; #define SUCCESS_TEST(_func) { _func, #_func } diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c index 82aba4529aa9..f3201dc69a60 100644 --- a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c +++ b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c @@ -18,6 +18,13 @@ struct { __uint(type, BPF_MAP_TYPE_USER_RINGBUF); } user_ringbuf SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 2); +} ringbuf SEC(".maps"); + +static int map_value; + static long bad_access1(struct bpf_dynptr *dynptr, void *context) { @@ -32,7 +39,7 @@ bad_access1(struct bpf_dynptr *dynptr, void *context) /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should * not be able to read before the pointer. */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp/") int user_ringbuf_callback_bad_access1(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, bad_access1, NULL, 0); @@ -54,7 +61,7 @@ bad_access2(struct bpf_dynptr *dynptr, void *context) /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should * not be able to read past the end of the pointer. */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp/") int user_ringbuf_callback_bad_access2(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, bad_access2, NULL, 0); @@ -73,7 +80,7 @@ write_forbidden(struct bpf_dynptr *dynptr, void *context) /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should * not be able to write to that pointer. */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp/") int user_ringbuf_callback_write_forbidden(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, write_forbidden, NULL, 0); @@ -92,7 +99,7 @@ null_context_write(struct bpf_dynptr *dynptr, void *context) /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should * not be able to write to that pointer. */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp/") int user_ringbuf_callback_null_context_write(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, null_context_write, NULL, 0); @@ -113,7 +120,7 @@ null_context_read(struct bpf_dynptr *dynptr, void *context) /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should * not be able to write to that pointer. 
*/ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp/") int user_ringbuf_callback_null_context_read(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, null_context_read, NULL, 0); @@ -132,7 +139,7 @@ try_discard_dynptr(struct bpf_dynptr *dynptr, void *context) /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should * not be able to read past the end of the pointer. */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp/") int user_ringbuf_callback_discard_dynptr(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, try_discard_dynptr, NULL, 0); @@ -151,7 +158,7 @@ try_submit_dynptr(struct bpf_dynptr *dynptr, void *context) /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should * not be able to read past the end of the pointer. */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp/") int user_ringbuf_callback_submit_dynptr(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, try_submit_dynptr, NULL, 0); @@ -168,10 +175,38 @@ invalid_drain_callback_return(struct bpf_dynptr *dynptr, void *context) /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should * not be able to write to that pointer. */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp/") int user_ringbuf_callback_invalid_return(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, invalid_drain_callback_return, NULL, 0); return 0; } + +static long +try_reinit_dynptr_mem(struct bpf_dynptr *dynptr, void *context) +{ + bpf_dynptr_from_mem(&map_value, 4, 0, dynptr); + return 0; +} + +static long +try_reinit_dynptr_ringbuf(struct bpf_dynptr *dynptr, void *context) +{ + bpf_ringbuf_reserve_dynptr(&ringbuf, 8, 0, dynptr); + return 0; +} + +SEC("?raw_tp/") +int user_ringbuf_callback_reinit_dynptr_mem(void *ctx) +{ + bpf_user_ringbuf_drain(&user_ringbuf, try_reinit_dynptr_mem, NULL, 0); + return 0; +} + +SEC("?raw_tp/") +int user_ringbuf_callback_reinit_dynptr_ringbuf(void *ctx) +{ + bpf_user_ringbuf_drain(&user_ringbuf, try_reinit_dynptr_ringbuf, NULL, 0); + return 0; +} -- cgit v1.2.3 From f3212ad5b7e93c002bd2dbe552c2b0b0033317ff Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Fri, 9 Dec 2022 11:24:01 +0000 Subject: docs/bpf: Add documentation for BPF_MAP_TYPE_SK_STORAGE Add documentation for the BPF_MAP_TYPE_SK_STORAGE including kernel version introduced, usage and examples. Signed-off-by: Donald Hunter Acked-by: David Vernet Link: https://lore.kernel.org/r/20221209112401.69319-1-donald.hunter@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/map_sk_storage.rst | 155 +++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 Documentation/bpf/map_sk_storage.rst diff --git a/Documentation/bpf/map_sk_storage.rst b/Documentation/bpf/map_sk_storage.rst new file mode 100644 index 000000000000..047e16c8aaa8 --- /dev/null +++ b/Documentation/bpf/map_sk_storage.rst @@ -0,0 +1,155 @@ +.. SPDX-License-Identifier: GPL-2.0-only +.. Copyright (C) 2022 Red Hat, Inc. + +======================= +BPF_MAP_TYPE_SK_STORAGE +======================= + +.. note:: + - ``BPF_MAP_TYPE_SK_STORAGE`` was introduced in kernel version 5.2 + +``BPF_MAP_TYPE_SK_STORAGE`` is used to provide socket-local storage for BPF +programs. A map of type ``BPF_MAP_TYPE_SK_STORAGE`` declares the type of storage +to be provided and acts as the handle for accessing the socket-local +storage. The values for maps of type ``BPF_MAP_TYPE_SK_STORAGE`` are stored +locally with each socket instead of with the map. 
The kernel is responsible for +allocating storage for a socket when requested and for freeing the storage when +either the map or the socket is deleted. + +.. note:: + - The key type must be ``int`` and ``max_entries`` must be set to ``0``. + - The ``BPF_F_NO_PREALLOC`` flag must be used when creating a map for + socket-local storage. + +Usage +===== + +Kernel BPF +---------- + +bpf_sk_storage_get() +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: c + + void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) + +Socket-local storage can be retrieved using the ``bpf_sk_storage_get()`` +helper. The helper gets the storage from ``sk`` that is associated with ``map``. +If the ``BPF_LOCAL_STORAGE_GET_F_CREATE`` flag is used then +``bpf_sk_storage_get()`` will create the storage for ``sk`` if it does not +already exist. ``value`` can be used together with +``BPF_LOCAL_STORAGE_GET_F_CREATE`` to initialize the storage value, otherwise it +will be zero initialized. Returns a pointer to the storage on success, or +``NULL`` in case of failure. + +.. note:: + - ``sk`` is a kernel ``struct sock`` pointer for LSM or tracing programs. + - ``sk`` is a ``struct bpf_sock`` pointer for other program types. + +bpf_sk_storage_delete() +~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: c + + long bpf_sk_storage_delete(struct bpf_map *map, void *sk) + +Socket-local storage can be deleted using the ``bpf_sk_storage_delete()`` +helper. The helper deletes the storage from ``sk`` that is identified by +``map``. Returns ``0`` on success, or negative error in case of failure. + +User space +---------- + +bpf_map_update_elem() +~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: c + + int bpf_map_update_elem(int map_fd, const void *key, const void *value, __u64 flags) + +Socket-local storage for the socket identified by ``key`` belonging to +``map_fd`` can be added or updated using the ``bpf_map_update_elem()`` libbpf +function. ``key`` must be a pointer to a valid ``fd`` in the user space +program. The ``flags`` parameter can be used to control the update behaviour: + +- ``BPF_ANY`` will create storage for ``fd`` or update existing storage. +- ``BPF_NOEXIST`` will create storage for ``fd`` only if it did not already + exist, otherwise the call will fail with ``-EEXIST``. +- ``BPF_EXIST`` will update existing storage for ``fd`` if it already exists, + otherwise the call will fail with ``-ENOENT``. + +Returns ``0`` on success, or negative error in case of failure. + +bpf_map_lookup_elem() +~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: c + + int bpf_map_lookup_elem(int map_fd, const void *key, void *value) + +Socket-local storage for the socket identified by ``key`` belonging to +``map_fd`` can be retrieved using the ``bpf_map_lookup_elem()`` libbpf +function. ``key`` must be a pointer to a valid ``fd`` in the user space +program. Returns ``0`` on success, or negative error in case of failure. + +bpf_map_delete_elem() +~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: c + + int bpf_map_delete_elem(int map_fd, const void *key) + +Socket-local storage for the socket identified by ``key`` belonging to +``map_fd`` can be deleted using the ``bpf_map_delete_elem()`` libbpf +function. Returns ``0`` on success, or negative error in case of failure. + +Examples +======== + +Kernel BPF +---------- + +This snippet shows how to declare socket-local storage in a BPF program: + +.. 
code-block:: c + + struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct my_storage); + } socket_storage SEC(".maps"); + +This snippet shows how to retrieve socket-local storage in a BPF program: + +.. code-block:: c + + SEC("sockops") + int _sockops(struct bpf_sock_ops *ctx) + { + struct my_storage *storage; + struct bpf_sock *sk; + + sk = ctx->sk; + if (!sk) + return 1; + + storage = bpf_sk_storage_get(&socket_storage, sk, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!storage) + return 1; + + /* Use 'storage' here */ + + return 1; + } + + +Please see the ``tools/testing/selftests/bpf`` directory for functional +examples. + +References +========== + +https://lwn.net/ml/netdev/20190426171103.61892-1-kafai@fb.com/ -- cgit v1.2.3 From 7c884339bbff80250bfc11d56b5cf48640e6ebdb Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 9 Dec 2022 15:57:27 +0200 Subject: bpf: regsafe() must not skip check_ids() The verifier.c:regsafe() has the following shortcut: equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; ... if (equal) return true; Which is executed regardless old register type. This is incorrect for register types that might have an ID checked by check_ids(), namely: - PTR_TO_MAP_KEY - PTR_TO_MAP_VALUE - PTR_TO_PACKET_META - PTR_TO_PACKET The following pattern could be used to exploit this: 0: r9 = map_lookup_elem(...) ; Returns PTR_TO_MAP_VALUE_OR_NULL id=1. 1: r8 = map_lookup_elem(...) ; Returns PTR_TO_MAP_VALUE_OR_NULL id=2. 2: r7 = ktime_get_ns() ; Unbound SCALAR_VALUE. 3: r6 = ktime_get_ns() ; Unbound SCALAR_VALUE. 4: if r6 > r7 goto +1 ; No new information about the state ; is derived from this check, thus ; produced verifier states differ only ; in 'insn_idx'. 5: r9 = r8 ; Optionally make r9.id == r8.id. --- checkpoint --- ; Assume is_state_visisted() creates a ; checkpoint here. 6: if r9 == 0 goto ; Nullness info is propagated to all ; registers with matching ID. 7: r1 = *(u64 *) r8 ; Not always safe. Verifier first visits path 1-7 where r8 is verified to be not null at (6). Later the jump from 4 to 6 is examined. The checkpoint for (6) looks as follows: R8_rD=map_value_or_null(id=2,off=0,ks=4,vs=8,imm=0) R9_rwD=map_value_or_null(id=2,off=0,ks=4,vs=8,imm=0) R10=fp0 The current state is: R0=... R6=... R7=... fp-8=... R8=map_value_or_null(id=2,off=0,ks=4,vs=8,imm=0) R9=map_value_or_null(id=1,off=0,ks=4,vs=8,imm=0) R10=fp0 Note that R8 states are byte-to-byte identical, so regsafe() would exit early and skip call to check_ids(), thus ID mapping 2->2 will not be added to 'idmap'. Next, states for R9 are compared: these are not identical and check_ids() is executed, but 'idmap' is empty, so check_ids() adds mapping 2->1 to 'idmap' and returns success. This commit pushes the 'equal' down to register types that don't need check_ids(). 
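For readers unfamiliar with the mechanism, the ID comparison that must not be skipped works roughly as follows (a simplified sketch of the idea behind check_ids(), not the kernel code verbatim):

	#include <stdbool.h>
	#include <stdint.h>

	struct id_pair { uint32_t old, cur; };

	/* IDs from the cached (old) state must map consistently onto IDs in the
	 * current state; the mapping is built up incrementally while the two
	 * states are compared register by register.
	 */
	static bool ids_match(uint32_t old_id, uint32_t cur_id,
			      struct id_pair *idmap, int n)
	{
		int i;

		for (i = 0; i < n; i++) {
			if (!idmap[i].old) {
				/* First time old_id is seen: record the mapping. */
				idmap[i].old = old_id;
				idmap[i].cur = cur_id;
				return true;
			}
			if (idmap[i].old == old_id)
				return idmap[i].cur == cur_id;
		}
		return false;	/* out of slots: be conservative */
	}

With the shortcut removed, byte-identical registers of ID-carrying types still go through this mapping, so the 2->2 pair in the example above gets recorded before R9 is compared.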
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20221209135733.28851-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9791788071d5..97645f767354 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13064,15 +13064,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; - if (rold->type == PTR_TO_STACK) - /* two stack pointers are equal only if they're pointing to - * the same stack frame, since fp-8 in foo != fp-8 in bar - */ - return equal && rold->frameno == rcur->frameno; - - if (equal) - return true; - if (rold->type == NOT_INIT) /* explored state can't have used this */ return true; @@ -13080,6 +13071,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return false; switch (base_type(rold->type)) { case SCALAR_VALUE: + if (equal) + return true; if (env->explore_alu_limits) return false; if (rcur->type == SCALAR_VALUE) { @@ -13150,20 +13143,14 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); - case PTR_TO_CTX: - case CONST_PTR_TO_MAP: - case PTR_TO_PACKET_END: - case PTR_TO_FLOW_KEYS: - case PTR_TO_SOCKET: - case PTR_TO_SOCK_COMMON: - case PTR_TO_TCP_SOCK: - case PTR_TO_XDP_SOCK: - /* Only valid matches are exact, which memcmp() above - * would have accepted + case PTR_TO_STACK: + /* two stack pointers are equal only if they're pointing to + * the same stack frame, since fp-8 in foo != fp-8 in bar */ + return equal && rold->frameno == rcur->frameno; default: - /* Don't know what's going on, just say it's not safe */ - return false; + /* Only valid matches are exact, which memcmp() */ + return equal; } /* Shouldn't get here; if we do, say it's not safe */ -- cgit v1.2.3 From cb578c1c9cf60275f258b5af7929c2a54a188f66 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 9 Dec 2022 15:57:28 +0200 Subject: selftests/bpf: test cases for regsafe() bug skipping check_id() Under certain conditions it was possible for verifier.c:regsafe() to skip check_id() call. This commit adds negative test cases previously errorneously accepted as safe. 
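In plain C, the hazard the new direct packet access case encodes looks roughly like this (a model for illustration only; the actual tests below are written in raw BPF instructions):

	#include <stdint.h>
	#include <stddef.h>

	/* A bounds check on 'b' proves something about 'a' only when the branch
	 * that aliased the two pointers was actually taken, which the verifier
	 * can only track by comparing their IDs via check_ids().
	 */
	static int read_one(const uint8_t *data, const uint8_t *data_end,
			    size_t off_a, size_t off_b, int alias)
	{
		const uint8_t *a = data + off_a + 1;
		const uint8_t *b = data + off_b + 1;

		if (alias)
			a = b;		/* only on this path do a and b coincide */
		if (b > data_end)
			return -1;	/* bounds check covers b ... */
		return a[-1];		/* ... but not a on the non-aliased path */
	}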
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20221209135733.28851-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/verifier/direct_packet_access.c | 54 ++++++++++++++++++++++ .../testing/selftests/bpf/verifier/value_or_null.c | 49 ++++++++++++++++++++ 2 files changed, 103 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/direct_packet_access.c b/tools/testing/selftests/bpf/verifier/direct_packet_access.c index 11acd1855acf..dce2e28aeb43 100644 --- a/tools/testing/selftests/bpf/verifier/direct_packet_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_packet_access.c @@ -654,3 +654,57 @@ .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, +{ + "direct packet access: test30 (check_id() in regsafe(), bad access)", + .insns = { + /* r9 = ctx */ + BPF_MOV64_REG(BPF_REG_9, BPF_REG_1), + /* r7 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + /* r6 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* r2 = ctx->data + * r3 = ctx->data + * r4 = ctx->data_end + */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_9, offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_9, offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_9, offsetof(struct __sk_buff, data_end)), + /* if r6 > 100 goto exit + * if r7 > 100 goto exit + */ + BPF_JMP_IMM(BPF_JGT, BPF_REG_6, 100, 9), + BPF_JMP_IMM(BPF_JGT, BPF_REG_7, 100, 8), + /* r2 += r6 ; this forces assignment of ID to r2 + * r2 += 1 ; get some fixed off for r2 + * r3 += r7 ; this forces assignment of ID to r3 + * r3 += 1 ; get some fixed off for r3 + */ + BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_6), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_7), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 1), + /* if r6 > r7 goto +1 ; no new information about the state is derived from + * ; this check, thus produced verifier states differ + * ; only in 'insn_idx' + * r2 = r3 ; optionally share ID between r2 and r3 + */ + BPF_JMP_REG(BPF_JNE, BPF_REG_6, BPF_REG_7, 1), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_3), + /* if r3 > ctx->data_end goto exit */ + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_4, 1), + /* r5 = *(u8 *) (r2 - 1) ; access packet memory using r2, + * ; this is not always safe + */ + BPF_LDX_MEM(BPF_B, BPF_REG_5, BPF_REG_2, -1), + /* exit(0) */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .flags = BPF_F_TEST_STATE_FREQ, + .result = REJECT, + .errstr = "invalid access to packet, off=0 size=1, R2", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, +}, diff --git a/tools/testing/selftests/bpf/verifier/value_or_null.c b/tools/testing/selftests/bpf/verifier/value_or_null.c index 3ecb70a3d939..52a8bca14f03 100644 --- a/tools/testing/selftests/bpf/verifier/value_or_null.c +++ b/tools/testing/selftests/bpf/verifier/value_or_null.c @@ -169,3 +169,52 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, }, +{ + "MAP_VALUE_OR_NULL check_ids() in regsafe()", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + /* r9 = map_lookup_elem(...) */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, + 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), + /* r8 = map_lookup_elem(...) 
*/ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, + 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + /* r7 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + /* r6 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* if r6 > r7 goto +1 ; no new information about the state is derived from + * ; this check, thus produced verifier states differ + * ; only in 'insn_idx' + * r9 = r8 ; optionally share ID between r9 and r8 + */ + BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 1), + BPF_MOV64_REG(BPF_REG_9, BPF_REG_8), + /* if r9 == 0 goto */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_9, 0, 1), + /* read map value via r8, this is not always + * safe because r8 might be not equal to r9. + */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_8, 0), + /* exit 0 */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .flags = BPF_F_TEST_STATE_FREQ, + .fixup_map_hash_8b = { 3, 9 }, + .result = REJECT, + .errstr = "R8 invalid mem access 'map_value_or_null'", + .result_unpriv = REJECT, + .errstr_unpriv = "", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, +}, -- cgit v1.2.3 From 5dd9cdbc9dec3e99b19e483767e247d15ca8cc0d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 9 Dec 2022 15:57:29 +0200 Subject: bpf: states_equal() must build idmap for all function frames verifier.c:states_equal() must maintain register ID mapping across all function frames. Otherwise the following example might be erroneously marked as safe: main: fp[-24] = map_lookup_elem(...) ; frame[0].fp[-24].id == 1 fp[-32] = map_lookup_elem(...) ; frame[0].fp[-32].id == 2 r1 = &fp[-24] r2 = &fp[-32] call foo() r0 = 0 exit foo: 0: r9 = r1 1: r8 = r2 2: r7 = ktime_get_ns() 3: r6 = ktime_get_ns() 4: if (r6 > r7) goto skip_assign 5: r9 = r8 skip_assign: ; <--- checkpoint 6: r9 = *r9 ; (a) frame[1].r9.id == 2 ; (b) frame[1].r9.id == 1 7: if r9 == 0 goto exit: ; mark_ptr_or_null_regs() transfers != 0 info ; for all regs sharing ID: ; (a) r9 != 0 => &frame[0].fp[-32] != 0 ; (b) r9 != 0 => &frame[0].fp[-24] != 0 8: r8 = *r8 ; (a) r8 == &frame[0].fp[-32] ; (b) r8 == &frame[0].fp[-32] 9: r0 = *r8 ; (a) safe ; (b) unsafe exit: 10: exit While processing call to foo() verifier considers the following execution paths: (a) 0-10 (b) 0-4,6-10 (There is also path 0-7,10 but it is not interesting for the issue at hand. (a) is verified first.) Suppose that checkpoint is created at (6) when path (a) is verified, next path (b) is verified and (6) is reached. If states_equal() maintains separate 'idmap' for each frame the mapping at (6) for frame[1] would be empty and regsafe(r9)::check_ids() would add a pair 2->1 and return true, which is an error. If states_equal() maintains single 'idmap' for all frames the mapping at (6) would be { 1->1, 2->2 } and regsafe(r9)::check_ids() would return false when trying to add a pair 2->1. 
This issue was suggested in the following discussion: https://lore.kernel.org/bpf/CAEf4BzbFB5g4oUfyxk9rHy-PJSLQ3h8q9mV=rVoXfr_JVm8+1Q@mail.gmail.com/ Suggested-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20221209135733.28851-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- kernel/bpf/verifier.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index df0cb825e0e3..53d175cbaa02 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -273,9 +273,9 @@ struct bpf_id_pair { u32 cur; }; -/* Maximum number of register states that can exist at once */ -#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) #define MAX_CALL_FRAMES 8 +/* Maximum number of register states that can exist at once */ +#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 97645f767354..00c8e0a28203 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13260,7 +13260,6 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat { int i; - memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); for (i = 0; i < MAX_BPF_REG; i++) if (!regsafe(env, &old->regs[i], &cur->regs[i], env->idmap_scratch)) @@ -13284,6 +13283,8 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->curframe != cur->curframe) return false; + memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); + /* Verification state from speculative execution simulation * must never prune a non-speculative execution one. */ -- cgit v1.2.3 From 7d05794330877986f605c1618534d7478030f5b8 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 9 Dec 2022 15:57:30 +0200 Subject: selftests/bpf: verify states_equal() maintains idmap across all frames A test case that would erroneously pass verification if verifier.c:states_equal() maintains separate register ID mappings for call frames. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20221209135733.28851-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/calls.c | 82 ++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index babcec123251..9d993926bf0e 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -2305,3 +2305,85 @@ .errstr = "!read_ok", .result = REJECT, }, +/* Make sure that verifier.c:states_equal() considers IDs from all + * frames when building 'idmap' for check_ids(). + */ +{ + "calls: check_ids() across call boundary", + .insns = { + /* Function main() */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + /* fp[-24] = map_lookup_elem(...) ; get a MAP_VALUE_PTR_OR_NULL with some ID */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, + 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_0, -24), + /* fp[-32] = map_lookup_elem(...) 
; get a MAP_VALUE_PTR_OR_NULL with some ID */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, + 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_0, -32), + /* call foo(&fp[-24], &fp[-32]) ; both arguments have IDs in the current + * ; stack frame + */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -24), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -32), + BPF_CALL_REL(2), + /* exit 0 */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* Function foo() + * + * r9 = &frame[0].fp[-24] ; save arguments in the callee saved registers, + * r8 = &frame[0].fp[-32] ; arguments are pointers to pointers to map value + */ + BPF_MOV64_REG(BPF_REG_9, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_2), + /* r7 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + /* r6 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* if r6 > r7 goto +1 ; no new information about the state is derived from + * ; this check, thus produced verifier states differ + * ; only in 'insn_idx' + * r9 = r8 + */ + BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 1), + BPF_MOV64_REG(BPF_REG_9, BPF_REG_8), + /* r9 = *r9 ; verifier get's to this point via two paths: + * ; (I) one including r9 = r8, verified first; + * ; (II) one excluding r9 = r8, verified next. + * ; After load of *r9 to r9 the frame[0].fp[-24].id == r9.id. + * ; Suppose that checkpoint is created here via path (I). + * ; When verifying via (II) the r9.id must be compared against + * ; frame[0].fp[-24].id, otherwise (I) and (II) would be + * ; incorrectly deemed equivalent. + * if r9 == 0 goto + */ + BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_9, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_9, 0, 1), + /* r8 = *r8 ; read map value via r8, this is not safe + * r0 = *r8 ; because r8 might be not equal to r9. + */ + BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_8, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_8, 0), + /* exit 0 */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .flags = BPF_F_TEST_STATE_FREQ, + .fixup_map_hash_8b = { 3, 9 }, + .result = REJECT, + .errstr = "R8 invalid mem access 'map_value_or_null'", + .result_unpriv = REJECT, + .errstr_unpriv = "", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, +}, -- cgit v1.2.3 From 4ea2bb158bec2fe171e7e07c033dcf208d86e274 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 9 Dec 2022 15:57:31 +0200 Subject: bpf: use check_ids() for active_lock comparison An update for verifier.c:states_equal()/regsafe() to use check_ids() for active spin lock comparisons. This fixes the issue reported by Kumar Kartikeya Dwivedi in [1] using technique suggested by Edward Cree. W/o this commit the verifier might be tricked to accept the following program working with a map containing spin locks: 0: r9 = map_lookup_elem(...) ; Returns PTR_TO_MAP_VALUE_OR_NULL id=1. 1: r8 = map_lookup_elem(...) ; Returns PTR_TO_MAP_VALUE_OR_NULL id=2. 2: if r9 == 0 goto exit ; r9 -> PTR_TO_MAP_VALUE. 3: if r8 == 0 goto exit ; r8 -> PTR_TO_MAP_VALUE. 4: r7 = ktime_get_ns() ; Unbound SCALAR_VALUE. 5: r6 = ktime_get_ns() ; Unbound SCALAR_VALUE. 6: bpf_spin_lock(r8) ; active_lock.id == 2. 7: if r6 > r7 goto +1 ; No new information about the state ; is derived from this check, thus ; produced verifier states differ only ; in 'insn_idx'. 8: r9 = r8 ; Optionally make r9.id == r8.id. 
--- checkpoint --- ; Assume is_state_visisted() creates a ; checkpoint here. 9: bpf_spin_unlock(r9) ; (a,b) active_lock.id == 2. ; (a) r9.id == 2, (b) r9.id == 1. 10: exit(0) Consider two verification paths: (a) 0-10 (b) 0-7,9-10 The path (a) is verified first. If checkpoint is created at (8) the (b) would assume that (8) is safe because regsafe() does not compare register ids for registers of type PTR_TO_MAP_VALUE. [1] https://lore.kernel.org/bpf/20221111202719.982118-1-memxor@gmail.com/ Reported-by: Kumar Kartikeya Dwivedi Suggested-by: Edward Cree Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20221209135733.28851-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 00c8e0a28203..a5255a0dcbb6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13119,7 +13119,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, */ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); + tnum_in(rold->var_off, rcur->var_off) && + check_ids(rold->id, rcur->id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: if (rcur->type != rold->type) @@ -13291,8 +13292,17 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->speculative && !cur->speculative) return false; - if (old->active_lock.ptr != cur->active_lock.ptr || - old->active_lock.id != cur->active_lock.id) + if (old->active_lock.ptr != cur->active_lock.ptr) + return false; + + /* Old and cur active_lock's have to be either both present + * or both absent. + */ + if (!!old->active_lock.id != !!cur->active_lock.id) + return false; + + if (old->active_lock.id && + !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch)) return false; if (old->active_rcu_lock != cur->active_rcu_lock) -- cgit v1.2.3 From 2026f2062df860e5d282ffd4962ea5d5ed53dc51 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 9 Dec 2022 15:57:32 +0200 Subject: selftests/bpf: Add pruning test case for bpf_spin_lock Test that when reg->id is not same for the same register of type PTR_TO_MAP_VALUE between current and old explored state, we currently return false from regsafe and continue exploring. Without the fix in prior commit, the test case fails. 
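In C terms, the pattern this test encodes looks roughly like the following (an illustrative sketch assuming a map whose value embeds a struct bpf_spin_lock; the test itself, below, is written in raw BPF instructions):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct val {
		struct bpf_spin_lock lock;
		int data;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 1);
		__type(key, int);
		__type(value, struct val);
	} lock_map SEC(".maps");

	SEC("tc")
	int unlock_mismatch(struct __sk_buff *ctx)
	{
		int mark = ctx->mark;
		int key = 0;
		struct val *a, *b;

		/* Two lookups of the same key still produce pointers with distinct IDs. */
		a = bpf_map_lookup_elem(&lock_map, &key);
		b = bpf_map_lookup_elem(&lock_map, &key);
		if (!a || !b)
			return 0;

		bpf_spin_lock(&a->lock);
		if (mark)
			a = b;	/* the verifier can no longer prove 'a' names the lock taken above */
		bpf_spin_unlock(&a->lock);	/* rejected: "bpf_spin_unlock of different lock" */
		return 0;
	}

	char _license[] SEC("license") = "GPL";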
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20221209135733.28851-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/spin_lock.c | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/spin_lock.c b/tools/testing/selftests/bpf/verifier/spin_lock.c index 781621facae4..0a8dcfc37fc6 100644 --- a/tools/testing/selftests/bpf/verifier/spin_lock.c +++ b/tools/testing/selftests/bpf/verifier/spin_lock.c @@ -331,3 +331,42 @@ .errstr = "inside bpf_spin_lock", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, +{ + "spin_lock: regsafe compare reg->id for map value", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_6, offsetof(struct __sk_buff, mark)), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_REG(BPF_REG_9, BPF_REG_1), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_9), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_unlock), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_spin_lock = { 2 }, + .result = REJECT, + .errstr = "bpf_spin_unlock of different lock", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .flags = BPF_F_TEST_STATE_FREQ, +}, -- cgit v1.2.3 From efd6286ff74a2fa2b45ed070d344cc0822b8ea6e Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 9 Dec 2022 15:57:33 +0200 Subject: selftests/bpf: test case for relaxed prunning of active_lock.id Check that verifier.c:states_equal() uses check_ids() to match consistent active_lock/map_value configurations. This allows to prune states with active spin locks even if numerical values of active_lock ids do not match across compared states. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20221209135733.28851-8-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/spin_lock.c | 75 ++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/spin_lock.c b/tools/testing/selftests/bpf/verifier/spin_lock.c index 0a8dcfc37fc6..eaf114f07e2e 100644 --- a/tools/testing/selftests/bpf/verifier/spin_lock.c +++ b/tools/testing/selftests/bpf/verifier/spin_lock.c @@ -370,3 +370,78 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .flags = BPF_F_TEST_STATE_FREQ, }, +/* Make sure that regsafe() compares ids for spin lock records using + * check_ids(): + * 1: r9 = map_lookup_elem(...) ; r9.id == 1 + * 2: r8 = map_lookup_elem(...) 
; r8.id == 2 + * 3: r7 = ktime_get_ns() + * 4: r6 = ktime_get_ns() + * 5: if r6 > r7 goto <9> + * 6: spin_lock(r8) + * 7: r9 = r8 + * 8: goto <10> + * 9: spin_lock(r9) + * 10: spin_unlock(r9) ; r9.id == 1 || r9.id == 2 and lock is active, + * ; second visit to (10) should be considered safe + * ; if check_ids() is used. + * 11: exit(0) + */ +{ + "spin_lock: regsafe() check_ids() similar id mappings", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + /* r9 = map_lookup_elem(...) */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, + 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 24), + BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), + /* r8 = map_lookup_elem(...) */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, + 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 18), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + /* r7 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + /* r6 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* if r6 > r7 goto +5 ; no new information about the state is derived from + * ; this check, thus produced verifier states differ + * ; only in 'insn_idx' + * spin_lock(r8) + * r9 = r8 + * goto unlock + */ + BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), + BPF_EMIT_CALL(BPF_FUNC_spin_lock), + BPF_MOV64_REG(BPF_REG_9, BPF_REG_8), + BPF_JMP_A(3), + /* spin_lock(r9) */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_9), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), + BPF_EMIT_CALL(BPF_FUNC_spin_lock), + /* spin_unlock(r9) */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_9), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), + BPF_EMIT_CALL(BPF_FUNC_spin_unlock), + /* exit(0) */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_spin_lock = { 3, 10 }, + .result = VERBOSE_ACCEPT, + .errstr = "28: safe", + .result_unpriv = REJECT, + .errstr_unpriv = "", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .flags = BPF_F_TEST_STATE_FREQ, +}, -- cgit v1.2.3
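To round out the series: in the same C terms as the sketch accompanying the previous spin_lock test (same assumed lock_map and struct val declarations), the pattern this final test accepts looks roughly like:

	SEC("tc")
	int unlock_either_path(struct __sk_buff *ctx)
	{
		int key = 0;
		struct val *a, *b;

		a = bpf_map_lookup_elem(&lock_map, &key);
		b = bpf_map_lookup_elem(&lock_map, &key);
		if (!a || !b)
			return 0;

		if (bpf_ktime_get_ns() > bpf_ktime_get_ns()) {
			bpf_spin_lock(&a->lock);
		} else {
			bpf_spin_lock(&b->lock);
			a = b;
		}
		/* On both paths 'a' names the lock that is currently held, so with
		 * check_ids() the states reaching this point can be pruned as safe.
		 */
		bpf_spin_unlock(&a->lock);
		return 0;
	}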