From b8d78cb2e24d92352878a9f6525aec002c891528 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 23 Nov 2023 02:04:39 +0200 Subject: libbpf: Start v1.4 development cycle Bump libbpf.map to v1.4.0 to start a new libbpf version cycle. Signed-off-by: Eduard Zingerman Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20231123000439.12025-1-eddyz87@gmail.com --- tools/lib/bpf/libbpf.map | 3 +++ tools/lib/bpf/libbpf_version.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index b52dc28dc8af..91c5aef7dae7 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -409,3 +409,6 @@ LIBBPF_1.3.0 { ring__size; ring_buffer__ring; } LIBBPF_1.2.0; + +LIBBPF_1.4.0 { +} LIBBPF_1.3.0; diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h index 290411ddb39e..e783a47da815 100644 --- a/tools/lib/bpf/libbpf_version.h +++ b/tools/lib/bpf/libbpf_version.h @@ -4,6 +4,6 @@ #define __LIBBPF_VERSION_H #define LIBBPF_MAJOR_VERSION 1 -#define LIBBPF_MINOR_VERSION 3 +#define LIBBPF_MINOR_VERSION 4 #endif /* __LIBBPF_VERSION_H */ -- cgit v1.2.3 From 2afae08c9dcb8ac648414277cec70c2fe6a34d9e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 23 Nov 2023 19:59:36 -0800 Subject: bpf: Validate global subprogs lazily Slightly change BPF verifier logic around eagerness and order of global subprog validation. Instead of going over every global subprog eagerly and validating it before main (entry) BPF program is verified, turn it around. Validate main program first, mark subprogs that were called from main program for later verification, but otherwise assume it is valid. Afterwards, go over marked global subprogs and validate those, potentially marking some more global functions as being called. Continue this process until all (transitively) callable global subprogs are validated. It's a BFS traversal at its heart and will always converge. This is an important change because it allows to feature-gate some subprograms that might not be verifiable on some older kernel, depending on supported set of features. E.g., at some point, global functions were allowed to accept a pointer to memory, which size is identified by user-provided type. Unfortunately, older kernels don't support this feature. With BPF CO-RE approach, the natural way would be to still compile BPF object file once and guard calls to this global subprog with some CO-RE check or using .rodata variables. That's what people do to guard usage of new helpers or kfuncs, and any other new BPF-side feature that might be missing on old kernels. That's currently impossible to do with global subprogs, unfortunately, because they are eagerly and unconditionally validated. This patch set aims to change this, so that in the future when global funcs gain new features, those can be guarded using BPF CO-RE techniques in the same fashion as any other new kernel feature. Two selftests had to be adjusted in sync with these changes. test_global_func12 relied on eager global subprog validation failing before main program failure is detected (unknown return value). Fix by making sure that main program is always valid. verifier_subprog_precision's parent_stack_slot_precise subtest relied on verifier checkpointing heuristic to do a checkpoint at instruction #5, but that's no longer true because we don't have enough jumps validated before reaching insn #5 due to global subprogs being validated later. Other than that, no changes, as one would expect. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20231124035937.403208-3-andrii@kernel.org --- include/linux/bpf.h | 2 + kernel/bpf/verifier.c | 48 +++++++++++++++++++--- .../selftests/bpf/progs/test_global_func12.c | 4 +- .../bpf/progs/verifier_subprog_precision.c | 4 +- 4 files changed, 48 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 258ba232e302..eb447b0a9423 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1347,6 +1347,8 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) struct bpf_func_info_aux { u16 linkage; bool unreliable; + bool called : 1; + bool verified : 1; }; enum bpf_jit_poke_reason { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a2939ebf2638..8e7b6072e3f4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -434,6 +434,11 @@ static const char *subprog_name(const struct bpf_verifier_env *env, int subprog) return btf_type_name(env->prog->aux->btf, info->type_id); } +static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env, int subprog) +{ + return &env->prog->aux->func_info_aux[subprog]; +} + static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); @@ -9290,6 +9295,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, verbose(env, "Func#%d ('%s') is global and assumed valid.\n", subprog, sub_name); + /* mark global subprog for verifying after main prog */ + subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); /* All global functions return a 64-bit SCALAR_VALUE */ @@ -19873,8 +19880,11 @@ out: return ret; } -/* Verify all global functions in a BPF program one by one based on their BTF. - * All global functions must pass verification. Otherwise the whole program is rejected. +/* Lazily verify all global functions based on their BTF, if they are called + * from main BPF program or any of subprograms transitively. + * BPF global subprogs called from dead code are not validated. + * All callable global functions must pass verification. + * Otherwise the whole program is rejected. * Consider: * int bar(int); * int foo(int f) @@ -19893,14 +19903,26 @@ out: static int do_check_subprogs(struct bpf_verifier_env *env) { struct bpf_prog_aux *aux = env->prog->aux; - int i, ret; + struct bpf_func_info_aux *sub_aux; + int i, ret, new_cnt; if (!aux->func_info) return 0; + /* exception callback is presumed to be always called */ + if (env->exception_callback_subprog) + subprog_aux(env, env->exception_callback_subprog)->called = true; + +again: + new_cnt = 0; for (i = 1; i < env->subprog_cnt; i++) { - if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL) + if (!subprog_is_global(env, i)) + continue; + + sub_aux = subprog_aux(env, i); + if (!sub_aux->called || sub_aux->verified) continue; + env->insn_idx = env->subprog_info[i].start; WARN_ON_ONCE(env->insn_idx == 0); ret = do_check_common(env, i, env->exception_callback_subprog == i); @@ -19910,7 +19932,21 @@ static int do_check_subprogs(struct bpf_verifier_env *env) verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n", i, subprog_name(env, i)); } + + /* We verified new global subprog, it might have called some + * more global subprogs that we haven't verified yet, so we + * need to do another pass over subprogs to verify those. + */ + sub_aux->verified = true; + new_cnt++; } + + /* We can't loop forever as we verify at least one global subprog on + * each pass. + */ + if (new_cnt) + goto again; + return 0; } @@ -20556,8 +20592,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; - ret = do_check_subprogs(env); - ret = ret ?: do_check_main(env); + ret = do_check_main(env); + ret = ret ?: do_check_subprogs(env); if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux)) ret = bpf_prog_offload_finalize(env); diff --git a/tools/testing/selftests/bpf/progs/test_global_func12.c b/tools/testing/selftests/bpf/progs/test_global_func12.c index 7f159d83c6f6..6e03d42519a6 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func12.c +++ b/tools/testing/selftests/bpf/progs/test_global_func12.c @@ -19,5 +19,7 @@ int global_func12(struct __sk_buff *skb) { const struct S s = {.x = skb->len }; - return foo(&s); + foo(&s); + + return 1; } diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index f61d623b1ce8..b5efcaeaa1ae 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -370,12 +370,10 @@ __naked int parent_stack_slot_precise(void) SEC("?raw_tp") __success __log_level(2) __msg("9: (0f) r1 += r6") -__msg("mark_precise: frame0: last_idx 9 first_idx 6") +__msg("mark_precise: frame0: last_idx 9 first_idx 0") __msg("mark_precise: frame0: regs=r6 stack= before 8: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 7: (27) r6 *= 4") __msg("mark_precise: frame0: regs=r6 stack= before 6: (79) r6 = *(u64 *)(r10 -8)") -__msg("mark_precise: frame0: parent state regs= stack=-8:") -__msg("mark_precise: frame0: last_idx 5 first_idx 0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (85) call pc+6") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r1 = 0") __msg("mark_precise: frame0: regs= stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r6") -- cgit v1.2.3 From e8a339b5235e294f29153149ea7cf26a9a87dbea Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 23 Nov 2023 19:59:37 -0800 Subject: selftests/bpf: Add lazy global subprog validation tests Add a few test that validate BPF verifier's lazy approach to validating global subprogs. We check that global subprogs that are called transitively through another global subprog is validated. We also check that invalid global subprog is not validated, if it's not called from the main program. And we also check that main program is always validated first, before any of the subprogs. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20231124035937.403208-4-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_global_subprogs.c | 92 ++++++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_global_subprogs.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 5cfa7a6316b6..8d746642cbd7 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -25,6 +25,7 @@ #include "verifier_direct_stack_access_wraparound.skel.h" #include "verifier_div0.skel.h" #include "verifier_div_overflow.skel.h" +#include "verifier_global_subprogs.skel.h" #include "verifier_gotol.skel.h" #include "verifier_helper_access_var_len.skel.h" #include "verifier_helper_packet_access.skel.h" @@ -134,6 +135,7 @@ void test_verifier_direct_packet_access(void) { RUN(verifier_direct_packet_acces void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_stack_access_wraparound); } void test_verifier_div0(void) { RUN(verifier_div0); } void test_verifier_div_overflow(void) { RUN(verifier_div_overflow); } +void test_verifier_global_subprogs(void) { RUN(verifier_global_subprogs); } void test_verifier_gotol(void) { RUN(verifier_gotol); } void test_verifier_helper_access_var_len(void) { RUN(verifier_helper_access_var_len); } void test_verifier_helper_packet_access(void) { RUN(verifier_helper_packet_access); } diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c new file mode 100644 index 000000000000..a0a5efd1caa1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include +#include "bpf_misc.h" + +int arr[1]; +int unkn_idx; + +__noinline long global_bad(void) +{ + return arr[unkn_idx]; /* BOOM */ +} + +__noinline long global_good(void) +{ + return arr[0]; +} + +__noinline long global_calls_bad(void) +{ + return global_good() + global_bad() /* does BOOM indirectly */; +} + +__noinline long global_calls_good_only(void) +{ + return global_good(); +} + +SEC("?raw_tp") +__success __log_level(2) +/* main prog is validated completely first */ +__msg("('global_calls_good_only') is global and assumed valid.") +__msg("1: (95) exit") +/* eventually global_good() is transitively validated as well */ +__msg("Validating global_good() func") +__msg("('global_good') is safe for any args that match its prototype") +int chained_global_func_calls_success(void) +{ + return global_calls_good_only(); +} + +SEC("?raw_tp") +__failure __log_level(2) +/* main prog validated successfully first */ +__msg("1: (95) exit") +/* eventually we validate global_bad() and fail */ +__msg("Validating global_bad() func") +__msg("math between map_value pointer and register") /* BOOM */ +int chained_global_func_calls_bad(void) +{ + return global_calls_bad(); +} + +/* do out of bounds access forcing verifier to fail verification if this + * global func is called + */ +__noinline int global_unsupp(const int *mem) +{ + if (!mem) + return 0; + return mem[100]; /* BOOM */ +} + +const volatile bool skip_unsupp_global = true; + +SEC("?raw_tp") +__success +int guarded_unsupp_global_called(void) +{ + if (!skip_unsupp_global) + return global_unsupp(NULL); + return 0; +} + +SEC("?raw_tp") +__failure __log_level(2) +__msg("Func#1 ('global_unsupp') is global and assumed valid.") +__msg("Validating global_unsupp() func#1...") +__msg("value is outside of the allowed memory range") +int unguarded_unsupp_global_called(void) +{ + int x = 0; + + return global_unsupp(&x); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From b16904fd9f01b580db357ef2b1cc9e86d89576c2 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 26 Nov 2023 21:03:42 -0800 Subject: bpf: Fix a few selftest failures due to llvm18 change With latest upstream llvm18, the following test cases failed: $ ./test_progs -j #13/2 bpf_cookie/multi_kprobe_link_api:FAIL #13/3 bpf_cookie/multi_kprobe_attach_api:FAIL #13 bpf_cookie:FAIL #77 fentry_fexit:FAIL #78/1 fentry_test/fentry:FAIL #78 fentry_test:FAIL #82/1 fexit_test/fexit:FAIL #82 fexit_test:FAIL #112/1 kprobe_multi_test/skel_api:FAIL #112/2 kprobe_multi_test/link_api_addrs:FAIL [...] #112 kprobe_multi_test:FAIL #356/17 test_global_funcs/global_func17:FAIL #356 test_global_funcs:FAIL Further analysis shows llvm upstream patch [1] is responsible for the above failures. For example, for function bpf_fentry_test7() in net/bpf/test_run.c, without [1], the asm code is: 0000000000000400 : 400: f3 0f 1e fa endbr64 404: e8 00 00 00 00 callq 0x409 409: 48 89 f8 movq %rdi, %rax 40c: c3 retq 40d: 0f 1f 00 nopl (%rax) ... and with [1], the asm code is: 0000000000005d20 : 5d20: e8 00 00 00 00 callq 0x5d25 5d25: c3 retq ... and is called instead of and this caused test failures for #13/#77 etc. except #356. For test case #356/17, with [1] (progs/test_global_func17.c)), the main prog looks like: 0000000000000000 : 0: b4 00 00 00 2a 00 00 00 w0 = 0x2a 1: 95 00 00 00 00 00 00 00 exit ... which passed verification while the test itself expects a verification failure. Let us add 'barrier_var' style asm code in both places to prevent function specialization which caused selftests failure. [1] https://github.com/llvm/llvm-project/pull/72903 Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20231127050342.1945270-1-yonghong.song@linux.dev --- net/bpf/test_run.c | 2 +- tools/testing/selftests/bpf/progs/test_global_func17.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c9fdcc5cdce1..711cf5d59816 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -542,7 +542,7 @@ struct bpf_fentry_test_t { int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) { - asm volatile (""); + asm volatile ("": "+r"(arg)); return (long)arg; } diff --git a/tools/testing/selftests/bpf/progs/test_global_func17.c b/tools/testing/selftests/bpf/progs/test_global_func17.c index a32e11c7d933..5de44b09e8ec 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func17.c +++ b/tools/testing/selftests/bpf/progs/test_global_func17.c @@ -5,6 +5,7 @@ __noinline int foo(int *p) { + barrier_var(p); return p ? (*p = 42) : 0; } -- cgit v1.2.3 From 876843ce1e4897e8ceade50bfa3d9a4ec483abf3 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 10:20:56 -0800 Subject: bpftool: mark orphaned programs during prog show Commit ef01f4e25c17 ("bpf: restore the ebpf program ID for BPF_AUDIT_UNLOAD and PERF_BPF_EVENT_PROG_UNLOAD") stopped removing program's id from idr when the offloaded/bound netdev goes away. I was supposed to take a look and check in [0], but apparently I did not. Martin points out it might be useful to keep it that way for observability sake, but we at least need to mark those programs as unusable. Mark those programs as 'orphaned' and keep printing the list when we encounter ENODEV. 0: unspec tag 0000000000000000 xlated 0B not jited memlock 4096B orphaned [0]: https://lore.kernel.org/all/CAKH8qBtyR20ZWAc11z1-6pGb3Hd47AQUTbE_cfoktG59TqaJ7Q@mail.gmail.com/ v3: * use two spaces for " orphaned" (Quentin) Cc: netdev@vger.kernel.org Fixes: ef01f4e25c17 ("bpf: restore the ebpf program ID for BPF_AUDIT_UNLOAD and PERF_BPF_EVENT_PROG_UNLOAD") Signed-off-by: Stanislav Fomichev Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/r/20231127182057.1081138-1-sdf@google.com Signed-off-by: Martin KaFai Lau --- tools/bpf/bpftool/prog.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 7ec4f5671e7a..feb8e305804f 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -442,7 +442,7 @@ static void print_prog_header_json(struct bpf_prog_info *info, int fd) jsonw_uint_field(json_wtr, "recursion_misses", info->recursion_misses); } -static void print_prog_json(struct bpf_prog_info *info, int fd) +static void print_prog_json(struct bpf_prog_info *info, int fd, bool orphaned) { char *memlock; @@ -461,6 +461,7 @@ static void print_prog_json(struct bpf_prog_info *info, int fd) jsonw_uint_field(json_wtr, "uid", info->created_by_uid); } + jsonw_bool_field(json_wtr, "orphaned", orphaned); jsonw_uint_field(json_wtr, "bytes_xlated", info->xlated_prog_len); if (info->jited_prog_len) { @@ -527,7 +528,7 @@ static void print_prog_header_plain(struct bpf_prog_info *info, int fd) printf("\n"); } -static void print_prog_plain(struct bpf_prog_info *info, int fd) +static void print_prog_plain(struct bpf_prog_info *info, int fd, bool orphaned) { char *memlock; @@ -554,6 +555,9 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd) printf(" memlock %sB", memlock); free(memlock); + if (orphaned) + printf(" orphaned"); + if (info->nr_map_ids) show_prog_maps(fd, info->nr_map_ids); @@ -581,15 +585,15 @@ static int show_prog(int fd) int err; err = bpf_prog_get_info_by_fd(fd, &info, &len); - if (err) { + if (err && err != -ENODEV) { p_err("can't get prog info: %s", strerror(errno)); return -1; } if (json_output) - print_prog_json(&info, fd); + print_prog_json(&info, fd, err == -ENODEV); else - print_prog_plain(&info, fd); + print_prog_plain(&info, fd, err == -ENODEV); return 0; } -- cgit v1.2.3 From cf9791631027a476f7cdb0e1b3ac6add16eff264 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 10:20:57 -0800 Subject: selftests/bpf: update test_offload to use new orphaned property - filter orphaned programs by default - when trying to query orphaned program, don't expect bpftool failure Cc: netdev@vger.kernel.org Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20231127182057.1081138-2-sdf@google.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/test_offload.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index 40cba8d368d9..6157f884d091 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -169,12 +169,14 @@ def bpftool(args, JSON=True, ns="", fail=True, include_stderr=False): return tool("bpftool", args, {"json":"-p"}, JSON=JSON, ns=ns, fail=fail, include_stderr=include_stderr) -def bpftool_prog_list(expected=None, ns=""): +def bpftool_prog_list(expected=None, ns="", exclude_orphaned=True): _, progs = bpftool("prog show", JSON=True, ns=ns, fail=True) # Remove the base progs for p in base_progs: if p in progs: progs.remove(p) + if exclude_orphaned: + progs = [ p for p in progs if not p['orphaned'] ] if expected is not None: if len(progs) != expected: fail(True, "%d BPF programs loaded, expected %d" % @@ -612,11 +614,9 @@ def pin_map(file_name, idx=0, expected=1): def check_dev_info_removed(prog_file=None, map_file=None): bpftool_prog_list(expected=0) + bpftool_prog_list(expected=1, exclude_orphaned=False) ret, err = bpftool("prog show pin %s" % (prog_file), fail=False) - fail(ret == 0, "Showing prog with removed device did not fail") - fail(err["error"].find("No such device") == -1, - "Showing prog with removed device expected ENODEV, error is %s" % - (err["error"])) + fail(ret != 0, "failed to show prog with removed device") bpftool_map_list(expected=0) ret, err = bpftool("map show pin %s" % (map_file), fail=False) @@ -1395,10 +1395,7 @@ try: start_test("Test multi-dev ASIC cross-dev destruction - orphaned...") ret, out = bpftool("prog show %s" % (progB), fail=False) - fail(ret == 0, "got information about orphaned program") - fail("error" not in out, "no error reported for get info on orphaned") - fail(out["error"] != "can't get prog info: No such device", - "wrong error for get info on orphaned") + fail(ret != 0, "couldn't get information about orphaned program") print("%s: OK" % (os.path.basename(__file__))) -- cgit v1.2.3 From 48f0dfd8d3e212ab27b6db147ed10407ff6aaa88 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 25 Nov 2023 20:31:25 +0100 Subject: libbpf: Add st_type argument to elf_resolve_syms_offsets function We need to get offsets for static variables in following changes, so making elf_resolve_syms_offsets to take st_type value as argument and passing it to elf_sym_iter_new. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20231125193130.834322-2-jolsa@kernel.org --- tools/lib/bpf/elf.c | 5 +++-- tools/lib/bpf/libbpf.c | 2 +- tools/lib/bpf/libbpf_internal.h | 3 ++- tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/elf.c b/tools/lib/bpf/elf.c index 2a62bf411bb3..b02faec748a5 100644 --- a/tools/lib/bpf/elf.c +++ b/tools/lib/bpf/elf.c @@ -407,7 +407,8 @@ static int symbol_cmp(const void *a, const void *b) * size, that needs to be released by the caller. */ int elf_resolve_syms_offsets(const char *binary_path, int cnt, - const char **syms, unsigned long **poffsets) + const char **syms, unsigned long **poffsets, + int st_type) { int sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB }; int err = 0, i, cnt_done = 0; @@ -438,7 +439,7 @@ int elf_resolve_syms_offsets(const char *binary_path, int cnt, struct elf_sym_iter iter; struct elf_sym *sym; - err = elf_sym_iter_new(&iter, elf_fd.elf, binary_path, sh_types[i], STT_FUNC); + err = elf_sym_iter_new(&iter, elf_fd.elf, binary_path, sh_types[i], st_type); if (err == -ENOENT) continue; if (err) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e067be95da3c..ea9b8158c20d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11447,7 +11447,7 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, return libbpf_err_ptr(err); offsets = resolved_offsets; } else if (syms) { - err = elf_resolve_syms_offsets(path, cnt, syms, &resolved_offsets); + err = elf_resolve_syms_offsets(path, cnt, syms, &resolved_offsets, STT_FUNC); if (err < 0) return libbpf_err_ptr(err); offsets = resolved_offsets; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index f0f08635adb0..b5d334754e5d 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -594,7 +594,8 @@ int elf_open(const char *binary_path, struct elf_fd *elf_fd); void elf_close(struct elf_fd *elf_fd); int elf_resolve_syms_offsets(const char *binary_path, int cnt, - const char **syms, unsigned long **poffsets); + const char **syms, unsigned long **poffsets, + int st_type); int elf_resolve_pattern_offsets(const char *binary_path, const char *pattern, unsigned long **poffsets, size_t *pcnt); diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index cd051d3901a9..ece260cf2c0b 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -249,7 +249,7 @@ static void __test_link_api(struct child *child) int link_extra_fd = -1; int err; - err = elf_resolve_syms_offsets(path, 3, syms, (unsigned long **) &offsets); + err = elf_resolve_syms_offsets(path, 3, syms, (unsigned long **) &offsets, STT_FUNC); if (!ASSERT_OK(err, "elf_resolve_syms_offsets")) return; -- cgit v1.2.3 From e56fdbfb06e26a7066b070967badef4148528df2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 25 Nov 2023 20:31:27 +0100 Subject: bpf: Add link_info support for uprobe multi link Adding support to get uprobe_link details through bpf_link_info interface. Adding new struct uprobe_multi to struct bpf_link_info to carry the uprobe_multi link details. The uprobe_multi.count is passed from user space to denote size of array fields (offsets/ref_ctr_offsets/cookies). The actual array size is stored back to uprobe_multi.count (allowing user to find out the actual array size) and array fields are populated up to the user passed size. All the non-array fields (path/count/flags/pid) are always set. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20231125193130.834322-4-jolsa@kernel.org --- include/uapi/linux/bpf.h | 10 ++++++ kernel/trace/bpf_trace.c | 72 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 10 ++++++ 3 files changed, 92 insertions(+) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7a5498242eaa..e88746ba7d21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6562,6 +6562,16 @@ struct bpf_link_info { __u32 flags; __u64 missed; } kprobe_multi; + struct { + __aligned_u64 path; + __aligned_u64 offsets; + __aligned_u64 ref_ctr_offsets; + __aligned_u64 cookies; + __u32 path_size; /* in/out: real path size on success, including zero byte */ + __u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */ + __u32 flags; + __u32 pid; + } uprobe_multi; struct { __u32 type; /* enum bpf_perf_event_type */ __u32 :32; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ad0323f27288..c284a4ad0315 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3042,6 +3042,7 @@ struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; + u32 flags; struct bpf_uprobe *uprobes; struct task_struct *task; }; @@ -3083,9 +3084,79 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) kfree(umulti_link); } +static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets); + u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies); + u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets); + u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path); + u32 upath_size = info->uprobe_multi.path_size; + struct bpf_uprobe_multi_link *umulti_link; + u32 ucount = info->uprobe_multi.count; + int err = 0, i; + long left; + + if (!upath ^ !upath_size) + return -EINVAL; + + if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount) + return -EINVAL; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + info->uprobe_multi.count = umulti_link->cnt; + info->uprobe_multi.flags = umulti_link->flags; + info->uprobe_multi.pid = umulti_link->task ? + task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; + + if (upath) { + char *p, *buf; + + upath_size = min_t(u32, upath_size, PATH_MAX); + + buf = kmalloc(upath_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + p = d_path(&umulti_link->path, buf, upath_size); + if (IS_ERR(p)) { + kfree(buf); + return PTR_ERR(p); + } + upath_size = buf + upath_size - p; + left = copy_to_user(upath, p, upath_size); + kfree(buf); + if (left) + return -EFAULT; + info->uprobe_multi.path_size = upath_size; + } + + if (!uoffsets && !ucookies && !uref_ctr_offsets) + return 0; + + if (ucount < umulti_link->cnt) + err = -ENOSPC; + else + ucount = umulti_link->cnt; + + for (i = 0; i < ucount; i++) { + if (uoffsets && + put_user(umulti_link->uprobes[i].offset, uoffsets + i)) + return -EFAULT; + if (uref_ctr_offsets && + put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) + return -EFAULT; + if (ucookies && + put_user(umulti_link->uprobes[i].cookie, ucookies + i)) + return -EFAULT; + } + + return err; +} + static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, .dealloc = bpf_uprobe_multi_link_dealloc, + .fill_link_info = bpf_uprobe_multi_link_fill_link_info, }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, @@ -3274,6 +3345,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->uprobes = uprobes; link->path = path; link->task = task; + link->flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, &bpf_uprobe_multi_link_lops, prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7a5498242eaa..e88746ba7d21 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6562,6 +6562,16 @@ struct bpf_link_info { __u32 flags; __u64 missed; } kprobe_multi; + struct { + __aligned_u64 path; + __aligned_u64 offsets; + __aligned_u64 ref_ctr_offsets; + __aligned_u64 cookies; + __u32 path_size; /* in/out: real path size on success, including zero byte */ + __u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */ + __u32 flags; + __u32 pid; + } uprobe_multi; struct { __u32 type; /* enum bpf_perf_event_type */ __u32 :32; -- cgit v1.2.3 From 1703612885723869064f18e8816c6f3f87987748 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 25 Nov 2023 20:31:28 +0100 Subject: selftests/bpf: Use bpf_link__destroy in fill_link_info tests The fill_link_info test keeps skeleton open and just creates various links. We are wrongly calling bpf_link__detach after each test to close them, we need to call bpf_link__destroy. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Yafang Shao Link: https://lore.kernel.org/bpf/20231125193130.834322-5-jolsa@kernel.org --- .../selftests/bpf/prog_tests/fill_link_info.c | 44 +++++++++++----------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c index 97142a4db374..9294cb8d7743 100644 --- a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c +++ b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c @@ -140,14 +140,14 @@ static void test_kprobe_fill_link_info(struct test_fill_link_info *skel, .retprobe = type == BPF_PERF_EVENT_KRETPROBE, ); ssize_t entry_offset = 0; + struct bpf_link *link; int link_fd, err; - skel->links.kprobe_run = bpf_program__attach_kprobe_opts(skel->progs.kprobe_run, - KPROBE_FUNC, &opts); - if (!ASSERT_OK_PTR(skel->links.kprobe_run, "attach_kprobe")) + link = bpf_program__attach_kprobe_opts(skel->progs.kprobe_run, KPROBE_FUNC, &opts); + if (!ASSERT_OK_PTR(link, "attach_kprobe")) return; - link_fd = bpf_link__fd(skel->links.kprobe_run); + link_fd = bpf_link__fd(link); if (!invalid) { /* See also arch_adjust_kprobe_addr(). */ if (skel->kconfig->CONFIG_X86_KERNEL_IBT) @@ -157,39 +157,41 @@ static void test_kprobe_fill_link_info(struct test_fill_link_info *skel, } else { kprobe_fill_invalid_user_buffer(link_fd); } - bpf_link__detach(skel->links.kprobe_run); + bpf_link__destroy(link); } static void test_tp_fill_link_info(struct test_fill_link_info *skel) { + struct bpf_link *link; int link_fd, err; - skel->links.tp_run = bpf_program__attach_tracepoint(skel->progs.tp_run, TP_CAT, TP_NAME); - if (!ASSERT_OK_PTR(skel->links.tp_run, "attach_tp")) + link = bpf_program__attach_tracepoint(skel->progs.tp_run, TP_CAT, TP_NAME); + if (!ASSERT_OK_PTR(link, "attach_tp")) return; - link_fd = bpf_link__fd(skel->links.tp_run); + link_fd = bpf_link__fd(link); err = verify_perf_link_info(link_fd, BPF_PERF_EVENT_TRACEPOINT, 0, 0, 0); ASSERT_OK(err, "verify_perf_link_info"); - bpf_link__detach(skel->links.tp_run); + bpf_link__destroy(link); } static void test_uprobe_fill_link_info(struct test_fill_link_info *skel, enum bpf_perf_event_type type) { + struct bpf_link *link; int link_fd, err; - skel->links.uprobe_run = bpf_program__attach_uprobe(skel->progs.uprobe_run, - type == BPF_PERF_EVENT_URETPROBE, - 0, /* self pid */ - UPROBE_FILE, uprobe_offset); - if (!ASSERT_OK_PTR(skel->links.uprobe_run, "attach_uprobe")) + link = bpf_program__attach_uprobe(skel->progs.uprobe_run, + type == BPF_PERF_EVENT_URETPROBE, + 0, /* self pid */ + UPROBE_FILE, uprobe_offset); + if (!ASSERT_OK_PTR(link, "attach_uprobe")) return; - link_fd = bpf_link__fd(skel->links.uprobe_run); + link_fd = bpf_link__fd(link); err = verify_perf_link_info(link_fd, type, 0, uprobe_offset, 0); ASSERT_OK(err, "verify_perf_link_info"); - bpf_link__detach(skel->links.uprobe_run); + bpf_link__destroy(link); } static int verify_kmulti_link_info(int fd, bool retprobe) @@ -278,24 +280,24 @@ static void test_kprobe_multi_fill_link_info(struct test_fill_link_info *skel, bool retprobe, bool invalid) { LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + struct bpf_link *link; int link_fd, err; opts.syms = kmulti_syms; opts.cnt = KMULTI_CNT; opts.retprobe = retprobe; - skel->links.kmulti_run = bpf_program__attach_kprobe_multi_opts(skel->progs.kmulti_run, - NULL, &opts); - if (!ASSERT_OK_PTR(skel->links.kmulti_run, "attach_kprobe_multi")) + link = bpf_program__attach_kprobe_multi_opts(skel->progs.kmulti_run, NULL, &opts); + if (!ASSERT_OK_PTR(link, "attach_kprobe_multi")) return; - link_fd = bpf_link__fd(skel->links.kmulti_run); + link_fd = bpf_link__fd(link); if (!invalid) { err = verify_kmulti_link_info(link_fd, retprobe); ASSERT_OK(err, "verify_kmulti_link_info"); } else { verify_kmulti_invalid_user_buffer(link_fd); } - bpf_link__detach(skel->links.kmulti_run); + bpf_link__destroy(link); } void test_fill_link_info(void) -- cgit v1.2.3 From 147c69307bcf67f1f01246f9acb794da9837f299 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 25 Nov 2023 20:31:29 +0100 Subject: selftests/bpf: Add link_info test for uprobe_multi link Adding fill_link_info test for uprobe_multi link. Setting up uprobes with bogus ref_ctr_offsets and cookie values to test all the bpf_link_info::uprobe_multi fields. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20231125193130.834322-6-jolsa@kernel.org --- .../selftests/bpf/prog_tests/fill_link_info.c | 198 +++++++++++++++++++++ .../selftests/bpf/progs/test_fill_link_info.c | 6 + 2 files changed, 204 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c index 9294cb8d7743..d4b1901f7879 100644 --- a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c +++ b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c @@ -7,6 +7,7 @@ #include #include "trace_helpers.h" #include "test_fill_link_info.skel.h" +#include "bpf/libbpf_internal.h" #define TP_CAT "sched" #define TP_NAME "sched_switch" @@ -300,6 +301,196 @@ static void test_kprobe_multi_fill_link_info(struct test_fill_link_info *skel, bpf_link__destroy(link); } +#define SEC(name) __attribute__((section(name), used)) + +static short uprobe_link_info_sema_1 SEC(".probes"); +static short uprobe_link_info_sema_2 SEC(".probes"); +static short uprobe_link_info_sema_3 SEC(".probes"); + +noinline void uprobe_link_info_func_1(void) +{ + asm volatile (""); + uprobe_link_info_sema_1++; +} + +noinline void uprobe_link_info_func_2(void) +{ + asm volatile (""); + uprobe_link_info_sema_2++; +} + +noinline void uprobe_link_info_func_3(void) +{ + asm volatile (""); + uprobe_link_info_sema_3++; +} + +static int +verify_umulti_link_info(int fd, bool retprobe, __u64 *offsets, + __u64 *cookies, __u64 *ref_ctr_offsets) +{ + char path[PATH_MAX], path_buf[PATH_MAX]; + struct bpf_link_info info; + __u32 len = sizeof(info); + __u64 ref_ctr_offsets_buf[3]; + __u64 offsets_buf[3]; + __u64 cookies_buf[3]; + int i, err, bit; + __u32 count = 0; + + memset(path, 0, sizeof(path)); + err = readlink("/proc/self/exe", path, sizeof(path)); + if (!ASSERT_NEQ(err, -1, "readlink")) + return -1; + + for (bit = 0; bit < 8; bit++) { + memset(&info, 0, sizeof(info)); + info.uprobe_multi.path = ptr_to_u64(path_buf); + info.uprobe_multi.path_size = sizeof(path_buf); + info.uprobe_multi.count = count; + + if (bit & 0x1) + info.uprobe_multi.offsets = ptr_to_u64(offsets_buf); + if (bit & 0x2) + info.uprobe_multi.cookies = ptr_to_u64(cookies_buf); + if (bit & 0x4) + info.uprobe_multi.ref_ctr_offsets = ptr_to_u64(ref_ctr_offsets_buf); + + err = bpf_link_get_info_by_fd(fd, &info, &len); + if (!ASSERT_OK(err, "bpf_link_get_info_by_fd")) + return -1; + + if (!ASSERT_EQ(info.type, BPF_LINK_TYPE_UPROBE_MULTI, "info.type")) + return -1; + + ASSERT_EQ(info.uprobe_multi.pid, getpid(), "info.uprobe_multi.pid"); + ASSERT_EQ(info.uprobe_multi.count, 3, "info.uprobe_multi.count"); + ASSERT_EQ(info.uprobe_multi.flags & BPF_F_KPROBE_MULTI_RETURN, + retprobe, "info.uprobe_multi.flags.retprobe"); + ASSERT_EQ(info.uprobe_multi.path_size, strlen(path) + 1, "info.uprobe_multi.path_size"); + ASSERT_STREQ(path_buf, path, "info.uprobe_multi.path"); + + for (i = 0; i < info.uprobe_multi.count; i++) { + if (info.uprobe_multi.offsets) + ASSERT_EQ(offsets_buf[i], offsets[i], "info.uprobe_multi.offsets"); + if (info.uprobe_multi.cookies) + ASSERT_EQ(cookies_buf[i], cookies[i], "info.uprobe_multi.cookies"); + if (info.uprobe_multi.ref_ctr_offsets) { + ASSERT_EQ(ref_ctr_offsets_buf[i], ref_ctr_offsets[i], + "info.uprobe_multi.ref_ctr_offsets"); + } + } + count = count ?: info.uprobe_multi.count; + } + + return 0; +} + +static void verify_umulti_invalid_user_buffer(int fd) +{ + struct bpf_link_info info; + __u32 len = sizeof(info); + __u64 buf[3]; + int err; + + /* upath_size defined, not path */ + memset(&info, 0, sizeof(info)); + info.uprobe_multi.path_size = 3; + err = bpf_link_get_info_by_fd(fd, &info, &len); + ASSERT_EQ(err, -EINVAL, "failed_upath_size"); + + /* path defined, but small */ + memset(&info, 0, sizeof(info)); + info.uprobe_multi.path = ptr_to_u64(buf); + info.uprobe_multi.path_size = 3; + err = bpf_link_get_info_by_fd(fd, &info, &len); + ASSERT_LT(err, 0, "failed_upath_small"); + + /* path has wrong pointer */ + memset(&info, 0, sizeof(info)); + info.uprobe_multi.path_size = PATH_MAX; + info.uprobe_multi.path = 123; + err = bpf_link_get_info_by_fd(fd, &info, &len); + ASSERT_EQ(err, -EFAULT, "failed_bad_path_ptr"); + + /* count zero, with offsets */ + memset(&info, 0, sizeof(info)); + info.uprobe_multi.offsets = ptr_to_u64(buf); + err = bpf_link_get_info_by_fd(fd, &info, &len); + ASSERT_EQ(err, -EINVAL, "failed_count"); + + /* offsets not big enough */ + memset(&info, 0, sizeof(info)); + info.uprobe_multi.offsets = ptr_to_u64(buf); + info.uprobe_multi.count = 2; + err = bpf_link_get_info_by_fd(fd, &info, &len); + ASSERT_EQ(err, -ENOSPC, "failed_small_count"); + + /* offsets has wrong pointer */ + memset(&info, 0, sizeof(info)); + info.uprobe_multi.offsets = 123; + info.uprobe_multi.count = 3; + err = bpf_link_get_info_by_fd(fd, &info, &len); + ASSERT_EQ(err, -EFAULT, "failed_wrong_offsets"); +} + +static void test_uprobe_multi_fill_link_info(struct test_fill_link_info *skel, + bool retprobe, bool invalid) +{ + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .retprobe = retprobe, + ); + const char *syms[3] = { + "uprobe_link_info_func_1", + "uprobe_link_info_func_2", + "uprobe_link_info_func_3", + }; + __u64 cookies[3] = { + 0xdead, + 0xbeef, + 0xcafe, + }; + const char *sema[3] = { + "uprobe_link_info_sema_1", + "uprobe_link_info_sema_2", + "uprobe_link_info_sema_3", + }; + __u64 *offsets = NULL, *ref_ctr_offsets; + struct bpf_link *link; + int link_fd, err; + + err = elf_resolve_syms_offsets("/proc/self/exe", 3, sema, + (unsigned long **) &ref_ctr_offsets, STT_OBJECT); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_object")) + return; + + err = elf_resolve_syms_offsets("/proc/self/exe", 3, syms, + (unsigned long **) &offsets, STT_FUNC); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_func")) + goto out; + + opts.syms = syms; + opts.cookies = &cookies[0]; + opts.ref_ctr_offsets = (unsigned long *) &ref_ctr_offsets[0]; + opts.cnt = ARRAY_SIZE(syms); + + link = bpf_program__attach_uprobe_multi(skel->progs.umulti_run, 0, + "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto out; + + link_fd = bpf_link__fd(link); + if (invalid) + verify_umulti_invalid_user_buffer(link_fd); + else + verify_umulti_link_info(link_fd, retprobe, offsets, cookies, ref_ctr_offsets); + + bpf_link__destroy(link); +out: + free(ref_ctr_offsets); + free(offsets); +} + void test_fill_link_info(void) { struct test_fill_link_info *skel; @@ -339,6 +530,13 @@ void test_fill_link_info(void) if (test__start_subtest("kprobe_multi_invalid_ubuff")) test_kprobe_multi_fill_link_info(skel, true, true); + if (test__start_subtest("uprobe_multi_link_info")) + test_uprobe_multi_fill_link_info(skel, false, false); + if (test__start_subtest("uretprobe_multi_link_info")) + test_uprobe_multi_fill_link_info(skel, true, false); + if (test__start_subtest("uprobe_multi_invalid")) + test_uprobe_multi_fill_link_info(skel, false, true); + cleanup: test_fill_link_info__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_fill_link_info.c b/tools/testing/selftests/bpf/progs/test_fill_link_info.c index 564f402d56fe..69509f8bb680 100644 --- a/tools/testing/selftests/bpf/progs/test_fill_link_info.c +++ b/tools/testing/selftests/bpf/progs/test_fill_link_info.c @@ -39,4 +39,10 @@ int BPF_PROG(kmulti_run) return 0; } +SEC("uprobe.multi") +int BPF_PROG(umulti_run) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From a7795698f8b6c48283fa4334eb313bc1350b2864 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 25 Nov 2023 20:31:30 +0100 Subject: bpftool: Add support to display uprobe_multi links Adding support to display details for uprobe_multi links, both plain: # bpftool link -p ... 24: uprobe_multi prog 126 uprobe.multi path /home/jolsa/bpf/test_progs func_cnt 3 pid 4143 offset ref_ctr_offset cookies 0xd1f88 0xf5d5a8 0xdead 0xd1f8f 0xf5d5aa 0xbeef 0xd1f96 0xf5d5ac 0xcafe and json: # bpftool link -p [{ ... },{ "id": 24, "type": "uprobe_multi", "prog_id": 126, "retprobe": false, "path": "/home/jolsa/bpf/test_progs", "func_cnt": 3, "pid": 4143, "funcs": [{ "offset": 860040, "ref_ctr_offset": 16111016, "cookie": 57005 },{ "offset": 860047, "ref_ctr_offset": 16111018, "cookie": 48879 },{ "offset": 860054, "ref_ctr_offset": 16111020, "cookie": 51966 } ] } ] Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20231125193130.834322-7-jolsa@kernel.org --- tools/bpf/bpftool/link.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index a1528cde81ab..cb46667a6b2e 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -294,6 +294,37 @@ show_kprobe_multi_json(struct bpf_link_info *info, json_writer_t *wtr) jsonw_end_array(json_wtr); } +static __u64 *u64_to_arr(__u64 val) +{ + return (__u64 *) u64_to_ptr(val); +} + +static void +show_uprobe_multi_json(struct bpf_link_info *info, json_writer_t *wtr) +{ + __u32 i; + + jsonw_bool_field(json_wtr, "retprobe", + info->uprobe_multi.flags & BPF_F_UPROBE_MULTI_RETURN); + jsonw_string_field(json_wtr, "path", (char *) u64_to_ptr(info->uprobe_multi.path)); + jsonw_uint_field(json_wtr, "func_cnt", info->uprobe_multi.count); + jsonw_int_field(json_wtr, "pid", (int) info->uprobe_multi.pid); + jsonw_name(json_wtr, "funcs"); + jsonw_start_array(json_wtr); + + for (i = 0; i < info->uprobe_multi.count; i++) { + jsonw_start_object(json_wtr); + jsonw_uint_field(json_wtr, "offset", + u64_to_arr(info->uprobe_multi.offsets)[i]); + jsonw_uint_field(json_wtr, "ref_ctr_offset", + u64_to_arr(info->uprobe_multi.ref_ctr_offsets)[i]); + jsonw_uint_field(json_wtr, "cookie", + u64_to_arr(info->uprobe_multi.cookies)[i]); + jsonw_end_object(json_wtr); + } + jsonw_end_array(json_wtr); +} + static void show_perf_event_kprobe_json(struct bpf_link_info *info, json_writer_t *wtr) { @@ -465,6 +496,9 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) case BPF_LINK_TYPE_KPROBE_MULTI: show_kprobe_multi_json(info, json_wtr); break; + case BPF_LINK_TYPE_UPROBE_MULTI: + show_uprobe_multi_json(info, json_wtr); + break; case BPF_LINK_TYPE_PERF_EVENT: switch (info->perf_event.type) { case BPF_PERF_EVENT_EVENT: @@ -674,6 +708,33 @@ static void show_kprobe_multi_plain(struct bpf_link_info *info) } } +static void show_uprobe_multi_plain(struct bpf_link_info *info) +{ + __u32 i; + + if (!info->uprobe_multi.count) + return; + + if (info->uprobe_multi.flags & BPF_F_UPROBE_MULTI_RETURN) + printf("\n\turetprobe.multi "); + else + printf("\n\tuprobe.multi "); + + printf("path %s ", (char *) u64_to_ptr(info->uprobe_multi.path)); + printf("func_cnt %u ", info->uprobe_multi.count); + + if (info->uprobe_multi.pid) + printf("pid %d ", info->uprobe_multi.pid); + + printf("\n\t%-16s %-16s %-16s", "offset", "ref_ctr_offset", "cookies"); + for (i = 0; i < info->uprobe_multi.count; i++) { + printf("\n\t0x%-16llx 0x%-16llx 0x%-16llx", + u64_to_arr(info->uprobe_multi.offsets)[i], + u64_to_arr(info->uprobe_multi.ref_ctr_offsets)[i], + u64_to_arr(info->uprobe_multi.cookies)[i]); + } +} + static void show_perf_event_kprobe_plain(struct bpf_link_info *info) { const char *buf; @@ -807,6 +868,9 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) case BPF_LINK_TYPE_KPROBE_MULTI: show_kprobe_multi_plain(info); break; + case BPF_LINK_TYPE_UPROBE_MULTI: + show_uprobe_multi_plain(info); + break; case BPF_LINK_TYPE_PERF_EVENT: switch (info->perf_event.type) { case BPF_PERF_EVENT_EVENT: @@ -846,8 +910,10 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) static int do_show_link(int fd) { + __u64 *ref_ctr_offsets = NULL, *offsets = NULL, *cookies = NULL; struct bpf_link_info info; __u32 len = sizeof(info); + char path_buf[PATH_MAX]; __u64 *addrs = NULL; char buf[PATH_MAX]; int count; @@ -889,6 +955,39 @@ again: goto again; } } + if (info.type == BPF_LINK_TYPE_UPROBE_MULTI && + !info.uprobe_multi.offsets) { + count = info.uprobe_multi.count; + if (count) { + offsets = calloc(count, sizeof(__u64)); + if (!offsets) { + p_err("mem alloc failed"); + close(fd); + return -ENOMEM; + } + info.uprobe_multi.offsets = ptr_to_u64(offsets); + ref_ctr_offsets = calloc(count, sizeof(__u64)); + if (!ref_ctr_offsets) { + p_err("mem alloc failed"); + free(offsets); + close(fd); + return -ENOMEM; + } + info.uprobe_multi.ref_ctr_offsets = ptr_to_u64(ref_ctr_offsets); + cookies = calloc(count, sizeof(__u64)); + if (!cookies) { + p_err("mem alloc failed"); + free(cookies); + free(offsets); + close(fd); + return -ENOMEM; + } + info.uprobe_multi.cookies = ptr_to_u64(cookies); + info.uprobe_multi.path = ptr_to_u64(path_buf); + info.uprobe_multi.path_size = sizeof(path_buf); + goto again; + } + } if (info.type == BPF_LINK_TYPE_PERF_EVENT) { switch (info.perf_event.type) { case BPF_PERF_EVENT_TRACEPOINT: @@ -924,8 +1023,10 @@ again: else show_link_close_plain(fd, &info); - if (addrs) - free(addrs); + free(ref_ctr_offsets); + free(cookies); + free(offsets); + free(addrs); close(fd); return 0; } -- cgit v1.2.3 From 2ce344b6891618063c0bebe22d9cdf9c086e0aa8 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Sat, 25 Nov 2023 17:42:50 +0900 Subject: selftests/bpf: Choose pkg-config for the target pkg-config is used to build sign-file executable. It should use the library for the target instead of the host as it is called during tests. Signed-off-by: Akihiko Odaki Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231125084253.85025-2-akihiko.odaki@daynix.com --- tools/testing/selftests/bpf/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9c27b67bc7b1..94825ef813d5 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -18,7 +18,7 @@ else GENDIR := $(abspath ../../../../include/generated) endif GENHDR := $(GENDIR)/autoconf.h -HOSTPKG_CONFIG := pkg-config +PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config ifneq ($(wildcard $(GENHDR)),) GENFLAGS := -DHAVE_GENHDR @@ -219,9 +219,9 @@ $(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_r $(OUTPUT)/sign-file: ../../../../scripts/sign-file.c $(call msg,SIGN-FILE,,$@) - $(Q)$(CC) $(shell $(HOSTPKG_CONFIG) --cflags libcrypto 2> /dev/null) \ + $(Q)$(CC) $(shell $(PKG_CONFIG) --cflags libcrypto 2> /dev/null) \ $< -o $@ \ - $(shell $(HOSTPKG_CONFIG) --libs libcrypto 2> /dev/null || echo -lcrypto) + $(shell $(PKG_CONFIG) --libs libcrypto 2> /dev/null || echo -lcrypto) $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch]) $(call msg,MOD,,$@) -- cgit v1.2.3 From 18f6f9de98d1d0f7040e6e6c39fce8939f55520f Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Sat, 25 Nov 2023 17:42:51 +0900 Subject: selftests/bpf: Override PKG_CONFIG for static builds A library may need to depend on additional archive files for static builds so pkg-config should be instructed to list them. Signed-off-by: Akihiko Odaki Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231125084253.85025-3-akihiko.odaki@daynix.com --- tools/testing/selftests/bpf/README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index cb9b95702ac6..9af79c7a9b58 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -77,7 +77,7 @@ In case of linker errors when running selftests, try using static linking: .. code-block:: console - $ LDLIBS=-static vmtest.sh + $ LDLIBS=-static PKG_CONFIG='pkg-config --static' vmtest.sh .. note:: Some distros may not support static linking. -- cgit v1.2.3 From 8998a479fd96b0b209dcb2feb468eba7eddb4ddb Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Sat, 25 Nov 2023 17:42:52 +0900 Subject: selftests/bpf: Use pkg-config for libelf When linking statically, libraries may require other dependencies to be included to ld flags. In particular, libelf may require libzstd. Use pkg-config to determine such dependencies. Signed-off-by: Akihiko Odaki Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231125084253.85025-4-akihiko.odaki@daynix.com --- tools/testing/selftests/bpf/Makefile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 94825ef813d5..617ae55c3bb5 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -29,13 +29,17 @@ SAN_CFLAGS ?= SAN_LDFLAGS ?= $(SAN_CFLAGS) RELEASE ?= OPT_FLAGS ?= $(if $(RELEASE),-O2,-O0) + +LIBELF_CFLAGS := $(shell $(PKG_CONFIG) libelf --cflags 2>/dev/null) +LIBELF_LIBS := $(shell $(PKG_CONFIG) libelf --libs 2>/dev/null || echo -lelf) + CFLAGS += -g $(OPT_FLAGS) -rdynamic \ -Wall -Werror \ - $(GENFLAGS) $(SAN_CFLAGS) \ + $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT) LDFLAGS += $(SAN_LDFLAGS) -LDLIBS += -lelf -lz -lrt -lpthread +LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread ifneq ($(LLVM),) # Silence some warnings when compiled with clang -- cgit v1.2.3 From 341ac980eab90ac1f6c22ee9f9da83ed9604d899 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:07 -0800 Subject: xsk: Support tx_metadata_len For zerocopy mode, tx_desc->addr can point to an arbitrary offset and carry some TX metadata in the headroom. For copy mode, there is no way currently to populate skb metadata. Introduce new tx_metadata_len umem config option that indicates how many bytes to treat as metadata. Metadata bytes come prior to tx_desc address (same as in RX case). The size of the metadata has mostly the same constraints as XDP: - less than 256 bytes - 8-byte aligned (compared to 4-byte alignment on xdp, due to 8-byte timestamp in the completion) - non-zero This data is not interpreted in any way right now. Reviewed-by: Song Yoong Siang Signed-off-by: Stanislav Fomichev Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20231127190319.1190813-2-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 1 + include/net/xsk_buff_pool.h | 1 + include/uapi/linux/if_xdp.h | 1 + net/xdp/xdp_umem.c | 4 ++++ net/xdp/xsk.c | 12 +++++++++++- net/xdp/xsk_buff_pool.c | 1 + net/xdp/xsk_queue.h | 17 ++++++++++------- tools/include/uapi/linux/if_xdp.h | 1 + 8 files changed, 30 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index f83128007fb0..bcf765124f72 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -30,6 +30,7 @@ struct xdp_umem { struct user_struct *user; refcount_t users; u8 flags; + u8 tx_metadata_len; bool zc; struct page **pgs; int id; diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index b0bdff26fc88..1985ffaf9b0c 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -77,6 +77,7 @@ struct xsk_buff_pool { u32 chunk_size; u32 chunk_shift; u32 frame_len; + u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; bool dma_need_sync; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 8d48863472b9..2ecf79282c26 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -76,6 +76,7 @@ struct xdp_umem_reg { __u32 chunk_size; __u32 headroom; __u32 flags; + __u32 tx_metadata_len; }; struct xdp_statistics { diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 06cead2b8e34..946a687fb8e8 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -199,6 +199,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; + if (mr->tx_metadata_len >= 256 || mr->tx_metadata_len % 8) + return -EINVAL; + umem->size = size; umem->headroom = headroom; umem->chunk_size = chunk_size; @@ -207,6 +210,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->pgs = NULL; umem->user = NULL; umem->flags = mr->flags; + umem->tx_metadata_len = mr->tx_metadata_len; INIT_LIST_HEAD(&umem->xsk_dma_list); refcount_set(&umem->users, 1); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index ae9f8cb611f6..c904356e2800 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -1283,6 +1283,14 @@ struct xdp_umem_reg_v1 { __u32 headroom; }; +struct xdp_umem_reg_v2 { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 chunk_size; + __u32 headroom; + __u32 flags; +}; + static int xsk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -1326,8 +1334,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(struct xdp_umem_reg_v1)) return -EINVAL; - else if (optlen < sizeof(mr)) + else if (optlen < sizeof(struct xdp_umem_reg_v2)) mr_size = sizeof(struct xdp_umem_reg_v1); + else if (optlen < sizeof(mr)) + mr_size = sizeof(struct xdp_umem_reg_v2); if (copy_from_sockptr(&mr, optval, mr_size)) return -EFAULT; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 49cb9f9a09be..386eddcdf837 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -85,6 +85,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, XDP_PACKET_HEADROOM; pool->umem = umem; pool->addrs = umem->addrs; + pool->tx_metadata_len = umem->tx_metadata_len; INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 13354a1e4280..c74a1372bcb9 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -143,15 +143,17 @@ static inline bool xp_unused_options_set(u32 options) static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 offset = desc->addr & (pool->chunk_size - 1); + u64 addr = desc->addr - pool->tx_metadata_len; + u64 len = desc->len + pool->tx_metadata_len; + u64 offset = addr & (pool->chunk_size - 1); if (!desc->len) return false; - if (offset + desc->len > pool->chunk_size) + if (offset + len > pool->chunk_size) return false; - if (desc->addr >= pool->addrs_cnt) + if (addr >= pool->addrs_cnt) return false; if (xp_unused_options_set(desc->options)) @@ -162,16 +164,17 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 addr = xp_unaligned_add_offset_to_addr(desc->addr); + u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len; + u64 len = desc->len + pool->tx_metadata_len; if (!desc->len) return false; - if (desc->len > pool->chunk_size) + if (len > pool->chunk_size) return false; - if (addr >= pool->addrs_cnt || addr + desc->len > pool->addrs_cnt || - xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, len)) return false; if (xp_unused_options_set(desc->options)) diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index 73a47da885dc..34411a2e5b6c 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -76,6 +76,7 @@ struct xdp_umem_reg { __u32 chunk_size; __u32 headroom; __u32 flags; + __u32 tx_metadata_len; }; struct xdp_statistics { -- cgit v1.2.3 From 48eb03dd26304c24f03bdbb9382e89c8564e71df Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:08 -0800 Subject: xsk: Add TX timestamp and TX checksum offload support This change actually defines the (initial) metadata layout that should be used by AF_XDP userspace (xsk_tx_metadata). The first field is flags which requests appropriate offloads, followed by the offload-specific fields. The supported per-device offloads are exported via netlink (new xsk-flags). The offloads themselves are still implemented in a bit of a framework-y fashion that's left from my initial kfunc attempt. I'm introducing new xsk_tx_metadata_ops which drivers are supposed to implement. The drivers are also supposed to call xsk_tx_metadata_request/xsk_tx_metadata_complete in the right places. Since xsk_tx_metadata_{request,_complete} are static inline, we don't incur any extra overhead doing indirect calls. The benefit of this scheme is as follows: - keeps all metadata layout parsing away from driver code - makes it easy to grep and see which drivers implement what - don't need any extra flags to maintain to keep track of what offloads are implemented; if the callback is implemented - the offload is supported (used by netlink reporting code) Two offloads are defined right now: 1. XDP_TXMD_FLAGS_CHECKSUM: skb-style csum_start+csum_offset 2. XDP_TXMD_FLAGS_TIMESTAMP: writes TX timestamp back into metadata area upon completion (tx_timestamp field) XDP_TXMD_FLAGS_TIMESTAMP is also implemented for XDP_COPY mode: it writes SW timestamp from the skb destructor (note I'm reusing hwtstamps to pass metadata pointer). The struct is forward-compatible and can be extended in the future by appending more fields. Reviewed-by: Song Yoong Siang Signed-off-by: Stanislav Fomichev Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20231127190319.1190813-3-sdf@google.com Signed-off-by: Alexei Starovoitov --- Documentation/netlink/specs/netdev.yaml | 19 +++++- include/linux/netdevice.h | 2 + include/linux/skbuff.h | 14 +++- include/net/xdp_sock.h | 110 ++++++++++++++++++++++++++++++++ include/net/xdp_sock_drv.h | 13 ++++ include/net/xsk_buff_pool.h | 6 ++ include/uapi/linux/if_xdp.h | 38 +++++++++++ include/uapi/linux/netdev.h | 16 +++++ net/core/netdev-genl.c | 13 +++- net/xdp/xsk.c | 34 ++++++++++ net/xdp/xsk_queue.h | 2 +- tools/include/uapi/linux/if_xdp.h | 52 +++++++++++++-- tools/include/uapi/linux/netdev.h | 16 +++++ tools/net/ynl/generated/netdev-user.c | 19 ++++++ tools/net/ynl/generated/netdev-user.h | 3 + 15 files changed, 348 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 14511b13f305..00439bcbd2e3 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -45,7 +45,6 @@ definitions: - type: flags name: xdp-rx-metadata - render-max: true entries: - name: timestamp @@ -55,6 +54,18 @@ definitions: name: hash doc: Device is capable of exposing receive packet hash via bpf_xdp_metadata_rx_hash(). + - + type: flags + name: xsk-flags + entries: + - + name: tx-timestamp + doc: + HW timestamping egress packets is supported by the driver. + - + name: tx-checksum + doc: + L3 checksum HW offload is supported by the driver. attribute-sets: - @@ -86,6 +97,11 @@ attribute-sets: See Documentation/networking/xdp-rx-metadata.rst for more details. type: u64 enum: xdp-rx-metadata + - + name: xsk-features + doc: Bitmask of enabled AF_XDP features. + type: u64 + enum: xsk-flags operations: list: @@ -103,6 +119,7 @@ operations: - xdp-features - xdp-zc-max-segs - xdp-rx-metadata-features + - xsk-features dump: reply: *dev-all - diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e87caa81f70c..08da8b28c816 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1865,6 +1865,7 @@ enum netdev_stat_type { * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions * @xdp_metadata_ops: Includes pointers to XDP metadata callbacks. + * @xsk_tx_metadata_ops: Includes pointers to AF_XDP TX metadata callbacks. * @ethtool_ops: Management operations * @l3mdev_ops: Layer 3 master device operations * @ndisc_ops: Includes callbacks for different IPv6 neighbour @@ -2128,6 +2129,7 @@ struct net_device { unsigned long long priv_flags; const struct net_device_ops *netdev_ops; const struct xdp_metadata_ops *xdp_metadata_ops; + const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops; int ifindex; unsigned short gflags; unsigned short hard_header_len; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 27998f73183e..b370eb8d70f7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -566,6 +566,15 @@ struct ubuf_info_msgzc { int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); +/* Preserve some data across TX submission and completion. + * + * Note, this state is stored in the driver. Extending the layout + * might need some special care. + */ +struct xsk_tx_metadata_compl { + __u64 *tx_timestamp; +}; + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -578,7 +587,10 @@ struct skb_shared_info { /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; struct sk_buff *frag_list; - struct skb_shared_hwtstamps hwtstamps; + union { + struct skb_shared_hwtstamps hwtstamps; + struct xsk_tx_metadata_compl xsk_meta; + }; unsigned int gso_type; u32 tskey; diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index bcf765124f72..3cb4dc9bd70e 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -93,12 +93,105 @@ struct xdp_sock { struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */ }; +/* + * AF_XDP TX metadata hooks for network devices. + * The following hooks can be defined; unless noted otherwise, they are + * optional and can be filled with a null pointer. + * + * void (*tmo_request_timestamp)(void *priv) + * Called when AF_XDP frame requested egress timestamp. + * + * u64 (*tmo_fill_timestamp)(void *priv) + * Called when AF_XDP frame, that had requested egress timestamp, + * received a completion. The hook needs to return the actual HW timestamp. + * + * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv) + * Called when AF_XDP frame requested HW checksum offload. csum_start + * indicates position where checksumming should start. + * csum_offset indicates position where checksum should be stored. + * + */ +struct xsk_tx_metadata_ops { + void (*tmo_request_timestamp)(void *priv); + u64 (*tmo_fill_timestamp)(void *priv); + void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv); +}; + #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); +/** + * xsk_tx_metadata_to_compl - Save enough relevant metadata information + * to perform tx completion in the future. + * @meta: pointer to AF_XDP metadata area + * @compl: pointer to output struct xsk_tx_metadata_to_compl + * + * This function should be called by the networking device when + * it prepares AF_XDP egress packet. The value of @compl should be stored + * and passed to xsk_tx_metadata_complete upon TX completion. + */ +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) +{ + if (!meta) + return; + + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + compl->tx_timestamp = &meta->completion.tx_timestamp; + else + compl->tx_timestamp = NULL; +} + +/** + * xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission + * and call appropriate xsk_tx_metadata_ops operation. + * @meta: pointer to AF_XDP metadata area + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private aread + * + * This function should be called by the networking device when + * it prepares AF_XDP egress packet. + */ +static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ + if (!meta) + return; + + if (ops->tmo_request_timestamp) + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + ops->tmo_request_timestamp(priv); + + if (ops->tmo_request_checksum) + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) + ops->tmo_request_checksum(meta->request.csum_start, + meta->request.csum_offset, priv); +} + +/** + * xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion + * and call appropriate xsk_tx_metadata_ops operation. + * @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private aread + * + * This function should be called by the networking device upon + * AF_XDP egress completion. + */ +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ + if (!compl) + return; + + *compl->tx_timestamp = ops->tmo_fill_timestamp(priv); +} + #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) @@ -115,6 +208,23 @@ static inline void __xsk_map_flush(void) { } +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) +{ +} + +static inline void xsk_tx_metadata_request(struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} + +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} + #endif /* CONFIG_XDP_SOCKETS */ #if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 1f6fc8c7a84c..e2558ac3e195 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -165,6 +165,14 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return xp_raw_get_data(pool, addr); } +static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + if (!pool->tx_metadata_len) + return NULL; + + return xp_raw_get_data(pool, addr) - pool->tx_metadata_len; +} + static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); @@ -324,6 +332,11 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return NULL; } +static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + return NULL; +} + static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 1985ffaf9b0c..97f5cc10d79e 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -33,6 +33,7 @@ struct xdp_buff_xsk { }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) +#define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t)) struct xsk_dma_map { dma_addr_t *dma_pages; @@ -234,4 +235,9 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb) return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } +static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) +{ + return pool->tx_metadata_len > 0; +} + #endif /* XSK_BUFF_POOL_H_ */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 2ecf79282c26..95de66d5a26c 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -106,6 +106,41 @@ struct xdp_options { #define XSK_UNALIGNED_BUF_ADDR_MASK \ ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1) +/* Request transmit timestamp. Upon completion, put it into tx_timestamp + * field of struct xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_TIMESTAMP (1 << 0) + +/* Request transmit checksum offload. Checksum start position and offset + * are communicated via csum_start and csum_offset fields of struct + * xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1) + +/* AF_XDP offloads request. 'request' union member is consumed by the driver + * when the packet is being transmitted. 'completion' union member is + * filled by the driver when the transmit completion arrives. + */ +struct xsk_tx_metadata { + __u64 flags; + + union { + struct { + /* XDP_TXMD_FLAGS_CHECKSUM */ + + /* Offset from desc->addr where checksumming should start. */ + __u16 csum_start; + /* Offset from csum_start where checksum should be stored. */ + __u16 csum_offset; + } request; + + struct { + /* XDP_TXMD_FLAGS_TIMESTAMP */ + __u64 tx_timestamp; + } completion; + }; +}; + /* Rx/Tx descriptor */ struct xdp_desc { __u64 addr; @@ -122,4 +157,7 @@ struct xdp_desc { */ #define XDP_PKT_CONTD (1 << 0) +/* TX packet carries valid metadata. */ +#define XDP_TX_METADATA (1 << 1) + #endif /* _LINUX_IF_XDP_H */ diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 2943a151d4f1..48d5477a668c 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -53,12 +53,28 @@ enum netdev_xdp_rx_metadata { NETDEV_XDP_RX_METADATA_MASK = 3, }; +/** + * enum netdev_xsk_flags + * @NETDEV_XSK_FLAGS_TX_TIMESTAMP: HW timestamping egress packets is supported + * by the driver. + * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the + * driver. + */ +enum netdev_xsk_flags { + NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1, + NETDEV_XSK_FLAGS_TX_CHECKSUM = 2, + + /* private: */ + NETDEV_XSK_FLAGS_MASK = 3, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, + NETDEV_A_DEV_XSK_FEATURES, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index fe61f85bcf33..10f2124e9e23 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "netdev-genl-gen.h" @@ -13,6 +14,7 @@ static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { + u64 xsk_features = 0; u64 xdp_rx_meta = 0; void *hdr; @@ -26,11 +28,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC + if (netdev->xsk_tx_metadata_ops) { + if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) + xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; + if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) + xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; + } + if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, netdev->xdp_features, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, - xdp_rx_meta, NETDEV_A_DEV_PAD)) { + xdp_rx_meta, NETDEV_A_DEV_PAD) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, + xsk_features, NETDEV_A_DEV_PAD)) { genlmsg_cancel(rsp, hdr); return -EINVAL; } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c904356e2800..e83ade32f1fd 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -571,6 +571,13 @@ static u32 xsk_get_num_desc(struct sk_buff *skb) static void xsk_destruct_skb(struct sk_buff *skb) { + struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; + + if (compl->tx_timestamp) { + /* sw completion timestamp, not a real one */ + *compl->tx_timestamp = ktime_get_tai_fast_ns(); + } + xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); sock_wfree(skb); } @@ -655,8 +662,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { + struct xsk_tx_metadata *meta = NULL; struct net_device *dev = xs->dev; struct sk_buff *skb = xs->skb; + bool first_frag = false; int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { @@ -687,6 +696,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, kfree_skb(skb); goto free_err; } + + first_frag = true; } else { int nr_frags = skb_shinfo(skb)->nr_frags; struct page *page; @@ -709,12 +720,35 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb_add_rx_frag(skb, nr_frags, page, 0, len, 0); } + + if (first_frag && desc->options & XDP_TX_METADATA) { + if (unlikely(xs->pool->tx_metadata_len == 0)) { + err = -EINVAL; + goto free_err; + } + + meta = buffer - xs->pool->tx_metadata_len; + + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { + if (unlikely(meta->request.csum_start + + meta->request.csum_offset + + sizeof(__sum16) > len)) { + err = -EINVAL; + goto free_err; + } + + skb->csum_start = hr + meta->request.csum_start; + skb->csum_offset = meta->request.csum_offset; + skb->ip_summed = CHECKSUM_PARTIAL; + } + } } skb->dev = dev; skb->priority = READ_ONCE(xs->sk.sk_priority); skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; + xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); xsk_set_destructor_arg(skb); return skb; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index c74a1372bcb9..6f2d1621c992 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -137,7 +137,7 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) static inline bool xp_unused_options_set(u32 options) { - return options & ~XDP_PKT_CONTD; + return options & ~(XDP_PKT_CONTD | XDP_TX_METADATA); } static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index 34411a2e5b6c..d0882edc1642 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -26,11 +26,11 @@ */ #define XDP_USE_NEED_WAKEUP (1 << 3) /* By setting this option, userspace application indicates that it can - * handle multiple descriptors per packet thus enabling xsk core to split + * handle multiple descriptors per packet thus enabling AF_XDP to split * multi-buffer XDP frames into multiple Rx descriptors. Without this set - * such frames will be dropped by xsk. + * such frames will be dropped. */ -#define XDP_USE_SG (1 << 4) +#define XDP_USE_SG (1 << 4) /* Flags for xsk_umem_config flags */ #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) @@ -106,6 +106,41 @@ struct xdp_options { #define XSK_UNALIGNED_BUF_ADDR_MASK \ ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1) +/* Request transmit timestamp. Upon completion, put it into tx_timestamp + * field of union xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_TIMESTAMP (1 << 0) + +/* Request transmit checksum offload. Checksum start position and offset + * are communicated via csum_start and csum_offset fields of union + * xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1) + +/* AF_XDP offloads request. 'request' union member is consumed by the driver + * when the packet is being transmitted. 'completion' union member is + * filled by the driver when the transmit completion arrives. + */ +struct xsk_tx_metadata { + __u64 flags; + + union { + struct { + /* XDP_TXMD_FLAGS_CHECKSUM */ + + /* Offset from desc->addr where checksumming should start. */ + __u16 csum_start; + /* Offset from csum_start where checksum should be stored. */ + __u16 csum_offset; + } request; + + struct { + /* XDP_TXMD_FLAGS_TIMESTAMP */ + __u64 tx_timestamp; + } completion; + }; +}; + /* Rx/Tx descriptor */ struct xdp_desc { __u64 addr; @@ -113,9 +148,16 @@ struct xdp_desc { __u32 options; }; -/* Flag indicating packet constitutes of multiple buffers*/ +/* UMEM descriptor is __u64 */ + +/* Flag indicating that the packet continues with the buffer pointed out by the + * next frame in the ring. The end of the packet is signalled by setting this + * bit to zero. For single buffer packets, every descriptor has 'options' set + * to 0 and this maintains backward compatibility. + */ #define XDP_PKT_CONTD (1 << 0) -/* UMEM descriptor is __u64 */ +/* TX packet carries valid metadata. */ +#define XDP_TX_METADATA (1 << 1) #endif /* _LINUX_IF_XDP_H */ diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 2943a151d4f1..48d5477a668c 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -53,12 +53,28 @@ enum netdev_xdp_rx_metadata { NETDEV_XDP_RX_METADATA_MASK = 3, }; +/** + * enum netdev_xsk_flags + * @NETDEV_XSK_FLAGS_TX_TIMESTAMP: HW timestamping egress packets is supported + * by the driver. + * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the + * driver. + */ +enum netdev_xsk_flags { + NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1, + NETDEV_XSK_FLAGS_TX_CHECKSUM = 2, + + /* private: */ + NETDEV_XSK_FLAGS_MASK = 3, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, + NETDEV_A_DEV_XSK_FEATURES, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) diff --git a/tools/net/ynl/generated/netdev-user.c b/tools/net/ynl/generated/netdev-user.c index b5ffe8cd1144..6283d87dad37 100644 --- a/tools/net/ynl/generated/netdev-user.c +++ b/tools/net/ynl/generated/netdev-user.c @@ -58,6 +58,19 @@ const char *netdev_xdp_rx_metadata_str(enum netdev_xdp_rx_metadata value) return netdev_xdp_rx_metadata_strmap[value]; } +static const char * const netdev_xsk_flags_strmap[] = { + [0] = "tx-timestamp", + [1] = "tx-checksum", +}; + +const char *netdev_xsk_flags_str(enum netdev_xsk_flags value) +{ + value = ffs(value) - 1; + if (value < 0 || value >= (int)MNL_ARRAY_SIZE(netdev_xsk_flags_strmap)) + return NULL; + return netdev_xsk_flags_strmap[value]; +} + /* Policies */ struct ynl_policy_attr netdev_dev_policy[NETDEV_A_DEV_MAX + 1] = { [NETDEV_A_DEV_IFINDEX] = { .name = "ifindex", .type = YNL_PT_U32, }, @@ -65,6 +78,7 @@ struct ynl_policy_attr netdev_dev_policy[NETDEV_A_DEV_MAX + 1] = { [NETDEV_A_DEV_XDP_FEATURES] = { .name = "xdp-features", .type = YNL_PT_U64, }, [NETDEV_A_DEV_XDP_ZC_MAX_SEGS] = { .name = "xdp-zc-max-segs", .type = YNL_PT_U32, }, [NETDEV_A_DEV_XDP_RX_METADATA_FEATURES] = { .name = "xdp-rx-metadata-features", .type = YNL_PT_U64, }, + [NETDEV_A_DEV_XSK_FEATURES] = { .name = "xsk-features", .type = YNL_PT_U64, }, }; struct ynl_policy_nest netdev_dev_nest = { @@ -116,6 +130,11 @@ int netdev_dev_get_rsp_parse(const struct nlmsghdr *nlh, void *data) return MNL_CB_ERROR; dst->_present.xdp_rx_metadata_features = 1; dst->xdp_rx_metadata_features = mnl_attr_get_u64(attr); + } else if (type == NETDEV_A_DEV_XSK_FEATURES) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.xsk_features = 1; + dst->xsk_features = mnl_attr_get_u64(attr); } } diff --git a/tools/net/ynl/generated/netdev-user.h b/tools/net/ynl/generated/netdev-user.h index 4fafac879df3..39af1908444b 100644 --- a/tools/net/ynl/generated/netdev-user.h +++ b/tools/net/ynl/generated/netdev-user.h @@ -19,6 +19,7 @@ extern const struct ynl_family ynl_netdev_family; const char *netdev_op_str(int op); const char *netdev_xdp_act_str(enum netdev_xdp_act value); const char *netdev_xdp_rx_metadata_str(enum netdev_xdp_rx_metadata value); +const char *netdev_xsk_flags_str(enum netdev_xsk_flags value); /* Common nested types */ /* ============== NETDEV_CMD_DEV_GET ============== */ @@ -50,12 +51,14 @@ struct netdev_dev_get_rsp { __u32 xdp_features:1; __u32 xdp_zc_max_segs:1; __u32 xdp_rx_metadata_features:1; + __u32 xsk_features:1; } _present; __u32 ifindex; __u64 xdp_features; __u32 xdp_zc_max_segs; __u64 xdp_rx_metadata_features; + __u64 xsk_features; }; void netdev_dev_get_rsp_free(struct netdev_dev_get_rsp *rsp); -- cgit v1.2.3 From 9276009d35d3f65e083b95d30a4f967baaae0fc6 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:09 -0800 Subject: tools: ynl: Print xsk-features from the sample In a similar fashion we do for the other bit masks. Fix mask parsing (>= vs >) while we are it. Signed-off-by: Stanislav Fomichev Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20231127190319.1190813-4-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/net/ynl/samples/netdev.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/samples/netdev.c b/tools/net/ynl/samples/netdev.c index b828225daad0..591b90e21890 100644 --- a/tools/net/ynl/samples/netdev.c +++ b/tools/net/ynl/samples/netdev.c @@ -33,17 +33,23 @@ static void netdev_print_device(struct netdev_dev_get_rsp *d, unsigned int op) return; printf("xdp-features (%llx):", d->xdp_features); - for (int i = 0; d->xdp_features > 1U << i; i++) { + for (int i = 0; d->xdp_features >= 1U << i; i++) { if (d->xdp_features & (1U << i)) printf(" %s", netdev_xdp_act_str(1 << i)); } printf(" xdp-rx-metadata-features (%llx):", d->xdp_rx_metadata_features); - for (int i = 0; d->xdp_rx_metadata_features > 1U << i; i++) { + for (int i = 0; d->xdp_rx_metadata_features >= 1U << i; i++) { if (d->xdp_rx_metadata_features & (1U << i)) printf(" %s", netdev_xdp_rx_metadata_str(1 << i)); } + printf(" xsk-features (%llx):", d->xsk_features); + for (int i = 0; d->xsk_features >= 1U << i; i++) { + if (d->xsk_features & (1U << i)) + printf(" %s", netdev_xsk_flags_str(1 << i)); + } + printf(" xdp-zc-max-segs=%u", d->xdp_zc_max_segs); name = netdev_op_str(op); -- cgit v1.2.3 From 11614723af26e7c32fcb704d8f30fdf60c1122dc Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:14 -0800 Subject: xsk: Add option to calculate TX checksum in SW For XDP_COPY mode, add a UMEM option XDP_UMEM_TX_SW_CSUM to call skb_checksum_help in transmit path. Might be useful to debugging issues with real hardware. I also use this mode in the selftests. Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20231127190319.1190813-9-sdf@google.com Signed-off-by: Alexei Starovoitov --- Documentation/networking/xsk-tx-metadata.rst | 9 +++++++++ include/net/xsk_buff_pool.h | 1 + include/uapi/linux/if_xdp.h | 8 +++++++- net/xdp/xdp_umem.c | 7 ++++++- net/xdp/xsk.c | 6 ++++++ net/xdp/xsk_buff_pool.c | 1 + tools/include/uapi/linux/if_xdp.h | 8 +++++++- 7 files changed, 37 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/Documentation/networking/xsk-tx-metadata.rst b/Documentation/networking/xsk-tx-metadata.rst index 4f376560b23f..97ecfa480d00 100644 --- a/Documentation/networking/xsk-tx-metadata.rst +++ b/Documentation/networking/xsk-tx-metadata.rst @@ -50,6 +50,15 @@ packet's ``struct xdp_desc`` descriptor should set ``XDP_TX_METADATA`` bit in the ``options`` field. Also note that in a multi-buffer packet only the first chunk should carry the metadata. +Software TX Checksum +==================== + +For development and testing purposes its possible to pass +``XDP_UMEM_TX_SW_CSUM`` flag to ``XDP_UMEM_REG`` UMEM registration call. +In this case, when running in ``XDK_COPY`` mode, the TX checksum +is calculated on the CPU. Do not enable this option in production because +it will negatively affect performance. + Querying Device Capabilities ============================ diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 97f5cc10d79e..8d48d37ab7c0 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -83,6 +83,7 @@ struct xsk_buff_pool { bool uses_need_wakeup; bool dma_need_sync; bool unaligned; + bool tx_sw_csum; void *addrs; /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect: * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 95de66d5a26c..d31698410410 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -33,7 +33,13 @@ #define XDP_USE_SG (1 << 4) /* Flags for xsk_umem_config flags */ -#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) +#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) + +/* Force checksum calculation in software. Can be used for testing or + * working around potential HW issues. This option causes performance + * degradation and only works in XDP_COPY mode. + */ +#define XDP_UMEM_TX_SW_CSUM (1 << 1) struct sockaddr_xdp { __u16 sxdp_family; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 946a687fb8e8..caa340134b0e 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -148,6 +148,11 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) return 0; } +#define XDP_UMEM_FLAGS_VALID ( \ + XDP_UMEM_UNALIGNED_CHUNK_FLAG | \ + XDP_UMEM_TX_SW_CSUM | \ + 0) + static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; @@ -167,7 +172,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; } - if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG) + if (mr->flags & ~XDP_UMEM_FLAGS_VALID) return -EINVAL; if (!unaligned_chunks && !is_power_of_2(chunk_size)) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index d66ba9d6154f..281d49b4fca4 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -744,6 +744,12 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb->csum_start = hr + meta->request.csum_start; skb->csum_offset = meta->request.csum_offset; skb->ip_summed = CHECKSUM_PARTIAL; + + if (unlikely(xs->pool->tx_sw_csum)) { + err = skb_checksum_help(skb); + if (err) + goto free_err; + } } } } diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 386eddcdf837..4f6f538a5462 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -86,6 +86,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->umem = umem; pool->addrs = umem->addrs; pool->tx_metadata_len = umem->tx_metadata_len; + pool->tx_sw_csum = umem->flags & XDP_UMEM_TX_SW_CSUM; INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index d0882edc1642..638c606dfa74 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -33,7 +33,13 @@ #define XDP_USE_SG (1 << 4) /* Flags for xsk_umem_config flags */ -#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) +#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) + +/* Force checksum calculation in software. Can be used for testing or + * working around potential HW issues. This option causes performance + * degradation and only works in XDP_COPY mode. + */ +#define XDP_UMEM_TX_SW_CSUM (1 << 1) struct sockaddr_xdp { __u16 sxdp_family; -- cgit v1.2.3 From df3ed0003ec4994ce8ed4818c435c481281df89e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:15 -0800 Subject: selftests/xsk: Support tx_metadata_len Add new config field and propagate to UMEM registration setsockopt. Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20231127190319.1190813-10-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/xsk.c | 3 +++ tools/testing/selftests/bpf/xsk.h | 1 + 2 files changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c index e574711eeb84..25d568abf0f2 100644 --- a/tools/testing/selftests/bpf/xsk.c +++ b/tools/testing/selftests/bpf/xsk.c @@ -115,6 +115,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg, cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; cfg->flags = XSK_UMEM__DEFAULT_FLAGS; + cfg->tx_metadata_len = 0; return; } @@ -123,6 +124,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg, cfg->frame_size = usr_cfg->frame_size; cfg->frame_headroom = usr_cfg->frame_headroom; cfg->flags = usr_cfg->flags; + cfg->tx_metadata_len = usr_cfg->tx_metadata_len; } static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg, @@ -252,6 +254,7 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, mr.chunk_size = umem->config.frame_size; mr.headroom = umem->config.frame_headroom; mr.flags = umem->config.flags; + mr.tx_metadata_len = umem->config.tx_metadata_len; err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)); if (err) { diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h index 771570bc3731..93c2cc413cfc 100644 --- a/tools/testing/selftests/bpf/xsk.h +++ b/tools/testing/selftests/bpf/xsk.h @@ -200,6 +200,7 @@ struct xsk_umem_config { __u32 frame_size; __u32 frame_headroom; __u32 flags; + __u32 tx_metadata_len; }; int xsk_attach_xdp_program(struct bpf_program *prog, int ifindex, u32 xdp_flags); -- cgit v1.2.3 From f6642de0c3e94d3ef6f44e127d11fcf4138873f7 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:16 -0800 Subject: selftests/bpf: Add csum helpers Checksum helpers will be used to calculate pseudo-header checksum in AF_XDP metadata selftests. The helpers are mirroring existing kernel ones: - csum_tcpudp_magic : IPv4 pseudo header csum - csum_ipv6_magic : IPv6 pseudo header csum - csum_fold : fold csum and do one's complement Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20231127190319.1190813-11-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/network_helpers.h | 43 +++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 34f1200a781b..94b9be24e39b 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -71,4 +71,47 @@ struct nstoken; */ struct nstoken *open_netns(const char *name); void close_netns(struct nstoken *token); + +static __u16 csum_fold(__u32 csum) +{ + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + + return (__u16)~csum; +} + +static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, + __u32 len, __u8 proto, + __wsum csum) +{ + __u64 s = csum; + + s += (__u32)saddr; + s += (__u32)daddr; + s += htons(proto + len); + s = (s & 0xffffffff) + (s >> 32); + s = (s & 0xffffffff) + (s >> 32); + + return csum_fold((__u32)s); +} + +static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, + __wsum csum) +{ + __u64 s = csum; + int i; + + for (i = 0; i < 4; i++) + s += (__u32)saddr->s6_addr32[i]; + for (i = 0; i < 4; i++) + s += (__u32)daddr->s6_addr32[i]; + s += htons(proto + len); + s = (s & 0xffffffff) + (s >> 32); + s = (s & 0xffffffff) + (s >> 32); + + return csum_fold((__u32)s); +} + #endif -- cgit v1.2.3 From 40808a237d9c8fcaa3eaeefe2ac59e4733907478 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:17 -0800 Subject: selftests/bpf: Add TX side to xdp_metadata Request TX timestamp and make sure it's not empty. Request TX checksum offload (SW-only) and make sure it's resolved to the correct one. Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20231127190319.1190813-12-sdf@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/xdp_metadata.c | 33 +++++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index 4439ba9392f8..33cdf88efa6b 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -56,7 +56,8 @@ static int open_xsk(int ifindex, struct xsk *xsk) .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, - .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG, + .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG | XDP_UMEM_TX_SW_CSUM, + .tx_metadata_len = sizeof(struct xsk_tx_metadata), }; __u32 idx; u64 addr; @@ -138,6 +139,7 @@ static void ip_csum(struct iphdr *iph) static int generate_packet(struct xsk *xsk, __u16 dst_port) { + struct xsk_tx_metadata *meta; struct xdp_desc *tx_desc; struct udphdr *udph; struct ethhdr *eth; @@ -151,10 +153,14 @@ static int generate_packet(struct xsk *xsk, __u16 dst_port) return -1; tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx); - tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE; + tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE + sizeof(struct xsk_tx_metadata); printf("%p: tx_desc[%u]->addr=%llx\n", xsk, idx, tx_desc->addr); data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr); + meta = data - sizeof(struct xsk_tx_metadata); + memset(meta, 0, sizeof(*meta)); + meta->flags = XDP_TXMD_FLAGS_TIMESTAMP; + eth = data; iph = (void *)(eth + 1); udph = (void *)(iph + 1); @@ -178,11 +184,17 @@ static int generate_packet(struct xsk *xsk, __u16 dst_port) udph->source = htons(AF_XDP_SOURCE_PORT); udph->dest = htons(dst_port); udph->len = htons(sizeof(*udph) + UDP_PAYLOAD_BYTES); - udph->check = 0; + udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + ntohs(udph->len), IPPROTO_UDP, 0); memset(udph + 1, 0xAA, UDP_PAYLOAD_BYTES); + meta->flags |= XDP_TXMD_FLAGS_CHECKSUM; + meta->request.csum_start = sizeof(*eth) + sizeof(*iph); + meta->request.csum_offset = offsetof(struct udphdr, check); + tx_desc->len = sizeof(*eth) + sizeof(*iph) + sizeof(*udph) + UDP_PAYLOAD_BYTES; + tx_desc->options |= XDP_TX_METADATA; xsk_ring_prod__submit(&xsk->tx, 1); ret = sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0); @@ -194,13 +206,21 @@ static int generate_packet(struct xsk *xsk, __u16 dst_port) static void complete_tx(struct xsk *xsk) { - __u32 idx; + struct xsk_tx_metadata *meta; __u64 addr; + void *data; + __u32 idx; if (ASSERT_EQ(xsk_ring_cons__peek(&xsk->comp, 1, &idx), 1, "xsk_ring_cons__peek")) { addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx); printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr); + + data = xsk_umem__get_data(xsk->umem_area, addr); + meta = data - sizeof(struct xsk_tx_metadata); + + ASSERT_NEQ(meta->completion.tx_timestamp, 0, "tx_timestamp"); + xsk_ring_cons__release(&xsk->comp, 1); } } @@ -221,6 +241,7 @@ static int verify_xsk_metadata(struct xsk *xsk) const struct xdp_desc *rx_desc; struct pollfd fds = {}; struct xdp_meta *meta; + struct udphdr *udph; struct ethhdr *eth; struct iphdr *iph; __u64 comp_addr; @@ -257,6 +278,7 @@ static int verify_xsk_metadata(struct xsk *xsk) ASSERT_EQ(eth->h_proto, htons(ETH_P_IP), "eth->h_proto"); iph = (void *)(eth + 1); ASSERT_EQ((int)iph->version, 4, "iph->version"); + udph = (void *)(iph + 1); /* custom metadata */ @@ -270,6 +292,9 @@ static int verify_xsk_metadata(struct xsk *xsk) ASSERT_EQ(meta->rx_hash_type, 0, "rx_hash_type"); + /* checksum offload */ + ASSERT_EQ(udph->check, htons(0x721c), "csum"); + xsk_ring_cons__release(&xsk->rx, 1); refill_rx(xsk, comp_addr); -- cgit v1.2.3 From 12b4b7963d3cca253db3c31aa7611b988699e30f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:18 -0800 Subject: selftests/bpf: Convert xdp_hw_metadata to XDP_USE_NEED_WAKEUP This is the recommended way to run AF_XDP, so let's use it in the test. Also, some unrelated changes to now blow up the log too much: - change default mode to zerocopy and add -c to use copy mode - small fixes for the flags/sizes/prints - add print_tstamp_delta to print timestamp + reference Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20231127190319.1190813-13-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/xdp_hw_metadata.c | 73 ++++++++++++++++++--------- 1 file changed, 49 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index c3ba40d0b9de..4484b17c7a56 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -32,7 +32,7 @@ #include "xdp_metadata.h" -#define UMEM_NUM 16 +#define UMEM_NUM 256 #define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE #define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM) #define XDP_FLAGS (XDP_FLAGS_DRV_MODE | XDP_FLAGS_REPLACE) @@ -48,7 +48,7 @@ struct xsk { }; struct xdp_hw_metadata *bpf_obj; -__u16 bind_flags = XDP_COPY; +__u16 bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY; struct xsk *rx_xsk; const char *ifname; int ifindex; @@ -68,7 +68,7 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id) .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, - .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG, + .flags = XSK_UMEM__DEFAULT_FLAGS, }; __u32 idx; u64 addr; @@ -110,7 +110,7 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id) for (i = 0; i < UMEM_NUM / 2; i++) { addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE; printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr); - *xsk_ring_prod__fill_addr(&xsk->fill, i) = addr; + *xsk_ring_prod__fill_addr(&xsk->fill, idx + i) = addr; } xsk_ring_prod__submit(&xsk->fill, ret); @@ -131,12 +131,22 @@ static void refill_rx(struct xsk *xsk, __u64 addr) __u32 idx; if (xsk_ring_prod__reserve(&xsk->fill, 1, &idx) == 1) { - printf("%p: complete idx=%u addr=%llx\n", xsk, idx, addr); + printf("%p: complete rx idx=%u addr=%llx\n", xsk, idx, addr); *xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr; xsk_ring_prod__submit(&xsk->fill, 1); } } +static int kick_tx(struct xsk *xsk) +{ + return sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0); +} + +static int kick_rx(struct xsk *xsk) +{ + return recvfrom(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, NULL); +} + #define NANOSEC_PER_SEC 1000000000 /* 10^9 */ static __u64 gettime(clockid_t clock_id) { @@ -152,6 +162,17 @@ static __u64 gettime(clockid_t clock_id) return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; } +static void print_tstamp_delta(const char *name, const char *refname, + __u64 tstamp, __u64 reference) +{ + __s64 delta = (__s64)reference - (__s64)tstamp; + + printf("%s: %llu (sec:%0.4f) delta to %s sec:%0.4f (%0.3f usec)\n", + name, tstamp, (double)tstamp / NANOSEC_PER_SEC, refname, + (double)delta / NANOSEC_PER_SEC, + (double)delta / 1000); +} + static void verify_xdp_metadata(void *data, clockid_t clock_id) { struct xdp_meta *meta; @@ -167,22 +188,13 @@ static void verify_xdp_metadata(void *data, clockid_t clock_id) printf("rx_timestamp: %llu (sec:%0.4f)\n", meta->rx_timestamp, (double)meta->rx_timestamp / NANOSEC_PER_SEC); if (meta->rx_timestamp) { - __u64 usr_clock = gettime(clock_id); - __u64 xdp_clock = meta->xdp_timestamp; - __s64 delta_X = xdp_clock - meta->rx_timestamp; - __s64 delta_X2U = usr_clock - xdp_clock; - - printf("XDP RX-time: %llu (sec:%0.4f) delta sec:%0.4f (%0.3f usec)\n", - xdp_clock, (double)xdp_clock / NANOSEC_PER_SEC, - (double)delta_X / NANOSEC_PER_SEC, - (double)delta_X / 1000); - - printf("AF_XDP time: %llu (sec:%0.4f) delta sec:%0.4f (%0.3f usec)\n", - usr_clock, (double)usr_clock / NANOSEC_PER_SEC, - (double)delta_X2U / NANOSEC_PER_SEC, - (double)delta_X2U / 1000); - } + __u64 ref_tstamp = gettime(clock_id); + print_tstamp_delta("HW RX-time", "User RX-time", + meta->rx_timestamp, ref_tstamp); + print_tstamp_delta("XDP RX-time", "User RX-time", + meta->xdp_timestamp, ref_tstamp); + } } static void verify_skb_metadata(int fd) @@ -252,6 +264,13 @@ static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t while (true) { errno = 0; + + for (i = 0; i < rxq; i++) { + ret = kick_rx(&rx_xsk[i]); + if (ret) + printf("kick_rx ret=%d\n", ret); + } + ret = poll(fds, rxq + 1, 1000); printf("poll: %d (%d) skip=%llu fail=%llu redir=%llu\n", ret, errno, bpf_obj->bss->pkts_skip, @@ -420,8 +439,9 @@ static void print_usage(void) { const char *usage = "Usage: xdp_hw_metadata [OPTIONS] [IFNAME]\n" - " -m Enable multi-buffer XDP for larger MTU\n" + " -c Run in copy mode (zerocopy is default)\n" " -h Display this help and exit\n\n" + " -m Enable multi-buffer XDP for larger MTU\n" "Generate test packets on the other machine with:\n" " echo -n xdp | nc -u -q1 9091\n"; @@ -432,14 +452,19 @@ static void read_args(int argc, char *argv[]) { int opt; - while ((opt = getopt(argc, argv, "mh")) != -1) { + while ((opt = getopt(argc, argv, "chm")) != -1) { switch (opt) { - case 'm': - bind_flags |= XDP_USE_SG; + case 'c': + bind_flags &= ~XDP_USE_NEED_WAKEUP; + bind_flags &= ~XDP_ZEROCOPY; + bind_flags |= XDP_COPY; break; case 'h': print_usage(); exit(0); + case 'm': + bind_flags |= XDP_USE_SG; + break; case '?': if (isprint(optopt)) fprintf(stderr, "Unknown option: -%c\n", optopt); -- cgit v1.2.3 From 60523115c1b10c8855492d84c1ba88af452e221c Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:19 -0800 Subject: selftests/bpf: Add TX side to xdp_hw_metadata When we get a packet on port 9091, we swap src/dst and send it out. At this point we also request the timestamp and checksum offloads. Checksum offload is verified by looking at the tcpdump on the other side. The tool prints pseudo-header csum and the final one it expects. The final checksum actually matches the incoming packets checksum because we only flip the src/dst and don't change the payload. Some other related changes: - switched to zerocopy mode by default; new flag can be used to force old behavior - request fixed tx_metadata_len headroom - some other small fixes (umem size, fill idx+i, etc) mvbz3:~# ./xdp_hw_metadata eth3 ... xsk_ring_cons__peek: 1 0x19546f8: rx_desc[0]->addr=80100 addr=80100 comp_addr=80100 rx_hash: 0x80B7EA8B with RSS type:0x2A rx_timestamp: 1697580171852147395 (sec:1697580171.8521) HW RX-time: 1697580171852147395 (sec:1697580171.8521), delta to User RX-time sec:0.2797 (279673.082 usec) XDP RX-time: 1697580172131699047 (sec:1697580172.1317), delta to User RX-time sec:0.0001 (121.430 usec) 0x19546f8: ping-pong with csum=3b8e (want d862) csum_start=54 csum_offset=6 0x19546f8: complete tx idx=0 addr=8 tx_timestamp: 1697580172056756493 (sec:1697580172.0568) HW TX-complete-time: 1697580172056756493 (sec:1697580172.0568), delta to User TX-complete-time sec:0.0852 (85175.537 usec) XDP RX-time: 1697580172131699047 (sec:1697580172.1317), delta to User TX-complete-time sec:0.0102 (10232.983 usec) HW RX-time: 1697580171852147395 (sec:1697580171.8521), delta to HW TX-complete-time sec:0.2046 (204609.098 usec) 0x19546f8: complete rx idx=128 addr=80100 mvbz4:~# nc -Nu -q1 ${MVBZ3_LINK_LOCAL_IP}%eth3 9091 mvbz4:~# tcpdump -vvx -i eth3 udp tcpdump: listening on eth3, link-type EN10MB (Ethernet), snapshot length 262144 bytes 12:26:09.301074 IP6 (flowlabel 0x35fa5, hlim 127, next-header UDP (17) payload length: 11) fe80::1270:fdff:fe48:1087.55807 > fe80::1270:fdff:fe48:1077.9091: [bad udp cksum 0x3b8e -> 0xde7e!] UDP, length 3 0x0000: 6003 5fa5 000b 117f fe80 0000 0000 0000 0x0010: 1270 fdff fe48 1087 fe80 0000 0000 0000 0x0020: 1270 fdff fe48 1077 d9ff 2383 000b 3b8e 0x0030: 7864 70 12:26:09.301976 IP6 (flowlabel 0x35fa5, hlim 127, next-header UDP (17) payload length: 11) fe80::1270:fdff:fe48:1077.9091 > fe80::1270:fdff:fe48:1087.55807: [udp sum ok] UDP, length 3 0x0000: 6003 5fa5 000b 117f fe80 0000 0000 0000 0x0010: 1270 fdff fe48 1077 fe80 0000 0000 0000 0x0020: 1270 fdff fe48 1087 2383 d9ff 000b de7e 0x0030: 7864 70 Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20231127190319.1190813-14-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/xdp_hw_metadata.c | 164 +++++++++++++++++++++++++- 1 file changed, 160 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 4484b17c7a56..3291625ba4fb 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -10,7 +10,9 @@ * - rx_hash * * TX: - * - TBD + * - UDP 9091 packets trigger TX reply + * - TX HW timestamp is requested and reported back upon completion + * - TX checksum is requested */ #include @@ -24,11 +26,14 @@ #include #include #include +#include #include #include #include #include #include +#include +#include #include "xdp_metadata.h" @@ -53,6 +58,9 @@ struct xsk *rx_xsk; const char *ifname; int ifindex; int rxq; +bool skip_tx; +__u64 last_hw_rx_timestamp; +__u64 last_xdp_rx_timestamp; void test__fail(void) { /* for network_helpers.c */ } @@ -69,6 +77,7 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id) .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, .flags = XSK_UMEM__DEFAULT_FLAGS, + .tx_metadata_len = sizeof(struct xsk_tx_metadata), }; __u32 idx; u64 addr; @@ -185,15 +194,19 @@ static void verify_xdp_metadata(void *data, clockid_t clock_id) printf("rx_hash: 0x%X with RSS type:0x%X\n", meta->rx_hash, meta->rx_hash_type); - printf("rx_timestamp: %llu (sec:%0.4f)\n", meta->rx_timestamp, - (double)meta->rx_timestamp / NANOSEC_PER_SEC); if (meta->rx_timestamp) { __u64 ref_tstamp = gettime(clock_id); + /* store received timestamps to calculate a delta at tx */ + last_hw_rx_timestamp = meta->rx_timestamp; + last_xdp_rx_timestamp = meta->xdp_timestamp; + print_tstamp_delta("HW RX-time", "User RX-time", meta->rx_timestamp, ref_tstamp); print_tstamp_delta("XDP RX-time", "User RX-time", meta->xdp_timestamp, ref_tstamp); + } else { + printf("No rx_timestamp\n"); } } @@ -242,6 +255,129 @@ static void verify_skb_metadata(int fd) printf("skb hwtstamp is not found!\n"); } +static bool complete_tx(struct xsk *xsk, clockid_t clock_id) +{ + struct xsk_tx_metadata *meta; + __u64 addr; + void *data; + __u32 idx; + + if (!xsk_ring_cons__peek(&xsk->comp, 1, &idx)) + return false; + + addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx); + data = xsk_umem__get_data(xsk->umem_area, addr); + meta = data - sizeof(struct xsk_tx_metadata); + + printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr); + + if (meta->completion.tx_timestamp) { + __u64 ref_tstamp = gettime(clock_id); + + print_tstamp_delta("HW TX-complete-time", "User TX-complete-time", + meta->completion.tx_timestamp, ref_tstamp); + print_tstamp_delta("XDP RX-time", "User TX-complete-time", + last_xdp_rx_timestamp, ref_tstamp); + print_tstamp_delta("HW RX-time", "HW TX-complete-time", + last_hw_rx_timestamp, meta->completion.tx_timestamp); + } else { + printf("No tx_timestamp\n"); + } + + xsk_ring_cons__release(&xsk->comp, 1); + + return true; +} + +#define swap(a, b, len) do { \ + for (int i = 0; i < len; i++) { \ + __u8 tmp = ((__u8 *)a)[i]; \ + ((__u8 *)a)[i] = ((__u8 *)b)[i]; \ + ((__u8 *)b)[i] = tmp; \ + } \ +} while (0) + +static void ping_pong(struct xsk *xsk, void *rx_packet, clockid_t clock_id) +{ + struct xsk_tx_metadata *meta; + struct ipv6hdr *ip6h = NULL; + struct iphdr *iph = NULL; + struct xdp_desc *tx_desc; + struct udphdr *udph; + struct ethhdr *eth; + __sum16 want_csum; + void *data; + __u32 idx; + int ret; + int len; + + ret = xsk_ring_prod__reserve(&xsk->tx, 1, &idx); + if (ret != 1) { + printf("%p: failed to reserve tx slot\n", xsk); + return; + } + + tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx); + tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE + sizeof(struct xsk_tx_metadata); + data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr); + + meta = data - sizeof(struct xsk_tx_metadata); + memset(meta, 0, sizeof(*meta)); + meta->flags = XDP_TXMD_FLAGS_TIMESTAMP; + + eth = rx_packet; + + if (eth->h_proto == htons(ETH_P_IP)) { + iph = (void *)(eth + 1); + udph = (void *)(iph + 1); + } else if (eth->h_proto == htons(ETH_P_IPV6)) { + ip6h = (void *)(eth + 1); + udph = (void *)(ip6h + 1); + } else { + printf("%p: failed to detect IP version for ping pong %04x\n", xsk, eth->h_proto); + xsk_ring_prod__cancel(&xsk->tx, 1); + return; + } + + len = ETH_HLEN; + if (ip6h) + len += sizeof(*ip6h) + ntohs(ip6h->payload_len); + if (iph) + len += ntohs(iph->tot_len); + + swap(eth->h_dest, eth->h_source, ETH_ALEN); + if (iph) + swap(&iph->saddr, &iph->daddr, 4); + else + swap(&ip6h->saddr, &ip6h->daddr, 16); + swap(&udph->source, &udph->dest, 2); + + want_csum = udph->check; + if (ip6h) + udph->check = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ntohs(udph->len), IPPROTO_UDP, 0); + else + udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + ntohs(udph->len), IPPROTO_UDP, 0); + + meta->flags |= XDP_TXMD_FLAGS_CHECKSUM; + if (iph) + meta->request.csum_start = sizeof(*eth) + sizeof(*iph); + else + meta->request.csum_start = sizeof(*eth) + sizeof(*ip6h); + meta->request.csum_offset = offsetof(struct udphdr, check); + + printf("%p: ping-pong with csum=%04x (want %04x) csum_start=%d csum_offset=%d\n", + xsk, ntohs(udph->check), ntohs(want_csum), + meta->request.csum_start, meta->request.csum_offset); + + memcpy(data, rx_packet, len); /* don't share umem chunk for simplicity */ + tx_desc->options |= XDP_TX_METADATA; + tx_desc->len = len; + + xsk_ring_prod__submit(&xsk->tx, 1); +} + static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t clock_id) { const struct xdp_desc *rx_desc; @@ -307,6 +443,22 @@ peek: verify_xdp_metadata(xsk_umem__get_data(xsk->umem_area, addr), clock_id); first_seg = false; + + if (!skip_tx) { + /* mirror first chunk back */ + ping_pong(xsk, xsk_umem__get_data(xsk->umem_area, addr), + clock_id); + + ret = kick_tx(xsk); + if (ret) + printf("kick_tx ret=%d\n", ret); + + for (int j = 0; j < 500; j++) { + if (complete_tx(xsk, clock_id)) + break; + usleep(10*1000); + } + } } xsk_ring_cons__release(&xsk->rx, 1); @@ -442,6 +594,7 @@ static void print_usage(void) " -c Run in copy mode (zerocopy is default)\n" " -h Display this help and exit\n\n" " -m Enable multi-buffer XDP for larger MTU\n" + " -r Don't generate AF_XDP reply (rx metadata only)\n" "Generate test packets on the other machine with:\n" " echo -n xdp | nc -u -q1 9091\n"; @@ -452,7 +605,7 @@ static void read_args(int argc, char *argv[]) { int opt; - while ((opt = getopt(argc, argv, "chm")) != -1) { + while ((opt = getopt(argc, argv, "chmr")) != -1) { switch (opt) { case 'c': bind_flags &= ~XDP_USE_NEED_WAKEUP; @@ -465,6 +618,9 @@ static void read_args(int argc, char *argv[]) case 'm': bind_flags |= XDP_USE_SG; break; + case 'r': + skip_tx = true; + break; case '?': if (isprint(optopt)) fprintf(stderr, "Unknown option: -%c\n", optopt); -- cgit v1.2.3