From 8a021e7fa10576eeb3938328f39bbf98fe7d4715 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Thu, 21 Dec 2023 18:22:24 -0500 Subject: bpf: Simplify checking size of helper accesses This patch simplifies the verification of size arguments associated to pointer arguments to helpers and kfuncs. Many helpers take a pointer argument followed by the size of the memory access performed to be performed through that pointer. Before this patch, the handling of the size argument in check_mem_size_reg() was confusing and wasteful: if the size register's lower bound was 0, then the verification was done twice: once considering the size of the access to be the lower-bound of the respective argument, and once considering the upper bound (even if the two are the same). The upper bound checking is a super-set of the lower-bound checking(*), except: the only point of the lower-bound check is to handle the case where zero-sized-accesses are explicitly not allowed and the lower-bound is zero. This static condition is now checked explicitly, replacing a much more complex, expensive and confusing verification call to check_helper_mem_access(). Error messages change in this patch. Before, messages about illegal zero-size accesses depended on the type of the pointer and on other conditions, and sometimes the message was plain wrong: in some tests that changed you'll see that the old message was something like "R1 min value is outside of the allowed memory range", where R1 is the pointer register; the error was wrongly claiming that the pointer was bad instead of the size being bad. Other times the information that the size came for a register with a possible range of values was wrong, and the error presented the size as a fixed zero. Now the errors refer to the right register. However, the old error messages did contain useful information about the pointer register which is now lost; recovering this information was deemed not important enough. (*) Besides standing to reason that the checks for a bigger size access are a super-set of the checks for a smaller size access, I have also mechanically verified this by reading the code for all types of pointers. I could convince myself that it's true for all but PTR_TO_BTF_ID (check_ptr_to_btf_access). There, simply looking line-by-line does not immediately prove what we want. If anyone has any qualms, let me know. Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231221232225.568730-2-andreimatei1@gmail.com --- kernel/bpf/verifier.c | 10 ++++------ .../testing/selftests/bpf/progs/verifier_helper_value_access.c | 8 ++++---- tools/testing/selftests/bpf/progs/verifier_raw_stack.c | 2 +- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a376eb609c41..d4e31f61de0e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7279,12 +7279,10 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, return -EACCES; } - if (reg->umin_value == 0) { - err = check_helper_mem_access(env, regno - 1, 0, - zero_size_allowed, - meta); - if (err) - return err; + if (reg->umin_value == 0 && !zero_size_allowed) { + verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n", + regno, reg->umin_value, reg->umax_value); + return -EACCES; } if (reg->umax_value >= BPF_MAX_VAR_SIZ) { diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c b/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c index 692216c0ad3d..3e8340c2408f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c @@ -91,7 +91,7 @@ l0_%=: exit; \ SEC("tracepoint") __description("helper access to map: empty range") -__failure __msg("invalid access to map value, value_size=48 off=0 size=0") +__failure __msg("R2 invalid zero-sized read") __naked void access_to_map_empty_range(void) { asm volatile (" \ @@ -221,7 +221,7 @@ l0_%=: exit; \ SEC("tracepoint") __description("helper access to adjusted map (via const imm): empty range") -__failure __msg("invalid access to map value, value_size=48 off=4 size=0") +__failure __msg("R2 invalid zero-sized read") __naked void via_const_imm_empty_range(void) { asm volatile (" \ @@ -386,7 +386,7 @@ l0_%=: exit; \ SEC("tracepoint") __description("helper access to adjusted map (via const reg): empty range") -__failure __msg("R1 min value is outside of the allowed memory range") +__failure __msg("R2 invalid zero-sized read") __naked void via_const_reg_empty_range(void) { asm volatile (" \ @@ -556,7 +556,7 @@ l0_%=: exit; \ SEC("tracepoint") __description("helper access to adjusted map (via variable): empty range") -__failure __msg("R1 min value is outside of the allowed memory range") +__failure __msg("R2 invalid zero-sized read") __naked void map_via_variable_empty_range(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_raw_stack.c b/tools/testing/selftests/bpf/progs/verifier_raw_stack.c index f67390224a9c..7cc83acac727 100644 --- a/tools/testing/selftests/bpf/progs/verifier_raw_stack.c +++ b/tools/testing/selftests/bpf/progs/verifier_raw_stack.c @@ -64,7 +64,7 @@ __naked void load_bytes_negative_len_2(void) SEC("tc") __description("raw_stack: skb_load_bytes, zero len") -__failure __msg("invalid zero-sized read") +__failure __msg("R4 invalid zero-sized read: u64=[0,0]") __naked void skb_load_bytes_zero_len(void) { asm volatile (" \ -- cgit v1.2.3 From 72187506de4f19fcc8ae63a2b2f36d75e5259d9d Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Thu, 21 Dec 2023 18:22:25 -0500 Subject: bpf: Add a possibly-zero-sized read test This patch adds a test for the condition that the previous patch mucked with - illegal zero-sized helper memory access. As opposed to existing tests, this new one uses a size whose lower bound is zero, as opposed to a known-zero one. Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231221232225.568730-3-andreimatei1@gmail.com --- .../bpf/progs/verifier_helper_value_access.c | 39 +++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c b/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c index 3e8340c2408f..886498b5e6f3 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c @@ -89,9 +89,14 @@ l0_%=: exit; \ : __clobber_all); } +/* Call a function taking a pointer and a size which doesn't allow the size to + * be zero (i.e. bpf_trace_printk() declares the second argument to be + * ARG_CONST_SIZE, not ARG_CONST_SIZE_OR_ZERO). We attempt to pass zero for the + * size and expect to fail. + */ SEC("tracepoint") __description("helper access to map: empty range") -__failure __msg("R2 invalid zero-sized read") +__failure __msg("R2 invalid zero-sized read: u64=[0,0]") __naked void access_to_map_empty_range(void) { asm volatile (" \ @@ -113,6 +118,38 @@ l0_%=: exit; \ : __clobber_all); } +/* Like the test above, but this time the size register is not known to be zero; + * its lower-bound is zero though, which is still unacceptable. + */ +SEC("tracepoint") +__description("helper access to map: possibly-empty ange") +__failure __msg("R2 invalid zero-sized read: u64=[0,4]") +__naked void access_to_map_possibly_empty_range(void) +{ + asm volatile (" \ + r2 = r10; \ + r2 += -8; \ + r1 = 0; \ + *(u64*)(r2 + 0) = r1; \ + r1 = %[map_hash_48b] ll; \ + call %[bpf_map_lookup_elem]; \ + if r0 == 0 goto l0_%=; \ + r1 = r0; \ + /* Read an unknown value */ \ + r7 = *(u64*)(r0 + 0); \ + /* Make it small and positive, to avoid other errors */ \ + r7 &= 4; \ + r2 = 0; \ + r2 += r7; \ + call %[bpf_trace_printk]; \ +l0_%=: exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm(bpf_trace_printk), + __imm_addr(map_hash_48b) + : __clobber_all); +} + SEC("tracepoint") __description("helper access to map: out-of-bound range") __failure __msg("invalid access to map value, value_size=48 off=0 size=56") -- cgit v1.2.3 From 495d2d8133fd1407519170a5238f455abbd9ec9b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 26 Dec 2023 11:11:43 -0800 Subject: selftests/bpf: Attempt to build BPF programs with -Wsign-compare GCC's -Wall includes -Wsign-compare while clang does not. Since BPF programs are built with clang we need to add this flag explicitly to catch problematic comparisons like: int i = -1; unsigned int j = 1; if (i < j) // this is false. long i = -1; unsigned int j = 1; if (i < j) // this is true. C standard for reference: - If either operand is unsigned long the other shall be converted to unsigned long. - Otherwise, if one operand is a long int and the other unsigned int, then if a long int can represent all the values of an unsigned int, the unsigned int shall be converted to a long int; otherwise both operands shall be converted to unsigned long int. - Otherwise, if either operand is long, the other shall be converted to long. - Otherwise, if either operand is unsigned, the other shall be converted to unsigned. Unfortunately clang's -Wsign-compare is very noisy. It complains about (s32)a == (u32)b which is safe and doen't have surprising behavior. This patch fixes some of the issues. It needs a follow up to fix the rest. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20231226191148.48536-2-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/Makefile | 1 + tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c | 2 +- tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c | 2 +- tools/testing/selftests/bpf/progs/bpf_iter_tasks.c | 2 +- tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c | 2 +- .../testing/selftests/bpf/progs/cgroup_getset_retval_setsockopt.c | 2 +- tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c | 2 +- tools/testing/selftests/bpf/progs/cpumask_success.c | 2 +- tools/testing/selftests/bpf/progs/iters.c | 4 ++-- tools/testing/selftests/bpf/progs/linked_funcs1.c | 2 +- tools/testing/selftests/bpf/progs/linked_funcs2.c | 2 +- tools/testing/selftests/bpf/progs/linked_list.c | 2 +- tools/testing/selftests/bpf/progs/local_storage.c | 2 +- tools/testing/selftests/bpf/progs/lsm.c | 2 +- tools/testing/selftests/bpf/progs/normal_map_btf.c | 2 +- tools/testing/selftests/bpf/progs/profiler.inc.h | 4 ++-- tools/testing/selftests/bpf/progs/sockopt_inherit.c | 2 +- tools/testing/selftests/bpf/progs/sockopt_multi.c | 2 +- tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c | 2 +- tools/testing/selftests/bpf/progs/test_bpf_ma.c | 2 +- tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c | 2 +- tools/testing/selftests/bpf/progs/test_core_reloc_module.c | 8 ++++---- tools/testing/selftests/bpf/progs/test_fsverity.c | 2 +- tools/testing/selftests/bpf/progs/test_skc_to_unix_sock.c | 2 +- tools/testing/selftests/bpf/progs/test_xdp_do_redirect.c | 2 +- 25 files changed, 30 insertions(+), 29 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 617ae55c3bb5..fd15017ed3b1 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -383,6 +383,7 @@ CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ -I$(abspath $(OUTPUT)/../usr/include) +# TODO: enable me -Wsign-compare CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \ -Wno-compare-distinct-pointer-types diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c index feaaa2b89c57..5014a17d6c02 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c @@ -20,7 +20,7 @@ struct { } hashmap1 SEC(".maps"); /* will set before prog run */ -volatile const __u32 num_cpus = 0; +volatile const __s32 num_cpus = 0; /* will collect results during prog run */ __u32 key_sum_a = 0, key_sum_b = 0, key_sum_c = 0; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c index dd923dc637d5..423b39e60b6f 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c @@ -35,7 +35,7 @@ SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx) return 0; file = vma->vm_file; - if (task->tgid != pid) { + if (task->tgid != (pid_t)pid) { if (one_task) one_task_error = 1; return 0; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tasks.c b/tools/testing/selftests/bpf/progs/bpf_iter_tasks.c index 96131b9a1caa..6cbb3393f243 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_tasks.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_tasks.c @@ -22,7 +22,7 @@ int dump_task(struct bpf_iter__task *ctx) return 0; } - if (task->pid != tid) + if (task->pid != (pid_t)tid) num_unknown_tid++; else num_known_tid++; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c index 400fdf8d6233..dbf61c44acac 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c @@ -45,7 +45,7 @@ int dump_bpf_map(struct bpf_iter__bpf_map *ctx) } /* fill seq_file buffer */ - for (i = 0; i < print_len; i++) + for (i = 0; i < (int)print_len; i++) bpf_seq_write(seq, &seq_num, sizeof(seq_num)); return ret; diff --git a/tools/testing/selftests/bpf/progs/cgroup_getset_retval_setsockopt.c b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_setsockopt.c index b7fa8804e19d..45a0e9f492a9 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_getset_retval_setsockopt.c +++ b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_setsockopt.c @@ -11,7 +11,7 @@ __u32 invocations = 0; __u32 assertion_error = 0; __u32 retval_value = 0; -__u32 page_size = 0; +__s32 page_size = 0; SEC("cgroup/setsockopt") int get_retval(struct bpf_sockopt *ctx) diff --git a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c index facedd8b8250..5e282c16eadc 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c +++ b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c @@ -15,7 +15,7 @@ struct { __type(value, long); } map_a SEC(".maps"); -__u32 target_pid; +__s32 target_pid; __u64 cgroup_id; int target_hid; bool is_cgroup1; diff --git a/tools/testing/selftests/bpf/progs/cpumask_success.c b/tools/testing/selftests/bpf/progs/cpumask_success.c index fc3666edf456..7a1e64c6c065 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_success.c +++ b/tools/testing/selftests/bpf/progs/cpumask_success.c @@ -332,7 +332,7 @@ SEC("tp_btf/task_newtask") int BPF_PROG(test_copy_any_anyand, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *mask1, *mask2, *dst1, *dst2; - u32 cpu; + int cpu; if (!is_test_task()) return 0; diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 3aca3dc145b5..fe971992e635 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -6,7 +6,7 @@ #include #include "bpf_misc.h" -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof((x)[0])) static volatile int zero = 0; @@ -676,7 +676,7 @@ static __noinline int sum(struct bpf_iter_num *it, int *arr, __u32 n) while ((t = bpf_iter_num_next(it))) { i = *t; - if (i >= n) + if ((__u32)i >= n) break; sum += arr[i]; } diff --git a/tools/testing/selftests/bpf/progs/linked_funcs1.c b/tools/testing/selftests/bpf/progs/linked_funcs1.c index c4b49ceea967..cc79dddac182 100644 --- a/tools/testing/selftests/bpf/progs/linked_funcs1.c +++ b/tools/testing/selftests/bpf/progs/linked_funcs1.c @@ -8,7 +8,7 @@ #include "bpf_misc.h" /* weak and shared between two files */ -const volatile int my_tid __weak; +const volatile __u32 my_tid __weak; long syscall_id __weak; int output_val1; diff --git a/tools/testing/selftests/bpf/progs/linked_funcs2.c b/tools/testing/selftests/bpf/progs/linked_funcs2.c index 013ff0645f0c..942cc5526ddf 100644 --- a/tools/testing/selftests/bpf/progs/linked_funcs2.c +++ b/tools/testing/selftests/bpf/progs/linked_funcs2.c @@ -68,7 +68,7 @@ int BPF_PROG(handler2, struct pt_regs *regs, long id) { static volatile int whatever; - if (my_tid != (u32)bpf_get_current_pid_tgid() || id != syscall_id) + if (my_tid != (s32)bpf_get_current_pid_tgid() || id != syscall_id) return 0; /* make sure we have CO-RE relocations in main program */ diff --git a/tools/testing/selftests/bpf/progs/linked_list.c b/tools/testing/selftests/bpf/progs/linked_list.c index 84d1777a9e6c..26205ca80679 100644 --- a/tools/testing/selftests/bpf/progs/linked_list.c +++ b/tools/testing/selftests/bpf/progs/linked_list.c @@ -6,7 +6,7 @@ #include "bpf_experimental.h" #ifndef ARRAY_SIZE -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof((x)[0])) #endif #include "linked_list.h" diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c index bc8ea56671a1..e5e3a8b8dd07 100644 --- a/tools/testing/selftests/bpf/progs/local_storage.c +++ b/tools/testing/selftests/bpf/progs/local_storage.c @@ -13,7 +13,7 @@ char _license[] SEC("license") = "GPL"; #define DUMMY_STORAGE_VALUE 0xdeadbeef -int monitored_pid = 0; +__u32 monitored_pid = 0; int inode_storage_result = -1; int sk_storage_result = -1; int task_storage_result = -1; diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c index fadfdd98707c..0c13b7409947 100644 --- a/tools/testing/selftests/bpf/progs/lsm.c +++ b/tools/testing/selftests/bpf/progs/lsm.c @@ -92,7 +92,7 @@ int BPF_PROG(test_int_hook, struct vm_area_struct *vma, if (ret != 0) return ret; - __u32 pid = bpf_get_current_pid_tgid() >> 32; + __s32 pid = bpf_get_current_pid_tgid() >> 32; int is_stack = 0; is_stack = (vma->vm_start <= vma->vm_mm->start_stack && diff --git a/tools/testing/selftests/bpf/progs/normal_map_btf.c b/tools/testing/selftests/bpf/progs/normal_map_btf.c index 66cde82aa86d..a45c9299552c 100644 --- a/tools/testing/selftests/bpf/progs/normal_map_btf.c +++ b/tools/testing/selftests/bpf/progs/normal_map_btf.c @@ -36,7 +36,7 @@ int add_to_list_in_array(void *ctx) struct node_data *new; int zero = 0; - if (done || (u32)bpf_get_current_pid_tgid() != pid) + if (done || (int)bpf_get_current_pid_tgid() != pid) return 0; value = bpf_map_lookup_elem(&array, &zero); diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h index 897061930cb7..ba99d17dac54 100644 --- a/tools/testing/selftests/bpf/progs/profiler.inc.h +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -132,7 +132,7 @@ struct { } disallowed_exec_inodes SEC(".maps"); #ifndef ARRAY_SIZE -#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) +#define ARRAY_SIZE(arr) (int)(sizeof(arr) / sizeof(arr[0])) #endif static INLINE bool IS_ERR(const void* ptr) @@ -645,7 +645,7 @@ int raw_tracepoint__sched_process_exit(void* ctx) for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) { struct var_kill_data_t* past_kill_data = &arr_struct->array[i]; - if (past_kill_data != NULL && past_kill_data->kill_target_pid == tpid) { + if (past_kill_data != NULL && past_kill_data->kill_target_pid == (pid_t)tpid) { bpf_probe_read_kernel(kill_data, sizeof(*past_kill_data), past_kill_data); void* payload = kill_data->payload; diff --git a/tools/testing/selftests/bpf/progs/sockopt_inherit.c b/tools/testing/selftests/bpf/progs/sockopt_inherit.c index c8f59caa4639..a3434b840928 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_inherit.c +++ b/tools/testing/selftests/bpf/progs/sockopt_inherit.c @@ -9,7 +9,7 @@ char _license[] SEC("license") = "GPL"; #define CUSTOM_INHERIT2 1 #define CUSTOM_LISTENER 2 -__u32 page_size = 0; +__s32 page_size = 0; struct sockopt_inherit { __u8 val; diff --git a/tools/testing/selftests/bpf/progs/sockopt_multi.c b/tools/testing/selftests/bpf/progs/sockopt_multi.c index 96f29fce050b..db67278e12d4 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_multi.c +++ b/tools/testing/selftests/bpf/progs/sockopt_multi.c @@ -5,7 +5,7 @@ char _license[] SEC("license") = "GPL"; -__u32 page_size = 0; +__s32 page_size = 0; SEC("cgroup/getsockopt") int _getsockopt_child(struct bpf_sockopt *ctx) diff --git a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c index dbe235ede7f3..83753b00a556 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c +++ b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c @@ -9,7 +9,7 @@ char _license[] SEC("license") = "GPL"; -__u32 page_size = 0; +__s32 page_size = 0; SEC("cgroup/setsockopt") int sockopt_qos_to_cc(struct bpf_sockopt *ctx) diff --git a/tools/testing/selftests/bpf/progs/test_bpf_ma.c b/tools/testing/selftests/bpf/progs/test_bpf_ma.c index 069db9085e78..b78f4f702ae0 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_ma.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_ma.c @@ -21,7 +21,7 @@ const unsigned int data_sizes[] = {16, 32, 64, 96, 128, 192, 256, 512, 1024, 204 const volatile unsigned int data_btf_ids[ARRAY_SIZE(data_sizes)] = {}; int err = 0; -int pid = 0; +u32 pid = 0; #define DEFINE_ARRAY_WITH_KPTR(_size) \ struct bin_data_##_size { \ diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c index a17dd83eae67..ee4a601dcb06 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c @@ -53,7 +53,7 @@ int test_core_kernel(void *ctx) struct task_struct *task = (void *)bpf_get_current_task(); struct core_reloc_kernel_output *out = (void *)&data.out; uint64_t pid_tgid = bpf_get_current_pid_tgid(); - uint32_t real_tgid = (uint32_t)pid_tgid; + int32_t real_tgid = (int32_t)pid_tgid; int pid, tgid; if (data.my_pid_tgid != pid_tgid) diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_module.c b/tools/testing/selftests/bpf/progs/test_core_reloc_module.c index f59f175c7baf..bcb31ff92dcc 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_module.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_module.c @@ -43,8 +43,8 @@ int BPF_PROG(test_core_module_probed, #if __has_builtin(__builtin_preserve_enum_value) struct core_reloc_module_output *out = (void *)&data.out; __u64 pid_tgid = bpf_get_current_pid_tgid(); - __u32 real_tgid = (__u32)(pid_tgid >> 32); - __u32 real_pid = (__u32)pid_tgid; + __s32 real_tgid = (__s32)(pid_tgid >> 32); + __s32 real_pid = (__s32)pid_tgid; if (data.my_pid_tgid != pid_tgid) return 0; @@ -77,8 +77,8 @@ int BPF_PROG(test_core_module_direct, #if __has_builtin(__builtin_preserve_enum_value) struct core_reloc_module_output *out = (void *)&data.out; __u64 pid_tgid = bpf_get_current_pid_tgid(); - __u32 real_tgid = (__u32)(pid_tgid >> 32); - __u32 real_pid = (__u32)pid_tgid; + __s32 real_tgid = (__s32)(pid_tgid >> 32); + __s32 real_pid = (__s32)pid_tgid; if (data.my_pid_tgid != pid_tgid) return 0; diff --git a/tools/testing/selftests/bpf/progs/test_fsverity.c b/tools/testing/selftests/bpf/progs/test_fsverity.c index 3975495b75c8..9e0f73e8189c 100644 --- a/tools/testing/selftests/bpf/progs/test_fsverity.c +++ b/tools/testing/selftests/bpf/progs/test_fsverity.c @@ -38,7 +38,7 @@ int BPF_PROG(test_file_open, struct file *f) return 0; got_fsverity = 1; - for (i = 0; i < sizeof(digest); i++) { + for (i = 0; i < (int)sizeof(digest); i++) { if (digest[i] != expected_digest[i]) return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_skc_to_unix_sock.c b/tools/testing/selftests/bpf/progs/test_skc_to_unix_sock.c index eacda9fe07eb..4cfa42aa9436 100644 --- a/tools/testing/selftests/bpf/progs/test_skc_to_unix_sock.c +++ b/tools/testing/selftests/bpf/progs/test_skc_to_unix_sock.c @@ -29,7 +29,7 @@ int BPF_PROG(unix_listen, struct socket *sock, int backlog) len = unix_sk->addr->len - sizeof(short); path[0] = '@'; for (i = 1; i < len; i++) { - if (i >= sizeof(struct sockaddr_un)) + if (i >= (int)sizeof(struct sockaddr_un)) break; path[i] = unix_sk->addr->name->sun_path[i]; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_do_redirect.c b/tools/testing/selftests/bpf/progs/test_xdp_do_redirect.c index 5baaafed0d2d..3abf068b8446 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_do_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_do_redirect.c @@ -38,7 +38,7 @@ int xdp_redirect(struct xdp_md *xdp) if (payload + 1 > data_end) return XDP_ABORTED; - if (xdp->ingress_ifindex != ifindex_in) + if (xdp->ingress_ifindex != (__u32)ifindex_in) return XDP_ABORTED; if (metadata + 1 > data) -- cgit v1.2.3 From a8b242d77bd72556b7a9d8be779f7d27b95ba73c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 26 Dec 2023 11:11:44 -0800 Subject: bpf: Introduce "volatile compare" macros Compilers optimize conditional operators at will, but often bpf programmers want to force compilers to keep the same operator in asm as it's written in C. Introduce bpf_cmp_likely/unlikely(var1, conditional_op, var2) macros that can be used as: - if (seen >= 1000) + if (bpf_cmp_unlikely(seen, >=, 1000)) The macros take advantage of BPF assembly that is C like. The macros check the sign of variable 'seen' and emits either signed or unsigned compare. For example: int a; bpf_cmp_unlikely(a, >, 0) will be translated to 'if rX s> 0 goto' in BPF assembly. unsigned int a; bpf_cmp_unlikely(a, >, 0) will be translated to 'if rX > 0 goto' in BPF assembly. C type conversions coupled with comparison operator are tricky. int i = -1; unsigned int j = 1; if (i < j) // this is false. long i = -1; unsigned int j = 1; if (i < j) // this is true. Make sure BPF program is compiled with -Wsign-compare then the macros will catch the mistake. The macros check LHS (left hand side) only to figure out the sign of compare. 'if 0 < rX goto' is not allowed in the assembly, so the users have to use a variable on LHS anyway. The patch updates few tests to demonstrate the use of the macros. The macro allows to use BPF_JSET in C code, since LLVM doesn't generate it at present. For example: if (i & j) compiles into r0 &= r1; if r0 == 0 goto while if (bpf_cmp_unlikely(i, &, j)) compiles into if r0 & r1 goto Note that the macros has to be careful with RHS assembly predicate. Since: u64 __rhs = 1ull << 42; asm goto("if r0 < %[rhs] goto +1" :: [rhs] "ri" (__rhs)); LLVM will silently truncate 64-bit constant into s32 imm. Note that [lhs] "r"((short)LHS) the type cast is a workaround for LLVM issue. When LHS is exactly 32-bit LLVM emits redundant <<=32, >>=32 to zero upper 32-bits. When LHS is 64 or 16 or 8-bit variable there are no shifts. When LHS is 32-bit the (u64) cast doesn't help. Hence use (short) cast. It does _not_ truncate the variable before it's assigned to a register. Traditional likely()/unlikely() macros that use __builtin_expect(!!(x), 1 or 0) have no effect on these macros, hence macros implement the logic manually. bpf_cmp_unlikely() macro preserves compare operator as-is while bpf_cmp_likely() macro flips the compare. Consider two cases: A. for() { if (foo >= 10) { bar += foo; } other code; } B. for() { if (foo >= 10) break; other code; } It's ok to use either bpf_cmp_likely or bpf_cmp_unlikely macros in both cases, but consider that 'break' is effectively 'goto out_of_the_loop'. Hence it's better to use bpf_cmp_unlikely in the B case. While 'bar += foo' is better to keep as 'fallthrough' == likely code path in the A case. When it's written as: A. for() { if (bpf_cmp_likely(foo, >=, 10)) { bar += foo; } other code; } B. for() { if (bpf_cmp_unlikely(foo, >=, 10)) break; other code; } The assembly will look like: A. for() { if r1 < 10 goto L1; bar += foo; L1: other code; } B. for() { if r1 >= 10 goto L2; other code; } L2: The bpf_cmp_likely vs bpf_cmp_unlikely changes basic block layout, hence it will greatly influence the verification process. The number of processed instructions will be different, since the verifier walks the fallthrough first. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Daniel Borkmann Acked-by: Jiri Olsa Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20231226191148.48536-3-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/bpf_experimental.h | 69 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/exceptions.c | 20 +++---- tools/testing/selftests/bpf/progs/iters_task_vma.c | 3 +- 3 files changed, 80 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 1386baf9ae4a..19ed6c941c1c 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -254,6 +254,75 @@ extern void bpf_throw(u64 cookie) __ksym; } \ }) +#define __cmp_cannot_be_signed(x) \ + __builtin_strcmp(#x, "==") == 0 || __builtin_strcmp(#x, "!=") == 0 || \ + __builtin_strcmp(#x, "&") == 0 + +#define __is_signed_type(type) (((type)(-1)) < (type)1) + +#define __bpf_cmp(LHS, OP, SIGN, PRED, RHS, DEFAULT) \ + ({ \ + __label__ l_true; \ + bool ret = DEFAULT; \ + asm volatile goto("if %[lhs] " SIGN #OP " %[rhs] goto %l[l_true]" \ + :: [lhs] "r"((short)LHS), [rhs] PRED (RHS) :: l_true); \ + ret = !DEFAULT; \ +l_true: \ + ret; \ + }) + +/* C type conversions coupled with comparison operator are tricky. + * Make sure BPF program is compiled with -Wsign-compare then + * __lhs OP __rhs below will catch the mistake. + * Be aware that we check only __lhs to figure out the sign of compare. + */ +#define _bpf_cmp(LHS, OP, RHS, NOFLIP) \ + ({ \ + typeof(LHS) __lhs = (LHS); \ + typeof(RHS) __rhs = (RHS); \ + bool ret; \ + _Static_assert(sizeof(&(LHS)), "1st argument must be an lvalue expression"); \ + (void)(__lhs OP __rhs); \ + if (__cmp_cannot_be_signed(OP) || !__is_signed_type(typeof(__lhs))) { \ + if (sizeof(__rhs) == 8) \ + ret = __bpf_cmp(__lhs, OP, "", "r", __rhs, NOFLIP); \ + else \ + ret = __bpf_cmp(__lhs, OP, "", "i", __rhs, NOFLIP); \ + } else { \ + if (sizeof(__rhs) == 8) \ + ret = __bpf_cmp(__lhs, OP, "s", "r", __rhs, NOFLIP); \ + else \ + ret = __bpf_cmp(__lhs, OP, "s", "i", __rhs, NOFLIP); \ + } \ + ret; \ + }) + +#ifndef bpf_cmp_unlikely +#define bpf_cmp_unlikely(LHS, OP, RHS) _bpf_cmp(LHS, OP, RHS, true) +#endif + +#ifndef bpf_cmp_likely +#define bpf_cmp_likely(LHS, OP, RHS) \ + ({ \ + bool ret; \ + if (__builtin_strcmp(#OP, "==") == 0) \ + ret = _bpf_cmp(LHS, !=, RHS, false); \ + else if (__builtin_strcmp(#OP, "!=") == 0) \ + ret = _bpf_cmp(LHS, ==, RHS, false); \ + else if (__builtin_strcmp(#OP, "<=") == 0) \ + ret = _bpf_cmp(LHS, >, RHS, false); \ + else if (__builtin_strcmp(#OP, "<") == 0) \ + ret = _bpf_cmp(LHS, >=, RHS, false); \ + else if (__builtin_strcmp(#OP, ">") == 0) \ + ret = _bpf_cmp(LHS, <=, RHS, false); \ + else if (__builtin_strcmp(#OP, ">=") == 0) \ + ret = _bpf_cmp(LHS, <, RHS, false); \ + else \ + (void) "bug"; \ + ret; \ + }) +#endif + /* Description * Assert that a conditional expression is true. * Returns diff --git a/tools/testing/selftests/bpf/progs/exceptions.c b/tools/testing/selftests/bpf/progs/exceptions.c index 2811ee842b01..f09cd14d8e04 100644 --- a/tools/testing/selftests/bpf/progs/exceptions.c +++ b/tools/testing/selftests/bpf/progs/exceptions.c @@ -210,7 +210,7 @@ __noinline int assert_zero_gfunc(u64 c) { volatile u64 cookie = c; - bpf_assert_eq(cookie, 0); + bpf_assert(bpf_cmp_unlikely(cookie, ==, 0)); return 0; } @@ -218,7 +218,7 @@ __noinline int assert_neg_gfunc(s64 c) { volatile s64 cookie = c; - bpf_assert_lt(cookie, 0); + bpf_assert(bpf_cmp_unlikely(cookie, <, 0)); return 0; } @@ -226,7 +226,7 @@ __noinline int assert_pos_gfunc(s64 c) { volatile s64 cookie = c; - bpf_assert_gt(cookie, 0); + bpf_assert(bpf_cmp_unlikely(cookie, >, 0)); return 0; } @@ -234,7 +234,7 @@ __noinline int assert_negeq_gfunc(s64 c) { volatile s64 cookie = c; - bpf_assert_le(cookie, -1); + bpf_assert(bpf_cmp_unlikely(cookie, <=, -1)); return 0; } @@ -242,7 +242,7 @@ __noinline int assert_poseq_gfunc(s64 c) { volatile s64 cookie = c; - bpf_assert_ge(cookie, 1); + bpf_assert(bpf_cmp_unlikely(cookie, >=, 1)); return 0; } @@ -258,7 +258,7 @@ __noinline int assert_zero_gfunc_with(u64 c) { volatile u64 cookie = c; - bpf_assert_eq_with(cookie, 0, cookie + 100); + bpf_assert_with(bpf_cmp_unlikely(cookie, ==, 0), cookie + 100); return 0; } @@ -266,7 +266,7 @@ __noinline int assert_neg_gfunc_with(s64 c) { volatile s64 cookie = c; - bpf_assert_lt_with(cookie, 0, cookie + 100); + bpf_assert_with(bpf_cmp_unlikely(cookie, <, 0), cookie + 100); return 0; } @@ -274,7 +274,7 @@ __noinline int assert_pos_gfunc_with(s64 c) { volatile s64 cookie = c; - bpf_assert_gt_with(cookie, 0, cookie + 100); + bpf_assert_with(bpf_cmp_unlikely(cookie, >, 0), cookie + 100); return 0; } @@ -282,7 +282,7 @@ __noinline int assert_negeq_gfunc_with(s64 c) { volatile s64 cookie = c; - bpf_assert_le_with(cookie, -1, cookie + 100); + bpf_assert_with(bpf_cmp_unlikely(cookie, <=, -1), cookie + 100); return 0; } @@ -290,7 +290,7 @@ __noinline int assert_poseq_gfunc_with(s64 c) { volatile s64 cookie = c; - bpf_assert_ge_with(cookie, 1, cookie + 100); + bpf_assert_with(bpf_cmp_unlikely(cookie, >=, 1), cookie + 100); return 0; } diff --git a/tools/testing/selftests/bpf/progs/iters_task_vma.c b/tools/testing/selftests/bpf/progs/iters_task_vma.c index e085a51d153e..dc0c3691dcc2 100644 --- a/tools/testing/selftests/bpf/progs/iters_task_vma.c +++ b/tools/testing/selftests/bpf/progs/iters_task_vma.c @@ -28,9 +28,8 @@ int iter_task_vma_for_each(const void *ctx) return 0; bpf_for_each(task_vma, vma, task, 0) { - if (seen >= 1000) + if (bpf_cmp_unlikely(seen, >=, 1000)) break; - barrier_var(seen); vm_ranges[seen].vm_start = vma->vm_start; vm_ranges[seen].vm_end = vma->vm_end; -- cgit v1.2.3 From 624cd2a17672f4596fee97a5558bc990778bbcf9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 26 Dec 2023 11:11:45 -0800 Subject: selftests/bpf: Convert exceptions_assert.c to bpf_cmp Convert exceptions_assert.c to bpf_cmp_unlikely() macro. Since bpf_assert(bpf_cmp_unlikely(var, ==, 100)); other code; will generate assembly code: if r1 == 100 goto L2; r0 = 0 call bpf_throw L1: other code; ... L2: goto L1; LLVM generates redundant basic block with extra goto. LLVM will be fixed eventually. Right now it's less efficient than __bpf_assert(var, ==, 100) macro that produces: if r1 == 100 goto L1; r0 = 0 call bpf_throw L1: other code; But extra goto doesn't hurt the verification process. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20231226191148.48536-4-alexei.starovoitov@gmail.com --- .../selftests/bpf/progs/exceptions_assert.c | 80 +++++++++++----------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c index 0ef81040da59..5e0a1ca96d4e 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_assert.c +++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c @@ -11,51 +11,51 @@ #define check_assert(type, op, name, value) \ SEC("?tc") \ __log_level(2) __failure \ - int check_assert_##op##_##name(void *ctx) \ + int check_assert_##name(void *ctx) \ { \ type num = bpf_ktime_get_ns(); \ - bpf_assert_##op(num, value); \ + bpf_assert(bpf_cmp_unlikely(num, op, value)); \ return *(u64 *)num; \ } -__msg(": R0_w=0xffffffff80000000 R10=fp0") -check_assert(s64, eq, int_min, INT_MIN); -__msg(": R0_w=0x7fffffff R10=fp0") -check_assert(s64, eq, int_max, INT_MAX); -__msg(": R0_w=0 R10=fp0") -check_assert(s64, eq, zero, 0); -__msg(": R0_w=0x8000000000000000 R1_w=0x8000000000000000 R10=fp0") -check_assert(s64, eq, llong_min, LLONG_MIN); -__msg(": R0_w=0x7fffffffffffffff R1_w=0x7fffffffffffffff R10=fp0") -check_assert(s64, eq, llong_max, LLONG_MAX); - -__msg(": R0_w=scalar(smax=0x7ffffffe) R10=fp0") -check_assert(s64, lt, pos, INT_MAX); -__msg(": R0_w=scalar(smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") -check_assert(s64, lt, zero, 0); -__msg(": R0_w=scalar(smax=0xffffffff7fffffff,umin=0x8000000000000000,umax=0xffffffff7fffffff,var_off=(0x8000000000000000; 0x7fffffffffffffff))") -check_assert(s64, lt, neg, INT_MIN); - -__msg(": R0_w=scalar(smax=0x7fffffff) R10=fp0") -check_assert(s64, le, pos, INT_MAX); -__msg(": R0_w=scalar(smax=0) R10=fp0") -check_assert(s64, le, zero, 0); -__msg(": R0_w=scalar(smax=0xffffffff80000000,umin=0x8000000000000000,umax=0xffffffff80000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") -check_assert(s64, le, neg, INT_MIN); - -__msg(": R0_w=scalar(smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") -check_assert(s64, gt, pos, INT_MAX); -__msg(": R0_w=scalar(smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") -check_assert(s64, gt, zero, 0); -__msg(": R0_w=scalar(smin=0xffffffff80000001) R10=fp0") -check_assert(s64, gt, neg, INT_MIN); - -__msg(": R0_w=scalar(smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") -check_assert(s64, ge, pos, INT_MAX); -__msg(": R0_w=scalar(smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff)) R10=fp0") -check_assert(s64, ge, zero, 0); -__msg(": R0_w=scalar(smin=0xffffffff80000000) R10=fp0") -check_assert(s64, ge, neg, INT_MIN); +__msg(": R0_w=0xffffffff80000000") +check_assert(s64, ==, eq_int_min, INT_MIN); +__msg(": R0_w=0x7fffffff") +check_assert(s64, ==, eq_int_max, INT_MAX); +__msg(": R0_w=0") +check_assert(s64, ==, eq_zero, 0); +__msg(": R0_w=0x8000000000000000 R1_w=0x8000000000000000") +check_assert(s64, ==, eq_llong_min, LLONG_MIN); +__msg(": R0_w=0x7fffffffffffffff R1_w=0x7fffffffffffffff") +check_assert(s64, ==, eq_llong_max, LLONG_MAX); + +__msg(": R0_w=scalar(id=1,smax=0x7ffffffe)") +check_assert(s64, <, lt_pos, INT_MAX); +__msg(": R0_w=scalar(id=1,smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") +check_assert(s64, <, lt_zero, 0); +__msg(": R0_w=scalar(id=1,smax=0xffffffff7fffffff") +check_assert(s64, <, lt_neg, INT_MIN); + +__msg(": R0_w=scalar(id=1,smax=0x7fffffff)") +check_assert(s64, <=, le_pos, INT_MAX); +__msg(": R0_w=scalar(id=1,smax=0)") +check_assert(s64, <=, le_zero, 0); +__msg(": R0_w=scalar(id=1,smax=0xffffffff80000000") +check_assert(s64, <=, le_neg, INT_MIN); + +__msg(": R0_w=scalar(id=1,smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +check_assert(s64, >, gt_pos, INT_MAX); +__msg(": R0_w=scalar(id=1,smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +check_assert(s64, >, gt_zero, 0); +__msg(": R0_w=scalar(id=1,smin=0xffffffff80000001") +check_assert(s64, >, gt_neg, INT_MIN); + +__msg(": R0_w=scalar(id=1,smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +check_assert(s64, >=, ge_pos, INT_MAX); +__msg(": R0_w=scalar(id=1,smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +check_assert(s64, >=, ge_zero, 0); +__msg(": R0_w=scalar(id=1,smin=0xffffffff80000000") +check_assert(s64, >=, ge_neg, INT_MIN); SEC("?tc") __log_level(2) __failure -- cgit v1.2.3 From 907dbd3ede5ffd4f9519dd1fae2a8a983603bf3b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 26 Dec 2023 11:11:46 -0800 Subject: selftests/bpf: Remove bpf_assert_eq-like macros. Since the last user was converted to bpf_cmp, remove bpf_assert_eq/ne/... macros. __bpf_assert_op() macro is kept for experiments, since it's slightly more efficient than bpf_assert(bpf_cmp_unlikely()) until LLVM is fixed. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20231226191148.48536-5-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/bpf_experimental.h | 150 ------------------------- 1 file changed, 150 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 19ed6c941c1c..2ef9949fbd63 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -341,156 +341,6 @@ l_true: \ */ #define bpf_assert_with(cond, value) if (!(cond)) bpf_throw(value); -/* Description - * Assert that LHS is equal to RHS. This statement updates the known value - * of LHS during verification. Note that RHS must be a constant value, and - * must fit within the data type of LHS. - * Returns - * Void. - * Throws - * An exception with the value zero when the assertion fails. - */ -#define bpf_assert_eq(LHS, RHS) \ - ({ \ - barrier_var(LHS); \ - __bpf_assert_op(LHS, ==, RHS, 0, true); \ - }) - -/* Description - * Assert that LHS is equal to RHS. This statement updates the known value - * of LHS during verification. Note that RHS must be a constant value, and - * must fit within the data type of LHS. - * Returns - * Void. - * Throws - * An exception with the specified value when the assertion fails. - */ -#define bpf_assert_eq_with(LHS, RHS, value) \ - ({ \ - barrier_var(LHS); \ - __bpf_assert_op(LHS, ==, RHS, value, true); \ - }) - -/* Description - * Assert that LHS is less than RHS. This statement updates the known - * bounds of LHS during verification. Note that RHS must be a constant - * value, and must fit within the data type of LHS. - * Returns - * Void. - * Throws - * An exception with the value zero when the assertion fails. - */ -#define bpf_assert_lt(LHS, RHS) \ - ({ \ - barrier_var(LHS); \ - __bpf_assert_op(LHS, <, RHS, 0, false); \ - }) - -/* Description - * Assert that LHS is less than RHS. This statement updates the known - * bounds of LHS during verification. Note that RHS must be a constant - * value, and must fit within the data type of LHS. - * Returns - * Void. - * Throws - * An exception with the specified value when the assertion fails. - */ -#define bpf_assert_lt_with(LHS, RHS, value) \ - ({ \ - barrier_var(LHS); \ - __bpf_assert_op(LHS, <, RHS, value, false); \ - }) - -/* Description - * Assert that LHS is greater than RHS. This statement updates the known - * bounds of LHS during verification. Note that RHS must be a constant - * value, and must fit within the data type of LHS. - * Returns - * Void. - * Throws - * An exception with the value zero when the assertion fails. - */ -#define bpf_assert_gt(LHS, RHS) \ - ({ \ - barrier_var(LHS); \ - __bpf_assert_op(LHS, >, RHS, 0, false); \ - }) - -/* Description - * Assert that LHS is greater than RHS. This statement updates the known - * bounds of LHS during verification. Note that RHS must be a constant - * value, and must fit within the data type of LHS. - * Returns - * Void. - * Throws - * An exception with the specified value when the assertion fails. - */ -#define bpf_assert_gt_with(LHS, RHS, value) \ - ({ \ - barrier_var(LHS); \ - __bpf_assert_op(LHS, >, RHS, value, false); \ - }) - -/* Description - * Assert that LHS is less than or equal to RHS. This statement updates the - * known bounds of LHS during verification. Note that RHS must be a - * constant value, and must fit within the data type of LHS. - * Returns - * Void. - * Throws - * An exception with the value zero when the assertion fails. - */ -#define bpf_assert_le(LHS, RHS) \ - ({ \ - barrier_var(LHS); \ - __bpf_assert_op(LHS, <=, RHS, 0, false); \ - }) - -/* Description - * Assert that LHS is less than or equal to RHS. This statement updates the - * known bounds of LHS during verification. Note that RHS must be a - * constant value, and must fit within the data type of LHS. - * Returns - * Void. - * Throws - * An exception with the specified value when the assertion fails. - */ -#define bpf_assert_le_with(LHS, RHS, value) \ - ({ \ - barrier_var(LHS); \ - __bpf_assert_op(LHS, <=, RHS, value, false); \ - }) - -/* Description - * Assert that LHS is greater than or equal to RHS. This statement updates - * the known bounds of LHS during verification. Note that RHS must be a - * constant value, and must fit within the data type of LHS. - * Returns - * Void. - * Throws - * An exception with the value zero when the assertion fails. - */ -#define bpf_assert_ge(LHS, RHS) \ - ({ \ - barrier_var(LHS); \ - __bpf_assert_op(LHS, >=, RHS, 0, false); \ - }) - -/* Description - * Assert that LHS is greater than or equal to RHS. This statement updates - * the known bounds of LHS during verification. Note that RHS must be a - * constant value, and must fit within the data type of LHS. - * Returns - * Void. - * Throws - * An exception with the specified value when the assertion fails. - */ -#define bpf_assert_ge_with(LHS, RHS, value) \ - ({ \ - barrier_var(LHS); \ - __bpf_assert_op(LHS, >=, RHS, value, false); \ - }) - /* Description * Assert that LHS is in the range [BEG, END] (inclusive of both). This * statement updates the known bounds of LHS during verification. Note -- cgit v1.2.3 From 0bcc62aa9813f519db58df14ddf1d523fa971e62 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 26 Dec 2023 11:11:47 -0800 Subject: bpf: Add bpf_nop_mov() asm macro. bpf_nop_mov(var) asm macro emits nop register move: rX = rX. If 'var' is a scalar and not a fixed constant the verifier will assign ID to it. If it's later spilled the stack slot will carry that ID as well. Hence the range refining comparison "if rX < const" will update all copies including spilled slot. This macro is a temporary workaround until the verifier gets smarter. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231226191148.48536-6-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/bpf_experimental.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 2ef9949fbd63..f44875f8b367 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -323,6 +323,11 @@ l_true: \ }) #endif +#ifndef bpf_nop_mov +#define bpf_nop_mov(var) \ + asm volatile("%[reg]=%[reg]"::[reg]"r"((short)var)) +#endif + /* Description * Assert that a conditional expression is true. * Returns -- cgit v1.2.3 From 7e3811cb998f0e2493677c7daf6cefb4ece27111 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 26 Dec 2023 11:11:48 -0800 Subject: selftests/bpf: Convert profiler.c to bpf_cmp. Convert profiler[123].c to "volatile compare" to compare barrier_var() approach vs bpf_cmp_likely() vs bpf_cmp_unlikely(). bpf_cmp_unlikely() produces correct code, but takes much longer to verify: ./veristat -C -e prog,insns,states before after_with_unlikely Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) ------------------------------------ --------- --------- ------------------ ---------- ---------- ----------------- kprobe__proc_sys_write 1603 19606 +18003 (+1123.08%) 123 1678 +1555 (+1264.23%) kprobe__vfs_link 11815 70305 +58490 (+495.05%) 971 4967 +3996 (+411.53%) kprobe__vfs_symlink 5464 42896 +37432 (+685.07%) 434 3126 +2692 (+620.28%) kprobe_ret__do_filp_open 5641 44578 +38937 (+690.25%) 446 3162 +2716 (+608.97%) raw_tracepoint__sched_process_exec 2770 35962 +33192 (+1198.27%) 226 3121 +2895 (+1280.97%) raw_tracepoint__sched_process_exit 1526 2135 +609 (+39.91%) 133 208 +75 (+56.39%) raw_tracepoint__sched_process_fork 265 337 +72 (+27.17%) 19 24 +5 (+26.32%) tracepoint__syscalls__sys_enter_kill 18782 140407 +121625 (+647.56%) 1286 12176 +10890 (+846.81%) bpf_cmp_likely() is equivalent to barrier_var(): ./veristat -C -e prog,insns,states before after_with_likely Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) ------------------------------------ --------- --------- -------------- ---------- ---------- ------------- kprobe__proc_sys_write 1603 1663 +60 (+3.74%) 123 127 +4 (+3.25%) kprobe__vfs_link 11815 12090 +275 (+2.33%) 971 971 +0 (+0.00%) kprobe__vfs_symlink 5464 5448 -16 (-0.29%) 434 426 -8 (-1.84%) kprobe_ret__do_filp_open 5641 5739 +98 (+1.74%) 446 446 +0 (+0.00%) raw_tracepoint__sched_process_exec 2770 2608 -162 (-5.85%) 226 216 -10 (-4.42%) raw_tracepoint__sched_process_exit 1526 1526 +0 (+0.00%) 133 133 +0 (+0.00%) raw_tracepoint__sched_process_fork 265 265 +0 (+0.00%) 19 19 +0 (+0.00%) tracepoint__syscalls__sys_enter_kill 18782 18970 +188 (+1.00%) 1286 1286 +0 (+0.00%) kprobe__proc_sys_write 2700 2809 +109 (+4.04%) 107 109 +2 (+1.87%) kprobe__vfs_link 12238 12366 +128 (+1.05%) 267 269 +2 (+0.75%) kprobe__vfs_symlink 7139 7365 +226 (+3.17%) 167 175 +8 (+4.79%) kprobe_ret__do_filp_open 7264 7070 -194 (-2.67%) 180 182 +2 (+1.11%) raw_tracepoint__sched_process_exec 3768 3453 -315 (-8.36%) 211 199 -12 (-5.69%) raw_tracepoint__sched_process_exit 3138 3138 +0 (+0.00%) 83 83 +0 (+0.00%) raw_tracepoint__sched_process_fork 265 265 +0 (+0.00%) 19 19 +0 (+0.00%) tracepoint__syscalls__sys_enter_kill 26679 24327 -2352 (-8.82%) 1067 1037 -30 (-2.81%) kprobe__proc_sys_write 1833 1833 +0 (+0.00%) 157 157 +0 (+0.00%) kprobe__vfs_link 9995 10127 +132 (+1.32%) 803 803 +0 (+0.00%) kprobe__vfs_symlink 5606 5672 +66 (+1.18%) 451 451 +0 (+0.00%) kprobe_ret__do_filp_open 5716 5782 +66 (+1.15%) 462 462 +0 (+0.00%) raw_tracepoint__sched_process_exec 3042 3042 +0 (+0.00%) 278 278 +0 (+0.00%) raw_tracepoint__sched_process_exit 1680 1680 +0 (+0.00%) 146 146 +0 (+0.00%) raw_tracepoint__sched_process_fork 299 299 +0 (+0.00%) 25 25 +0 (+0.00%) tracepoint__syscalls__sys_enter_kill 18372 18372 +0 (+0.00%) 1558 1558 +0 (+0.00%) default (mcpu=v3), no_alu32, cpuv4 have similar differences. Note one place where bpf_nop_mov() is used to workaround the verifier lack of link between the scalar register and its spill to stack. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231226191148.48536-7-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/progs/profiler.inc.h | 64 +++++++----------------- 1 file changed, 18 insertions(+), 46 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h index ba99d17dac54..de3b6e4e4d0a 100644 --- a/tools/testing/selftests/bpf/progs/profiler.inc.h +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -7,6 +7,7 @@ #include "profiler.h" #include "err.h" +#include "bpf_experimental.h" #ifndef NULL #define NULL 0 @@ -221,8 +222,7 @@ static INLINE void* read_full_cgroup_path(struct kernfs_node* cgroup_node, return payload; if (cgroup_node == cgroup_root_node) *root_pos = payload - payload_start; - if (filepart_length <= MAX_PATH) { - barrier_var(filepart_length); + if (bpf_cmp_likely(filepart_length, <=, MAX_PATH)) { payload += filepart_length; } cgroup_node = BPF_CORE_READ(cgroup_node, parent); @@ -305,9 +305,7 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, size_t cgroup_root_length = bpf_probe_read_kernel_str(payload, MAX_PATH, BPF_CORE_READ(root_kernfs, name)); - barrier_var(cgroup_root_length); - if (cgroup_root_length <= MAX_PATH) { - barrier_var(cgroup_root_length); + if (bpf_cmp_likely(cgroup_root_length, <=, MAX_PATH)) { cgroup_data->cgroup_root_length = cgroup_root_length; payload += cgroup_root_length; } @@ -315,9 +313,7 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, size_t cgroup_proc_length = bpf_probe_read_kernel_str(payload, MAX_PATH, BPF_CORE_READ(proc_kernfs, name)); - barrier_var(cgroup_proc_length); - if (cgroup_proc_length <= MAX_PATH) { - barrier_var(cgroup_proc_length); + if (bpf_cmp_likely(cgroup_proc_length, <=, MAX_PATH)) { cgroup_data->cgroup_proc_length = cgroup_proc_length; payload += cgroup_proc_length; } @@ -347,9 +343,7 @@ static INLINE void* populate_var_metadata(struct var_metadata_t* metadata, metadata->comm_length = 0; size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm); - barrier_var(comm_length); - if (comm_length <= TASK_COMM_LEN) { - barrier_var(comm_length); + if (bpf_cmp_likely(comm_length, <=, TASK_COMM_LEN)) { metadata->comm_length = comm_length; payload += comm_length; } @@ -494,10 +488,9 @@ read_absolute_file_path_from_dentry(struct dentry* filp_dentry, void* payload) filepart_length = bpf_probe_read_kernel_str(payload, MAX_PATH, BPF_CORE_READ(filp_dentry, d_name.name)); - barrier_var(filepart_length); - if (filepart_length > MAX_PATH) + bpf_nop_mov(filepart_length); + if (bpf_cmp_unlikely(filepart_length, >, MAX_PATH)) break; - barrier_var(filepart_length); payload += filepart_length; length += filepart_length; @@ -579,9 +572,7 @@ ssize_t BPF_KPROBE(kprobe__proc_sys_write, size_t sysctl_val_length = bpf_probe_read_kernel_str(payload, CTL_MAXNAME, buf); - barrier_var(sysctl_val_length); - if (sysctl_val_length <= CTL_MAXNAME) { - barrier_var(sysctl_val_length); + if (bpf_cmp_likely(sysctl_val_length, <=, CTL_MAXNAME)) { sysctl_data->sysctl_val_length = sysctl_val_length; payload += sysctl_val_length; } @@ -590,9 +581,7 @@ ssize_t BPF_KPROBE(kprobe__proc_sys_write, bpf_probe_read_kernel_str(payload, MAX_PATH, BPF_CORE_READ(filp, f_path.dentry, d_name.name)); - barrier_var(sysctl_path_length); - if (sysctl_path_length <= MAX_PATH) { - barrier_var(sysctl_path_length); + if (bpf_cmp_likely(sysctl_path_length, <=, MAX_PATH)) { sysctl_data->sysctl_path_length = sysctl_path_length; payload += sysctl_path_length; } @@ -658,9 +647,7 @@ int raw_tracepoint__sched_process_exit(void* ctx) kill_data->kill_target_cgroup_proc_length = 0; size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm); - barrier_var(comm_length); - if (comm_length <= TASK_COMM_LEN) { - barrier_var(comm_length); + if (bpf_cmp_likely(comm_length, <=, TASK_COMM_LEN)) { kill_data->kill_target_name_length = comm_length; payload += comm_length; } @@ -669,9 +656,7 @@ int raw_tracepoint__sched_process_exit(void* ctx) bpf_probe_read_kernel_str(payload, KILL_TARGET_LEN, BPF_CORE_READ(proc_kernfs, name)); - barrier_var(cgroup_proc_length); - if (cgroup_proc_length <= KILL_TARGET_LEN) { - barrier_var(cgroup_proc_length); + if (bpf_cmp_likely(cgroup_proc_length, <=, KILL_TARGET_LEN)) { kill_data->kill_target_cgroup_proc_length = cgroup_proc_length; payload += cgroup_proc_length; } @@ -731,9 +716,7 @@ int raw_tracepoint__sched_process_exec(struct bpf_raw_tracepoint_args* ctx) const char* filename = BPF_CORE_READ(bprm, filename); size_t bin_path_length = bpf_probe_read_kernel_str(payload, MAX_FILENAME_LEN, filename); - barrier_var(bin_path_length); - if (bin_path_length <= MAX_FILENAME_LEN) { - barrier_var(bin_path_length); + if (bpf_cmp_likely(bin_path_length, <=, MAX_FILENAME_LEN)) { proc_exec_data->bin_path_length = bin_path_length; payload += bin_path_length; } @@ -743,8 +726,7 @@ int raw_tracepoint__sched_process_exec(struct bpf_raw_tracepoint_args* ctx) unsigned int cmdline_length = probe_read_lim(payload, arg_start, arg_end - arg_start, MAX_ARGS_LEN); - if (cmdline_length <= MAX_ARGS_LEN) { - barrier_var(cmdline_length); + if (bpf_cmp_likely(cmdline_length, <=, MAX_ARGS_LEN)) { proc_exec_data->cmdline_length = cmdline_length; payload += cmdline_length; } @@ -821,9 +803,7 @@ int kprobe_ret__do_filp_open(struct pt_regs* ctx) payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload); size_t len = read_absolute_file_path_from_dentry(filp_dentry, payload); - barrier_var(len); - if (len <= MAX_FILEPATH_LENGTH) { - barrier_var(len); + if (bpf_cmp_likely(len, <=, MAX_FILEPATH_LENGTH)) { payload += len; filemod_data->dst_filepath_length = len; } @@ -876,17 +856,13 @@ int BPF_KPROBE(kprobe__vfs_link, payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload); size_t len = read_absolute_file_path_from_dentry(old_dentry, payload); - barrier_var(len); - if (len <= MAX_FILEPATH_LENGTH) { - barrier_var(len); + if (bpf_cmp_likely(len, <=, MAX_FILEPATH_LENGTH)) { payload += len; filemod_data->src_filepath_length = len; } len = read_absolute_file_path_from_dentry(new_dentry, payload); - barrier_var(len); - if (len <= MAX_FILEPATH_LENGTH) { - barrier_var(len); + if (bpf_cmp_likely(len, <=, MAX_FILEPATH_LENGTH)) { payload += len; filemod_data->dst_filepath_length = len; } @@ -936,16 +912,12 @@ int BPF_KPROBE(kprobe__vfs_symlink, struct inode* dir, struct dentry* dentry, size_t len = bpf_probe_read_kernel_str(payload, MAX_FILEPATH_LENGTH, oldname); - barrier_var(len); - if (len <= MAX_FILEPATH_LENGTH) { - barrier_var(len); + if (bpf_cmp_likely(len, <=, MAX_FILEPATH_LENGTH)) { payload += len; filemod_data->src_filepath_length = len; } len = read_absolute_file_path_from_dentry(dentry, payload); - barrier_var(len); - if (len <= MAX_FILEPATH_LENGTH) { - barrier_var(len); + if (bpf_cmp_likely(len, <=, MAX_FILEPATH_LENGTH)) { payload += len; filemod_data->dst_filepath_length = len; } -- cgit v1.2.3 From 16b2f264983dc264c1560cc0170e760dec1bf54f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 Dec 2023 15:23:23 -0800 Subject: bpf: sockmap, fix proto update hook to avoid dup calls When sockets are added to a sockmap or sockhash we allocate and init a psock. Then update the proto ops with sock_map_init_proto the flow is sock_hash_update_common sock_map_link psock = sock_map_psock_get_checked() <-returns existing psock sock_map_init_proto(sk, psock) <- updates sk_proto If the socket is already in a map this results in the sock_map_init_proto being called multiple times on the same socket. We do this because when a socket is added to multiple maps this might result in a new set of BPF programs being attached to the socket requiring an updated ops struct. This creates a rule where it must be safe to call psock_update_sk_prot multiple times. When we added a fix for UAF through unix sockets in patch 4dd9a38a753fc we broke this rule by adding a sock_hold in that path to ensure the sock is not released. The result is if a af_unix stream sock is placed in multiple maps it results in a memory leak because we call sock_hold multiple times with only a single sock_put on it. Fixes: 8866730aed51 ("bpf, sockmap: af_unix stream sockets need to hold ref for pair sock") Reported-by: Xingwei Lee Signed-off-by: John Fastabend Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20231221232327.43678-2-john.fastabend@gmail.com --- net/unix/unix_bpf.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index 7ea7c3a0d0d0..bd84785bf8d6 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -161,15 +161,30 @@ int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool r { struct sock *sk_pair; + /* Restore does not decrement the sk_pair reference yet because we must + * keep the a reference to the socket until after an RCU grace period + * and any pending sends have completed. + */ if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } - sk_pair = unix_peer(sk); - sock_hold(sk_pair); - psock->sk_pair = sk_pair; + /* psock_update_sk_prot can be called multiple times if psock is + * added to multiple maps and/or slots in the same map. There is + * also an edge case where replacing a psock with itself can trigger + * an extra psock_update_sk_prot during the insert process. So it + * must be safe to do multiple calls. Here we need to ensure we don't + * increment the refcnt through sock_hold many times. There will only + * be a single matching destroy operation. + */ + if (!psock->sk_pair) { + sk_pair = unix_peer(sk); + sock_hold(sk_pair); + psock->sk_pair = sk_pair; + } + unix_stream_bpf_check_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &unix_stream_bpf_prot); return 0; -- cgit v1.2.3 From 7865dfb1eb941ddd25802a9e13b6ff5f3f4dc02f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 Dec 2023 15:23:24 -0800 Subject: bpf: sockmap, added comments describing update proto rules Add a comment describing that the psock update proto callbback can be called multiple times and this must be safe. Signed-off-by: John Fastabend Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20231221232327.43678-3-john.fastabend@gmail.com --- include/linux/skmsg.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c953b8c0d2f4..888a4b217829 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -100,6 +100,11 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); + /* psock_update_sk_prot may be called with restore=false many times + * so the handler must be safe for this case. It will be called + * exactly once with restore=true when the psock is being destroyed + * and psock refcnt is zero, but before an RCU grace period. + */ int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); struct proto *sk_proto; -- cgit v1.2.3 From 8c1b382a555adcd2008ae964047a35b739dfaf24 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 Dec 2023 15:23:25 -0800 Subject: bpf: sockmap, add tests for proto updates many to single map Add test with a single map where each socket is inserted multiple times. Test protocols: TCP, UDP, stream af_unix and dgram af_unix. Signed-off-by: John Fastabend Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20231221232327.43678-4-john.fastabend@gmail.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 71 +++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 7c2241fae19a..ebd05788cfa0 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -555,6 +555,74 @@ static void test_sockmap_unconnected_unix(void) close(dgram); } +static void test_sockmap_many_socket(void) +{ + struct test_sockmap_pass_prog *skel; + int stream[2], dgram, udp, tcp; + int i, err, map, entry = 0; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + map = bpf_map__fd(skel->maps.sock_map_rx); + + dgram = xsocket(AF_UNIX, SOCK_DGRAM, 0); + if (dgram < 0) { + test_sockmap_pass_prog__destroy(skel); + return; + } + + tcp = connected_socket_v4(); + if (!ASSERT_GE(tcp, 0, "connected_socket_v4")) { + close(dgram); + test_sockmap_pass_prog__destroy(skel); + return; + } + + udp = xsocket(AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, 0); + if (udp < 0) { + close(dgram); + close(tcp); + test_sockmap_pass_prog__destroy(skel); + return; + } + + err = socketpair(AF_UNIX, SOCK_STREAM, 0, stream); + ASSERT_OK(err, "socketpair(af_unix, sock_stream)"); + if (err) + goto out; + + for (i = 0; i < 2; i++, entry++) { + err = bpf_map_update_elem(map, &entry, &stream[0], BPF_ANY); + ASSERT_OK(err, "bpf_map_update_elem(stream)"); + } + for (i = 0; i < 2; i++, entry++) { + err = bpf_map_update_elem(map, &entry, &dgram, BPF_ANY); + ASSERT_OK(err, "bpf_map_update_elem(dgram)"); + } + for (i = 0; i < 2; i++, entry++) { + err = bpf_map_update_elem(map, &entry, &udp, BPF_ANY); + ASSERT_OK(err, "bpf_map_update_elem(udp)"); + } + for (i = 0; i < 2; i++, entry++) { + err = bpf_map_update_elem(map, &entry, &tcp, BPF_ANY); + ASSERT_OK(err, "bpf_map_update_elem(tcp)"); + } + for (entry--; entry >= 0; entry--) { + err = bpf_map_delete_elem(map, &entry); + ASSERT_OK(err, "bpf_map_delete_elem(entry)"); + } + + close(stream[0]); + close(stream[1]); +out: + close(dgram); + close(tcp); + close(udp); + test_sockmap_pass_prog__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -597,7 +665,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_fionread(false); if (test__start_subtest("sockmap skb_verdict msg_f_peek")) test_sockmap_skb_verdict_peek(); - if (test__start_subtest("sockmap unconnected af_unix")) test_sockmap_unconnected_unix(); + if (test__start_subtest("sockmap one socket to many map entries")) + test_sockmap_many_socket(); } -- cgit v1.2.3 From f1300467dd9f67293a7aae86fd26471520fac36d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 Dec 2023 15:23:26 -0800 Subject: bpf: sockmap, add tests for proto updates single socket to many map Add test with multiple maps where each socket is inserted in multiple maps. Test protocols: TCP, UDP, stream af_unix and dgram af_unix. Signed-off-by: John Fastabend Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20231221232327.43678-5-john.fastabend@gmail.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 74 ++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index ebd05788cfa0..74bca07ebec2 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -623,6 +623,78 @@ out: test_sockmap_pass_prog__destroy(skel); } +static void test_sockmap_many_maps(void) +{ + struct test_sockmap_pass_prog *skel; + int stream[2], dgram, udp, tcp; + int i, err, map[2], entry = 0; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + map[0] = bpf_map__fd(skel->maps.sock_map_rx); + map[1] = bpf_map__fd(skel->maps.sock_map_tx); + + dgram = xsocket(AF_UNIX, SOCK_DGRAM, 0); + if (dgram < 0) { + test_sockmap_pass_prog__destroy(skel); + return; + } + + tcp = connected_socket_v4(); + if (!ASSERT_GE(tcp, 0, "connected_socket_v4")) { + close(dgram); + test_sockmap_pass_prog__destroy(skel); + return; + } + + udp = xsocket(AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, 0); + if (udp < 0) { + close(dgram); + close(tcp); + test_sockmap_pass_prog__destroy(skel); + return; + } + + err = socketpair(AF_UNIX, SOCK_STREAM, 0, stream); + ASSERT_OK(err, "socketpair(af_unix, sock_stream)"); + if (err) + goto out; + + for (i = 0; i < 2; i++, entry++) { + err = bpf_map_update_elem(map[i], &entry, &stream[0], BPF_ANY); + ASSERT_OK(err, "bpf_map_update_elem(stream)"); + } + for (i = 0; i < 2; i++, entry++) { + err = bpf_map_update_elem(map[i], &entry, &dgram, BPF_ANY); + ASSERT_OK(err, "bpf_map_update_elem(dgram)"); + } + for (i = 0; i < 2; i++, entry++) { + err = bpf_map_update_elem(map[i], &entry, &udp, BPF_ANY); + ASSERT_OK(err, "bpf_map_update_elem(udp)"); + } + for (i = 0; i < 2; i++, entry++) { + err = bpf_map_update_elem(map[i], &entry, &tcp, BPF_ANY); + ASSERT_OK(err, "bpf_map_update_elem(tcp)"); + } + for (entry--; entry >= 0; entry--) { + err = bpf_map_delete_elem(map[1], &entry); + entry--; + ASSERT_OK(err, "bpf_map_delete_elem(entry)"); + err = bpf_map_delete_elem(map[0], &entry); + ASSERT_OK(err, "bpf_map_delete_elem(entry)"); + } + + close(stream[0]); + close(stream[1]); +out: + close(dgram); + close(tcp); + close(udp); + test_sockmap_pass_prog__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -669,4 +741,6 @@ void test_sockmap_basic(void) test_sockmap_unconnected_unix(); if (test__start_subtest("sockmap one socket to many map entries")) test_sockmap_many_socket(); + if (test__start_subtest("sockmap one socket to many maps")) + test_sockmap_many_maps(); } -- cgit v1.2.3 From bdbca46d3f84a4455cd5c15a7483666218851549 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 Dec 2023 15:23:27 -0800 Subject: bpf: sockmap, add tests for proto updates replace socket Add test that replaces the same socket with itself. This exercises a corner case where old element and new element have the same posck. Test protocols: TCP, UDP, stream af_unix and dgram af_unix. Signed-off-by: John Fastabend Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20231221232327.43678-6-john.fastabend@gmail.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 69 ++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 74bca07ebec2..77e26ecffa9d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -695,6 +695,73 @@ out: test_sockmap_pass_prog__destroy(skel); } +static void test_sockmap_same_sock(void) +{ + struct test_sockmap_pass_prog *skel; + int stream[2], dgram, udp, tcp; + int i, err, map, zero = 0; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + map = bpf_map__fd(skel->maps.sock_map_rx); + + dgram = xsocket(AF_UNIX, SOCK_DGRAM, 0); + if (dgram < 0) { + test_sockmap_pass_prog__destroy(skel); + return; + } + + tcp = connected_socket_v4(); + if (!ASSERT_GE(tcp, 0, "connected_socket_v4")) { + close(dgram); + test_sockmap_pass_prog__destroy(skel); + return; + } + + udp = xsocket(AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, 0); + if (udp < 0) { + close(dgram); + close(tcp); + test_sockmap_pass_prog__destroy(skel); + return; + } + + err = socketpair(AF_UNIX, SOCK_STREAM, 0, stream); + ASSERT_OK(err, "socketpair(af_unix, sock_stream)"); + if (err) + goto out; + + for (i = 0; i < 2; i++) { + err = bpf_map_update_elem(map, &zero, &stream[0], BPF_ANY); + ASSERT_OK(err, "bpf_map_update_elem(stream)"); + } + for (i = 0; i < 2; i++) { + err = bpf_map_update_elem(map, &zero, &dgram, BPF_ANY); + ASSERT_OK(err, "bpf_map_update_elem(dgram)"); + } + for (i = 0; i < 2; i++) { + err = bpf_map_update_elem(map, &zero, &udp, BPF_ANY); + ASSERT_OK(err, "bpf_map_update_elem(udp)"); + } + for (i = 0; i < 2; i++) { + err = bpf_map_update_elem(map, &zero, &tcp, BPF_ANY); + ASSERT_OK(err, "bpf_map_update_elem(tcp)"); + } + + err = bpf_map_delete_elem(map, &zero); + ASSERT_OK(err, "bpf_map_delete_elem(entry)"); + + close(stream[0]); + close(stream[1]); +out: + close(dgram); + close(tcp); + close(udp); + test_sockmap_pass_prog__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -743,4 +810,6 @@ void test_sockmap_basic(void) test_sockmap_many_socket(); if (test__start_subtest("sockmap one socket to many maps")) test_sockmap_many_maps(); + if (test__start_subtest("sockmap same socket replace")) + test_sockmap_same_sock(); } -- cgit v1.2.3 From 9beda16c257d55213f70adee2f16d7f13a8502e1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:34 -0800 Subject: bpf: Avoid unnecessary extra percpu memory allocation Currently, for percpu memory allocation, say if the user requests allocation size to be 32 bytes, the actually calculated size will be 40 bytes and it further rounds to 64 bytes, and eventually 64 bytes are allocated, wasting 32-byte memory. Change bpf_mem_alloc() to calculate the cache index based on the user-provided allocation size so unnecessary extra memory can be avoided. Suggested-by: Hou Tao Acked-by: Hou Tao Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231222031734.1288400-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index aa0fbf000a12..288ec4a967d0 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -833,7 +833,9 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) if (!size) return NULL; - idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ); + if (!ma->percpu) + size += LLIST_NODE_SZ; + idx = bpf_mem_cache_idx(size); if (idx < 0) return NULL; -- cgit v1.2.3 From 9fc8e802048ad150e8032c4f3dbf40112160cfe9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:39 -0800 Subject: bpf: Add objcg to bpf_mem_alloc The objcg is a bpf_mem_alloc level property since all bpf_mem_cache's are with the same objcg. This patch made such a property explicit. The next patch will use this property to save and restore objcg for percpu unit allocator. Acked-by: Hou Tao Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231222031739.1288590-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 1 + kernel/bpf/memalloc.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index bb1223b21308..acef8c808599 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -11,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + struct obj_cgroup *objcg; bool percpu; struct work_struct work; }; diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 288ec4a967d0..4a21050f0359 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -523,6 +523,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) if (memcg_bpf_enabled()) objcg = get_obj_cgroup_from_current(); #endif + ma->objcg = objcg; for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); c->unit_size = unit_size; @@ -542,6 +543,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) #ifdef CONFIG_MEMCG_KMEM objcg = get_obj_cgroup_from_current(); #endif + ma->objcg = objcg; for_each_possible_cpu(cpu) { cc = per_cpu_ptr(pcc, cpu); for (i = 0; i < NUM_CACHES; i++) { @@ -691,9 +693,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } - /* objcg is the same across cpus */ - if (c->objcg) - obj_cgroup_put(c->objcg); + if (ma->objcg) + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } if (ma->caches) { @@ -709,8 +710,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } - if (c->objcg) - obj_cgroup_put(c->objcg); + if (ma->objcg) + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } } -- cgit v1.2.3 From c39aa3b289e9c10d0d246cd919b06809f13b72b8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:45 -0800 Subject: bpf: Allow per unit prefill for non-fix-size percpu memory allocator Commit 41a5db8d8161 ("Add support for non-fix-size percpu mem allocation") added support for non-fix-size percpu memory allocation. Such allocation will allocate percpu memory for all buckets on all cpus and the memory consumption is in the order to quadratic. For example, let us say, 4 cpus, unit size 16 bytes, so each cpu has 16 * 4 = 64 bytes, with 4 cpus, total will be 64 * 4 = 256 bytes. Then let us say, 8 cpus with the same unit size, each cpu has 16 * 8 = 128 bytes, with 8 cpus, total will be 128 * 8 = 1024 bytes. So if the number of cpus doubles, the number of memory consumption will be 4 times. So for a system with large number of cpus, the memory consumption goes up quickly with quadratic order. For example, for 4KB percpu allocation, 128 cpus. The total memory consumption will 4KB * 128 * 128 = 64MB. Things will become worse if the number of cpus is bigger (e.g., 512, 1024, etc.) In Commit 41a5db8d8161, the non-fix-size percpu memory allocation is done in boot time, so for system with large number of cpus, the initial percpu memory consumption is very visible. For example, for 128 cpu system, the total percpu memory allocation will be at least (16 + 32 + 64 + 96 + 128 + 196 + 256 + 512 + 1024 + 2048 + 4096) * 128 * 128 = ~138MB. which is pretty big. It will be even bigger for larger number of cpus. Note that the current prefill also allocates 4 entries if the unit size is less than 256. So on top of 138MB memory consumption, this will add more consumption with 3 * (16 + 32 + 64 + 96 + 128 + 196 + 256) * 128 * 128 = ~38MB. Next patch will try to reduce this memory consumption. Later on, Commit 1fda5bb66ad8 ("bpf: Do not allocate percpu memory at init stage") moved the non-fix-size percpu memory allocation to bpf verificaiton stage. Once a particular bpf_percpu_obj_new() is called by bpf program, the memory allocator will try to fill in the cache with all sizes, causing the same amount of percpu memory consumption as in the boot stage. To reduce the initial percpu memory consumption for non-fix-size percpu memory allocation, instead of filling the cache with all supported allocation sizes, this patch intends to fill the cache only for the requested size. As typically users will not use large percpu data structure, this can save memory significantly. For example, the allocation size is 64 bytes with 128 cpus. Then total percpu memory amount will be 64 * 128 * 128 = 1MB, much less than previous 138MB. Signed-off-by: Yonghong Song Acked-by: Hou Tao Link: https://lore.kernel.org/r/20231222031745.1289082-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 7 ++++++ kernel/bpf/memalloc.c | 57 ++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/verifier.c | 37 +++++++++++++++++----------- 3 files changed, 86 insertions(+), 15 deletions(-) diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index acef8c808599..aaf004d94322 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -22,8 +22,15 @@ struct bpf_mem_alloc { * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects. * Alloc and free are done with bpf_mem_{alloc,free}() and the size of * the returned object is given by the size argument of bpf_mem_alloc(). + * If percpu equals true, error will be returned in order to avoid + * large memory consumption and the below bpf_mem_alloc_percpu_unit_init() + * should be used to do on-demand per-cpu allocation for each size. */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); +/* Initialize a non-fix-size percpu memory allocator */ +int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg); +/* The percpu allocation with a specific unit size. */ +int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); /* kmalloc/kfree equivalent: */ diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 4a21050f0359..f71da07eb8a0 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -121,6 +121,8 @@ struct bpf_mem_caches { struct bpf_mem_cache cache[NUM_CACHES]; }; +static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + static struct llist_node notrace *__llist_del_first(struct llist_head *head) { struct llist_node *entry, *next; @@ -499,12 +501,14 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { - static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; struct bpf_mem_caches *cc, __percpu *pcc; struct bpf_mem_cache *c, __percpu *pc; struct obj_cgroup *objcg = NULL; int cpu, i, unit_size, percpu_size = 0; + if (percpu && size == 0) + return -EINVAL; + /* room for llist_node and per-cpu pointer */ if (percpu) percpu_size = LLIST_NODE_SZ + sizeof(void *); @@ -524,6 +528,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) objcg = get_obj_cgroup_from_current(); #endif ma->objcg = objcg; + for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); c->unit_size = unit_size; @@ -562,6 +567,56 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) return 0; } +int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg) +{ + struct bpf_mem_caches __percpu *pcc; + + pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL); + if (!pcc) + return -ENOMEM; + + ma->caches = pcc; + ma->objcg = objcg; + ma->percpu = true; + return 0; +} + +int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) +{ + struct bpf_mem_caches *cc, __percpu *pcc; + int cpu, i, unit_size, percpu_size; + struct obj_cgroup *objcg; + struct bpf_mem_cache *c; + + i = bpf_mem_cache_idx(size); + if (i < 0) + return -EINVAL; + + /* room for llist_node and per-cpu pointer */ + percpu_size = LLIST_NODE_SZ + sizeof(void *); + + unit_size = sizes[i]; + objcg = ma->objcg; + pcc = ma->caches; + + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(pcc, cpu); + c = &cc->cache[i]; + if (cpu == 0 && c->unit_size) + break; + + c->unit_size = unit_size; + c->objcg = objcg; + c->percpu_size = percpu_size; + c->tgt = c; + + init_refill_work(c); + prefill_mem_cache(c, cpu); + } + + return 0; +} + static void drain_mem_cache(struct bpf_mem_cache *c) { bool percpu = !!c->percpu_size; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4e31f61de0e..e9699a2cfe4f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12139,20 +12139,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) return -ENOMEM; - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - if (!bpf_global_percpu_ma_set) { - mutex_lock(&bpf_percpu_ma_lock); - if (!bpf_global_percpu_ma_set) { - err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); - if (!err) - bpf_global_percpu_ma_set = true; - } - mutex_unlock(&bpf_percpu_ma_lock); - if (err) - return err; - } - } - if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); return -EINVAL; @@ -12173,6 +12159,29 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + /* Charge memory allocated with bpf_global_percpu_ma to + * root memcg. The obj_cgroup for root memcg is NULL. + */ + err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + mutex_lock(&bpf_percpu_ma_lock); + err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { -- cgit v1.2.3 From 5b95e638f134e552b5ba2976326c02babe248615 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:50 -0800 Subject: bpf: Refill only one percpu element in memalloc Typically for percpu map element or data structure, once allocated, most operations are lookup or in-place update. Deletion are really rare. Currently, for percpu data strcture, 4 elements will be refilled if the size is <= 256. Let us just do with one element for percpu data. For example, for size 256 and 128 cpus, the potential saving will be 3 * 256 * 128 * 128 = 12MB. Acked-by: Hou Tao Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231222031750.1289290-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index f71da07eb8a0..a8ee6fb8401c 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -485,11 +485,16 @@ static void init_refill_work(struct bpf_mem_cache *c) static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) { - /* To avoid consuming memory assume that 1st run of bpf - * prog won't be doing more than 4 map_update_elem from - * irq disabled region + int cnt = 1; + + /* To avoid consuming memory, for non-percpu allocation, assume that + * 1st run of bpf prog won't be doing more than 4 map_update_elem from + * irq disabled region if unit size is less than or equal to 256. + * For all other cases, let us just do one allocation. */ - alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false); + if (!c->percpu_size && c->unit_size <= 256) + cnt = 4; + alloc_bulk(c, cnt, cpu_to_node(cpu), false); } /* When size != 0 bpf_mem_cache for each cpu. -- cgit v1.2.3 From 0e2ba9f96f9b82893ba19170ae48d46003f8ef44 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:55 -0800 Subject: bpf: Use smaller low/high marks for percpu allocation Currently, refill low/high marks are set with the assumption of normal non-percpu memory allocation. For example, for an allocation size 256, for non-percpu memory allocation, low mark is 32 and high mark is 96, resulting in the batch allocation of 48 elements and the allocated memory will be 48 * 256 = 12KB for this particular cpu. Assuming an 128-cpu system, the total memory consumption across all cpus will be 12K * 128 = 1.5MB memory. This might be okay for non-percpu allocation, but may not be good for percpu allocation, which will consume 1.5MB * 128 = 192MB memory in the worst case if every cpu has a chance of memory allocation. In practice, percpu allocation is very rare compared to non-percpu allocation. So let us have smaller low/high marks which can avoid unnecessary memory consumption. Signed-off-by: Yonghong Song Acked-by: Hou Tao Link: https://lore.kernel.org/r/20231222031755.1289671-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index a8ee6fb8401c..460c8f38fed6 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -464,11 +464,17 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c) * consume ~ 11 Kbyte per cpu. * Typical case will be between 11K and 116K closer to 11K. * bpf progs can and should share bpf_mem_cache when possible. + * + * Percpu allocation is typically rare. To avoid potential unnecessary large + * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1. */ static void init_refill_work(struct bpf_mem_cache *c) { init_irq_work(&c->refill_work, bpf_mem_refill); - if (c->unit_size <= 256) { + if (c->percpu_size) { + c->low_watermark = 1; + c->high_watermark = 3; + } else if (c->unit_size <= 256) { c->low_watermark = 32; c->high_watermark = 96; } else { -- cgit v1.2.3 From 5c1a37653260ed5d9c8b26fb7fe7b99629612982 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:18:01 -0800 Subject: bpf: Limit up to 512 bytes for bpf_global_percpu_ma allocation For percpu data structure allocation with bpf_global_percpu_ma, the maximum data size is 4K. But for a system with large number of cpus, bigger data size (e.g., 2K, 4K) might consume a lot of memory. For example, the percpu memory consumption with unit size 2K and 1024 cpus will be 2K * 1K * 1k = 2GB memory. We should discourage such usage. Let us limit the maximum data size to be 512 for bpf_global_percpu_ma allocation. Acked-by: Hou Tao Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231222031801.1290841-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e9699a2cfe4f..d5f4ff1eb235 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -195,6 +195,8 @@ struct bpf_verifier_stack_elem { POISON_POINTER_DELTA)) #define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) +#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 + static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); @@ -12160,6 +12162,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { + verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", + ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); + return -EINVAL; + } + if (!bpf_global_percpu_ma_set) { mutex_lock(&bpf_percpu_ma_lock); if (!bpf_global_percpu_ma_set) { -- cgit v1.2.3 From 21f5a801c171dff4e728e38f62cf626c4197d07c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:18:07 -0800 Subject: selftests/bpf: Cope with 512 bytes limit with bpf_global_percpu_ma In the previous patch, the maximum data size for bpf_global_percpu_ma is 512 bytes. This breaks selftest test_bpf_ma. The test is adjusted in two aspects: - Since the maximum allowed data size for bpf_global_percpu_ma is 512, remove all tests beyond that, names sizes 1024, 2048 and 4096. - Previously the percpu data size is bucket_size - 8 in order to avoid percpu allocation into the next bucket. This patch removed such data size adjustment thanks to Patch 1. Also, a better way to generate BTF type is used than adding a member to the value struct. Acked-by: Hou Tao Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231222031807.1292853-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/test_bpf_ma.c | 20 ++++--- tools/testing/selftests/bpf/progs/test_bpf_ma.c | 66 +++++++++++----------- 2 files changed, 46 insertions(+), 40 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c b/tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c index d3491a84b3b9..ccae0b31ac6c 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c +++ b/tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c @@ -14,7 +14,8 @@ static void do_bpf_ma_test(const char *name) struct test_bpf_ma *skel; struct bpf_program *prog; struct btf *btf; - int i, err; + int i, err, id; + char tname[32]; skel = test_bpf_ma__open(); if (!ASSERT_OK_PTR(skel, "open")) @@ -25,16 +26,21 @@ static void do_bpf_ma_test(const char *name) goto out; for (i = 0; i < ARRAY_SIZE(skel->rodata->data_sizes); i++) { - char name[32]; - int id; - - snprintf(name, sizeof(name), "bin_data_%u", skel->rodata->data_sizes[i]); - id = btf__find_by_name_kind(btf, name, BTF_KIND_STRUCT); - if (!ASSERT_GT(id, 0, "bin_data")) + snprintf(tname, sizeof(tname), "bin_data_%u", skel->rodata->data_sizes[i]); + id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT); + if (!ASSERT_GT(id, 0, tname)) goto out; skel->rodata->data_btf_ids[i] = id; } + for (i = 0; i < ARRAY_SIZE(skel->rodata->percpu_data_sizes); i++) { + snprintf(tname, sizeof(tname), "percpu_bin_data_%u", skel->rodata->percpu_data_sizes[i]); + id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT); + if (!ASSERT_GT(id, 0, tname)) + goto out; + skel->rodata->percpu_data_btf_ids[i] = id; + } + prog = bpf_object__find_program_by_name(skel->obj, name); if (!ASSERT_OK_PTR(prog, "invalid prog name")) goto out; diff --git a/tools/testing/selftests/bpf/progs/test_bpf_ma.c b/tools/testing/selftests/bpf/progs/test_bpf_ma.c index b78f4f702ae0..3494ca30fa7f 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_ma.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_ma.c @@ -20,6 +20,9 @@ char _license[] SEC("license") = "GPL"; const unsigned int data_sizes[] = {16, 32, 64, 96, 128, 192, 256, 512, 1024, 2048, 4096}; const volatile unsigned int data_btf_ids[ARRAY_SIZE(data_sizes)] = {}; +const unsigned int percpu_data_sizes[] = {8, 16, 32, 64, 96, 128, 192, 256, 512}; +const volatile unsigned int percpu_data_btf_ids[ARRAY_SIZE(data_sizes)] = {}; + int err = 0; u32 pid = 0; @@ -27,10 +30,10 @@ u32 pid = 0; struct bin_data_##_size { \ char data[_size - sizeof(void *)]; \ }; \ + /* See Commit 5d8d6634ccc, force btf generation for type bin_data_##_size */ \ + struct bin_data_##_size *__bin_data_##_size; \ struct map_value_##_size { \ struct bin_data_##_size __kptr * data; \ - /* To emit BTF info for bin_data_xx */ \ - struct bin_data_##_size not_used; \ }; \ struct { \ __uint(type, BPF_MAP_TYPE_ARRAY); \ @@ -40,8 +43,12 @@ u32 pid = 0; } array_##_size SEC(".maps") #define DEFINE_ARRAY_WITH_PERCPU_KPTR(_size) \ + struct percpu_bin_data_##_size { \ + char data[_size]; \ + }; \ + struct percpu_bin_data_##_size *__percpu_bin_data_##_size; \ struct map_value_percpu_##_size { \ - struct bin_data_##_size __percpu_kptr * data; \ + struct percpu_bin_data_##_size __percpu_kptr * data; \ }; \ struct { \ __uint(type, BPF_MAP_TYPE_ARRAY); \ @@ -114,7 +121,7 @@ static __always_inline void batch_percpu_alloc(struct bpf_map *map, unsigned int return; } /* per-cpu allocator may not be able to refill in time */ - new = bpf_percpu_obj_new_impl(data_btf_ids[idx], NULL); + new = bpf_percpu_obj_new_impl(percpu_data_btf_ids[idx], NULL); if (!new) continue; @@ -179,7 +186,7 @@ DEFINE_ARRAY_WITH_KPTR(1024); DEFINE_ARRAY_WITH_KPTR(2048); DEFINE_ARRAY_WITH_KPTR(4096); -/* per-cpu kptr doesn't support bin_data_8 which is a zero-sized array */ +DEFINE_ARRAY_WITH_PERCPU_KPTR(8); DEFINE_ARRAY_WITH_PERCPU_KPTR(16); DEFINE_ARRAY_WITH_PERCPU_KPTR(32); DEFINE_ARRAY_WITH_PERCPU_KPTR(64); @@ -188,9 +195,6 @@ DEFINE_ARRAY_WITH_PERCPU_KPTR(128); DEFINE_ARRAY_WITH_PERCPU_KPTR(192); DEFINE_ARRAY_WITH_PERCPU_KPTR(256); DEFINE_ARRAY_WITH_PERCPU_KPTR(512); -DEFINE_ARRAY_WITH_PERCPU_KPTR(1024); -DEFINE_ARRAY_WITH_PERCPU_KPTR(2048); -DEFINE_ARRAY_WITH_PERCPU_KPTR(4096); SEC("?fentry/" SYS_PREFIX "sys_nanosleep") int test_batch_alloc_free(void *ctx) @@ -246,20 +250,18 @@ int test_batch_percpu_alloc_free(void *ctx) if ((u32)bpf_get_current_pid_tgid() != pid) return 0; - /* Alloc 128 16-bytes per-cpu objects in batch to trigger refilling, - * then free 128 16-bytes per-cpu objects in batch to trigger freeing. + /* Alloc 128 8-bytes per-cpu objects in batch to trigger refilling, + * then free 128 8-bytes per-cpu objects in batch to trigger freeing. */ - CALL_BATCH_PERCPU_ALLOC_FREE(16, 128, 0); - CALL_BATCH_PERCPU_ALLOC_FREE(32, 128, 1); - CALL_BATCH_PERCPU_ALLOC_FREE(64, 128, 2); - CALL_BATCH_PERCPU_ALLOC_FREE(96, 128, 3); - CALL_BATCH_PERCPU_ALLOC_FREE(128, 128, 4); - CALL_BATCH_PERCPU_ALLOC_FREE(192, 128, 5); - CALL_BATCH_PERCPU_ALLOC_FREE(256, 128, 6); - CALL_BATCH_PERCPU_ALLOC_FREE(512, 64, 7); - CALL_BATCH_PERCPU_ALLOC_FREE(1024, 32, 8); - CALL_BATCH_PERCPU_ALLOC_FREE(2048, 16, 9); - CALL_BATCH_PERCPU_ALLOC_FREE(4096, 8, 10); + CALL_BATCH_PERCPU_ALLOC_FREE(8, 128, 0); + CALL_BATCH_PERCPU_ALLOC_FREE(16, 128, 1); + CALL_BATCH_PERCPU_ALLOC_FREE(32, 128, 2); + CALL_BATCH_PERCPU_ALLOC_FREE(64, 128, 3); + CALL_BATCH_PERCPU_ALLOC_FREE(96, 128, 4); + CALL_BATCH_PERCPU_ALLOC_FREE(128, 128, 5); + CALL_BATCH_PERCPU_ALLOC_FREE(192, 128, 6); + CALL_BATCH_PERCPU_ALLOC_FREE(256, 128, 7); + CALL_BATCH_PERCPU_ALLOC_FREE(512, 64, 8); return 0; } @@ -270,20 +272,18 @@ int test_percpu_free_through_map_free(void *ctx) if ((u32)bpf_get_current_pid_tgid() != pid) return 0; - /* Alloc 128 16-bytes per-cpu objects in batch to trigger refilling, + /* Alloc 128 8-bytes per-cpu objects in batch to trigger refilling, * then free these object through map free. */ - CALL_BATCH_PERCPU_ALLOC(16, 128, 0); - CALL_BATCH_PERCPU_ALLOC(32, 128, 1); - CALL_BATCH_PERCPU_ALLOC(64, 128, 2); - CALL_BATCH_PERCPU_ALLOC(96, 128, 3); - CALL_BATCH_PERCPU_ALLOC(128, 128, 4); - CALL_BATCH_PERCPU_ALLOC(192, 128, 5); - CALL_BATCH_PERCPU_ALLOC(256, 128, 6); - CALL_BATCH_PERCPU_ALLOC(512, 64, 7); - CALL_BATCH_PERCPU_ALLOC(1024, 32, 8); - CALL_BATCH_PERCPU_ALLOC(2048, 16, 9); - CALL_BATCH_PERCPU_ALLOC(4096, 8, 10); + CALL_BATCH_PERCPU_ALLOC(8, 128, 0); + CALL_BATCH_PERCPU_ALLOC(16, 128, 1); + CALL_BATCH_PERCPU_ALLOC(32, 128, 2); + CALL_BATCH_PERCPU_ALLOC(64, 128, 3); + CALL_BATCH_PERCPU_ALLOC(96, 128, 4); + CALL_BATCH_PERCPU_ALLOC(128, 128, 5); + CALL_BATCH_PERCPU_ALLOC(192, 128, 6); + CALL_BATCH_PERCPU_ALLOC(256, 128, 7); + CALL_BATCH_PERCPU_ALLOC(512, 64, 8); return 0; } -- cgit v1.2.3 From adc8c4549d9e74d2359c217d2478b18ecdd15c91 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:18:12 -0800 Subject: selftests/bpf: Add a selftest with > 512-byte percpu allocation size Add a selftest to capture the verification failure when the allocation size is greater than 512. Acked-by: Hou Tao Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231222031812.1293190-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/percpu_alloc_fail.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c b/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c index 1a891d30f1fe..f2b8eb2ff76f 100644 --- a/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c +++ b/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c @@ -17,6 +17,10 @@ struct val_with_rb_root_t { struct bpf_spin_lock lock; }; +struct val_600b_t { + char b[600]; +}; + struct elem { long sum; struct val_t __percpu_kptr *pc; @@ -161,4 +165,18 @@ int BPF_PROG(test_array_map_7) return 0; } +SEC("?fentry.s/bpf_fentry_test1") +__failure __msg("bpf_percpu_obj_new type size (600) is greater than 512") +int BPF_PROG(test_array_map_8) +{ + struct val_600b_t __percpu_kptr *p; + + p = bpf_percpu_obj_new(struct val_600b_t); + if (!p) + return 0; + + bpf_percpu_obj_drop(p); + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From df7c3f7d3a3ddab31ca8cfa9b86a8729ec43fd2e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Jan 2024 17:38:39 -0800 Subject: libbpf: make uniform use of btf__fd() accessor inside libbpf It makes future grepping and code analysis a bit easier. Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240104013847.3875810-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ebcfb2147fbd..f1521a400f02 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7050,7 +7050,7 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog load_attr.prog_ifindex = prog->prog_ifindex; /* specify func_info/line_info only if kernel supports them */ - btf_fd = bpf_object__btf_fd(obj); + btf_fd = btf__fd(obj->btf); if (btf_fd >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) { load_attr.prog_btf_fd = btf_fd; load_attr.func_info = prog->func_info; -- cgit v1.2.3 From fa98b54bff39f51c46fc96d3385c6292391c277b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Jan 2024 17:38:40 -0800 Subject: libbpf: use explicit map reuse flag to skip map creation steps Instead of inferring whether map already point to previously created/pinned BPF map (which user can specify with bpf_map__reuse_fd()) API), use explicit map->reused flag that is set in such case. Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240104013847.3875810-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index f1521a400f02..3b678b617213 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5465,7 +5465,7 @@ retry: } } - if (map->fd >= 0) { + if (map->reused) { pr_debug("map '%s': skipping creation (preset fd=%d)\n", map->name, map->fd); } else { -- cgit v1.2.3 From f08c18e083adfef92946ae1d44b07bb81e727e08 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Jan 2024 17:38:41 -0800 Subject: libbpf: don't rely on map->fd as an indicator of map being created With the upcoming switch to preallocated placeholder FDs for maps, switch various getters/setter away from checking map->fd. Use map_is_created() helper that detect whether BPF map can be modified based on map->obj->loaded state, with special provision for maps set up with bpf_map__reuse_fd(). For backwards compatibility, we take map_is_created() into account in bpf_map__fd() getter as well. This way before bpf_object__load() phase bpf_map__fd() will always return -1, just as before the changes in subsequent patches adding stable map->fd placeholders. We also get rid of all internal uses of bpf_map__fd() getter, as it's more oriented for uses external to libbpf. The above map_is_created() check actually interferes with some of the internal uses, if map FD is fetched through bpf_map__fd(). Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240104013847.3875810-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3b678b617213..6b27edd47c84 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5200,6 +5200,11 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) static void bpf_map__destroy(struct bpf_map *map); +static bool map_is_created(const struct bpf_map *map) +{ + return map->obj->loaded || map->reused; +} + static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, bool is_inner) { LIBBPF_OPTS(bpf_map_create_opts, create_attr); @@ -5231,7 +5236,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b map->name, err); return err; } - map->inner_map_fd = bpf_map__fd(map->inner_map); + map->inner_map_fd = map->inner_map->fd; } if (map->inner_map_fd >= 0) create_attr.inner_map_fd = map->inner_map_fd; @@ -5314,7 +5319,7 @@ static int init_map_in_map_slots(struct bpf_object *obj, struct bpf_map *map) continue; targ_map = map->init_slots[i]; - fd = bpf_map__fd(targ_map); + fd = targ_map->fd; if (obj->gen_loader) { bpf_gen__populate_outer_map(obj->gen_loader, @@ -7135,7 +7140,7 @@ retry_load: if (map->libbpf_type != LIBBPF_MAP_RODATA) continue; - if (bpf_prog_bind_map(ret, bpf_map__fd(map), NULL)) { + if (bpf_prog_bind_map(ret, map->fd, NULL)) { cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); pr_warn("prog '%s': failed to bind map '%s': %s\n", prog->name, map->real_name, cp); @@ -9601,7 +9606,11 @@ int libbpf_attach_type_by_name(const char *name, int bpf_map__fd(const struct bpf_map *map) { - return map ? map->fd : libbpf_err(-EINVAL); + if (!map) + return libbpf_err(-EINVAL); + if (!map_is_created(map)) + return -1; + return map->fd; } static bool map_uses_real_name(const struct bpf_map *map) @@ -9637,7 +9646,7 @@ enum bpf_map_type bpf_map__type(const struct bpf_map *map) int bpf_map__set_type(struct bpf_map *map, enum bpf_map_type type) { - if (map->fd >= 0) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->def.type = type; return 0; @@ -9650,7 +9659,7 @@ __u32 bpf_map__map_flags(const struct bpf_map *map) int bpf_map__set_map_flags(struct bpf_map *map, __u32 flags) { - if (map->fd >= 0) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->def.map_flags = flags; return 0; @@ -9663,7 +9672,7 @@ __u64 bpf_map__map_extra(const struct bpf_map *map) int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra) { - if (map->fd >= 0) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->map_extra = map_extra; return 0; @@ -9676,7 +9685,7 @@ __u32 bpf_map__numa_node(const struct bpf_map *map) int bpf_map__set_numa_node(struct bpf_map *map, __u32 numa_node) { - if (map->fd >= 0) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->numa_node = numa_node; return 0; @@ -9689,7 +9698,7 @@ __u32 bpf_map__key_size(const struct bpf_map *map) int bpf_map__set_key_size(struct bpf_map *map, __u32 size) { - if (map->fd >= 0) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->def.key_size = size; return 0; @@ -9773,7 +9782,7 @@ static int map_btf_datasec_resize(struct bpf_map *map, __u32 size) int bpf_map__set_value_size(struct bpf_map *map, __u32 size) { - if (map->fd >= 0) + if (map->obj->loaded || map->reused) return libbpf_err(-EBUSY); if (map->mmaped) { @@ -9814,8 +9823,11 @@ __u32 bpf_map__btf_value_type_id(const struct bpf_map *map) int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size) { + if (map->obj->loaded || map->reused) + return libbpf_err(-EBUSY); + if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG || - size != map->def.value_size || map->fd >= 0) + size != map->def.value_size) return libbpf_err(-EINVAL); memcpy(map->mmaped, data, size); @@ -9842,7 +9854,7 @@ __u32 bpf_map__ifindex(const struct bpf_map *map) int bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex) { - if (map->fd >= 0) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->map_ifindex = ifindex; return 0; @@ -9947,7 +9959,7 @@ bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name) static int validate_map_op(const struct bpf_map *map, size_t key_sz, size_t value_sz, bool check_value_sz) { - if (map->fd <= 0) + if (!map_is_created(map)) /* map is not yet created */ return -ENOENT; if (map->def.key_size != key_sz) { @@ -12400,7 +12412,7 @@ int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map) __u32 zero = 0; int err; - if (!bpf_map__is_struct_ops(map) || map->fd < 0) + if (!bpf_map__is_struct_ops(map) || !map_is_created(map)) return -EINVAL; st_ops_link = container_of(link, struct bpf_link_struct_ops, link); @@ -13304,7 +13316,7 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) for (i = 0; i < s->map_cnt; i++) { struct bpf_map *map = *s->maps[i].map; size_t mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); - int prot, map_fd = bpf_map__fd(map); + int prot, map_fd = map->fd; void **mmaped = s->maps[i].mmaped; if (!mmaped) -- cgit v1.2.3 From dac645b950ea4fc0896fe46a645365cb8d9ab92b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Jan 2024 17:38:42 -0800 Subject: libbpf: use stable map placeholder FDs Move map creation to later during BPF object loading by pre-creating stable placeholder FDs (utilizing memfd_create()). Use dup2() syscall to then atomically make those placeholder FDs point to real kernel BPF map objects. This change allows to delay BPF map creation to after all the BPF program relocations. That, in turn, allows to delay BTF finalization and loading into kernel to after all the relocations as well. We'll take advantage of the latter in subsequent patches to allow libbpf to adjust BTF in a way that helps with BPF global function usage. Clean up a few places where we close map->fd, which now shouldn't happen, because map->fd should be a valid FD regardless of whether map was created or not. Surprisingly and nicely it simplifies a bunch of error handling code. If this change doesn't backfire, I'm tempted to pre-create such stable FDs for other entities (progs, maybe even BTF). We previously did some manipulations to make gen_loader work with fake map FDs, with stable map FDs this hack is not necessary for maps (we still have it for BTF, but I left it as is for now). Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240104013847.3875810-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 101 +++++++++++++++++++++++++--------------- tools/lib/bpf/libbpf_internal.h | 14 ++++++ 2 files changed, 77 insertions(+), 38 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6b27edd47c84..a58569b7e4bf 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1503,6 +1503,16 @@ static Elf64_Sym *find_elf_var_sym(const struct bpf_object *obj, const char *nam return ERR_PTR(-ENOENT); } +static int create_placeholder_fd(void) +{ + int fd; + + fd = ensure_good_fd(memfd_create("libbpf-placeholder-fd", MFD_CLOEXEC)); + if (fd < 0) + return -errno; + return fd; +} + static struct bpf_map *bpf_object__add_map(struct bpf_object *obj) { struct bpf_map *map; @@ -1515,7 +1525,21 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj) map = &obj->maps[obj->nr_maps++]; map->obj = obj; - map->fd = -1; + /* Preallocate map FD without actually creating BPF map just yet. + * These map FD "placeholders" will be reused later without changing + * FD value when map is actually created in the kernel. + * + * This is useful to be able to perform BPF program relocations + * without having to create BPF maps before that step. This allows us + * to finalize and load BTF very late in BPF object's loading phase, + * right before BPF maps have to be created and BPF programs have to + * be loaded. By having these map FD placeholders we can perform all + * the sanitizations, relocations, and any other adjustments before we + * start creating actual BPF kernel objects (BTF, maps, progs). + */ + map->fd = create_placeholder_fd(); + if (map->fd < 0) + return ERR_PTR(map->fd); map->inner_map_fd = -1; map->autocreate = true; @@ -2607,7 +2631,9 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, map->inner_map = calloc(1, sizeof(*map->inner_map)); if (!map->inner_map) return -ENOMEM; - map->inner_map->fd = -1; + map->inner_map->fd = create_placeholder_fd(); + if (map->inner_map->fd < 0) + return map->inner_map->fd; map->inner_map->sec_idx = sec_idx; map->inner_map->name = malloc(strlen(map_name) + sizeof(".inner") + 1); if (!map->inner_map->name) @@ -4549,14 +4575,12 @@ int bpf_map__reuse_fd(struct bpf_map *map, int fd) goto err_free_new_name; } - err = zclose(map->fd); - if (err) { - err = -errno; - goto err_close_new_fd; - } + err = reuse_fd(map->fd, new_fd); + if (err) + goto err_free_new_name; + free(map->name); - map->fd = new_fd; map->name = new_name; map->def.type = info.type; map->def.key_size = info.key_size; @@ -4570,8 +4594,6 @@ int bpf_map__reuse_fd(struct bpf_map *map, int fd) return 0; -err_close_new_fd: - close(new_fd); err_free_new_name: free(new_name); return libbpf_err(err); @@ -5210,7 +5232,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b LIBBPF_OPTS(bpf_map_create_opts, create_attr); struct bpf_map_def *def = &map->def; const char *map_name = NULL; - int err = 0; + int err = 0, map_fd; if (kernel_supports(obj, FEAT_PROG_NAME)) map_name = map->name; @@ -5269,17 +5291,19 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b bpf_gen__map_create(obj->gen_loader, def->type, map_name, def->key_size, def->value_size, def->max_entries, &create_attr, is_inner ? -1 : map - obj->maps); - /* Pretend to have valid FD to pass various fd >= 0 checks. - * This fd == 0 will not be used with any syscall and will be reset to -1 eventually. + /* We keep pretenting we have valid FD to pass various fd >= 0 + * checks by just keeping original placeholder FDs in place. + * See bpf_object__add_map() comment. + * This placeholder fd will not be used with any syscall and + * will be reset to -1 eventually. */ - map->fd = 0; + map_fd = map->fd; } else { - map->fd = bpf_map_create(def->type, map_name, - def->key_size, def->value_size, - def->max_entries, &create_attr); + map_fd = bpf_map_create(def->type, map_name, + def->key_size, def->value_size, + def->max_entries, &create_attr); } - if (map->fd < 0 && (create_attr.btf_key_type_id || - create_attr.btf_value_type_id)) { + if (map_fd < 0 && (create_attr.btf_key_type_id || create_attr.btf_value_type_id)) { char *cp, errmsg[STRERR_BUFSIZE]; err = -errno; @@ -5291,13 +5315,11 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b create_attr.btf_value_type_id = 0; map->btf_key_type_id = 0; map->btf_value_type_id = 0; - map->fd = bpf_map_create(def->type, map_name, - def->key_size, def->value_size, - def->max_entries, &create_attr); + map_fd = bpf_map_create(def->type, map_name, + def->key_size, def->value_size, + def->max_entries, &create_attr); } - err = map->fd < 0 ? -errno : 0; - if (bpf_map_type__is_map_in_map(def->type) && map->inner_map) { if (obj->gen_loader) map->inner_map->fd = -1; @@ -5305,7 +5327,19 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b zfree(&map->inner_map); } - return err; + if (map_fd < 0) + return map_fd; + + /* obj->gen_loader case, prevent reuse_fd() from closing map_fd */ + if (map->fd == map_fd) + return 0; + + /* Keep placeholder FD value but now point it to the BPF map object. + * This way everything that relied on this map's FD (e.g., relocated + * ldimm64 instructions) will stay valid and won't need adjustments. + * map->fd stays valid but now point to what map_fd points to. + */ + return reuse_fd(map->fd, map_fd); } static int init_map_in_map_slots(struct bpf_object *obj, struct bpf_map *map) @@ -5389,10 +5423,8 @@ static int bpf_object_init_prog_arrays(struct bpf_object *obj) continue; err = init_prog_array_slots(obj, map); - if (err < 0) { - zclose(map->fd); + if (err < 0) return err; - } } return 0; } @@ -5483,25 +5515,20 @@ retry: if (bpf_map__is_internal(map)) { err = bpf_object__populate_internal_map(obj, map); - if (err < 0) { - zclose(map->fd); + if (err < 0) goto err_out; - } } if (map->init_slots_sz && map->def.type != BPF_MAP_TYPE_PROG_ARRAY) { err = init_map_in_map_slots(obj, map); - if (err < 0) { - zclose(map->fd); + if (err < 0) goto err_out; - } } } if (map->pin_path && !map->pinned) { err = bpf_map__pin(map, NULL); if (err) { - zclose(map->fd); if (!retried && err == -EEXIST) { retried = true; goto retry; @@ -8075,8 +8102,8 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch err = err ? : bpf_object__sanitize_and_load_btf(obj); err = err ? : bpf_object__sanitize_maps(obj); err = err ? : bpf_object__init_kern_struct_ops_maps(obj); - err = err ? : bpf_object__create_maps(obj); err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path); + err = err ? : bpf_object__create_maps(obj); err = err ? : bpf_object__load_progs(obj, extra_log_level); err = err ? : bpf_object_init_prog_arrays(obj); err = err ? : bpf_object_prepare_struct_ops(obj); @@ -8085,8 +8112,6 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch /* reset FDs */ if (obj->btf) btf__set_fd(obj->btf, -1); - for (i = 0; i < obj->nr_maps; i++) - obj->maps[i].fd = -1; if (!err) err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps); } diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index b5d334754e5d..27e4e320e1a6 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -555,6 +555,20 @@ static inline int ensure_good_fd(int fd) return fd; } +/* Point *fixed_fd* to the same file that *tmp_fd* points to. + * Regardless of success, *tmp_fd* is closed. + * Whatever *fixed_fd* pointed to is closed silently. + */ +static inline int reuse_fd(int fixed_fd, int tmp_fd) +{ + int err; + + err = dup2(tmp_fd, fixed_fd); + err = err < 0 ? -errno : 0; + close(tmp_fd); /* clean up temporary FD */ + return err; +} + /* The following two functions are exposed to bpftool */ int bpf_core_add_cands(struct bpf_core_cand *local_cand, size_t local_essent_len, -- cgit v1.2.3 From fb03be7c4a27c25696287df4ee06c5aafa31267c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Jan 2024 17:38:43 -0800 Subject: libbpf: move exception callbacks assignment logic into relocation step Move the logic of finding and assigning exception callback indices from BTF sanitization step to program relocations step, which seems more logical and will unblock moving BTF loading to after relocation step. Exception callbacks discovery and assignment has no dependency on BTF being loaded into the kernel, it only uses BTF information. It does need to happen before subprogram relocations happen, though. Which is why the split. No functional changes. Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240104013847.3875810-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 165 +++++++++++++++++++++++++------------------------ 1 file changed, 85 insertions(+), 80 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a58569b7e4bf..01d45f0c40d0 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3192,86 +3192,6 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) } } - if (!kernel_supports(obj, FEAT_BTF_DECL_TAG)) - goto skip_exception_cb; - for (i = 0; i < obj->nr_programs; i++) { - struct bpf_program *prog = &obj->programs[i]; - int j, k, n; - - if (prog_is_subprog(obj, prog)) - continue; - n = btf__type_cnt(obj->btf); - for (j = 1; j < n; j++) { - const char *str = "exception_callback:", *name; - size_t len = strlen(str); - struct btf_type *t; - - t = btf_type_by_id(obj->btf, j); - if (!btf_is_decl_tag(t) || btf_decl_tag(t)->component_idx != -1) - continue; - - name = btf__str_by_offset(obj->btf, t->name_off); - if (strncmp(name, str, len)) - continue; - - t = btf_type_by_id(obj->btf, t->type); - if (!btf_is_func(t) || btf_func_linkage(t) != BTF_FUNC_GLOBAL) { - pr_warn("prog '%s': exception_callback: decl tag not applied to the main program\n", - prog->name); - return -EINVAL; - } - if (strcmp(prog->name, btf__str_by_offset(obj->btf, t->name_off))) - continue; - /* Multiple callbacks are specified for the same prog, - * the verifier will eventually return an error for this - * case, hence simply skip appending a subprog. - */ - if (prog->exception_cb_idx >= 0) { - prog->exception_cb_idx = -1; - break; - } - - name += len; - if (str_is_empty(name)) { - pr_warn("prog '%s': exception_callback: decl tag contains empty value\n", - prog->name); - return -EINVAL; - } - - for (k = 0; k < obj->nr_programs; k++) { - struct bpf_program *subprog = &obj->programs[k]; - - if (!prog_is_subprog(obj, subprog)) - continue; - if (strcmp(name, subprog->name)) - continue; - /* Enforce non-hidden, as from verifier point of - * view it expects global functions, whereas the - * mark_btf_static fixes up linkage as static. - */ - if (!subprog->sym_global || subprog->mark_btf_static) { - pr_warn("prog '%s': exception callback %s must be a global non-hidden function\n", - prog->name, subprog->name); - return -EINVAL; - } - /* Let's see if we already saw a static exception callback with the same name */ - if (prog->exception_cb_idx >= 0) { - pr_warn("prog '%s': multiple subprogs with same name as exception callback '%s'\n", - prog->name, subprog->name); - return -EINVAL; - } - prog->exception_cb_idx = k; - break; - } - - if (prog->exception_cb_idx >= 0) - continue; - pr_warn("prog '%s': cannot find exception callback '%s'\n", prog->name, name); - return -ENOENT; - } - } -skip_exception_cb: - sanitize = btf_needs_sanitization(obj); if (sanitize) { const void *raw_data; @@ -6661,6 +6581,88 @@ static void bpf_object__sort_relos(struct bpf_object *obj) } } +static int bpf_prog_assign_exc_cb(struct bpf_object *obj, struct bpf_program *prog) +{ + const char *str = "exception_callback:"; + size_t pfx_len = strlen(str); + int i, j, n; + + if (!obj->btf || !kernel_supports(obj, FEAT_BTF_DECL_TAG)) + return 0; + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + const char *name; + struct btf_type *t; + + t = btf_type_by_id(obj->btf, i); + if (!btf_is_decl_tag(t) || btf_decl_tag(t)->component_idx != -1) + continue; + + name = btf__str_by_offset(obj->btf, t->name_off); + if (strncmp(name, str, pfx_len) != 0) + continue; + + t = btf_type_by_id(obj->btf, t->type); + if (!btf_is_func(t) || btf_func_linkage(t) != BTF_FUNC_GLOBAL) { + pr_warn("prog '%s': exception_callback: decl tag not applied to the main program\n", + prog->name); + return -EINVAL; + } + if (strcmp(prog->name, btf__str_by_offset(obj->btf, t->name_off)) != 0) + continue; + /* Multiple callbacks are specified for the same prog, + * the verifier will eventually return an error for this + * case, hence simply skip appending a subprog. + */ + if (prog->exception_cb_idx >= 0) { + prog->exception_cb_idx = -1; + break; + } + + name += pfx_len; + if (str_is_empty(name)) { + pr_warn("prog '%s': exception_callback: decl tag contains empty value\n", + prog->name); + return -EINVAL; + } + + for (j = 0; j < obj->nr_programs; j++) { + struct bpf_program *subprog = &obj->programs[j]; + + if (!prog_is_subprog(obj, subprog)) + continue; + if (strcmp(name, subprog->name) != 0) + continue; + /* Enforce non-hidden, as from verifier point of + * view it expects global functions, whereas the + * mark_btf_static fixes up linkage as static. + */ + if (!subprog->sym_global || subprog->mark_btf_static) { + pr_warn("prog '%s': exception callback %s must be a global non-hidden function\n", + prog->name, subprog->name); + return -EINVAL; + } + /* Let's see if we already saw a static exception callback with the same name */ + if (prog->exception_cb_idx >= 0) { + pr_warn("prog '%s': multiple subprogs with same name as exception callback '%s'\n", + prog->name, subprog->name); + return -EINVAL; + } + prog->exception_cb_idx = j; + break; + } + + if (prog->exception_cb_idx >= 0) + continue; + + pr_warn("prog '%s': cannot find exception callback '%s'\n", prog->name, name); + return -ENOENT; + } + + return 0; +} + static int bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) { @@ -6721,6 +6723,9 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) return err; } + err = bpf_prog_assign_exc_cb(obj, prog); + if (err) + return err; /* Now, also append exception callback if it has not been done already. */ if (prog->exception_cb_idx >= 0) { struct bpf_program *subprog = &obj->programs[prog->exception_cb_idx]; -- cgit v1.2.3 From 1004742d7ff03a088e74133af2401556ac80092b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Jan 2024 17:38:44 -0800 Subject: libbpf: move BTF loading step after relocation step With all the preparations in previous patches done we are ready to postpone BTF loading and sanitization step until after all the relocations are performed. Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240104013847.3875810-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 01d45f0c40d0..836986974de3 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8104,10 +8104,10 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch err = bpf_object__probe_loading(obj); err = err ? : bpf_object__load_vmlinux_btf(obj, false); err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); - err = err ? : bpf_object__sanitize_and_load_btf(obj); err = err ? : bpf_object__sanitize_maps(obj); err = err ? : bpf_object__init_kern_struct_ops_maps(obj); err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path); + err = err ? : bpf_object__sanitize_and_load_btf(obj); err = err ? : bpf_object__create_maps(obj); err = err ? : bpf_object__load_progs(obj, extra_log_level); err = err ? : bpf_object_init_prog_arrays(obj); -- cgit v1.2.3 From 2f38fe689470055440bf80fc644920023a643a82 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Jan 2024 17:38:45 -0800 Subject: libbpf: implement __arg_ctx fallback logic Out of all special global func arg tag annotations, __arg_ctx is practically is the most immediately useful and most critical to have working across multitude kernel version, if possible. This would allow end users to write much simpler code if __arg_ctx semantics worked for older kernels that don't natively understand btf_decl_tag("arg:ctx") in verifier logic. Luckily, it is possible to ensure __arg_ctx works on old kernels through a bit of extra work done by libbpf, at least in a lot of common cases. To explain the overall idea, we need to go back at how context argument was supported in global funcs before __arg_ctx support was added. This was done based on special struct name checks in kernel. E.g., for BPF_PROG_TYPE_PERF_EVENT the expectation is that argument type `struct bpf_perf_event_data *` mark that argument as PTR_TO_CTX. This is all good as long as global function is used from the same BPF program types only, which is often not the case. If the same subprog has to be called from, say, kprobe and perf_event program types, there is no single definition that would satisfy BPF verifier. Subprog will have context argument either for kprobe (if using bpf_user_pt_regs_t struct name) or perf_event (with bpf_perf_event_data struct name), but not both. This limitation was the reason to add btf_decl_tag("arg:ctx"), making the actual argument type not important, so that user can just define "generic" signature: __noinline int global_subprog(void *ctx __arg_ctx) { ... } I won't belabor how libbpf is implementing subprograms, see a huge comment next to bpf_object_relocate_calls() function. The idea is that each main/entry BPF program gets its own copy of global_subprog's code appended. This per-program copy of global subprog code *and* associated func_info .BTF.ext information, pointing to FUNC -> FUNC_PROTO BTF type chain allows libbpf to simulate __arg_ctx behavior transparently, even if the kernel doesn't yet support __arg_ctx annotation natively. The idea is straightforward: each time we append global subprog's code and func_info information, we adjust its FUNC -> FUNC_PROTO type information, if necessary (that is, libbpf can detect the presence of btf_decl_tag("arg:ctx") just like BPF verifier would do it). The rest is just mechanical and somewhat painful BTF manipulation code. It's painful because we need to clone FUNC -> FUNC_PROTO, instead of reusing it, as same FUNC -> FUNC_PROTO chain might be used by another main BPF program within the same BPF object, so we can't just modify it in-place (and cloning BTF types within the same struct btf object is painful due to constant memory invalidation, see comments in code). Uploaded BPF object's BTF information has to work for all BPF programs at the same time. Once we have FUNC -> FUNC_PROTO clones, we make sure that instead of using some `void *ctx` parameter definition, we have an expected `struct bpf_perf_event_data *ctx` definition (as far as BPF verifier and kernel is concerned), which will mark it as context for BPF verifier. Same global subprog relocated and copied into another main BPF program will get different type information according to main program's type. It all works out in the end in a completely transparent way for end user. Libbpf maintains internal program type -> expected context struct name mapping internally. Note, not all BPF program types have named context struct, so this approach won't work for such programs (just like it didn't before __arg_ctx). So native __arg_ctx is still important to have in kernel to have generic context support across all BPF program types. Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240104013847.3875810-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 256 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 252 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 836986974de3..c5a42ac309fd 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6181,7 +6181,7 @@ reloc_prog_func_and_line_info(const struct bpf_object *obj, int err; /* no .BTF.ext relocation if .BTF.ext is missing or kernel doesn't - * supprot func/line info + * support func/line info */ if (!obj->btf_ext || !kernel_supports(obj, FEAT_BTF_FUNC)) return 0; @@ -6663,8 +6663,247 @@ static int bpf_prog_assign_exc_cb(struct bpf_object *obj, struct bpf_program *pr return 0; } -static int -bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) +static struct { + enum bpf_prog_type prog_type; + const char *ctx_name; +} global_ctx_map[] = { + { BPF_PROG_TYPE_CGROUP_DEVICE, "bpf_cgroup_dev_ctx" }, + { BPF_PROG_TYPE_CGROUP_SKB, "__sk_buff" }, + { BPF_PROG_TYPE_CGROUP_SOCK, "bpf_sock" }, + { BPF_PROG_TYPE_CGROUP_SOCK_ADDR, "bpf_sock_addr" }, + { BPF_PROG_TYPE_CGROUP_SOCKOPT, "bpf_sockopt" }, + { BPF_PROG_TYPE_CGROUP_SYSCTL, "bpf_sysctl" }, + { BPF_PROG_TYPE_FLOW_DISSECTOR, "__sk_buff" }, + { BPF_PROG_TYPE_KPROBE, "bpf_user_pt_regs_t" }, + { BPF_PROG_TYPE_LWT_IN, "__sk_buff" }, + { BPF_PROG_TYPE_LWT_OUT, "__sk_buff" }, + { BPF_PROG_TYPE_LWT_SEG6LOCAL, "__sk_buff" }, + { BPF_PROG_TYPE_LWT_XMIT, "__sk_buff" }, + { BPF_PROG_TYPE_NETFILTER, "bpf_nf_ctx" }, + { BPF_PROG_TYPE_PERF_EVENT, "bpf_perf_event_data" }, + { BPF_PROG_TYPE_RAW_TRACEPOINT, "bpf_raw_tracepoint_args" }, + { BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, "bpf_raw_tracepoint_args" }, + { BPF_PROG_TYPE_SCHED_ACT, "__sk_buff" }, + { BPF_PROG_TYPE_SCHED_CLS, "__sk_buff" }, + { BPF_PROG_TYPE_SK_LOOKUP, "bpf_sk_lookup" }, + { BPF_PROG_TYPE_SK_MSG, "sk_msg_md" }, + { BPF_PROG_TYPE_SK_REUSEPORT, "sk_reuseport_md" }, + { BPF_PROG_TYPE_SK_SKB, "__sk_buff" }, + { BPF_PROG_TYPE_SOCK_OPS, "bpf_sock_ops" }, + { BPF_PROG_TYPE_SOCKET_FILTER, "__sk_buff" }, + { BPF_PROG_TYPE_XDP, "xdp_md" }, + /* all other program types don't have "named" context structs */ +}; + +static int clone_func_btf_info(struct btf *btf, int orig_fn_id, struct bpf_program *prog) +{ + int fn_id, fn_proto_id, ret_type_id, orig_proto_id; + int i, err, arg_cnt, fn_name_off, linkage; + struct btf_type *fn_t, *fn_proto_t, *t; + struct btf_param *p; + + /* caller already validated FUNC -> FUNC_PROTO validity */ + fn_t = btf_type_by_id(btf, orig_fn_id); + fn_proto_t = btf_type_by_id(btf, fn_t->type); + + /* Note that each btf__add_xxx() operation invalidates + * all btf_type and string pointers, so we need to be + * very careful when cloning BTF types. BTF type + * pointers have to be always refetched. And to avoid + * problems with invalidated string pointers, we + * add empty strings initially, then just fix up + * name_off offsets in place. Offsets are stable for + * existing strings, so that works out. + */ + fn_name_off = fn_t->name_off; /* we are about to invalidate fn_t */ + linkage = btf_func_linkage(fn_t); + orig_proto_id = fn_t->type; /* original FUNC_PROTO ID */ + ret_type_id = fn_proto_t->type; /* fn_proto_t will be invalidated */ + arg_cnt = btf_vlen(fn_proto_t); + + /* clone FUNC_PROTO and its params */ + fn_proto_id = btf__add_func_proto(btf, ret_type_id); + if (fn_proto_id < 0) + return -EINVAL; + + for (i = 0; i < arg_cnt; i++) { + int name_off; + + /* copy original parameter data */ + t = btf_type_by_id(btf, orig_proto_id); + p = &btf_params(t)[i]; + name_off = p->name_off; + + err = btf__add_func_param(btf, "", p->type); + if (err) + return err; + + fn_proto_t = btf_type_by_id(btf, fn_proto_id); + p = &btf_params(fn_proto_t)[i]; + p->name_off = name_off; /* use remembered str offset */ + } + + /* clone FUNC now, btf__add_func() enforces non-empty name, so use + * entry program's name as a placeholder, which we replace immediately + * with original name_off + */ + fn_id = btf__add_func(btf, prog->name, linkage, fn_proto_id); + if (fn_id < 0) + return -EINVAL; + + fn_t = btf_type_by_id(btf, fn_id); + fn_t->name_off = fn_name_off; /* reuse original string */ + + return fn_id; +} + +/* Check if main program or global subprog's function prototype has `arg:ctx` + * argument tags, and, if necessary, substitute correct type to match what BPF + * verifier would expect, taking into account specific program type. This + * allows to support __arg_ctx tag transparently on old kernels that don't yet + * have a native support for it in the verifier, making user's life much + * easier. + */ +static int bpf_program_fixup_func_info(struct bpf_object *obj, struct bpf_program *prog) +{ + const char *ctx_name = NULL, *ctx_tag = "arg:ctx"; + struct bpf_func_info_min *func_rec; + struct btf_type *fn_t, *fn_proto_t; + struct btf *btf = obj->btf; + const struct btf_type *t; + struct btf_param *p; + int ptr_id = 0, struct_id, tag_id, orig_fn_id; + int i, n, arg_idx, arg_cnt, err, rec_idx; + int *orig_ids; + + /* no .BTF.ext, no problem */ + if (!obj->btf_ext || !prog->func_info) + return 0; + + /* some BPF program types just don't have named context structs, so + * this fallback mechanism doesn't work for them + */ + for (i = 0; i < ARRAY_SIZE(global_ctx_map); i++) { + if (global_ctx_map[i].prog_type != prog->type) + continue; + ctx_name = global_ctx_map[i].ctx_name; + break; + } + if (!ctx_name) + return 0; + + /* remember original func BTF IDs to detect if we already cloned them */ + orig_ids = calloc(prog->func_info_cnt, sizeof(*orig_ids)); + if (!orig_ids) + return -ENOMEM; + for (i = 0; i < prog->func_info_cnt; i++) { + func_rec = prog->func_info + prog->func_info_rec_size * i; + orig_ids[i] = func_rec->type_id; + } + + /* go through each DECL_TAG with "arg:ctx" and see if it points to one + * of our subprogs; if yes and subprog is global and needs adjustment, + * clone and adjust FUNC -> FUNC_PROTO combo + */ + for (i = 1, n = btf__type_cnt(btf); i < n; i++) { + /* only DECL_TAG with "arg:ctx" value are interesting */ + t = btf__type_by_id(btf, i); + if (!btf_is_decl_tag(t)) + continue; + if (strcmp(btf__str_by_offset(btf, t->name_off), ctx_tag) != 0) + continue; + + /* only global funcs need adjustment, if at all */ + orig_fn_id = t->type; + fn_t = btf_type_by_id(btf, orig_fn_id); + if (!btf_is_func(fn_t) || btf_func_linkage(fn_t) != BTF_FUNC_GLOBAL) + continue; + + /* sanity check FUNC -> FUNC_PROTO chain, just in case */ + fn_proto_t = btf_type_by_id(btf, fn_t->type); + if (!fn_proto_t || !btf_is_func_proto(fn_proto_t)) + continue; + + /* find corresponding func_info record */ + func_rec = NULL; + for (rec_idx = 0; rec_idx < prog->func_info_cnt; rec_idx++) { + if (orig_ids[rec_idx] == t->type) { + func_rec = prog->func_info + prog->func_info_rec_size * rec_idx; + break; + } + } + /* current main program doesn't call into this subprog */ + if (!func_rec) + continue; + + /* some more sanity checking of DECL_TAG */ + arg_cnt = btf_vlen(fn_proto_t); + arg_idx = btf_decl_tag(t)->component_idx; + if (arg_idx < 0 || arg_idx >= arg_cnt) + continue; + + /* check if existing parameter already matches verifier expectations */ + p = &btf_params(fn_proto_t)[arg_idx]; + t = skip_mods_and_typedefs(btf, p->type, NULL); + if (btf_is_ptr(t) && + (t = skip_mods_and_typedefs(btf, t->type, NULL)) && + btf_is_struct(t) && + strcmp(btf__str_by_offset(btf, t->name_off), ctx_name) == 0) { + continue; /* no need for fix up */ + } + + /* clone fn/fn_proto, unless we already did it for another arg */ + if (func_rec->type_id == orig_fn_id) { + int fn_id; + + fn_id = clone_func_btf_info(btf, orig_fn_id, prog); + if (fn_id < 0) { + err = fn_id; + goto err_out; + } + + /* point func_info record to a cloned FUNC type */ + func_rec->type_id = fn_id; + } + + /* create PTR -> STRUCT type chain to mark PTR_TO_CTX argument; + * we do it just once per main BPF program, as all global + * funcs share the same program type, so need only PTR -> + * STRUCT type chain + */ + if (ptr_id == 0) { + struct_id = btf__add_struct(btf, ctx_name, 0); + ptr_id = btf__add_ptr(btf, struct_id); + if (ptr_id < 0 || struct_id < 0) { + err = -EINVAL; + goto err_out; + } + } + + /* for completeness, clone DECL_TAG and point it to cloned param */ + tag_id = btf__add_decl_tag(btf, ctx_tag, func_rec->type_id, arg_idx); + if (tag_id < 0) { + err = -EINVAL; + goto err_out; + } + + /* all the BTF manipulations invalidated pointers, refetch them */ + fn_t = btf_type_by_id(btf, func_rec->type_id); + fn_proto_t = btf_type_by_id(btf, fn_t->type); + + /* fix up type ID pointed to by param */ + p = &btf_params(fn_proto_t)[arg_idx]; + p->type = ptr_id; + } + + free(orig_ids); + return 0; +err_out: + free(orig_ids); + return err; +} + +static int bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) { struct bpf_program *prog; size_t i, j; @@ -6745,19 +6984,28 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) } } } - /* Process data relos for main programs */ for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; if (prog_is_subprog(obj, prog)) continue; if (!prog->autoload) continue; + + /* Process data relos for main programs */ err = bpf_object__relocate_data(obj, prog); if (err) { pr_warn("prog '%s': failed to relocate data references: %d\n", prog->name, err); return err; } + + /* Fix up .BTF.ext information, if necessary */ + err = bpf_program_fixup_func_info(obj, prog); + if (err) { + pr_warn("prog '%s': failed to perform .BTF.ext fix ups: %d\n", + prog->name, err); + return err; + } } return 0; -- cgit v1.2.3 From 67fe459144dd629855bd9fb4b12bd9c4f792a8cf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Jan 2024 17:38:46 -0800 Subject: selftests/bpf: add arg:ctx cases to test_global_funcs tests Add a few extra cases of global funcs with context arguments. This time rely on "arg:ctx" decl_tag (__arg_ctx macro), but put it next to "classic" cases where context argument has to be of an exact type that BPF verifier expects (e.g., bpf_user_pt_regs_t for kprobe/uprobe). Colocating all these cases separately from other global func args that rely on arg:xxx decl tags (in verifier_global_subprogs.c) allows for simpler backwards compatibility testing on old kernels. All the cases in test_global_func_ctx_args.c are supposed to work on older kernels, which was manually validated during development. Acked-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240104013847.3875810-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../bpf/progs/test_global_func_ctx_args.c | 49 ++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/test_global_func_ctx_args.c b/tools/testing/selftests/bpf/progs/test_global_func_ctx_args.c index 7faa8eef0598..9a06e5eb1fbe 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func_ctx_args.c +++ b/tools/testing/selftests/bpf/progs/test_global_func_ctx_args.c @@ -102,3 +102,52 @@ int perf_event_ctx(void *ctx) { return perf_event_ctx_subprog(ctx); } + +/* this global subprog can be now called from many types of entry progs, each + * with different context type + */ +__weak int subprog_ctx_tag(void *ctx __arg_ctx) +{ + return bpf_get_stack(ctx, stack, sizeof(stack), 0); +} + +struct my_struct { int x; }; + +__weak int subprog_multi_ctx_tags(void *ctx1 __arg_ctx, + struct my_struct *mem, + void *ctx2 __arg_ctx) +{ + if (!mem) + return 0; + + return bpf_get_stack(ctx1, stack, sizeof(stack), 0) + + mem->x + + bpf_get_stack(ctx2, stack, sizeof(stack), 0); +} + +SEC("?raw_tp") +__success __log_level(2) +int arg_tag_ctx_raw_tp(void *ctx) +{ + struct my_struct x = { .x = 123 }; + + return subprog_ctx_tag(ctx) + subprog_multi_ctx_tags(ctx, &x, ctx); +} + +SEC("?perf_event") +__success __log_level(2) +int arg_tag_ctx_perf(void *ctx) +{ + struct my_struct x = { .x = 123 }; + + return subprog_ctx_tag(ctx) + subprog_multi_ctx_tags(ctx, &x, ctx); +} + +SEC("?kprobe") +__success __log_level(2) +int arg_tag_ctx_kprobe(void *ctx) +{ + struct my_struct x = { .x = 123 }; + + return subprog_ctx_tag(ctx) + subprog_multi_ctx_tags(ctx, &x, ctx); +} -- cgit v1.2.3 From 95226f5a36695fd5740e130016d9ed697cfb2bad Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Jan 2024 17:38:47 -0800 Subject: selftests/bpf: add __arg_ctx BTF rewrite test Add a test validating that libbpf uploads BTF and func_info with rewritten type information for arguments of global subprogs that are marked with __arg_ctx tag. Suggested-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240104013847.3875810-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/test_global_funcs.c | 106 +++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c index e0879df38639..67d4ef9e62b3 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c @@ -20,6 +20,109 @@ #include "test_global_func17.skel.h" #include "test_global_func_ctx_args.skel.h" +#include "bpf/libbpf_internal.h" +#include "btf_helpers.h" + +static void check_ctx_arg_type(const struct btf *btf, const struct btf_param *p) +{ + const struct btf_type *t; + const char *s; + + t = btf__type_by_id(btf, p->type); + if (!ASSERT_EQ(btf_kind(t), BTF_KIND_PTR, "ptr_t")) + return; + + s = btf_type_raw_dump(btf, t->type); + if (!ASSERT_HAS_SUBSTR(s, "STRUCT 'bpf_perf_event_data' size=0 vlen=0", + "ctx_struct_t")) + return; +} + +static void subtest_ctx_arg_rewrite(void) +{ + struct test_global_func_ctx_args *skel = NULL; + struct bpf_prog_info info; + char func_info_buf[1024] __attribute__((aligned(8))); + struct bpf_func_info_min *rec; + struct btf *btf = NULL; + __u32 info_len = sizeof(info); + int err, fd, i; + + skel = test_global_func_ctx_args__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + bpf_program__set_autoload(skel->progs.arg_tag_ctx_perf, true); + + err = test_global_func_ctx_args__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto out; + + memset(&info, 0, sizeof(info)); + info.func_info = ptr_to_u64(&func_info_buf); + info.nr_func_info = 3; + info.func_info_rec_size = sizeof(struct bpf_func_info_min); + + fd = bpf_program__fd(skel->progs.arg_tag_ctx_perf); + err = bpf_prog_get_info_by_fd(fd, &info, &info_len); + if (!ASSERT_OK(err, "prog_info")) + goto out; + + if (!ASSERT_EQ(info.nr_func_info, 3, "nr_func_info")) + goto out; + + btf = btf__load_from_kernel_by_id(info.btf_id); + if (!ASSERT_OK_PTR(btf, "obj_kern_btf")) + goto out; + + rec = (struct bpf_func_info_min *)func_info_buf; + for (i = 0; i < info.nr_func_info; i++, rec = (void *)rec + info.func_info_rec_size) { + const struct btf_type *fn_t, *proto_t; + const char *name; + + if (rec->insn_off == 0) + continue; /* main prog, skip */ + + fn_t = btf__type_by_id(btf, rec->type_id); + if (!ASSERT_OK_PTR(fn_t, "fn_type")) + goto out; + if (!ASSERT_EQ(btf_kind(fn_t), BTF_KIND_FUNC, "fn_type_kind")) + goto out; + proto_t = btf__type_by_id(btf, fn_t->type); + if (!ASSERT_OK_PTR(proto_t, "proto_type")) + goto out; + + name = btf__name_by_offset(btf, fn_t->name_off); + if (strcmp(name, "subprog_ctx_tag") == 0) { + /* int subprog_ctx_tag(void *ctx __arg_ctx) */ + if (!ASSERT_EQ(btf_vlen(proto_t), 1, "arg_cnt")) + goto out; + + /* arg 0 is PTR -> STRUCT bpf_perf_event_data */ + check_ctx_arg_type(btf, &btf_params(proto_t)[0]); + } else if (strcmp(name, "subprog_multi_ctx_tags") == 0) { + /* int subprog_multi_ctx_tags(void *ctx1 __arg_ctx, + * struct my_struct *mem, + * void *ctx2 __arg_ctx) + */ + if (!ASSERT_EQ(btf_vlen(proto_t), 3, "arg_cnt")) + goto out; + + /* arg 0 is PTR -> STRUCT bpf_perf_event_data */ + check_ctx_arg_type(btf, &btf_params(proto_t)[0]); + /* arg 2 is PTR -> STRUCT bpf_perf_event_data */ + check_ctx_arg_type(btf, &btf_params(proto_t)[2]); + } else { + ASSERT_FAIL("unexpected subprog %s", name); + goto out; + } + } + +out: + btf__free(btf); + test_global_func_ctx_args__destroy(skel); +} + void test_test_global_funcs(void) { RUN_TESTS(test_global_func1); @@ -40,4 +143,7 @@ void test_test_global_funcs(void) RUN_TESTS(test_global_func16); RUN_TESTS(test_global_func17); RUN_TESTS(test_global_func_ctx_args); + + if (test__start_subtest("ctx_arg_rewrite")) + subtest_ctx_arg_rewrite(); } -- cgit v1.2.3 From 9ddf872b47e3ac8f27dbfc4a4737a976c7588de6 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 4 Jan 2024 08:57:44 -0800 Subject: bpf: Remove unnecessary cpu == 0 check in memalloc After merging the patch set [1] to reduce memory usage for bpf_global_percpu_ma, Alexei found a redundant check (cpu == 0) in function bpf_mem_alloc_percpu_unit_init() ([2]). Indeed, the check is unnecessary since c->unit_size will be all NULL or all non-NULL for all cpus before for_each_possible_cpu() loop. Removing the check makes code less confusing. [1] https://lore.kernel.org/all/20231222031729.1287957-1-yonghong.song@linux.dev/ [2] https://lore.kernel.org/all/20231222031745.1289082-1-yonghong.song@linux.dev/ Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240104165744.702239-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 460c8f38fed6..550f02e2cb13 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -613,7 +613,7 @@ int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) for_each_possible_cpu(cpu) { cc = per_cpu_ptr(pcc, cpu); c = &cc->cache[i]; - if (cpu == 0 && c->unit_size) + if (c->unit_size) break; c->unit_size = unit_size; -- cgit v1.2.3 From 98e20e5e13d2811898921f999288be7151a11954 Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Tue, 26 Dec 2023 14:07:42 +0100 Subject: bpfilter: remove bpfilter bpfilter was supposed to convert iptables filtering rules into BPF programs on the fly, from the kernel, through a usermode helper. The base code for the UMH was introduced in 2018, and couple of attempts (2, 3) tried to introduce the BPF program generate features but were abandoned. bpfilter now sits in a kernel tree unused and unusable, occasionally causing confusion amongst Linux users (4, 5). As bpfilter is now developed in a dedicated repository on GitHub (6), it was suggested a couple of times this year (LSFMM/BPF 2023, LPC 2023) to remove the deprecated kernel part of the project. This is the purpose of this patch. [1]: https://lore.kernel.org/lkml/20180522022230.2492505-1-ast@kernel.org/ [2]: https://lore.kernel.org/bpf/20210829183608.2297877-1-me@ubique.spb.ru/#t [3]: https://lore.kernel.org/lkml/20221224000402.476079-1-qde@naccy.de/ [4]: https://dxuuu.xyz/bpfilter.html [5]: https://github.com/linuxkit/linuxkit/pull/3904 [6]: https://github.com/facebook/bpfilter Signed-off-by: Quentin Deslandes Link: https://lore.kernel.org/r/20231226130745.465988-1-qde@naccy.de Signed-off-by: Alexei Starovoitov --- arch/loongarch/configs/loongson3_defconfig | 1 - include/linux/bpfilter.h | 24 ----- include/uapi/linux/bpfilter.h | 21 ----- net/Kconfig | 2 - net/Makefile | 1 - net/bpfilter/.gitignore | 2 - net/bpfilter/Kconfig | 23 ----- net/bpfilter/Makefile | 20 ----- net/bpfilter/bpfilter_kern.c | 136 ----------------------------- net/bpfilter/bpfilter_umh_blob.S | 7 -- net/bpfilter/main.c | 64 -------------- net/bpfilter/msgfmt.h | 17 ---- net/ipv4/Makefile | 2 - net/ipv4/bpfilter/Makefile | 2 - net/ipv4/bpfilter/sockopt.c | 71 --------------- net/ipv4/ip_sockglue.c | 12 --- tools/bpf/bpftool/feature.c | 4 - tools/testing/selftests/bpf/config.aarch64 | 1 - tools/testing/selftests/bpf/config.s390x | 1 - tools/testing/selftests/bpf/config.x86_64 | 1 - tools/testing/selftests/hid/config | 1 - 21 files changed, 413 deletions(-) delete mode 100644 include/linux/bpfilter.h delete mode 100644 include/uapi/linux/bpfilter.h delete mode 100644 net/bpfilter/.gitignore delete mode 100644 net/bpfilter/Kconfig delete mode 100644 net/bpfilter/Makefile delete mode 100644 net/bpfilter/bpfilter_kern.c delete mode 100644 net/bpfilter/bpfilter_umh_blob.S delete mode 100644 net/bpfilter/main.c delete mode 100644 net/bpfilter/msgfmt.h delete mode 100644 net/ipv4/bpfilter/Makefile delete mode 100644 net/ipv4/bpfilter/sockopt.c diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index 9c333d133c30..60e331af9839 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -276,7 +276,6 @@ CONFIG_BRIDGE_EBT_T_NAT=m CONFIG_BRIDGE_EBT_ARP=m CONFIG_BRIDGE_EBT_IP=m CONFIG_BRIDGE_EBT_IP6=m -CONFIG_BPFILTER=y CONFIG_IP_SCTP=m CONFIG_RDS=y CONFIG_L2TP=m diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h deleted file mode 100644 index 736ded4905e0..000000000000 --- a/include/linux/bpfilter.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_BPFILTER_H -#define _LINUX_BPFILTER_H - -#include -#include -#include - -struct sock; -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen); -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, - int __user *optlen); - -struct bpfilter_umh_ops { - struct umd_info info; - /* since ip_getsockopt() can run in parallel, serialize access to umh */ - struct mutex lock; - int (*sockopt)(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen, bool is_set); - int (*start)(void); -}; -extern struct bpfilter_umh_ops bpfilter_ops; -#endif diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h deleted file mode 100644 index cbc1f5813f50..000000000000 --- a/include/uapi/linux/bpfilter.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_BPFILTER_H -#define _UAPI_LINUX_BPFILTER_H - -#include - -enum { - BPFILTER_IPT_SO_SET_REPLACE = 64, - BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65, - BPFILTER_IPT_SET_MAX, -}; - -enum { - BPFILTER_IPT_SO_GET_INFO = 64, - BPFILTER_IPT_SO_GET_ENTRIES = 65, - BPFILTER_IPT_SO_GET_REVISION_MATCH = 66, - BPFILTER_IPT_SO_GET_REVISION_TARGET = 67, - BPFILTER_IPT_GET_MAX, -}; - -#endif /* _UAPI_LINUX_BPFILTER_H */ diff --git a/net/Kconfig b/net/Kconfig index 3ec6bc98fa05..4adc47d0c9c2 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -233,8 +233,6 @@ source "net/bridge/netfilter/Kconfig" endif -source "net/bpfilter/Kconfig" - source "net/dccp/Kconfig" source "net/sctp/Kconfig" source "net/rds/Kconfig" diff --git a/net/Makefile b/net/Makefile index 4c4dc535453d..b06b5539e7a6 100644 --- a/net/Makefile +++ b/net/Makefile @@ -19,7 +19,6 @@ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX_SCM) += unix/ obj-y += ipv6/ -obj-$(CONFIG_BPFILTER) += bpfilter/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ diff --git a/net/bpfilter/.gitignore b/net/bpfilter/.gitignore deleted file mode 100644 index f34e85ee8204..000000000000 --- a/net/bpfilter/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -bpfilter_umh diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig deleted file mode 100644 index 3d4a21462458..000000000000 --- a/net/bpfilter/Kconfig +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -menuconfig BPFILTER - bool "BPF based packet filtering framework (BPFILTER)" - depends on BPF && INET - select USERMODE_DRIVER - help - This builds experimental bpfilter framework that is aiming to - provide netfilter compatible functionality via BPF - -if BPFILTER -config BPFILTER_UMH - tristate "bpfilter kernel module with user mode helper" - depends on CC_CAN_LINK - depends on m || CC_CAN_LINK_STATIC - default m - help - This builds bpfilter kernel module with embedded user mode helper - - Note: To compile this as built-in, your toolchain must support - building static binaries, since rootfs isn't mounted at the time - when __init functions are called and do_execv won't be able to find - the elf interpreter. -endif diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile deleted file mode 100644 index cdac82b8c53a..000000000000 --- a/net/bpfilter/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for the Linux BPFILTER layer. -# - -userprogs := bpfilter_umh -bpfilter_umh-objs := main.o -userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi - -ifeq ($(CONFIG_BPFILTER_UMH), y) -# builtin bpfilter_umh should be linked with -static -# since rootfs isn't mounted at the time of __init -# function is called and do_execv won't find elf interpreter -userldflags += -static -endif - -$(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh - -obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o -bpfilter-objs += bpfilter_kern.o bpfilter_umh_blob.o diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c deleted file mode 100644 index 97e129e3f31c..000000000000 --- a/net/bpfilter/bpfilter_kern.c +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include -#include -#include -#include -#include "msgfmt.h" - -extern char bpfilter_umh_start; -extern char bpfilter_umh_end; - -static void shutdown_umh(void) -{ - struct umd_info *info = &bpfilter_ops.info; - struct pid *tgid = info->tgid; - - if (tgid) { - kill_pid(tgid, SIGKILL, 1); - wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); - umd_cleanup_helper(info); - } -} - -static void __stop_umh(void) -{ - if (IS_ENABLED(CONFIG_INET)) - shutdown_umh(); -} - -static int bpfilter_send_req(struct mbox_request *req) -{ - struct mbox_reply reply; - loff_t pos = 0; - ssize_t n; - - if (!bpfilter_ops.info.tgid) - return -EFAULT; - pos = 0; - n = kernel_write(bpfilter_ops.info.pipe_to_umh, req, sizeof(*req), - &pos); - if (n != sizeof(*req)) { - pr_err("write fail %zd\n", n); - goto stop; - } - pos = 0; - n = kernel_read(bpfilter_ops.info.pipe_from_umh, &reply, sizeof(reply), - &pos); - if (n != sizeof(reply)) { - pr_err("read fail %zd\n", n); - goto stop; - } - return reply.status; -stop: - __stop_umh(); - return -EFAULT; -} - -static int bpfilter_process_sockopt(struct sock *sk, int optname, - sockptr_t optval, unsigned int optlen, - bool is_set) -{ - struct mbox_request req = { - .is_set = is_set, - .pid = current->pid, - .cmd = optname, - .addr = (uintptr_t)optval.user, - .len = optlen, - }; - if (sockptr_is_kernel(optval)) { - pr_err("kernel access not supported\n"); - return -EFAULT; - } - return bpfilter_send_req(&req); -} - -static int start_umh(void) -{ - struct mbox_request req = { .pid = current->pid }; - int err; - - /* fork usermode process */ - err = fork_usermode_driver(&bpfilter_ops.info); - if (err) - return err; - pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid)); - - /* health check that usermode process started correctly */ - if (bpfilter_send_req(&req) != 0) { - shutdown_umh(); - return -EFAULT; - } - - return 0; -} - -static int __init load_umh(void) -{ - int err; - - err = umd_load_blob(&bpfilter_ops.info, - &bpfilter_umh_start, - &bpfilter_umh_end - &bpfilter_umh_start); - if (err) - return err; - - mutex_lock(&bpfilter_ops.lock); - err = start_umh(); - if (!err && IS_ENABLED(CONFIG_INET)) { - bpfilter_ops.sockopt = &bpfilter_process_sockopt; - bpfilter_ops.start = &start_umh; - } - mutex_unlock(&bpfilter_ops.lock); - if (err) - umd_unload_blob(&bpfilter_ops.info); - return err; -} - -static void __exit fini_umh(void) -{ - mutex_lock(&bpfilter_ops.lock); - if (IS_ENABLED(CONFIG_INET)) { - shutdown_umh(); - bpfilter_ops.start = NULL; - bpfilter_ops.sockopt = NULL; - } - mutex_unlock(&bpfilter_ops.lock); - - umd_unload_blob(&bpfilter_ops.info); -} -module_init(load_umh); -module_exit(fini_umh); -MODULE_LICENSE("GPL"); diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S deleted file mode 100644 index 40311d10d2f2..000000000000 --- a/net/bpfilter/bpfilter_umh_blob.S +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - .section .init.rodata, "a" - .global bpfilter_umh_start -bpfilter_umh_start: - .incbin "net/bpfilter/bpfilter_umh" - .global bpfilter_umh_end -bpfilter_umh_end: diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c deleted file mode 100644 index 291a92546246..000000000000 --- a/net/bpfilter/main.c +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include "../../include/uapi/linux/bpf.h" -#include -#include "msgfmt.h" - -FILE *debug_f; - -static int handle_get_cmd(struct mbox_request *cmd) -{ - switch (cmd->cmd) { - case 0: - return 0; - default: - break; - } - return -ENOPROTOOPT; -} - -static int handle_set_cmd(struct mbox_request *cmd) -{ - return -ENOPROTOOPT; -} - -static void loop(void) -{ - while (1) { - struct mbox_request req; - struct mbox_reply reply; - int n; - - n = read(0, &req, sizeof(req)); - if (n != sizeof(req)) { - fprintf(debug_f, "invalid request %d\n", n); - return; - } - - reply.status = req.is_set ? - handle_set_cmd(&req) : - handle_get_cmd(&req); - - n = write(1, &reply, sizeof(reply)); - if (n != sizeof(reply)) { - fprintf(debug_f, "reply failed %d\n", n); - return; - } - } -} - -int main(void) -{ - debug_f = fopen("/dev/kmsg", "w"); - setvbuf(debug_f, 0, _IOLBF, 0); - fprintf(debug_f, "<5>Started bpfilter\n"); - loop(); - fclose(debug_f); - return 0; -} diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h deleted file mode 100644 index 98d121c62945..000000000000 --- a/net/bpfilter/msgfmt.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_BPFILTER_MSGFMT_H -#define _NET_BPFILTER_MSGFMT_H - -struct mbox_request { - __u64 addr; - __u32 len; - __u32 is_set; - __u32 cmd; - __u32 pid; -}; - -struct mbox_reply { - __u32 status; -}; - -#endif diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index e144a02a6a61..ec36d2ec059e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -16,8 +16,6 @@ obj-y := route.o inetpeer.o protocol.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ metrics.o netlink.o nexthop.o udp_tunnel_stub.o -obj-$(CONFIG_BPFILTER) += bpfilter/ - obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_PROC_FS) += proc.o diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile deleted file mode 100644 index 00af5305e05a..000000000000 --- a/net/ipv4/bpfilter/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_BPFILTER) += sockopt.o diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c deleted file mode 100644 index 193bcc2acccc..000000000000 --- a/net/ipv4/bpfilter/sockopt.c +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct bpfilter_umh_ops bpfilter_ops; -EXPORT_SYMBOL_GPL(bpfilter_ops); - -static int bpfilter_mbox_request(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen, bool is_set) -{ - int err; - mutex_lock(&bpfilter_ops.lock); - if (!bpfilter_ops.sockopt) { - mutex_unlock(&bpfilter_ops.lock); - request_module("bpfilter"); - mutex_lock(&bpfilter_ops.lock); - - if (!bpfilter_ops.sockopt) { - err = -ENOPROTOOPT; - goto out; - } - } - if (bpfilter_ops.info.tgid && - thread_group_exited(bpfilter_ops.info.tgid)) - umd_cleanup_helper(&bpfilter_ops.info); - - if (!bpfilter_ops.info.tgid) { - err = bpfilter_ops.start(); - if (err) - goto out; - } - err = bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); -out: - mutex_unlock(&bpfilter_ops.lock); - return err; -} - -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen) -{ - return bpfilter_mbox_request(sk, optname, optval, optlen, true); -} - -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, - int __user *optlen) -{ - int len; - - if (get_user(len, optlen)) - return -EFAULT; - - return bpfilter_mbox_request(sk, optname, USER_SOCKPTR(optval), len, - false); -} - -static int __init bpfilter_sockopt_init(void) -{ - mutex_init(&bpfilter_ops.lock); - bpfilter_ops.info.tgid = NULL; - bpfilter_ops.info.driver_name = "bpfilter_umh"; - - return 0; -} -device_initcall(bpfilter_sockopt_init); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 66247e8b429e..7aa9dc0e6760 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -47,8 +47,6 @@ #include #include -#include - /* * SOL_IP control messages. */ @@ -1411,11 +1409,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); -#if IS_ENABLED(CONFIG_BPFILTER_UMH) - if (optname >= BPFILTER_IPT_SO_SET_REPLACE && - optname < BPFILTER_IPT_SET_MAX) - err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen); -#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && @@ -1763,11 +1756,6 @@ int ip_getsockopt(struct sock *sk, int level, err = do_ip_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); -#if IS_ENABLED(CONFIG_BPFILTER_UMH) - if (optname >= BPFILTER_IPT_SO_GET_INFO && - optname < BPFILTER_IPT_GET_MAX) - err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); -#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index edda4fc2c4d0..708733b0ea06 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -426,10 +426,6 @@ static void probe_kernel_image_config(const char *define_prefix) { "CONFIG_BPF_STREAM_PARSER", }, /* xt_bpf module for passing BPF programs to netfilter */ { "CONFIG_NETFILTER_XT_MATCH_BPF", }, - /* bpfilter back-end for iptables */ - { "CONFIG_BPFILTER", }, - /* bpftilter module with "user mode helper" */ - { "CONFIG_BPFILTER_UMH", }, /* test_bpf module for BPF tests */ { "CONFIG_TEST_BPF", }, diff --git a/tools/testing/selftests/bpf/config.aarch64 b/tools/testing/selftests/bpf/config.aarch64 index 29c8635c5722..3720b7611523 100644 --- a/tools/testing/selftests/bpf/config.aarch64 +++ b/tools/testing/selftests/bpf/config.aarch64 @@ -11,7 +11,6 @@ CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_SD=y CONFIG_BONDING=y -CONFIG_BPFILTER=y CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT_DEFAULT_ON=y CONFIG_BPF_PRELOAD_UMD=y diff --git a/tools/testing/selftests/bpf/config.s390x b/tools/testing/selftests/bpf/config.s390x index e93330382849..706931a8c2c6 100644 --- a/tools/testing/selftests/bpf/config.s390x +++ b/tools/testing/selftests/bpf/config.s390x @@ -9,7 +9,6 @@ CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT_DEFAULT_ON=y CONFIG_BPF_PRELOAD=y CONFIG_BPF_PRELOAD_UMD=y -CONFIG_BPFILTER=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_FREEZER=y diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64 index b946088017f1..5680befae8c6 100644 --- a/tools/testing/selftests/bpf/config.x86_64 +++ b/tools/testing/selftests/bpf/config.x86_64 @@ -19,7 +19,6 @@ CONFIG_BOOTTIME_TRACING=y CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_PRELOAD=y CONFIG_BPF_PRELOAD_UMD=y -CONFIG_BPFILTER=y CONFIG_BSD_DISKLABEL=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_CFS_BANDWIDTH=y diff --git a/tools/testing/selftests/hid/config b/tools/testing/selftests/hid/config index 4f425178b56f..1758b055f295 100644 --- a/tools/testing/selftests/hid/config +++ b/tools/testing/selftests/hid/config @@ -1,5 +1,4 @@ CONFIG_BPF_EVENTS=y -CONFIG_BPFILTER=y CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT=y CONFIG_BPF_KPROBE_OVERRIDE=y -- cgit v1.2.3 From ecba66cb36e3428e9f0c2362b45e213ad43ba8d0 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 2 Jan 2024 20:30:35 +0100 Subject: s390/bpf: Fix gotol with large offsets The gotol implementation uses a wrong data type for the offset: it should be s32, not s16. Fixes: c690191e23d8 ("s390/bpf: Implement unconditional jump with 32-bit offset") Signed-off-by: Ilya Leoshkevich Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240102193531.3169422-2-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- arch/s390/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 7f0a7b97ef4c..b418333bb086 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -779,7 +779,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i, bool extra_pass, u32 stack_depth) { struct bpf_insn *insn = &fp->insnsi[i]; - s16 branch_oc_off = insn->off; + s32 branch_oc_off = insn->off; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; int last, insn_count = 1; -- cgit v1.2.3 From 445aea5afda4759c13dc5c492b309cc1d5c1c486 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 2 Jan 2024 20:30:36 +0100 Subject: selftests/bpf: Double the size of test_loader log Testing long jumps requires having >32k instructions. That many instructions require the verifier log buffer of 2 megabytes. The regular test_progs run doesn't need an increased buffer, since gotol test with 40k instructions doesn't request a log, but test_progs -v will set the verifier log level. Hence to avoid breaking gotol test with -v increase the buffer size. Signed-off-by: Ilya Leoshkevich Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240102193531.3169422-3-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 74ceb7877ae2..f01391021218 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -12,7 +12,7 @@ #define str_has_pfx(str, pfx) \ (strncmp(str, pfx, __builtin_constant_p(pfx) ? sizeof(pfx) - 1 : strlen(pfx)) == 0) -#define TEST_LOADER_LOG_BUF_SZ 1048576 +#define TEST_LOADER_LOG_BUF_SZ 2097152 #define TEST_TAG_EXPECT_FAILURE "comment:test_expect_failure" #define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success" -- cgit v1.2.3 From 63fac34669e4cc666f943173ed2aa76b8db999f0 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 2 Jan 2024 20:30:37 +0100 Subject: selftests/bpf: Test gotol with large offsets Test gotol with offsets that don't fit into a short (i.e., larger than 32k or smaller than -32k). Signed-off-by: Ilya Leoshkevich Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240102193531.3169422-4-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_gotol.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_gotol.c b/tools/testing/selftests/bpf/progs/verifier_gotol.c index d1edbcff9a18..05a329ee45ee 100644 --- a/tools/testing/selftests/bpf/progs/verifier_gotol.c +++ b/tools/testing/selftests/bpf/progs/verifier_gotol.c @@ -33,6 +33,25 @@ l3_%=: \ : __clobber_all); } +SEC("socket") +__description("gotol, large_imm") +__success __failure_unpriv __retval(40000) +__naked void gotol_large_imm(void) +{ + asm volatile (" \ + gotol 1f; \ +0: \ + r0 = 0; \ + .rept 40000; \ + r0 += 1; \ + .endr; \ + exit; \ +1: gotol 0b; \ +" : + : + : __clobber_all); +} + #else SEC("socket") -- cgit v1.2.3 From 00bc8988807985e32f5103f1ac099baf593bd8a3 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Thu, 4 Jan 2024 22:22:23 +0800 Subject: bpf, x86: Use emit_nops to replace memcpy x86_nops Move emit_nops() before emit_prologue() and replace memcpy(prog, x86_nops[5], X86_PATCH_SIZE) with emit_nops(&prog, X86_PATCH_SIZE). Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20240104142226.87869-2-hffilwlqm@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 47 +++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index bdacbb84456d..fe30b9ebb8de 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -307,6 +307,25 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) *pprog = prog; } +static void emit_nops(u8 **pprog, int len) +{ + u8 *prog = *pprog; + int i, noplen; + + while (len > 0) { + noplen = len; + + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + + for (i = 0; i < noplen; i++) + EMIT1(x86_nops[noplen][i]); + len -= noplen; + } + + *pprog = prog; +} + /* * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT * in arch/x86/kernel/alternative.c @@ -385,8 +404,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later */ - memcpy(prog, x86_nops[5], X86_PATCH_SIZE); - prog += X86_PATCH_SIZE; + emit_nops(&prog, X86_PATCH_SIZE); if (!ebpf_from_cbpf) { if (tail_call_reachable && !is_subprog) /* When it's the entry of the whole tailcall context, @@ -692,8 +710,7 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog, if (stack_depth) EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); - memcpy(prog, x86_nops[5], X86_PATCH_SIZE); - prog += X86_PATCH_SIZE; + emit_nops(&prog, X86_PATCH_SIZE); /* out: */ ctx->tail_call_direct_label = prog - start; @@ -1055,25 +1072,6 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt, } } -static void emit_nops(u8 **pprog, int len) -{ - u8 *prog = *pprog; - int i, noplen; - - while (len > 0) { - noplen = len; - - if (noplen > ASM_NOP_MAX) - noplen = ASM_NOP_MAX; - - for (i = 0; i < noplen; i++) - EMIT1(x86_nops[noplen][i]); - len -= noplen; - } - - *pprog = prog; -} - /* emit the 3-byte VEX prefix * * r: same as rex.r, extra bit for ModRM reg field @@ -2700,8 +2698,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); im->ip_after_call = image + (prog - (u8 *)rw_image); - memcpy(prog, x86_nops[5], X86_PATCH_SIZE); - prog += X86_PATCH_SIZE; + emit_nops(&prog, X86_PATCH_SIZE); } if (fmod_ret->nr_links) { -- cgit v1.2.3 From 19bfcdf9498aa968ea293417fbbc39e523527ca8 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Wed, 3 Jan 2024 20:05:44 +0100 Subject: bpf: Relax tracing prog recursive attach rules Currently, it's not allowed to attach an fentry/fexit prog to another one fentry/fexit. At the same time it's not uncommon to see a tracing program with lots of logic in use, and the attachment limitation prevents usage of fentry/fexit for performance analysis (e.g. with "bpftool prog profile" command) in this case. An example could be falcosecurity libs project that uses tp_btf tracing programs. Following the corresponding discussion [1], the reason for that is to avoid tracing progs call cycles without introducing more complex solutions. But currently it seems impossible to load and attach tracing programs in a way that will form such a cycle. The limitation is coming from the fact that attach_prog_fd is specified at the prog load (thus making it impossible to attach to a program loaded after it in this way), as well as tracing progs not implementing link_detach. Replace "no same type" requirement with verification that no more than one level of attachment nesting is allowed. In this way only one fentry/fexit program could be attached to another fentry/fexit to cover profiling use case, and still no cycle could be formed. To implement, add a new field into bpf_prog_aux to track nested attachment for tracing programs. [1]: https://lore.kernel.org/bpf/20191108064039.2041889-16-ast@kernel.org/ Acked-by: Jiri Olsa Acked-by: Song Liu Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Link: https://lore.kernel.org/r/20240103190559.14750-2-9erthalion6@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 23 ++++++++++++++++++++++- kernel/bpf/verifier.c | 39 +++++++++++++++++++++++++-------------- 3 files changed, 48 insertions(+), 15 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7671530d6e4e..e30100597d0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1449,6 +1449,7 @@ struct bpf_prog_aux { bool dev_bound; /* Program is bound to the netdev. */ bool offload_requested; /* Program is bound and offloaded to the netdev. */ bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ + bool attach_tracing_prog; /* true if tracing another tracing program */ bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1bf9805ee185..d15f9998bc20 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2738,6 +2738,22 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) goto free_prog_sec; } + /* + * Bookkeeping for managing the program attachment chain. + * + * It might be tempting to set attach_tracing_prog flag at the attachment + * time, but this will not prevent from loading bunch of tracing prog + * first, then attach them one to another. + * + * The flag attach_tracing_prog is set for the whole program lifecycle, and + * doesn't have to be cleared in bpf_tracing_link_release, since tracing + * programs cannot change attachment target. + */ + if (type == BPF_PROG_TYPE_TRACING && dst_prog && + dst_prog->type == BPF_PROG_TYPE_TRACING) { + prog->aux->attach_tracing_prog = true; + } + /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) @@ -3171,7 +3187,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, } if (tgt_prog_fd) { - /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ + /* + * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this + * part would be changed to implement the same for + * BPF_PROG_TYPE_TRACING, do not forget to update the way how + * attach_tracing_prog flag is set. + */ if (prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d5f4ff1eb235..adbf330d364b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20317,6 +20317,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, struct bpf_attach_target_info *tgt_info) { bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; + bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING; const char prefix[] = "btf_trace_"; int ret = 0, subprog = -1, i; const struct btf_type *t; @@ -20387,10 +20388,21 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "Can attach to only JITed progs\n"); return -EINVAL; } - if (tgt_prog->type == prog->type) { - /* Cannot fentry/fexit another fentry/fexit program. - * Cannot attach program extension to another extension. - * It's ok to attach fentry/fexit to extension program. + if (prog_tracing) { + if (aux->attach_tracing_prog) { + /* + * Target program is an fentry/fexit which is already attached + * to another tracing program. More levels of nesting + * attachment are not allowed. + */ + bpf_log(log, "Cannot nest tracing program attach more than once\n"); + return -EINVAL; + } + } else if (tgt_prog->type == prog->type) { + /* + * To avoid potential call chain cycles, prevent attaching of a + * program extension to another extension. It's ok to attach + * fentry/fexit to extension program. */ bpf_log(log, "Cannot recursively attach\n"); return -EINVAL; @@ -20403,16 +20415,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance * analysis, stats and can be attached to any program - * type except themselves. When extension program is - * replacing XDP function it is necessary to allow - * performance analysis of all functions. Both original - * XDP program and its program extension. Hence - * attaching fentry/fexit to BPF_PROG_TYPE_EXT is - * allowed. If extending of fentry/fexit was allowed it - * would be possible to create long call chain - * fentry->extension->fentry->extension beyond - * reasonable stack size. Hence extending fentry is not - * allowed. + * type. When extension program is replacing XDP function + * it is necessary to allow performance analysis of all + * functions. Both original XDP program and its program + * extension. Hence attaching fentry/fexit to + * BPF_PROG_TYPE_EXT is allowed. If extending of + * fentry/fexit was allowed it would be possible to create + * long call chain fentry->extension->fentry->extension + * beyond reasonable stack size. Hence extending fentry + * is not allowed. */ bpf_log(log, "Cannot extend fentry/fexit\n"); return -EINVAL; -- cgit v1.2.3 From 5c5371e069e1ffc204dda8b20c609b170b823165 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Wed, 3 Jan 2024 20:05:45 +0100 Subject: selftests/bpf: Add test for recursive attachment of tracing progs Verify the fact that only one fentry prog could be attached to another fentry, building up an attachment chain of limited size. Use existing bpf_testmod as a start of the chain. Acked-by: Jiri Olsa Acked-by: Song Liu Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Link: https://lore.kernel.org/r/20240103190559.14750-3-9erthalion6@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/recursive_attach.c | 107 +++++++++++++++++++++ .../testing/selftests/bpf/progs/fentry_recursive.c | 14 +++ .../selftests/bpf/progs/fentry_recursive_target.c | 16 +++ 3 files changed, 137 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/recursive_attach.c create mode 100644 tools/testing/selftests/bpf/progs/fentry_recursive.c create mode 100644 tools/testing/selftests/bpf/progs/fentry_recursive_target.c diff --git a/tools/testing/selftests/bpf/prog_tests/recursive_attach.c b/tools/testing/selftests/bpf/prog_tests/recursive_attach.c new file mode 100644 index 000000000000..4bd0a0e4231e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/recursive_attach.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Red Hat, Inc. */ +#include +#include "fentry_recursive.skel.h" +#include "fentry_recursive_target.skel.h" +#include +#include "bpf/libbpf_internal.h" + +/* Test recursive attachment of tracing progs with more than one nesting level + * is not possible. Create a chain of attachment, verify that the last prog + * will fail. Depending on the arguments, following cases are tested: + * + * - Recursive loading of tracing progs, without attaching (attach = false, + * detach = false). The chain looks like this: + * load target + * load fentry1 -> target + * load fentry2 -> fentry1 (fail) + * + * - Recursive attach of tracing progs (attach = true, detach = false). The + * chain looks like this: + * load target + * load fentry1 -> target + * attach fentry1 -> target + * load fentry2 -> fentry1 (fail) + * + * - Recursive attach and detach of tracing progs (attach = true, detach = + * true). This validates that attach_tracing_prog flag will be set throughout + * the whole lifecycle of an fentry prog, independently from whether it's + * detached. The chain looks like this: + * load target + * load fentry1 -> target + * attach fentry1 -> target + * detach fentry1 + * load fentry2 -> fentry1 (fail) + */ +static void test_recursive_fentry_chain(bool attach, bool detach) +{ + struct fentry_recursive_target *target_skel = NULL; + struct fentry_recursive *tracing_chain[2] = {}; + struct bpf_program *prog; + int prev_fd, err; + + target_skel = fentry_recursive_target__open_and_load(); + if (!ASSERT_OK_PTR(target_skel, "fentry_recursive_target__open_and_load")) + return; + + /* Create an attachment chain with two fentry progs */ + for (int i = 0; i < 2; i++) { + tracing_chain[i] = fentry_recursive__open(); + if (!ASSERT_OK_PTR(tracing_chain[i], "fentry_recursive__open")) + goto close_prog; + + /* The first prog in the chain is going to be attached to the target + * fentry program, the second one to the previous in the chain. + */ + prog = tracing_chain[i]->progs.recursive_attach; + if (i == 0) { + prev_fd = bpf_program__fd(target_skel->progs.test1); + err = bpf_program__set_attach_target(prog, prev_fd, "test1"); + } else { + prev_fd = bpf_program__fd(tracing_chain[i-1]->progs.recursive_attach); + err = bpf_program__set_attach_target(prog, prev_fd, "recursive_attach"); + } + + if (!ASSERT_OK(err, "bpf_program__set_attach_target")) + goto close_prog; + + err = fentry_recursive__load(tracing_chain[i]); + /* The first attach should succeed, the second fail */ + if (i == 0) { + if (!ASSERT_OK(err, "fentry_recursive__load")) + goto close_prog; + + if (attach) { + err = fentry_recursive__attach(tracing_chain[i]); + if (!ASSERT_OK(err, "fentry_recursive__attach")) + goto close_prog; + } + + if (detach) { + /* Flag attach_tracing_prog should still be set, preventing + * attachment of the following prog. + */ + fentry_recursive__detach(tracing_chain[i]); + } + } else { + if (!ASSERT_ERR(err, "fentry_recursive__load")) + goto close_prog; + } + } + +close_prog: + fentry_recursive_target__destroy(target_skel); + for (int i = 0; i < 2; i++) { + fentry_recursive__destroy(tracing_chain[i]); + } +} + +void test_recursive_fentry(void) +{ + if (test__start_subtest("attach")) + test_recursive_fentry_chain(true, false); + if (test__start_subtest("load")) + test_recursive_fentry_chain(false, false); + if (test__start_subtest("detach")) + test_recursive_fentry_chain(true, true); +} diff --git a/tools/testing/selftests/bpf/progs/fentry_recursive.c b/tools/testing/selftests/bpf/progs/fentry_recursive.c new file mode 100644 index 000000000000..2c9fb5ac42b2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fentry_recursive.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Red Hat, Inc. */ +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +/* Dummy fentry bpf prog for testing fentry attachment chains */ +SEC("fentry/XXX") +int BPF_PROG(recursive_attach, int a) +{ + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/fentry_recursive_target.c b/tools/testing/selftests/bpf/progs/fentry_recursive_target.c new file mode 100644 index 000000000000..cb18a8bfffac --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fentry_recursive_target.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Red Hat, Inc. */ +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +/* Dummy fentry bpf prog for testing fentry attachment chains. It's going to be + * a start of the chain. + */ +SEC("fentry/bpf_testmod_fentry_test1") +int BPF_PROG(test1, int a) +{ + return 0; +} -- cgit v1.2.3 From 715d82ba636cb3629a6e18a33bb9dbe53f9936ee Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 3 Jan 2024 20:05:46 +0100 Subject: bpf: Fix re-attachment branch in bpf_tracing_prog_attach The following case can cause a crash due to missing attach_btf: 1) load rawtp program 2) load fentry program with rawtp as target_fd 3) create tracing link for fentry program with target_fd = 0 4) repeat 3 In the end we have: - prog->aux->dst_trampoline == NULL - tgt_prog == NULL (because we did not provide target_fd to link_create) - prog->aux->attach_btf == NULL (the program was loaded with attach_prog_fd=X) - the program was loaded for tgt_prog but we have no way to find out which one BUG: kernel NULL pointer dereference, address: 0000000000000058 Call Trace: ? __die+0x20/0x70 ? page_fault_oops+0x15b/0x430 ? fixup_exception+0x22/0x330 ? exc_page_fault+0x6f/0x170 ? asm_exc_page_fault+0x22/0x30 ? bpf_tracing_prog_attach+0x279/0x560 ? btf_obj_id+0x5/0x10 bpf_tracing_prog_attach+0x439/0x560 __sys_bpf+0x1cf4/0x2de0 __x64_sys_bpf+0x1c/0x30 do_syscall_64+0x41/0xf0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Return -EINVAL in this situation. Fixes: f3a95075549e0 ("bpf: Allow trampoline re-attach for tracing and lsm programs") Cc: stable@vger.kernel.org Signed-off-by: Jiri Olsa Acked-by: Jiri Olsa Acked-by: Song Liu Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Link: https://lore.kernel.org/r/20240103190559.14750-4-9erthalion6@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d15f9998bc20..a1f18681721c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3237,6 +3237,10 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, * * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program * was detached and is going for re-attachment. + * + * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf + * are NULL, then program was already attached and user did not provide + * tgt_prog_fd so we have no way to find out or create trampoline */ if (!prog->aux->dst_trampoline && !tgt_prog) { /* @@ -3250,6 +3254,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, err = -EINVAL; goto out_unlock; } + /* We can allow re-attach only if we have valid attach_btf. */ + if (!prog->aux->attach_btf) { + err = -EINVAL; + goto out_unlock; + } btf_id = prog->aux->attach_btf_id; key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); } -- cgit v1.2.3 From e02feb3f1f47509ec1e07b604bfbeff8c3b4e639 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Wed, 3 Jan 2024 20:05:47 +0100 Subject: selftests/bpf: Test re-attachment fix for bpf_tracing_prog_attach Add a test case to verify the fix for "prog->aux->dst_trampoline and tgt_prog is NULL" branch in bpf_tracing_prog_attach. The sequence of events: 1. load rawtp program 2. load fentry program with rawtp as target_fd 3. create tracing link for fentry program with target_fd = 0 4. repeat 3 Acked-by: Jiri Olsa Acked-by: Song Liu Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Link: https://lore.kernel.org/r/20240103190559.14750-5-9erthalion6@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/recursive_attach.c | 44 ++++++++++++++++++++++ .../selftests/bpf/progs/fentry_recursive_target.c | 9 +++++ 2 files changed, 53 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/recursive_attach.c b/tools/testing/selftests/bpf/prog_tests/recursive_attach.c index 4bd0a0e4231e..8100509e561b 100644 --- a/tools/testing/selftests/bpf/prog_tests/recursive_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/recursive_attach.c @@ -105,3 +105,47 @@ void test_recursive_fentry(void) if (test__start_subtest("detach")) test_recursive_fentry_chain(true, true); } + +/* Test that a tracing prog reattachment (when we land in + * "prog->aux->dst_trampoline and tgt_prog is NULL" branch in + * bpf_tracing_prog_attach) does not lead to a crash due to missing attach_btf + */ +void test_fentry_attach_btf_presence(void) +{ + struct fentry_recursive_target *target_skel = NULL; + struct fentry_recursive *tracing_skel = NULL; + struct bpf_program *prog; + int err, link_fd, tgt_prog_fd; + + target_skel = fentry_recursive_target__open_and_load(); + if (!ASSERT_OK_PTR(target_skel, "fentry_recursive_target__open_and_load")) + goto close_prog; + + tracing_skel = fentry_recursive__open(); + if (!ASSERT_OK_PTR(tracing_skel, "fentry_recursive__open")) + goto close_prog; + + prog = tracing_skel->progs.recursive_attach; + tgt_prog_fd = bpf_program__fd(target_skel->progs.fentry_target); + err = bpf_program__set_attach_target(prog, tgt_prog_fd, "fentry_target"); + if (!ASSERT_OK(err, "bpf_program__set_attach_target")) + goto close_prog; + + err = fentry_recursive__load(tracing_skel); + if (!ASSERT_OK(err, "fentry_recursive__load")) + goto close_prog; + + tgt_prog_fd = bpf_program__fd(tracing_skel->progs.recursive_attach); + link_fd = bpf_link_create(tgt_prog_fd, 0, BPF_TRACE_FENTRY, NULL); + if (!ASSERT_GE(link_fd, 0, "link_fd")) + goto close_prog; + + fentry_recursive__detach(tracing_skel); + + err = fentry_recursive__attach(tracing_skel); + ASSERT_ERR(err, "fentry_recursive__attach"); + +close_prog: + fentry_recursive_target__destroy(target_skel); + fentry_recursive__destroy(tracing_skel); +} diff --git a/tools/testing/selftests/bpf/progs/fentry_recursive_target.c b/tools/testing/selftests/bpf/progs/fentry_recursive_target.c index cb18a8bfffac..267c876d0aba 100644 --- a/tools/testing/selftests/bpf/progs/fentry_recursive_target.c +++ b/tools/testing/selftests/bpf/progs/fentry_recursive_target.c @@ -14,3 +14,12 @@ int BPF_PROG(test1, int a) { return 0; } + +/* Dummy bpf prog for testing attach_btf presence when attaching an fentry + * program. + */ +SEC("raw_tp/sys_enter") +int BPF_PROG(fentry_target, struct pt_regs *regs, long id) +{ + return 0; +} -- cgit v1.2.3