From 0c9a7a7e2049859d7869e15dd8f70ca5aeae460e Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 2 Aug 2022 14:46:38 -0700 Subject: bpf: Verifier cleanups This patch cleans up a few things in the verifier: * type_is_pkt_pointer(): Future work (skb + xdp dynptrs [0]) will be using the reg type PTR_TO_PACKET | PTR_MAYBE_NULL. type_is_pkt_pointer() should return true for any type whose base type is PTR_TO_PACKET, regardless of flags attached to it. * reg_type_may_be_refcounted_or_null(): Get the base type at the start of the function to avoid having to recompute it / improve readability * check_func_proto(): remove unnecessary 'meta' arg * check_helper_call(): Use switch casing on the base type of return value instead of nested ifs on the full type There are no functional behavior changes. [0] https://lore.kernel.org/bpf/20220726184706.954822-1-joannelkoong@gmail.com/ Signed-off-by: Joanne Koong Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20220802214638.3643235-1-joannelkoong@gmail.com --- kernel/bpf/verifier.c | 50 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 18 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 096fdac70165..843a966cd02b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -427,6 +427,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, static bool type_is_pkt_pointer(enum bpf_reg_type type) { + type = base_type(type); return type == PTR_TO_PACKET || type == PTR_TO_PACKET_META; } @@ -456,10 +457,9 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { - return base_type(type) == PTR_TO_SOCKET || - base_type(type) == PTR_TO_TCP_SOCK || - base_type(type) == PTR_TO_MEM || - base_type(type) == PTR_TO_BTF_ID; + type = base_type(type); + return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || + 
type == PTR_TO_MEM || type == PTR_TO_BTF_ID; } static bool type_is_rdonly_mem(u32 type) @@ -6498,8 +6498,7 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) return true; } -static int check_func_proto(const struct bpf_func_proto *fn, int func_id, - struct bpf_call_arg_meta *meta) +static int check_func_proto(const struct bpf_func_proto *fn, int func_id) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && @@ -7218,7 +7217,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn, func_id, &meta); + err = check_func_proto(fn, func_id); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); @@ -7359,13 +7358,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* update return register (already marked as written above) */ ret_type = fn->ret_type; - ret_flag = type_flag(fn->ret_type); - if (ret_type == RET_INTEGER) { + ret_flag = type_flag(ret_type); + + switch (base_type(ret_type)) { + case RET_INTEGER: /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); - } else if (ret_type == RET_VOID) { + break; + case RET_VOID: regs[BPF_REG_0].type = NOT_INIT; - } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) { + break; + case RET_PTR_TO_MAP_VALUE: /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -7384,20 +7387,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn map_value_has_spin_lock(meta.map_ptr)) { regs[BPF_REG_0].id = ++env->id_gen; } - } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { + break; + case RET_PTR_TO_SOCKET: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag; - } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { + break; + case 
RET_PTR_TO_SOCK_COMMON: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag; - } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { + break; + case RET_PTR_TO_TCP_SOCK: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; - } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { + break; + case RET_PTR_TO_ALLOC_MEM: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { + break; + case RET_PTR_TO_MEM_OR_BTF_ID: + { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); @@ -7429,7 +7438,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { + break; + } + case RET_PTR_TO_BTF_ID: + { struct btf *ret_btf; int ret_btf_id; @@ -7450,7 +7462,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; - } else { + break; + } + default: verbose(env, "unknown return type %u of func %s#%d\n", base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; -- cgit v1.2.3 From fa96b24204af42274ec13dfb2f2e6990d7510e55 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 5 Aug 2022 14:48:14 -0700 Subject: btf: Add a new kfunc flag which allows to mark a function to be sleepable This allows to declare a kfunc as sleepable and prevents its use in a non sleepable program. 
Signed-off-by: Benjamin Tissoires Co-developed-by: Yosry Ahmed Signed-off-by: Yosry Ahmed Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220805214821.1058337-2-haoluo@google.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 6 ++++++ include/linux/btf.h | 1 + kernel/bpf/btf.c | 9 +++++++++ 3 files changed, 16 insertions(+) (limited to 'kernel/bpf') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index c0b7dae6dbf5..c8b21de1c772 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -146,6 +146,12 @@ that operate (change some property, perform some operation) on an object that was obtained using an acquire kfunc. Such kfuncs need an unchanged pointer to ensure the integrity of the operation being performed on the expected object. +2.4.6 KF_SLEEPABLE flag +----------------------- + +The KF_SLEEPABLE flag is used for kfuncs that may sleep. Such kfuncs can only +be called by sleepable BPF programs (BPF_F_SLEEPABLE). + 2.5 Registering the kfuncs -------------------------- diff --git a/include/linux/btf.h b/include/linux/btf.h index cdb376d53238..976cbdd2981f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -49,6 +49,7 @@ * for this case. 
*/ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ +#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ struct btf; struct btf_member; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7e64447659f3..d3e4c86b8fcd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6175,6 +6175,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); bool rel = false, kptr_get = false, trusted_arg = false; + bool sleepable = false; struct bpf_verifier_log *log = &env->log; u32 i, nargs, ref_id, ref_obj_id = 0; bool is_kfunc = btf_is_kernel(btf); @@ -6212,6 +6213,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, rel = kfunc_flags & KF_RELEASE; kptr_get = kfunc_flags & KF_KPTR_GET; trusted_arg = kfunc_flags & KF_TRUSTED_ARGS; + sleepable = kfunc_flags & KF_SLEEPABLE; } /* check that BTF function arguments match actual types that the @@ -6419,6 +6421,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, func_name); return -EINVAL; } + + if (sleepable && !env->prog->aux->sleepable) { + bpf_log(log, "kernel function %s is sleepable but the program is not\n", + func_name); + return -EINVAL; + } + /* returns argument register number > 0 in case of reference release kfunc */ return rel ? ref_regno : 0; } -- cgit v1.2.3 From be3bb83dab2df838cd9e681e3e9dcde87bfe4f95 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Fri, 5 Aug 2022 14:48:16 -0700 Subject: bpf, iter: Fix the condition on p when calling stop. In bpf_seq_read, seq->op->next() could return an ERR and jump to the label stop. However, the existing code in stop does not handle the case when p (returned from next()) is an ERR. Add handling for an ERR value of p by converting p into an error and jumping to done. Because no current implementation returns ERR from next(), this patch doesn't change behavior right now.
Acked-by: Yonghong Song Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220805214821.1058337-4-haoluo@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_iter.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 2726a5950cfa..4b112aa8bba3 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -197,6 +197,11 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, } stop: offs = seq->count; + if (IS_ERR(p)) { + seq->op->stop(seq, NULL); + err = PTR_ERR(p); + goto done; + } /* bpf program called if !p */ seq->op->stop(seq, p); if (!p) { -- cgit v1.2.3 From b2d8ef19c6e7ed71ba5092feb0710063a751834f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 8 Aug 2022 10:15:59 -0700 Subject: bpf: Cleanup check_refcount_ok Discussion around a recently-submitted patch provided historical context for check_refcount_ok [0]. Specifically, the function and its helpers - may_be_acquire_function and arg_type_may_be_refcounted - predate the OBJ_RELEASE type flag and the addition of many more helpers with acquire/release semantics. The purpose of check_refcount_ok is to ensure: 1) Helper doesn't have multiple uses of return reg's ref_obj_id 2) Helper with release semantics only has one arg needing to be released, since that's tracked using meta->ref_obj_id With current verifier, it's safe to remove check_refcount_ok and its helpers. Since addition of OBJ_RELEASE type flag, case 2) has been handled by the arg_type_is_release check in check_func_arg. To ensure case 1) won't result in verifier silently prioritizing one use of ref_obj_id, this patch adds a helper_multiple_ref_obj_use check which fails loudly if a helper passes > 1 test for use of ref_obj_id. 
[0]: lore.kernel.org/bpf/20220713234529.4154673-1-davemarchevsky@fb.com Signed-off-by: Dave Marchevsky Acked-by: Martin KaFai Lau Acked-by: Joanne Koong Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220808171559.3251090-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 74 ++++++++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 45 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 843a966cd02b..01e7f48b4d8c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -467,25 +467,11 @@ static bool type_is_rdonly_mem(u32 type) return type & MEM_RDONLY; } -static bool arg_type_may_be_refcounted(enum bpf_arg_type type) -{ - return type == ARG_PTR_TO_SOCK_COMMON; -} - static bool type_may_be_null(u32 type) { return type & PTR_MAYBE_NULL; } -static bool may_be_acquire_function(enum bpf_func_id func_id) -{ - return func_id == BPF_FUNC_sk_lookup_tcp || - func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_map_lookup_elem || - func_id == BPF_FUNC_ringbuf_reserve; -} - static bool is_acquire_function(enum bpf_func_id func_id, const struct bpf_map *map) { @@ -518,6 +504,26 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_request_sock; } +static bool is_dynptr_acquire_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_dynptr_data; +} + +static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, + const struct bpf_map *map) +{ + int ref_obj_uses = 0; + + if (is_ptr_cast_function(func_id)) + ref_obj_uses++; + if (is_acquire_function(func_id, map)) + ref_obj_uses++; + if (is_dynptr_acquire_function(func_id)) + ref_obj_uses++; + + return ref_obj_uses > 1; +} + static bool is_cmpxchg_insn(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_STX && @@ -6453,33 +6459,6 @@ static bool check_arg_pair_ok(const struct 
bpf_func_proto *fn) return true; } -static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) -{ - int count = 0; - - if (arg_type_may_be_refcounted(fn->arg1_type)) - count++; - if (arg_type_may_be_refcounted(fn->arg2_type)) - count++; - if (arg_type_may_be_refcounted(fn->arg3_type)) - count++; - if (arg_type_may_be_refcounted(fn->arg4_type)) - count++; - if (arg_type_may_be_refcounted(fn->arg5_type)) - count++; - - /* A reference acquiring function cannot acquire - * another refcounted ptr. - */ - if (may_be_acquire_function(func_id) && count) - return false; - - /* We only support one arg being unreferenced at the moment, - * which is sufficient for the helper functions we have right now. - */ - return count <= 1; -} - static bool check_btf_id_ok(const struct bpf_func_proto *fn) { int i; @@ -6502,8 +6481,7 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && - check_btf_id_ok(fn) && - check_refcount_ok(fn, func_id) ? 0 : -EINVAL; + check_btf_id_ok(fn) ? 
0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -7473,6 +7451,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; + if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) { + verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n", + func_id_name(func_id), func_id); + return -EFAULT; + } + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; @@ -7485,10 +7469,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].id = id; /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = id; - } else if (func_id == BPF_FUNC_dynptr_data) { + } else if (is_dynptr_acquire_function(func_id)) { int dynptr_id = 0, i; - /* Find the id of the dynptr we're acquiring a reference to */ + /* Find the id of the dynptr we're tracking the reference of */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { if (arg_type_is_dynptr(fn->arg_type[i])) { if (dynptr_id) { -- cgit v1.2.3 From c8996c98f703b09afe77a1d247dae691c9849dc1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Aug 2022 08:08:02 +0200 Subject: bpf: Add BPF-helper for accessing CLOCK_TAI Commit 3dc6ffae2da2 ("timekeeping: Introduce fast accessor to clock tai") introduced a fast and NMI-safe accessor for CLOCK_TAI. Especially in time sensitive networks (TSN), where all nodes are synchronized by Precision Time Protocol (PTP), it's helpful to have the possibility to generate timestamps based on CLOCK_TAI instead of CLOCK_MONOTONIC. With a BPF helper for TAI in place, it becomes very convenient to correlate activity across different machines in the network. Use cases for such a BPF helper include functionalities such as Tx launch time (e.g. ETF and TAPRIO Qdiscs) and timestamping. 
Note: CLOCK_TAI is nothing new per se, only the NMI-safe variant of it is. Signed-off-by: Jesper Dangaard Brouer [Kurt: Wrote changelog and renamed helper] Signed-off-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220809060803.5773-2-kurt@linutronix.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 13 +++++++++++++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 14 ++++++++++++++ tools/include/uapi/linux/bpf.h | 13 +++++++++++++ 5 files changed, 42 insertions(+) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 20c26aed7896..a627a02cf8ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2349,6 +2349,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; +extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 534e33fb1029..7d1e2794d83e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5341,6 +5341,18 @@ union bpf_attr { * **-EACCES** if the SYN cookie is not valid. * * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * u64 bpf_ktime_get_tai_ns(void) + * Description + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * Return + * Current *ktime*. 
+ * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5551,6 +5563,7 @@ union bpf_attr { FN(tcp_raw_gen_syncookie_ipv6), \ FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ + FN(ktime_get_tai_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1e10d088dbb..639437f36928 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2623,6 +2623,7 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1f961f9982d2..a95eb9fb01ff 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -198,6 +198,18 @@ const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_tai_ns) +{ + /* NMI safe access to clock tai */ + return ktime_get_tai_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = { + .func = bpf_ktime_get_tai_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -1617,6 +1629,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ktime_get_tai_ns: + return &bpf_ktime_get_tai_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f58d58e1d547..e174ad28aeb7 100644 --- 
a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5341,6 +5341,18 @@ union bpf_attr { * **-EACCES** if the SYN cookie is not valid. * * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * u64 bpf_ktime_get_tai_ns(void) + * Description + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * Return + * Current *ktime*. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5551,6 +5563,7 @@ union bpf_attr { FN(tcp_raw_gen_syncookie_ipv6), \ FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ + FN(ktime_get_tai_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From a00ed8430199abbc9d9bf43ea31795bfe98998ca Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 7 Aug 2022 10:51:16 -0700 Subject: bpf: Always return corresponding btf_type in __get_type_size() Currently in function __get_type_size(), the corresponding btf_type is returned only in invalid cases. Let us always return btf_type regardless of valid or invalid cases. This new functionality will be used in subsequent patches.
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220807175116.4179242-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d3e4c86b8fcd..903719b89238 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5864,26 +5864,25 @@ again: } static int __get_type_size(struct btf *btf, u32 btf_id, - const struct btf_type **bad_type) + const struct btf_type **ret_type) { const struct btf_type *t; + *ret_type = btf_type_by_id(btf, 0); if (!btf_id) /* void */ return 0; t = btf_type_by_id(btf, btf_id); while (t && btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (!t) { - *bad_type = btf_type_by_id(btf, 0); + if (!t) return -EINVAL; - } + *ret_type = t; if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); if (btf_type_is_int(t) || btf_is_any_enum(t)) return t->size; - *bad_type = t; return -EINVAL; } -- cgit v1.2.3 From 883743422ced8c961ab05dc63ec81b75a4e56052 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 9 Aug 2022 14:40:54 -0700 Subject: bpf: Fix ref_obj_id for dynptr data slices in verifier When a data slice is obtained from a dynptr (through the bpf_dynptr_data API), the ref obj id of the dynptr must be found and then associated with the data slice. The ref obj id of the dynptr must be found *before* the caller saved regs are reset. Without this fix, the ref obj id tracking is not correct for dynptrs that are at an offset from the frame pointer. Please also note that the data slice's ref obj id must be assigned after the ret types are parsed, since RET_PTR_TO_ALLOC_MEM-type return regs get zero-marked. 
Fixes: 34d4ef5775f7 ("bpf: Add dynptr data slices") Signed-off-by: Joanne Koong Acked-by: David Vernet Link: https://lore.kernel.org/r/20220809214055.4050604-1-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 01e7f48b4d8c..28b02dc67a2a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -504,7 +504,7 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_request_sock; } -static bool is_dynptr_acquire_function(enum bpf_func_id func_id) +static bool is_dynptr_ref_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_dynptr_data; } @@ -518,7 +518,7 @@ static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, ref_obj_uses++; if (is_acquire_function(func_id, map)) ref_obj_uses++; - if (is_dynptr_acquire_function(func_id)) + if (is_dynptr_ref_function(func_id)) ref_obj_uses++; return ref_obj_uses > 1; @@ -7320,6 +7320,23 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } break; + case BPF_FUNC_dynptr_data: + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { + if (arg_type_is_dynptr(fn->arg_type[i])) { + if (meta.ref_obj_id) { + verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); + return -EFAULT; + } + /* Find the id of the dynptr we're tracking the reference of */ + meta.ref_obj_id = stack_slot_get_id(env, &regs[BPF_REG_1 + i]); + break; + } + } + if (i == MAX_BPF_FUNC_REG_ARGS) { + verbose(env, "verifier internal error: no dynptr in bpf_dynptr_data()\n"); + return -EFAULT; + } + break; } if (err) @@ -7457,7 +7474,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } - if (is_ptr_cast_function(func_id)) { + if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { /* For
release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; } else if (is_acquire_function(func_id, meta.map_ptr)) { @@ -7469,21 +7486,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].id = id; /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = id; - } else if (is_dynptr_acquire_function(func_id)) { - int dynptr_id = 0, i; - - /* Find the id of the dynptr we're tracking the reference of */ - for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { - if (arg_type_is_dynptr(fn->arg_type[i])) { - if (dynptr_id) { - verbose(env, "verifier internal error: multiple dynptr args in func\n"); - return -EFAULT; - } - dynptr_id = stack_slot_get_id(env, &regs[BPF_REG_1 + i]); - } - } - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = dynptr_id; } do_refine_retval_range(regs, fn->ret_type, func_id, &meta); -- cgit v1.2.3 From 4dd48c6f1f83290d4bc61b43e61d86f8bc6c310e Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Wed, 10 Aug 2022 08:59:03 +0200 Subject: bpf: add destructive kfunc flag Add KF_DESTRUCTIVE flag for destructive functions. Functions with this flag set will require CAP_SYS_BOOT capabilities. Signed-off-by: Artem Savkov Link: https://lore.kernel.org/r/20220810065905.475418-2-asavkov@redhat.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 9 +++++++++ include/linux/btf.h | 3 ++- kernel/bpf/verifier.c | 5 +++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index c8b21de1c772..781731749e55 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -152,6 +152,15 @@ ensure the integrity of the operation being performed on the expected object. The KF_SLEEPABLE flag is used for kfuncs that may sleep. Such kfuncs can only be called by sleepable BPF programs (BPF_F_SLEEPABLE).
+2.4.7 KF_DESTRUCTIVE flag +-------------------------- + +The KF_DESTRUCTIVE flag is used to indicate functions calling which is +destructive to the system. For example such a call can result in system +rebooting or panicking. Due to this additional restrictions apply to these +calls. At the moment they only require CAP_SYS_BOOT capability, but more can be +added later. + 2.5 Registering the kfuncs -------------------------- diff --git a/include/linux/btf.h b/include/linux/btf.h index 976cbdd2981f..ad93c2d9cc1c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -49,7 +49,8 @@ * for this case. */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ -#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ +#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ +#define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ struct btf; struct btf_member; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 28b02dc67a2a..2c1f8069f7b7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7584,6 +7584,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, func_name); return -EACCES; } + if (*kfunc_flags & KF_DESTRUCTIVE && !capable(CAP_SYS_BOOT)) { + verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capabilities\n"); + return -EACCES; + } + acq = *kfunc_flags & KF_ACQUIRE; /* Check the arguments */ -- cgit v1.2.3 From 133790596406ce2658f0864eb7eac64987c2b12f Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Wed, 10 Aug 2022 08:59:04 +0200 Subject: bpf: export crash_kexec() as destructive kfunc Allow properly marked bpf programs to call crash_kexec(). 
Signed-off-by: Artem Savkov Link: https://lore.kernel.org/r/20220810065905.475418-3-asavkov@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a95eb9fb01ff..3c1b9bbcf971 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1725,3 +1725,21 @@ bpf_base_func_proto(enum bpf_func_id func_id) return NULL; } } + +BTF_SET8_START(tracing_btf_ids) +#ifdef CONFIG_KEXEC_CORE +BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) +#endif +BTF_SET8_END(tracing_btf_ids) + +static const struct btf_kfunc_id_set tracing_kfunc_set = { + .owner = THIS_MODULE, + .set = &tracing_btf_ids, +}; + +static int __init kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &tracing_kfunc_set); +} + +late_initcall(kfunc_init); -- cgit v1.2.3 From 083818156d1e98f22b1ac612a3957bc553e7ba57 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 10 Aug 2022 15:18:26 +0000 Subject: bpf: Remove unneeded memset in queue_stack_map creation __GFP_ZERO will clear the memory, so we don't need to memset it. 
Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20220810151840.16394-2-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/queue_stack_maps.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index a1c0794ae49d..8a5e060de63b 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -78,8 +78,6 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) if (!qs) return ERR_PTR(-ENOMEM); - memset(qs, 0, sizeof(*qs)); - bpf_map_init_from_attr(&qs->map, attr); qs->size = size; -- cgit v1.2.3 From 8f58ee54c2eae790f50c51dfa64a153601451f08 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 10 Aug 2022 15:18:27 +0000 Subject: bpf: Use bpf_map_area_free instead of kvfree bpf_map_area_alloc() should be paired with bpf_map_area_free(). Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20220810151840.16394-3-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/ringbuf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index ded4faeca192..3fb54feb39d4 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -116,7 +116,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) err_free_pages: for (i = 0; i < nr_pages; i++) __free_page(pages[i]); - kvfree(pages); + bpf_map_area_free(pages); return NULL; } @@ -190,7 +190,7 @@ static void bpf_ringbuf_free(struct bpf_ringbuf *rb) vunmap(rb); for (i = 0; i < nr_pages; i++) __free_page(pages[i]); - kvfree(pages); + bpf_map_area_free(pages); } static void ringbuf_map_free(struct bpf_map *map) -- cgit v1.2.3 From 992c9e13f5939437037627c67bcb51e674b64265 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 10 Aug 2022 15:18:28 +0000 Subject: bpf: Make __GFP_NOWARN consistent in bpf map creation Some of the bpf maps are created with
__GFP_NOWARN, i.e. arraymap, bloom_filter, bpf_local_storage, bpf_struct_ops, lpm_trie, queue_stack_maps, reuseport_array, stackmap and xskmap, while others are created without __GFP_NOWARN, i.e. cpumap, devmap, hashtab, local_storage, offload, ringbuf and sock_map. But there are no key differences between the creation of these maps, so let's make this allocation flag consistent across all bpf map creation. Then we can use a generic helper to alloc all bpf maps. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20220810151840.16394-4-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 2 +- kernel/bpf/devmap.c | 2 +- kernel/bpf/hashtab.c | 2 +- kernel/bpf/local_storage.c | 4 ++-- kernel/bpf/offload.c | 2 +- kernel/bpf/ringbuf.c | 2 +- net/core/sock_map.c | 4 ++-- 7 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index f4860ac756cd..b25ca9d603a6 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -97,7 +97,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); - cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT); + cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!cmap) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a0e02b009487..88feaa094de8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -163,7 +163,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); - dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT); + dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!dtab) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index da7578426a46..f1e5303fe26e 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -495,7 +495,7 @@ static struct bpf_map *htab_map_alloc(union
bpf_attr *attr) struct bpf_htab *htab; int err, i; - htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); + htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!htab) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 49ef0ce040c7..a64255e20f87 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -313,8 +313,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); - map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), - __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node); + map = kzalloc_node(sizeof(struct bpf_cgroup_storage_map), + GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT, numa_node); if (!map) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index bd09290e3648..5a629a1b971c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -372,7 +372,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) attr->map_type != BPF_MAP_TYPE_HASH) return ERR_PTR(-EINVAL); - offmap = kzalloc(sizeof(*offmap), GFP_USER); + offmap = kzalloc(sizeof(*offmap), GFP_USER | __GFP_NOWARN); if (!offmap) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 3fb54feb39d4..df8062cb258c 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -164,7 +164,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) return ERR_PTR(-E2BIG); #endif - rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT); + rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!rb_map) return ERR_PTR(-ENOMEM); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 028813dfecb0..763d77162d0c 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -41,7 +41,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return 
ERR_PTR(-EINVAL); - stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT); + stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!stab) return ERR_PTR(-ENOMEM); @@ -1076,7 +1076,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); - htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); + htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!htab) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 73cf09a36bf7bfb3e5a3ff23755c36d49137c44d Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 10 Aug 2022 15:18:29 +0000 Subject: bpf: Use bpf_map_area_alloc consistently on bpf map creation Let's use the generic helper bpf_map_area_alloc() instead of the open-coded kzalloc helpers in bpf maps creation path. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20220810151840.16394-5-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 6 +++--- kernel/bpf/cpumap.c | 6 +++--- kernel/bpf/devmap.c | 6 +++--- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/local_storage.c | 5 ++--- kernel/bpf/lpm_trie.c | 4 ++-- kernel/bpf/offload.c | 6 +++--- kernel/bpf/ringbuf.c | 6 +++--- net/core/sock_map.c | 12 ++++++------ 9 files changed, 28 insertions(+), 29 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 8ce40fd869f6..4ee2e7286c23 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -582,7 +582,7 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, synchronize_rcu(); kvfree(smap->buckets); - kfree(smap); + bpf_map_area_free(smap); } int bpf_local_storage_map_alloc_check(union bpf_attr *attr) @@ -610,7 +610,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) unsigned int i; u32 nbuckets; - smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + smap = 
bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); @@ -623,7 +623,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!smap->buckets) { - kfree(smap); + bpf_map_area_free(smap); return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b25ca9d603a6..b5ba34ddd4b6 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -97,7 +97,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); - cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE); if (!cmap) return ERR_PTR(-ENOMEM); @@ -118,7 +118,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; free_cmap: - kfree(cmap); + bpf_map_area_free(cmap); return ERR_PTR(err); } @@ -623,7 +623,7 @@ static void cpu_map_free(struct bpf_map *map) __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } bpf_map_area_free(cmap->cpu_map); - kfree(cmap); + bpf_map_area_free(cmap); } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 88feaa094de8..f9a87dcc5535 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -163,13 +163,13 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); - dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE); if (!dtab) return ERR_PTR(-ENOMEM); err = dev_map_init_map(dtab, attr); if (err) { - kfree(dtab); + bpf_map_area_free(dtab); return ERR_PTR(err); } @@ -240,7 +240,7 @@ static void dev_map_free(struct bpf_map *map) bpf_map_area_free(dtab->netdev_map); } - 
kfree(dtab); + bpf_map_area_free(dtab); } static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index f1e5303fe26e..8392f7f8a8ac 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -495,7 +495,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) struct bpf_htab *htab; int err, i; - htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); @@ -579,7 +579,7 @@ free_map_locked: bpf_map_area_free(htab->buckets); free_htab: lockdep_unregister_key(&htab->lockdep_key); - kfree(htab); + bpf_map_area_free(htab); return ERR_PTR(err); } @@ -1496,7 +1496,7 @@ static void htab_map_free(struct bpf_map *map) for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); lockdep_unregister_key(&htab->lockdep_key); - kfree(htab); + bpf_map_area_free(htab); } static void htab_map_seq_show_elem(struct bpf_map *map, void *key, diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index a64255e20f87..098cf336fae6 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -313,8 +313,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); - map = kzalloc_node(sizeof(struct bpf_cgroup_storage_map), - GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT, numa_node); + map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node); if (!map) return ERR_PTR(-ENOMEM); @@ -346,7 +345,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map) WARN_ON(!RB_EMPTY_ROOT(&map->root)); WARN_ON(!list_empty(&map->list)); - kfree(map); + bpf_map_area_free(map); } static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index d789e3b831ad..d833496e9e42 100644 --- 
a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -558,7 +558,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) attr->value_size > LPM_VAL_SIZE_MAX) return ERR_PTR(-EINVAL); - trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + trie = bpf_map_area_alloc(sizeof(*trie), NUMA_NO_NODE); if (!trie) return ERR_PTR(-ENOMEM); @@ -609,7 +609,7 @@ static void trie_free(struct bpf_map *map) } out: - kfree(trie); + bpf_map_area_free(trie); } static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 5a629a1b971c..13e4efc971e6 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -372,7 +372,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) attr->map_type != BPF_MAP_TYPE_HASH) return ERR_PTR(-EINVAL); - offmap = kzalloc(sizeof(*offmap), GFP_USER | __GFP_NOWARN); + offmap = bpf_map_area_alloc(sizeof(*offmap), NUMA_NO_NODE); if (!offmap) return ERR_PTR(-ENOMEM); @@ -404,7 +404,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) err_unlock: up_write(&bpf_devs_lock); rtnl_unlock(); - kfree(offmap); + bpf_map_area_free(offmap); return ERR_PTR(err); } @@ -428,7 +428,7 @@ void bpf_map_offload_map_free(struct bpf_map *map) up_write(&bpf_devs_lock); rtnl_unlock(); - kfree(offmap); + bpf_map_area_free(offmap); } int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index df8062cb258c..b483aea35f41 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -164,7 +164,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) return ERR_PTR(-E2BIG); #endif - rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE); if (!rb_map) return ERR_PTR(-ENOMEM); @@ -172,7 +172,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) rb_map->rb = 
bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); if (!rb_map->rb) { - kfree(rb_map); + bpf_map_area_free(rb_map); return ERR_PTR(-ENOMEM); } @@ -199,7 +199,7 @@ static void ringbuf_map_free(struct bpf_map *map) rb_map = container_of(map, struct bpf_ringbuf_map, map); bpf_ringbuf_free(rb_map->rb); - kfree(rb_map); + bpf_map_area_free(rb_map); } static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 763d77162d0c..d0c43384d8bf 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -41,7 +41,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE); if (!stab) return ERR_PTR(-ENOMEM); @@ -52,7 +52,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { - kfree(stab); + bpf_map_area_free(stab); return ERR_PTR(-ENOMEM); } @@ -361,7 +361,7 @@ static void sock_map_free(struct bpf_map *map) synchronize_rcu(); bpf_map_area_free(stab->sks); - kfree(stab); + bpf_map_area_free(stab); } static void sock_map_release_progs(struct bpf_map *map) @@ -1076,7 +1076,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); - htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); @@ -1106,7 +1106,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) return &htab->map; free_htab: - kfree(htab); + bpf_map_area_free(htab); return ERR_PTR(err); } @@ -1159,7 +1159,7 @@ static void sock_hash_free(struct bpf_map *map) synchronize_rcu(); bpf_map_area_free(htab->buckets); - kfree(htab); + bpf_map_area_free(htab); } static void 
*sock_hash_lookup_sys(struct bpf_map *map, void *key) -- cgit v1.2.3 From 2b5a2ecbfdc507af3f2f032bfe7366fba4dabff0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:17:51 -0700 Subject: bpf: Initialize the bpf_run_ctx in bpf_iter_run_prog() The bpf-iter-prog for tcp and unix sk can do bpf_setsockopt() which needs has_current_bpf_ctx() to decide if it is called by a bpf prog. This patch initializes the bpf_run_ctx in bpf_iter_run_prog() for the has_current_bpf_ctx() to use. Acked-by: Andrii Nakryiko Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061751.4177657-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_iter.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 97bb57493ed5..5dc307bdeaeb 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -694,19 +694,24 @@ struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) { + struct bpf_run_ctx run_ctx, *old_run_ctx; int ret; if (prog->aux->sleepable) { rcu_read_lock_trace(); migrate_disable(); might_fault(); + old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); + bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock_trace(); } else { rcu_read_lock(); migrate_disable(); + old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); + bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock(); } -- cgit v1.2.3 From dea6a4e17013382b20717664ebf3d7cc405e0952 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:51 -0700 Subject: bpf: Introduce cgroup_{common,current}_func_proto Split cgroup_base_func_proto into the following: * cgroup_common_func_proto - common helpers for all cgroup hooks * cgroup_current_func_proto - common helpers for all cgroup hooks running in the process 
context (== have meaningful 'current'). Move bpf_{g,s}et_retval and other cgroup-related helpers into kernel/bpf/cgroup.c so they are closer to where they are being used. Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220823222555.523590-2-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 17 +++++++ kernel/bpf/cgroup.c | 117 +++++++++++++++++++++++++++++++++++++-------- kernel/bpf/helpers.c | 34 ------------- 3 files changed, 115 insertions(+), 53 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 2bd1b5f8de9b..57e9e109257e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -414,6 +414,11 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); + +const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); +const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); #else static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } @@ -444,6 +449,18 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return NULL; +} + +static inline const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return NULL; +} + static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 59b7eb60d5b4..189380ec452f 100644 --- a/kernel/bpf/cgroup.c +++
b/kernel/bpf/cgroup.c @@ -1527,6 +1527,37 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, return ret; } +BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) +{ + /* flags argument is not used now, + * but provides an ability to extend the API. + * verifier checks that its value is correct. + */ + enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + struct bpf_cgroup_storage *storage; + struct bpf_cg_run_ctx *ctx; + void *ptr; + + /* get current cgroup storage from BPF run context */ + ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + storage = ctx->prog_item->cgroup_storage[stype]; + + if (stype == BPF_CGROUP_STORAGE_SHARED) + ptr = &READ_ONCE(storage->buf)->data[0]; + else + ptr = this_cpu_ptr(storage->percpu_buf); + + return (unsigned long)ptr; +} + +const struct bpf_func_proto bpf_get_local_storage_proto = { + .func = bpf_get_local_storage, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_0(bpf_get_retval) { struct bpf_cg_run_ctx *ctx = @@ -1558,32 +1589,26 @@ const struct bpf_func_proto bpf_set_retval_proto = { }; static const struct bpf_func_proto * -cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_retval: - return 
&bpf_get_retval_proto; - case BPF_FUNC_set_retval: - return &bpf_set_retval_proto; default: return bpf_base_func_proto(func_id); } } -static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - return cgroup_base_func_proto(func_id, prog); -} - static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -2096,6 +2121,16 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { case BPF_FUNC_strtol: return &bpf_strtol_proto; @@ -2111,8 +2146,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_set_new_value_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return cgroup_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -2233,6 +2270,16 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = { static const struct bpf_func_proto * cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_get_netns_cookie: @@ -2254,8 +2301,10 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif + 
case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return cgroup_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -2420,3 +2469,33 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = { const struct bpf_prog_ops cg_sockopt_prog_ops = { }; + +/* Common helpers for cgroup hooks. */ +const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; + case BPF_FUNC_get_retval: + return &bpf_get_retval_proto; + case BPF_FUNC_set_retval: + return &bpf_set_retval_proto; + default: + return NULL; + } +} + +/* Common helpers for cgroup hooks with valid process context. */ +const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + default: + return NULL; + } +} diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 3c1b9bbcf971..6439a877c18b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -428,40 +428,6 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { .arg1_type = ARG_ANYTHING, }; -#ifdef CONFIG_CGROUP_BPF - -BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) -{ - /* flags argument is not used now, - * but provides an ability to extend the API. - * verifier checks that its value is correct. 
- */ - enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage; - struct bpf_cg_run_ctx *ctx; - void *ptr; - - /* get current cgroup storage from BPF run context */ - ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); - storage = ctx->prog_item->cgroup_storage[stype]; - - if (stype == BPF_CGROUP_STORAGE_SHARED) - ptr = &READ_ONCE(storage->buf)->data[0]; - else - ptr = this_cpu_ptr(storage->percpu_buf); - - return (unsigned long)ptr; -} - -const struct bpf_func_proto bpf_get_local_storage_proto = { - .func = bpf_get_local_storage, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_ANYTHING, -}; -#endif - #define BPF_STRTOX_BASE_MASK 0x1F static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, -- cgit v1.2.3 From bed89185af0de0d417e29ca1798df50f161b0231 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:52 -0700 Subject: bpf: Use cgroup_{common,current}_func_proto in more hooks The following hooks are per-cgroup hooks but they are not using cgroup_{common,current}_func_proto, fix it: * BPF_PROG_TYPE_CGROUP_SKB (cg_skb) * BPF_PROG_TYPE_CGROUP_SOCK_ADDR (cg_sock_addr) * BPF_PROG_TYPE_CGROUP_SOCK (cg_sock) * BPF_PROG_TYPE_LSM+BPF_LSM_CGROUP Also: * move common func_proto's into cgroup func_proto handlers * make sure bpf_{g,s}et_retval are not accessible from recvmsg, getpeername and getsockname (return/errno is ignored in these places) * as a side effect, expose get_current_pid_tgid, get_current_comm_proto, get_current_ancestor_cgroup_id, get_cgroup_classid to more cgroup hooks Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220823222555.523590-3-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/bpf_lsm.c | 17 ++++++----- kernel/bpf/cgroup.c | 40 ++++++++++++++++++++++++-- net/core/filter.c | 80 
++++++++++++++++++++++------------------------------ 4 files changed, 80 insertions(+), 58 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 39bd36359c1e..99fc7a64564f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2375,6 +2375,7 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index fa71d58b7ded..5a9743001ceb 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -189,6 +189,14 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto = { static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + if (prog->expected_attach_type == BPF_LSM_CGROUP) { + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + } + switch (func_id) { case BPF_FUNC_inode_storage_get: return &bpf_inode_storage_get_proto; @@ -212,15 +220,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; case BPF_FUNC_get_attach_cookie: return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; - case BPF_FUNC_get_local_storage: - return prog->expected_attach_type == BPF_LSM_CGROUP ? - &bpf_get_local_storage_proto : NULL; - case BPF_FUNC_set_retval: - return prog->expected_attach_type == BPF_LSM_CGROUP ? 
- &bpf_set_retval_proto : NULL; - case BPF_FUNC_get_retval: - return prog->expected_attach_type == BPF_LSM_CGROUP ? - &bpf_get_retval_proto : NULL; #ifdef CONFIG_NET case BPF_FUNC_setsockopt: if (prog->expected_attach_type != BPF_LSM_CGROUP) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 189380ec452f..0bf2d70adfdb 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2478,9 +2478,35 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_get_retval: - return &bpf_get_retval_proto; + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + return NULL; + default: + return &bpf_get_retval_proto; + } case BPF_FUNC_set_retval: - return &bpf_set_retval_proto; + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + return NULL; + default: + return &bpf_set_retval_proto; + } default: return NULL; } @@ -2493,8 +2519,18 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return 
&bpf_get_current_ancestor_cgroup_id_proto; +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif default: return NULL; } diff --git a/net/core/filter.c b/net/core/filter.c index 1acfaffeaf32..63e25d8ce501 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3009,7 +3009,7 @@ BPF_CALL_0(bpf_get_cgroup_classid_curr) return __task_get_classid(current); } -static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { +const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .func = bpf_get_cgroup_classid_curr, .gpl_only = false, .ret_type = RET_INTEGER, @@ -7581,34 +7581,23 @@ const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - /* inet and inet6 sockets are created in a process - * context so there is always a valid uid/gid - */ - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - 
case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: @@ -7621,12 +7610,17 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) static const struct bpf_func_proto * sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - /* inet and inet6 sockets are created in a process - * context so there is always a valid uid/gid - */ - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; case BPF_FUNC_bind: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_CONNECT: @@ -7639,24 +7633,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_addr_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; @@ -7737,9 +7715,13 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; static const struct 
bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: @@ -7979,6 +7961,12 @@ const struct bpf_func_proto bpf_sock_hash_update_proto __weak; static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sock_ops_setsockopt_proto; @@ -7992,8 +7980,6 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: -- cgit v1.2.3 From 8a67f2de9b1dc3cf8b75b4bf589efb1f08e3e9b8 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:53 -0700 Subject: bpf: expose bpf_strtol and bpf_strtoul to all program types bpf_strncmp is already exposed everywhere. The motivation is to keep those helpers in kernel/bpf/helpers.c. Otherwise it's tempting to move them under kernel/bpf/cgroup.c because they are currently only used by sysctl prog types. 
Suggested-by: Martin KaFai Lau Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220823222555.523590-4-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 4 ---- kernel/bpf/helpers.c | 6 +++++- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 0bf2d70adfdb..121b5a5edb64 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2132,10 +2132,6 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return func_proto; switch (func_id) { - case BPF_FUNC_strtol: - return &bpf_strtol_proto; - case BPF_FUNC_strtoul: - return &bpf_strtoul_proto; case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6439a877c18b..2f4709378740 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -427,6 +427,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; +#endif /* CONFIG_CGROUPS */ #define BPF_STRTOX_BASE_MASK 0x1F @@ -555,7 +556,6 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, }; -#endif BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) { @@ -1619,6 +1619,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_loop_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; + case BPF_FUNC_strtol: + return &bpf_strtol_proto; + case BPF_FUNC_strtoul: + return &bpf_strtoul_proto; case BPF_FUNC_dynptr_from_mem: return &bpf_dynptr_from_mem_proto; case BPF_FUNC_dynptr_read: -- cgit v1.2.3 From 5679ff2f138f77b281c468959dc5022cc524d400 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Aug 2022 03:31:17 +0200 Subject: bpf: Move bpf_loop and bpf_for_each_map_elem under CAP_BPF They would require func_info 
which needs prog BTF anyway. Loading BTF and setting the prog btf_fd while loading the prog indirectly requires CAP_BPF, so just to reduce confusion, move both these helpers taking callback under bpf_capable() protection as well, since they cannot be used without CAP_BPF. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220823013117.24916-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2f4709378740..fc08035f14ed 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1613,10 +1613,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_submit_dynptr_proto; case BPF_FUNC_ringbuf_discard_dynptr: return &bpf_ringbuf_discard_dynptr_proto; - case BPF_FUNC_for_each_map_elem: - return &bpf_for_each_map_elem_proto; - case BPF_FUNC_loop: - return &bpf_loop_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; case BPF_FUNC_strtol: @@ -1659,6 +1655,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_timer_cancel_proto; case BPF_FUNC_kptr_xchg: return &bpf_kptr_xchg_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; + case BPF_FUNC_loop: + return &bpf_loop_proto; default: break; } -- cgit v1.2.3 From 9d9d00ac29d0ef7ce426964de46fa6b380357d0a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Aug 2022 03:31:25 +0200 Subject: bpf: Fix reference state management for synchronous callbacks Currently, verifier verifies callback functions (sync and async) as if they will be executed once, (i.e. it explores execution state as if the function was being called once). The next insn to explore is set to start of subprog and the exit from nested frame is handled using curframe > 0 and prepare_func_exit. 
In the case of an async callback, it uses a customized variant of push_stack simulating a kind of branch to set up custom state and execution context for the async callback. While this approach is simple and works when the callback really will be executed only once, it is unsafe for all of our current helpers which are for_each style, i.e. they execute the callback multiple times. A callback releasing acquired references of the caller may do so multiple times, but currently the verifier sees it as one call inside the frame, which then returns to the caller. Hence, it thinks it released some reference that the cb, e.g., got access to through callback_ctx (register filled inside cb from spilled typed register on stack). Similarly, it may see that an acquire call is unpaired inside the callback, so the caller will copy the reference state of the callback and then will have to release the register with new ref_obj_ids. But again, the callback may execute multiple times, but the verifier will only account for acquired references for a single symbolic execution of the callback, which will cause leaks. Note that for the async callback case, things are different. While currently we have bpf_timer_set_callback which only executes it once, even for multiple executions it would be safe, as reference state is NULL and check_reference_leak would force the program to release state before BPF_EXIT. The state is also unaffected by analysis for the caller frame. Hence, the async callback case is safe. Since we want the reference state to be accessible, e.g. for pointers loaded from stack through callback_ctx's PTR_TO_STACK, we still have to copy the caller's reference_state to the callback's bpf_func_state, but we enforce that whatever references it adds to that reference_state have been released before it hits BPF_EXIT. This requires introducing a new callback_ref member in the reference state to distinguish between caller vs callee references.
Hence, check_reference_leak now errors out if it sees we are in callback_fn and we have not released callback_ref refs. Since there can be multiple nested callbacks, like frame 0 -> cb1 -> cb2 etc. we need to also distinguish between whether this particular ref belongs to this callback frame or parent, and only error for our own, so we store state->frameno (which is always non-zero for callbacks). In short, callbacks can read parent reference_state, but cannot mutate it, to be able to use pointers acquired by the caller. They must only undo their changes (by releasing their own acquired_refs before BPF_EXIT) on top of caller reference_state before returning (at which point the caller and callback state will match anyway, so no need to copy it back to caller). Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220823013125.24938-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 11 +++++++++++ kernel/bpf/verifier.c | 42 +++++++++++++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2e3bad8640dc..1fdddbf3546b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -212,6 +212,17 @@ struct bpf_reference_state { * is used purely to inform the user of a reference leak. */ int insn_idx; + /* There can be a case like: + * main (frame 0) + * cb (frame 1) + * func (frame 3) + * cb (frame 4) + * Hence for frame 4, if callback_ref just stored boolean, it would be + * impossible to distinguish nested callback refs. Hence store the + * frameno and compare that to callback_ref in check_reference_leak when + * exiting a callback function. 
+ */ + int callback_ref; }; /* state of the program: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2c1f8069f7b7..0194a36d0b36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1092,6 +1092,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) id = ++env->id_gen; state->refs[new_ofs].id = id; state->refs[new_ofs].insn_idx = insn_idx; + state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0; return id; } @@ -1104,6 +1105,9 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id) last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].id == ptr_id) { + /* Cannot release caller references in callbacks */ + if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) + return -EINVAL; if (last_idx && i != last_idx) memcpy(&state->refs[i], &state->refs[last_idx], sizeof(*state->refs)); @@ -6915,10 +6919,17 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller->regs[BPF_REG_0] = *r0; } - /* Transfer references to the caller */ - err = copy_reference_state(caller, callee); - if (err) - return err; + /* callback_fn frame should have released its own additions to parent's + * reference state at this point, or check_reference_leak would + * complain, hence it must be the same as the caller. There is no need + * to copy it back. 
+ */ + if (!callee->in_callback_fn) { + /* Transfer references to the caller */ + err = copy_reference_state(caller, callee); + if (err) + return err; + } *insn_idx = callee->callsite + 1; if (env->log.level & BPF_LOG_LEVEL) { @@ -7042,13 +7053,20 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, static int check_reference_leak(struct bpf_verifier_env *env) { struct bpf_func_state *state = cur_func(env); + bool refs_lingering = false; int i; + if (state->frameno && !state->in_callback_fn) + return 0; + for (i = 0; i < state->acquired_refs; i++) { + if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) + continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); + refs_lingering = true; } - return state->acquired_refs ? -EINVAL : 0; + return refs_lingering ? -EINVAL : 0; } static int check_bpf_snprintf_call(struct bpf_verifier_env *env, @@ -12337,6 +12355,16 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } + /* We must do check_reference_leak here before + * prepare_func_exit to handle the case when + * state->curframe > 0, it may be a callback + * function, for which reference_state must + * match caller reference state when it exits. + */ + err = check_reference_leak(env); + if (err) + return err; + if (state->curframe) { /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); @@ -12346,10 +12374,6 @@ static int do_check(struct bpf_verifier_env *env) continue; } - err = check_reference_leak(env); - if (err) - return err; - err = check_return_code(env); if (err) return err; -- cgit v1.2.3 From d4ccaf58a8472123ac97e6db03932c375b5c45ba Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Wed, 24 Aug 2022 16:31:13 -0700 Subject: bpf: Introduce cgroup iter Cgroup_iter is a type of bpf_iter. It walks over cgroups in four modes: - walking a cgroup's descendants in pre-order. - walking a cgroup's descendants in post-order. 
- walking a cgroup's ancestors. - processing only the given cgroup. When attaching cgroup_iter, one can set a cgroup to the iter_link created from attaching. This cgroup is passed as a file descriptor or cgroup id and serves as the starting point of the walk. If no cgroup is specified, the starting point will be the root cgroup v2. For walking descendants, one can specify the order: either pre-order or post-order. For walking ancestors, the walk starts at the specified cgroup and ends at the root. One can also terminate the walk early by returning 1 from the iter program. Note that because walking the cgroup hierarchy holds cgroup_mutex, the iter program is called with cgroup_mutex held. Currently only one session is supported, which means, depending on the volume of data the bpf program intends to send to user space, the number of cgroups that can be walked is limited. For example, given the current buffer size is 8 * PAGE_SIZE, if the program sends 64B of data for each cgroup, assuming PAGE_SIZE is 4KB, the total number of cgroups that can be walked is 512. This is a limitation of cgroup_iter. If the output data is larger than the kernel buffer size, after all data in the kernel buffer is consumed by user space, the subsequent read() syscall will signal EOPNOTSUPP. To work around this, the user may have to update their program to reduce the volume of data sent to output. For example, skip some uninteresting cgroups. In the future, we may extend bpf_iter flags to allow customizing the buffer size.
Acked-by: Yonghong Song Acked-by: Tejun Heo Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220824233117.1312810-2-haoluo@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 + include/uapi/linux/bpf.h | 30 +++ kernel/bpf/Makefile | 3 + kernel/bpf/cgroup_iter.c | 284 ++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 30 +++ tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 +- 6 files changed, 357 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/cgroup_iter.c (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 99fc7a64564f..9c1674973e03 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -48,6 +48,7 @@ struct mem_cgroup; struct module; struct bpf_func_state; struct ftrace_ops; +struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1730,7 +1731,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags); int __init bpf_iter_ ## target(args) { return 0; } struct bpf_iter_aux_info { + /* for map_elem iter */ struct bpf_map *map; + + /* for cgroup iter */ + struct { + struct cgroup *start; /* starting cgroup */ + enum bpf_cgroup_iter_order order; + } cgroup; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 644600dbb114..0f61f09f467a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -87,10 +87,29 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; +enum bpf_cgroup_iter_order { + BPF_ITER_ORDER_UNSPEC = 0, + BPF_ITER_SELF_ONLY, /* process only a single object. */ + BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. 
*/ +}; + union bpf_iter_link_info { struct { __u32 map_fd; } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ @@ -6176,11 +6195,22 @@ struct bpf_link_info { struct { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. + */ union { struct { __u32 map_id; } map; }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + }; } iter; struct { __u32 netns_ino; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 057ba8e01e70..00e05b69a4df 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -24,6 +24,9 @@ endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +ifeq ($(CONFIG_CGROUPS),y) +obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o +endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c new file mode 100644 index 000000000000..cf6d763a57d5 --- /dev/null +++ b/kernel/bpf/cgroup_iter.c @@ -0,0 +1,284 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Google */ +#include +#include +#include +#include +#include + +#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ + +/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. + * + * 1. Walk the descendants of a cgroup in pre-order. + * 2. Walk the descendants of a cgroup in post-order. + * 3. Walk the ancestors of a cgroup. + * 4. 
Show the given cgroup only. + * + * For walking descendants, cgroup_iter can walk in either pre-order or + * post-order. For walking ancestors, the iter walks up from a cgroup to + * the root. + * + * The iter program can terminate the walk early by returning 1. Walk + * continues if prog returns 0. + * + * The prog can check (seq->num == 0) to determine whether this is + * the first element. The prog may also be passed a NULL cgroup, + * which means the walk has completed and the prog has a chance to + * do post-processing, such as outputting an epilogue. + * + * Note: the iter_prog is called with cgroup_mutex held. + * + * Currently only one session is supported, which means, depending on the + * volume of data bpf program intends to send to user space, the number + * of cgroups that can be walked is limited. For example, given the current + * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each + * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can + * be walked is 512. This is a limitation of cgroup_iter. If the output data + * is larger than the kernel buffer size, after all data in the kernel buffer + * is consumed by user space, the subsequent read() syscall will signal + * EOPNOTSUPP. In order to work around, the user may have to update their + * program to reduce the volume of data sent to output. For example, skip + * some uninteresting cgroups. + */ + +struct bpf_iter__cgroup { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct cgroup *, cgroup); +}; + +struct cgroup_iter_priv { + struct cgroup_subsys_state *start_css; + bool visited_all; + bool terminate; + int order; +}; + +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_lock(&cgroup_mutex); + + /* cgroup_iter doesn't support read across multiple sessions. 
*/ + if (*pos > 0) { + if (p->visited_all) + return NULL; + + /* Haven't visited all, but because cgroup_mutex has dropped, + * return -EOPNOTSUPP to indicate incomplete iteration. + */ + return ERR_PTR(-EOPNOTSUPP); + } + + ++*pos; + p->terminate = false; + p->visited_all = false; + if (p->order == BPF_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(NULL, p->start_css); + else if (p->order == BPF_ITER_DESCENDANTS_POST) + return css_next_descendant_post(NULL, p->start_css); + else if (p->order == BPF_ITER_ANCESTORS_UP) + return p->start_css; + else /* BPF_ITER_SELF_ONLY */ + return p->start_css; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop); + +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_unlock(&cgroup_mutex); + + /* pass NULL to the prog for post-processing */ + if (!v) { + __cgroup_iter_seq_show(seq, NULL, true); + p->visited_all = true; + } +} + +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v; + struct cgroup_iter_priv *p = seq->private; + + ++*pos; + if (p->terminate) + return NULL; + + if (p->order == BPF_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(curr, p->start_css); + else if (p->order == BPF_ITER_DESCENDANTS_POST) + return css_next_descendant_post(curr, p->start_css); + else if (p->order == BPF_ITER_ANCESTORS_UP) + return curr->parent; + else /* BPF_ITER_SELF_ONLY */ + return NULL; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop) +{ + struct cgroup_iter_priv *p = seq->private; + struct bpf_iter__cgroup ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + /* cgroup is dead, skip this element */ + if (css && cgroup_is_dead(css->cgroup)) + return 0; + + ctx.meta = &meta; + ctx.cgroup = css ? 
css->cgroup : NULL; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + /* if prog returns > 0, terminate after this element. */ + if (ret != 0) + p->terminate = true; + + return 0; +} + +static int cgroup_iter_seq_show(struct seq_file *seq, void *v) +{ + return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v, + false); +} + +static const struct seq_operations cgroup_iter_seq_ops = { + .start = cgroup_iter_seq_start, + .next = cgroup_iter_seq_next, + .stop = cgroup_iter_seq_stop, + .show = cgroup_iter_seq_show, +}; + +BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup) + +static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) +{ + struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; + struct cgroup *cgrp = aux->cgroup.start; + + p->start_css = &cgrp->self; + p->terminate = false; + p->visited_all = false; + p->order = aux->cgroup.order; + return 0; +} + +static const struct bpf_iter_seq_info cgroup_iter_seq_info = { + .seq_ops = &cgroup_iter_seq_ops, + .init_seq_private = cgroup_iter_seq_init, + .seq_priv_size = sizeof(struct cgroup_iter_priv), +}; + +static int bpf_iter_attach_cgroup(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + int fd = linfo->cgroup.cgroup_fd; + u64 id = linfo->cgroup.cgroup_id; + int order = linfo->cgroup.order; + struct cgroup *cgrp; + + if (order != BPF_ITER_DESCENDANTS_PRE && + order != BPF_ITER_DESCENDANTS_POST && + order != BPF_ITER_ANCESTORS_UP && + order != BPF_ITER_SELF_ONLY) + return -EINVAL; + + if (fd && id) + return -EINVAL; + + if (fd) + cgrp = cgroup_get_from_fd(fd); + else if (id) + cgrp = cgroup_get_from_id(id); + else /* walk the entire hierarchy by default. 
*/ + cgrp = cgroup_get_from_path("/"); + + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + aux->cgroup.start = cgrp; + aux->cgroup.order = order; + return 0; +} + +static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux) +{ + cgroup_put(aux->cgroup.start); +} + +static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + char *buf; + + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + seq_puts(seq, "cgroup_path:\t\n"); + goto show_order; + } + + /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path + * will print nothing. + * + * Path is in the calling process's cgroup namespace. + */ + cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + seq_printf(seq, "cgroup_path:\t%s\n", buf); + kfree(buf); + +show_order: + if (aux->cgroup.order == BPF_ITER_DESCENDANTS_PRE) + seq_puts(seq, "order: descendants_pre\n"); + else if (aux->cgroup.order == BPF_ITER_DESCENDANTS_POST) + seq_puts(seq, "order: descendants_post\n"); + else if (aux->cgroup.order == BPF_ITER_ANCESTORS_UP) + seq_puts(seq, "order: ancestors_up\n"); + else /* BPF_ITER_SELF_ONLY */ + seq_puts(seq, "order: self_only\n"); +} + +static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info) +{ + info->iter.cgroup.order = aux->cgroup.order; + info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start); + return 0; +} + +DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta, + struct cgroup *cgroup) + +static struct bpf_iter_reg bpf_cgroup_reg_info = { + .target = "cgroup", + .feature = BPF_ITER_RESCHED, + .attach_target = bpf_iter_attach_cgroup, + .detach_target = bpf_iter_detach_cgroup, + .show_fdinfo = bpf_iter_cgroup_show_fdinfo, + .fill_link_info = bpf_iter_cgroup_fill_link_info, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__cgroup, cgroup), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &cgroup_iter_seq_info, +}; + +static int __init 
bpf_cgroup_iter_init(void) +{ + bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0]; + return bpf_iter_reg_target(&bpf_cgroup_reg_info); +} + +late_initcall(bpf_cgroup_iter_init); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4fb685591035..5056cef2112f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -87,10 +87,29 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; +enum bpf_cgroup_iter_order { + BPF_ITER_ORDER_UNSPEC = 0, + BPF_ITER_SELF_ONLY, /* process only a single object. */ + BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ +}; + union bpf_iter_link_info { struct { __u32 map_fd; } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ @@ -6176,11 +6195,22 @@ struct bpf_link_info { struct { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. 
+ */ union { struct { __u32 map_id; } map; }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + }; } iter; struct { __u32 netns_ino; diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 5fce7008d1ff..84c1cfaa2b02 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -764,8 +764,8 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, /* union with nested struct */ TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT, - "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},}", - { .map = { .map_fd = 1 }}); + "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (__u32)1,.cgroup_fd = (__u32)1,},}", + { .cgroup = { .order = 1, .cgroup_fd = 1, }}); /* struct skb with nested structs/unions; because type output is so * complex, we don't do a string comparison, just verify we return -- cgit v1.2.3 From d4ffb6f39f1a1b260966b43a4ffdb64779c650dd Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 25 Aug 2022 15:39:36 -0700 Subject: bpf: Add CGROUP prefix to cgroup_iter_order bpf_cgroup_iter_order is globally visible but the entries do not have CGROUP prefix. As requested by Andrii, put a CGROUP in the names in bpf_cgroup_iter_order. This patch fixes two previous commits: one introduced the API and the other uses the API in bpf selftest (that is, the selftest cgroup_hierarchical_stats). 
I tested this patch via the following command: test_progs -t cgroup,iter,btf_dump Fixes: d4ccaf58a847 ("bpf: Introduce cgroup iter") Fixes: 88886309d2e8 ("selftests/bpf: add a selftest for cgroup hierarchical stats collection") Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220825223936.1865810-1-haoluo@google.com Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 10 +++---- kernel/bpf/cgroup_iter.c | 32 +++++++++++----------- tools/include/uapi/linux/bpf.h | 10 +++---- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 2 +- .../bpf/prog_tests/cgroup_hierarchical_stats.c | 2 +- .../testing/selftests/bpf/prog_tests/cgroup_iter.c | 10 +++---- 6 files changed, 33 insertions(+), 33 deletions(-) (limited to 'kernel/bpf') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0f61f09f467a..bdf4bc6d8d6b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -88,11 +88,11 @@ struct bpf_cgroup_storage_key { }; enum bpf_cgroup_iter_order { - BPF_ITER_ORDER_UNSPEC = 0, - BPF_ITER_SELF_ONLY, /* process only a single object. */ - BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ - BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ - BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. 
*/ }; union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index cf6d763a57d5..c69bce2f4403 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -74,13 +74,13 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) ++*pos; p->terminate = false; p->visited_all = false; - if (p->order == BPF_ITER_DESCENDANTS_PRE) + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) return css_next_descendant_pre(NULL, p->start_css); - else if (p->order == BPF_ITER_DESCENDANTS_POST) + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(NULL, p->start_css); - else if (p->order == BPF_ITER_ANCESTORS_UP) + else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) return p->start_css; - else /* BPF_ITER_SELF_ONLY */ + else /* BPF_CGROUP_ITER_SELF_ONLY */ return p->start_css; } @@ -109,13 +109,13 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (p->terminate) return NULL; - if (p->order == BPF_ITER_DESCENDANTS_PRE) + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) return css_next_descendant_pre(curr, p->start_css); - else if (p->order == BPF_ITER_DESCENDANTS_POST) + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(curr, p->start_css); - else if (p->order == BPF_ITER_ANCESTORS_UP) + else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) return curr->parent; - else /* BPF_ITER_SELF_ONLY */ + else /* BPF_CGROUP_ITER_SELF_ONLY */ return NULL; } @@ -188,10 +188,10 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog, int order = linfo->cgroup.order; struct cgroup *cgrp; - if (order != BPF_ITER_DESCENDANTS_PRE && - order != BPF_ITER_DESCENDANTS_POST && - order != BPF_ITER_ANCESTORS_UP && - order != BPF_ITER_SELF_ONLY) + if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && + order != BPF_CGROUP_ITER_DESCENDANTS_POST && + order != BPF_CGROUP_ITER_ANCESTORS_UP && + order != BPF_CGROUP_ITER_SELF_ONLY) return -EINVAL; if (fd && id) 
@@ -239,13 +239,13 @@ static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, kfree(buf); show_order: - if (aux->cgroup.order == BPF_ITER_DESCENDANTS_PRE) + if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE) seq_puts(seq, "order: descendants_pre\n"); - else if (aux->cgroup.order == BPF_ITER_DESCENDANTS_POST) + else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST) seq_puts(seq, "order: descendants_post\n"); - else if (aux->cgroup.order == BPF_ITER_ANCESTORS_UP) + else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) seq_puts(seq, "order: ancestors_up\n"); - else /* BPF_ITER_SELF_ONLY */ + else /* BPF_CGROUP_ITER_SELF_ONLY */ seq_puts(seq, "order: self_only\n"); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5056cef2112f..92f7387e378a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -88,11 +88,11 @@ struct bpf_cgroup_storage_key { }; enum bpf_cgroup_iter_order { - BPF_ITER_ORDER_UNSPEC = 0, - BPF_ITER_SELF_ONLY, /* process only a single object. */ - BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ - BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ - BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. 
*/ }; union bpf_iter_link_info { diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index a1bae92be1fc..7b5bbe21b549 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -764,7 +764,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, /* union with nested struct */ TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT, - "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (enum bpf_cgroup_iter_order)BPF_ITER_SELF_ONLY,.cgroup_fd = (__u32)1,},}", + "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (enum bpf_cgroup_iter_order)BPF_CGROUP_ITER_SELF_ONLY,.cgroup_fd = (__u32)1,},}", { .cgroup = { .order = 1, .cgroup_fd = 1, }}); /* struct skb with nested structs/unions; because type output is so diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c index 101a6d70b863..bed1661596f7 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c @@ -275,7 +275,7 @@ static int setup_cgroup_iter(struct cgroup_hierarchical_stats *obj, * traverse one cgroup, so set the traversal order to "self". 
*/ linfo.cgroup.cgroup_fd = cgroup_fd; - linfo.cgroup.order = BPF_ITER_SELF_ONLY; + linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY; opts.link_info = &linfo; opts.link_info_len = sizeof(linfo); link = bpf_program__attach_iter(obj->progs.dump_vmscan, &opts); diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c index 38958c37b9ce..c4a2adb38da1 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c @@ -134,7 +134,7 @@ static void test_walk_preorder(struct cgroup_iter *skel) cg_id[PARENT], cg_id[CHILD1], cg_id[CHILD2]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_DESCENDANTS_PRE, "preorder"); + BPF_CGROUP_ITER_DESCENDANTS_PRE, "preorder"); } /* Postorder walk prints child and parent in order. */ @@ -145,7 +145,7 @@ static void test_walk_postorder(struct cgroup_iter *skel) cg_id[CHILD1], cg_id[CHILD2], cg_id[PARENT]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_DESCENDANTS_POST, "postorder"); + BPF_CGROUP_ITER_DESCENDANTS_POST, "postorder"); } /* Walking parents prints parent and then root. 
*/ @@ -159,7 +159,7 @@ static void test_walk_ancestors_up(struct cgroup_iter *skel) cg_id[PARENT], cg_id[ROOT]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_ANCESTORS_UP, "ancestors_up"); + BPF_CGROUP_ITER_ANCESTORS_UP, "ancestors_up"); skel->bss->terminal_cgroup = 0; } @@ -174,7 +174,7 @@ static void test_early_termination(struct cgroup_iter *skel) PROLOGUE "%8llu\n" EPILOGUE, cg_id[PARENT]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_DESCENDANTS_PRE, "early_termination"); + BPF_CGROUP_ITER_DESCENDANTS_PRE, "early_termination"); skel->bss->terminate_early = 0; } @@ -186,7 +186,7 @@ static void test_walk_self_only(struct cgroup_iter *skel) PROLOGUE "%8llu\n" EPILOGUE, cg_id[PARENT]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_SELF_ONLY, "self_only"); + BPF_CGROUP_ITER_SELF_ONLY, "self_only"); } void test_cgroup_iter(void) -- cgit v1.2.3 From b88df6979682333815536a0bf43bd56f9499f071 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 24 Aug 2022 15:40:36 +0200 Subject: bpf: prepare for more bpf syscall to be used from kernel and user space. Add BPF_MAP_GET_FD_BY_ID and BPF_MAP_DELETE_ELEM. Only BPF_MAP_DELETE_ELEM needs to be amended to be able to access the bpf pointer either from the userspace or the kernel.
Acked-by: Yonghong Song Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220824134055.1328882-7-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a4d40d98428a..4e9d4622aef7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1437,9 +1437,9 @@ err_put: #define BPF_MAP_DELETE_ELEM_LAST_FIELD key -static int map_delete_elem(union bpf_attr *attr) +static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) { - void __user *ukey = u64_to_user_ptr(attr->key); + bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); int ufd = attr->map_fd; struct bpf_map *map; struct fd f; @@ -1459,7 +1459,7 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } - key = __bpf_copy_key(ukey, map->key_size); + key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -4941,7 +4941,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) err = map_update_elem(&attr, uattr); break; case BPF_MAP_DELETE_ELEM: - err = map_delete_elem(&attr); + err = map_delete_elem(&attr, uattr); break; case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); @@ -5073,8 +5073,10 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { switch (cmd) { case BPF_MAP_CREATE: + case BPF_MAP_DELETE_ELEM: case BPF_MAP_UPDATE_ELEM: case BPF_MAP_FREEZE: + case BPF_MAP_GET_FD_BY_ID: case BPF_PROG_LOAD: case BPF_BTF_LOAD: case BPF_LINK_CREATE: -- cgit v1.2.3 From 2775da21628738ce073a3a6a806adcbaada0f091 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 31 Aug 2022 12:26:27 +0800 Subject: bpf: Disable preemption when increasing per-cpu map_locked Per-cpu htab->map_locked is used to prohibit the concurrent accesses from both NMI and non-NMI contexts. 
But since commit 74d862b682f5 ("sched: Make migrate_disable/enable() independent of RT"), migrate_disable() is also preemptible under CONFIG_PREEMPT case, so now map_locked also disallows concurrent updates from normal contexts (e.g. userspace processes) unexpectedly as shown below: process A process B htab_map_update_elem() htab_lock_bucket() migrate_disable() /* return 1 */ __this_cpu_inc_return() /* preempted by B */ htab_map_update_elem() /* the same bucket as A */ htab_lock_bucket() migrate_disable() /* return 2, so lock fails */ __this_cpu_inc_return() return -EBUSY A fix that seems feasible is using in_nmi() in htab_lock_bucket() and only checking the value of map_locked for nmi context. But it will re-introduce dead-lock on bucket lock if htab_lock_bucket() is re-entered through non-tracing program (e.g. fentry program). One cannot use preempt_disable() to fix this issue as htab_use_raw_lock being false causes the bucket lock to be a spin lock which can sleep and does not work with preempt_disable(). Therefore, use migrate_disable() when using the spinlock instead of preempt_disable() and defer fixing concurrent updates to when the kernel has its own BPF memory allocator. 
Fixes: 74d862b682f5 ("sched: Make migrate_disable/enable() independent of RT") Reviewed-by: Hao Luo Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20220831042629.130006-2-houtao@huaweicloud.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/hashtab.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b301a63afa2f..6fb3b7fd1622 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -162,17 +162,25 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab, unsigned long *pflags) { unsigned long flags; + bool use_raw_lock; hash = hash & HASHTAB_MAP_LOCK_MASK; - migrate_disable(); + use_raw_lock = htab_use_raw_lock(htab); + if (use_raw_lock) + preempt_disable(); + else + migrate_disable(); if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { __this_cpu_dec(*(htab->map_locked[hash])); - migrate_enable(); + if (use_raw_lock) + preempt_enable(); + else + migrate_enable(); return -EBUSY; } - if (htab_use_raw_lock(htab)) + if (use_raw_lock) raw_spin_lock_irqsave(&b->raw_lock, flags); else spin_lock_irqsave(&b->lock, flags); @@ -185,13 +193,18 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab, struct bucket *b, u32 hash, unsigned long flags) { + bool use_raw_lock = htab_use_raw_lock(htab); + hash = hash & HASHTAB_MAP_LOCK_MASK; - if (htab_use_raw_lock(htab)) + if (use_raw_lock) raw_spin_unlock_irqrestore(&b->raw_lock, flags); else spin_unlock_irqrestore(&b->lock, flags); __this_cpu_dec(*(htab->map_locked[hash])); - migrate_enable(); + if (use_raw_lock) + preempt_enable(); + else + migrate_enable(); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); -- cgit v1.2.3 From 66a7a92e4d0d091e79148a4c6ec15d1da65f4280 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 31 Aug 2022 12:26:28 +0800 Subject: bpf: Propagate error from htab_lock_bucket() to userspace In 
__htab_map_lookup_and_delete_batch() if htab_lock_bucket() returns -EBUSY, it will go to next bucket. Going to next bucket may not only skip the elements in current bucket silently, but also incur out-of-bound memory access or expose kernel memory to userspace if current bucket_cnt is greater than bucket_size or zero. Fixing it by stopping batch operation and returning -EBUSY when htab_lock_bucket() fails, and the application can retry or skip the busy batch as needed. Fixes: 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked") Reported-by: Hao Sun Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20220831042629.130006-3-houtao@huaweicloud.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/hashtab.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 6fb3b7fd1622..eb1263f03e9b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1704,8 +1704,11 @@ again_nocopy: /* do not grab the lock unless need it (bucket_cnt > 0). */ if (locked) { ret = htab_lock_bucket(htab, b, batch, &flags); - if (ret) - goto next_batch; + if (ret) { + rcu_read_unlock(); + bpf_enable_instrumentation(); + goto after_loop; + } } bucket_cnt = 0; -- cgit v1.2.3 From 197827a05e13808c60f52632e9887eede63f1c16 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 1 Sep 2022 14:19:35 +0800 Subject: bpf: Use this_cpu_{inc|dec|inc_return} for bpf_task_storage_busy Now migrate_disable() does not disable preemption and under some architectures (e.g. arm64) __this_cpu_{inc|dec|inc_return} are neither preemption-safe nor IRQ-safe, so for fully preemptible kernel concurrent lookups or updates on the same task local storage and on the same CPU may make bpf_task_storage_busy be imbalanced, and bpf_task_storage_trylock() on the specific cpu will always fail. Fixing it by using this_cpu_{inc|dec|inc_return} when manipulating bpf_task_storage_busy. 
Fixes: bc235cdb423a ("bpf: Prevent deadlock from recursive bpf_task_storage_[get|delete]") Signed-off-by: Hou Tao Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20220901061938.3789460-2-houtao@huaweicloud.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/bpf_local_storage.c | 4 ++-- kernel/bpf/bpf_task_storage.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 4ee2e7286c23..802fc15b0d73 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -555,11 +555,11 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem, map_node))) { if (busy_counter) { migrate_disable(); - __this_cpu_inc(*busy_counter); + this_cpu_inc(*busy_counter); } bpf_selem_unlink(selem, false); if (busy_counter) { - __this_cpu_dec(*busy_counter); + this_cpu_dec(*busy_counter); migrate_enable(); } cond_resched_rcu(); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index e9014dc62682..6f290623347e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -26,20 +26,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy); static void bpf_task_storage_lock(void) { migrate_disable(); - __this_cpu_inc(bpf_task_storage_busy); + this_cpu_inc(bpf_task_storage_busy); } static void bpf_task_storage_unlock(void) { - __this_cpu_dec(bpf_task_storage_busy); + this_cpu_dec(bpf_task_storage_busy); migrate_enable(); } static bool bpf_task_storage_trylock(void) { migrate_disable(); - if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) { - __this_cpu_dec(bpf_task_storage_busy); + if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) { + this_cpu_dec(bpf_task_storage_busy); migrate_enable(); return false; } -- cgit v1.2.3 From c89e843a11f1075d27684f6b42256213e4592383 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 1 Sep 2022 14:19:36 +0800 
Subject: bpf: Use this_cpu_{inc_return|dec} for prog->active Both __this_cpu_inc_return() and __this_cpu_dec() are not preemption safe and now migrate_disable() doesn't disable preemption, so the update of prog->active is not atomic and in theory under fully preemptible kernel recursion prevention may not work. Fix it by using the preemption-safe and IRQ-safe variants. Fixes: ca06f55b9002 ("bpf: Add per-program recursion prevention mechanism") Signed-off-by: Hou Tao Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20220901061938.3789460-3-houtao@huaweicloud.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/trampoline.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ff87e38af8a7..ad76940b02cc 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -895,7 +895,7 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *ru run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { inc_misses_counter(prog); return 0; } @@ -930,7 +930,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_ bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - __this_cpu_dec(*(prog->active)); + this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock(); } @@ -966,7 +966,7 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r migrate_disable(); might_fault(); - if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { inc_misses_counter(prog); return 0; } @@ -982,7 +982,7 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - __this_cpu_dec(*(prog->active));
+ this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock_trace(); } -- cgit v1.2.3 From ccf365eac0c7705591dee0158ae5c198d9e8f858 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 31 Aug 2022 10:16:18 +0800 Subject: bpf: Remove useless else if The assignment of the else and else if branches is the same, so the else if here is redundant, so we remove it and add a comment to make the code here readable. ./kernel/bpf/cgroup_iter.c:81:6-8: WARNING: possible condition with no effect (if == else). Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2016 Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20220831021618.86770-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/cgroup_iter.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index c69bce2f4403..0d200a993489 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -78,9 +78,7 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) return css_next_descendant_pre(NULL, p->start_css); else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(NULL, p->start_css); - else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) - return p->start_css; - else /* BPF_CGROUP_ITER_SELF_ONLY */ + else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ return p->start_css; } -- cgit v1.2.3 From ef331a8d4c0061ea4d353cd0db1c9b33fd45f0f2 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 1 Sep 2022 14:51:26 +0800 Subject: bpf: Only add BTF IDs for socket security hooks when CONFIG_SECURITY_NETWORK is on When CONFIG_SECURITY_NETWORK is disabled, there will be build warnings from resolve_btfids: WARN: resolve_btfids: unresolved symbol bpf_lsm_socket_socketpair ...... 
WARN: resolve_btfids: unresolved symbol bpf_lsm_inet_conn_established Fixing it by wrapping these BTF ID definitions by CONFIG_SECURITY_NETWORK. Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Fixes: 9113d7e48e91 ("bpf: expose bpf_{g,s}etsockopt to lsm cgroup") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20220901065126.3856297-1-houtao@huaweicloud.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/bpf_lsm.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 5a9743001ceb..4fd845bc5a12 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -41,17 +41,21 @@ BTF_SET_END(bpf_lsm_hooks) */ BTF_SET_START(bpf_lsm_current_hooks) /* operate on freshly allocated sk without any cgroup association */ +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_sk_alloc_security) BTF_ID(func, bpf_lsm_sk_free_security) +#endif BTF_SET_END(bpf_lsm_current_hooks) /* List of LSM hooks that trigger while the socket is properly locked. */ BTF_SET_START(bpf_lsm_locked_sockopt_hooks) +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_socket_sock_rcv_skb) BTF_ID(func, bpf_lsm_sock_graft) BTF_ID(func, bpf_lsm_inet_csk_clone) BTF_ID(func, bpf_lsm_inet_conn_established) +#endif BTF_SET_END(bpf_lsm_locked_sockopt_hooks) /* List of LSM hooks that trigger while the socket is _not_ locked, @@ -59,8 +63,10 @@ BTF_SET_END(bpf_lsm_locked_sockopt_hooks) * in the early init phase. */ BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks) +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_socket_post_create) BTF_ID(func, bpf_lsm_socket_socketpair) +#endif BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks) #ifdef CONFIG_CGROUP_BPF -- cgit v1.2.3 From 7c8199e24fa09d2344ae0204527d55d7803e8409 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:43 -0700 Subject: bpf: Introduce any context BPF specific memory allocator. Tracing BPF programs can attach to kprobe and fentry. 
Hence they run in unknown context where calling plain kmalloc() might not be safe. Front-end kmalloc() with minimal per-cpu cache of free elements. Refill this cache asynchronously from irq_work. BPF programs always run with migration disabled. It's safe to allocate from cache of the current cpu with irqs disabled. Free-ing is always done into bucket of the current cpu as well. irq_work trims extra free elements from buckets with kfree and refills them with kmalloc, so global kmalloc logic takes care of freeing objects allocated by one cpu and freed on another. struct bpf_mem_alloc supports two modes: - When size != 0 create kmem_cache and bpf_mem_cache for each cpu. This is typical bpf hash map use case when all elements have equal size. - When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on kmalloc/kfree. Max allocation size is 4096 in this case. This is bpf_dynptr and bpf_kptr use case. bpf_mem_alloc/bpf_mem_free are bpf specific 'wrappers' of kmalloc/kfree. bpf_mem_cache_alloc/bpf_mem_cache_free are 'wrappers' of kmem_cache_alloc/kmem_cache_free. The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-2-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 26 +++ kernel/bpf/Makefile | 2 +- kernel/bpf/memalloc.c | 480 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 507 insertions(+), 1 deletion(-) create mode 100644 include/linux/bpf_mem_alloc.h create mode 100644 kernel/bpf/memalloc.c (limited to 'kernel/bpf') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h new file mode 100644 index 000000000000..804733070f8d --- /dev/null +++ b/include/linux/bpf_mem_alloc.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/ +#ifndef _BPF_MEM_ALLOC_H +#define _BPF_MEM_ALLOC_H +#include + +struct bpf_mem_cache; +struct bpf_mem_caches; + +struct bpf_mem_alloc { + struct bpf_mem_caches __percpu *caches; + struct bpf_mem_cache __percpu *cache; +}; + +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size); +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); + +/* kmalloc/kfree equivalent: */ +void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size); +void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); + +/* kmem_cache_alloc/free equivalent: */ +void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); +void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); + +#endif /* _BPF_MEM_ALLOC_H */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 00e05b69a4df..341c94f208f4 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c new file mode 100644 index 000000000000..1c46763d855e --- /dev/null +++ b/kernel/bpf/memalloc.c @@ -0,0 +1,480 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include +#include +#include + +/* Any context (including NMI) BPF specific memory allocator. + * + * Tracing BPF programs can attach to kprobe and fentry. Hence they + * run in unknown context where calling plain kmalloc() might not be safe. + * + * Front-end kmalloc() with per-cpu per-bucket cache of free elements. + * Refill this cache asynchronously from irq_work. + * + * CPU_0 buckets + * 16 32 64 96 128 192 256 512 1024 2048 4096 + * ... 
+ * CPU_N buckets + * 16 32 64 96 128 192 256 512 1024 2048 4096 + * + * The buckets are prefilled at the start. + * BPF programs always run with migration disabled. + * It's safe to allocate from cache of the current cpu with irqs disabled. + * Free-ing is always done into bucket of the current cpu as well. + * irq_work trims extra free elements from buckets with kfree + * and refills them with kmalloc, so global kmalloc logic takes care + * of freeing objects allocated by one cpu and freed on another. + * + * Every allocated object is padded with extra 8 bytes that contain + * struct llist_node. + */ +#define LLIST_NODE_SZ sizeof(struct llist_node) + +/* similar to kmalloc, but sizeof == 8 bucket is gone */ +static u8 size_index[24] __ro_after_init = { + 3, /* 8 */ + 3, /* 16 */ + 4, /* 24 */ + 4, /* 32 */ + 5, /* 40 */ + 5, /* 48 */ + 5, /* 56 */ + 5, /* 64 */ + 1, /* 72 */ + 1, /* 80 */ + 1, /* 88 */ + 1, /* 96 */ + 6, /* 104 */ + 6, /* 112 */ + 6, /* 120 */ + 6, /* 128 */ + 2, /* 136 */ + 2, /* 144 */ + 2, /* 152 */ + 2, /* 160 */ + 2, /* 168 */ + 2, /* 176 */ + 2, /* 184 */ + 2 /* 192 */ +}; + +static int bpf_mem_cache_idx(size_t size) +{ + if (!size || size > 4096) + return -1; + + if (size <= 192) + return size_index[(size - 1) / 8] - 1; + + return fls(size - 1) - 1; +} + +#define NUM_CACHES 11 + +struct bpf_mem_cache { + /* per-cpu list of free objects of size 'unit_size'. + * All accesses are done with interrupts disabled and 'active' counter + * protection with __llist_add() and __llist_del_first(). + */ + struct llist_head free_llist; + local_t active; + + /* Operations on the free_llist from unit_alloc/unit_free/bpf_mem_refill + * are sequenced by per-cpu 'active' counter. But unit_free() cannot + * fail. When 'active' is busy the unit_free() will add an object to + * free_llist_extra. + */ + struct llist_head free_llist_extra; + + /* kmem_cache != NULL when bpf_mem_alloc was created for specific + * element size.
+ */ + struct kmem_cache *kmem_cache; + struct irq_work refill_work; + struct obj_cgroup *objcg; + int unit_size; + /* count of objects in free_llist */ + int free_cnt; +}; + +struct bpf_mem_caches { + struct bpf_mem_cache cache[NUM_CACHES]; +}; + +static struct llist_node notrace *__llist_del_first(struct llist_head *head) +{ + struct llist_node *entry, *next; + + entry = head->first; + if (!entry) + return NULL; + next = entry->next; + head->first = next; + return entry; +} + +#define BATCH 48 +#define LOW_WATERMARK 32 +#define HIGH_WATERMARK 96 +/* Assuming the average number of elements per bucket is 64, when all buckets + * are used the total memory will be: 64*16*32 + 64*32*32 + 64*64*32 + ... + + * 64*4096*32 ~ 20Mbyte + */ + +static void *__alloc(struct bpf_mem_cache *c, int node) +{ + /* Allocate, but don't deplete atomic reserves that typical + * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc + * will allocate from the current numa node which is what we + * want here. + */ + gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT; + + if (c->kmem_cache) + return kmem_cache_alloc_node(c->kmem_cache, flags, node); + + return kmalloc_node(c->unit_size, flags, node); +} + +static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) +{ +#ifdef CONFIG_MEMCG_KMEM + if (c->objcg) + return get_mem_cgroup_from_objcg(c->objcg); +#endif + +#ifdef CONFIG_MEMCG + return root_mem_cgroup; +#else + return NULL; +#endif +} + +/* Mostly runs from irq_work except __init phase. 
*/ +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) +{ + struct mem_cgroup *memcg = NULL, *old_memcg; + unsigned long flags; + void *obj; + int i; + + memcg = get_memcg(c); + old_memcg = set_active_memcg(memcg); + for (i = 0; i < cnt; i++) { + obj = __alloc(c, node); + if (!obj) + break; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + /* In RT irq_work runs in per-cpu kthread, so disable + * interrupts to avoid preemption and interrupts and + * reduce the chance of bpf prog executing on this cpu + * when active counter is busy. + */ + local_irq_save(flags); + /* alloc_bulk runs from irq_work which will not preempt a bpf + * program that does unit_alloc/unit_free since IRQs are + * disabled there. There is no race to increment 'active' + * counter. It protects free_llist from corruption in case NMI + * bpf prog preempted this loop. + */ + WARN_ON_ONCE(local_inc_return(&c->active) != 1); + __llist_add(obj, &c->free_llist); + c->free_cnt++; + local_dec(&c->active); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(flags); + } + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); +} + +static void free_one(struct bpf_mem_cache *c, void *obj) +{ + if (c->kmem_cache) + kmem_cache_free(c->kmem_cache, obj); + else + kfree(obj); +} + +static void free_bulk(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + unsigned long flags; + int cnt; + + do { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_save(flags); + WARN_ON_ONCE(local_inc_return(&c->active) != 1); + llnode = __llist_del_first(&c->free_llist); + if (llnode) + cnt = --c->free_cnt; + else + cnt = 0; + local_dec(&c->active); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(flags); + free_one(c, llnode); + } while (cnt > (HIGH_WATERMARK + LOW_WATERMARK) / 2); + + /* and drain free_llist_extra */ + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) + free_one(c, llnode); +} + +static void bpf_mem_refill(struct irq_work *work) +{ + struct bpf_mem_cache *c = 
container_of(work, struct bpf_mem_cache, refill_work); + int cnt; + + /* Racy access to free_cnt. It doesn't need to be 100% accurate */ + cnt = c->free_cnt; + if (cnt < LOW_WATERMARK) + /* irq_work runs on this cpu and kmalloc will allocate + * from the current numa node which is what we want here. + */ + alloc_bulk(c, BATCH, NUMA_NO_NODE); + else if (cnt > HIGH_WATERMARK) + free_bulk(c); +} + +static void notrace irq_work_raise(struct bpf_mem_cache *c) +{ + irq_work_queue(&c->refill_work); +} + +static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +{ + init_irq_work(&c->refill_work, bpf_mem_refill); + /* To avoid consuming memory assume that 1st run of bpf + * prog won't be doing more than 4 map_update_elem from + * irq disabled region + */ + alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu)); +} + +/* When size != 0 create kmem_cache and bpf_mem_cache for each cpu. + * This is typical bpf hash map use case when all elements have equal size. + * + * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on + * kmalloc/kfree. Max allocation size is 4096 in this case. + * This is bpf_dynptr and bpf_kptr use case. 
+ */ +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size) +{ + static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + struct bpf_mem_caches *cc, __percpu *pcc; + struct bpf_mem_cache *c, __percpu *pc; + struct kmem_cache *kmem_cache; + struct obj_cgroup *objcg = NULL; + char buf[32]; + int cpu, i; + + if (size) { + pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); + if (!pc) + return -ENOMEM; + size += LLIST_NODE_SZ; /* room for llist_node */ + snprintf(buf, sizeof(buf), "bpf-%u", size); + kmem_cache = kmem_cache_create(buf, size, 8, 0, NULL); + if (!kmem_cache) { + free_percpu(pc); + return -ENOMEM; + } +#ifdef CONFIG_MEMCG_KMEM + objcg = get_obj_cgroup_from_current(); +#endif + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(pc, cpu); + c->kmem_cache = kmem_cache; + c->unit_size = size; + c->objcg = objcg; + prefill_mem_cache(c, cpu); + } + ma->cache = pc; + return 0; + } + + pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); + if (!pcc) + return -ENOMEM; +#ifdef CONFIG_MEMCG_KMEM + objcg = get_obj_cgroup_from_current(); +#endif + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(pcc, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + c->unit_size = sizes[i]; + c->objcg = objcg; + prefill_mem_cache(c, cpu); + } + } + ma->caches = pcc; + return 0; +} + +static void drain_mem_cache(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist)) + free_one(c, llnode); + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) + free_one(c, llnode); +} + +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) +{ + struct bpf_mem_caches *cc; + struct bpf_mem_cache *c; + int cpu, i; + + if (ma->cache) { + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(ma->cache, cpu); + drain_mem_cache(c); + } + /* kmem_cache and memcg are the same across cpus */ + kmem_cache_destroy(c->kmem_cache); + if (c->objcg) + obj_cgroup_put(c->objcg); + 
free_percpu(ma->cache); + ma->cache = NULL; + } + if (ma->caches) { + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(ma->caches, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + drain_mem_cache(c); + } + } + if (c->objcg) + obj_cgroup_put(c->objcg); + free_percpu(ma->caches); + ma->caches = NULL; + } +} + +/* notrace is necessary here and in other functions to make sure + * bpf programs cannot attach to them and cause llist corruptions. + */ +static void notrace *unit_alloc(struct bpf_mem_cache *c) +{ + struct llist_node *llnode = NULL; + unsigned long flags; + int cnt = 0; + + /* Disable irqs to prevent the following race for majority of prog types: + * prog_A + * bpf_mem_alloc + * preemption or irq -> prog_B + * bpf_mem_alloc + * + * but prog_B could be a perf_event NMI prog. + * Use per-cpu 'active' counter to order free_list access between + * unit_alloc/unit_free/bpf_mem_refill. + */ + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + llnode = __llist_del_first(&c->free_llist); + if (llnode) + cnt = --c->free_cnt; + } + local_dec(&c->active); + local_irq_restore(flags); + + WARN_ON(cnt < 0); + + if (cnt < LOW_WATERMARK) + irq_work_raise(c); + return llnode; +} + +/* Though 'ptr' object could have been allocated on a different cpu + * add it to the free_llist of the current cpu. + * Let kfree() logic deal with it when it's later called from irq_work. + */ +static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) +{ + struct llist_node *llnode = ptr - LLIST_NODE_SZ; + unsigned long flags; + int cnt = 0; + + BUILD_BUG_ON(LLIST_NODE_SZ > 8); + + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + __llist_add(llnode, &c->free_llist); + cnt = ++c->free_cnt; + } else { + /* unit_free() cannot fail. Therefore add an object to atomic + * llist. free_bulk() will drain it. 
Though free_llist_extra is + * a per-cpu list we have to use atomic llist_add here, since + * it also can be interrupted by bpf nmi prog that does another + * unit_free() into the same free_llist_extra. + */ + llist_add(llnode, &c->free_llist_extra); + } + local_dec(&c->active); + local_irq_restore(flags); + + if (cnt > HIGH_WATERMARK) + /* free few objects from current cpu into global kmalloc pool */ + irq_work_raise(c); +} + +/* Called from BPF program or from sys_bpf syscall. + * In both cases migration is disabled. + */ +void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) +{ + int idx; + void *ret; + + if (!size) + return ZERO_SIZE_PTR; + + idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ); + if (idx < 0) + return NULL; + + ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx); + return !ret ? NULL : ret + LLIST_NODE_SZ; +} + +void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) +{ + int idx; + + if (!ptr) + return; + + idx = bpf_mem_cache_idx(__ksize(ptr - LLIST_NODE_SZ)); + if (idx < 0) + return; + + unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); +} + +void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) +{ + void *ret; + + ret = unit_alloc(this_cpu_ptr(ma->cache)); + return !ret ? NULL : ret + LLIST_NODE_SZ; +} + +void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) +{ + if (!ptr) + return; + + unit_free(this_cpu_ptr(ma->cache), ptr); +} -- cgit v1.2.3 From fba1a1c6c912b383f86bf5d4aea732dcad3ec420 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:44 -0700 Subject: bpf: Convert hash map to bpf_mem_alloc. Convert bpf hash map to use bpf memory allocator. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-3-alexei.starovoitov@gmail.com --- kernel/bpf/hashtab.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index eb1263f03e9b..508e64351f87 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -14,6 +14,7 @@ #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" +#include #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ @@ -92,6 +93,7 @@ struct bucket { struct bpf_htab { struct bpf_map map; + struct bpf_mem_alloc ma; struct bucket *buckets; void *elems; union { @@ -576,6 +578,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (err) goto free_prealloc; } + } else { + err = bpf_mem_alloc_init(&htab->ma, htab->elem_size); + if (err) + goto free_map_locked; } return &htab->map; @@ -586,6 +592,7 @@ free_map_locked: for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); + bpf_mem_alloc_destroy(&htab->ma); free_htab: lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); @@ -862,7 +869,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); check_and_free_fields(htab, l); - kfree(l); + bpf_mem_cache_free(&htab->ma, l); } static void htab_elem_free_rcu(struct rcu_head *head) @@ -986,9 +993,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = ERR_PTR(-E2BIG); goto dec_count; } - l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, - GFP_NOWAIT | __GFP_NOWARN, - htab->map.numa_node); + l_new = bpf_mem_cache_alloc(&htab->ma); if (!l_new) { l_new = 
ERR_PTR(-ENOMEM); goto dec_count; @@ -1007,7 +1012,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, pptr = bpf_map_alloc_percpu(&htab->map, size, 8, GFP_NOWAIT | __GFP_NOWARN); if (!pptr) { - kfree(l_new); + bpf_mem_cache_free(&htab->ma, l_new); l_new = ERR_PTR(-ENOMEM); goto dec_count; } @@ -1429,6 +1434,10 @@ static void delete_all_elements(struct bpf_htab *htab) { int i; + /* It's called from a worker thread, so disable migration here, + * since bpf_mem_cache_free() relies on that. + */ + migrate_disable(); for (i = 0; i < htab->n_buckets; i++) { struct hlist_nulls_head *head = select_bucket(htab, i); struct hlist_nulls_node *n; @@ -1439,6 +1448,7 @@ static void delete_all_elements(struct bpf_htab *htab) htab_elem_free(htab, l); } } + migrate_enable(); } static void htab_free_malloced_timers(struct bpf_htab *htab) @@ -1502,6 +1512,7 @@ static void htab_map_free(struct bpf_map *map) bpf_map_free_kptr_off_tab(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); + bpf_mem_alloc_destroy(&htab->ma); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); lockdep_unregister_key(&htab->lockdep_key); -- cgit v1.2.3 From 34dd3bad1a6f1dc7d18ee8dd53f1d31bffd2aee8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:47 -0700 Subject: bpf: Relax the requirement to use preallocated hash maps in tracing progs. Since bpf hash map was converted to use bpf_mem_alloc it is safe to use from tracing programs and in RT kernels. But per-cpu hash map is still using dynamic allocation for per-cpu map values, hence keep the warning for this map type. In the future alloc_percpu_gfp can be front-end-ed with bpf_mem_cache and this restriction will be completely lifted. perf_event (NMI) bpf programs have to use preallocated hash maps, because free_htab_elem() is using call_rcu which might crash if re-entered. 
Sleepable bpf programs have to use preallocated hash maps, because the lifetime of the map elements is not protected by rcu_read_lock/unlock. This restriction can be lifted in the future as well. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-6-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0194a36d0b36..3dce3166855f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12629,10 +12629,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, * For programs attached to PERF events this is mandatory as the * perf NMI can hit any arbitrary code sequence. * - * All other trace types using preallocated hash maps are unsafe as - * well because tracepoint or kprobes can be inside locked regions - * of the memory allocator or at a place where a recursion into the - * memory allocator would see inconsistent state. + * All other trace types using non-preallocated per-cpu hash maps are + * unsafe as well because tracepoint or kprobes can be inside locked + * regions of the per-cpu memory allocator or at a place where a + * recursion into the per-cpu memory allocator would see inconsistent + * state. Non per-cpu hash maps are using bpf_mem_alloc-tor which is + * safe to use from kprobe/fentry and in RT. * * On RT enabled kernels run-time allocation of all trace type * programs is strictly prohibited due to lock type constraints.
On @@ -12642,15 +12644,26 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, */ if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) { if (prog_type == BPF_PROG_TYPE_PERF_EVENT) { + /* perf_event bpf progs have to use preallocated hash maps + * because non-prealloc is still relying on call_rcu to free + * elements. + */ verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - verbose(env, "trace type programs can only use preallocated hash map\n"); - return -EINVAL; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + (map->inner_map_meta && + map->inner_map_meta->map_type == BPF_MAP_TYPE_PERCPU_HASH)) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + verbose(env, + "trace type programs can only use preallocated per-cpu hash map\n"); + return -EINVAL; + } + WARN_ONCE(1, "trace type BPF program uses run-time allocation\n"); + verbose(env, + "trace type programs with run-time allocated per-cpu hash maps are unsafe." + " Switch to preallocated hash maps.\n"); } - WARN_ONCE(1, "trace type BPF program uses run-time allocation\n"); - verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n"); } if (map_value_has_spin_lock(map)) { -- cgit v1.2.3 From 86fe28f7692d96d20232af0fc6d7632d5cc89a01 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:48 -0700 Subject: bpf: Optimize element count in non-preallocated hash map. The atomic_inc/dec might cause extreme cache line bouncing when multiple cpus access the same bpf map. Based on specified max_entries for the hash map calculate when percpu_counter becomes faster than atomic_t and use it for such maps. For example samples/bpf/map_perf_test is using hash map with max_entries 1000. On a system with 16 cpus the 'map_perf_test 4' shows 14k events per second using atomic_t. On a system with 15 cpus it shows 100k events per second using percpu. 
map_perf_test is an extreme case where all cpus are colliding on atomic_t, which causes extreme cache bouncing. Note that the slow path of percpu_counter is 5k events per second vs 14k for atomic, so the heuristic is necessary. See the comment in the code for why the heuristic is based on num_online_cpus(). Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-7-alexei.starovoitov@gmail.com --- kernel/bpf/hashtab.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 8 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 508e64351f87..36aa16dc43ad 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -101,7 +101,12 @@ struct bpf_htab { struct bpf_lru lru; }; struct htab_elem *__percpu *extra_elems; - atomic_t count; /* number of elements in this hashtable */ + /* number of elements in non-preallocated hashtable are kept + * in either pcount or count + */ + struct percpu_counter pcount; + atomic_t count; + bool use_percpu_counter; u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; @@ -565,6 +570,29 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab_init_buckets(htab); +/* compute_batch_value() computes batch value as num_online_cpus() * 2 + * and __percpu_counter_compare() needs + * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus() + * for percpu_counter to be faster than atomic_t. In practice the average bpf + * hash map size is 10k, which means that a system with 64 cpus will fill + * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore + * define our own batch count as 32 then 10k hash map can be filled up to 80%: + * 10k - 8k > 32 _batch_ * 64 _cpus_ + * and __percpu_counter_compare() will still be fast.
At that point hash map + * collisions will dominate its performance anyway. Assume that hash map filled + * to 50+% isn't going to be O(1) and use the following formula to choose + * between percpu_counter and atomic_t. + */ +#define PERCPU_COUNTER_BATCH 32 + if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH) + htab->use_percpu_counter = true; + + if (htab->use_percpu_counter) { + err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL); + if (err) + goto free_map_locked; + } + if (prealloc) { err = prealloc_init(htab); if (err) @@ -891,6 +919,31 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) } } +static bool is_map_full(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + return __percpu_counter_compare(&htab->pcount, htab->map.max_entries, + PERCPU_COUNTER_BATCH) >= 0; + return atomic_read(&htab->count) >= htab->map.max_entries; +} + +static void inc_elem_count(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH); + else + atomic_inc(&htab->count); +} + +static void dec_elem_count(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH); + else + atomic_dec(&htab->count); +} + + static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { htab_put_fd_value(htab, l); @@ -899,7 +952,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) check_and_free_fields(htab, l); __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { - atomic_dec(&htab->count); + dec_elem_count(htab); l->htab = htab; call_rcu(&l->rcu, htab_elem_free_rcu); } @@ -983,16 +1036,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = container_of(l, struct htab_elem, fnode); } } else { - if (atomic_inc_return(&htab->count) > htab->map.max_entries) - if (!old_elem) { + if (is_map_full(htab)) + if (!old_elem) /* when map is full and 
update() is replacing * old element, it's ok to allocate, since * old element will be freed immediately. * Otherwise return an error */ - l_new = ERR_PTR(-E2BIG); - goto dec_count; - } + return ERR_PTR(-E2BIG); + inc_elem_count(htab); l_new = bpf_mem_cache_alloc(&htab->ma); if (!l_new) { l_new = ERR_PTR(-ENOMEM); @@ -1034,7 +1086,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new->hash = hash; return l_new; dec_count: - atomic_dec(&htab->count); + dec_elem_count(htab); return l_new; } @@ -1513,6 +1565,8 @@ static void htab_map_free(struct bpf_map *map) free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->ma); + if (htab->use_percpu_counter) + percpu_counter_destroy(&htab->pcount); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); lockdep_unregister_key(&htab->lockdep_key); -- cgit v1.2.3 From 0fd7c5d43339b783ee3301a05f925d1e52ac87c9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:49 -0700 Subject: bpf: Optimize call_rcu in non-preallocated hash map. Doing call_rcu() a million times a second becomes a bottleneck. Convert non-preallocated hash map from call_rcu to SLAB_TYPESAFE_BY_RCU. The rcu critical section is no longer observed for one htab element which makes non-preallocated hash map behave just like preallocated hash map. The map elements are released back to kernel memory after observing rcu critical section. This improves 'map_perf_test 4' performance from 100k events per second to 250k events per second. bpf_mem_alloc + percpu_counter + typesafe_by_rcu provide 10x performance boost to non-preallocated hash map and make it within a few % of preallocated map while consuming a fraction of the memory.
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-8-alexei.starovoitov@gmail.com --- kernel/bpf/hashtab.c | 8 ++++++-- kernel/bpf/memalloc.c | 2 +- tools/testing/selftests/bpf/progs/timer.c | 11 ----------- 3 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 36aa16dc43ad..0d888a90a805 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -953,8 +953,12 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { dec_elem_count(htab); - l->htab = htab; - call_rcu(&l->rcu, htab_elem_free_rcu); + if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) { + l->htab = htab; + call_rcu(&l->rcu, htab_elem_free_rcu); + } else { + htab_elem_free(htab, l); + } } } diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 1c46763d855e..da0721f8c28f 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -281,7 +281,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size) return -ENOMEM; size += LLIST_NODE_SZ; /* room for llist_node */ snprintf(buf, sizeof(buf), "bpf-%u", size); - kmem_cache = kmem_cache_create(buf, size, 8, 0, NULL); + kmem_cache = kmem_cache_create(buf, size, 8, SLAB_TYPESAFE_BY_RCU, NULL); if (!kmem_cache) { free_percpu(pc); return -ENOMEM; diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index 5f5309791649..0053c5402173 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -208,17 +208,6 @@ static int timer_cb2(void *map, int *key, struct hmap_elem *val) */ bpf_map_delete_elem(map, key); - /* in non-preallocated hashmap both 'key' and 'val' are RCU - * protected and still valid though this element was deleted - * from the map. 
Arm this timer for ~35 seconds. When callback - * finishes the call_rcu will invoke: - * htab_elem_free_rcu - * check_and_free_timer - * bpf_timer_cancel_and_free - * to cancel this 35 second sleep and delete the timer for real. - */ - if (bpf_timer_start(&val->timer, 1ull << 35, 0) != 0) - err |= 256; ok |= 4; } return 0; -- cgit v1.2.3 From 7c266178aa51dd2d4fda1312c5990a8a82c83d70 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:50 -0700 Subject: bpf: Adjust low/high watermarks in bpf_mem_cache The same low/high watermarks for every bucket in bpf_mem_cache consume significant amount of memory. Preallocating 64 elements of 4096 bytes each in the free list is not efficient. Make low/high watermarks and batching value dependent on element size. This change brings significant memory savings. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-9-alexei.starovoitov@gmail.com --- kernel/bpf/memalloc.c | 50 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 14 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index da0721f8c28f..7e5df6866d92 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -100,6 +100,7 @@ struct bpf_mem_cache { int unit_size; /* count of objects in free_llist */ int free_cnt; + int low_watermark, high_watermark, batch; }; struct bpf_mem_caches { @@ -118,14 +119,6 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head) return entry; } -#define BATCH 48 -#define LOW_WATERMARK 32 -#define HIGH_WATERMARK 96 -/* Assuming the average number of elements per bucket is 64, when all buckets - * are used the total memory will be: 64*16*32 + 64*32*32 + 64*64*32 + ... 
+ - * 64*4096*32 ~ 20Mbyte - */ - static void *__alloc(struct bpf_mem_cache *c, int node) { /* Allocate, but don't deplete atomic reserves that typical @@ -220,7 +213,7 @@ static void free_bulk(struct bpf_mem_cache *c) if (IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_restore(flags); free_one(c, llnode); - } while (cnt > (HIGH_WATERMARK + LOW_WATERMARK) / 2); + } while (cnt > (c->high_watermark + c->low_watermark) / 2); /* and drain free_llist_extra */ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) @@ -234,12 +227,12 @@ static void bpf_mem_refill(struct irq_work *work) /* Racy access to free_cnt. It doesn't need to be 100% accurate */ cnt = c->free_cnt; - if (cnt < LOW_WATERMARK) + if (cnt < c->low_watermark) /* irq_work runs on this cpu and kmalloc will allocate * from the current numa node which is what we want here. */ - alloc_bulk(c, BATCH, NUMA_NO_NODE); - else if (cnt > HIGH_WATERMARK) + alloc_bulk(c, c->batch, NUMA_NO_NODE); + else if (cnt > c->high_watermark) free_bulk(c); } @@ -248,9 +241,38 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c) irq_work_queue(&c->refill_work); } +/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket + * the freelist cache will be elem_size * 64 (or less) on each cpu. + * + * For bpf programs that don't have statically known allocation sizes and + * assuming (low_mark + high_mark) / 2 as an average number of elements per + * bucket and all buckets are used the total amount of memory in freelists + * on each cpu will be: + * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096 + * == ~ 116 Kbyte using below heuristic. + * Initialized, but unused bpf allocator (not bpf map specific one) will + * consume ~ 11 Kbyte per cpu. + * Typical case will be between 11K and 116K closer to 11K. + * bpf progs can and should share bpf_mem_cache when possible. 
+ */ + static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) { init_irq_work(&c->refill_work, bpf_mem_refill); + if (c->unit_size <= 256) { + c->low_watermark = 32; + c->high_watermark = 96; + } else { + /* When page_size == 4k, order-0 cache will have low_mark == 2 + * and high_mark == 6 with batch alloc of 3 individual pages at + * a time. + * 8k allocs and above low == 1, high == 3, batch == 1. + */ + c->low_watermark = max(32 * 256 / c->unit_size, 1); + c->high_watermark = max(96 * 256 / c->unit_size, 3); + } + c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1); + /* To avoid consuming memory assume that 1st run of bpf * prog won't be doing more than 4 map_update_elem from * irq disabled region @@ -392,7 +414,7 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c) WARN_ON(cnt < 0); - if (cnt < LOW_WATERMARK) + if (cnt < c->low_watermark) irq_work_raise(c); return llnode; } @@ -425,7 +447,7 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) local_dec(&c->active); local_irq_restore(flags); - if (cnt > HIGH_WATERMARK) + if (cnt > c->high_watermark) /* free few objects from current cpu into global kmalloc pool */ irq_work_raise(c); } -- cgit v1.2.3 From 8d5a8011b35d387c490a5c977b1d9eb4798aa071 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:51 -0700 Subject: bpf: Batch call_rcu callbacks instead of SLAB_TYPESAFE_BY_RCU. SLAB_TYPESAFE_BY_RCU makes kmem_caches non mergeable and slows down kmem_cache_destroy. All bpf_mem_cache are safe to share across different maps and programs. Convert SLAB_TYPESAFE_BY_RCU to batched call_rcu. This change solves the memory consumption issue, avoids kmem_cache_destroy latency and keeps bpf hash map performance the same. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-10-alexei.starovoitov@gmail.com --- kernel/bpf/memalloc.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++--- kernel/bpf/syscall.c | 5 +++- 2 files changed, 66 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 7e5df6866d92..5d8648a01b5c 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -101,6 +101,11 @@ struct bpf_mem_cache { /* count of objects in free_llist */ int free_cnt; int low_watermark, high_watermark, batch; + + struct rcu_head rcu; + struct llist_head free_by_rcu; + struct llist_head waiting_for_gp; + atomic_t call_rcu_in_progress; }; struct bpf_mem_caches { @@ -194,6 +199,45 @@ static void free_one(struct bpf_mem_cache *c, void *obj) kfree(obj); } +static void __free_rcu(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + struct llist_node *llnode = llist_del_all(&c->waiting_for_gp); + struct llist_node *pos, *t; + + llist_for_each_safe(pos, t, llnode) + free_one(c, pos); + atomic_set(&c->call_rcu_in_progress, 0); +} + +static void enque_to_free(struct bpf_mem_cache *c, void *obj) +{ + struct llist_node *llnode = obj; + + /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work. + * Nothing races to add to free_by_rcu list. + */ + __llist_add(llnode, &c->free_by_rcu); +} + +static void do_call_rcu(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + + if (atomic_xchg(&c->call_rcu_in_progress, 1)) + return; + + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); + llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) + /* There is no concurrent __llist_add(waiting_for_gp) access. + * It doesn't race with llist_del_all either. 
+ * But there could be two concurrent llist_del_all(waiting_for_gp): + * from __free_rcu() and from drain_mem_cache(). + */ + __llist_add(llnode, &c->waiting_for_gp); + call_rcu(&c->rcu, __free_rcu); +} + static void free_bulk(struct bpf_mem_cache *c) { struct llist_node *llnode, *t; @@ -212,12 +256,13 @@ static void free_bulk(struct bpf_mem_cache *c) local_dec(&c->active); if (IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_restore(flags); - free_one(c, llnode); + enque_to_free(c, llnode); } while (cnt > (c->high_watermark + c->low_watermark) / 2); /* and drain free_llist_extra */ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) - free_one(c, llnode); + enque_to_free(c, llnode); + do_call_rcu(c); } static void bpf_mem_refill(struct irq_work *work) @@ -303,7 +348,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size) return -ENOMEM; size += LLIST_NODE_SZ; /* room for llist_node */ snprintf(buf, sizeof(buf), "bpf-%u", size); - kmem_cache = kmem_cache_create(buf, size, 8, SLAB_TYPESAFE_BY_RCU, NULL); + kmem_cache = kmem_cache_create(buf, size, 8, 0, NULL); if (!kmem_cache) { free_percpu(pc); return -ENOMEM; @@ -345,6 +390,15 @@ static void drain_mem_cache(struct bpf_mem_cache *c) { struct llist_node *llnode, *t; + /* The caller has done rcu_barrier() and no progs are using this + * bpf_mem_cache, but htab_map_free() called bpf_mem_cache_free() for + * all remaining elements and they can be in free_by_rcu or in + * waiting_for_gp lists, so drain those lists now. 
+ */ + llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) + free_one(c, llnode); + llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp)) + free_one(c, llnode); llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist)) free_one(c, llnode); llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) @@ -366,6 +420,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) kmem_cache_destroy(c->kmem_cache); if (c->objcg) obj_cgroup_put(c->objcg); + /* c->waiting_for_gp list was drained, but __free_rcu might + * still execute. Wait for it now before we free 'c'. + */ + rcu_barrier(); free_percpu(ma->cache); ma->cache = NULL; } @@ -379,6 +437,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) } if (c->objcg) obj_cgroup_put(c->objcg); + rcu_barrier(); free_percpu(ma->caches); ma->caches = NULL; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4e9d4622aef7..074c901fbb4e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -638,7 +638,10 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) bpf_map_free_id(map, do_idr_lock); btf_put(map->btf); INIT_WORK(&map->work, bpf_map_free_deferred); - schedule_work(&map->work); + /* Avoid spawning kworkers, since they all might contend + * for the same mutex like slab_mutex. + */ + queue_work(system_unbound_wq, &map->work); } } -- cgit v1.2.3 From 4ab67149f3c6e97c5c506a726f0ebdec38241679 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:52 -0700 Subject: bpf: Add percpu allocation support to bpf_mem_alloc. Extend bpf_mem_alloc to cache free list of fixed size per-cpu allocations. Once such cache is created bpf_mem_cache_alloc() will return per-cpu objects. bpf_mem_cache_free() will free them back into global per-cpu pool after observing RCU grace period. per-cpu flavor of bpf_mem_alloc is going to be used by per-cpu hash maps. 
The free list cache consists of tuples { llist_node, per-cpu pointer } Unlike alloc_percpu() that returns per-cpu pointer the bpf_mem_cache_alloc() returns a pointer to per-cpu pointer and bpf_mem_cache_free() expects to receive it back. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-11-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 2 +- kernel/bpf/hashtab.c | 2 +- kernel/bpf/memalloc.c | 44 ++++++++++++++++++++++++++++++++++++++----- 3 files changed, 41 insertions(+), 7 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 804733070f8d..653ed1584a03 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -12,7 +12,7 @@ struct bpf_mem_alloc { struct bpf_mem_cache __percpu *cache; }; -int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size); +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); /* kmalloc/kfree equivalent: */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0d888a90a805..70b02ff4445e 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -607,7 +607,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) goto free_prealloc; } } else { - err = bpf_mem_alloc_init(&htab->ma, htab->elem_size); + err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false); if (err) goto free_map_locked; } diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 5d8648a01b5c..f7b07787581b 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -101,6 +101,7 @@ struct bpf_mem_cache { /* count of objects in free_llist */ int free_cnt; int low_watermark, high_watermark, batch; + bool percpu; struct rcu_head rcu; struct llist_head free_by_rcu; @@ -133,6 +134,19 @@ static void *__alloc(struct bpf_mem_cache *c, int 
node) */ gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT; + if (c->percpu) { + void **obj = kmem_cache_alloc_node(c->kmem_cache, flags, node); + void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); + + if (!obj || !pptr) { + free_percpu(pptr); + kfree(obj); + return NULL; + } + obj[1] = pptr; + return obj; + } + if (c->kmem_cache) return kmem_cache_alloc_node(c->kmem_cache, flags, node); @@ -193,6 +207,12 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) static void free_one(struct bpf_mem_cache *c, void *obj) { + if (c->percpu) { + free_percpu(((void **)obj)[1]); + kmem_cache_free(c->kmem_cache, obj); + return; + } + if (c->kmem_cache) kmem_cache_free(c->kmem_cache, obj); else @@ -332,21 +352,30 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) * kmalloc/kfree. Max allocation size is 4096 in this case. * This is bpf_dynptr and bpf_kptr use case. */ -int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size) +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; struct bpf_mem_caches *cc, __percpu *pcc; struct bpf_mem_cache *c, __percpu *pc; - struct kmem_cache *kmem_cache; + struct kmem_cache *kmem_cache = NULL; struct obj_cgroup *objcg = NULL; char buf[32]; - int cpu, i; + int cpu, i, unit_size; if (size) { pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); if (!pc) return -ENOMEM; - size += LLIST_NODE_SZ; /* room for llist_node */ + + if (percpu) { + unit_size = size; + /* room for llist_node and per-cpu pointer */ + size = LLIST_NODE_SZ + sizeof(void *); + } else { + size += LLIST_NODE_SZ; /* room for llist_node */ + unit_size = size; + } + snprintf(buf, sizeof(buf), "bpf-%u", size); kmem_cache = kmem_cache_create(buf, size, 8, 0, NULL); if (!kmem_cache) { @@ -359,14 +388,19 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size) for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); c->kmem_cache = 
kmem_cache; - c->unit_size = size; + c->unit_size = unit_size; c->objcg = objcg; + c->percpu = percpu; prefill_mem_cache(c, cpu); } ma->cache = pc; return 0; } + /* size == 0 && percpu is an invalid combination */ + if (WARN_ON_ONCE(percpu)) + return -EINVAL; + pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; -- cgit v1.2.3 From ee4ed53c5eb62f49f23560cc2642353547e46c32 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:53 -0700 Subject: bpf: Convert percpu hash map to per-cpu bpf_mem_alloc. Convert dynamic allocations in percpu hash map from alloc_percpu() to bpf_mem_cache_alloc() from per-cpu bpf_mem_alloc. Since bpf_mem_alloc frees objects after RCU gp the call_rcu() is removed. pcpu_init_value() now needs to zero-fill per-cpu allocations, since dynamically allocated map elements are now similar to full prealloc, since alloc_percpu() is not called inline and the elements are reused in the freelist. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-12-alexei.starovoitov@gmail.com --- kernel/bpf/hashtab.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 70b02ff4445e..a77b9c4a4e48 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -94,6 +94,7 @@ struct bucket { struct bpf_htab { struct bpf_map map; struct bpf_mem_alloc ma; + struct bpf_mem_alloc pcpu_ma; struct bucket *buckets; void *elems; union { @@ -121,14 +122,14 @@ struct htab_elem { struct { void *padding; union { - struct bpf_htab *htab; struct pcpu_freelist_node fnode; struct htab_elem *batch_flink; }; }; }; union { - struct rcu_head rcu; + /* pointer to per-cpu pointer */ + void *ptr_to_pptr; struct bpf_lru_node lru_node; }; u32 hash; @@ -448,8 +449,6 @@ static int 
htab_map_alloc_check(union bpf_attr *attr) bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); int numa_node = bpf_map_attr_numa_node(attr); - BUILD_BUG_ON(offsetof(struct htab_elem, htab) != - offsetof(struct htab_elem, hash_node.pprev)); BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); @@ -610,6 +609,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false); if (err) goto free_map_locked; + if (percpu) { + err = bpf_mem_alloc_init(&htab->pcpu_ma, + round_up(htab->map.value_size, 8), true); + if (err) + goto free_map_locked; + } } return &htab->map; @@ -620,6 +625,7 @@ free_map_locked: for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); + bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); free_htab: lockdep_unregister_key(&htab->lockdep_key); @@ -895,19 +901,11 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) - free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); + bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); check_and_free_fields(htab, l); bpf_mem_cache_free(&htab->ma, l); } -static void htab_elem_free_rcu(struct rcu_head *head) -{ - struct htab_elem *l = container_of(head, struct htab_elem, rcu); - struct bpf_htab *htab = l->htab; - - htab_elem_free(htab, l); -} - static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) { struct bpf_map *map = &htab->map; @@ -953,12 +951,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { dec_elem_count(htab); - if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) { - l->htab = htab; - call_rcu(&l->rcu, htab_elem_free_rcu); - } else { - htab_elem_free(htab, l); - } + htab_elem_free(htab, l); } } @@ -983,13 +976,12 @@ static 
void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, void *value, bool onallcpus) { - /* When using prealloc and not setting the initial value on all cpus, - * zero-fill element values for other cpus (just as what happens when - * not using prealloc). Otherwise, bpf program has no way to ensure + /* When not setting the initial value on all cpus, zero-fill element + * values for other cpus. Otherwise, bpf program has no way to ensure * known initial values for cpus other than current one * (onallcpus=false always when coming from bpf prog). */ - if (htab_is_prealloc(htab) && !onallcpus) { + if (!onallcpus) { u32 size = round_up(htab->map.value_size, 8); int current_cpu = raw_smp_processor_id(); int cpu; @@ -1060,18 +1052,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, memcpy(l_new->key, key, key_size); if (percpu) { - size = round_up(size, 8); if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { /* alloc_percpu zero-fills */ - pptr = bpf_map_alloc_percpu(&htab->map, size, 8, - GFP_NOWAIT | __GFP_NOWARN); + pptr = bpf_mem_cache_alloc(&htab->pcpu_ma); if (!pptr) { bpf_mem_cache_free(&htab->ma, l_new); l_new = ERR_PTR(-ENOMEM); goto dec_count; } + l_new->ptr_to_pptr = pptr; + pptr = *(void **)pptr; } pcpu_init_value(htab, pptr, value, onallcpus); @@ -1568,6 +1560,7 @@ static void htab_map_free(struct bpf_map *map) bpf_map_free_kptr_off_tab(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); + bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); if (htab->use_percpu_counter) percpu_counter_destroy(&htab->pcount); -- cgit v1.2.3 From 96da3f7d489d11b43e7c1af90d876b9a2492cca8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:54 -0700 Subject: bpf: Remove tracing program restriction on map types The hash map is now fully converted to bpf_mem_alloc. 
Its implementation is not allocating synchronously and not calling call_rcu() directly. It's now safe to use non-preallocated hash maps in all types of tracing programs including BPF_PROG_TYPE_PERF_EVENT that runs out of NMI context. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-13-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3dce3166855f..57ec06b1d09d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12623,48 +12623,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); - /* - * Validate that trace type programs use preallocated hash maps. - * - * For programs attached to PERF events this is mandatory as the - * perf NMI can hit any arbitrary code sequence. - * - * All other trace types using non-preallocated per-cpu hash maps are - * unsafe as well because tracepoint or kprobes can be inside locked - * regions of the per-cpu memory allocator or at a place where a - * recursion into the per-cpu memory allocator would see inconsistent - * state. Non per-cpu hash maps are using bpf_mem_alloc-tor which is - * safe to use from kprobe/fentry and in RT. - * - * On RT enabled kernels run-time allocation of all trace type - * programs is strictly prohibited due to lock type constraints. On - * !RT kernels it is allowed for backwards compatibility reasons for - * now, but warnings are emitted so developers are made aware of - * the unsafety and can fix their programs before this is enforced. 
- */ - if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) { - if (prog_type == BPF_PROG_TYPE_PERF_EVENT) { - /* perf_event bpf progs have to use preallocated hash maps - * because non-prealloc is still relying on call_rcu to free - * elements. - */ - verbose(env, "perf_event programs can only use preallocated hash map\n"); - return -EINVAL; - } - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - (map->inner_map_meta && - map->inner_map_meta->map_type == BPF_MAP_TYPE_PERCPU_HASH)) { - if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - verbose(env, - "trace type programs can only use preallocated per-cpu hash map\n"); - return -EINVAL; - } - WARN_ONCE(1, "trace type BPF program uses run-time allocation\n"); - verbose(env, - "trace type programs with run-time allocated per-cpu hash maps are unsafe." - " Switch to preallocated hash maps.\n"); - } - } if (map_value_has_spin_lock(map)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { -- cgit v1.2.3 From dccb4a9013a68ddcb8303cd60f2fca1742014f3f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:55 -0700 Subject: bpf: Prepare bpf_mem_alloc to be used by sleepable bpf programs. Use call_rcu_tasks_trace() to wait for sleepable progs to finish. Then use call_rcu() to wait for normal progs to finish and finally do free_one() on each element when freeing objects into global memory pool. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-14-alexei.starovoitov@gmail.com --- kernel/bpf/memalloc.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index f7b07787581b..8895c016dcdb 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -230,6 +230,13 @@ static void __free_rcu(struct rcu_head *head) atomic_set(&c->call_rcu_in_progress, 0); } +static void __free_rcu_tasks_trace(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + + call_rcu(&c->rcu, __free_rcu); +} + static void enque_to_free(struct bpf_mem_cache *c, void *obj) { struct llist_node *llnode = obj; @@ -255,7 +262,11 @@ static void do_call_rcu(struct bpf_mem_cache *c) * from __free_rcu() and from drain_mem_cache(). */ __llist_add(llnode, &c->waiting_for_gp); - call_rcu(&c->rcu, __free_rcu); + /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish. + * Then use call_rcu() to wait for normal progs to finish + * and finally do free_one() on each element. + */ + call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace); } static void free_bulk(struct bpf_mem_cache *c) @@ -457,6 +468,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) /* c->waiting_for_gp list was drained, but __free_rcu might * still execute. Wait for it now before we free 'c'. 
*/ + rcu_barrier_tasks_trace(); rcu_barrier(); free_percpu(ma->cache); ma->cache = NULL; @@ -471,6 +483,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) } if (c->objcg) obj_cgroup_put(c->objcg); + rcu_barrier_tasks_trace(); rcu_barrier(); free_percpu(ma->caches); ma->caches = NULL; -- cgit v1.2.3 From 02cc5aa29e8cef4c1d710accd423546ab63f4eda Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:56 -0700 Subject: bpf: Remove prealloc-only restriction for sleepable bpf programs. Since hash map is now converted to bpf_mem_alloc and it's waiting for rcu and rcu_tasks_trace GPs before freeing elements into global memory slabs it's safe to use dynamically allocated hash maps in sleepable bpf programs. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-15-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 57ec06b1d09d..068b20ed34d2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12586,14 +12586,6 @@ err_put: return err; } -static int check_map_prealloc(struct bpf_map *map) -{ - return (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_PERCPU_HASH && - map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || - !(map->map_flags & BPF_F_NO_PREALLOC); -} - static bool is_tracing_prog_type(enum bpf_prog_type type) { switch (type) { @@ -12608,15 +12600,6 @@ static bool is_tracing_prog_type(enum bpf_prog_type type) } } -static bool is_preallocated_map(struct bpf_map *map) -{ - if (!check_map_prealloc(map)) - return false; - if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) - return false; - return true; -} - static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) 
@@ -12669,12 +12652,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: - if (!is_preallocated_map(map)) { - verbose(env, - "Sleepable programs can only use preallocated maps\n"); - return -EINVAL; - } - break; case BPF_MAP_TYPE_RINGBUF: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_SK_STORAGE: -- cgit v1.2.3 From bfc03c15bebf5e0028e21ca5fc0fe4a60a6b6681 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:57 -0700 Subject: bpf: Remove usage of kmem_cache from bpf_mem_cache. For bpf_mem_cache based hash maps the following stress test: for (i = 1; i <= 512; i <<= 1) for (j = 1; j <= 1 << 18; j <<= 1) fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, i, j, 2, 0); creates many kmem_cache-s that are not mergeable in debug kernels and consume unnecessary amount of memory. Turned out bpf_mem_cache's free_list logic does batching well, so usage of kmem_cache for fixed size allocations doesn't bring any performance benefits vs normal kmalloc. Hence get rid of kmem_cache in bpf_mem_cache. That saves memory, speeds up map create/destroy operations, while maintaining hash map update/delete performance. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220902211058.60789-16-alexei.starovoitov@gmail.com --- kernel/bpf/memalloc.c | 50 ++++++++++++++------------------------------------ 1 file changed, 14 insertions(+), 36 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 8895c016dcdb..38fbd15c130a 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -91,17 +91,13 @@ struct bpf_mem_cache { */ struct llist_head free_llist_extra; - /* kmem_cache != NULL when bpf_mem_alloc was created for specific - * element size.
- */ - struct kmem_cache *kmem_cache; struct irq_work refill_work; struct obj_cgroup *objcg; int unit_size; /* count of objects in free_llist */ int free_cnt; int low_watermark, high_watermark, batch; - bool percpu; + int percpu_size; struct rcu_head rcu; struct llist_head free_by_rcu; @@ -134,8 +130,8 @@ static void *__alloc(struct bpf_mem_cache *c, int node) */ gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT; - if (c->percpu) { - void **obj = kmem_cache_alloc_node(c->kmem_cache, flags, node); + if (c->percpu_size) { + void **obj = kmalloc_node(c->percpu_size, flags, node); void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); if (!obj || !pptr) { @@ -147,9 +143,6 @@ static void *__alloc(struct bpf_mem_cache *c, int node) return obj; } - if (c->kmem_cache) - return kmem_cache_alloc_node(c->kmem_cache, flags, node); - return kmalloc_node(c->unit_size, flags, node); } @@ -207,16 +200,13 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) static void free_one(struct bpf_mem_cache *c, void *obj) { - if (c->percpu) { + if (c->percpu_size) { free_percpu(((void **)obj)[1]); - kmem_cache_free(c->kmem_cache, obj); + kfree(obj); return; } - if (c->kmem_cache) - kmem_cache_free(c->kmem_cache, obj); - else - kfree(obj); + kfree(obj); } static void __free_rcu(struct rcu_head *head) @@ -356,7 +346,7 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu)); } -/* When size != 0 create kmem_cache and bpf_mem_cache for each cpu. +/* When size != 0 bpf_mem_cache for each cpu. * This is typical bpf hash map use case when all elements have equal size. 
* * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on @@ -368,40 +358,29 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; struct bpf_mem_caches *cc, __percpu *pcc; struct bpf_mem_cache *c, __percpu *pc; - struct kmem_cache *kmem_cache = NULL; struct obj_cgroup *objcg = NULL; - char buf[32]; - int cpu, i, unit_size; + int cpu, i, unit_size, percpu_size = 0; if (size) { pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); if (!pc) return -ENOMEM; - if (percpu) { - unit_size = size; + if (percpu) /* room for llist_node and per-cpu pointer */ - size = LLIST_NODE_SZ + sizeof(void *); - } else { + percpu_size = LLIST_NODE_SZ + sizeof(void *); + else size += LLIST_NODE_SZ; /* room for llist_node */ - unit_size = size; - } + unit_size = size; - snprintf(buf, sizeof(buf), "bpf-%u", size); - kmem_cache = kmem_cache_create(buf, size, 8, 0, NULL); - if (!kmem_cache) { - free_percpu(pc); - return -ENOMEM; - } #ifdef CONFIG_MEMCG_KMEM objcg = get_obj_cgroup_from_current(); #endif for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); - c->kmem_cache = kmem_cache; c->unit_size = unit_size; c->objcg = objcg; - c->percpu = percpu; + c->percpu_size = percpu_size; prefill_mem_cache(c, cpu); } ma->cache = pc; @@ -461,8 +440,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) c = per_cpu_ptr(ma->cache, cpu); drain_mem_cache(c); } - /* kmem_cache and memcg are the same across cpus */ - kmem_cache_destroy(c->kmem_cache); + /* objcg is the same across cpus */ if (c->objcg) obj_cgroup_put(c->objcg); /* c->waiting_for_gp list was drained, but __free_rcu might -- cgit v1.2.3 From 9f2c6e96c65e6fa1aebef546be0c30a5895fcb37 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:58 -0700 Subject: bpf: Optimize rcu_barrier usage between hash map and bpf_mem_alloc. User space might be creating and destroying a lot of hash maps. 
Synchronous rcu_barrier-s in a destruction path of hash map delay freeing of hash buckets and other map memory and may cause artificial OOM situation under stress. Optimize rcu_barrier usage between bpf hash map and bpf_mem_alloc: - remove rcu_barrier from hash map, since htab doesn't use call_rcu directly and there are no callbacks to wait for. - bpf_mem_alloc has call_rcu_in_progress flag that indicates pending callbacks. Use it to avoid barriers in fast path. - When barriers are needed copy bpf_mem_alloc into temp structure and wait for rcu barrier-s in the worker to let the rest of hash map freeing proceed. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220902211058.60789-17-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 2 ++ kernel/bpf/hashtab.c | 6 ++-- kernel/bpf/memalloc.c | 80 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 69 insertions(+), 19 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 653ed1584a03..3e164b8efaa9 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -3,6 +3,7 @@ #ifndef _BPF_MEM_ALLOC_H #define _BPF_MEM_ALLOC_H #include +#include struct bpf_mem_cache; struct bpf_mem_caches; @@ -10,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + struct work_struct work; }; int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a77b9c4a4e48..0fe3f136cbbe 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1546,10 +1546,10 @@ static void htab_map_free(struct bpf_map *map) * There is no need to synchronize_rcu() here to protect map elements. */ - /* some of free_htab_elem() callbacks for elements of this map may - * not have executed. Wait for them.
+ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it + * underneath and is responsible for waiting for callbacks to finish + * during bpf_mem_alloc_destroy(). */ - rcu_barrier(); if (!htab_is_prealloc(htab)) { delete_all_elements(htab); } else { diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 38fbd15c130a..5cc952da7d41 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -414,10 +414,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c) { struct llist_node *llnode, *t; - /* The caller has done rcu_barrier() and no progs are using this - * bpf_mem_cache, but htab_map_free() called bpf_mem_cache_free() for - * all remaining elements and they can be in free_by_rcu or in - * waiting_for_gp lists, so drain those lists now. + /* No progs are using this bpf_mem_cache, but htab_map_free() called + * bpf_mem_cache_free() for all remaining elements and they can be in + * free_by_rcu or in waiting_for_gp lists, so drain those lists now. */ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) free_one(c, llnode); @@ -429,42 +428,91 @@ static void drain_mem_cache(struct bpf_mem_cache *c) free_one(c, llnode); } +static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) +{ + free_percpu(ma->cache); + free_percpu(ma->caches); + ma->cache = NULL; + ma->caches = NULL; +} + +static void free_mem_alloc(struct bpf_mem_alloc *ma) +{ + /* waiting_for_gp lists were drained, but __free_rcu might + * still execute. Wait for it now before freeing percpu caches. + */ + rcu_barrier_tasks_trace(); + rcu_barrier(); + free_mem_alloc_no_barrier(ma); +} + +static void free_mem_alloc_deferred(struct work_struct *work) +{ + struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work); + + free_mem_alloc(ma); + kfree(ma); +} + +static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) +{ + struct bpf_mem_alloc *copy; + + if (!rcu_in_progress) { + /* Fast path.
No callbacks are pending, hence no need to do + * rcu_barrier-s. + */ + free_mem_alloc_no_barrier(ma); + return; + } + + copy = kmalloc(sizeof(*ma), GFP_KERNEL); + if (!copy) { + /* Slow path with inline barrier-s */ + free_mem_alloc(ma); + return; + } + + /* Defer barriers into worker to let the rest of map memory be freed */ + copy->cache = ma->cache; + ma->cache = NULL; + copy->caches = ma->caches; + ma->caches = NULL; + INIT_WORK(&copy->work, free_mem_alloc_deferred); + queue_work(system_unbound_wq, &copy->work); +} + void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) { struct bpf_mem_caches *cc; struct bpf_mem_cache *c; - int cpu, i; + int cpu, i, rcu_in_progress; if (ma->cache) { + rcu_in_progress = 0; for_each_possible_cpu(cpu) { c = per_cpu_ptr(ma->cache, cpu); drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } /* objcg is the same across cpus */ if (c->objcg) obj_cgroup_put(c->objcg); - /* c->waiting_for_gp list was drained, but __free_rcu might - * still execute. Wait for it now before we free 'c'. - */ - rcu_barrier_tasks_trace(); - rcu_barrier(); - free_percpu(ma->cache); - ma->cache = NULL; + destroy_mem_alloc(ma, rcu_in_progress); } if (ma->caches) { + rcu_in_progress = 0; for_each_possible_cpu(cpu) { cc = per_cpu_ptr(ma->caches, cpu); for (i = 0; i < NUM_CACHES; i++) { c = &cc->cache[i]; drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } if (c->objcg) obj_cgroup_put(c->objcg); - rcu_barrier_tasks_trace(); - rcu_barrier(); - free_percpu(ma->caches); - ma->caches = NULL; + destroy_mem_alloc(ma, rcu_in_progress); } } -- cgit v1.2.3 From 1e660f7ebe0ff6ac65ee0000280392d878630a67 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 6 Sep 2022 19:38:53 -0700 Subject: bpf: Replace __ksize with ksize. __ksize() was made private. Use ksize() instead.
Reported-by: Stephen Rothwell Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 5cc952da7d41..20621f5407d8 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -610,7 +610,7 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) if (!ptr) return; - idx = bpf_mem_cache_idx(__ksize(ptr - LLIST_NODE_SZ)); + idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); if (idx < 0) return; -- cgit v1.2.3 From 720e6a435194fb5237833a4a7ec6aa60a78964a8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 31 Aug 2022 08:26:46 -0700 Subject: bpf: Allow struct argument in trampoline based programs Allow struct argument in trampoline based programs where the struct size should be <= 16 bytes. In such cases, the argument will be put into up to 2 registers for bpf, x86_64 and arm64 architectures. To support arch-specific trampoline manipulation, add arg_flags for additional struct information about arguments in btf_func_model. Such information will be used in arch specific function arch_prepare_bpf_trampoline() to prepare argument access properly in trampoline. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220831152646.2078089-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++++ kernel/bpf/btf.c | 45 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9c1674973e03..4d32f125f4af 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -727,10 +727,14 @@ enum bpf_cgroup_storage_type { */ #define MAX_BPF_FUNC_REG_ARGS 5 +/* The argument is a structure. 
*/ +#define BTF_FMODEL_STRUCT_ARG BIT(0) + struct btf_func_model { u8 ret_size; u8 nr_args; u8 arg_size[MAX_BPF_FUNC_ARGS]; + u8 arg_flags[MAX_BPF_FUNC_ARGS]; }; /* Restore arguments before returning from trampoline to let original function diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 903719b89238..ea94527e5d70 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5328,6 +5328,34 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t) return btf_type_is_int(t); } +static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, + int off) +{ + const struct btf_param *args; + const struct btf_type *t; + u32 offset = 0, nr_args; + int i; + + if (!func_proto) + return off / 8; + + nr_args = btf_type_vlen(func_proto); + args = (const struct btf_param *)(func_proto + 1); + for (i = 0; i < nr_args; i++) { + t = btf_type_skip_modifiers(btf, args[i].type, NULL); + offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8); + if (off < offset) + return i; + } + + t = btf_type_skip_modifiers(btf, func_proto->type, NULL); + offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8); + if (off < offset) + return nr_args; + + return nr_args + 1; +} + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -5347,7 +5375,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, tname, off); return false; } - arg = off / 8; + arg = get_ctx_arg_idx(btf, t, off); args = (const struct btf_param *)(t + 1); /* if (t == NULL) Fall back to default BPF prog with * MAX_BPF_FUNC_REG_ARGS u64 arguments. 
@@ -5417,7 +5445,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_small_int(t) || btf_is_any_enum(t)) + if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -5881,7 +5909,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); - if (btf_type_is_int(t) || btf_is_any_enum(t)) + if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) return t->size; return -EINVAL; } @@ -5901,8 +5929,10 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, /* BTF function prototype doesn't match the verifier types. * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args. */ - for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { m->arg_size[i] = 8; + m->arg_flags[i] = 0; + } m->ret_size = 8; m->nr_args = MAX_BPF_FUNC_REG_ARGS; return 0; @@ -5916,7 +5946,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return -EINVAL; } ret = __get_type_size(btf, func->type, &t); - if (ret < 0) { + if (ret < 0 || __btf_type_is_struct(t)) { bpf_log(log, "The function %s return type %s is unsupported.\n", tname, btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -5932,7 +5962,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return -EINVAL; } ret = __get_type_size(btf, args[i].type, &t); - if (ret < 0) { + + /* No support of struct argument size greater than 16 bytes */ + if (ret < 0 || ret > 16) { bpf_log(log, "The function %s arg%d type %s is unsupported.\n", tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -5945,6 +5977,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return -EINVAL; } m->arg_size[i] = ret; + m->arg_flags[i] = __btf_type_is_struct(t) ? 
BTF_FMODEL_STRUCT_ARG : 0; } m->nr_args = nargs; return 0; -- cgit v1.2.3 From 95f2f26f3cac06cfc046d2b29e60719d7848ea54 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Sep 2022 17:12:58 +0200 Subject: bpf: split btf_check_subprog_arg_match in two btf_check_subprog_arg_match() was used twice in verifier.c: - when checking for the type mismatches between a (sub)prog declaration and BTF - when checking the call of a subprog to see if the provided arguments are correct and valid This is problematic when we check if the first argument of a program (pointer to ctx) is correctly accessed: To be able to ensure we access valid memory in the ctx, the verifier assumes the pointer to context is not null. This has the side effect of marking the program as accessing the entire context, even if the context is never dereferenced. For example, by checking the context access with the current code, the following eBPF program would fail with -EINVAL if the ctx is set to null from the userspace: ``` SEC("syscall") int prog(struct my_ctx *args) { return 0; } ``` In that particular case, we do not want to actually check that the memory is correct while checking for the BTF validity, but we just want to ensure that the (sub)prog definition matches the BTF we have. So split btf_check_subprog_arg_match() in two so we can actually check for the memory used when in a call, and ignore that part when not. Note that a further patch is in preparation to disentangle btf_check_func_arg_match() from these two purposes, and so right now we just add a new hack around that by adding a boolean to this function.
Signed-off-by: Benjamin Tissoires Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220906151303.2780789-3-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/btf.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++----- kernel/bpf/verifier.c | 2 +- 3 files changed, 52 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4d32f125f4af..3cf161cfd396 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1947,6 +1947,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); +int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs); int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ea94527e5d70..9291e2b2c950 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6203,7 +6203,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok, - u32 kfunc_flags) + u32 kfunc_flags, + bool processing_call) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); bool rel = false, kptr_get = false, trusted_arg = false; @@ -6389,7 +6390,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_tname); return -EINVAL; } - } else if (ptr_to_mem_ok) { + } else if (ptr_to_mem_ok && processing_call) { const struct btf_type *resolve_ret; u32 type_size; @@ -6464,7 +6465,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return rel ? ref_regno : 0; } -/* Compare BTF of a function with given bpf_reg_state. +/* Compare BTF of a function declaration with given bpf_reg_state. 
* Returns: * EFAULT - there is a verifier bug. Abort verification. * EINVAL - there is a type mismatch or BTF is not available. @@ -6491,7 +6492,50 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, false); + + /* Compiler optimizations can remove arguments from static functions + * or mismatched type can be passed into a global function. + * In such cases mark the function as unreliable from BTF point of view. + */ + if (err) + prog->aux->func_info_aux[subprog].unreliable = true; + return err; +} + +/* Compare BTF of a function call with given bpf_reg_state. + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - there is a type mismatch or BTF is not available. + * 0 - BTF matches with what bpf_reg_state expects. + * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. + * + * NOTE: the code is duplicated from btf_check_subprog_arg_match() + * because btf_check_func_arg_match() is still doing both. Once that + * function is split in 2, we can call from here btf_check_subprog_arg_match() + * first, and then treat the calling part in a new code path. 
+ */ +int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs) +{ + struct bpf_prog *prog = env->prog; + struct btf *btf = prog->aux->btf; + bool is_global; + u32 btf_id; + int err; + + if (!prog->aux->func_info) + return -EINVAL; + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) + return -EFAULT; + + if (prog->aux->func_info_aux[subprog].unreliable) + return -EINVAL; + + is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, true); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. @@ -6507,7 +6551,7 @@ int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 kfunc_flags) { - return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags); + return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags, true); } /* Convert BTF of a function into bpf_reg_state if possible diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 003f7ba19558..7d9a2e18ca8a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6629,7 +6629,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_info_aux = env->prog->aux->func_info_aux; if (func_info_aux) is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_subprog_arg_match(env, subprog, caller->regs); + err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; if (is_global) { -- cgit v1.2.3 From 15baa55ff5b00b81bcd9874b89cb8e0b0daaa13d Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Sep 2022 17:12:59 +0200 Subject: bpf/verifier: allow all functions to read user provided context When a function was trying to access data from context in a syscall eBPF program, the verifier was rejecting the call unless it was accessing the 
first element. This is because the syscall context is not known at compile time, and so we need to check this when actually accessing it. Check for the valid memory access if there is no convert_ctx callback, and allow such situation to happen. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220906151303.2780789-4-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7d9a2e18ca8a..3cfe60206de6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5233,6 +5233,25 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, env, regno, reg->off, access_size, zero_size_allowed, ACCESS_HELPER, meta); + case PTR_TO_CTX: + /* in case the function doesn't know how to access the context, + * (because we are in a program of type SYSCALL for example), we + * can not statically check its size. + * Dynamically check it now. + */ + if (!env->ops->convert_ctx_access) { + enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ; + int offset = access_size - 1; + + /* Allow zero-byte read from PTR_TO_CTX */ + if (access_size == 0) + return zero_size_allowed ? 0 : -EACCES; + + return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, + atype, -1, false); + } + + fallthrough; default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && -- cgit v1.2.3 From f9b348185f4d684cc19e6bd9b87904823d5aa5ed Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Sep 2022 17:13:01 +0200 Subject: bpf/btf: bump BTF_KFUNC_SET_MAX_CNT net/bpf/test_run.c is already presenting 20 kfuncs. net/netfilter/nf_conntrack_bpf.c is also presenting an extra 10 kfuncs. 
Given that all the kfuncs are regrouped into one unique set, having only 2 spaces left prevents us from adding more selftests. Bump it to 256. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220906151303.2780789-6-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9291e2b2c950..2c2d8190ca4a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -208,7 +208,7 @@ enum btf_kfunc_hook { }; enum { - BTF_KFUNC_SET_MAX_CNT = 32, + BTF_KFUNC_SET_MAX_CNT = 256, BTF_DTOR_KFUNC_MAX_CNT = 256, }; -- cgit v1.2.3 From eb1f7f71c126c8fd50ea81af98f97c4b581ea4ae Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Sep 2022 17:13:02 +0200 Subject: bpf/verifier: allow kfunc to return an allocated mem For drivers (outside of network), the incoming data is not statically defined in a struct. Most of the time the data buffer is kzalloc-ed and thus we can not rely on eBPF and BTF to explore the data. This commit allows a kfunc to return arbitrary memory previously allocated by the driver. An interesting extra point is that the kfunc can mark the exported memory region as read only or read/write. So, when a kfunc is not returning a pointer to a struct but to a plain type, we can consider it is a valid allocated memory assuming that: - one of the arguments is either called rdonly_buf_size or rdwr_buf_size - and this argument is a const from the caller point of view We can then use this parameter as the size of the allocated memory. The memory is either read-only or read-write based on the name of the size parameter.
Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220906151303.2780789-7-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++- include/linux/bpf_verifier.h | 2 + include/linux/btf.h | 10 +++++ kernel/bpf/btf.c | 101 ++++++++++++++++++++++++++++++++++--------- kernel/bpf/verifier.c | 45 +++++++++++++------ 5 files changed, 133 insertions(+), 34 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3cf161cfd396..79883f883ff3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1944,6 +1944,13 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); +struct bpf_kfunc_arg_meta { + u64 r0_size; + bool r0_rdonly; + int ref_obj_id; + u32 flags; +}; + struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); @@ -1952,7 +1959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, - u32 kfunc_flags); + struct bpf_kfunc_arg_meta *meta); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1fdddbf3546b..8fbc1d05281e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -598,6 +598,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, struct bpf_attach_target_info *tgt_info); void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); +int mark_chain_precision(struct bpf_verifier_env *env, int regno); + #define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) /* extract base type from bpf_{arg, return, reg}_type. 
*/ diff --git a/include/linux/btf.h b/include/linux/btf.h index ad93c2d9cc1c..1fcc833a8690 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -441,4 +441,14 @@ static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dt } #endif +static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) +{ + if (!btf_type_is_ptr(t)) + return false; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + return btf_type_is_struct(t); +} + #endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2c2d8190ca4a..9d12212fcd61 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6199,11 +6199,36 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf, return true; } +static bool btf_is_kfunc_arg_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg, + const char *name) +{ + int len, target_len = strlen(name); + const struct btf_type *t; + const char *param_name; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len != target_len) + return false; + if (strcmp(param_name, name)) + return false; + + return true; +} + static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok, - u32 kfunc_flags, + struct bpf_kfunc_arg_meta *kfunc_meta, bool processing_call) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); @@ -6241,12 +6266,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - if (is_kfunc) { + if (is_kfunc && kfunc_meta) { /* Only kfunc can be release func */ - rel = kfunc_flags & KF_RELEASE; - kptr_get = kfunc_flags & KF_KPTR_GET; - trusted_arg = kfunc_flags & KF_TRUSTED_ARGS; - sleepable = kfunc_flags & KF_SLEEPABLE; + rel 
= kfunc_meta->flags & KF_RELEASE; + kptr_get = kfunc_meta->flags & KF_KPTR_GET; + trusted_arg = kfunc_meta->flags & KF_TRUSTED_ARGS; + sleepable = kfunc_meta->flags & KF_SLEEPABLE; } /* check that BTF function arguments match actual types that the @@ -6259,6 +6284,38 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, t = btf_type_skip_modifiers(btf, args[i].type, NULL); if (btf_type_is_scalar(t)) { + if (is_kfunc && kfunc_meta) { + bool is_buf_size = false; + + /* check for any const scalar parameter of name "rdonly_buf_size" + * or "rdwr_buf_size" + */ + if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg, + "rdonly_buf_size")) { + kfunc_meta->r0_rdonly = true; + is_buf_size = true; + } else if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg, + "rdwr_buf_size")) + is_buf_size = true; + + if (is_buf_size) { + if (kfunc_meta->r0_size) { + bpf_log(log, "2 or more rdonly/rdwr_buf_size parameters for kfunc"); + return -EINVAL; + } + + if (!tnum_is_const(reg->var_off)) { + bpf_log(log, "R%d is not a const\n", regno); + return -EINVAL; + } + + kfunc_meta->r0_size = reg->var_off.value; + ret = mark_chain_precision(env, regno); + if (ret) + return ret; + } + } + if (reg->type == SCALAR_VALUE) continue; bpf_log(log, "R%d is not a scalar\n", regno); @@ -6289,6 +6346,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (ret < 0) return ret; + if (is_kfunc && reg->ref_obj_id) { + /* Ensure only one argument is referenced PTR_TO_BTF_ID */ + if (ref_obj_id) { + bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, ref_obj_id); + return -EFAULT; + } + ref_regno = regno; + ref_obj_id = reg->ref_obj_id; + } + /* kptr_get is only true for kfunc */ if (i == 0 && kptr_get) { struct bpf_map_value_off_desc *off_desc; @@ -6361,16 +6429,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; - /* Ensure 
only one argument is referenced PTR_TO_BTF_ID */ - if (reg->ref_obj_id) { - if (ref_obj_id) { - bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, ref_obj_id); - return -EFAULT; - } - ref_regno = regno; - ref_obj_id = reg->ref_obj_id; - } } else { reg_btf = btf_vmlinux; reg_ref_id = *reg2btf_ids[base_type(reg->type)]; @@ -6461,6 +6519,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + if (kfunc_meta && ref_obj_id) + kfunc_meta->ref_obj_id = ref_obj_id; + /* returns argument register number > 0 in case of reference release kfunc */ return rel ? ref_regno : 0; } @@ -6492,7 +6553,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, false); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, false); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. @@ -6535,7 +6596,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, true); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, true); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. 
@@ -6549,9 +6610,9 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, - u32 kfunc_flags) + struct bpf_kfunc_arg_meta *meta) { - return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags, true); + return btf_check_func_arg_match(env, btf, func_id, regs, true, meta, true); } /* Convert BTF of a function into bpf_reg_state if possible diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3cfe60206de6..f3344a86d88d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2908,7 +2908,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, return 0; } -static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +int mark_chain_precision(struct bpf_verifier_env *env, int regno) { return __mark_chain_precision(env, regno, -1); } @@ -7595,6 +7595,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, { const struct btf_type *t, *func, *func_proto, *ptr_type; struct bpf_reg_state *regs = cur_regs(env); + struct bpf_kfunc_arg_meta meta = { 0 }; const char *func_name, *ptr_type_name; u32 i, nargs, func_id, ptr_type_id; int err, insn_idx = *insn_idx_p; @@ -7629,8 +7630,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, acq = *kfunc_flags & KF_ACQUIRE; + meta.flags = *kfunc_flags; + /* Check the arguments */ - err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, *kfunc_flags); + err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, &meta); if (err < 0) return err; /* In case of release function, we get register number of refcounted @@ -7651,7 +7654,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Check return type */ t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); - if (acq && !btf_type_is_ptr(t)) { + if (acq && 
!btf_type_is_struct_ptr(desc_btf, t)) { verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return -EINVAL; } @@ -7663,17 +7666,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); if (!btf_type_is_struct(ptr_type)) { - ptr_type_name = btf_name_by_offset(desc_btf, - ptr_type->name_off); - verbose(env, "kernel function %s returns pointer type %s %s is not supported\n", - func_name, btf_type_str(ptr_type), - ptr_type_name); - return -EINVAL; + if (!meta.r0_size) { + ptr_type_name = btf_name_by_offset(desc_btf, + ptr_type->name_off); + verbose(env, + "kernel function %s returns pointer type %s %s is not supported\n", + func_name, + btf_type_str(ptr_type), + ptr_type_name); + return -EINVAL; + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM; + regs[BPF_REG_0].mem_size = meta.r0_size; + + if (meta.r0_rdonly) + regs[BPF_REG_0].type |= MEM_RDONLY; + + /* Ensures we don't access the memory after a release_reference() */ + if (meta.ref_obj_id) + regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + } else { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].type = PTR_TO_BTF_ID; + regs[BPF_REG_0].btf_id = ptr_type_id; } - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].type = PTR_TO_BTF_ID; - regs[BPF_REG_0].btf_id = ptr_type_id; if (*kfunc_flags & KF_RET_NULL) { regs[BPF_REG_0].type |= PTR_MAYBE_NULL; /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ -- cgit v1.2.3 From 9fad7fe5b29803584c7f17a2abe6c2936fec6828 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Wed, 7 Sep 2022 16:24:20 +0100 Subject: bpf: Fix resetting logic for unreferenced kptrs Sparse reported a warning at bpf_map_free_kptrs() "warning: Using plain integer as NULL pointer" During the process of fixing this warning, it was discovered that the current code 
erroneously writes to the pointer variable instead of dereferencing and writing to the actual kptr. Hence, the Sparse tool accidentally helped to uncover this problem. Fix this by doing WRITE_ONCE(*p, 0) instead of WRITE_ONCE(p, 0). Note that the effect of this bug is that unreferenced kptrs will not be cleared during check_and_free_fields. It is not a problem if the clearing is not done during map_free stage, as there is nothing to free for them. Fixes: 14a324f6a67e ("bpf: Wire up freeing of referenced kptr") Signed-off-by: Jules Irenge Link: https://lore.kernel.org/r/Yxi3pJaK6UDjVJSy@playground Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4fb08c43420d..d35a6aa3aa96 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -598,7 +598,7 @@ void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) if (off_desc->type == BPF_KPTR_UNREF) { u64 *p = (u64 *)btf_id_ptr; - WRITE_ONCE(p, 0); + WRITE_ONCE(*p, 0); continue; } old_ptr = xchg(btf_id_ptr, 0); -- cgit v1.2.3 From 6df4ea1ff0ff70798ff1e7eed79f98ccb7b5b0a2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:15 +0200 Subject: bpf: Support kptrs in percpu arraymap Enable support for kptrs in percpu BPF arraymap by wiring up the freeing of these kptrs from percpu map elements.
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 33 ++++++++++++++++++++++++--------- kernel/bpf/syscall.c | 3 ++- 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 624527401d4d..832b2659e96e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -279,7 +279,8 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { - bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); + copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value + off); off += size; } rcu_read_unlock(); @@ -338,8 +339,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EINVAL; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), - value, map->value_size); + val = this_cpu_ptr(array->pptrs[index & array->index_mask]); + copy_map_value(map, val, value); + check_and_free_fields(array, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); @@ -383,7 +385,8 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); + copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); + check_and_free_fields(array, per_cpu_ptr(pptr, cpu)); off += size; } rcu_read_unlock(); @@ -421,8 +424,20 @@ static void array_map_free(struct bpf_map *map) int i; if (map_value_has_kptrs(map)) { - for (i = 0; i < array->map.max_entries; i++) - bpf_map_free_kptrs(map, array_map_elem_ptr(array, i)); + if (array->map.map_type == 
BPF_MAP_TYPE_PERCPU_ARRAY) { + for (i = 0; i < array->map.max_entries; i++) { + void __percpu *pptr = array->pptrs[i & array->index_mask]; + int cpu; + + for_each_possible_cpu(cpu) { + bpf_map_free_kptrs(map, per_cpu_ptr(pptr, cpu)); + cond_resched(); + } + } + } else { + for (i = 0; i < array->map.max_entries; i++) + bpf_map_free_kptrs(map, array_map_elem_ptr(array, i)); + } bpf_map_free_kptr_off_tab(map); } @@ -608,9 +623,9 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) pptr = v; size = array->elem_size; for_each_possible_cpu(cpu) { - bpf_long_memcpy(info->percpu_value_buf + off, - per_cpu_ptr(pptr, cpu), - size); + copy_map_value_long(map, info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, info->percpu_value_buf + off); off += size; } ctx.value = info->percpu_value_buf; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d35a6aa3aa96..69be1c612daa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1049,7 +1049,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, } if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) { + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { ret = -EOPNOTSUPP; goto free_map_tab; } -- cgit v1.2.3 From b239da34203f49c40b5d656220c39647c3ff0b3c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:28 +0200 Subject: bpf: Add helper macro bpf_for_each_reg_in_vstate For a lot of use cases in future patches, we will want to modify the state of registers part of some same 'group' (e.g. same ref_obj_id). It won't just be limited to releasing reference state, but setting a type flag dynamically based on certain actions, etc. Hence, we need a way to easily pass a callback to the function that iterates over all registers in current bpf_verifier_state in all frames upto (and including) the curframe. 
While in C++ we would be able to easily use a lambda to pass state and the callback together, sadly we aren't using C++ in the kernel. The next best thing to avoid defining a function for each case seems like statement expressions in GNU C. The kernel already uses them heavily, hence they can be passed to the macro in the style of a lambda. The statement expression will then be substituted in the for loop bodies. Variables __state and __reg are set to current bpf_func_state and reg for each invocation of the expression inside the passed in verifier state. Then, convert mark_ptr_or_null_regs, clear_all_pkt_pointers, release_reference, find_good_pkt_pointers, find_equal_scalars to use bpf_for_each_reg_in_vstate. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-16-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 21 +++++++ kernel/bpf/verifier.c | 135 +++++++++---------------------------------- 2 files changed, 49 insertions(+), 107 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8fbc1d05281e..b49a349cc6ae 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -348,6 +348,27 @@ struct bpf_verifier_state { iter < frame->allocated_stack / BPF_REG_SIZE; \ iter++, reg = bpf_get_spilled_reg(iter, frame)) +/* Invoke __expr over regsiters in __vst, setting __state and __reg */ +#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ + ({ \ + struct bpf_verifier_state *___vstate = __vst; \ + int ___i, ___j; \ + for (___i = 0; ___i <= ___vstate->curframe; ___i++) { \ + struct bpf_reg_state *___regs; \ + __state = ___vstate->frame[___i]; \ + ___regs = __state->regs; \ + for (___j = 0; ___j < MAX_BPF_REG; ___j++) { \ + __reg = &___regs[___j]; \ + (void)(__expr); \ + } \ + bpf_for_each_spilled_reg(___j, __state, __reg) { \ + if (!__reg) \ + continue; \ + (void)(__expr); \ + } \ + } \ + }) + /* linked
list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3344a86d88d..c0f175ac187a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6513,31 +6513,15 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id) /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. */ -static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, - struct bpf_func_state *state) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct bpf_reg_state *regs = state->regs, *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - if (reg_is_pkt_pointer_any(&regs[i])) - mark_reg_unknown(env, regs, i); + struct bpf_func_state *state; + struct bpf_reg_state *reg; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ if (reg_is_pkt_pointer_any(reg)) __mark_reg_unknown(env, reg); - } -} - -static void clear_all_pkt_pointers(struct bpf_verifier_env *env) -{ - struct bpf_verifier_state *vstate = env->cur_state; - int i; - - for (i = 0; i <= vstate->curframe; i++) - __clear_all_pkt_pointers(env, vstate->frame[i]); + })); } enum { @@ -6566,41 +6550,24 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range reg->range = AT_PKT_END; } -static void release_reg_references(struct bpf_verifier_env *env, - struct bpf_func_state *state, - int ref_obj_id) -{ - struct bpf_reg_state *regs = state->regs, *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].ref_obj_id == ref_obj_id) - mark_reg_unknown(env, regs, i); - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->ref_obj_id == ref_obj_id) - __mark_reg_unknown(env, reg); - } -} - /* The pointer with the specified id has released its reference to kernel * resources.
Identify all copies of the same pointer and clear the reference. */ static int release_reference(struct bpf_verifier_env *env, int ref_obj_id) { - struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state; + struct bpf_reg_state *reg; int err; - int i; err = release_reference_state(cur_func(env), ref_obj_id); if (err) return err; - for (i = 0; i <= vstate->curframe; i++) - release_reg_references(env, vstate->frame[i], ref_obj_id); + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->ref_obj_id == ref_obj_id) + __mark_reg_unknown(env, reg); + })); return 0; } @@ -9335,34 +9302,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static void __find_good_pkt_pointers(struct bpf_func_state *state, - struct bpf_reg_state *dst_reg, - enum bpf_reg_type type, int new_range) -{ - struct bpf_reg_state *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) { - reg = &state->regs[i]; - if (reg->type == type && reg->id == dst_reg->id) - /* keep the maximum range already checked */ - reg->range = max(reg->range, new_range); - } - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); - } -} - static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { - int new_range, i; + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int new_range; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -9427,9 +9374,11 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, * the range won't allow anything. * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 
*/ - for (i = 0; i <= vstate->curframe; i++) - __find_good_pkt_pointers(vstate->frame[i], dst_reg, type, - new_range); + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + if (reg->type == type && reg->id == dst_reg->id) + /* keep the maximum range already checked */ + reg->range = max(reg->range, new_range); + })); } static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) @@ -9918,7 +9867,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset - * in release_reg_references(). + * in release_reference(). * * reg->id is still used by spin_lock ptr. Other * than spin_lock ptr type, reg->id can be reset. @@ -9928,22 +9877,6 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } } -static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id, - bool is_null) -{ - struct bpf_reg_state *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - mark_ptr_or_null_reg(state, &state->regs[i], id, is_null); - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - mark_ptr_or_null_reg(state, reg, id, is_null); - } -} - /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ @@ -9951,10 +9884,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs = state->regs, *reg; u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; - int i; if (ref_obj_id && ref_obj_id == id && is_null) /* regs[regno] is in the " == NULL" branch. 
@@ -9963,8 +9895,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, */ WARN_ON_ONCE(release_reference_state(state, id)); - for (i = 0; i <= vstate->curframe; i++) - __mark_ptr_or_null_regs(vstate->frame[i], id, is_null); + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + mark_ptr_or_null_reg(state, reg, id, is_null); + })); } static bool try_match_pkt_pointers(const struct bpf_insn *insn, @@ -10077,23 +10010,11 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, { struct bpf_func_state *state; struct bpf_reg_state *reg; - int i, j; - for (i = 0; i <= vstate->curframe; i++) { - state = vstate->frame[i]; - for (j = 0; j < MAX_BPF_REG; j++) { - reg = &state->regs[j]; - if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; - } - - bpf_for_each_spilled_reg(j, state, reg) { - if (!reg) - continue; - if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; - } - } + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + })); } static int check_cond_jmp_op(struct bpf_verifier_env *env, -- cgit v1.2.3 From cf7de6a53600ea554a8358e44fbcf47b449235f9 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 11 Sep 2022 00:07:11 +0900 Subject: bpf: add missing percpu_counter_destroy() in htab_map_alloc() syzbot is reporting ODEBUG bug in htab_map_alloc() [1], for commit 86fe28f7692d96d2 ("bpf: Optimize element count in non-preallocated hash map.") added percpu_counter_init() to htab_map_alloc() but forgot to add percpu_counter_destroy() to the error path. 
Link: https://syzkaller.appspot.com/bug?extid=5d1da78b375c3b5e6c2b [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Fixes: 86fe28f7692d96d2 ("bpf: Optimize element count in non-preallocated hash map.") Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/e2e4cc0e-9d36-4ca1-9bfa-ce23e6f8310b@I-love.SAKURA.ne.jp Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0fe3f136cbbe..86aec20c22d0 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -622,6 +622,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) free_prealloc: prealloc_destroy(htab); free_map_locked: + if (htab->use_percpu_counter) + percpu_counter_destroy(&htab->pcount); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); -- cgit v1.2.3 From 57c92f11a215717bf90880828b7a23c736c3c0d9 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Wed, 7 Sep 2022 16:57:46 +0100 Subject: bpf: Simplify code by using for_each_cpu_wrap() In the percpu freelist code, it is a common pattern to iterate over the possible CPUs mask starting with the current CPU. The pattern is implemented using a hand rolled while loop with the loop variable increment being open-coded. Simplify the code by using for_each_cpu_wrap() helper to iterate over the possible cpus starting with the current CPU. As a result, some of the special-casing in the loop also gets simplified. No functional change intended. 
Signed-off-by: Punit Agrawal Acked-by: Song Liu Link: https://lore.kernel.org/r/20220907155746.1750329-1-punit.agrawal@bytedance.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/percpu_freelist.c | 48 +++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 32 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 00b874c8e889..b6e7f5c5b9ab 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -58,23 +58,21 @@ static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, { int cpu, orig_cpu; - orig_cpu = cpu = raw_smp_processor_id(); + orig_cpu = raw_smp_processor_id(); while (1) { - struct pcpu_freelist_head *head; + for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { + struct pcpu_freelist_head *head; - head = per_cpu_ptr(s->freelist, cpu); - if (raw_spin_trylock(&head->lock)) { - pcpu_freelist_push_node(head, node); - raw_spin_unlock(&head->lock); - return; + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + pcpu_freelist_push_node(head, node); + raw_spin_unlock(&head->lock); + return; + } } - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = 0; /* cannot lock any per cpu lock, try extralist */ - if (cpu == orig_cpu && - pcpu_freelist_try_push_extra(s, node)) + if (pcpu_freelist_try_push_extra(s, node)) return; } } @@ -125,13 +123,12 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; - int orig_cpu, cpu; + int cpu; - orig_cpu = cpu = raw_smp_processor_id(); - while (1) { + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) - goto next_cpu; + continue; raw_spin_lock(&head->lock); node = head->first; if (node) { @@ -140,12 +137,6 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct 
pcpu_freelist *s) return node; } raw_spin_unlock(&head->lock); -next_cpu: - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = 0; - if (cpu == orig_cpu) - break; } /* per cpu lists are all empty, try extralist */ @@ -164,13 +155,12 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; - int orig_cpu, cpu; + int cpu; - orig_cpu = cpu = raw_smp_processor_id(); - while (1) { + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) - goto next_cpu; + continue; if (raw_spin_trylock(&head->lock)) { node = head->first; if (node) { @@ -180,12 +170,6 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) } raw_spin_unlock(&head->lock); } -next_cpu: - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = 0; - if (cpu == orig_cpu) - break; } /* cannot pop from per cpu lists, try extralist */ -- cgit v1.2.3 From 65269888c695cf4643c6fdb989ea28bf1623685d Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 7 Sep 2022 10:40:36 -0600 Subject: bpf: Remove duplicate PTR_TO_BTF_ID RO check Since commit 27ae7997a661 ("bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS") there has existed bpf_verifier_ops:btf_struct_access. When btf_struct_access is _unset_ for a prog type, the verifier runs the default implementation, which is to enforce read only: if (env->ops->btf_struct_access) { [...] } else { if (atype != BPF_READ) { verbose(env, "only read is supported\n"); return -EACCES; } [...] } When btf_struct_access is _set_, the expectation is that btf_struct_access has full control over accesses, including if writes are allowed. Rather than carve out an exception for each prog type that may write to BTF ptrs, delete the redundant check and give full control to btf_struct_access. 
Signed-off-by: Daniel Xu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/962da2bff1238746589e332ff1aecc49403cd7ce.1662568410.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c0f175ac187a..c3efd461f36c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13406,9 +13406,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); env->prog->aux->num_exentries++; - } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) { - verbose(env, "Writes through BTF pointers are not allowed\n"); - return -EINVAL; } continue; default: -- cgit v1.2.3 From 84c6ac417ceacd086efc330afece8922969610b7 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 7 Sep 2022 10:40:39 -0600 Subject: bpf: Export btf_type_by_id() and bpf_log() These symbols will be used in nf_conntrack.ko to support direct writes to `nf_conn`. 
Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/3c98c19dc50d3b18ea5eca135b4fc3a5db036060.1662568410.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 1 + kernel/bpf/verifier.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9d12212fcd61..98be25d13325 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -818,6 +818,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) return NULL; return btf->types[type_id]; } +EXPORT_SYMBOL_GPL(btf_type_by_id); /* * Regular int is not a bit field and it must be either diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c3efd461f36c..9109e07b759a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -370,6 +370,7 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, bpf_verifier_vlog(log, fmt, args); va_end(args); } +EXPORT_SYMBOL_GPL(bpf_log); static const char *ltrim(const char *s) { -- cgit v1.2.3 From a37a32583e282d8d815e22add29bc1e91e19951a Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Sat, 10 Sep 2022 11:01:20 +0000 Subject: bpf: btf: fix truncated last_member_type_id in btf_struct_resolve When trying to finish resolving a struct member, btf_struct_resolve saves the member type id in a u16 temporary variable. This truncates the 32 bit type id value if it exceeds UINT16_MAX. As a result, structs that have members with type ids > UINT16_MAX and which need resolution will fail with a message like this: [67414] STRUCT ff_device size=120 vlen=12 effect_owners type_id=67434 bits_offset=960 Member exceeds struct_size Fix this by changing the type of last_member_type_id to u32. 
Fixes: a0791f0df7d2 ("bpf: fix BTF limits") Reviewed-by: Stanislav Fomichev Signed-off-by: Lorenz Bauer Link: https://lore.kernel.org/r/20220910110120.339242-1-oss@lmb.io Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7e64447659f3..36fd4b509294 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3128,7 +3128,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env, if (v->next_member) { const struct btf_type *last_member_type; const struct btf_member *last_member; - u16 last_member_type_id; + u32 last_member_type_id; last_member = btf_type_member(v->t) + v->next_member - 1; last_member_type_id = last_member->type; -- cgit v1.2.3 From 1bfe26fb082724be453e4d7fd9bb358e3ba669b2 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 8 Sep 2022 16:07:16 -0700 Subject: bpf: Add verifier support for custom callback return range Verifier logic to confirm that a callback function returns 0 or 1 was added in commit 69c087ba6225b ("bpf: Add bpf_for_each_map_elem() helper"). At the time, callback return value was only used to continue or stop iteration. In order to support callbacks with a broader return value range, such as those added in rbtree series[0] and others, add a callback_ret_range to bpf_func_state. Verifier's helpers which set in_callback_fn will also set the new field, which the verifier will later use to check return value bounds. Default to tnum_range(0, 0) instead of using tnum_unknown as a sentinel value as the latter would prevent the valid range (0, U64_MAX) being used. Previous global default tnum_range(0, 1) is explicitly set for extant callback helpers. The change to global default was made after discussion around this patch in rbtree series [1], goal here is to make it more obvious that callback_ret_range should be explicitly set. 
[0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com/ [1]: lore.kernel.org/bpf/20220830172759.4069786-2-davemarchevsky@fb.com/ Signed-off-by: Dave Marchevsky Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220908230716.2751723-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b49a349cc6ae..e197f8fb27e2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -248,6 +248,7 @@ struct bpf_func_state { */ u32 async_entry_cnt; bool in_callback_fn; + struct tnum callback_ret_range; bool in_async_callback_fn; /* The following fields should be last. See copy_func_state() */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9109e07b759a..c259d734f863 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1750,6 +1750,7 @@ static void init_func_state(struct bpf_verifier_env *env, state->callsite = callsite; state->frameno = frameno; state->subprogno = subprogno; + state->callback_ret_range = tnum_range(0, 0); init_reg_state(env, state); mark_verifier_state_scratched(env); } @@ -6790,6 +6791,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return err; callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6811,6 +6813,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6840,6 +6843,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 
1); return 0; } @@ -6867,6 +6871,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6894,7 +6899,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller = state->frame[state->curframe]; if (callee->in_callback_fn) { /* enforce R0 return value range [0, 1]. */ - struct tnum range = tnum_range(0, 1); + struct tnum range = callee->callback_ret_range; if (r0->type != SCALAR_VALUE) { verbose(env, "R0 not a scalar value\n"); -- cgit v1.2.3 From 47e34cb74d376ddfeaef94abb1d6dfb3c905ee51 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 12 Sep 2022 08:45:44 -0700 Subject: bpf: Add verifier check for BPF_PTR_POISON retval and arg BPF_PTR_POISON was added in commit c0a5a21c25f37 ("bpf: Allow storing referenced kptr in map") to denote a bpf_func_proto btf_id which the verifier will replace with a dynamically-determined btf_id at verification time. This patch adds verifier 'poison' functionality to BPF_PTR_POISON in order to prepare for expanded use of the value to poison ret- and arg-btf_id in ongoing work, namely rbtree and linked list patchsets [0, 1]. Specifically, when the verifier checks helper calls, it assumes that BPF_PTR_POISON'ed ret type will be replaced with a valid type before - or in lieu of - the default ret_btf_id logic. Similarly for arg btf_id. If poisoned btf_id reaches default handling block for either, consider this a verifier internal error and fail verification. Otherwise a helper w/ poisoned btf_id but no verifier logic replacing the type will cause a crash as the invalid pointer is dereferenced. Also move BPF_PTR_POISON to existing include/linux/poison.h header and remove unnecessary shift.
[0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com [1]: lore.kernel.org/bpf/20220904204145.3089-1-memxor@gmail.com Signed-off-by: Dave Marchevsky Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220912154544.1398199-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/poison.h | 3 +++ kernel/bpf/helpers.c | 6 +++--- kernel/bpf/verifier.c | 30 +++++++++++++++++++++++------- 3 files changed, 29 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/poison.h b/include/linux/poison.h index d62ef5a6b4e9..2d3249eb0e62 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -81,4 +81,7 @@ /********** net/core/page_pool.c **********/ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) +/********** kernel/bpf/ **********/ +#define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA)) + #endif diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index fc08035f14ed..41aeaf3862ec 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1376,10 +1377,9 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) } /* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg() - * helper is determined dynamically by the verifier. + * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to + * denote type that verifier will determine. 
*/ -#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) - static const struct bpf_func_proto bpf_kptr_xchg_proto = { .func = bpf_kptr_xchg, .gpl_only = false, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c259d734f863..8c6fbcd0afaf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "disasm.h" @@ -5782,13 +5783,22 @@ found: if (meta->func_id == BPF_FUNC_kptr_xchg) { if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) return -EACCES; - } else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - btf_vmlinux, *arg_btf_id, - strict_type_match)) { - verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf, reg->btf_id), - kernel_type_name(btf_vmlinux, *arg_btf_id)); - return -EACCES; + } else { + if (arg_btf_id == BPF_PTR_POISON) { + verbose(env, "verifier internal error:"); + verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n", + regno); + return -EACCES; + } + + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + btf_vmlinux, *arg_btf_id, + strict_type_match)) { + verbose(env, "R%d is of type %s but %s is expected\n", + regno, kernel_type_name(reg->btf, reg->btf_id), + kernel_type_name(btf_vmlinux, *arg_btf_id)); + return -EACCES; + } } } @@ -7457,6 +7467,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn ret_btf = meta.kptr_off_desc->kptr.btf; ret_btf_id = meta.kptr_off_desc->kptr.btf_id; } else { + if (fn->ret_btf_id == BPF_PTR_POISON) { + verbose(env, "verifier internal error:"); + verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n", + func_id_name(func_id)); + return -EINVAL; + } ret_btf = btf_vmlinux; ret_btf_id = *fn->ret_btf_id; } -- cgit v1.2.3 From 83c10cc362d91c0d8d25e60779ee52fdbbf3894d Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 12 Sep 2022 14:38:55 +0100 Subject: bpf: Ensure correct 
locking around vulnerable function find_vpid() The documentation for find_vpid() clearly states: "Must be called with the tasklist_lock or rcu_read_lock() held." Presently we do neither for find_vpid() instance in bpf_task_fd_query(). Add proper rcu_read_lock/unlock() to fix the issue. Fixes: 41bdc4b40ed6f ("bpf: introduce bpf subcommand BPF_TASK_FD_QUERY") Signed-off-by: Lee Jones Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220912133855.1218900-1-lee@kernel.org --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 27760627370d..1bd18af8af83 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4395,7 +4395,9 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (attr->task_fd_query.flags != 0) return -EINVAL; + rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + rcu_read_unlock(); if (!task) return -ENOENT; -- cgit v1.2.3 From a02c118ee9e898612cbae42121b9e8663455b515 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Tue, 13 Sep 2022 16:40:33 +0800 Subject: bpf: use kvmemdup_bpfptr helper Use kvmemdup_bpfptr helper instead of open-coding to simplify the code. 
Signed-off-by: Wang Yufen Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/1663058433-14089-1-git-send-email-wangyufen@huawei.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/syscall.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 69be1c612daa..dab156f09f8d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1417,19 +1417,14 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) } value_size = bpf_map_value_size(map); - - err = -ENOMEM; - value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); - if (!value) + value = kvmemdup_bpfptr(uvalue, value_size); + if (IS_ERR(value)) { + err = PTR_ERR(value); goto free_key; - - err = -EFAULT; - if (copy_from_bpfptr(value, uvalue, value_size) != 0) - goto free_value; + } err = bpf_map_update_value(map, f, key, value, attr->flags); -free_value: kvfree(value); free_key: kvfree(key); -- cgit v1.2.3 From 571f9738bfb3d4b42253c1d0ad26da9fede85f36 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Fri, 16 Sep 2022 13:28:00 -0700 Subject: bpf/btf: Use btf_type_str() whenever possible We have btf_type_str(). Use it whenever possible in btf.c, instead of "btf_kind_str[BTF_INFO_KIND(t->info)]". Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/20220916202800.31421-1-yepeilin.cs@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/btf.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 98be25d13325..b3940c605aac 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1397,7 +1397,6 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, const char *fmt, ...) 
{ struct bpf_verifier_log *log = &env->log; - u8 kind = BTF_INFO_KIND(t->info); struct btf *btf = env->btf; va_list args; @@ -1413,7 +1412,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, - btf_kind_str[kind], + btf_type_str(t), __btf_name_by_offset(btf, t->name_off), log_details ? " " : ""); @@ -5427,7 +5426,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", - btf_kind_str[BTF_INFO_KIND(t->info)]); + btf_type_str(t)); return false; } break; @@ -5454,7 +5453,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, "func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", tname, arg, __btf_name_by_offset(btf, t->name_off), - btf_kind_str[BTF_INFO_KIND(t->info)]); + btf_type_str(t)); return false; } @@ -5538,11 +5537,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (!btf_type_is_struct(t)) { bpf_log(log, "func '%s' arg%d type %s is not a struct\n", - tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]); + tname, arg, btf_type_str(t)); return false; } bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", - tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)], + tname, arg, info->btf_id, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); return true; } @@ -5950,7 +5949,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, if (ret < 0 || __btf_type_is_struct(t)) { bpf_log(log, "The function %s return type %s is unsupported.\n", - tname, btf_kind_str[BTF_INFO_KIND(t->info)]); + tname, btf_type_str(t)); return -EINVAL; } m->ret_size = ret; @@ -5968,7 +5967,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, if (ret < 0 || ret > 16) { bpf_log(log, "The function %s arg%d type %s is unsupported.\n", - tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]); + tname, i, btf_type_str(t)); return 
-EINVAL; } if (ret == 0) { @@ -6727,7 +6726,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, continue; } bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", - i, btf_kind_str[BTF_INFO_KIND(t->info)], tname); + i, btf_type_str(t), tname); return -EINVAL; } return 0; -- cgit v1.2.3 From c31b38cb948ee7d3317139f005fa1f90de4a06b7 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 19 Sep 2022 22:48:11 +0800 Subject: bpf: Check whether or not node is NULL before free it in free_bulk llnode could be NULL if there are new allocations after the checking of c->free_cnt > c->high_watermark in bpf_mem_refill() and before the calling of __llist_del_first() in free_bulk (e.g. a PREEMPT_RT kernel or allocation in NMI context). And it will incur oops as shown below: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] PREEMPT_RT SMP CPU: 39 PID: 373 Comm: irq_work/39 Tainted: G W 6.0.0-rc6-rt9+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) RIP: 0010:bpf_mem_refill+0x66/0x130 ...... Call Trace: irq_work_single+0x24/0x60 irq_work_run_list+0x24/0x30 run_irq_workd+0x18/0x20 smpboot_thread_fn+0x13f/0x2c0 kthread+0x121/0x140 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 Simply fix it by checking whether or not llnode is NULL in free_bulk().
Fixes: 8d5a8011b35d ("bpf: Batch call_rcu callbacks instead of SLAB_TYPESAFE_BY_RCU.") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20220919144811.3570825-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 20621f5407d8..5f83be1d2018 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -277,7 +277,8 @@ static void free_bulk(struct bpf_mem_cache *c) local_dec(&c->active); if (IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_restore(flags); - enque_to_free(c, llnode); + if (llnode) + enque_to_free(c, llnode); } while (cnt > (c->high_watermark + c->low_watermark) / 2); /* and drain free_llist_extra */ -- cgit v1.2.3 From 3a74904ceff3ecdb9d6cc0844ed67df417968eb6 Mon Sep 17 00:00:00 2001 From: William Dean Date: Sat, 17 Sep 2022 16:42:48 +0800 Subject: bpf: simplify code in btf_parse_hdr It could directly return 'btf_check_sec_info' to simplify code. 
Signed-off-by: William Dean Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220917084248.3649-1-williamsukatube@163.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/btf.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b3940c605aac..6ccd4f4d731e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4854,7 +4854,6 @@ static int btf_parse_hdr(struct btf_verifier_env *env) u32 hdr_len, hdr_copy, btf_data_size; const struct btf_header *hdr; struct btf *btf; - int err; btf = env->btf; btf_data_size = btf->data_size; @@ -4911,11 +4910,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return -EINVAL; } - err = btf_check_sec_info(env, btf_data_size); - if (err) - return err; - - return 0; + return btf_check_sec_info(env, btf_data_size); } static int btf_check_type_tags(struct btf_verifier_env *env, -- cgit v1.2.3 From 0e426a3ae030a9e891899370229e117158b35de6 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Wed, 21 Sep 2022 10:46:02 +0000 Subject: bpf, cgroup: Reject prog_attach_flags array when effective query Attach flags is only valid for attached progs of this layer cgroup, but not for effective progs. For querying with EFFECTIVE flags, exporting attach flags does not make sense. So when effective query, we reject prog_attach_flags array and don't need to populate it. Also we limit attach_flags to output 0 during effective query. 
Fixes: b79c9fc9551b ("bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP") Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20220921104604.2340580-2-pulehui@huaweicloud.com Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 7 +++++-- kernel/bpf/cgroup.c | 28 ++++++++++++++++++---------- tools/include/uapi/linux/bpf.h | 7 +++++-- 3 files changed, 28 insertions(+), 14 deletions(-) (limited to 'kernel/bpf') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 59a217ca2dfd..4eff7fc7ae58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1233,7 +1233,7 @@ enum { /* Query effective (directly attached + inherited from ancestor cgroups) * programs that will be executed for events within a cgroup. - * attach_flags with this flag are returned only for directly attached programs. + * attach_flags with this flag are always returned 0. */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) @@ -1432,7 +1432,10 @@ union bpf_attr { __u32 attach_flags; __aligned_u64 prog_ids; __u32 prog_cnt; - __aligned_u64 prog_attach_flags; /* output: per-program attach_flags */ + /* output: per-program attach_flags. + * not allowed to be set during effective query. 
+ */ + __aligned_u64 prog_attach_flags; } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4a400cd63731..22888aaa68b6 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1020,6 +1020,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); + bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE; __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; enum cgroup_bpf_attach_type from_atype, to_atype; @@ -1029,8 +1030,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int total_cnt = 0; u32 flags; + if (effective_query && prog_attach_flags) + return -EINVAL; + if (type == BPF_LSM_CGROUP) { - if (attr->query.prog_cnt && prog_ids && !prog_attach_flags) + if (!effective_query && attr->query.prog_cnt && + prog_ids && !prog_attach_flags) return -EINVAL; from_atype = CGROUP_LSM_START; @@ -1045,7 +1050,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } for (atype = from_atype; atype <= to_atype; atype++) { - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + if (effective_query) { effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); total_cnt += bpf_prog_array_length(effective); @@ -1054,6 +1059,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } } + /* always output uattr->query.attach_flags as 0 during effective query */ + flags = effective_query ? 
0 : flags; if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) @@ -1068,7 +1075,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } for (atype = from_atype; atype <= to_atype && total_cnt; atype++) { - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + if (effective_query) { effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); cnt = min_t(int, bpf_prog_array_length(effective), total_cnt); @@ -1090,15 +1097,16 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, if (++i == cnt) break; } - } - if (prog_attach_flags) { - flags = cgrp->bpf.flags[atype]; + if (prog_attach_flags) { + flags = cgrp->bpf.flags[atype]; - for (i = 0; i < cnt; i++) - if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags))) - return -EFAULT; - prog_attach_flags += cnt; + for (i = 0; i < cnt; i++) + if (copy_to_user(prog_attach_flags + i, + &flags, sizeof(flags))) + return -EFAULT; + prog_attach_flags += cnt; + } } prog_ids += cnt; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 59a217ca2dfd..4eff7fc7ae58 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1233,7 +1233,7 @@ enum { /* Query effective (directly attached + inherited from ancestor cgroups) * programs that will be executed for events within a cgroup. - * attach_flags with this flag are returned only for directly attached programs. + * attach_flags with this flag are always returned 0. */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) @@ -1432,7 +1432,10 @@ union bpf_attr { __u32 attach_flags; __aligned_u64 prog_ids; __u32 prog_cnt; - __aligned_u64 prog_attach_flags; /* output: per-program attach_flags */ + /* output: per-program attach_flags. + * not allowed to be set during effective query. 
+ */ + __aligned_u64 prog_attach_flags; } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ -- cgit v1.2.3 From 8addbfc7b308d591f8a5f2f6bb24d08d9d79dfbb Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 21 Sep 2022 16:35:50 +0200 Subject: bpf: Gate dynptr API behind CAP_BPF This has been enabled for unprivileged programs for only one kernel release, hence the expected annoyances due to this move are low. Users using ringbuf can stick to non-dynptr APIs. The actual use cases dynptr is meant to serve may not make sense in unprivileged BPF programs. Hence, gate these helpers behind CAP_BPF and limit use to privileged BPF programs. Fixes: 263ae152e962 ("bpf: Add bpf_dynptr_from_mem for local dynptrs") Fixes: bc34dee65a65 ("bpf: Dynptr support for ring buffers") Fixes: 13bbbfbea759 ("bpf: Add bpf_dynptr_read and bpf_dynptr_write") Fixes: 34d4ef5775f7 ("bpf: Add dynptr data slices") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220921143550.30247-1-memxor@gmail.com Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1f961f9982d2..3814b0fd3a2c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1627,26 +1627,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; - case BPF_FUNC_ringbuf_reserve_dynptr: - return &bpf_ringbuf_reserve_dynptr_proto; - case BPF_FUNC_ringbuf_submit_dynptr: - return &bpf_ringbuf_submit_dynptr_proto; - case BPF_FUNC_ringbuf_discard_dynptr: - return &bpf_ringbuf_discard_dynptr_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; - case 
BPF_FUNC_dynptr_from_mem: - return &bpf_dynptr_from_mem_proto; - case BPF_FUNC_dynptr_read: - return &bpf_dynptr_read_proto; - case BPF_FUNC_dynptr_write: - return &bpf_dynptr_write_proto; - case BPF_FUNC_dynptr_data: - return &bpf_dynptr_data_proto; default: break; } @@ -1675,6 +1661,20 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_timer_cancel_proto; case BPF_FUNC_kptr_xchg: return &bpf_kptr_xchg_proto; + case BPF_FUNC_ringbuf_reserve_dynptr: + return &bpf_ringbuf_reserve_dynptr_proto; + case BPF_FUNC_ringbuf_submit_dynptr: + return &bpf_ringbuf_submit_dynptr_proto; + case BPF_FUNC_ringbuf_discard_dynptr: + return &bpf_ringbuf_discard_dynptr_proto; + case BPF_FUNC_dynptr_from_mem: + return &bpf_dynptr_from_mem_proto; + case BPF_FUNC_dynptr_read: + return &bpf_dynptr_read_proto; + case BPF_FUNC_dynptr_write: + return &bpf_dynptr_write_proto; + case BPF_FUNC_dynptr_data: + return &bpf_dynptr_data_proto; default: break; } -- cgit v1.2.3 From 583c1f420173f7d84413a1a1fbf5109d798b4faa Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:00:57 -0500 Subject: bpf: Define new BPF_MAP_TYPE_USER_RINGBUF map type We want to support a ringbuf map type where samples are published from user-space, to be consumed by BPF programs. BPF currently supports a kernel -> user-space circular ring buffer via the BPF_MAP_TYPE_RINGBUF map type. We'll need to define a new map type for user-space -> kernel, as none of the helpers exported for BPF_MAP_TYPE_RINGBUF will apply to a user-space producer ring buffer, and we'll want to add one or more helper functions that would not apply for a kernel-producer ring buffer. This patch therefore adds a new BPF_MAP_TYPE_USER_RINGBUF map type definition. The map type is useless in its current form, as there is no way to access or use it for anything until we add one or more BPF helpers.
A follow-on patch will therefore add a new helper function that allows BPF programs to run callbacks on samples that are published to the ring buffer. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-2-void@manifault.com --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/ringbuf.c | 62 ++++++++++++++++++++++--- kernel/bpf/verifier.c | 3 ++ tools/bpf/bpftool/Documentation/bpftool-map.rst | 2 +- tools/bpf/bpftool/map.c | 2 +- tools/include/uapi/linux/bpf.h | 1 + tools/lib/bpf/libbpf.c | 1 + 8 files changed, 65 insertions(+), 8 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b9112b80171..2c6a4f2562a7 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -126,6 +126,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3df78c56c1bf..e18c85324db6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index b483aea35f41..754e915748fb 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -38,10 +38,27 @@ struct bpf_ringbuf { struct page **pages; int nr_pages; spinlock_t spinlock ____cacheline_aligned_in_smp; - /* Consumer and producer counters are put into separate pages to allow - * mapping consumer page as r/w, but restrict 
producer page to r/o. - * This protects producer position from being modified by user-space - * application and ruining in-kernel position tracking. + /* Consumer and producer counters are put into separate pages to + * allow each position to be mapped with different permissions. + * This prevents a user-space application from modifying the + * position and ruining in-kernel tracking. The permissions of the + * pages depend on who is producing samples: user-space or the + * kernel. + * + * Kernel-producer + * --------------- + * The producer position and data pages are mapped as r/o in + * userspace. For this approach, bits in the header of samples are + * used to signal to user-space, and to other producers, whether a + * sample is currently being written. + * + * User-space producer + * ------------------- + * Only the page containing the consumer position is mapped r/o in + * user-space. User-space producers also use bits of the header to + * communicate to the kernel, but the kernel must carefully check and + * validate each sample to ensure that they're correctly formatted, and + * fully contained within the ring buffer. 
*/ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); @@ -224,7 +241,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; @@ -242,6 +259,26 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + RINGBUF_PGOFF); } +static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + + if (vma->vm_flags & VM_WRITE) { + if (vma->vm_pgoff == 0) + /* Disallow writable mappings to the consumer pointer, + * and allow writable mappings to both the producer + * position, and the ring buffer data itself. + */ + return -EPERM; + } else { + vma->vm_flags &= ~VM_MAYWRITE; + } + /* remap_vmalloc_range() checks size and offset constraints */ + return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); +} + static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { unsigned long cons_pos, prod_pos; @@ -269,7 +306,7 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, - .map_mmap = ringbuf_map_mmap, + .map_mmap = ringbuf_map_mmap_kern, .map_poll = ringbuf_map_poll, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, @@ -278,6 +315,19 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_btf_id = &ringbuf_map_btf_ids[0], }; +BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map) +const struct bpf_map_ops user_ringbuf_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap_user, + 
.map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, + .map_btf_id = &user_ringbuf_map_btf_ids[0], +}; + /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, * calculate offset from record metadata to ring buffer in pages, rounded * down. This page offset is stored as part of record metadata and allows to diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8c6fbcd0afaf..83710b60e708 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6240,6 +6240,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; + case BPF_MAP_TYPE_USER_RINGBUF: + goto error; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -12635,6 +12637,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 7c188a598444..7f3b67a8b48f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -55,7 +55,7 @@ MAP COMMANDS | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** } +| | **task_storage** | **bloom_filter** | **user_ringbuf** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 38b6bc9c26c3..9a6ca9f31133 100644 
--- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1459,7 +1459,7 @@ static int do_help(int argc, char **argv) " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" - " task_storage | bloom_filter }\n" + " task_storage | bloom_filter | user_ringbuf }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-n|--nomount} }\n" "", diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3df78c56c1bf..e18c85324db6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2ca30ccc774c..d480da05b6de 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -163,6 +163,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", + [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", }; static const char * const prog_type_name[] = { -- cgit v1.2.3 From 20571567384428dfc9fe5cf9f2e942e1df13c2dd Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:00:58 -0500 Subject: bpf: Add bpf_user_ringbuf_drain() helper In a prior change, we added a new BPF_MAP_TYPE_USER_RINGBUF map type which will allow user-space applications to publish messages to a ring buffer that is consumed by a BPF program in kernel-space. In order for this map-type to be useful, it will require a BPF helper function that BPF programs can invoke to drain samples from the ring buffer, and invoke callbacks on those samples. 
This change adds that capability via a new BPF helper function: bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) BPF programs may invoke this function to run callback_fn() on a series of samples in the ring buffer. callback_fn() has the following signature: long callback_fn(struct bpf_dynptr *dynptr, void *context); Samples are provided to the callback in the form of struct bpf_dynptr *'s, which the program can read using BPF helper functions for querying struct bpf_dynptr's. In order to support bpf_user_ringbuf_drain(), a new PTR_TO_DYNPTR register type is added to the verifier to reflect a dynptr that was allocated by a helper function and passed to a BPF program. Unlike PTR_TO_STACK dynptrs which are allocated on the stack by a BPF program, PTR_TO_DYNPTR dynptrs need not use reference tracking, as the BPF helper is trusted to properly free the dynptr before returning. The verifier currently only supports PTR_TO_DYNPTR registers that are also DYNPTR_TYPE_LOCAL. Note that while the corresponding user-space libbpf logic will be added in a subsequent patch, this patch does contain an implementation of the .map_poll() callback for BPF_MAP_TYPE_USER_RINGBUF maps. This .map_poll() callback guarantees that an epoll-waiting user-space producer will receive at least one event notification whenever at least one sample is drained in an invocation of bpf_user_ringbuf_drain(), provided that the function is not invoked with the BPF_RB_NO_WAKEUP flag. If the BPF_RB_FORCE_WAKEUP flag is provided, a wakeup notification is sent even if no sample was drained.
Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-3-void@manifault.com --- include/linux/bpf.h | 11 ++- include/uapi/linux/bpf.h | 38 +++++++++ kernel/bpf/helpers.c | 2 + kernel/bpf/ringbuf.c | 181 +++++++++++++++++++++++++++++++++++++++-- kernel/bpf/verifier.c | 61 +++++++++++++- tools/include/uapi/linux/bpf.h | 38 +++++++++ 6 files changed, 320 insertions(+), 11 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e0dbe0c0a17e..33e543b86e1a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -451,7 +451,7 @@ enum bpf_type_flag { /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), - /* DYNPTR points to a ringbuf record. */ + /* DYNPTR points to a kernel-produced ringbuf record. */ DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), /* Size is known at compile time. */ @@ -656,6 +656,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_DYNPTR, /* reg points to a dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -1394,6 +1395,11 @@ struct bpf_array { #define BPF_MAP_CAN_READ BIT(0) #define BPF_MAP_CAN_WRITE BIT(1) +/* Maximum number of user-producer ring buffer samples that can be drained in + * a call to bpf_user_ringbuf_drain(). 
+ */ +#define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024) + static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) { u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); @@ -2495,6 +2501,7 @@ extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; +extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -2639,7 +2646,7 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, - /* Underlying data is a ringbuf record */ + /* Underlying data is a kernel-produced ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e18c85324db6..ead35f39f185 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5388,6 +5388,43 @@ union bpf_attr { * Return * Current *ktime*. * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. 
If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5599,6 +5636,7 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ FN(ktime_get_tai_ns), \ + FN(user_ringbuf_drain), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 41aeaf3862ec..cb5564c77482 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1659,6 +1659,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; + case BPF_FUNC_user_ringbuf_drain: + return &bpf_user_ringbuf_drain_proto; default: break; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 754e915748fb..9e832acf4692 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -38,6 +38,22 @@ struct bpf_ringbuf { struct page **pages; int nr_pages; spinlock_t spinlock ____cacheline_aligned_in_smp; + /* For user-space producer ring buffers, an atomic_t busy bit is used + * to synchronize access to the ring buffers in the kernel, rather than + * the spinlock that is used for kernel-producer ring buffers. This is + * done because the ring buffer must hold a lock across a BPF program's + * callback: + * + * __bpf_user_ringbuf_peek() // lock acquired + * -> program callback_fn() + * -> __bpf_user_ringbuf_sample_release() // lock released + * + * It is unsafe and incorrect to hold an IRQ spinlock across what could + * be a long execution window, so we instead simply disallow concurrent + * access to the ring buffer by kernel consumers, and return -EBUSY from + * __bpf_user_ringbuf_peek() if the busy bit is held by another task. + */ + atomic_t busy ____cacheline_aligned_in_smp; /* Consumer and producer counters are put into separate pages to * allow each position to be mapped with different permissions. 
* This prevents a user-space application from modifying the @@ -153,6 +169,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) return NULL; spin_lock_init(&rb->spinlock); + atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -288,8 +305,13 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) return prod_pos - cons_pos; } -static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, - struct poll_table_struct *pts) +static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) +{ + return rb->mask + 1; +} + +static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) { struct bpf_ringbuf_map *rb_map; @@ -301,13 +323,26 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, return 0; } +static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb)) + return EPOLLOUT | EPOLLWRNORM; + return 0; +} + BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_kern, - .map_poll = ringbuf_map_poll, + .map_poll = ringbuf_map_poll_kern, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, @@ -321,6 +356,7 @@ const struct bpf_map_ops user_ringbuf_map_ops = { .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_user, + .map_poll = ringbuf_map_poll_user, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = 
ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, @@ -362,7 +398,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) return NULL; len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); - if (len > rb->mask + 1) + if (len > ringbuf_total_data_sz(rb)) return NULL; cons_pos = smp_load_acquire(&rb->consumer_pos); @@ -509,7 +545,7 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) case BPF_RB_AVAIL_DATA: return ringbuf_avail_data_sz(rb); case BPF_RB_RING_SIZE: - return rb->mask + 1; + return ringbuf_total_data_sz(rb); case BPF_RB_CONS_POS: return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: @@ -603,3 +639,138 @@ const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; + +static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size) +{ + int err; + u32 hdr_len, sample_len, total_len, flags, *hdr; + u64 cons_pos, prod_pos; + + /* Synchronizes with smp_store_release() in user-space producer. */ + prod_pos = smp_load_acquire(&rb->producer_pos); + if (prod_pos % 8) + return -EINVAL; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */ + cons_pos = smp_load_acquire(&rb->consumer_pos); + if (cons_pos >= prod_pos) + return -ENODATA; + + hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask)); + /* Synchronizes with smp_store_release() in user-space producer. */ + hdr_len = smp_load_acquire(hdr); + flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT); + sample_len = hdr_len & ~flags; + total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8); + + /* The sample must fit within the region advertised by the producer position. */ + if (total_len > prod_pos - cons_pos) + return -EINVAL; + + /* The sample must fit within the data region of the ring buffer. 
*/ + if (total_len > ringbuf_total_data_sz(rb)) + return -E2BIG; + + /* The sample must fit into a struct bpf_dynptr. */ + err = bpf_dynptr_check_size(sample_len); + if (err) + return -E2BIG; + + if (flags & BPF_RINGBUF_DISCARD_BIT) { + /* If the discard bit is set, the sample should be skipped. + * + * Update the consumer pos, and return -EAGAIN so the caller + * knows to skip this sample and try to read the next one. + */ + smp_store_release(&rb->consumer_pos, cons_pos + total_len); + return -EAGAIN; + } + + if (flags & BPF_RINGBUF_BUSY_BIT) + return -ENODATA; + + *sample = (void *)((uintptr_t)rb->data + + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask)); + *size = sample_len; + return 0; +} + +static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags) +{ + u64 consumer_pos; + u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + + /* Using smp_load_acquire() is unnecessary here, as the busy-bit + * prevents another task from writing to consumer_pos after it was read + * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek(). + */ + consumer_pos = rb->consumer_pos; + /* Synchronizes with smp_load_acquire() in user-space producer. */ + smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size); +} + +BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map, + void *, callback_fn, void *, callback_ctx, u64, flags) +{ + struct bpf_ringbuf *rb; + long samples, discarded_samples = 0, ret = 0; + bpf_callback_t callback = (bpf_callback_t)callback_fn; + u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP; + int busy = 0; + + if (unlikely(flags & ~wakeup_flags)) + return -EINVAL; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + /* If another consumer is already consuming a sample, wait for them to finish. 
*/ + if (!atomic_try_cmpxchg(&rb->busy, &busy, 1)) + return -EBUSY; + + for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) { + int err; + u32 size; + void *sample; + struct bpf_dynptr_kern dynptr; + + err = __bpf_user_ringbuf_peek(rb, &sample, &size); + if (err) { + if (err == -ENODATA) { + break; + } else if (err == -EAGAIN) { + discarded_samples++; + continue; + } else { + ret = err; + goto schedule_work_return; + } + } + + bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size); + ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0); + __bpf_user_ringbuf_sample_release(rb, size, flags); + } + ret = samples - discarded_samples; + +schedule_work_return: + /* Prevent the clearing of the busy-bit from being reordered before the + * storing of any rb consumer or producer positions. + */ + smp_mb__before_atomic(); + atomic_set(&rb->busy, 0); + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0) + irq_work_queue(&rb->work); + return ret; +} + +const struct bpf_func_proto bpf_user_ringbuf_drain_proto = { + .func = bpf_user_ringbuf_drain, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 83710b60e708..c76fa45a5906 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -563,6 +563,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", + [PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -5688,6 +5689,12 @@ static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; 
static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types dynptr_types = { + .types = { + PTR_TO_STACK, + PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL, + } +}; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -5714,7 +5721,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, [ARG_PTR_TO_KPTR] = &kptr_types, - [ARG_PTR_TO_DYNPTR] = &stack_ptr_types, + [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -6066,6 +6073,13 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: + /* We only need to check for initialized / uninitialized helper + * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the + * assumption is that if it is, that a helper function + * initialized the dynptr on behalf of the BPF program. 
+ */ + if (base_type(reg->type) == PTR_TO_DYNPTR) + break; if (arg_type & MEM_UNINIT) { if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); @@ -6241,7 +6255,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_USER_RINGBUF: - goto error; + if (func_id != BPF_FUNC_user_ringbuf_drain) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -6361,6 +6377,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; + case BPF_FUNC_user_ringbuf_drain: + if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF) + goto error; + break; case BPF_FUNC_get_stackid: if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; @@ -6887,6 +6907,29 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void + * callback_ctx, u64 flags); + * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx); + */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); + callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -7346,12 +7389,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn case 
BPF_FUNC_dynptr_data: for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { if (arg_type_is_dynptr(fn->arg_type[i])) { + struct bpf_reg_state *reg = &regs[BPF_REG_1 + i]; + if (meta.ref_obj_id) { verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); return -EFAULT; } - /* Find the id of the dynptr we're tracking the reference of */ - meta.ref_obj_id = stack_slot_get_id(env, &regs[BPF_REG_1 + i]); + + if (base_type(reg->type) != PTR_TO_DYNPTR) + /* Find the id of the dynptr we're + * tracking the reference of + */ + meta.ref_obj_id = stack_slot_get_id(env, reg); break; } } @@ -7360,6 +7409,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } break; + case BPF_FUNC_user_ringbuf_drain: + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_user_ringbuf_callback_state); + break; } if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e18c85324db6..ead35f39f185 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5388,6 +5388,43 @@ union bpf_attr { * Return * Current *ktime*. * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer.
If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5599,6 +5636,7 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ FN(ktime_get_tai_ns), \ + FN(user_ringbuf_drain), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From d15bf1501c7533826a616478002c601fcc7671f3 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 20 Sep 2022 09:59:39 +0200 Subject: bpf: Allow kfuncs to be used in LSM programs In preparation for the addition of new kfuncs, allow kfuncs defined in the tracing subsystem to be used in LSM programs by mapping the LSM program type to the TRACING hook. 
Signed-off-by: KP Singh Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-2-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6ccd4f4d731e..dbcf020c4124 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7376,6 +7376,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_STRUCT_OPS: return BTF_KFUNC_HOOK_STRUCT_OPS; case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: return BTF_KFUNC_HOOK_TRACING; case BPF_PROG_TYPE_SYSCALL: return BTF_KFUNC_HOOK_SYSCALL; -- cgit v1.2.3 From 00f146413ccb6c84308e559281449755c83f54c5 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:40 +0200 Subject: btf: Export bpf_dynptr definition eBPF dynamic pointers is a new feature recently added to upstream. It binds together a pointer to a memory area and its size. The internal kernel structure bpf_dynptr_kern is not accessible by eBPF programs in user space. They instead see bpf_dynptr, which is then translated to the internal kernel structure by the eBPF verifier. The problem is that it is not possible to include at the same time the uapi include linux/bpf.h and the vmlinux BTF vmlinux.h, as they both contain the definition of some structures/enums. The compiler complains saying that the structures/enums are redefined. As bpf_dynptr is defined in the uapi include linux/bpf.h, this makes it impossible to include vmlinux.h. However, in some cases, e.g. when using kfuncs, vmlinux.h has to be included. The only option until now was to include vmlinux.h and add the definition of bpf_dynptr directly in the eBPF program source code from linux/bpf.h. Solve the problem by using the same approach as for bpf_timer (which also follows the same scheme with the _kern suffix for the internal kernel structure). 
Add the following line in one of the dynamic pointer helpers, bpf_dynptr_from_mem(): BTF_TYPE_EMIT(struct bpf_dynptr); Cc: stable@vger.kernel.org Cc: Joanne Koong Fixes: 97e03f521050c ("bpf: Add verifier support for dynptrs") Signed-off-by: Roberto Sassu Acked-by: Yonghong Song Tested-by: KP Singh Link: https://lore.kernel.org/r/20220920075951.929132-3-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index cb5564c77482..6d69e30f42d8 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1446,6 +1446,8 @@ BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_ { int err; + BTF_TYPE_EMIT(struct bpf_dynptr); + err = bpf_dynptr_check_size(size); if (err) goto error; -- cgit v1.2.3 From e9e315b4a5de32d0482b92f482517095d5d844e4 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:41 +0200 Subject: bpf: Move dynptr type check to is_dynptr_type_expected() Move dynptr type check to is_dynptr_type_expected() from is_dynptr_reg_valid_init(), so that callers can better determine the cause of a negative result (dynamic pointer not valid/initialized, dynamic pointer of the wrong type). It will be useful for example for BTF, to restrict which dynamic pointer types can be passed to kfuncs, as initially only the local type will be supported. Also, splitting makes the code more readable, since checking the dynamic pointer type is not necessarily related to validity and initialization. Split the validity/initialization and dynamic pointer type check also in the verifier, and adjust the expected error message in the test (a test for an unexpected dynptr type passed to a helper cannot be added due to missing suitable helpers, but this case has been tested manually). 
Cc: Joanne Koong Cc: Kumar Kartikeya Dwivedi Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-4-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 35 +++++++++++++++++++------ tools/testing/selftests/bpf/prog_tests/dynptr.c | 2 +- 2 files changed, 28 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c76fa45a5906..c08dde19eb67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -782,8 +782,8 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ return true; } -static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - enum bpf_arg_type arg_type) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); int spi = get_spi(reg->off); @@ -799,11 +799,24 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_re return false; } + return true; +} + +static bool is_dynptr_type_expected(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) +{ + struct bpf_func_state *state = func(env, reg); + enum bpf_dynptr_type dynptr_type; + int spi = get_spi(reg->off); + /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ if (arg_type == ARG_PTR_TO_DYNPTR) return true; - return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type); + dynptr_type = arg_to_dynptr_type(arg_type); + + return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; } /* The reg state of a pointer or a bounded scalar was saved when @@ -6095,21 +6108,27 @@ skip_type_check: } meta->uninit_dynptr_regno = regno; - } else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) { + } else if (!is_dynptr_reg_valid_init(env, reg)) { + verbose(env, + "Expected an initialized dynptr as arg #%d\n", + arg + 
1); + return -EINVAL; + } else if (!is_dynptr_type_expected(env, reg, arg_type)) { const char *err_extra = ""; switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { case DYNPTR_TYPE_LOCAL: - err_extra = "local "; + err_extra = "local"; break; case DYNPTR_TYPE_RINGBUF: - err_extra = "ringbuf "; + err_extra = "ringbuf"; break; default: + err_extra = ""; break; } - - verbose(env, "Expected an initialized %sdynptr as arg #%d\n", + verbose(env, + "Expected a dynptr of type %s as arg #%d\n", err_extra, arg + 1); return -EINVAL; } diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index bcf80b9f7c27..8fc4e6c02bfd 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -30,7 +30,7 @@ static struct { {"invalid_helper2", "Expected an initialized dynptr as arg #3"}, {"invalid_write1", "Expected an initialized dynptr as arg #1"}, {"invalid_write2", "Expected an initialized dynptr as arg #3"}, - {"invalid_write3", "Expected an initialized ringbuf dynptr as arg #1"}, + {"invalid_write3", "Expected an initialized dynptr as arg #1"}, {"invalid_write4", "arg 1 is an unacquired reference"}, {"invalid_read1", "invalid read from stack"}, {"invalid_read2", "cannot pass in dynptr at an offset"}, -- cgit v1.2.3 From b8d31762a0ae6861e1115302ee338560d853e317 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:42 +0200 Subject: btf: Allow dynamic pointer parameters in kfuncs Allow dynamic pointers (struct bpf_dynptr_kern *) to be specified as parameters in kfuncs. Also, ensure that dynamic pointers passed as argument are valid and initialized, are a pointer to the stack, and of the type local. More dynamic pointer types can be supported in the future. To properly detect whether a parameter is of the desired type, introduce the stringify_struct() macro to compare the returned structure name with the desired name. 
In addition, protect against structure renames, by halting the build with BUILD_BUG_ON(), so that developers have to revisit the code. To check if a dynamic pointer passed to the kfunc is valid and initialized, and if its type is local, export the existing functions is_dynptr_reg_valid_init() and is_dynptr_type_expected(). Cc: Joanne Koong Cc: Kumar Kartikeya Dwivedi Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-5-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ include/linux/btf.h | 9 +++++++++ kernel/bpf/btf.c | 33 +++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 10 +++++----- 4 files changed, 52 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e197f8fb27e2..9e1e6965f407 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -593,6 +593,11 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); +bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, + struct bpf_reg_state *reg); +bool is_dynptr_type_expected(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + enum bpf_arg_type arg_type); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/include/linux/btf.h b/include/linux/btf.h index 1fcc833a8690..f9aababc5d78 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -52,6 +52,15 @@ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ +/* + * Return the name of the passed struct, if exists, or halt the build if for + * example the structure gets renamed. 
In this way, developers have to revisit + * the code using that structure name, and update it accordingly. + */ +#define stringify_struct(x) \ + ({ BUILD_BUG_ON(sizeof(struct x) < 0); \ + __stringify(x); }) + struct btf; struct btf_member; struct btf_type; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dbcf020c4124..13faede0f2b4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6449,15 +6449,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (is_kfunc) { bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], &regs[regno + 1]); + bool arg_dynptr = btf_type_is_struct(ref_t) && + !strcmp(ref_tname, + stringify_struct(bpf_dynptr_kern)); /* Permit pointer to mem, but only when argument * type is pointer to scalar, or struct composed * (recursively) of scalars. * When arg_mem_size is true, the pointer can be * void *. + * Also permit initialized local dynamic pointers. */ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(log, btf, ref_t, 0) && + !arg_dynptr && (arg_mem_size ?
!btf_type_is_void(ref_t) : 1)) { bpf_log(log, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", @@ -6465,6 +6470,34 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + if (arg_dynptr) { + if (reg->type != PTR_TO_STACK) { + bpf_log(log, "arg#%d pointer type %s %s not to stack\n", + i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + if (!is_dynptr_reg_valid_init(env, reg)) { + bpf_log(log, + "arg#%d pointer type %s %s must be valid and initialized\n", + i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + if (!is_dynptr_type_expected(env, reg, + ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) { + bpf_log(log, + "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n", + i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + continue; + } + /* Check for mem, len pair */ if (arg_mem_size) { if (check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1)) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c08dde19eb67..6f6d2d511c06 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -782,8 +782,8 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ return true; } -static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg) +bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); int spi = get_spi(reg->off); @@ -802,9 +802,9 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, return true; } -static bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type) +bool is_dynptr_type_expected(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type dynptr_type; -- cgit v1.2.3 From 51df4865718540f51bb5d3e552c50dc88e1333d6 Mon
Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:43 +0200 Subject: bpf: Export bpf_dynptr_get_size() Export bpf_dynptr_get_size(), so that kernel code dealing with eBPF dynamic pointers can obtain the real size of data carried by this data structure. Signed-off-by: Roberto Sassu Reviewed-by: Joanne Koong Acked-by: KP Singh Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-6-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/helpers.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 33e543b86e1a..6535fb1e21b9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2654,6 +2654,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); +u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6d69e30f42d8..b069517a3da0 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1408,7 +1408,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } -- cgit v1.2.3 From 05b24ff9b2cfabfcfd951daaa915a036ab53c9e1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 16 Sep 2022 09:19:14 +0200 Subject: bpf: Prevent bpf program recursion for raw tracepoint probes We got report from sysbot [1] about warnings that were caused by bpf program attached to contention_begin raw tracepoint triggering the same tracepoint by using bpf_trace_printk helper that 
takes trace_printk_lock lock. Call Trace: ? trace_event_raw_event_bpf_trace_printk+0x5f/0x90 bpf_trace_printk+0x2b/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 bpf_trace_printk+0x3f/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 bpf_trace_printk+0x3f/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 bpf_trace_printk+0x3f/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 __unfreeze_partials+0x5b/0x160 ... This can be reproduced by attaching bpf program as raw tracepoint on contention_begin tracepoint. The bpf prog calls bpf_trace_printk helper. Then by running perf bench the spin lock code is forced to take slow path and call contention_begin tracepoint. Fixing this by skipping execution of the bpf program if it's already running, using bpf prog 'active' field, which is being currently used by trampoline programs for the same reason. Moving bpf_prog_inc_misses_counter to syscall.c because trampoline.c is compiled in just for CONFIG_BPF_JIT option.
Reviewed-by: Stanislav Fomichev Reported-by: syzbot+2251879aa068ad9c960d@syzkaller.appspotmail.com [1] https://lore.kernel.org/bpf/YxhFe3EwqchC%2FfYf@krava/T/#t Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220916071914.7156-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ kernel/bpf/syscall.c | 11 +++++++++++ kernel/bpf/trampoline.c | 15 ++------------- kernel/trace/bpf_trace.c | 6 ++++++ 4 files changed, 25 insertions(+), 13 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a1435b019aca..edd43edb27d6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2042,6 +2042,8 @@ static inline bool has_current_bpf_ctx(void) { return !!current->bpf_ctx; } + +void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2264,6 +2266,10 @@ static inline bool has_current_bpf_ctx(void) { return false; } + +static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dab156f09f8d..372fad5ef3d3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2093,6 +2093,17 @@ struct bpf_prog_kstats { u64 misses; }; +void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) +{ + struct bpf_prog_stats *stats; + unsigned int flags; + + stats = this_cpu_ptr(prog->stats); + flags = u64_stats_update_begin_irqsave(&stats->syncp); + u64_stats_inc(&stats->misses); + u64_stats_update_end_irqrestore(&stats->syncp, flags); +} + static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_kstats *stats) { diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ad76940b02cc..41b67eb83ab3 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -863,17 +863,6 @@ static __always_inline 
u64 notrace bpf_prog_start_time(void) return start; } -static void notrace inc_misses_counter(struct bpf_prog *prog) -{ - struct bpf_prog_stats *stats; - unsigned int flags; - - stats = this_cpu_ptr(prog->stats); - flags = u64_stats_update_begin_irqsave(&stats->syncp); - u64_stats_inc(&stats->misses); - u64_stats_update_end_irqrestore(&stats->syncp, flags); -} - /* The logic is similar to bpf_prog_run(), but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into @@ -896,7 +885,7 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *ru run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { - inc_misses_counter(prog); + bpf_prog_inc_misses_counter(prog); return 0; } return bpf_prog_start_time(); @@ -967,7 +956,7 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r might_fault(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { - inc_misses_counter(prog); + bpf_prog_inc_misses_counter(prog); return 0; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9df53c40cffd..b05f0310dbd3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2222,9 +2222,15 @@ static __always_inline void __bpf_trace_run(struct bpf_prog *prog, u64 *args) { cant_sleep(); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + bpf_prog_inc_misses_counter(prog); + goto out; + } rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); +out: + this_cpu_dec(*(prog->active)); } #define UNPACK(...) __VA_ARGS__ -- cgit v1.2.3 From 1d8b82c613297f24354b4d750413a7456b5cd92c Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 21 Sep 2022 15:38:26 +0800 Subject: bpf: Always use raw spinlock for hash bucket lock For a non-preallocated hash map on RT kernel, regular spinlock instead of raw spinlock is used for bucket lock. 
The reason is that on RT kernel memory allocation is forbidden under atomic context and regular spinlock is sleepable under RT. Now hash map has been fully converted to use bpf_map_alloc, and there will be no synchronous memory allocation for non-preallocated hash map, so it is safe to always use raw spinlock for bucket lock on RT. So removing the usage of htab_use_raw_lock() and updating the comments accordingly. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20220921073826.2365800-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 66 +++++++++++----------------------------------------- 1 file changed, 14 insertions(+), 52 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 86aec20c22d0..ed3f8a53603b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -68,24 +68,16 @@ * In theory the BPF locks could be converted to regular spinlocks as well, * but the bucket locks and percpu_freelist locks can be taken from * arbitrary contexts (perf, kprobes, tracepoints) which are required to be - * atomic contexts even on RT. These mechanisms require preallocated maps, - * so there is no need to invoke memory allocations within the lock held - * sections. - * - * BPF maps which need dynamic allocation are only used from (forced) - * thread context on RT and can therefore use regular spinlocks which in - * turn allows to invoke memory allocations from the lock held section. - * - * On a non RT kernel this distinction is neither possible nor required. - * spinlock maps to raw_spinlock and the extra code is optimized out by the - * compiler. + * atomic contexts even on RT. Before the introduction of bpf_mem_alloc, + * it is only safe to use raw spinlock for preallocated hash map on a RT kernel, + * because there is no memory allocation within the lock held sections. 
However + * after hash map was fully converted to use bpf_mem_alloc, there will be + * non-synchronous memory allocation for non-preallocated hash map, so it is + * safe to always use raw spinlock for bucket lock. */ struct bucket { struct hlist_nulls_head head; - union { - raw_spinlock_t raw_lock; - spinlock_t lock; - }; + raw_spinlock_t raw_lock; }; #define HASHTAB_MAP_LOCK_COUNT 8 @@ -141,26 +133,15 @@ static inline bool htab_is_prealloc(const struct bpf_htab *htab) return !(htab->map.map_flags & BPF_F_NO_PREALLOC); } -static inline bool htab_use_raw_lock(const struct bpf_htab *htab) -{ - return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab)); -} - static void htab_init_buckets(struct bpf_htab *htab) { unsigned int i; for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - if (htab_use_raw_lock(htab)) { - raw_spin_lock_init(&htab->buckets[i].raw_lock); - lockdep_set_class(&htab->buckets[i].raw_lock, + raw_spin_lock_init(&htab->buckets[i].raw_lock); + lockdep_set_class(&htab->buckets[i].raw_lock, &htab->lockdep_key); - } else { - spin_lock_init(&htab->buckets[i].lock); - lockdep_set_class(&htab->buckets[i].lock, - &htab->lockdep_key); - } cond_resched(); } } @@ -170,28 +151,17 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab, unsigned long *pflags) { unsigned long flags; - bool use_raw_lock; hash = hash & HASHTAB_MAP_LOCK_MASK; - use_raw_lock = htab_use_raw_lock(htab); - if (use_raw_lock) - preempt_disable(); - else - migrate_disable(); + preempt_disable(); if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { __this_cpu_dec(*(htab->map_locked[hash])); - if (use_raw_lock) - preempt_enable(); - else - migrate_enable(); + preempt_enable(); return -EBUSY; } - if (use_raw_lock) - raw_spin_lock_irqsave(&b->raw_lock, flags); - else - spin_lock_irqsave(&b->lock, flags); + raw_spin_lock_irqsave(&b->raw_lock, flags); *pflags = flags; return 0; @@ -201,18 +171,10 @@ static inline void 
htab_unlock_bucket(const struct bpf_htab *htab, struct bucket *b, u32 hash, unsigned long flags) { - bool use_raw_lock = htab_use_raw_lock(htab); - hash = hash & HASHTAB_MAP_LOCK_MASK; - if (use_raw_lock) - raw_spin_unlock_irqrestore(&b->raw_lock, flags); - else - spin_unlock_irqrestore(&b->lock, flags); + raw_spin_unlock_irqrestore(&b->raw_lock, flags); __this_cpu_dec(*(htab->map_locked[hash])); - if (use_raw_lock) - preempt_enable(); - else - migrate_enable(); + preempt_enable(); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); -- cgit v1.2.3 From eed807f626101f6a4227bd53942892c5983b95a7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 21 Sep 2022 18:48:25 +0200 Subject: bpf: Tweak definition of KF_TRUSTED_ARGS Instead of forcing all arguments to be referenced pointers with non-zero reg->ref_obj_id, tweak the definition of KF_TRUSTED_ARGS to mean that only PTR_TO_BTF_ID (and socket types translated to PTR_TO_BTF_ID) have that constraint, and require their offset to be set to 0. The rest of pointer types are also accommodated in this definition of trusted pointers, but with more relaxed rules regarding offsets. The inherent meaning of setting this flag is that all kfunc pointer arguments have a guaranteed lifetime, and kernel object pointers (PTR_TO_BTF_ID, PTR_TO_CTX) are passed in their unmodified form (with offset 0). In general, this is not true for PTR_TO_BTF_ID as it can be obtained using pointer walks.
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/cdede0043c47ed7a357f0a915d16f9ce06a1d589.1663778601.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 24 ++++++++++++++++-------- kernel/bpf/btf.c | 18 +++++++++++++----- 2 files changed, 29 insertions(+), 13 deletions(-) (limited to 'kernel/bpf') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 781731749e55..0f858156371d 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -137,14 +137,22 @@ KF_ACQUIRE and KF_RET_NULL flags. -------------------------- The KF_TRUSTED_ARGS flag is used for kfuncs taking pointer arguments. It -indicates that the all pointer arguments will always be refcounted, and have -their offset set to 0. It can be used to enforce that a pointer to a refcounted -object acquired from a kfunc or BPF helper is passed as an argument to this -kfunc without any modifications (e.g. pointer arithmetic) such that it is -trusted and points to the original object. This flag is often used for kfuncs -that operate (change some property, perform some operation) on an object that -was obtained using an acquire kfunc. Such kfuncs need an unchanged pointer to -ensure the integrity of the operation being performed on the expected object. +indicates that the all pointer arguments will always have a guaranteed lifetime, +and pointers to kernel objects are always passed to helpers in their unmodified +form (as obtained from acquire kfuncs). + +It can be used to enforce that a pointer to a refcounted object acquired from a +kfunc or BPF helper is passed as an argument to this kfunc without any +modifications (e.g. pointer arithmetic) such that it is trusted and points to +the original object. + +Meanwhile, it is also allowed pass pointers to normal memory to such kfuncs, +but those can have a non-zero offset. 
+ +This flag is often used for kfuncs that operate (change some property, perform +some operation) on an object that was obtained using an acquire kfunc. Such +kfuncs need an unchanged pointer to ensure the integrity of the operation being +performed on the expected object. 2.4.6 KF_SLEEPABLE flag ----------------------- diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 13faede0f2b4..a44ad4b347ff 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6227,7 +6227,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, bool processing_call) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); - bool rel = false, kptr_get = false, trusted_arg = false; + bool rel = false, kptr_get = false, trusted_args = false; bool sleepable = false; struct bpf_verifier_log *log = &env->log; u32 i, nargs, ref_id, ref_obj_id = 0; @@ -6265,7 +6265,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, /* Only kfunc can be release func */ rel = kfunc_meta->flags & KF_RELEASE; kptr_get = kfunc_meta->flags & KF_KPTR_GET; - trusted_arg = kfunc_meta->flags & KF_TRUSTED_ARGS; + trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS; sleepable = kfunc_meta->flags & KF_SLEEPABLE; } @@ -6276,6 +6276,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1; struct bpf_reg_state *reg = ®s[regno]; + bool obj_ptr = false; t = btf_type_skip_modifiers(btf, args[i].type, NULL); if (btf_type_is_scalar(t)) { @@ -6323,10 +6324,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + /* These register types have special constraints wrt ref_obj_id + * and offset checks. The rest of trusted args don't. + */ + obj_ptr = reg->type == PTR_TO_CTX || reg->type == PTR_TO_BTF_ID || + reg2btf_ids[base_type(reg->type)]; + /* Check if argument must be a referenced pointer, args + i has * been verified to be a pointer (after skipping modifiers). 
+ * PTR_TO_CTX is ok without having non-zero ref_obj_id. */ - if (is_kfunc && trusted_arg && !reg->ref_obj_id) { + if (is_kfunc && trusted_args && (obj_ptr && reg->type != PTR_TO_CTX) && !reg->ref_obj_id) { bpf_log(log, "R%d must be referenced\n", regno); return -EINVAL; } @@ -6335,7 +6343,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, ref_tname = btf_name_by_offset(btf, ref_t->name_off); /* Trusted args have the same offset checks as release arguments */ - if (trusted_arg || (rel && reg->ref_obj_id)) + if ((trusted_args && obj_ptr) || (rel && reg->ref_obj_id)) arg_type |= OBJ_RELEASE; ret = check_func_arg_reg_off(env, reg, regno, arg_type); if (ret < 0) @@ -6435,7 +6443,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_t->name_off); if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, reg->off, btf, ref_id, - trusted_arg || (rel && reg->ref_obj_id))) { + trusted_args || (rel && reg->ref_obj_id))) { bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", func_name, i, btf_type_str(ref_t), ref_tname, -- cgit v1.2.3 From 19c02415da2345d0dda2b5c4495bc17cc14b18b5 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 26 Sep 2022 11:47:38 -0700 Subject: bpf: use bpf_prog_pack for bpf_dispatcher Allocate bpf_dispatcher with bpf_prog_pack_alloc so that bpf_dispatcher can share pages with bpf programs. arch_prepare_bpf_dispatcher() is updated to provide a RW buffer as working area for arch code to write to. This also fixes CPA W^X warnning like: CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ... 
Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20220926184739.3512547-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 16 ++++++++-------- include/linux/bpf.h | 3 ++- include/linux/filter.h | 5 +++++ kernel/bpf/core.c | 9 +++++++-- kernel/bpf/dispatcher.c | 27 +++++++++++++++++++++------ 5 files changed, 43 insertions(+), 17 deletions(-) (limited to 'kernel/bpf') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d4a6183197e9..35796db58116 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2242,7 +2242,7 @@ cleanup: return ret; } -static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) +static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf) { u8 *jg_reloc, *prog = *pprog; int pivot, err, jg_bytes = 1; @@ -2258,12 +2258,12 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), progs[a]); err = emit_cond_near_jump(&prog, /* je func */ - (void *)progs[a], prog, + (void *)progs[a], image + (prog - buf), X86_JE); if (err) return err; - emit_indirect_jump(&prog, 2 /* rdx */, prog); + emit_indirect_jump(&prog, 2 /* rdx */, image + (prog - buf)); *pprog = prog; return 0; @@ -2288,7 +2288,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) jg_reloc = prog; err = emit_bpf_dispatcher(&prog, a, a + pivot, /* emit lower_part */ - progs); + progs, image, buf); if (err) return err; @@ -2302,7 +2302,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes); err = emit_bpf_dispatcher(&prog, a + pivot + 1, /* emit upper_part */ - b, progs); + b, progs, image, buf); if (err) return err; @@ -2322,12 +2322,12 @@ static int cmp_ips(const void *a, const void *b) return 0; } -int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +int arch_prepare_bpf_dispatcher(void *image, 
void *buf, s64 *funcs, int num_funcs) { - u8 *prog = image; + u8 *prog = buf; sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL); - return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs); + return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf); } struct x64_jit_data { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index edd43edb27d6..9ae155c75014 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -946,6 +946,7 @@ struct bpf_dispatcher { struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX]; int num_progs; void *image; + void *rw_image; u32 image_off; struct bpf_ksym ksym; }; @@ -964,7 +965,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); -int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); +int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 98e28126c24b..efc42a6e3aed 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1023,6 +1023,8 @@ extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); +void bpf_jit_fill_hole_with_zero(void *area, unsigned int size); + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -1035,6 +1037,9 @@ void bpf_jit_free(struct bpf_prog *fp); struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); +void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_prog_pack_free(struct bpf_binary_header *hdr); + static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { return list_empty(&fp->aux->ksym.lnode) || 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d1be78c28619..711fd293b6de 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -825,6 +825,11 @@ struct bpf_prog_pack { unsigned long bitmap[]; }; +void bpf_jit_fill_hole_with_zero(void *area, unsigned int size) +{ + memset(area, 0, size); +} + #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) static DEFINE_MUTEX(pack_mutex); @@ -864,7 +869,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins return pack; } -static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) +void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) { unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); struct bpf_prog_pack *pack; @@ -905,7 +910,7 @@ out: return ptr; } -static void bpf_prog_pack_free(struct bpf_binary_header *hdr) +void bpf_prog_pack_free(struct bpf_binary_header *hdr) { struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 2444bd15cc2d..fa64b80b8bca 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -85,12 +85,12 @@ static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d, return false; } -int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs) { return -ENOTSUPP; } -static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) +static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf) { s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; int i; @@ -99,12 +99,12 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) if (d->progs[i].prog) *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func; } - return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs); + return arch_prepare_bpf_dispatcher(image, buf, &ips[0], 
d->num_progs); } static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) { - void *old, *new; + void *old, *new, *tmp; u32 noff; int err; @@ -117,8 +117,14 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) } new = d->num_progs ? d->image + noff : NULL; + tmp = d->num_progs ? d->rw_image + noff : NULL; if (new) { - if (bpf_dispatcher_prepare(d, new)) + /* Prepare the dispatcher in d->rw_image. Then use + * bpf_arch_text_copy to update d->image, which is RO+X. + */ + if (bpf_dispatcher_prepare(d, new, tmp)) + return; + if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2))) return; } @@ -140,9 +146,18 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, mutex_lock(&d->mutex); if (!d->image) { - d->image = bpf_jit_alloc_exec_page(); + d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero); if (!d->image) goto out; + d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!d->rw_image) { + u32 size = PAGE_SIZE; + + bpf_arch_text_copy(d->image, &size, sizeof(size)); + bpf_prog_pack_free((struct bpf_binary_header *)d->image); + d->image = NULL; + goto out; + } bpf_image_ksym_add(d->image, &d->ksym); } -- cgit v1.2.3 From 5b0d1c7bd5722467960829af51d523f5a6ffd848 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 26 Sep 2022 11:47:39 -0700 Subject: bpf: Enforce W^X for bpf trampoline Mark the trampoline as RO+X after arch_prepare_bpf_trampoline, so that the trampoine follows W^X rule strictly. This will turn off warnings like CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ... Also remove bpf_jit_alloc_exec_page(), since it is not used any more. 
Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20220926184739.3512547-3-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 - kernel/bpf/trampoline.c | 22 +++++----------------- 2 files changed, 5 insertions(+), 18 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9ae155c75014..5161fac0513f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1008,7 +1008,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs. */ -void *bpf_jit_alloc_exec_page(void); void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 41b67eb83ab3..6f7b939321d6 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -116,22 +116,6 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } -void *bpf_jit_alloc_exec_page(void) -{ - void *image; - - image = bpf_jit_alloc_exec(PAGE_SIZE); - if (!image) - return NULL; - - set_vm_flush_reset_perms(image); - /* Keep image as writeable. The alternative is to keep flipping ro/rw - * every time new program is attached or detached. 
- */ - set_memory_x((long)image, 1); - return image; -} - void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; @@ -404,9 +388,10 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) goto out_free_im; err = -ENOMEM; - im->image = image = bpf_jit_alloc_exec_page(); + im->image = image = bpf_jit_alloc_exec(PAGE_SIZE); if (!image) goto out_uncharge; + set_vm_flush_reset_perms(image); err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); if (err) @@ -483,6 +468,9 @@ again: if (err < 0) goto out; + set_memory_ro((long)im->image, 1); + set_memory_x((long)im->image, 1); + WARN_ON(tr->cur_image && tr->selector == 0); WARN_ON(!tr->cur_image && tr->selector); if (tr->cur_image) -- cgit v1.2.3 From f0d74c4da1f060d2a66976193712a5e6abd361f5 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:53 -0700 Subject: bpf: Parameterize task iterators. Allow creating an iterator that loops through resources of one thread/process. People could only create iterators to loop through all resources of files, vma, and tasks in the system, even though they were interested in only the resources of a specific task or process. Passing the additional parameters, people can now create an iterator to go through all resources or only the resources of a task. 
Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-2-kuifeng@fb.com --- include/linux/bpf.h | 25 ++++++ include/uapi/linux/bpf.h | 6 ++ kernel/bpf/task_iter.c | 188 ++++++++++++++++++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 6 ++ 4 files changed, 203 insertions(+), 22 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5161fac0513f..0f3eaf3ed98c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1796,6 +1796,27 @@ int bpf_obj_get_user(const char __user *pathname, int flags); extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } +/* + * The task type of iterators. + * + * For BPF task iterators, they can be parameterized with various + * parameters to visit only some of tasks. + * + * BPF_TASK_ITER_ALL (default) + * Iterate over resources of every task. + * + * BPF_TASK_ITER_TID + * Iterate over resources of a task/tid. + * + * BPF_TASK_ITER_TGID + * Iterate over resources of every task of a process / task group. + */ +enum bpf_iter_task_type { + BPF_TASK_ITER_ALL = 0, + BPF_TASK_ITER_TID, + BPF_TASK_ITER_TGID, +}; + struct bpf_iter_aux_info { /* for map_elem iter */ struct bpf_map *map; @@ -1805,6 +1826,10 @@ struct bpf_iter_aux_info { struct cgroup *start; /* starting cgroup */ enum bpf_cgroup_iter_order order; } cgroup; + struct { + enum bpf_iter_task_type type; + u32 pid; + } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d6bd10759eaf..455b21a53aac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -110,6 +110,12 @@ union bpf_iter_link_info { __u32 cgroup_fd; __u64 cgroup_id; } cgroup; + /* Parameters of task iterators. 
*/ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8c921799def4..8b2f47e7139d 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -12,6 +12,9 @@ struct bpf_iter_seq_task_common { struct pid_namespace *ns; + enum bpf_iter_task_type type; + u32 pid; + u32 pid_visiting; }; struct bpf_iter_seq_task_info { @@ -22,18 +25,115 @@ struct bpf_iter_seq_task_info { u32 tid; }; -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, +static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common, + u32 *tid, + bool skip_if_dup_files) +{ + struct task_struct *task, *next_task; + struct pid *pid; + u32 saved_tid; + + if (!*tid) { + /* The first time, the iterator calls this function. */ + pid = find_pid_ns(common->pid, common->ns); + if (!pid) + return NULL; + + task = get_pid_task(pid, PIDTYPE_TGID); + if (!task) + return NULL; + + *tid = common->pid; + common->pid_visiting = common->pid; + + return task; + } + + /* If the control returns to user space and comes back to the + * kernel again, *tid and common->pid_visiting should be the + * same for task_seq_start() to pick up the correct task. + */ + if (*tid == common->pid_visiting) { + pid = find_pid_ns(common->pid_visiting, common->ns); + task = get_pid_task(pid, PIDTYPE_PID); + + return task; + } + + pid = find_pid_ns(common->pid_visiting, common->ns); + if (!pid) + return NULL; + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return NULL; + +retry: + if (!pid_alive(task)) { + put_task_struct(task); + return NULL; + } + + next_task = next_thread(task); + put_task_struct(task); + if (!next_task) + return NULL; + + saved_tid = *tid; + *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); + if (!*tid || *tid == common->pid) { + /* Run out of tasks of a process. 
The tasks of a + * thread_group are linked as circular linked list. + */ + *tid = saved_tid; + return NULL; + } + + get_task_struct(next_task); + common->pid_visiting = *tid; + + if (skip_if_dup_files && task->files == task->group_leader->files) { + task = next_task; + goto retry; + } + + return next_task; +} + +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, u32 *tid, bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; + if (common->type == BPF_TASK_ITER_TID) { + if (*tid && *tid != common->pid) + return NULL; + rcu_read_lock(); + pid = find_pid_ns(common->pid, common->ns); + if (pid) { + task = get_pid_task(pid, PIDTYPE_TGID); + *tid = common->pid; + } + rcu_read_unlock(); + + return task; + } + + if (common->type == BPF_TASK_ITER_TGID) { + rcu_read_lock(); + task = task_group_seq_get_next(common, tid, skip_if_dup_files); + rcu_read_unlock(); + + return task; + } + rcu_read_lock(); retry: - pid = find_ge_pid(*tid, ns); + pid = find_ge_pid(*tid, common->ns); if (pid) { - *tid = pid_nr_ns(pid, ns); + *tid = pid_nr_ns(pid, common->ns); task = get_pid_task(pid, PIDTYPE_PID); if (!task) { ++*tid; @@ -56,7 +156,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -73,7 +173,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -117,6 +217,41 @@ static void task_seq_stop(struct seq_file *seq, void *v) put_task_struct((struct task_struct *)v); } +static int bpf_iter_attach_task(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, 
+ struct bpf_iter_aux_info *aux) +{ + unsigned int flags; + struct pid *pid; + pid_t tgid; + + if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1) + return -EINVAL; + + aux->task.type = BPF_TASK_ITER_ALL; + if (linfo->task.tid != 0) { + aux->task.type = BPF_TASK_ITER_TID; + aux->task.pid = linfo->task.tid; + } + if (linfo->task.pid != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + aux->task.pid = linfo->task.pid; + } + if (linfo->task.pid_fd != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + tgid = pid_nr_ns(pid, task_active_pid_ns(current)); + aux->task.pid = tgid; + put_pid(pid); + } + + return 0; +} + static const struct seq_operations task_seq_ops = { .start = task_seq_start, .next = task_seq_next, @@ -137,8 +272,7 @@ struct bpf_iter_seq_task_file_info { static struct file * task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) { - struct pid_namespace *ns = info->common.ns; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; struct task_struct *curr_task; unsigned int curr_fd = info->fd; @@ -151,21 +285,18 @@ again: curr_task = info->task; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { info->task = NULL; - info->tid = curr_tid; return NULL; } - /* set info->task and info->tid */ + /* set info->task */ info->task = curr_task; - if (curr_tid == info->tid) { + if (saved_tid == info->tid) curr_fd = info->fd; - } else { - info->tid = curr_tid; + else curr_fd = 0; - } } rcu_read_lock(); @@ -186,9 +317,15 @@ again: /* the current task is done, go to the next task */ rcu_read_unlock(); put_task_struct(curr_task); + + if (info->common.type == BPF_TASK_ITER_TID) { + info->task = NULL; + return NULL; + } + info->task = NULL; info->fd = 0; - curr_tid = ++(info->tid); + saved_tid = ++(info->tid); goto again; } @@ -269,6 +406,9 
@@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) struct bpf_iter_seq_task_common *common = priv_data; common->ns = get_pid_ns(task_active_pid_ns(current)); + common->type = aux->task.type; + common->pid = aux->task.pid; + return 0; } @@ -307,11 +447,10 @@ enum bpf_task_vma_iter_find_op { static struct vm_area_struct * task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) { - struct pid_namespace *ns = info->common.ns; enum bpf_task_vma_iter_find_op op; struct vm_area_struct *curr_vma; struct task_struct *curr_task; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; /* If this function returns a non-NULL vma, it holds a reference to * the task_struct, and holds read lock on vma->mm->mmap_lock. @@ -371,14 +510,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) } } else { again: - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { - info->tid = curr_tid + 1; + info->tid++; goto finish; } - if (curr_tid != info->tid) { - info->tid = curr_tid; + if (saved_tid != info->tid) { /* new task, process the first vma */ op = task_vma_iter_first_vma; } else { @@ -430,9 +568,12 @@ again: return curr_vma; next_task: + if (info->common.type == BPF_TASK_ITER_TID) + goto finish; + put_task_struct(curr_task); info->task = NULL; - curr_tid++; + info->tid++; goto again; finish: @@ -533,6 +674,7 @@ static const struct bpf_iter_seq_info task_seq_info = { static struct bpf_iter_reg task_reg_info = { .target = "task", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { @@ -551,6 +693,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { @@ -571,6 +714,7 @@ static const struct bpf_iter_seq_info 
task_vma_seq_info = { static struct bpf_iter_reg task_vma_reg_info = { .target = "task_vma", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d6bd10759eaf..455b21a53aac 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -110,6 +110,12 @@ union bpf_iter_link_info { __u32 cgroup_fd; __u64 cgroup_id; } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ -- cgit v1.2.3 From 21fb6f2aa3890b0d0abf88b7756d0098e9367a7c Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:54 -0700 Subject: bpf: Handle bpf_link_info for the parameterized task BPF iterators. Add new fields to bpf_link_info that users can query it through bpf_obj_get_info_by_fd(). Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-3-kuifeng@fb.com --- include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/task_iter.c | 18 ++++++++++++++++++ tools/include/uapi/linux/bpf.h | 4 ++++ 3 files changed, 26 insertions(+) (limited to 'kernel/bpf') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 455b21a53aac..3075018a4ef8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6265,6 +6265,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 order; } cgroup; + struct { + __u32 tid; + __u32 pid; + } task; }; } iter; struct { diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8b2f47e7139d..46f836be22e2 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -672,6 +672,21 @@ static const struct bpf_iter_seq_info task_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), }; +static int bpf_iter_fill_link_info(const struct 
bpf_iter_aux_info *aux, struct bpf_link_info *info) +{ + switch (aux->task.type) { + case BPF_TASK_ITER_TID: + info->iter.task.tid = aux->task.pid; + break; + case BPF_TASK_ITER_TGID: + info->iter.task.pid = aux->task.pid; + break; + default: + break; + } + return 0; +} + static struct bpf_iter_reg task_reg_info = { .target = "task", .attach_target = bpf_iter_attach_task, @@ -682,6 +697,7 @@ static struct bpf_iter_reg task_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_seq_info, + .fill_link_info = bpf_iter_fill_link_info, }; static const struct bpf_iter_seq_info task_file_seq_info = { @@ -703,6 +719,7 @@ static struct bpf_iter_reg task_file_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_file_seq_info, + .fill_link_info = bpf_iter_fill_link_info, }; static const struct bpf_iter_seq_info task_vma_seq_info = { @@ -724,6 +741,7 @@ static struct bpf_iter_reg task_vma_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_vma_seq_info, + .fill_link_info = bpf_iter_fill_link_info, }; BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 455b21a53aac..3075018a4ef8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6265,6 +6265,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 order; } cgroup; + struct { + __u32 tid; + __u32 pid; + } task; }; } iter; struct { -- cgit v1.2.3 From 2c4fe44fb020f3cce904da2ba9e42bb1c118e8a3 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:55 -0700 Subject: bpf: Handle show_fdinfo for the parameterized task BPF iterators Show information of iterators in the respective files under /proc//fdinfo/. For example, for a task file iterator with 1723 as the value of tid parameter, its fdinfo would look like the following lines. 
pos: 0 flags: 02000000 mnt_id: 14 ino: 38 link_type: iter link_id: 51 prog_tag: a590ac96db22b825 prog_id: 299 target_name: task_file task_type: TID tid: 1723 This patch add the last three fields. task_type is the type of the task parameter. TID means the iterator visit only the thread specified by tid. The value of tid in the above example is 1723. For the case of PID task_type, it means the iterator visits only threads of a process and will show the pid value of the process instead of a tid. Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-4-kuifeng@fb.com --- kernel/bpf/task_iter.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 46f836be22e2..67e03e1833ba 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -10,6 +10,12 @@ #include #include "mmap_unlock_work.h" +static const char * const iter_task_type_names[] = { + "ALL", + "TID", + "PID", +}; + struct bpf_iter_seq_task_common { struct pid_namespace *ns; enum bpf_iter_task_type type; @@ -687,6 +693,15 @@ static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct b return 0; } +static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq) +{ + seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]); + if (aux->task.type == BPF_TASK_ITER_TID) + seq_printf(seq, "tid:\t%u\n", aux->task.pid); + else if (aux->task.type == BPF_TASK_ITER_TGID) + seq_printf(seq, "pid:\t%u\n", aux->task.pid); +} + static struct bpf_iter_reg task_reg_info = { .target = "task", .attach_target = bpf_iter_attach_task, @@ -698,6 +713,7 @@ static struct bpf_iter_reg task_reg_info = { }, .seq_info = &task_seq_info, .fill_link_info = bpf_iter_fill_link_info, + .show_fdinfo = bpf_iter_task_show_fdinfo, }; static const struct bpf_iter_seq_info 
task_file_seq_info = { @@ -720,6 +736,7 @@ static struct bpf_iter_reg task_file_reg_info = { }, .seq_info = &task_file_seq_info, .fill_link_info = bpf_iter_fill_link_info, + .show_fdinfo = bpf_iter_task_show_fdinfo, }; static const struct bpf_iter_seq_info task_vma_seq_info = { @@ -742,6 +759,7 @@ static struct bpf_iter_reg task_vma_reg_info = { }, .seq_info = &task_vma_seq_info, .fill_link_info = bpf_iter_fill_link_info, + .show_fdinfo = bpf_iter_task_show_fdinfo, }; BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start, -- cgit v1.2.3 From 64696c40d03c01e0ea2e3e9aa1c490a7b6a1b6be Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:03 -0700 Subject: bpf: Add __bpf_prog_{enter,exit}_struct_ops for struct_ops trampoline The struct_ops prog is to allow using bpf to implement the functions in a struct (eg. kernel module). The current usage is to implement the tcp_congestion. The kernel does not call the tcp-cc's ops (ie. the bpf prog) in a recursive way. The struct_ops is sharing the tracing-trampoline's enter/exit function which tracks prog->active to avoid recursion. It is needed for tracing prog. However, it turns out the struct_ops bpf prog will hit this prog->active and unnecessarily skipped running the struct_ops prog. eg. The '.ssthresh' may run in_task() and then interrupted by softirq that runs the same '.ssthresh'. Skip running the '.ssthresh' will end up returning random value to the caller. The patch adds __bpf_prog_{enter,exit}_struct_ops for the struct_ops trampoline. They do not track the prog->active to detect recursion. One exception is when the tcp_congestion's '.init' ops is doing bpf_setsockopt(TCP_CONGESTION) and then recurs to the same '.init' ops. This will be addressed in the following patches. 
Fixes: ca06f55b9002 ("bpf: Add per-program recursion prevention mechanism") Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220929070407.965581-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 3 +++ include/linux/bpf.h | 4 ++++ kernel/bpf/trampoline.c | 23 +++++++++++++++++++++++ 3 files changed, 30 insertions(+) (limited to 'kernel/bpf') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 35796db58116..5b6230779cf3 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1836,6 +1836,9 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (p->aux->sleepable) { enter = __bpf_prog_enter_sleepable; exit = __bpf_prog_exit_sleepable; + } else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) { + enter = __bpf_prog_enter_struct_ops; + exit = __bpf_prog_exit_struct_ops; } else if (p->expected_attach_type == BPF_LSM_CGROUP) { enter = __bpf_prog_enter_lsm_cgroup; exit = __bpf_prog_exit_lsm_cgroup; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0f3eaf3ed98c..9e7d46d16032 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -864,6 +864,10 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 6f7b939321d6..bf0906e1e2b9 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -964,6 +964,29 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, 
rcu_read_unlock_trace(); } +u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) + __acquires(RCU) +{ + rcu_read_lock(); + migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return bpf_prog_start_time(); +} + +void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) + __releases(RCU) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + update_prog_stats(prog, start); + migrate_enable(); + rcu_read_unlock(); +} + void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) { percpu_ref_get(&tr->pcref); -- cgit v1.2.3