diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/bpf_iter.c | 9 | ||||
-rw-r--r-- | kernel/bpf/bpf_lsm.c | 81 | ||||
-rw-r--r-- | kernel/bpf/bpf_struct_ops.c | 7 | ||||
-rw-r--r-- | kernel/bpf/btf.c | 234 | ||||
-rw-r--r-- | kernel/bpf/cgroup.c | 416 | ||||
-rw-r--r-- | kernel/bpf/core.c | 40 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 24 | ||||
-rw-r--r-- | kernel/bpf/percpu_freelist.c | 20 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 25 | ||||
-rw-r--r-- | kernel/bpf/trampoline.c | 262 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 287 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 37 | ||||
-rw-r--r-- | kernel/events/core.c | 61 | ||||
-rw-r--r-- | kernel/exit.c | 2 | ||||
-rw-r--r-- | kernel/kexec_file.c | 11 | ||||
-rw-r--r-- | kernel/module/internal.h | 13 | ||||
-rw-r--r-- | kernel/module/kallsyms.c | 35 | ||||
-rw-r--r-- | kernel/module/main.c | 9 | ||||
-rw-r--r-- | kernel/printk/printk.c | 13 | ||||
-rw-r--r-- | kernel/sysctl.c | 98 | ||||
-rw-r--r-- | kernel/time/posix-timers.c | 19 | ||||
-rw-r--r-- | kernel/trace/Kconfig | 3 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace.c | 11 | ||||
-rw-r--r-- | kernel/trace/trace_events_hist.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_uprobe.c | 7 | ||||
-rw-r--r-- | kernel/watch_queue.c | 53 |
27 files changed, 1400 insertions, 383 deletions
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index d5d96ceca105..7e8fd49406f6 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -723,9 +723,6 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = { .arg4_type = ARG_ANYTHING, }; -/* maximum number of loops */ -#define MAX_LOOPS BIT(23) - BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, u64, flags) { @@ -733,9 +730,13 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, u64 ret; u32 i; + /* Note: these safety checks are also verified when bpf_loop + * is inlined, be careful to modify this code in sync. See + * function verifier.c:inline_bpf_loop. + */ if (flags) return -EINVAL; - if (nr_loops > MAX_LOOPS) + if (nr_loops > BPF_MAX_LOOPS) return -E2BIG; for (i = 0; i < nr_loops; i++) { diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index c1351df9f7ee..d469b7f3deef 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -16,6 +16,7 @@ #include <linux/bpf_local_storage.h> #include <linux/btf_ids.h> #include <linux/ima.h> +#include <linux/bpf-cgroup.h> /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -35,6 +36,57 @@ BTF_SET_START(bpf_lsm_hooks) #undef LSM_HOOK BTF_SET_END(bpf_lsm_hooks) +/* List of LSM hooks that should operate on 'current' cgroup regardless + * of function signature. + */ +BTF_SET_START(bpf_lsm_current_hooks) +/* operate on freshly allocated sk without any cgroup association */ +BTF_ID(func, bpf_lsm_sk_alloc_security) +BTF_ID(func, bpf_lsm_sk_free_security) +BTF_SET_END(bpf_lsm_current_hooks) + +/* List of LSM hooks that trigger while the socket is properly locked. + */ +BTF_SET_START(bpf_lsm_locked_sockopt_hooks) +BTF_ID(func, bpf_lsm_socket_sock_rcv_skb) +BTF_ID(func, bpf_lsm_sock_graft) +BTF_ID(func, bpf_lsm_inet_csk_clone) +BTF_ID(func, bpf_lsm_inet_conn_established) +BTF_SET_END(bpf_lsm_locked_sockopt_hooks) + +/* List of LSM hooks that trigger while the socket is _not_ locked, + * but it's ok to call bpf_{g,s}etsockopt because the socket is still + * in the early init phase. + */ +BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks) +BTF_ID(func, bpf_lsm_socket_post_create) +BTF_ID(func, bpf_lsm_socket_socketpair) +BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks) + +void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, + bpf_func_t *bpf_func) +{ + const struct btf_param *args; + + if (btf_type_vlen(prog->aux->attach_func_proto) < 1 || + btf_id_set_contains(&bpf_lsm_current_hooks, + prog->aux->attach_btf_id)) { + *bpf_func = __cgroup_bpf_run_lsm_current; + return; + } + + args = btf_params(prog->aux->attach_func_proto); + +#ifdef CONFIG_NET + if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET]) + *bpf_func = __cgroup_bpf_run_lsm_socket; + else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK]) + *bpf_func = __cgroup_bpf_run_lsm_sock; + else +#endif + *bpf_func = __cgroup_bpf_run_lsm_current; +} + int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { @@ -158,6 +210,35 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; case BPF_FUNC_get_attach_cookie: return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; + case BPF_FUNC_get_local_storage: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_get_local_storage_proto : NULL; + case BPF_FUNC_set_retval: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_set_retval_proto : NULL; + case BPF_FUNC_get_retval: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_get_retval_proto : NULL; + case BPF_FUNC_setsockopt: + if (prog->expected_attach_type != BPF_LSM_CGROUP) + return NULL; + if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_sk_setsockopt_proto; + if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_unlocked_sk_setsockopt_proto; + return NULL; + case BPF_FUNC_getsockopt: + if (prog->expected_attach_type != BPF_LSM_CGROUP) + return NULL; + if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_sk_getsockopt_proto; + if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_unlocked_sk_getsockopt_proto; + return NULL; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d9a3c9207240..7e0068c3399c 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -503,10 +503,9 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - /* Error during st_ops->reg(). It is very unlikely since - * the above init_member() should have caught it earlier - * before reg(). The only possibility is if there was a race - * in registering the struct_ops (under the same name) to + /* Error during st_ops->reg(). Can happen if this struct_ops needs to be + * verified as a whole, after all init_member() calls. Can also happen if + * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ set_memory_nx((long)st_map->image, 1); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index eb12d4f705cc..4423045b8ff3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -309,6 +309,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", [BTF_KIND_TYPE_TAG] = "TYPE_TAG", + [BTF_KIND_ENUM64] = "ENUM64", }; const char *btf_type_str(const struct btf_type *t) @@ -666,6 +667,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_ENUM: case BTF_KIND_DATASEC: case BTF_KIND_FLOAT: + case BTF_KIND_ENUM64: return true; } @@ -711,6 +713,11 @@ static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t) return (const struct btf_decl_tag *)(t + 1); } +static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t) +{ + return (const struct btf_enum64 *)(t + 1); +} + static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -1019,6 +1026,7 @@ static const char *btf_show_name(struct btf_show *show) parens = "{"; break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: prefix = "enum"; break; default: @@ -1834,6 +1842,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FLOAT: + case BTF_KIND_ENUM64: size = type->size; goto resolved; @@ -3670,6 +3679,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, { const struct btf_enum *enums = btf_type_enum(t); struct btf *btf = env->btf; + const char *fmt_str; u16 i, nr_enums; u32 meta_needed; @@ -3683,11 +3693,6 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - if (t->size > 8 || !is_power_of_2(t->size)) { btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; @@ -3718,7 +3723,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, if (env->log.level == BPF_LOG_KERNEL) continue; - btf_verifier_log(env, "\t%s val=%d\n", + fmt_str = btf_type_kflag(t) ? "\t%s val=%d\n" : "\t%s val=%u\n"; + btf_verifier_log(env, fmt_str, __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } @@ -3759,7 +3765,10 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t, return; } - btf_show_type_value(show, "%d", v); + if (btf_type_kflag(t)) + btf_show_type_value(show, "%d", v); + else + btf_show_type_value(show, "%u", v); btf_show_end_type(show); } @@ -3772,6 +3781,109 @@ static struct btf_kind_operations enum_ops = { .show = btf_enum_show, }; +static s32 btf_enum64_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_enum64 *enums = btf_type_enum64(t); + struct btf *btf = env->btf; + const char *fmt_str; + u16 i, nr_enums; + u32 meta_needed; + + nr_enums = btf_type_vlen(t); + meta_needed = nr_enums * sizeof(*enums); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->size > 8 || !is_power_of_2(t->size)) { + btf_verifier_log_type(env, t, "Unexpected size"); + return -EINVAL; + } + + /* enum type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for (i = 0; i < nr_enums; i++) { + if (!btf_name_offset_valid(btf, enums[i].name_off)) { + btf_verifier_log(env, "\tInvalid name_offset:%u", + enums[i].name_off); + return -EINVAL; + } + + /* enum member must have a valid name */ + if (!enums[i].name_off || + !btf_name_valid_identifier(btf, enums[i].name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + if (env->log.level == BPF_LOG_KERNEL) + continue; + + fmt_str = btf_type_kflag(t) ? "\t%s val=%lld\n" : "\t%s val=%llu\n"; + btf_verifier_log(env, fmt_str, + __btf_name_by_offset(btf, enums[i].name_off), + btf_enum64_value(enums + i)); + } + + return meta_needed; +} + +static void btf_enum64_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) +{ + const struct btf_enum64 *enums = btf_type_enum64(t); + u32 i, nr_enums = btf_type_vlen(t); + void *safe_data; + s64 v; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; + + v = *(u64 *)safe_data; + + for (i = 0; i < nr_enums; i++) { + if (v != btf_enum64_value(enums + i)) + continue; + + btf_show_type_value(show, "%s", + __btf_name_by_offset(btf, + enums[i].name_off)); + + btf_show_end_type(show); + return; + } + + if (btf_type_kflag(t)) + btf_show_type_value(show, "%lld", v); + else + btf_show_type_value(show, "%llu", v); + btf_show_end_type(show); +} + +static struct btf_kind_operations enum64_ops = { + .check_meta = btf_enum64_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_enum_check_member, + .check_kflag_member = btf_enum_check_kflag_member, + .log_details = btf_enum_log, + .show = btf_enum64_show, +}; + static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -4438,6 +4550,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = &float_ops, [BTF_KIND_DECL_TAG] = &decl_tag_ops, [BTF_KIND_TYPE_TAG] = &modifier_ops, + [BTF_KIND_ENUM64] = &enum64_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -5255,6 +5368,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (arg == nr_args) { switch (prog->expected_attach_type) { + case BPF_LSM_CGROUP: case BPF_LSM_MAC: case BPF_TRACE_FEXIT: /* When LSM programs are attached to void LSM hooks @@ -5304,7 +5418,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_small_int(t) || btf_type_is_enum(t)) + if (btf_type_is_small_int(t) || btf_is_any_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -5768,7 +5882,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); - if (btf_type_is_int(t) || btf_type_is_enum(t)) + if (btf_type_is_int(t) || btf_is_any_enum(t)) return t->size; *bad_type = t; return -EINVAL; @@ -5916,7 +6030,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log, * to context only. And only global functions can be replaced. * Hence type check only those types. */ - if (btf_type_is_int(t1) || btf_type_is_enum(t1)) + if (btf_type_is_int(t1) || btf_is_any_enum(t1)) continue; if (!btf_type_is_ptr(t1)) { bpf_log(log, @@ -6414,7 +6528,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (!btf_type_is_int(t) && !btf_type_is_enum(t)) { + if (!btf_type_is_int(t) && !btf_is_any_enum(t)) { bpf_log(log, "Global function %s() doesn't return scalar. Only those are supported.\n", tname); @@ -6429,7 +6543,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) { + if (btf_type_is_int(t) || btf_is_any_enum(t)) { reg->type = SCALAR_VALUE; continue; } @@ -7308,95 +7422,15 @@ EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs); #define MAX_TYPES_ARE_COMPAT_DEPTH 2 -static -int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, - const struct btf *targ_btf, __u32 targ_id, - int level) -{ - const struct btf_type *local_type, *targ_type; - int depth = 32; /* max recursion depth */ - - /* caller made sure that names match (ignoring flavor suffix) */ - local_type = btf_type_by_id(local_btf, local_id); - targ_type = btf_type_by_id(targ_btf, targ_id); - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; - -recur: - depth--; - if (depth < 0) - return -EINVAL; - - local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id); - targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id); - if (!local_type || !targ_type) - return -EINVAL; - - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; - - switch (btf_kind(local_type)) { - case BTF_KIND_UNKN: - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: - case BTF_KIND_ENUM: - case BTF_KIND_FWD: - return 1; - case BTF_KIND_INT: - /* just reject deprecated bitfield-like integers; all other - * integers are by default compatible between each other - */ - return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; - case BTF_KIND_PTR: - local_id = local_type->type; - targ_id = targ_type->type; - goto recur; - case BTF_KIND_ARRAY: - local_id = btf_array(local_type)->type; - targ_id = btf_array(targ_type)->type; - goto recur; - case BTF_KIND_FUNC_PROTO: { - struct btf_param *local_p = btf_params(local_type); - struct btf_param *targ_p = btf_params(targ_type); - __u16 local_vlen = btf_vlen(local_type); - __u16 targ_vlen = btf_vlen(targ_type); - int i, err; - - if (local_vlen != targ_vlen) - return 0; - - for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { - if (level <= 0) - return -EINVAL; - - btf_type_skip_modifiers(local_btf, local_p->type, &local_id); - btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id); - err = __bpf_core_types_are_compat(local_btf, local_id, - targ_btf, targ_id, - level - 1); - if (err <= 0) - return err; - } - - /* tail recurse for return type check */ - btf_type_skip_modifiers(local_btf, local_type->type, &local_id); - btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id); - goto recur; - } - default: - return 0; - } -} - /* Check local and target types for compatibility. This check is used for * type-based CO-RE relocations and follow slightly different rules than * field-based relocations. This function assumes that root types were already * checked for name match. Beyond that initial root-level name check, names * are completely ignored. Compatibility rules are as follows: - * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but * kind should match for local and target types (i.e., STRUCT is not * compatible with UNION); - * - for ENUMs, the size is ignored; + * - for ENUMs/ENUM64s, the size is ignored; * - for INT, size and signedness are ignored; * - for ARRAY, dimensionality is ignored, element types are checked for * compatibility recursively; @@ -7410,11 +7444,19 @@ recur: int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { - return __bpf_core_types_are_compat(local_btf, local_id, - targ_btf, targ_id, + return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, MAX_TYPES_ARE_COMPAT_DEPTH); } +#define MAX_TYPES_MATCH_DEPTH 2 + +int bpf_core_types_match(const struct btf *local_btf, u32 local_id, + const struct btf *targ_btf, u32 targ_id) +{ + return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, + MAX_TYPES_MATCH_DEPTH); +} + static bool bpf_core_is_flavor_sep(const char *s) { /* check X___Y name pattern, where X and Y are not underscores */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index afb414b26d01..59b7eb60d5b4 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -14,6 +14,8 @@ #include <linux/string.h> #include <linux/bpf.h> #include <linux/bpf-cgroup.h> +#include <linux/bpf_lsm.h> +#include <linux/bpf_verifier.h> #include <net/sock.h> #include <net/bpf_sk_storage.h> @@ -61,6 +63,132 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, return run_ctx.retval; } +unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct sock *sk; + struct cgroup *cgrp; + int ret = 0; + u64 *args; + + args = (u64 *)ctx; + sk = (void *)(unsigned long)args[0]; + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct socket *sock; + struct cgroup *cgrp; + int ret = 0; + u64 *args; + + args = (u64 *)ctx; + sock = (void *)(unsigned long)args[0]; + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct cgroup *cgrp; + int ret = 0; + + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */ + cgrp = task_dfl_cgroup(current); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +#ifdef CONFIG_BPF_LSM +struct cgroup_lsm_atype { + u32 attach_btf_id; + int refcnt; +}; + +static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; + +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + int i; + + lockdep_assert_held(&cgroup_mutex); + + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + + for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) + if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id) + return CGROUP_LSM_START + i; + + for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) + if (cgroup_lsm_atype[i].attach_btf_id == 0) + return CGROUP_LSM_START + i; + + return -E2BIG; + +} + +void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) +{ + int i = cgroup_atype - CGROUP_LSM_START; + + lockdep_assert_held(&cgroup_mutex); + + WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id && + cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); + + cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; + cgroup_lsm_atype[i].refcnt++; +} + +void bpf_cgroup_atype_put(int cgroup_atype) +{ + int i = cgroup_atype - CGROUP_LSM_START; + + mutex_lock(&cgroup_mutex); + if (--cgroup_lsm_atype[i].refcnt <= 0) + cgroup_lsm_atype[i].attach_btf_id = 0; + WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); + mutex_unlock(&cgroup_mutex); +} +#else +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_LSM */ + void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); @@ -157,15 +285,22 @@ static void cgroup_bpf_release(struct work_struct *work) mutex_lock(&cgroup_mutex); for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { - struct list_head *progs = &cgrp->bpf.progs[atype]; - struct bpf_prog_list *pl, *pltmp; + struct hlist_head *progs = &cgrp->bpf.progs[atype]; + struct bpf_prog_list *pl; + struct hlist_node *pltmp; - list_for_each_entry_safe(pl, pltmp, progs, node) { - list_del(&pl->node); - if (pl->prog) + hlist_for_each_entry_safe(pl, pltmp, progs, node) { + hlist_del(&pl->node); + if (pl->prog) { + if (pl->prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->prog); bpf_prog_put(pl->prog); - if (pl->link) + } + if (pl->link) { + if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog); bpf_cgroup_link_auto_detach(pl->link); + } kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key[atype]); } @@ -217,12 +352,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) /* count number of elements in the list. * it's slow but the list cannot be long */ -static u32 prog_list_length(struct list_head *head) +static u32 prog_list_length(struct hlist_head *head) { struct bpf_prog_list *pl; u32 cnt = 0; - list_for_each_entry(pl, head, node) { + hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; cnt++; @@ -291,7 +426,7 @@ static int compute_effective_progs(struct cgroup *cgrp, if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - list_for_each_entry(pl, &p->bpf.progs[atype], node) { + hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; @@ -342,7 +477,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) cgroup_bpf_get(p); for (i = 0; i < NR; i++) - INIT_LIST_HEAD(&cgrp->bpf.progs[i]); + INIT_HLIST_HEAD(&cgrp->bpf.progs[i]); INIT_LIST_HEAD(&cgrp->bpf.storages); @@ -418,7 +553,7 @@ cleanup: #define BPF_CGROUP_MAX_PROGS 64 -static struct bpf_prog_list *find_attach_entry(struct list_head *progs, +static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, struct bpf_prog *replace_prog, @@ -428,12 +563,12 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, /* single-attach case */ if (!allow_multi) { - if (list_empty(progs)) + if (hlist_empty(progs)) return NULL; - return list_first_entry(progs, typeof(*pl), node); + return hlist_entry(progs->first, typeof(*pl), node); } - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (prog && pl->prog == prog && prog != replace_prog) /* disallow attaching the same prog twice */ return ERR_PTR(-EINVAL); @@ -444,7 +579,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, /* direct prog multi-attach w/ replacement case */ if (replace_prog) { - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->prog == replace_prog) /* a match found */ return pl; @@ -478,9 +613,10 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_prog *new_prog = prog ? : link->link.prog; enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || @@ -494,7 +630,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL; - atype = to_cgroup_bpf_attach_type(type); + atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -503,7 +639,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (!hierarchy_allows_attach(cgrp, atype)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) + if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -525,12 +661,22 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (pl) { old_prog = pl->prog; } else { + struct hlist_node *last = NULL; + pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { bpf_cgroup_storages_free(new_storage); return -ENOMEM; } - list_add_tail(&pl->node, progs); + if (hlist_empty(progs)) + hlist_add_head(&pl->node, progs); + else + hlist_for_each(last, progs) { + if (last->next) + continue; + hlist_add_behind(&pl->node, last); + break; + } } pl->prog = prog; @@ -538,17 +684,30 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; + if (type == BPF_LSM_CGROUP) { + err = bpf_trampoline_link_cgroup_shim(new_prog, atype); + if (err) + goto cleanup; + } + err = update_effective_progs(cgrp, atype); if (err) - goto cleanup; + goto cleanup_trampoline; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); - else + } else { static_branch_inc(&cgroup_bpf_enabled_key[atype]); + } bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; +cleanup_trampoline: + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(new_prog); + cleanup: if (old_prog) { pl->prog = old_prog; @@ -556,7 +715,7 @@ cleanup: } bpf_cgroup_storages_free(new_storage); if (!old_prog) { - list_del(&pl->node); + hlist_del(&pl->node); kfree(pl); } return err; @@ -587,7 +746,7 @@ static void replace_effective_prog(struct cgroup *cgrp, struct cgroup_subsys_state *css; struct bpf_prog_array *progs; struct bpf_prog_list *pl; - struct list_head *head; + struct hlist_head *head; struct cgroup *cg; int pos; @@ -603,7 +762,7 @@ static void replace_effective_prog(struct cgroup *cgrp, continue; head = &cg->bpf.progs[atype]; - list_for_each_entry(pl, head, node) { + hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; if (pl->link == link) @@ -637,10 +796,10 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; bool found = false; - atype = to_cgroup_bpf_attach_type(link->type); + atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -649,7 +808,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, if (link->link.prog->type != new_prog->type) return -EINVAL; - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->link == link) { found = true; break; @@ -688,7 +847,7 @@ out_unlock: return ret; } -static struct bpf_prog_list *find_detach_entry(struct list_head *progs, +static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, bool allow_multi) @@ -696,14 +855,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, struct bpf_prog_list *pl; if (!allow_multi) { - if (list_empty(progs)) + if (hlist_empty(progs)) /* report error when trying to detach and nothing is attached */ return ERR_PTR(-ENOENT); /* to maintain backward compatibility NONE and OVERRIDE cgroups * allow detaching with invalid FD (prog==NULL) in legacy mode */ - return list_first_entry(progs, typeof(*pl), node); + return hlist_entry(progs->first, typeof(*pl), node); } if (!prog && !link) @@ -713,7 +872,7 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, return ERR_PTR(-EINVAL); /* find the prog or link and detach it */ - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->prog == prog && pl->link == link) return pl; } @@ -721,6 +880,60 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, } /** + * purge_effective_progs() - After compute_effective_progs fails to alloc new + * cgrp->bpf.inactive table we can recover by + * recomputing the array in place. + * + * @cgrp: The cgroup which descendants to travers + * @prog: A program to detach or NULL + * @link: A link to detach or NULL + * @atype: Type of detach operation + */ +static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_cgroup_link *link, + enum cgroup_bpf_attach_type atype) +{ + struct cgroup_subsys_state *css; + struct bpf_prog_array *progs; + struct bpf_prog_list *pl; + struct hlist_head *head; + struct cgroup *cg; + int pos; + + /* recompute effective prog array in place */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + + /* find position of link or prog in effective progs array */ + for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { + if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) + continue; + + head = &cg->bpf.progs[atype]; + hlist_for_each_entry(pl, head, node) { + if (!prog_list_prog(pl)) + continue; + if (pl->prog == prog && pl->link == link) + goto found; + pos++; + } + } +found: + BUG_ON(!cg); + progs = rcu_dereference_protected( + desc->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + + /* Remove the program from the array */ + WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos), + "Failed to purge a prog from array at index %d", pos); + } +} + +/** * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse @@ -737,11 +950,16 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; + u32 attach_btf_id = 0; u32 flags; - int err; - atype = to_cgroup_bpf_attach_type(type); + if (prog) + attach_btf_id = prog->aux->attach_btf_id; + if (link) + attach_btf_id = link->link.prog->aux->attach_btf_id; + + atype = bpf_cgroup_atype_find(type, attach_btf_id); if (atype < 0) return -EINVAL; @@ -761,26 +979,27 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; pl->link = NULL; - err = update_effective_progs(cgrp, atype); - if (err) - goto cleanup; + if (update_effective_progs(cgrp, atype)) { + /* if update effective array failed replace the prog with a dummy prog*/ + pl->prog = old_prog; + pl->link = link; + purge_effective_progs(cgrp, old_prog, link, atype); + } /* now can actually delete it from this cgroup list */ - list_del(&pl->node); + hlist_del(&pl->node); + kfree(pl); - if (list_empty(progs)) + if (hlist_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[atype] = 0; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); + } static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; - -cleanup: - /* restore back prog or link */ - pl->prog = old_prog; - pl->link = link; - return err; } static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, @@ -798,57 +1017,90 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { + __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; + enum cgroup_bpf_attach_type from_atype, to_atype; enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; - struct list_head *progs; - struct bpf_prog *prog; int cnt, ret = 0, i; + int total_cnt = 0; u32 flags; - atype = to_cgroup_bpf_attach_type(type); - if (atype < 0) - return -EINVAL; - - progs = &cgrp->bpf.progs[atype]; - flags = cgrp->bpf.flags[atype]; + if (type == BPF_LSM_CGROUP) { + if (attr->query.prog_cnt && prog_ids && !prog_attach_flags) + return -EINVAL; - effective = rcu_dereference_protected(cgrp->bpf.effective[atype], - lockdep_is_held(&cgroup_mutex)); + from_atype = CGROUP_LSM_START; + to_atype = CGROUP_LSM_END; + flags = 0; + } else { + from_atype = to_cgroup_bpf_attach_type(type); + if (from_atype < 0) + return -EINVAL; + to_atype = from_atype; + flags = cgrp->bpf.flags[from_atype]; + } - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) - cnt = bpf_prog_array_length(effective); - else - cnt = prog_list_length(progs); + for (atype = from_atype; atype <= to_atype; atype++) { + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + total_cnt += bpf_prog_array_length(effective); + } else { + total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); + } + } if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) return -EFAULT; - if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ return 0; - if (attr->query.prog_cnt < cnt) { - cnt = attr->query.prog_cnt; + + if (attr->query.prog_cnt < total_cnt) { + total_cnt = attr->query.prog_cnt; ret = -ENOSPC; } - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { - return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); - } else { - struct bpf_prog_list *pl; - u32 id; - - i = 0; - list_for_each_entry(pl, progs, node) { - prog = prog_list_prog(pl); - id = prog->aux->id; - if (copy_to_user(prog_ids + i, &id, sizeof(id))) - return -EFAULT; - if (++i == cnt) - break; + for (atype = from_atype; atype <= to_atype && total_cnt; atype++) { + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + cnt = min_t(int, bpf_prog_array_length(effective), total_cnt); + ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt); + } else { + struct hlist_head *progs; + struct bpf_prog_list *pl; + struct bpf_prog *prog; + u32 id; + + progs = &cgrp->bpf.progs[atype]; + cnt = min_t(int, prog_list_length(progs), total_cnt); + i = 0; + hlist_for_each_entry(pl, progs, node) { + prog = prog_list_prog(pl); + id = prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } } + + if (prog_attach_flags) { + flags = cgrp->bpf.flags[atype]; + + for (i = 0; i < cnt; i++) + if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags))) + return -EFAULT; + prog_attach_flags += cnt; + } + + prog_ids += cnt; + total_cnt -= cnt; } return ret; } @@ -937,6 +1189,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + if (cg_link->type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); cg = cg_link->cgroup; cg_link->cgroup = NULL; @@ -1281,7 +1535,7 @@ BPF_CALL_0(bpf_get_retval) return ctx->retval; } -static const struct bpf_func_proto bpf_get_retval_proto = { +const struct bpf_func_proto bpf_get_retval_proto = { .func = bpf_get_retval, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1296,7 +1550,7 @@ BPF_CALL_1(bpf_set_retval, int, retval) return 0; } -static const struct bpf_func_proto bpf_set_retval_proto = { +const struct bpf_func_proto bpf_set_retval_proto = { .func = bpf_set_retval, .gpl_only = false, .ret_type = RET_INTEGER, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5f6f3f829b36..bfeb9b937315 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -68,11 +68,13 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns { u8 *ptr = NULL; - if (k >= SKF_NET_OFF) + if (k >= SKF_NET_OFF) { ptr = skb_network_header(skb) + k - SKF_NET_OFF; - else if (k >= SKF_LL_OFF) + } else if (k >= SKF_LL_OFF) { + if (unlikely(!skb_mac_header_was_set(skb))) + return NULL; ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - + } if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; @@ -107,6 +109,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); +#ifdef CONFIG_CGROUP_BPF + aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID; +#endif INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); mutex_init(&fp->aux->used_maps_mutex); @@ -176,7 +181,7 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog) * here is relative to the prog itself instead of the main prog. * This array has one entry for each xlated bpf insn. * - * jited_off is the byte off to the last byte of the jited insn. + * jited_off is the byte off to the end of the jited insn. * * Hence, with * insn_start: @@ -2279,6 +2284,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs) kfree_rcu(progs, rcu); } +static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) +{ + struct bpf_prog_array *progs; + + progs = container_of(rcu, struct bpf_prog_array, rcu); + kfree_rcu(progs, rcu); +} + +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) +{ + if (!progs || progs == &bpf_empty_prog_array.hdr) + return; + call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb); +} + int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; @@ -2555,6 +2575,10 @@ static void bpf_prog_free_deferred(struct work_struct *work) #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); #endif +#ifdef CONFIG_CGROUP_BPF + if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) + bpf_cgroup_atype_put(aux->cgroup_atype); +#endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) @@ -2651,6 +2675,8 @@ const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; +const struct bpf_func_proto bpf_set_retval_proto __weak; +const struct bpf_func_proto bpf_get_retval_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { @@ -2714,6 +2740,12 @@ bool __weak bpf_jit_needs_zext(void) return false; } +/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ +bool __weak bpf_jit_supports_subprog_tailcalls(void) +{ + return false; +} + bool __weak bpf_jit_supports_kfunc_call(void) { return false; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 225806a02efb..1f961f9982d2 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -584,7 +584,7 @@ BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) return strncmp(s1, s2, s1_sz); } -const struct bpf_func_proto bpf_strncmp_proto = { +static const struct bpf_func_proto bpf_strncmp_proto = { .func = bpf_strncmp, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1402,7 +1402,7 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) */ #define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) -const struct bpf_func_proto bpf_kptr_xchg_proto = { +static const struct bpf_func_proto bpf_kptr_xchg_proto = { .func = bpf_kptr_xchg, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, @@ -1487,7 +1487,7 @@ error: return err; } -const struct bpf_func_proto bpf_dynptr_from_mem_proto = { +static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .func = bpf_dynptr_from_mem, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1497,11 +1497,12 @@ const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; -BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset) +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, + u32, offset, u64, flags) { int err; - if (!src->data) + if (!src->data || flags) return -EINVAL; err = bpf_dynptr_check_off_len(src, offset, len); @@ -1513,7 +1514,7 @@ BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src return 0; } -const struct bpf_func_proto bpf_dynptr_read_proto = { +static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1521,13 +1522,15 @@ const struct bpf_func_proto bpf_dynptr_read_proto = { .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_DYNPTR, .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len) +BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, + u32, len, u64, flags) { int err; - if (!dst->data || bpf_dynptr_is_rdonly(dst)) + if (!dst->data || flags || bpf_dynptr_is_rdonly(dst)) return -EINVAL; err = bpf_dynptr_check_off_len(dst, offset, len); @@ -1539,7 +1542,7 @@ BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, return 0; } -const struct bpf_func_proto bpf_dynptr_write_proto = { +static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1547,6 +1550,7 @@ const struct bpf_func_proto bpf_dynptr_write_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) @@ -1566,7 +1570,7 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len return (unsigned long)(ptr->data + ptr->offset + offset); } -const struct bpf_func_proto bpf_dynptr_data_proto = { +static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 3d897de89061..00b874c8e889 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -31,7 +31,7 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { node->next = head->first; - head->first = node; + WRITE_ONCE(head->first, node); } static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, @@ -130,14 +130,17 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); + if (!READ_ONCE(head->first)) + goto next_cpu; raw_spin_lock(&head->lock); node = head->first; if (node) { - head->first = node->next; + WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); +next_cpu: cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; @@ -146,10 +149,12 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) } /* per cpu lists are all empty, try extralist */ + if (!READ_ONCE(s->extralist.first)) + return NULL; raw_spin_lock(&s->extralist.lock); node = s->extralist.first; if (node) - s->extralist.first = node->next; + WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } @@ -164,15 +169,18 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); + if (!READ_ONCE(head->first)) + goto next_cpu; if (raw_spin_trylock(&head->lock)) { node = head->first; if (node) { - head->first = node->next; + WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); } +next_cpu: cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; @@ -181,11 +189,11 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) } /* cannot pop from per cpu lists, try extralist */ - if (!raw_spin_trylock(&s->extralist.lock)) + if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) return NULL; node = s->extralist.first; if (node) - s->extralist.first = node->next; + WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2b69306d3c6e..ab688d85b2c6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3416,6 +3416,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_SK_LOOKUP; case BPF_XDP: return BPF_PROG_TYPE_XDP; + case BPF_LSM_CGROUP: + return BPF_PROG_TYPE_LSM; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3469,6 +3471,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_LSM: + if (ptype == BPF_PROG_TYPE_LSM && + prog->expected_attach_type != BPF_LSM_CGROUP) + return -EINVAL; + ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; default: @@ -3506,13 +3513,14 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_LSM: return cgroup_bpf_prog_detach(attr, ptype); default: return -EINVAL; } } -#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt +#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -3548,6 +3556,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SYSCTL: case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: + case BPF_LSM_CGROUP: return cgroup_bpf_prog_query(attr, uattr); case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); @@ -4058,6 +4067,11 @@ static int bpf_prog_get_info_by_fd(struct file *file, if (prog->aux->btf) info.btf_id = btf_obj_id(prog->aux->btf); + info.attach_btf_id = prog->aux->attach_btf_id; + if (prog->aux->attach_btf) + info.attach_btf_obj_id = btf_obj_id(prog->aux->attach_btf); + else if (prog->aux->dst_prog) + info.attach_btf_obj_id = btf_obj_id(prog->aux->dst_prog->aux->attach_btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; @@ -4090,14 +4104,15 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { + unsigned long line_addr; __u64 __user *user_linfo; u32 i; user_linfo = u64_to_user_ptr(info.jited_line_info); ulen = min_t(u32, info.nr_jited_line_info, ulen); for (i = 0; i < ulen; i++) { - if (put_user((__u64)(long)prog->aux->jited_linfo[i], - &user_linfo[i])) + line_addr = (unsigned long)prog->aux->jited_linfo[i]; + if (put_user((__u64)line_addr, &user_linfo[i])) return -EFAULT; } } else { @@ -4539,6 +4554,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_raw_tp_link_attach(prog, NULL); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); + else if (prog->expected_attach_type == BPF_LSM_CGROUP) + ret = cgroup_bpf_link_attach(attr, prog); else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, @@ -5130,7 +5147,7 @@ BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flag return *res ? 0 : -ENOENT; } -const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { +static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .func = bpf_kallsyms_lookup_name, .gpl_only = false, .ret_type = RET_INTEGER, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 93c7675f0c9e..6cd226584c33 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -11,6 +11,8 @@ #include <linux/rcupdate_wait.h> #include <linux/module.h> #include <linux/static_call.h> +#include <linux/bpf_verifier.h> +#include <linux/bpf_lsm.h> /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -410,7 +412,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; @@ -418,44 +420,33 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline int cnt = 0, i; kind = bpf_attach_type_to_tramp(link->link.prog); - mutex_lock(&tr->mutex); - if (tr->extension_prog) { + if (tr->extension_prog) /* cannot attach fentry/fexit if extension prog is attached. * cannot overwrite extension prog either. */ - err = -EBUSY; - goto out; - } + return -EBUSY; for (i = 0; i < BPF_TRAMP_MAX; i++) cnt += tr->progs_cnt[i]; if (kind == BPF_TRAMP_REPLACE) { /* Cannot attach extension if fentry/fexit are in use. */ - if (cnt) { - err = -EBUSY; - goto out; - } + if (cnt) + return -EBUSY; tr->extension_prog = link->link.prog; - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, - link->link.prog->bpf_func); - goto out; - } - if (cnt >= BPF_MAX_TRAMP_LINKS) { - err = -E2BIG; - goto out; + return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, + link->link.prog->bpf_func); } - if (!hlist_unhashed(&link->tramp_hlist)) { + if (cnt >= BPF_MAX_TRAMP_LINKS) + return -E2BIG; + if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ - err = -EBUSY; - goto out; - } + return -EBUSY; hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { if (link_exiting->link.prog != link->link.prog) continue; /* prog already linked */ - err = -EBUSY; - goto out; + return -EBUSY; } hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); @@ -465,34 +456,220 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; } -out: + return err; +} + +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +{ + int err; + + mutex_lock(&tr->mutex); + err = __bpf_trampoline_link_prog(link, tr); mutex_unlock(&tr->mutex); return err; } -/* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; int err; kind = bpf_attach_type_to_tramp(link->link.prog); - mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; - goto out; + return err; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; - err = bpf_trampoline_update(tr); -out: + return bpf_trampoline_update(tr); +} + +/* bpf_trampoline_unlink_prog() should never fail. */ +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +{ + int err; + + mutex_lock(&tr->mutex); + err = __bpf_trampoline_unlink_prog(link, tr); + mutex_unlock(&tr->mutex); + return err; +} + +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) +static void bpf_shim_tramp_link_release(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */ + if (!shim_link->trampoline) + return; + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline)); + bpf_trampoline_put(shim_link->trampoline); +} + +static void bpf_shim_tramp_link_dealloc(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + kfree(shim_link); +} + +static const struct bpf_link_ops bpf_shim_tramp_link_lops = { + .release = bpf_shim_tramp_link_release, + .dealloc = bpf_shim_tramp_link_dealloc, +}; + +static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, + bpf_func_t bpf_func, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_prog *p; + + shim_link = kzalloc(sizeof(*shim_link), GFP_USER); + if (!shim_link) + return NULL; + + p = bpf_prog_alloc(1, 0); + if (!p) { + kfree(shim_link); + return NULL; + } + + p->jited = false; + p->bpf_func = bpf_func; + + p->aux->cgroup_atype = cgroup_atype; + p->aux->attach_func_proto = prog->aux->attach_func_proto; + p->aux->attach_btf_id = prog->aux->attach_btf_id; + p->aux->attach_btf = prog->aux->attach_btf; + btf_get(p->aux->attach_btf); + p->type = BPF_PROG_TYPE_LSM; + p->expected_attach_type = BPF_LSM_MAC; + bpf_prog_inc(p); + bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, + &bpf_shim_tramp_link_lops, p); + bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); + + return shim_link; +} + +static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, + bpf_func_t bpf_func) +{ + struct bpf_tramp_link *link; + int kind; + + for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { + hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { + struct bpf_prog *p = link->link.prog; + + if (p->bpf_func == bpf_func) + return container_of(link, struct bpf_shim_tramp_link, link); + } + } + + return NULL; +} + +int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_attach_target_info tgt_info = {}; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + int err; + + err = bpf_check_attach_target(NULL, prog, NULL, + prog->aux->attach_btf_id, + &tgt_info); + if (err) + return err; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) + return -ENOMEM; + + mutex_lock(&tr->mutex); + + shim_link = cgroup_shim_find(tr, bpf_func); + if (shim_link) { + /* Reusing existing shim attached by the other program. */ + bpf_link_inc(&shim_link->link.link); + + mutex_unlock(&tr->mutex); + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + return 0; + } + + /* Allocate and install new shim. */ + + shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); + if (!shim_link) { + err = -ENOMEM; + goto err; + } + + err = __bpf_trampoline_link_prog(&shim_link->link, tr); + if (err) + goto err; + + shim_link->trampoline = tr; + /* note, we're still holding tr refcnt from above */ + mutex_unlock(&tr->mutex); + + return 0; +err: + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + /* have to release tr while _not_ holding its mutex */ + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + return err; } +void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_lookup(key); + if (WARN_ON_ONCE(!tr)) + return; + + mutex_lock(&tr->mutex); + shim_link = cgroup_shim_find(tr, bpf_func); + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */ +} +#endif + struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { @@ -625,6 +802,31 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_ rcu_read_unlock(); } +u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) + __acquires(RCU) +{ + /* Runtime stats are exported via actual BPF_LSM_CGROUP + * programs, not the shims. + */ + rcu_read_lock(); + migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return NO_START_TIME; +} + +void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) + __releases(RCU) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + migrate_enable(); + rcu_read_unlock(); +} + u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0efbac0fd126..328cfab3af60 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5847,6 +5847,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_arg_type arg_type = fn->arg_type[arg]; enum bpf_reg_type type = reg->type; + u32 *arg_btf_id = NULL; int err = 0; if (arg_type == ARG_DONTCARE) @@ -5883,7 +5884,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, */ goto skip_type_check; - err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg], meta); + /* arg_btf_id and arg_size are in a union. */ + if (base_type(arg_type) == ARG_PTR_TO_BTF_ID) + arg_btf_id = fn->arg_btf_id[arg]; + + err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); if (err) return err; @@ -6010,6 +6015,11 @@ skip_type_check: * next is_mem_size argument below. */ meta->raw_mode = arg_type & MEM_UNINIT; + if (arg_type & MEM_FIXED_SIZE) { + err = check_helper_mem_access(env, regno, + fn->arg_size[arg], false, + meta); + } } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); @@ -6143,7 +6153,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env) { - return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64); + return env->prog->jit_requested && + bpf_jit_supports_subprog_tailcalls(); } static int check_map_func_compatibility(struct bpf_verifier_env *env, @@ -6399,11 +6410,19 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn) return count <= 1; } -static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, - enum bpf_arg_type arg_next) +static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg) { - return (base_type(arg_curr) == ARG_PTR_TO_MEM) != - arg_type_is_mem_size(arg_next); + bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE; + bool has_size = fn->arg_size[arg] != 0; + bool is_next_size = false; + + if (arg + 1 < ARRAY_SIZE(fn->arg_type)) + is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]); + + if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM) + return is_next_size; + + return has_size == is_next_size || is_next_size == is_fixed; } static bool check_arg_pair_ok(const struct bpf_func_proto *fn) @@ -6414,11 +6433,11 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) * helper function specification. */ if (arg_type_is_mem_size(fn->arg1_type) || - base_type(fn->arg5_type) == ARG_PTR_TO_MEM || - check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || - check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || - check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || - check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) + check_args_pair_invalid(fn, 0) || + check_args_pair_invalid(fn, 1) || + check_args_pair_invalid(fn, 2) || + check_args_pair_invalid(fn, 3) || + check_args_pair_invalid(fn, 4)) return false; return true; @@ -6459,7 +6478,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) return false; - if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] && + /* arg_btf_id and arg_size are in a union. */ + (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM || + !(fn->arg_type[i] & MEM_FIXED_SIZE))) return false; } @@ -7100,6 +7122,41 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +{ + return &env->insn_aux_data[env->insn_idx]; +} + +static bool loop_flag_is_zero(struct bpf_verifier_env *env) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = ®s[BPF_REG_4]; + bool reg_is_null = register_is_null(reg); + + if (reg_is_null) + mark_chain_precision(env, BPF_REG_4); + + return reg_is_null; +} + +static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno) +{ + struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state; + + if (!state->initialized) { + state->initialized = 1; + state->fit_for_inline = loop_flag_is_zero(env); + state->callback_subprogno = subprogno; + return; + } + + if (!state->fit_for_inline) + return; + + state->fit_for_inline = (loop_flag_is_zero(env) && + state->callback_subprogno == subprogno); +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -7252,6 +7309,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = check_bpf_snprintf_call(env, regs); break; case BPF_FUNC_loop: + update_loop_inline_state(env, meta.subprogno); err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_loop_callback_state); break; @@ -7261,6 +7319,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn reg_type_str(env, regs[BPF_REG_1].type)); return -EACCES; } + break; + case BPF_FUNC_set_retval: + if (env->prog->expected_attach_type == BPF_LSM_CGROUP) { + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); + return -EINVAL; + } + } + break; } if (err) @@ -7658,11 +7728,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } -static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) -{ - return &env->insn_aux_data[env->insn_idx]; -} - enum { REASON_BOUNDS = -1, REASON_TYPE = -2, @@ -9033,7 +9098,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (opcode == BPF_END || opcode == BPF_NEG) { if (opcode == BPF_NEG) { - if (BPF_SRC(insn->code) != 0 || + if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { verbose(env, "BPF_NEG uses reserved fields\n"); @@ -10360,11 +10425,21 @@ static int check_return_code(struct bpf_verifier_env *env) const bool is_subprog = frame->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ - if (!is_subprog && - (prog_type == BPF_PROG_TYPE_STRUCT_OPS || - prog_type == BPF_PROG_TYPE_LSM) && - !prog->aux->attach_func_proto->type) - return 0; + if (!is_subprog) { + switch (prog_type) { + case BPF_PROG_TYPE_LSM: + if (prog->expected_attach_type == BPF_LSM_CGROUP) + /* See below, can be 0 or 0-1 depending on hook. */ + break; + fallthrough; + case BPF_PROG_TYPE_STRUCT_OPS: + if (!prog->aux->attach_func_proto->type) + return 0; + break; + default: + break; + } + } /* eBPF calling convention is such that R0 is used * to return the value from eBPF program. @@ -10455,6 +10530,22 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_SK_LOOKUP: range = tnum_range(SK_DROP, SK_PASS); break; + + case BPF_PROG_TYPE_LSM: + if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { + /* Regular BPF_PROG_TYPE_LSM programs can return + * any value. + */ + return 0; + } + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + range = tnum_range(1, 1); + } + break; + case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. @@ -10471,6 +10562,10 @@ static int check_return_code(struct bpf_verifier_env *env) if (!tnum_in(range, reg->var_off)) { verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); + if (prog->expected_attach_type == BPF_LSM_CGROUP && + prog_type == BPF_PROG_TYPE_LSM && + !prog->aux->attach_func_proto->type) + verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; } @@ -10882,7 +10977,7 @@ static int check_btf_func(struct bpf_verifier_env *env, goto err_free; ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); scalar_return = - btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type); + btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type); if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); goto err_free; @@ -14275,6 +14370,142 @@ patch_call_imm: return 0; } +static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, + int position, + s32 stack_base, + u32 callback_subprogno, + u32 *cnt) +{ + s32 r6_offset = stack_base + 0 * BPF_REG_SIZE; + s32 r7_offset = stack_base + 1 * BPF_REG_SIZE; + s32 r8_offset = stack_base + 2 * BPF_REG_SIZE; + int reg_loop_max = BPF_REG_6; + int reg_loop_cnt = BPF_REG_7; + int reg_loop_ctx = BPF_REG_8; + + struct bpf_prog *new_prog; + u32 callback_start; + u32 call_insn_offset; + s32 callback_offset; + + /* This represents an inlined version of bpf_iter.c:bpf_loop, + * be careful to modify this code in sync. + */ + struct bpf_insn insn_buf[] = { + /* Return error and jump to the end of the patch if + * expected number of iterations is too big. + */ + BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2), + BPF_MOV32_IMM(BPF_REG_0, -E2BIG), + BPF_JMP_IMM(BPF_JA, 0, 0, 16), + /* spill R6, R7, R8 to use these as loop vars */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset), + /* initialize loop vars */ + BPF_MOV64_REG(reg_loop_max, BPF_REG_1), + BPF_MOV32_IMM(reg_loop_cnt, 0), + BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3), + /* loop header, + * if reg_loop_cnt >= reg_loop_max skip the loop body + */ + BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5), + /* callback call, + * correct callback offset would be set after patching + */ + BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt), + BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx), + BPF_CALL_REL(0), + /* increment loop counter */ + BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1), + /* jump to loop header if callback returned 0 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6), + /* return value of bpf_loop, + * set R0 to the number of iterations + */ + BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt), + /* restore original values of R6, R7, R8 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset), + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset), + BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset), + }; + + *cnt = ARRAY_SIZE(insn_buf); + new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt); + if (!new_prog) + return new_prog; + + /* callback start is known only after patching */ + callback_start = env->subprog_info[callback_subprogno].start; + /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */ + call_insn_offset = position + 12; + callback_offset = callback_start - call_insn_offset - 1; + new_prog->insnsi[call_insn_offset].imm = callback_offset; + + return new_prog; +} + +static bool is_bpf_loop_call(struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && + insn->imm == BPF_FUNC_loop; +} + +/* For all sub-programs in the program (including main) check + * insn_aux_data to see if there are bpf_loop calls that require + * inlining. If such calls are found the calls are replaced with a + * sequence of instructions produced by `inline_bpf_loop` function and + * subprog stack_depth is increased by the size of 3 registers. + * This stack space is used to spill values of the R6, R7, R8. These + * registers are used to store the loop bound, counter and context + * variables. + */ +static int optimize_bpf_loop(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprogs = env->subprog_info; + int i, cur_subprog = 0, cnt, delta = 0; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + u16 stack_depth = subprogs[cur_subprog].stack_depth; + u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; + u16 stack_depth_extra = 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + struct bpf_loop_inline_state *inline_state = + &env->insn_aux_data[i + delta].loop_inline_state; + + if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { + struct bpf_prog *new_prog; + + stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup; + new_prog = inline_bpf_loop(env, + i + delta, + -(stack_depth + stack_depth_extra), + inline_state->callback_subprogno, + &cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + } + + if (subprogs[cur_subprog + 1].start == i + delta + 1) { + subprogs[cur_subprog].stack_depth += stack_depth_extra; + cur_subprog++; + stack_depth = subprogs[cur_subprog].stack_depth; + stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; + stack_depth_extra = 0; + } + } + + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; + + return 0; +} + static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl, *sln; @@ -14694,6 +14925,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, fallthrough; case BPF_MODIFY_RETURN: case BPF_LSM_MAC: + case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { @@ -14810,8 +15042,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_LSM) { - verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); + prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) { + verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n"); return -EINVAL; } @@ -15012,6 +15244,9 @@ skip_full_check: ret = check_max_stack_depth(env); /* instruction rewrites happen after this point */ + if (ret == 0) + ret = optimize_bpf_loop(env); + if (is_priv) { if (ret == 0) opt_hard_wire_dead_code_branches(env); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1779ccddb734..13c8e91d7862 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -765,7 +765,8 @@ struct css_set init_css_set = { .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), - .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), + .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), + .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), /* @@ -1240,7 +1241,8 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); - INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_src_preload_node); + INIT_LIST_HEAD(&cset->mg_dst_preload_node); INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in @@ -2597,21 +2599,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) */ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { - LIST_HEAD(preloaded); struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); - list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, + mg_src_preload_node) { + cset->mg_src_cgrp = NULL; + cset->mg_dst_cgrp = NULL; + cset->mg_dst_cset = NULL; + list_del_init(&cset->mg_src_preload_node); + put_css_set_locked(cset); + } - list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, + mg_dst_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; - list_del_init(&cset->mg_preload_node); + list_del_init(&cset->mg_dst_preload_node); put_css_set_locked(cset); } @@ -2651,7 +2659,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, if (src_cset->dead) return; - if (!list_empty(&src_cset->mg_preload_node)) + if (!list_empty(&src_cset->mg_src_preload_node)) return; src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); @@ -2664,7 +2672,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); - list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); + list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); } /** @@ -2689,7 +2697,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, - mg_preload_node) { + mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid; @@ -2708,7 +2716,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; - list_del_init(&src_cset->mg_preload_node); + list_del_init(&src_cset->mg_src_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; @@ -2716,8 +2724,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) src_cset->mg_dst_cset = dst_cset; - if (list_empty(&dst_cset->mg_preload_node)) - list_add_tail(&dst_cset->mg_preload_node, + if (list_empty(&dst_cset->mg_dst_preload_node)) + list_add_tail(&dst_cset->mg_dst_preload_node, &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); @@ -2963,7 +2971,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { + list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, + mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 80782cddb1da..07d51f95d6a6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6253,10 +6253,10 @@ again: if (!atomic_inc_not_zero(&event->rb->mmap_count)) { /* - * Raced against perf_mmap_close() through - * perf_event_set_output(). Try again, hope for better - * luck. + * Raced against perf_mmap_close(); remove the + * event and try again. */ + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); goto again; } @@ -10068,26 +10068,30 @@ static inline bool perf_event_is_tracing(struct perf_event *event) int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { - bool is_kprobe, is_tracepoint, is_syscall_tp; + bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); - is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; + is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE; + is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); - if (!is_kprobe && !is_tracepoint && !is_syscall_tp) + if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; - if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || + if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return -EINVAL; + if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe) + /* only uprobe programs are allowed to be sleepable */ + return -EINVAL; + /* Kprobe override only works for kprobes, not uprobes. */ - if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) + if (prog->kprobe_override && !is_kprobe) return -EINVAL; if (is_tracepoint || is_syscall_tp) { @@ -11825,14 +11829,25 @@ err_size: goto out; } +static void mutex_lock_double(struct mutex *a, struct mutex *b) +{ + if (b < a) + swap(a, b); + + mutex_lock(a); + mutex_lock_nested(b, SINGLE_DEPTH_NESTING); +} + static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { struct perf_buffer *rb = NULL; int ret = -EINVAL; - if (!output_event) + if (!output_event) { + mutex_lock(&event->mmap_mutex); goto set; + } /* don't allow circular references */ if (event == output_event) @@ -11870,8 +11885,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) event->pmu != output_event->pmu) goto out; + /* + * Hold both mmap_mutex to serialize against perf_mmap_close(). Since + * output_event is already on rb->event_list, and the list iteration + * restarts after every removal, it is guaranteed this new event is + * observed *OR* if output_event is already removed, it's guaranteed we + * observe !rb->mmap_count. + */ + mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: - mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ if (atomic_read(&event->mmap_count)) goto unlock; @@ -11881,6 +11903,12 @@ set: rb = ring_buffer_get(output_event); if (!rb) goto unlock; + + /* did we race against perf_mmap_close() */ + if (!atomic_read(&rb->mmap_count)) { + ring_buffer_put(rb); + goto unlock; + } } ring_buffer_attach(event, rb); @@ -11888,20 +11916,13 @@ set: ret = 0; unlock: mutex_unlock(&event->mmap_mutex); + if (output_event) + mutex_unlock(&output_event->mmap_mutex); out: return ret; } -static void mutex_lock_double(struct mutex *a, struct mutex *b) -{ - if (b < a) - swap(a, b); - - mutex_lock(a); - mutex_lock_nested(b, SINGLE_DEPTH_NESTING); -} - static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) { bool nmi_safe = false; diff --git a/kernel/exit.c b/kernel/exit.c index f072959fcab7..64c938ce36fe 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -766,7 +766,7 @@ void __noreturn do_exit(long code) #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); - exit_itimers(tsk->signal); + exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 145321a5e798..f9261c07b048 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -29,6 +29,15 @@ #include <linux/vmalloc.h> #include "kexec_internal.h" +#ifdef CONFIG_KEXEC_SIG +static bool sig_enforce = IS_ENABLED(CONFIG_KEXEC_SIG_FORCE); + +void set_kexec_sig_enforced(void) +{ + sig_enforce = true; +} +#endif + static int kexec_calculate_store_digests(struct kimage *image); /* @@ -159,7 +168,7 @@ kimage_validate_signature(struct kimage *image) image->kernel_buf_len); if (ret) { - if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) { + if (sig_enforce) { pr_notice("Enforced kernel signature verification failed (%d).\n", ret); return ret; } diff --git a/kernel/module/internal.h b/kernel/module/internal.h index bc5507ab8450..ec104c2950c3 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -11,6 +11,7 @@ #include <linux/mutex.h> #include <linux/rculist.h> #include <linux/rcupdate.h> +#include <linux/mm.h> #ifndef ARCH_SHF_SMALL #define ARCH_SHF_SMALL 0 @@ -30,11 +31,13 @@ * to ensure complete separation of code and data, but * only when CONFIG_STRICT_MODULE_RWX=y */ -#ifdef CONFIG_STRICT_MODULE_RWX -# define strict_align(X) PAGE_ALIGN(X) -#else -# define strict_align(X) (X) -#endif +static inline unsigned int strict_align(unsigned int size) +{ + if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return PAGE_ALIGN(size); + else + return size; +} extern struct mutex module_mutex; extern struct list_head modules; diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 3e11523bc6f6..77e75bead569 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -137,6 +137,7 @@ void layout_symtab(struct module *mod, struct load_info *info) info->symoffs = ALIGN(mod->data_layout.size, symsect->sh_addralign ?: 1); info->stroffs = mod->data_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); mod->data_layout.size += strtab_size; + /* Note add_kallsyms() computes strtab_size as core_typeoffs - stroffs */ info->core_typeoffs = mod->data_layout.size; mod->data_layout.size += ndst * sizeof(char); mod->data_layout.size = strict_align(mod->data_layout.size); @@ -169,19 +170,20 @@ void add_kallsyms(struct module *mod, const struct load_info *info) Elf_Sym *dst; char *s; Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + unsigned long strtab_size; /* Set up to point into init section. */ mod->kallsyms = (void __rcu *)mod->init_layout.base + info->mod_kallsyms_init_off; - preempt_disable(); + rcu_read_lock(); /* The following is safe since this pointer cannot change */ - rcu_dereference_sched(mod->kallsyms)->symtab = (void *)symsec->sh_addr; - rcu_dereference_sched(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr; + rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - rcu_dereference_sched(mod->kallsyms)->strtab = + rcu_dereference(mod->kallsyms)->strtab = (void *)info->sechdrs[info->index.str].sh_addr; - rcu_dereference_sched(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs; + rcu_dereference(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs; /* * Now populate the cut down core kallsyms for after init @@ -190,22 +192,29 @@ void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_kallsyms.symtab = dst = mod->data_layout.base + info->symoffs; mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs; mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs; - src = rcu_dereference_sched(mod->kallsyms)->symtab; - for (ndst = i = 0; i < rcu_dereference_sched(mod->kallsyms)->num_symtab; i++) { - rcu_dereference_sched(mod->kallsyms)->typetab[i] = elf_type(src + i, info); + strtab_size = info->core_typeoffs - info->stroffs; + src = rcu_dereference(mod->kallsyms)->symtab; + for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) { + rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info); if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { + ssize_t ret; + mod->core_kallsyms.typetab[ndst] = - rcu_dereference_sched(mod->kallsyms)->typetab[i]; + rcu_dereference(mod->kallsyms)->typetab[i]; dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; - s += strscpy(s, - &rcu_dereference_sched(mod->kallsyms)->strtab[src[i].st_name], - KSYM_NAME_LEN) + 1; + ret = strscpy(s, + &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name], + strtab_size); + if (ret < 0) + break; + s += ret + 1; + strtab_size -= ret + 1; } } - preempt_enable(); + rcu_read_unlock(); mod->core_kallsyms.num_symtab = ndst; } diff --git a/kernel/module/main.c b/kernel/module/main.c index fed58d30725d..0548151dd933 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2939,24 +2939,25 @@ static void cfi_init(struct module *mod) { #ifdef CONFIG_CFI_CLANG initcall_t *init; +#ifdef CONFIG_MODULE_UNLOAD exitcall_t *exit; +#endif rcu_read_lock_sched(); mod->cfi_check = (cfi_check_fn) find_kallsyms_symbol_value(mod, "__cfi_check"); init = (initcall_t *) find_kallsyms_symbol_value(mod, "__cfi_jt_init_module"); - exit = (exitcall_t *) - find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module"); - rcu_read_unlock_sched(); - /* Fix init/exit functions to point to the CFI jump table */ if (init) mod->init = *init; #ifdef CONFIG_MODULE_UNLOAD + exit = (exitcall_t *) + find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module"); if (exit) mod->exit = *exit; #endif + rcu_read_unlock_sched(); cfi_module_add(mod, mod_tree.addr_min); #endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b49c6ff6dca0..a1a81fd9889b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3380,6 +3380,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre diff = 0; console_lock(); + for_each_console(c) { if (con && con != c) continue; @@ -3389,11 +3390,19 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre if (printk_seq < seq) diff += seq - printk_seq; } - console_unlock(); - if (diff != last_diff && reset_on_progress) + /* + * If consoles are suspended, it cannot be expected that they + * make forward progress, so timeout immediately. @diff is + * still used to return a valid flush status. + */ + if (console_suspended) + remaining = 0; + else if (diff != last_diff && reset_on_progress) remaining = timeout_ms; + console_unlock(); + if (diff == 0 || remaining == 0) break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e52b6e372c60..b233714a1c78 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -446,14 +446,14 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, if (*negp) { if (*lvalp > (unsigned long) INT_MAX + 1) return -EINVAL; - *valp = -*lvalp; + WRITE_ONCE(*valp, -*lvalp); } else { if (*lvalp > (unsigned long) INT_MAX) return -EINVAL; - *valp = *lvalp; + WRITE_ONCE(*valp, *lvalp); } } else { - int val = *valp; + int val = READ_ONCE(*valp); if (val < 0) { *negp = true; *lvalp = -(unsigned long)val; @@ -472,9 +472,9 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, if (write) { if (*lvalp > UINT_MAX) return -EINVAL; - *valp = *lvalp; + WRITE_ONCE(*valp, *lvalp); } else { - unsigned int val = *valp; + unsigned int val = READ_ONCE(*valp); *lvalp = (unsigned long)val; } return 0; @@ -857,7 +857,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, if ((param->min && *param->min > tmp) || (param->max && *param->max < tmp)) return -EINVAL; - *valp = tmp; + WRITE_ONCE(*valp, tmp); } return 0; @@ -923,7 +923,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, (param->max && *param->max < tmp)) return -ERANGE; - *valp = tmp; + WRITE_ONCE(*valp, tmp); } return 0; @@ -1007,13 +1007,13 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write, tmp.maxlen = sizeof(val); tmp.data = &val; - val = *data; + val = READ_ONCE(*data); res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos, do_proc_douintvec_minmax_conv, ¶m); if (res) return res; if (write) - *data = val; + WRITE_ONCE(*data, val); return 0; } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); @@ -1090,9 +1090,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, err = -EINVAL; break; } - *i = val; + WRITE_ONCE(*i, val); } else { - val = convdiv * (*i) / convmul; + val = convdiv * READ_ONCE(*i) / convmul; if (!first) proc_put_char(&buffer, &left, '\t'); proc_put_long(&buffer, &left, val, false); @@ -1173,9 +1173,12 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, if (write) { if (*lvalp > INT_MAX / HZ) return 1; - *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); + if (*negp) + WRITE_ONCE(*valp, -*lvalp * HZ); + else + WRITE_ONCE(*valp, *lvalp * HZ); } else { - int val = *valp; + int val = READ_ONCE(*valp); unsigned long lval; if (val < 0) { *negp = true; @@ -1221,9 +1224,9 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, if (jif > INT_MAX) return 1; - *valp = (int)jif; + WRITE_ONCE(*valp, (int)jif); } else { - int val = *valp; + int val = READ_ONCE(*valp); unsigned long lval; if (val < 0) { *negp = true; @@ -1237,6 +1240,30 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, return 0; } +static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, int write, void *data) +{ + int tmp, ret; + struct do_proc_dointvec_minmax_conv_param *param = data; + /* + * If writing, first do so via a temporary local int so we can + * bounds-check it before touching *valp. + */ + int *ip = write ? &tmp : valp; + + ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, data); + if (ret) + return ret; + + if (write) { + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) + return -EINVAL; + *valp = tmp; + } + return 0; +} + /** * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table @@ -1259,6 +1286,17 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, do_proc_dointvec_jiffies_conv,NULL); } +int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dointvec_minmax_conv_param param = { + .min = (int *) table->extra1, + .max = (int *) table->extra2, + }; + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_ms_jiffies_minmax_conv, ¶m); +} + /** * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table @@ -1291,8 +1329,8 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * @ppos: the current position in the file * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/1000 seconds, and + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and * are converted into jiffies. * * Returns 0 on success. @@ -1523,6 +1561,12 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -2091,6 +2135,17 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO_HUNDRED, }, +#ifdef CONFIG_NUMA + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif #ifdef CONFIG_HUGETLB_PAGE { .procname = "nr_hugepages", @@ -2107,15 +2162,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, - { - .procname = "numa_stat", - .data = &sysctl_vm_numa_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #endif { .procname = "hugetlb_shm_group", diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1cd10b102c51..5dead89308b7 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1051,15 +1051,24 @@ retry_delete: } /* - * This is called by do_exit or de_thread, only when there are no more - * references to the shared signal_struct. + * This is called by do_exit or de_thread, only when nobody else can + * modify the signal->posix_timers list. Yet we need sighand->siglock + * to prevent the race with /proc/pid/timers. */ -void exit_itimers(struct signal_struct *sig) +void exit_itimers(struct task_struct *tsk) { + struct list_head timers; struct k_itimer *tmr; - while (!list_empty(&sig->posix_timers)) { - tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); + if (list_empty(&tsk->signal->posix_timers)) + return; + + spin_lock_irq(&tsk->sighand->siglock); + list_replace_init(&tsk->signal->posix_timers, &timers); + spin_unlock_irq(&tsk->sighand->siglock); + + while (!list_empty(&timers)) { + tmr = list_first_entry(&timers, struct k_itimer, list); itimer_delete(tmr); } } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index debbbb083286..ccd6a5ade3e9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -194,7 +194,8 @@ config FUNCTION_TRACER sequence is then dynamically patched into a tracer call when tracing is enabled by the administrator. If it's runtime disabled (the bootup default), then the overhead of the instructions is very - small and not measurable even in micro-benchmarks. + small and not measurable even in micro-benchmarks (at least on + x86, but may have impact on other architectures). config FUNCTION_GRAPH_TRACER bool "Kernel Function Graph Tracer" diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 88589d74a892..68e5cdd24cef 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1936,7 +1936,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event, event->prog = prog; event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); - bpf_prog_array_free(old_array); + bpf_prog_array_free_sleepable(old_array); unlock: mutex_unlock(&bpf_event_mutex); @@ -1962,7 +1962,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_array_delete_safe(old_array, event->prog); } else { rcu_assign_pointer(event->tp_event->prog_array, new_array); - bpf_prog_array_free(old_array); + bpf_prog_array_free_sleepable(old_array); } bpf_prog_put(event->prog); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a8cfac0611bc..b8dd54627075 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9864,6 +9864,12 @@ void trace_init_global_iter(struct trace_iterator *iter) /* Output in nanoseconds only if we are using a clock in nanoseconds. */ if (trace_clocks[iter->tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + + /* Can not use kmalloc for iter.temp and iter.fmt */ + iter->temp = static_temp_buf; + iter->temp_size = STATIC_TEMP_BUF_SIZE; + iter->fmt = static_fmt_buf; + iter->fmt_size = STATIC_FMT_BUF_SIZE; } void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) @@ -9896,11 +9902,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* Simulate the iterator */ trace_init_global_iter(&iter); - /* Can not use kmalloc for iter.temp and iter.fmt */ - iter.temp = static_temp_buf; - iter.temp_size = STATIC_TEMP_BUF_SIZE; - iter.fmt = static_fmt_buf; - iter.fmt_size = STATIC_FMT_BUF_SIZE; for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 48e82e141d54..e87a46794079 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4430,6 +4430,8 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) s = kstrdup(field_str, GFP_KERNEL); if (!s) { + kfree(hist_data->attrs->var_defs.name[n_vars]); + hist_data->attrs->var_defs.name[n_vars] = NULL; ret = -ENOMEM; goto free; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c3dc4f859a6b..88ba5b4bd0c5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -16,6 +16,7 @@ #include <linux/namei.h> #include <linux/string.h> #include <linux/rculist.h> +#include <linux/filter.h> #include "trace_dynevent.h" #include "trace_probe.h" @@ -1342,15 +1343,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, int size, esize; int rctx; +#ifdef CONFIG_BPF_EVENTS if (bpf_prog_array_valid(call)) { u32 ret; - preempt_disable(); - ret = trace_call_bpf(call, regs); - preempt_enable(); + ret = bpf_prog_run_array_sleepable(call->prog_array, regs, bpf_prog_run); if (!ret) return; } +#endif /* CONFIG_BPF_EVENTS */ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 230038d4f908..bb9962b33f95 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -34,6 +34,27 @@ MODULE_LICENSE("GPL"); #define WATCH_QUEUE_NOTE_SIZE 128 #define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) +/* + * This must be called under the RCU read-lock, which makes + * sure that the wqueue still exists. It can then take the lock, + * and check that the wqueue hasn't been destroyed, which in + * turn makes sure that the notification pipe still exists. + */ +static inline bool lock_wqueue(struct watch_queue *wqueue) +{ + spin_lock_bh(&wqueue->lock); + if (unlikely(wqueue->defunct)) { + spin_unlock_bh(&wqueue->lock); + return false; + } + return true; +} + +static inline void unlock_wqueue(struct watch_queue *wqueue) +{ + spin_unlock_bh(&wqueue->lock); +} + static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -69,6 +90,10 @@ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { /* * Post a notification to a watch queue. + * + * Must be called with the RCU lock for reading, and the + * watch_queue lock held, which guarantees that the pipe + * hasn't been released. */ static bool post_one_notification(struct watch_queue *wqueue, struct watch_notification *n) @@ -85,9 +110,6 @@ static bool post_one_notification(struct watch_queue *wqueue, spin_lock_irq(&pipe->rd_wait.lock); - if (wqueue->defunct) - goto out; - mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; @@ -203,7 +225,10 @@ void __post_watch_notification(struct watch_list *wlist, if (security_post_notification(watch->cred, cred, n) < 0) continue; - post_one_notification(wqueue, n); + if (lock_wqueue(wqueue)) { + post_one_notification(wqueue, n); + unlock_wqueue(wqueue); + } } rcu_read_unlock(); @@ -462,11 +487,12 @@ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) return -EAGAIN; } - spin_lock_bh(&wqueue->lock); - kref_get(&wqueue->usage); - kref_get(&watch->usage); - hlist_add_head(&watch->queue_node, &wqueue->watches); - spin_unlock_bh(&wqueue->lock); + if (lock_wqueue(wqueue)) { + kref_get(&wqueue->usage); + kref_get(&watch->usage); + hlist_add_head(&watch->queue_node, &wqueue->watches); + unlock_wqueue(wqueue); + } hlist_add_head(&watch->list_node, &wlist->watchers); return 0; @@ -520,20 +546,15 @@ found: wqueue = rcu_dereference(watch->queue); - /* We don't need the watch list lock for the next bit as RCU is - * protecting *wqueue from deallocation. - */ - if (wqueue) { + if (lock_wqueue(wqueue)) { post_one_notification(wqueue, &n.watch); - spin_lock_bh(&wqueue->lock); - if (!hlist_unhashed(&watch->queue_node)) { hlist_del_init_rcu(&watch->queue_node); put_watch(watch); } - spin_unlock_bh(&wqueue->lock); + unlock_wqueue(wqueue); } if (wlist->release_watch) { |