diff options
Diffstat (limited to 'kernel')
98 files changed, 3156 insertions, 1117 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index 986c8214dabf..179848ad33e9 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -84,7 +84,6 @@ static struct ctl_table kern_acct_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static __init int kernel_acct_sysctls_init(void) diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 4100df44c665..17067dcb4386 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -43,7 +43,7 @@ config BPF_JIT bool "Enable BPF Just In Time compiler" depends on BPF depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT - depends on MODULES + select EXECMEM help BPF programs are normally handled by a BPF interpreter. This option allows the kernel to generate native code when a program is loaded diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e497011261b8..7eb9ad3a3ae6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -44,6 +44,9 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-$(CONFIG_BPF_SYSCALL) += cpumask.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif +ifneq ($(CONFIG_CRYPTO),) +obj-$(CONFIG_BPF_SYSCALL) += crypto.o +endif obj-$(CONFIG_BPF_PRELOAD) += preload/ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 343c3456c8dd..583ee4fe48ef 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -37,7 +37,7 @@ */ /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ -#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8) +#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) struct bpf_arena { @@ -251,7 +251,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) int ret; kbase = bpf_arena_get_kern_vm_start(arena); - kaddr = kbase + (u32)(vmf->address & PAGE_MASK); + kaddr = kbase + (u32)(vmf->address); guard(mutex)(&arena->lock); page = vmalloc_to_page((void *)kaddr); @@ -314,7 +314,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad return -EINVAL; } - ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags); + ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags); if (IS_ERR_VALUE(ret)) return ret; if ((ret >> 32) == ((ret + len - 1) >> 32)) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 13358675ff2e..feabc0193852 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -246,6 +246,38 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index & array->index_mask]); } +/* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */ +static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_insn *insn = insn_buf; + + if (!bpf_jit_supports_percpu_insn()) + return -EOPNOTSUPP; + + if (map->map_flags & BPF_F_INNER_MAP) + return -EOPNOTSUPP; + + BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs)); + + *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0); + if (!map->bypass_spec_v1) { + *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5); + } + + *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0); + return insn - insn_buf; +} + static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -396,17 +428,22 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } -static void array_map_free_timers(struct bpf_map *map) +static void array_map_free_timers_wq(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free fields other than timer on uref dropping to zero. */ - if (!btf_record_has_field(map->record, BPF_TIMER)) - return; - - for (i = 0; i < array->map.max_entries; i++) - bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); + /* We don't reset or free fields other than timer and workqueue + * on uref dropping to zero. + */ + if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { + for (i = 0; i < array->map.max_entries; i++) { + if (btf_record_has_field(map->record, BPF_TIMER)) + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); + } + } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -750,7 +787,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, - .map_release_uref = array_map_free_timers, + .map_release_uref = array_map_free_timers_wq, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, @@ -776,6 +813,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, + .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index bdea1a459153..976cb258a0ed 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -318,7 +318,7 @@ static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, * * If the local_storage->list is already empty, the caller will not * care about the bpf_ma value also because the caller is not - * responsibile to free the local_storage. + * responsible to free the local_storage. */ if (storage_smap) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 43356faaa057..86c7884abaf8 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -728,8 +728,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, cur_image = image; trampoline_start = 0; } - if (err < 0) - goto reset_unlock; *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset(); @@ -742,8 +740,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (err) goto reset_unlock; } - for (i = 0; i < st_map->image_pages_cnt; i++) - arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE); + for (i = 0; i < st_map->image_pages_cnt; i++) { + err = arch_protect_bpf_trampoline(st_map->image_pages[i], + PAGE_SIZE); + if (err) + goto reset_unlock; + } if (st_map->map.map_flags & BPF_F_LINK) { err = 0; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 90c4a32d89ff..821063660d9f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -218,6 +218,7 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_SOCKET_FILTER, BTF_KFUNC_HOOK_LWT, BTF_KFUNC_HOOK_NETFILTER, + BTF_KFUNC_HOOK_KPROBE, BTF_KFUNC_HOOK_MAX, }; @@ -3464,6 +3465,15 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, goto end; } } + if (field_mask & BPF_WORKQUEUE) { + if (!strcmp(name, "bpf_wq")) { + if (*seen_mask & BPF_WORKQUEUE) + return -E2BIG; + *seen_mask |= BPF_WORKQUEUE; + type = BPF_WORKQUEUE; + goto end; + } + } field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); @@ -3515,6 +3525,7 @@ static int btf_find_struct_field(const struct btf *btf, switch (field_type) { case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_WORKQUEUE: case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: @@ -3582,6 +3593,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, switch (field_type) { case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_WORKQUEUE: case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: @@ -3816,6 +3828,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; + rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); @@ -3846,6 +3859,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->timer_off = rec->fields[i].offset; break; + case BPF_WORKQUEUE: + WARN_ON_ONCE(rec->wq_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->wq_off = rec->fields[i].offset; + break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ @@ -5642,8 +5660,8 @@ errout_free: return ERR_PTR(err); } -extern char __weak __start_BTF[]; -extern char __weak __stop_BTF[]; +extern char __start_BTF[]; +extern char __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) @@ -5971,6 +5989,9 @@ struct btf *btf_parse_vmlinux(void) struct btf *btf = NULL; int err; + if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) + return ERR_PTR(-ENOENT); + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); @@ -8137,6 +8158,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) return BTF_KFUNC_HOOK_LWT; case BPF_PROG_TYPE_NETFILTER: return BTF_KFUNC_HOOK_NETFILTER; + case BPF_PROG_TYPE_KPROBE: + return BTF_KFUNC_HOOK_KPROBE; default: return BTF_KFUNC_HOOK_MAX; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 82243cb6c54d..8ba73042a239 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2575,8 +2575,6 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; #ifdef CONFIG_CGROUP_NET_CLASSID diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1ea5ce5bb599..1a6c3faa6e4a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -22,10 +22,10 @@ #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/random.h> -#include <linux/moduleloader.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/objtool.h> +#include <linux/overflow.h> #include <linux/rbtree_latch.h> #include <linux/kallsyms.h> #include <linux/rcupdate.h> @@ -37,6 +37,7 @@ #include <linux/nospec.h> #include <linux/bpf_mem_alloc.h> #include <linux/memcontrol.h> +#include <linux/execmem.h> #include <asm/barrier.h> #include <asm/unaligned.h> @@ -747,7 +748,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; - strncpy(sym, ksym->name, KSYM_NAME_LEN); + strscpy(sym, ksym->name, KSYM_NAME_LEN); ret = sym; if (size) @@ -813,7 +814,7 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (it++ != symnum) continue; - strncpy(sym, ksym->name, KSYM_NAME_LEN); + strscpy(sym, ksym->name, KSYM_NAME_LEN); *value = ksym->start; *type = BPF_SYM_ELF_TYPE; @@ -849,7 +850,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, return -EINVAL; } - tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL); + tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL); if (!tab) return -ENOMEM; @@ -908,23 +909,30 @@ static LIST_HEAD(pack_list); static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; + int err; pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)), GFP_KERNEL); if (!pack) return NULL; pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE); - if (!pack->ptr) { - kfree(pack); - return NULL; - } + if (!pack->ptr) + goto out; bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE); bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); - list_add_tail(&pack->list, &pack_list); set_vm_flush_reset_perms(pack->ptr); - set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + err = set_memory_rox((unsigned long)pack->ptr, + BPF_PROG_PACK_SIZE / PAGE_SIZE); + if (err) + goto out; + list_add_tail(&pack->list, &pack_list); return pack; + +out: + bpf_jit_free_exec(pack->ptr); + kfree(pack); + return NULL; } void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) @@ -939,9 +947,16 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) size = round_up(size, PAGE_SIZE); ptr = bpf_jit_alloc_exec(size); if (ptr) { + int err; + bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); - set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); + err = set_memory_rox((unsigned long)ptr, + size / PAGE_SIZE); + if (err) { + bpf_jit_free_exec(ptr); + ptr = NULL; + } } goto out; } @@ -1050,12 +1065,12 @@ void bpf_jit_uncharge_modmem(u32 size) void *__weak bpf_jit_alloc_exec(unsigned long size) { - return module_alloc(size); + return execmem_alloc(EXECMEM_BPF, size); } void __weak bpf_jit_free_exec(void *addr) { - module_memfree(addr); + execmem_free(addr); } struct bpf_binary_header * @@ -2204,6 +2219,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG] = {}; \ \ + kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ return ___bpf_prog_run(regs, insn); \ @@ -2217,6 +2233,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG]; \ \ + kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ BPF_R2 = r2; \ @@ -2403,7 +2420,9 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } finalize: - bpf_prog_lock_ro(fp); + *err = bpf_prog_lock_ro(fp); + if (*err) + return fp; /* The tail call compatibility check can only be done at * this late stage as we need to determine, if we deal @@ -2437,13 +2456,14 @@ EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { + struct bpf_prog_array *p; + if (prog_cnt) - return kzalloc(sizeof(struct bpf_prog_array) + - sizeof(struct bpf_prog_array_item) * - (prog_cnt + 1), - flags); + p = kzalloc(struct_size(p, items, prog_cnt + 1), flags); + else + p = &bpf_empty_prog_array.hdr; - return &bpf_empty_prog_array.hdr; + return p; } void bpf_prog_array_free(struct bpf_prog_array *progs) @@ -2796,7 +2816,7 @@ void bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_free); -/* RNG for unpriviledged user space with separated state from prandom_u32(). */ +/* RNG for unprivileged user space with separated state from prandom_u32(). */ static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); void bpf_user_rnd_init_once(void) @@ -2921,12 +2941,28 @@ bool __weak bpf_jit_needs_zext(void) return false; } +/* Return true if the JIT inlines the call to the helper corresponding to + * the imm. + * + * The verifier will not patch the insn->imm for the call to the helper if + * this returns true. + */ +bool __weak bpf_jit_inlines_helper_call(s32 imm) +{ + return false; +} + /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ bool __weak bpf_jit_supports_subprog_tailcalls(void) { return false; } +bool __weak bpf_jit_supports_percpu_insn(void) +{ + return false; +} + bool __weak bpf_jit_supports_kfunc_call(void) { return false; @@ -2942,6 +2978,11 @@ bool __weak bpf_jit_supports_arena(void) return false; } +bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + return false; +} + u64 __weak bpf_arch_uaddress_limit(void) { #if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index dad0fb1c8e87..33c473d676a5 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -474,6 +474,7 @@ static int __init cpumask_kfunc_init(void) ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &cpumask_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors, ARRAY_SIZE(cpumask_dtors), THIS_MODULE); diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c new file mode 100644 index 000000000000..2bee4af91e38 --- /dev/null +++ b/kernel/bpf/crypto.c @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta, Inc */ +#include <linux/bpf.h> +#include <linux/bpf_crypto.h> +#include <linux/bpf_mem_alloc.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> +#include <linux/filter.h> +#include <linux/scatterlist.h> +#include <linux/skbuff.h> +#include <crypto/skcipher.h> + +struct bpf_crypto_type_list { + const struct bpf_crypto_type *type; + struct list_head list; +}; + +/* BPF crypto initialization parameters struct */ +/** + * struct bpf_crypto_params - BPF crypto initialization parameters structure + * @type: The string of crypto operation type. + * @reserved: Reserved member, will be reused for more options in future + * Values: + * 0 + * @algo: The string of algorithm to initialize. + * @key: The cipher key used to init crypto algorithm. + * @key_len: The length of cipher key. + * @authsize: The length of authentication tag used by algorithm. + */ +struct bpf_crypto_params { + char type[14]; + u8 reserved[2]; + char algo[128]; + u8 key[256]; + u32 key_len; + u32 authsize; +}; + +static LIST_HEAD(bpf_crypto_types); +static DECLARE_RWSEM(bpf_crypto_types_sem); + +/** + * struct bpf_crypto_ctx - refcounted BPF crypto context structure + * @type: The pointer to bpf crypto type + * @tfm: The pointer to instance of crypto API struct. + * @siv_len: Size of IV and state storage for cipher + * @rcu: The RCU head used to free the crypto context with RCU safety. + * @usage: Object reference counter. When the refcount goes to 0, the + * memory is released back to the BPF allocator, which provides + * RCU safety. + */ +struct bpf_crypto_ctx { + const struct bpf_crypto_type *type; + void *tfm; + u32 siv_len; + struct rcu_head rcu; + refcount_t usage; +}; + +int bpf_crypto_register_type(const struct bpf_crypto_type *type) +{ + struct bpf_crypto_type_list *node; + int err = -EEXIST; + + down_write(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (!strcmp(node->type->name, type->name)) + goto unlock; + } + + node = kmalloc(sizeof(*node), GFP_KERNEL); + err = -ENOMEM; + if (!node) + goto unlock; + + node->type = type; + list_add(&node->list, &bpf_crypto_types); + err = 0; + +unlock: + up_write(&bpf_crypto_types_sem); + + return err; +} +EXPORT_SYMBOL_GPL(bpf_crypto_register_type); + +int bpf_crypto_unregister_type(const struct bpf_crypto_type *type) +{ + struct bpf_crypto_type_list *node; + int err = -ENOENT; + + down_write(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (strcmp(node->type->name, type->name)) + continue; + + list_del(&node->list); + kfree(node); + err = 0; + break; + } + up_write(&bpf_crypto_types_sem); + + return err; +} +EXPORT_SYMBOL_GPL(bpf_crypto_unregister_type); + +static const struct bpf_crypto_type *bpf_crypto_get_type(const char *name) +{ + const struct bpf_crypto_type *type = ERR_PTR(-ENOENT); + struct bpf_crypto_type_list *node; + + down_read(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (strcmp(node->type->name, name)) + continue; + + if (try_module_get(node->type->owner)) + type = node->type; + break; + } + up_read(&bpf_crypto_types_sem); + + return type; +} + +__bpf_kfunc_start_defs(); + +/** + * bpf_crypto_ctx_create() - Create a mutable BPF crypto context. + * + * Allocates a crypto context that can be used, acquired, and released by + * a BPF program. The crypto context returned by this function must either + * be embedded in a map as a kptr, or freed with bpf_crypto_ctx_release(). + * As crypto API functions use GFP_KERNEL allocations, this function can + * only be used in sleepable BPF programs. + * + * bpf_crypto_ctx_create() allocates memory for crypto context. + * It may return NULL if no memory is available. + * @params: pointer to struct bpf_crypto_params which contains all the + * details needed to initialise crypto context. + * @params__sz: size of steuct bpf_crypto_params usef by bpf program + * @err: integer to store error code when NULL is returned. + */ +__bpf_kfunc struct bpf_crypto_ctx * +bpf_crypto_ctx_create(const struct bpf_crypto_params *params, u32 params__sz, + int *err) +{ + const struct bpf_crypto_type *type; + struct bpf_crypto_ctx *ctx; + + if (!params || params->reserved[0] || params->reserved[1] || + params__sz != sizeof(struct bpf_crypto_params)) { + *err = -EINVAL; + return NULL; + } + + type = bpf_crypto_get_type(params->type); + if (IS_ERR(type)) { + *err = PTR_ERR(type); + return NULL; + } + + if (!type->has_algo(params->algo)) { + *err = -EOPNOTSUPP; + goto err_module_put; + } + + if (!!params->authsize ^ !!type->setauthsize) { + *err = -EOPNOTSUPP; + goto err_module_put; + } + + if (!params->key_len || params->key_len > sizeof(params->key)) { + *err = -EINVAL; + goto err_module_put; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + *err = -ENOMEM; + goto err_module_put; + } + + ctx->type = type; + ctx->tfm = type->alloc_tfm(params->algo); + if (IS_ERR(ctx->tfm)) { + *err = PTR_ERR(ctx->tfm); + goto err_free_ctx; + } + + if (params->authsize) { + *err = type->setauthsize(ctx->tfm, params->authsize); + if (*err) + goto err_free_tfm; + } + + *err = type->setkey(ctx->tfm, params->key, params->key_len); + if (*err) + goto err_free_tfm; + + if (type->get_flags(ctx->tfm) & CRYPTO_TFM_NEED_KEY) { + *err = -EINVAL; + goto err_free_tfm; + } + + ctx->siv_len = type->ivsize(ctx->tfm) + type->statesize(ctx->tfm); + + refcount_set(&ctx->usage, 1); + + return ctx; + +err_free_tfm: + type->free_tfm(ctx->tfm); +err_free_ctx: + kfree(ctx); +err_module_put: + module_put(type->owner); + + return NULL; +} + +static void crypto_free_cb(struct rcu_head *head) +{ + struct bpf_crypto_ctx *ctx; + + ctx = container_of(head, struct bpf_crypto_ctx, rcu); + ctx->type->free_tfm(ctx->tfm); + module_put(ctx->type->owner); + kfree(ctx); +} + +/** + * bpf_crypto_ctx_acquire() - Acquire a reference to a BPF crypto context. + * @ctx: The BPF crypto context being acquired. The ctx must be a trusted + * pointer. + * + * Acquires a reference to a BPF crypto context. The context returned by this function + * must either be embedded in a map as a kptr, or freed with + * bpf_crypto_ctx_release(). + */ +__bpf_kfunc struct bpf_crypto_ctx * +bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx) +{ + if (!refcount_inc_not_zero(&ctx->usage)) + return NULL; + return ctx; +} + +/** + * bpf_crypto_ctx_release() - Release a previously acquired BPF crypto context. + * @ctx: The crypto context being released. + * + * Releases a previously acquired reference to a BPF crypto context. When the final + * reference of the BPF crypto context has been released, its memory + * will be released. + */ +__bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) +{ + if (refcount_dec_and_test(&ctx->usage)) + call_rcu(&ctx->rcu, crypto_free_cb); +} + +static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv, + bool decrypt) +{ + u32 src_len, dst_len, siv_len; + const u8 *psrc; + u8 *pdst, *piv; + int err; + + if (__bpf_dynptr_is_rdonly(dst)) + return -EINVAL; + + siv_len = __bpf_dynptr_size(siv); + src_len = __bpf_dynptr_size(src); + dst_len = __bpf_dynptr_size(dst); + if (!src_len || !dst_len) + return -EINVAL; + + if (siv_len != ctx->siv_len) + return -EINVAL; + + psrc = __bpf_dynptr_data(src, src_len); + if (!psrc) + return -EINVAL; + pdst = __bpf_dynptr_data_rw(dst, dst_len); + if (!pdst) + return -EINVAL; + + piv = siv_len ? __bpf_dynptr_data_rw(siv, siv_len) : NULL; + if (siv_len && !piv) + return -EINVAL; + + err = decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv) + : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv); + + return err; +} + +/** + * bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided. + * @ctx: The crypto context being used. The ctx must be a trusted pointer. + * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer. + * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer. + * @siv: bpf_dynptr to IV data and state data to be used by decryptor. + * + * Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured. + */ +__bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv) +{ + return bpf_crypto_crypt(ctx, src, dst, siv, true); +} + +/** + * bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided. + * @ctx: The crypto context being used. The ctx must be a trusted pointer. + * @src: bpf_dynptr to the plain data. Must be a trusted pointer. + * @dst: bpf_dynptr to buffer where to store the result. Must be a trusted pointer. + * @siv: bpf_dynptr to IV data and state data to be used by decryptor. + * + * Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured. + */ +__bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv) +{ + return bpf_crypto_crypt(ctx, src, dst, siv, false); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(crypt_init_kfunc_btf_ids) +BTF_ID_FLAGS(func, bpf_crypto_ctx_create, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_crypto_ctx_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_crypto_ctx_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) +BTF_KFUNCS_END(crypt_init_kfunc_btf_ids) + +static const struct btf_kfunc_id_set crypt_init_kfunc_set = { + .owner = THIS_MODULE, + .set = &crypt_init_kfunc_btf_ids, +}; + +BTF_KFUNCS_START(crypt_kfunc_btf_ids) +BTF_ID_FLAGS(func, bpf_crypto_decrypt, KF_RCU) +BTF_ID_FLAGS(func, bpf_crypto_encrypt, KF_RCU) +BTF_KFUNCS_END(crypt_kfunc_btf_ids) + +static const struct btf_kfunc_id_set crypt_kfunc_set = { + .owner = THIS_MODULE, + .set = &crypt_kfunc_btf_ids, +}; + +BTF_ID_LIST(bpf_crypto_dtor_ids) +BTF_ID(struct, bpf_crypto_ctx) +BTF_ID(func, bpf_crypto_ctx_release) + +static int __init crypto_kfunc_init(void) +{ + int ret; + const struct btf_id_dtor_kfunc bpf_crypto_dtors[] = { + { + .btf_id = bpf_crypto_dtor_ids[0], + .kfunc_btf_id = bpf_crypto_dtor_ids[1] + }, + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, + &crypt_init_kfunc_set); + return ret ?: register_btf_id_dtor_kfuncs(bpf_crypto_dtors, + ARRAY_SIZE(bpf_crypto_dtors), + THIS_MODULE); +} + +late_initcall(crypto_kfunc_init); diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index bd2e2dd04740..309c4aa1b026 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -172,6 +172,17 @@ static bool is_addr_space_cast(const struct bpf_insn *insn) insn->off == BPF_ADDR_SPACE_CAST; } +/* Special (internal-only) form of mov, used to resolve per-CPU addrs: + * dst_reg = src_reg + <percpu_base_off> + * BPF_ADDR_PERCPU is used as a special insn->off value. + */ +#define BPF_ADDR_PERCPU (-1) + +static inline bool is_mov_percpu_addr(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; +} + void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) @@ -194,6 +205,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", insn->code, insn->dst_reg, insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); + } else if (is_mov_percpu_addr(insn)) { + verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n", + insn->code, insn->dst_reg, insn->src_reg); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3a088a5349bc..06115f8728e8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -221,13 +221,11 @@ static bool htab_has_extra_elems(struct bpf_htab *htab) return !htab_is_percpu(htab) && !htab_is_lru(htab); } -static void htab_free_prealloced_timers(struct bpf_htab *htab) +static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; - if (!btf_record_has_field(htab->map.record, BPF_TIMER)) - return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -235,7 +233,12 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); + if (btf_record_has_field(htab->map.record, BPF_TIMER)) + bpf_obj_free_timer(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } @@ -1490,11 +1493,12 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_nulls_del_rcu(&l->hash_node); htab_elem_free(htab, l); } + cond_resched(); } migrate_enable(); } -static void htab_free_malloced_timers(struct bpf_htab *htab) +static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) { int i; @@ -1506,24 +1510,29 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ - bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); + if (btf_record_has_field(htab->map.record, BPF_TIMER)) + bpf_obj_free_timer(htab->map.record, + l->key + round_up(htab->map.key_size, 8)); + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(htab->map.record, + l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } rcu_read_unlock(); } -static void htab_map_free_timers(struct bpf_map *map) +static void htab_map_free_timers_and_wq(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We only free timer on uref dropping to zero */ - if (!btf_record_has_field(htab->map.record, BPF_TIMER)) - return; - if (!htab_is_prealloc(htab)) - htab_free_malloced_timers(htab); - else - htab_free_prealloced_timers(htab); + /* We only free timer and workqueue on uref dropping to zero */ + if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { + if (!htab_is_prealloc(htab)) + htab_free_malloced_timers_and_wq(htab); + else + htab_free_prealloced_timers_and_wq(htab); + } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -1538,7 +1547,7 @@ static void htab_map_free(struct bpf_map *map) */ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it - * underneath and is reponsible for waiting for callbacks to finish + * underneath and is responsible for waiting for callbacks to finish * during bpf_mem_alloc_destroy(). */ if (!htab_is_prealloc(htab)) { @@ -2259,7 +2268,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers, + .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2280,7 +2289,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers, + .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, @@ -2307,6 +2316,26 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +/* inline bpf_map_lookup_elem() call for per-CPU hashmap */ +static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + if (!bpf_jit_supports_percpu_insn()) + return -EOPNOTSUPP; + + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, + offsetof(struct htab_elem, key) + map->key_size); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + + return insn - insn_buf; +} + static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct htab_elem *l; @@ -2435,6 +2464,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_gen_lookup = htab_percpu_map_gen_lookup, .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 449b9a5d3fe3..2a69a9a36c0f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1079,11 +1079,20 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +struct bpf_async_cb { + struct bpf_map *map; + struct bpf_prog *prog; + void __rcu *callback_fn; + void *value; + struct rcu_head rcu; + u64 flags; +}; + /* BPF map elements can contain 'struct bpf_timer'. * Such map owns all of its BPF timers. * 'struct bpf_timer' is allocated as part of map element allocation * and it's zero initialized. - * That space is used to keep 'struct bpf_timer_kern'. + * That space is used to keep 'struct bpf_async_kern'. * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and * remembers 'struct bpf_map *' pointer it's part of. * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. @@ -1096,17 +1105,23 @@ const struct bpf_func_proto bpf_snprintf_proto = { * freeing the timers when inner map is replaced or deleted by user space. */ struct bpf_hrtimer { + struct bpf_async_cb cb; struct hrtimer timer; - struct bpf_map *map; - struct bpf_prog *prog; - void __rcu *callback_fn; - void *value; - struct rcu_head rcu; }; -/* the actual struct hidden inside uapi struct bpf_timer */ -struct bpf_timer_kern { - struct bpf_hrtimer *timer; +struct bpf_work { + struct bpf_async_cb cb; + struct work_struct work; + struct work_struct delete_work; +}; + +/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */ +struct bpf_async_kern { + union { + struct bpf_async_cb *cb; + struct bpf_hrtimer *timer; + struct bpf_work *work; + }; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer * regardless of LOCKDEP and spinlock debug flags. @@ -1114,19 +1129,24 @@ struct bpf_timer_kern { struct bpf_spin_lock lock; } __attribute__((aligned(8))); +enum bpf_async_type { + BPF_ASYNC_TYPE_TIMER = 0, + BPF_ASYNC_TYPE_WQ, +}; + static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) { struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); - struct bpf_map *map = t->map; - void *value = t->value; + struct bpf_map *map = t->cb.map; + void *value = t->cb.value; bpf_callback_t callback_fn; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_timer); - callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); + callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; @@ -1155,46 +1175,112 @@ out: return HRTIMER_NORESTART; } -BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map, - u64, flags) +static void bpf_wq_work(struct work_struct *work) +{ + struct bpf_work *w = container_of(work, struct bpf_work, work); + struct bpf_async_cb *cb = &w->cb; + struct bpf_map *map = cb->map; + bpf_callback_t callback_fn; + void *value = cb->value; + void *key; + u32 idx; + + BTF_TYPE_EMIT(struct bpf_wq); + + callback_fn = READ_ONCE(cb->callback_fn); + if (!callback_fn) + return; + + if (map->map_type == BPF_MAP_TYPE_ARRAY) { + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* compute the key */ + idx = ((char *)value - array->value) / array->elem_size; + key = &idx; + } else { /* hash or lru */ + key = value - round_up(map->key_size, 8); + } + + rcu_read_lock_trace(); + migrate_disable(); + + callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); + + migrate_enable(); + rcu_read_unlock_trace(); +} + +static void bpf_wq_delete_work(struct work_struct *work) +{ + struct bpf_work *w = container_of(work, struct bpf_work, delete_work); + + cancel_work_sync(&w->work); + + kfree_rcu(w, cb.rcu); +} + +static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, + enum bpf_async_type type) { - clockid_t clockid = flags & (MAX_CLOCKS - 1); + struct bpf_async_cb *cb; struct bpf_hrtimer *t; + struct bpf_work *w; + clockid_t clockid; + size_t size; int ret = 0; - BUILD_BUG_ON(MAX_CLOCKS != 16); - BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer)); - BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer)); - if (in_nmi()) return -EOPNOTSUPP; - if (flags >= MAX_CLOCKS || - /* similar to timerfd except _ALARM variants are not supported */ - (clockid != CLOCK_MONOTONIC && - clockid != CLOCK_REALTIME && - clockid != CLOCK_BOOTTIME)) + switch (type) { + case BPF_ASYNC_TYPE_TIMER: + size = sizeof(struct bpf_hrtimer); + break; + case BPF_ASYNC_TYPE_WQ: + size = sizeof(struct bpf_work); + break; + default: return -EINVAL; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; + } + + __bpf_spin_lock_irqsave(&async->lock); + t = async->timer; if (t) { ret = -EBUSY; goto out; } + /* allocate hrtimer via map_kmalloc to use memcg accounting */ - t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); - if (!t) { + cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + if (!cb) { ret = -ENOMEM; goto out; } - t->value = (void *)timer - map->record->timer_off; - t->map = map; - t->prog = NULL; - rcu_assign_pointer(t->callback_fn, NULL); - hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); - t->timer.function = bpf_timer_cb; - WRITE_ONCE(timer->timer, t); - /* Guarantee the order between timer->timer and map->usercnt. So + + switch (type) { + case BPF_ASYNC_TYPE_TIMER: + clockid = flags & (MAX_CLOCKS - 1); + t = (struct bpf_hrtimer *)cb; + + hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); + t->timer.function = bpf_timer_cb; + cb->value = (void *)async - map->record->timer_off; + break; + case BPF_ASYNC_TYPE_WQ: + w = (struct bpf_work *)cb; + + INIT_WORK(&w->work, bpf_wq_work); + INIT_WORK(&w->delete_work, bpf_wq_delete_work); + cb->value = (void *)async - map->record->wq_off; + break; + } + cb->map = map; + cb->prog = NULL; + cb->flags = flags; + rcu_assign_pointer(cb->callback_fn, NULL); + + WRITE_ONCE(async->cb, cb); + /* Guarantee the order between async->cb and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL * timer or atomic64_read() below returns a zero usercnt. @@ -1204,15 +1290,34 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map /* maps with timers must be either held by user space * or pinned in bpffs. */ - WRITE_ONCE(timer->timer, NULL); - kfree(t); + WRITE_ONCE(async->cb, NULL); + kfree(cb); ret = -EPERM; } out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); return ret; } +BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, + u64, flags) +{ + clock_t clockid = flags & (MAX_CLOCKS - 1); + + BUILD_BUG_ON(MAX_CLOCKS != 16); + BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); + BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); + + if (flags >= MAX_CLOCKS || + /* similar to timerfd except _ALARM variants are not supported */ + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME && + clockid != CLOCK_BOOTTIME)) + return -EINVAL; + + return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER); +} + static const struct bpf_func_proto bpf_timer_init_proto = { .func = bpf_timer_init, .gpl_only = true, @@ -1222,22 +1327,23 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn, - struct bpf_prog_aux *, aux) +static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, + struct bpf_prog_aux *aux, unsigned int flags, + enum bpf_async_type type) { struct bpf_prog *prev, *prog = aux->prog; - struct bpf_hrtimer *t; + struct bpf_async_cb *cb; int ret = 0; if (in_nmi()) return -EOPNOTSUPP; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; - if (!t) { + __bpf_spin_lock_irqsave(&async->lock); + cb = async->cb; + if (!cb) { ret = -EINVAL; goto out; } - if (!atomic64_read(&t->map->usercnt)) { + if (!atomic64_read(&cb->map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. Otherwise timer might still be * running even when bpf prog is detached and user space @@ -1246,7 +1352,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb ret = -EPERM; goto out; } - prev = t->prog; + prev = cb->prog; if (prev != prog) { /* Bump prog refcnt once. Every bpf_timer_set_callback() * can pick different callback_fn-s within the same prog. @@ -1259,14 +1365,20 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb if (prev) /* Drop prev prog refcnt when swapping with new prog */ bpf_prog_put(prev); - t->prog = prog; + cb->prog = prog; } - rcu_assign_pointer(t->callback_fn, callback_fn); + rcu_assign_pointer(cb->callback_fn, callback_fn); out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); return ret; } +BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, + struct bpf_prog_aux *, aux) +{ + return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER); +} + static const struct bpf_func_proto bpf_timer_set_callback_proto = { .func = bpf_timer_set_callback, .gpl_only = true, @@ -1275,7 +1387,7 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = { .arg2_type = ARG_PTR_TO_FUNC, }; -BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags) +BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; int ret = 0; @@ -1287,7 +1399,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; - if (!t || !t->prog) { + if (!t || !t->cb.prog) { ret = -EINVAL; goto out; } @@ -1315,18 +1427,18 @@ static const struct bpf_func_proto bpf_timer_start_proto = { .arg3_type = ARG_ANYTHING, }; -static void drop_prog_refcnt(struct bpf_hrtimer *t) +static void drop_prog_refcnt(struct bpf_async_cb *async) { - struct bpf_prog *prog = t->prog; + struct bpf_prog *prog = async->prog; if (prog) { bpf_prog_put(prog); - t->prog = NULL; - rcu_assign_pointer(t->callback_fn, NULL); + async->prog = NULL; + rcu_assign_pointer(async->callback_fn, NULL); } } -BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) +BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { struct bpf_hrtimer *t; int ret = 0; @@ -1348,7 +1460,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) ret = -EDEADLK; goto out; } - drop_prog_refcnt(t); + drop_prog_refcnt(&t->cb); out: __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish @@ -1366,36 +1478,44 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = { .arg1_type = ARG_PTR_TO_TIMER, }; -/* This function is called by map_delete/update_elem for individual element and - * by ops->map_release_uref when the user space reference to a map reaches zero. - */ -void bpf_timer_cancel_and_free(void *val) +static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async) { - struct bpf_timer_kern *timer = val; - struct bpf_hrtimer *t; + struct bpf_async_cb *cb; - /* Performance optimization: read timer->timer without lock first. */ - if (!READ_ONCE(timer->timer)) - return; + /* Performance optimization: read async->cb without lock first. */ + if (!READ_ONCE(async->cb)) + return NULL; - __bpf_spin_lock_irqsave(&timer->lock); + __bpf_spin_lock_irqsave(&async->lock); /* re-read it under lock */ - t = timer->timer; - if (!t) + cb = async->cb; + if (!cb) goto out; - drop_prog_refcnt(t); + drop_prog_refcnt(cb); /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ - WRITE_ONCE(timer->timer, NULL); + WRITE_ONCE(async->cb, NULL); out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); + return cb; +} + +/* This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_timer_cancel_and_free(void *val) +{ + struct bpf_hrtimer *t; + + t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); + if (!t) return; /* Cancel the timer and wait for callback to complete if it was running. * If hrtimer_cancel() can be safely called it's safe to call kfree(t) * right after for both preallocated and non-preallocated maps. - * The timer->timer = NULL was already done and no code path can + * The async->cb = NULL was already done and no code path can * see address 't' anymore. * * Check that bpf_map_delete/update_elem() wasn't called from timer @@ -1404,13 +1524,33 @@ out: * return -1). Though callback_fn is still running on this cpu it's * safe to do kfree(t) because bpf_timer_cb() read everything it needed * from 't'. The bpf subprog callback_fn won't be able to access 't', - * since timer->timer = NULL was already done. The timer will be + * since async->cb = NULL was already done. The timer will be * effectively cancelled because bpf_timer_cb() will return * HRTIMER_NORESTART. */ if (this_cpu_read(hrtimer_running) != t) hrtimer_cancel(&t->timer); - kfree_rcu(t, rcu); + kfree_rcu(t, cb.rcu); +} + +/* This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_wq_cancel_and_free(void *val) +{ + struct bpf_work *work; + + BTF_TYPE_EMIT(struct bpf_wq); + + work = (struct bpf_work *)__bpf_async_cancel_and_free(val); + if (!work) + return; + /* Trigger cancel of the sleepable work, but *do not* wait for + * it to finish if it was running as we might not be in a + * sleepable context. + * kfree will be called once the work has finished. + */ + schedule_work(&work->delete_work); } BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) @@ -1443,7 +1583,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) +bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } @@ -1730,6 +1870,10 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_strtol_proto; case BPF_FUNC_strtoul: return &bpf_strtoul_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_ns_current_pid_tgid: + return &bpf_get_ns_current_pid_tgid_proto; default: break; } @@ -2408,7 +2552,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. * * For skb-type dynptrs, it is safe to write into the returned pointer - * if the bpf program allows skb data writes. There are two possiblities + * if the bpf program allows skb data writes. There are two possibilities * that may occur when calling bpf_dynptr_slice_rdwr: * * 1) The requested slice is in the head of the skb. In this case, the @@ -2545,6 +2689,61 @@ __bpf_kfunc void bpf_throw(u64 cookie) WARN(1, "A call to BPF exception callback should never return\n"); } +__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) +{ + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + struct bpf_map *map = p__map; + + BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq)); + BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq)); + + if (flags) + return -EINVAL; + + return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); +} + +__bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) +{ + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + struct bpf_work *w; + + if (in_nmi()) + return -EOPNOTSUPP; + if (flags) + return -EINVAL; + w = READ_ONCE(async->work); + if (!w || !READ_ONCE(w->cb.prog)) + return -EINVAL; + + schedule_work(&w->work); + return 0; +} + +__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + unsigned int flags, + void *aux__ign) +{ + struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign; + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + + if (flags) + return -EINVAL; + + return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); +} + +__bpf_kfunc void bpf_preempt_disable(void) +{ + preempt_disable(); +} + +__bpf_kfunc void bpf_preempt_enable(void) +{ + preempt_enable(); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -2621,6 +2820,12 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) +BTF_ID_FLAGS(func, bpf_modify_return_test_tp) +BTF_ID_FLAGS(func, bpf_wq_init) +BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) +BTF_ID_FLAGS(func, bpf_wq_start) +BTF_ID_FLAGS(func, bpf_preempt_disable) +BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { @@ -2648,6 +2853,7 @@ static int __init kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), THIS_MODULE); diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 2a243cf37c60..4bd8f17a9f24 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -467,9 +467,9 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) if (type & PTR_MAYBE_NULL) { if (base_type(type) == PTR_TO_BTF_ID) - strncpy(postfix, "or_null_", 16); + strscpy(postfix, "or_null_"); else - strncpy(postfix, "_or_null", 16); + strscpy(postfix, "_or_null"); } snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 050fe1ebf0f7..0218a5132ab5 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -155,16 +155,17 @@ static inline int extract_bit(const u8 *data, size_t index) } /** - * longest_prefix_match() - determine the longest prefix + * __longest_prefix_match() - determine the longest prefix * @trie: The trie to get internal sizes from * @node: The node to operate on * @key: The key to compare to @node * * Determine the longest prefix of @node that matches the bits in @key. */ -static size_t longest_prefix_match(const struct lpm_trie *trie, - const struct lpm_trie_node *node, - const struct bpf_lpm_trie_key_u8 *key) +static __always_inline +size_t __longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key_u8 *key) { u32 limit = min(node->prefixlen, key->prefixlen); u32 prefixlen = 0, i = 0; @@ -224,6 +225,13 @@ static size_t longest_prefix_match(const struct lpm_trie *trie, return prefixlen; } +static size_t longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key_u8 *key) +{ + return __longest_prefix_match(trie, node, key); +} + /* Called from syscall or from eBPF program */ static void *trie_lookup_elem(struct bpf_map *map, void *_key) { @@ -245,7 +253,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) * If it's the maximum possible prefix for this trie, we have * an exact match and can return it directly. */ - matchlen = longest_prefix_match(trie, node, key); + matchlen = __longest_prefix_match(trie, node, key); if (matchlen == trie->max_prefixlen) { found = node; break; @@ -308,6 +316,7 @@ static long trie_update_elem(struct bpf_map *map, { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node *free_node = NULL; struct lpm_trie_node __rcu **slot; struct bpf_lpm_trie_key_u8 *key = _key; unsigned long irq_flags; @@ -382,7 +391,7 @@ static long trie_update_elem(struct bpf_map *map, trie->n_entries--; rcu_assign_pointer(*slot, new_node); - kfree_rcu(node, rcu); + free_node = node; goto out; } @@ -429,6 +438,7 @@ out: } spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_node, rcu); return ret; } @@ -437,6 +447,7 @@ out: static long trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *free_node = NULL, *free_parent = NULL; struct bpf_lpm_trie_key_u8 *key = _key; struct lpm_trie_node __rcu **trim, **trim2; struct lpm_trie_node *node, *parent; @@ -506,8 +517,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) else rcu_assign_pointer( *trim2, rcu_access_pointer(parent->child[0])); - kfree_rcu(parent, rcu); - kfree_rcu(node, rcu); + free_parent = parent; + free_node = node; goto out; } @@ -521,10 +532,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1])); else RCU_INIT_POINTER(*trim, NULL); - kfree_rcu(node, rcu); + free_node = node; out: spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_parent, rcu); + kfree_rcu(free_node, rcu); return ret; } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 8ef269e66ba5..b4f18c85d7bc 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -32,7 +32,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. */ - if (inner_map->ops == &array_map_ops) + if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) inner_map_meta_size = sizeof(struct bpf_array); inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); @@ -68,7 +68,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; - if (inner_map->ops == &array_map_ops) { + if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) { struct bpf_array *inner_array_meta = container_of(inner_map_meta, struct bpf_array, map); struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map); diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 550f02e2cb13..a546aba46d5d 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -759,8 +759,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } - if (ma->objcg) - obj_cgroup_put(ma->objcg); + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } if (ma->caches) { @@ -776,8 +775,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } - if (ma->objcg) - obj_cgroup_put(ma->objcg); + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c287925471f6..2222c3ff88e7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -559,6 +559,7 @@ void btf_record_free(struct btf_record *rec) case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: + case BPF_WORKQUEUE: /* Nothing to release */ break; default: @@ -608,6 +609,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: + case BPF_WORKQUEUE: /* Nothing to acquire */ break; default: @@ -659,6 +661,13 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(obj + rec->timer_off); } +void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE))) + return; + bpf_wq_cancel_and_free(obj + rec->wq_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -679,6 +688,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); break; + case BPF_WORKQUEUE: + bpf_wq_cancel_and_free(field_ptr); + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -980,7 +992,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr if (map->ops->map_get_unmapped_area) return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); #ifdef CONFIG_MMU - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); #else return addr; #endif @@ -1085,7 +1097,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1115,6 +1127,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, } break; case BPF_TIMER: + case BPF_WORKQUEUE: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { @@ -3498,17 +3511,12 @@ out_put_prog: return err; } -struct bpf_raw_tp_link { - struct bpf_link link; - struct bpf_raw_event_map *btp; -}; - static void bpf_raw_tp_link_release(struct bpf_link *link) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); - bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); + bpf_probe_unregister(raw_tp->btp, raw_tp); bpf_put_raw_tracepoint(raw_tp->btp); } @@ -3808,7 +3816,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro #endif /* CONFIG_PERF_EVENTS */ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, - const char __user *user_tp_name) + const char __user *user_tp_name, u64 cookie) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; @@ -3855,6 +3863,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, &bpf_raw_tp_link_lops, prog); link->btp = btp; + link->cookie = cookie; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -3862,7 +3871,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, goto out_put_btp; } - err = bpf_probe_register(link->btp, prog); + err = bpf_probe_register(link->btp, link); if (err) { bpf_link_cleanup(&link_primer); goto out_put_btp; @@ -3875,11 +3884,13 @@ out_put_btp: return err; } -#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { struct bpf_prog *prog; + void __user *tp_name; + __u64 cookie; int fd; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) @@ -3889,7 +3900,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); - fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); + tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); + cookie = attr->raw_tracepoint.cookie; + fd = bpf_raw_tp_link_attach(prog, tp_name, cookie); if (fd < 0) bpf_prog_put(prog); return fd; @@ -3985,6 +3998,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, * check permissions at attach time. */ return -EPERM; + + ptype = attach_type_to_prog_type(attach_type); + if (prog->type != ptype) + return -EINVAL; + return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; @@ -4003,11 +4021,15 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_MULTI) return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION && + attach_type != BPF_TRACE_KPROBE_SESSION) + return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_KPROBE_SESSION && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; return 0; @@ -5227,7 +5249,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) goto out; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) - ret = bpf_raw_tp_link_attach(prog, NULL); + ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) @@ -5242,6 +5264,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_SK_SKB: + ret = sock_map_link_create(attr, prog); + break; #ifdef CONFIG_NET case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); @@ -5264,7 +5290,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type == BPF_PERF_EVENT) ret = bpf_perf_link_attach(attr, prog); - else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI) + else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || + attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) ret = bpf_kprobe_multi_link_attach(attr, prog); else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) ret = bpf_uprobe_multi_link_attach(attr, prog); @@ -6008,7 +6035,6 @@ static struct ctl_table bpf_syscall_table[] = { .mode = 0644, .proc_handler = bpf_stats_handler, }, - { } }; static int __init bpf_syscall_sysctl_init(void) diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index ef6911aee3bb..fedb54c94cdb 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -9,8 +9,8 @@ #include <linux/sysfs.h> /* See scripts/link-vmlinux.sh, gen_btf() func for details */ -extern char __weak __start_BTF[]; -extern char __weak __stop_BTF[]; +extern char __start_BTF[]; +extern char __stop_BTF[]; static ssize_t btf_vmlinux_read(struct file *file, struct kobject *kobj, @@ -32,7 +32,7 @@ static int __init btf_vmlinux_init(void) { bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; - if (!__start_BTF || bin_attr_btf_vmlinux.size == 0) + if (bin_attr_btf_vmlinux.size == 0) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 88673a4267eb..f8302a5ca400 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -456,7 +456,9 @@ again: if (err < 0) goto out_free; - arch_protect_bpf_trampoline(im->image, im->size); + err = arch_protect_bpf_trampoline(im->image, im->size); + if (err) + goto out_free; WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) @@ -883,12 +885,13 @@ static void notrace update_prog_stats(struct bpf_prog *prog, * Hence check that 'start' is valid. */ start > NO_START_TIME) { + u64 duration = sched_clock() - start; unsigned long flags; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, sched_clock() - start); + u64_stats_add(&stats->nsecs, duration); u64_stats_update_end_irqrestore(&stats->syncp, flags); } } @@ -1072,17 +1075,10 @@ void __weak arch_free_bpf_trampoline(void *image, unsigned int size) bpf_jit_free_exec(image); } -void __weak arch_protect_bpf_trampoline(void *image, unsigned int size) -{ - WARN_ON_ONCE(size > PAGE_SIZE); - set_memory_rox((long)image, 1); -} - -void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size) +int __weak arch_protect_bpf_trampoline(void *image, unsigned int size) { WARN_ON_ONCE(size > PAGE_SIZE); - set_memory_nx((long)image, 1); - set_memory_rw((long)image, 1); + return set_memory_rox((long)image, 1); } int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cb7ad1f795e1..77da1f438bec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -172,7 +172,7 @@ static bool bpf_global_percpu_ma_set; /* verifier_state + insn_idx are pushed to stack when branch is encountered */ struct bpf_verifier_stack_elem { - /* verifer state is 'st' + /* verifier state is 'st' * before processing instruction 'insn_idx' * and after processing instruction 'prev_insn_idx' */ @@ -190,11 +190,6 @@ struct bpf_verifier_stack_elem { #define BPF_MAP_KEY_POISON (1ULL << 63) #define BPF_MAP_KEY_SEEN (1ULL << 62) -#define BPF_MAP_PTR_UNPRIV 1UL -#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ - POISON_POINTER_DELTA)) -#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) - #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); @@ -209,21 +204,22 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { - return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON; + return aux->map_ptr_state.poison; } static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) { - return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV; + return aux->map_ptr_state.unpriv; } static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, - const struct bpf_map *map, bool unpriv) + struct bpf_map *map, + bool unpriv, bool poison) { - BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); unpriv |= bpf_map_ptr_unpriv(aux); - aux->map_ptr_state = (unsigned long)map | - (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); + aux->map_ptr_state.unpriv = unpriv; + aux->map_ptr_state.poison = poison; + aux->map_ptr_state.map_ptr = map; } static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux) @@ -336,6 +332,10 @@ struct bpf_kfunc_call_arg_meta { u8 spi; u8 frameno; } iter; + struct { + struct bpf_map *ptr; + int uid; + } map; u64 mem_size; }; @@ -501,8 +501,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) } static bool is_sync_callback_calling_kfunc(u32 btf_id); +static bool is_async_callback_calling_kfunc(u32 btf_id); +static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); +static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); + static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_for_each_map_elem || @@ -530,7 +534,8 @@ static bool is_sync_callback_calling_insn(struct bpf_insn *insn) static bool is_async_callback_calling_insn(struct bpf_insn *insn) { - return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm); + return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) || + (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm)); } static bool is_may_goto_insn(struct bpf_insn *insn) @@ -1429,6 +1434,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->active_rcu_lock = src->active_rcu_lock; + dst_state->active_preempt_lock = src->active_preempt_lock; + dst_state->in_sleepable = src->in_sleepable; dst_state->curframe = src->curframe; dst_state->active_lock.ptr = src->active_lock.ptr; dst_state->active_lock.id = src->active_lock.id; @@ -1842,6 +1849,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) */ if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) reg->map_uid = reg->id; + if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) + reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -2135,7 +2144,7 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg) static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) { /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit - * values on both sides of 64-bit range in hope to have tigher range. + * values on both sides of 64-bit range in hope to have tighter range. * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound @@ -2143,7 +2152,7 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. * We just need to make sure that derived bounds we are intersecting - * with are well-formed ranges in respecitve s64 or u64 domain, just + * with are well-formed ranges in respective s64 or u64 domain, just * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments. */ __u64 new_umin, new_umax; @@ -2359,6 +2368,8 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env, regs[regno].type = PTR_TO_BTF_ID | flag; regs[regno].btf = btf; regs[regno].btf_id = btf_id; + if (type_may_be_null(flag)) + regs[regno].id = ++env->id_gen; } #define DEF_NOT_SUBREG (0) @@ -2402,7 +2413,7 @@ static void init_func_state(struct bpf_verifier_env *env, /* Similar to push_stack(), but for async callbacks */ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, - int subprog) + int subprog, bool is_sleepable) { struct bpf_verifier_stack_elem *elem; struct bpf_func_state *frame; @@ -2429,6 +2440,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, * Initialize it similar to do_check_common(). */ elem->st.branches = 1; + elem->st.in_sleepable = is_sleepable; frame = kzalloc(sizeof(*frame), GFP_KERNEL); if (!frame) goto err; @@ -3615,7 +3627,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * sreg needs precision before this insn */ bt_clear_reg(bt, dreg); - bt_set_reg(bt, sreg); + if (sreg != BPF_REG_FP) + bt_set_reg(bt, sreg); } else { /* dreg = K * dreg needs precision after this insn. @@ -3631,7 +3644,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * both dreg and sreg need precision * before this insn */ - bt_set_reg(bt, sreg); + if (sreg != BPF_REG_FP) + bt_set_reg(bt, sreg); } /* else dreg += K * dreg still needs precision before this insn */ @@ -5274,7 +5288,8 @@ bad_type: static bool in_sleepable(struct bpf_verifier_env *env) { - return env->prog->sleepable; + return env->prog->sleepable || + (env->cur_state && env->cur_state->in_sleepable); } /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() @@ -5297,6 +5312,7 @@ BTF_ID(struct, cgroup) BTF_ID(struct, bpf_cpumask) #endif BTF_ID(struct, task_struct) +BTF_ID(struct, bpf_crypto_ctx) BTF_SET_END(rcu_protected_types) static bool rcu_protected_object(const struct btf *btf, u32 btf_id) @@ -5386,8 +5402,6 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); - /* For mark_ptr_or_null_reg */ - val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!register_is_null(val_reg) && @@ -5705,7 +5719,8 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg) return true; /* Types listed in the reg2btf_ids are always trusted */ - if (reg2btf_ids[base_type(reg->type)]) + if (reg2btf_ids[base_type(reg->type)] && + !bpf_type_has_unsafe_modifiers(reg->type)) return true; /* If a register is not referenced, it is trusted if it has the @@ -6325,6 +6340,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, #define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu) #define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null) #define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted) +#define BTF_TYPE_SAFE_TRUSTED_OR_NULL(__type) __PASTE(__type, __safe_trusted_or_null) /* * Allow list few fields as RCU trusted or full trusted. @@ -6388,7 +6404,7 @@ BTF_TYPE_SAFE_TRUSTED(struct dentry) { struct inode *d_inode; }; -BTF_TYPE_SAFE_TRUSTED(struct socket) { +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) { struct sock *sk; }; @@ -6423,11 +6439,20 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } +static bool type_is_trusted_or_null(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const char *field_name, u32 btf_id) +{ + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); + + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, + "__safe_trusted_or_null"); +} + static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, int regno, int off, int size, @@ -6536,6 +6561,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, */ if (type_is_trusted(env, reg, field_name, btf_id)) { flag |= PTR_TRUSTED; + } else if (type_is_trusted_or_null(env, reg, field_name, btf_id)) { + flag |= PTR_TRUSTED | PTR_MAYBE_NULL; } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) { if (type_is_rcu(env, reg, field_name, btf_id)) { /* ignore __rcu tag and mark it MEM_RCU */ @@ -6972,6 +6999,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; } +static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, + bool allow_trust_mismatch); + static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { int load_reg; @@ -7032,7 +7062,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || is_sk_reg(env, insn->dst_reg) || - is_arena_reg(env, insn->dst_reg)) { + (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); @@ -7068,6 +7098,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (err) return err; + if (is_arena_reg(env, insn->dst_reg)) { + err = save_aux_ptr_type(env, PTR_TO_ARENA, false); + if (err) + return err; + } /* Check whether we can write into the same memory. */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); @@ -7590,6 +7625,23 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, return 0; } +static int process_wq_func(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (map->record->wq_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n", + val + reg->off, map->record->wq_off); + return -EINVAL; + } + meta->map.uid = reg->map_uid; + meta->map.ptr = map; + return 0; +} + static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { @@ -9484,7 +9536,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins */ env->subprog_info[subprog].is_cb = true; if (bpf_pseudo_kfunc_call(insn) && - !is_sync_callback_calling_kfunc(insn->imm)) { + !is_callback_calling_kfunc(insn->imm)) { verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", func_id_name(insn->imm), insn->imm); return -EFAULT; @@ -9498,10 +9550,11 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins if (is_async_callback_calling_insn(insn)) { struct bpf_verifier_state *async_cb; - /* there is no real recursion here. timer callbacks are async */ + /* there is no real recursion here. timer and workqueue callbacks are async */ env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, - insn_idx, subprog); + insn_idx, subprog, + is_bpf_wq_set_callback_impl_kfunc(insn->imm)); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; @@ -9561,6 +9614,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } + /* Only global subprogs cannot be called with preemption disabled. */ + if (env->cur_state->active_preempt_lock) { + verbose(env, "global function calls are not allowed with preemption disabled,\n" + "use static function instead\n"); + return -EINVAL; + } + if (err) { verbose(env, "Caller passes invalid args into func#%d ('%s')\n", subprog, sub_name); @@ -9653,12 +9713,8 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, struct bpf_map *map; int err; - if (bpf_map_ptr_poisoned(insn_aux)) { - verbose(env, "tail_call abusing map_ptr\n"); - return -EINVAL; - } - - map = BPF_MAP_PTR(insn_aux->map_ptr_state); + /* valid map_ptr and poison value does not matter */ + map = insn_aux->map_ptr_state.map_ptr; if (!map->ops->map_set_for_each_callback_args || !map->ops->map_for_each_callback) { verbose(env, "callback function not allowed for map\n"); @@ -10017,12 +10073,12 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EACCES; } - if (!BPF_MAP_PTR(aux->map_ptr_state)) + if (!aux->map_ptr_state.map_ptr) + bpf_map_ptr_store(aux, meta->map_ptr, + !meta->map_ptr->bypass_spec_v1, false); + else if (aux->map_ptr_state.map_ptr != meta->map_ptr) bpf_map_ptr_store(aux, meta->map_ptr, - !meta->map_ptr->bypass_spec_v1); - else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) - bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - !meta->map_ptr->bypass_spec_v1); + !meta->map_ptr->bypass_spec_v1, true); return 0; } @@ -10201,8 +10257,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { - verbose(env, "unknown func %s#%d\n", func_id_name(func_id), - func_id); + verbose(env, "program of this type cannot use helper %s#%d\n", + func_id_name(func_id), func_id); return -EINVAL; } @@ -10251,6 +10307,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } + if (env->cur_state->active_preempt_lock) { + if (fn->might_sleep) { + verbose(env, "sleepable helper %s#%d in non-preemptible region\n", + func_id_name(func_id), func_id); + return -EINVAL; + } + + if (in_sleepable(env) && is_storage_get_function(func_id)) + env->insn_aux_data[insn_idx].storage_get_func_atomic = true; + } + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -10839,6 +10906,7 @@ enum { KF_ARG_LIST_NODE_ID, KF_ARG_RB_ROOT_ID, KF_ARG_RB_NODE_ID, + KF_ARG_WORKQUEUE_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -10847,6 +10915,7 @@ BTF_ID(struct, bpf_list_head) BTF_ID(struct, bpf_list_node) BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) +BTF_ID(struct, bpf_wq) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -10890,6 +10959,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); } +static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -10959,6 +11033,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_NULL, KF_ARG_PTR_TO_CONST_STR, KF_ARG_PTR_TO_MAP, + KF_ARG_PTR_TO_WORKQUEUE, }; enum special_kfunc_type { @@ -10984,7 +11059,11 @@ enum special_kfunc_type { KF_bpf_percpu_obj_new_impl, KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, + KF_bpf_wq_set_callback_impl, + KF_bpf_preempt_disable, + KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, + KF_bpf_session_cookie, }; BTF_SET_START(special_kfunc_set) @@ -11008,6 +11087,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +BTF_ID(func, bpf_wq_set_callback_impl) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #endif @@ -11036,11 +11116,15 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +BTF_ID(func, bpf_wq_set_callback_impl) +BTF_ID(func, bpf_preempt_disable) +BTF_ID(func, bpf_preempt_enable) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #else BTF_ID_UNUSED #endif +BTF_ID(func, bpf_session_cookie) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -11062,6 +11146,16 @@ static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta) return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock]; } +static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_preempt_disable]; +} + +static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable]; +} + static enum kfunc_ptr_arg_type get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, @@ -11115,6 +11209,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_map(meta->btf, &args[argno])) return KF_ARG_PTR_TO_MAP; + if (is_kfunc_arg_wq(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_WORKQUEUE; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -11366,12 +11463,28 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; } +static bool is_async_callback_calling_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; +} + static bool is_bpf_throw_kfunc(struct bpf_insn *insn) { return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && insn->imm == special_kfunc_list[KF_bpf_throw]; } +static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; +} + +static bool is_callback_calling_kfunc(u32 btf_id) +{ + return is_sync_callback_calling_kfunc(btf_id) || + is_async_callback_calling_kfunc(btf_id); +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id) { return is_bpf_rbtree_api_kfunc(btf_id); @@ -11716,6 +11829,34 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_NULL: continue; case KF_ARG_PTR_TO_MAP: + if (!reg->map_ptr) { + verbose(env, "pointer in R%d isn't map pointer\n", regno); + return -EINVAL; + } + if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { + /* Use map_uid (which is unique id of inner map) to reject: + * inner_map1 = bpf_map_lookup_elem(outer_map, key1) + * inner_map2 = bpf_map_lookup_elem(outer_map, key2) + * if (inner_map1 && inner_map2) { + * wq = bpf_map_lookup_elem(inner_map1); + * if (wq) + * // mismatch would have been allowed + * bpf_wq_init(wq, inner_map2); + * } + * + * Comparing map_ptr is enough to distinguish normal and outer maps. + */ + if (meta->map.ptr != reg->map_ptr || + meta->map.uid != reg->map_uid) { + verbose(env, + "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", + meta->map.uid, reg->map_uid); + return -EINVAL; + } + } + meta->map.ptr = reg->map_ptr; + meta->map.uid = reg->map_uid; + fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) @@ -11748,6 +11889,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_CALLBACK: case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: + case KF_ARG_PTR_TO_WORKQUEUE: /* Trusted by default */ break; default: @@ -12034,6 +12176,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret) return ret; break; + case KF_ARG_PTR_TO_WORKQUEUE: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_wq_func(env, regno, meta); + if (ret < 0) + return ret; + break; } } @@ -12093,11 +12244,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { - const struct btf_type *t, *ptr_type; + bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable; u32 i, nargs, ptr_type_id, release_ref_obj_id; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; - bool sleepable, rcu_lock, rcu_unlock; + const struct btf_type *t, *ptr_type; struct bpf_kfunc_call_arg_meta meta; struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; @@ -12145,9 +12296,27 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) { + meta.r0_size = sizeof(u64); + meta.r0_rdonly = false; + } + + if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_timer_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); + preempt_disable = is_kfunc_bpf_preempt_disable(&meta); + preempt_enable = is_kfunc_bpf_preempt_enable(&meta); + if (env->cur_state->active_rcu_lock) { struct bpf_func_state *state; struct bpf_reg_state *reg; @@ -12180,6 +12349,22 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } + if (env->cur_state->active_preempt_lock) { + if (preempt_disable) { + env->cur_state->active_preempt_lock++; + } else if (preempt_enable) { + env->cur_state->active_preempt_lock--; + } else if (sleepable) { + verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name); + return -EACCES; + } + } else if (preempt_disable) { + env->cur_state->active_preempt_lock++; + } else if (preempt_enable) { + verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name); + return -EINVAL; + } + /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ @@ -13318,7 +13503,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; u32 umax_val = src_reg->u32_max_value; if (src_known && dst_known) { @@ -13331,18 +13515,16 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, */ dst_reg->u32_min_value = var32_off.value; dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val); - if (dst_reg->s32_min_value < 0 || smin_val < 0) { - /* Lose signed bounds when ANDing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } else { - /* ANDing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; + } else { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; } } @@ -13351,7 +13533,6 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; u64 umax_val = src_reg->umax_value; if (src_known && dst_known) { @@ -13364,18 +13545,16 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, */ dst_reg->umin_value = dst_reg->var_off.value; dst_reg->umax_value = min(dst_reg->umax_value, umax_val); - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ANDing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ANDing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; + } else { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); @@ -13387,7 +13566,6 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; u32 umin_val = src_reg->u32_min_value; if (src_known && dst_known) { @@ -13400,18 +13578,16 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, */ dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val); dst_reg->u32_max_value = var32_off.value | var32_off.mask; - if (dst_reg->s32_min_value < 0 || smin_val < 0) { - /* Lose signed bounds when ORing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } else { - /* ORing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; + } else { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; } } @@ -13420,7 +13596,6 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; u64 umin_val = src_reg->umin_value; if (src_known && dst_known) { @@ -13433,18 +13608,16 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, */ dst_reg->umin_value = max(dst_reg->umin_value, umin_val); dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ORing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ORing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; + } else { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); @@ -13456,7 +13629,6 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); @@ -13467,10 +13639,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, dst_reg->u32_min_value = var32_off.value; dst_reg->u32_max_value = var32_off.value | var32_off.mask; - if (dst_reg->s32_min_value >= 0 && smin_val >= 0) { - /* XORing two positive sign numbers gives a positive, - * so safe to cast u32 result into s32. - */ + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; } else { @@ -13484,7 +13656,6 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; if (src_known && dst_known) { /* dst_reg->var_off.value has been updated earlier */ @@ -13496,10 +13667,10 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, dst_reg->umin_value = dst_reg->var_off.value; dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; - if (dst_reg->smin_value >= 0 && smin_val >= 0) { - /* XORing two positive sign numbers gives a positive, - * so safe to cast u64 result into s64. - */ + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; } else { @@ -13707,6 +13878,46 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } +static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, + const struct bpf_reg_state *src_reg) +{ + bool src_is_const = false; + u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + + if (insn_bitness == 32) { + if (tnum_subreg_is_const(src_reg->var_off) + && src_reg->s32_min_value == src_reg->s32_max_value + && src_reg->u32_min_value == src_reg->u32_max_value) + src_is_const = true; + } else { + if (tnum_is_const(src_reg->var_off) + && src_reg->smin_value == src_reg->smax_value + && src_reg->umin_value == src_reg->umax_value) + src_is_const = true; + } + + switch (BPF_OP(insn->code)) { + case BPF_ADD: + case BPF_SUB: + case BPF_AND: + case BPF_XOR: + case BPF_OR: + case BPF_MUL: + return true; + + /* Shift operators range is only computable if shift dimension operand + * is a constant. Shifts greater than 31 or 63 are undefined. This + * includes shifts by a negative number. + */ + case BPF_LSH: + case BPF_RSH: + case BPF_ARSH: + return (src_is_const && src_reg->umax_value < insn_bitness); + default: + return false; + } +} + /* WARNING: This function does calculations on 64-bit values, but the actual * execution may occur on 32-bit values. Therefore, things like bitshifts * need extra checks in the 32-bit case. @@ -13716,53 +13927,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg, struct bpf_reg_state src_reg) { - struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); - bool src_known; - s64 smin_val, smax_val; - u64 umin_val, umax_val; - s32 s32_min_val, s32_max_val; - u32 u32_min_val, u32_max_val; - u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); int ret; - smin_val = src_reg.smin_value; - smax_val = src_reg.smax_value; - umin_val = src_reg.umin_value; - umax_val = src_reg.umax_value; - - s32_min_val = src_reg.s32_min_value; - s32_max_val = src_reg.s32_max_value; - u32_min_val = src_reg.u32_min_value; - u32_max_val = src_reg.u32_max_value; - - if (alu32) { - src_known = tnum_subreg_is_const(src_reg.var_off); - if ((src_known && - (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) || - s32_min_val > s32_max_val || u32_min_val > u32_max_val) { - /* Taint dst register if offset had invalid bounds - * derived from e.g. dead branches. - */ - __mark_reg_unknown(env, dst_reg); - return 0; - } - } else { - src_known = tnum_is_const(src_reg.var_off); - if ((src_known && - (smin_val != smax_val || umin_val != umax_val)) || - smin_val > smax_val || umin_val > umax_val) { - /* Taint dst register if offset had invalid bounds - * derived from e.g. dead branches. - */ - __mark_reg_unknown(env, dst_reg); - return 0; - } - } - - if (!src_known && - opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { + if (!is_safe_to_compute_dst_reg_range(insn, &src_reg)) { __mark_reg_unknown(env, dst_reg); return 0; } @@ -13819,46 +13988,24 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_xor(dst_reg, &src_reg); break; case BPF_LSH: - if (umax_val >= insn_bitness) { - /* Shifts greater than 31 or 63 are undefined. - * This includes shifts by a negative number. - */ - mark_reg_unknown(env, regs, insn->dst_reg); - break; - } if (alu32) scalar32_min_max_lsh(dst_reg, &src_reg); else scalar_min_max_lsh(dst_reg, &src_reg); break; case BPF_RSH: - if (umax_val >= insn_bitness) { - /* Shifts greater than 31 or 63 are undefined. - * This includes shifts by a negative number. - */ - mark_reg_unknown(env, regs, insn->dst_reg); - break; - } if (alu32) scalar32_min_max_rsh(dst_reg, &src_reg); else scalar_min_max_rsh(dst_reg, &src_reg); break; case BPF_ARSH: - if (umax_val >= insn_bitness) { - /* Shifts greater than 31 or 63 are undefined. - * This includes shifts by a negative number. - */ - mark_reg_unknown(env, regs, insn->dst_reg); - break; - } if (alu32) scalar32_min_max_arsh(dst_reg, &src_reg); else scalar_min_max_arsh(dst_reg, &src_reg); break; default: - mark_reg_unknown(env, regs, insn->dst_reg); break; } @@ -14564,7 +14711,19 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state struct tnum t; u64 val; -again: + /* In case of GE/GT/SGE/JST, reuse LE/LT/SLE/SLT logic from below */ + switch (opcode) { + case BPF_JGE: + case BPF_JGT: + case BPF_JSGE: + case BPF_JSGT: + opcode = flip_opcode(opcode); + swap(reg1, reg2); + break; + default: + break; + } + switch (opcode) { case BPF_JEQ: if (is_jmp32) { @@ -14707,14 +14866,6 @@ again: reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value); } break; - case BPF_JGE: - case BPF_JGT: - case BPF_JSGE: - case BPF_JSGT: - /* just reuse LE/LT logic above */ - opcode = flip_opcode(opcode); - swap(reg1, reg2); - goto again; default: return; } @@ -14722,7 +14873,7 @@ again: /* Adjusts the register min/max values in the case that the dst_reg and * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K - * check, in which case we havea fake SCALAR_VALUE representing insn->imm). + * check, in which case we have a fake SCALAR_VALUE representing insn->imm). * Technically we can do similar adjustments for pointers to the same object, * but we don't support that right now. */ @@ -15337,6 +15488,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (env->cur_state->active_preempt_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n"); + return -EINVAL; + } + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -16904,6 +17060,12 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->active_rcu_lock != cur->active_rcu_lock) return false; + if (old->active_preempt_lock != cur->active_preempt_lock) + return false; + + if (old->in_sleepable != cur->in_sleepable) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -17360,7 +17522,7 @@ hit: err = propagate_liveness(env, &sl->state, cur); /* if previous state reached the exit with precision and - * current state is equivalent to it (except precsion marks) + * current state is equivalent to it (except precision marks) * the precision needs to be propagated back in * the current state. */ @@ -17538,7 +17700,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) } static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, - bool allow_trust_missmatch) + bool allow_trust_mismatch) { enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; @@ -17556,7 +17718,7 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ * src_reg == stack|map in some other branch. * Reject it. */ - if (allow_trust_missmatch && + if (allow_trust_mismatch && base_type(type) == PTR_TO_BTF_ID && base_type(*prev_type) == PTR_TO_BTF_ID) { /* @@ -17852,6 +18014,13 @@ process_bpf_exit_full: return -EINVAL; } + if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) { + verbose(env, "%d bpf_preempt_enable%s missing\n", + env->cur_state->active_preempt_lock, + env->cur_state->active_preempt_lock == 1 ? " is" : "(s) are"); + return -EINVAL; + } + /* We must do check_reference_leak here before * prepare_func_exit to handle the case when * state->curframe > 0, it may be a callback @@ -18149,6 +18318,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) { + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_wq yet\n"); + return -EINVAL; + } + } + if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); @@ -18343,6 +18519,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } if (env->used_map_cnt >= MAX_USED_MAPS) { + verbose(env, "The total number of maps per program has reached the limit of %u\n", + MAX_USED_MAPS); fdput(f); return -E2BIG; } @@ -18957,6 +19135,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; + } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && + env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { + insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); + env->prog->aux->num_exentries++; + continue; } else { continue; } @@ -19143,12 +19327,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) env->insn_aux_data[i].call_imm = insn->imm; /* point imm to __bpf_call_base+1 from JITs point of view */ insn->imm = 1; - if (bpf_pseudo_func(insn)) + if (bpf_pseudo_func(insn)) { +#if defined(MODULES_VADDR) + u64 addr = MODULES_VADDR; +#else + u64 addr = VMALLOC_START; +#endif /* jit (e.g. x86_64) may emit fewer instructions * if it learns a u32 imm is the same as a u64 imm. - * Force a non zero here. + * Set close enough to possible prog address. */ - insn[1].imm = 1; + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + } } err = bpf_prog_alloc_jited_linfo(prog); @@ -19180,6 +19371,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (bpf_prog_calc_tag(func[i])) goto out_free; func[i]->is_func = 1; + func[i]->sleepable = prog->sleepable; func[i]->aux->func_idx = i; /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; @@ -19220,6 +19412,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) BPF_CLASS(insn->code) == BPF_ST) && BPF_MODE(insn->code) == BPF_PROBE_MEM32) num_exentries++; + if (BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) + num_exentries++; } func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; @@ -19284,10 +19479,14 @@ static int jit_subprogs(struct bpf_verifier_env *env) * bpf_prog_load will add the kallsyms for the main program. */ for (i = 1; i < env->subprog_cnt; i++) { - bpf_prog_lock_ro(func[i]); - bpf_prog_kallsyms_add(func[i]); + err = bpf_prog_lock_ro(func[i]); + if (err) + goto out_free; } + for (i = 1; i < env->subprog_cnt; i++) + bpf_prog_kallsyms_add(func[i]); + /* Last step: make now unused interpreter insns from main * prog consistent for later dump requests, so they can * later look the same as if they were interpreted only. @@ -19547,6 +19746,13 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; + } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) { + struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) }; + + insn_buf[0] = ld_addrs[0]; + insn_buf[1] = ld_addrs[1]; + insn_buf[2] = *insn; + *cnt = 3; } return 0; } @@ -19819,6 +20025,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } + /* Skip inlining the helper call if the JIT does it. */ + if (bpf_jit_inlines_helper_call(insn->imm)) + goto next_insn; + if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) @@ -19852,7 +20062,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) !bpf_map_ptr_unpriv(aux)) { struct bpf_jit_poke_descriptor desc = { .reason = BPF_POKE_REASON_TAIL_CALL, - .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), + .tail_call.map = aux->map_ptr_state.map_ptr, .tail_call.key = bpf_map_key_immediate(aux), .insn_idx = i + delta, }; @@ -19881,7 +20091,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) return -EINVAL; } - map_ptr = BPF_MAP_PTR(aux->map_ptr_state); + map_ptr = aux->map_ptr_state.map_ptr; insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, @@ -19989,7 +20199,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; - map_ptr = BPF_MAP_PTR(aux->map_ptr_state); + map_ptr = aux->map_ptr_state.map_ptr; ops = map_ptr->ops; if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { @@ -20095,6 +20305,30 @@ patch_map_ops_generic: goto next_insn; } +#ifdef CONFIG_X86_64 + /* Implement bpf_get_smp_processor_id() inline. */ + if (insn->imm == BPF_FUNC_get_smp_processor_id && + prog->jit_requested && bpf_jit_supports_percpu_insn()) { + /* BPF_FUNC_get_smp_processor_id inlining is an + * optimization, so if pcpu_hot.cpu_number is ever + * changed in some incompatible and hard to support + * way, it's fine to back out this inlining logic + */ + insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); + insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } +#endif /* Implement bpf_get_func_arg inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_arg) { @@ -20178,6 +20412,62 @@ patch_map_ops_generic: goto next_insn; } + /* Implement bpf_get_branch_snapshot inline. */ + if (IS_ENABLED(CONFIG_PERF_EVENTS) && + prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_get_branch_snapshot) { + /* We are dealing with the following func protos: + * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags); + * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt); + */ + const u32 br_entry_size = sizeof(struct perf_branch_entry); + + /* struct perf_branch_entry is part of UAPI and is + * used as an array element, so extremely unlikely to + * ever grow or shrink + */ + BUILD_BUG_ON(br_entry_size != 24); + + /* if (unlikely(flags)) return -EINVAL */ + insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7); + + /* Transform size (bytes) into number of entries (cnt = size / 24). + * But to avoid expensive division instruction, we implement + * divide-by-3 through multiplication, followed by further + * division by 8 through 3-bit right shift. + * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr., + * p. 227, chapter "Unsigned Division by 3" for details and proofs. + * + * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab. + */ + insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab); + insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0); + insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36); + + /* call perf_snapshot_branch_stack implementation */ + insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack)); + /* if (entry_cnt == 0) return -ENOENT */ + insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4); + /* return entry_cnt * sizeof(struct perf_branch_entry) */ + insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size); + insn_buf[7] = BPF_JMP_A(3); + /* return -EINVAL; */ + insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); + insn_buf[9] = BPF_JMP_A(1); + /* return -ENOENT; */ + insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT); + cnt = 11; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + /* Implement bpf_kptr_xchg inline */ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_kptr_xchg && diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 520a11cb12f4..b9dbf6bf2779 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1335,6 +1335,7 @@ static int __init cgroup_no_v1(char *str) continue; cgroup_no_v1_mask |= 1 << i; + break; } } return 1; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a66c088c851c..e32b6972c478 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5368,7 +5368,8 @@ static void css_free_rwork_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup1_pidlist_destroy_all(cgrp); + if (!cgroup_on_dfl(cgrp)) + cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); bpf_cgrp_storage_free(cgrp); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index da24187c4e02..c12b9fdb22a4 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -202,6 +202,14 @@ struct cpuset { }; /* + * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously + */ +struct cpuset_remove_tasks_struct { + struct work_struct work; + struct cpuset *cs; +}; + +/* * Exclusive CPUs distributed out to sub-partitions of top_cpuset */ static cpumask_var_t subpartitions_cpus; @@ -360,9 +368,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) } static struct cpuset top_cpuset = { - .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | - (1 << CS_MEM_EXCLUSIVE)), + .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, + .relax_domain_level = -1, .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; @@ -449,12 +458,6 @@ static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; -/* - * CPU / memory hotplug is handled asynchronously. - */ -static void cpuset_hotplug_workfn(struct work_struct *work); -static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); - static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); static inline void check_insane_mems_config(nodemask_t *nodes) @@ -540,22 +543,10 @@ static void guarantee_online_cpus(struct task_struct *tsk, rcu_read_lock(); cs = task_cs(tsk); - while (!cpumask_intersects(cs->effective_cpus, pmask)) { + while (!cpumask_intersects(cs->effective_cpus, pmask)) cs = parent_cs(cs); - if (unlikely(!cs)) { - /* - * The top cpuset doesn't have any online cpu as a - * consequence of a race between cpuset_hotplug_work - * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to be - * identical to cpu_online_mask. - */ - goto out_unlock; - } - } - cpumask_and(pmask, pmask, cs->effective_cpus); -out_unlock: + cpumask_and(pmask, pmask, cs->effective_cpus); rcu_read_unlock(); } @@ -1217,7 +1208,7 @@ static void rebuild_sched_domains_locked(void) /* * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. + * Anyways, cpuset_handle_hotplug() will rebuild sched domains. * * With no CPUs in any subpartitions, top_cpuset's effective CPUs * should be the same as the active CPUs, so checking only top_cpuset @@ -1260,12 +1251,17 @@ static void rebuild_sched_domains_locked(void) } #endif /* CONFIG_SMP */ -void rebuild_sched_domains(void) +static void rebuild_sched_domains_cpuslocked(void) { - cpus_read_lock(); mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); +} + +void rebuild_sched_domains(void) +{ + cpus_read_lock(); + rebuild_sched_domains_cpuslocked(); cpus_read_unlock(); } @@ -2079,14 +2075,11 @@ write_error: /* * For partcmd_update without newmask, it is being called from - * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. - * Update the load balance flag and scheduling domain if - * cpus_read_trylock() is successful. + * cpuset_handle_hotplug(). Update the load balance flag and + * scheduling domain accordingly. */ - if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { + if ((cmd == partcmd_update) && !newmask) update_partition_sd_lb(cs, old_prs); - cpus_read_unlock(); - } notify_partition_change(cs, old_prs); return 0; @@ -3599,8 +3592,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. * - * cpuset_hotplug_work calls back into cgroup core via - * cgroup_transfer_tasks() and waiting for it from a cgroupfs + * cpuset_handle_hotplug may call back into cgroup core asynchronously + * via cgroup_transfer_tasks() and waiting for it from a cgroupfs * operation like this one can lead to a deadlock through kernfs * active_ref protection. Let's break the protection. Losing the * protection is okay as we check whether @cs is online after @@ -3609,7 +3602,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, */ css_get(&cs->css); kernfs_break_active_protection(of->kn); - flush_work(&cpuset_hotplug_work); cpus_read_lock(); mutex_lock(&cpuset_mutex); @@ -3782,9 +3774,6 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, buf = strstrip(buf); - /* - * Convert "root" to ENABLED, and convert "member" to DISABLED. - */ if (!strcmp(buf, "root")) val = PRS_ROOT; else if (!strcmp(buf, "member")) @@ -4060,11 +4049,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->effective_mems; cs->use_parent_ecpus = true; parent->child_ecpus_count++; - /* - * Clear CS_SCHED_LOAD_BALANCE if parent is isolated - */ - if (!is_sched_load_balance(parent)) - clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } /* @@ -4318,8 +4302,6 @@ int __init cpuset_init(void) nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); - set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); - top_cpuset.relax_domain_level = -1; INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); @@ -4354,6 +4336,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } } +static void cpuset_migrate_tasks_workfn(struct work_struct *work) +{ + struct cpuset_remove_tasks_struct *s; + + s = container_of(work, struct cpuset_remove_tasks_struct, work); + remove_tasks_in_empty_cpuset(s->cs); + css_put(&s->cs->css); + kfree(s); +} + static void hotplug_update_tasks_legacy(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, @@ -4383,12 +4375,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs, /* * Move tasks to the nearest ancestor with execution resources, * This is full cgroup operation which will also call back into - * cpuset. Should be done outside any lock. + * cpuset. Execute it asynchronously using workqueue. */ - if (is_empty) { - mutex_unlock(&cpuset_mutex); - remove_tasks_in_empty_cpuset(cs); - mutex_lock(&cpuset_mutex); + if (is_empty && cs->css.cgroup->nr_populated_csets && + css_tryget_online(&cs->css)) { + struct cpuset_remove_tasks_struct *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (WARN_ON_ONCE(!s)) { + css_put(&cs->css); + return; + } + + s->cs = cs; + INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); + schedule_work(&s->work); } } @@ -4421,30 +4422,6 @@ void cpuset_force_rebuild(void) force_rebuild = true; } -/* - * Attempt to acquire a cpus_read_lock while a hotplug operation may be in - * progress. - * Return: true if successful, false otherwise - * - * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. - */ -static bool cpuset_hotplug_cpus_read_trylock(void) -{ - int retries = 0; - - while (!cpus_read_trylock()) { - /* - * CPU hotplug still in progress. Retry 5 times - * with a 10ms wait before bailing out. - */ - if (++retries > 5) - return false; - msleep(10); - } - return true; -} - /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest @@ -4493,13 +4470,11 @@ retry: compute_partition_effective_cpumask(cs, &new_cpus); if (remote && cpumask_empty(&new_cpus) && - partition_is_populated(cs, NULL) && - cpuset_hotplug_cpus_read_trylock()) { + partition_is_populated(cs, NULL)) { remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; cpuset_force_rebuild(); - cpus_read_unlock(); } /* @@ -4519,18 +4494,8 @@ retry: else if (is_partition_valid(parent) && is_partition_invalid(cs)) partcmd = partcmd_update; - /* - * cpus_read_lock needs to be held before calling - * update_parent_effective_cpumask(). To avoid circular lock - * dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. - */ if (partcmd >= 0) { - if (!cpuset_hotplug_cpus_read_trylock()) - goto update_tasks; - update_parent_effective_cpumask(cs, partcmd, NULL, tmp); - cpus_read_unlock(); if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); @@ -4558,8 +4523,7 @@ unlock: } /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset - * @work: unused + * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -4573,8 +4537,10 @@ unlock: * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. + * + * CPU / memory hotplug is handled synchronously. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_handle_hotplug(void) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -4585,6 +4551,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; + lockdep_assert_cpus_held(); mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ @@ -4666,7 +4633,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* rebuild sched domains if cpus_allowed has changed */ if (cpus_updated || force_rebuild) { force_rebuild = false; - rebuild_sched_domains(); + rebuild_sched_domains_cpuslocked(); } free_cpumasks(NULL, ptmp); @@ -4679,12 +4646,7 @@ void cpuset_update_active_cpus(void) * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. */ - schedule_work(&cpuset_hotplug_work); -} - -void cpuset_wait_for_hotplug(void) -{ - flush_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); } /* @@ -4695,7 +4657,7 @@ void cpuset_wait_for_hotplug(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - schedule_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); return NOTIFY_OK; } diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 66d1708042a7..074653f964c1 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -106,8 +106,7 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) * @css: css being created * * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. + * parent's freezing state while holding cpus read lock and freezer_mutex. */ static int freezer_css_online(struct cgroup_subsys_state *css) { @@ -133,7 +132,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) * freezer_css_offline - initiate destruction of a freezer css * @css: css being destroyed * - * @css is going away. Mark it dead and decrement system_freezing_count if + * @css is going away. Mark it dead and decrement freezer_active if * it was holding one. */ static void freezer_css_offline(struct cgroup_subsys_state *css) diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 7695e60bcb40..0e5ec7d59b4d 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -75,9 +75,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent) if (!pids) return ERR_PTR(-ENOMEM); - atomic64_set(&pids->counter, 0); atomic64_set(&pids->limit, PIDS_MAX); - atomic64_set(&pids->events_limit, 0); return &pids->css; } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 07e2284bb499..fb8b49437573 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -7,6 +7,8 @@ #include <linux/btf.h> #include <linux/btf_ids.h> +#include <trace/events/cgroup.h> + static DEFINE_SPINLOCK(cgroup_rstat_lock); static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); @@ -17,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) return per_cpu_ptr(cgrp->rstat_cpu, cpu); } +/* + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock). + * + * This makes it easier to diagnose locking issues and contention in + * production environments. The parameter @fast_path determine the + * tracepoints being added, allowing us to diagnose "flush" related + * operations without handling high-frequency fast-path "update" events. + */ +static __always_inline +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup *cgrp, const bool fast_path) +{ + unsigned long flags; + bool contended; + + /* + * The _irqsave() is needed because cgroup_rstat_lock is + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring + * this lock with the _irq() suffix only disables interrupts on + * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables + * interrupts on both configurations. The _irqsave() ensures + * that interrupts are always disabled and later restored. + */ + contended = !raw_spin_trylock_irqsave(cpu_lock, flags); + if (contended) { + if (fast_path) + trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended); + else + trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended); + + raw_spin_lock_irqsave(cpu_lock, flags); + } + + if (fast_path) + trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended); + else + trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended); + + return flags; +} + +static __always_inline +void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup *cgrp, unsigned long flags, + const bool fast_path) +{ + if (fast_path) + trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); + else + trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} + /** * cgroup_rstat_updated - keep track of updated rstat_cpu * @cgrp: target cgroup @@ -42,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) return; - raw_spin_lock_irqsave(cpu_lock, flags); + flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true); /* put @cgrp and all ancestors on the corresponding updated lists */ while (true) { @@ -70,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) cgrp = parent; } - raw_spin_unlock_irqrestore(cpu_lock, flags); + _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true); } /** @@ -151,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) struct cgroup *head = NULL, *parent, *child; unsigned long flags; - /* - * The _irqsave() is needed because cgroup_rstat_lock is - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring - * this lock with the _irq() suffix only disables interrupts on - * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables - * interrupts on both configurations. The _irqsave() ensures - * that interrupts are always disabled and later restored. - */ - raw_spin_lock_irqsave(cpu_lock, flags); + flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) @@ -196,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) if (child != root) head = cgroup_rstat_push_children(head, child, cpu); unlock_ret: - raw_spin_unlock_irqrestore(cpu_lock, flags); + _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false); return head; } @@ -222,6 +270,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, __bpf_hook_end(); +/* + * Helper functions for locking cgroup_rstat_lock. + * + * This makes it easier to diagnose locking issues and contention in + * production environments. The parameter @cpu_in_loop indicate lock + * was released and re-taken when collection data from the CPUs. The + * value -1 is used when obtaining the main lock else this is the CPU + * number processed last. + */ +static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop) + __acquires(&cgroup_rstat_lock) +{ + bool contended; + + contended = !spin_trylock_irq(&cgroup_rstat_lock); + if (contended) { + trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); + spin_lock_irq(&cgroup_rstat_lock); + } + trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); +} + +static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) + __releases(&cgroup_rstat_lock) +{ + trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); + spin_unlock_irq(&cgroup_rstat_lock); +} + /* see cgroup_rstat_flush() */ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) @@ -248,10 +325,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) /* play nice and yield if necessary */ if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, cpu); if (!cond_resched()) cpu_relax(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, cpu); } } } @@ -273,9 +350,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) { might_sleep(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, -1); cgroup_rstat_flush_locked(cgrp); - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, -1); } /** @@ -291,17 +368,18 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp) __acquires(&cgroup_rstat_lock) { might_sleep(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, -1); cgroup_rstat_flush_locked(cgrp); } /** * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() + * @cgrp: cgroup used by tracepoint */ -void cgroup_rstat_flush_release(void) +void cgroup_rstat_flush_release(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) { - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, -1); } int cgroup_rstat_init(struct cgroup *cgrp) @@ -533,7 +611,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) #ifdef CONFIG_SCHED_CORE forceidle_time = cgrp->bstat.forceidle_sum; #endif - cgroup_rstat_flush_release(); + cgroup_rstat_flush_release(cgrp); } else { root_cgroup_cputime(&bstat); usage = bstat.cputime.sum_exec_runtime; diff --git a/kernel/cpu.c b/kernel/cpu.c index 63447eb85dab..563877d6c28b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1208,52 +1208,6 @@ void __init cpuhp_threads_init(void) kthread_unpark(this_cpu_read(cpuhp_state.thread)); } -/* - * - * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock - * protected region. - * - * The operation is still serialized against concurrent CPU hotplug via - * cpu_add_remove_lock, i.e. CPU map protection. But it is _not_ - * serialized against other hotplug related activity like adding or - * removing of state callbacks and state instances, which invoke either the - * startup or the teardown callback of the affected state. - * - * This is required for subsystems which are unfixable vs. CPU hotplug and - * evade lock inversion problems by scheduling work which has to be - * completed _before_ cpu_up()/_cpu_down() returns. - * - * Don't even think about adding anything to this for any new code or even - * drivers. It's only purpose is to keep existing lock order trainwrecks - * working. - * - * For cpu_down() there might be valid reasons to finish cleanups which are - * not required to be done under cpu_hotplug_lock, but that's a different - * story and would be not invoked via this. - */ -static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen) -{ - /* - * cpusets delegate hotplug operations to a worker to "solve" the - * lock order problems. Wait for the worker, but only if tasks are - * _not_ frozen (suspend, hibernate) as that would wait forever. - * - * The wait is required because otherwise the hotplug operation - * returns with inconsistent state, which could even be observed in - * user space when a new CPU is brought up. The CPU plug uevent - * would be delivered and user space reacting on it would fail to - * move tasks to the newly plugged CPU up to the point where the - * work has finished because up to that point the newly plugged CPU - * is not assignable in cpusets/cgroups. On unplug that's not - * necessarily a visible issue, but it is still inconsistent state, - * which is the real problem which needs to be "fixed". This can't - * prevent the transient state between scheduling the work and - * returning from waiting for it. - */ - if (!tasks_frozen) - cpuset_wait_for_hotplug(); -} - #ifdef CONFIG_HOTPLUG_CPU #ifndef arch_clear_mm_cpumask_cpu #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm)) @@ -1494,7 +1448,6 @@ out: */ lockup_detector_cleanup(); arch_smt_update(); - cpu_up_down_serialize_trainwrecks(tasks_frozen); return ret; } @@ -1728,7 +1681,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) out: cpus_write_unlock(); arch_smt_update(); - cpu_up_down_serialize_trainwrecks(tasks_frozen); return ret; } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 78b5dc7cee3a..394db3ebe835 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -493,10 +493,10 @@ static DEFINE_MUTEX(__crash_hotplug_lock); /* * This routine utilized when the crash_hotplug sysfs node is read. - * It reflects the kernel's ability/permission to update the crash - * elfcorehdr directly. + * It reflects the kernel's ability/permission to update the kdump + * image directly. */ -int crash_check_update_elfcorehdr(void) +int crash_check_hotplug_support(void) { int rc = 0; @@ -508,10 +508,7 @@ int crash_check_update_elfcorehdr(void) return 0; } if (kexec_crash_image) { - if (kexec_crash_image->file_mode) - rc = 1; - else - rc = kexec_crash_image->update_elfcorehdr; + rc = kexec_crash_image->hotplug_support; } /* Release lock now that update complete */ kexec_unlock(); @@ -534,7 +531,7 @@ int crash_check_update_elfcorehdr(void) * list of segments it checks (since the elfcorehdr changes and thus * would require an update to purgatory itself to update the digest). */ -static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) +static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu, void *arg) { struct kimage *image; @@ -552,8 +549,8 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) image = kexec_crash_image; - /* Check that updating elfcorehdr is permitted */ - if (!(image->file_mode || image->update_elfcorehdr)) + /* Check that kexec segments update is permitted */ + if (!image->hotplug_support) goto out; if (hp_action == KEXEC_CRASH_HP_ADD_CPU || @@ -596,7 +593,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) image->hp_action = hp_action; /* Now invoke arch-specific update handler */ - arch_crash_handle_hotplug_event(image); + arch_crash_handle_hotplug_event(image, arg); /* No longer handling a hotplug event */ image->hp_action = KEXEC_CRASH_HP_NONE; @@ -612,17 +609,17 @@ out: crash_hotplug_unlock(); } -static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v) +static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *arg) { switch (val) { case MEM_ONLINE: crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY, - KEXEC_CRASH_HP_INVALID_CPU); + KEXEC_CRASH_HP_INVALID_CPU, arg); break; case MEM_OFFLINE: crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY, - KEXEC_CRASH_HP_INVALID_CPU); + KEXEC_CRASH_HP_INVALID_CPU, arg); break; } return NOTIFY_OK; @@ -635,13 +632,13 @@ static struct notifier_block crash_memhp_nb = { static int crash_cpuhp_online(unsigned int cpu) { - crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu); + crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu, NULL); return 0; } static int crash_cpuhp_offline(unsigned int cpu) { - crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu); + crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu, NULL); return 0; } diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 6f0c358e73d8..e039b0f99a0b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -74,7 +74,6 @@ static struct ctl_table kern_delayacct_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; static __init int kernel_delayacct_sysctls_init(void) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 58db8fd70471..5e2d51e1cdf6 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -570,9 +570,9 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size, size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); - if (!ops->alloc_pages) + if (!ops->alloc_pages_op) return NULL; - return ops->alloc_pages(dev, size, dma_handle, dir, gfp); + return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp); } struct page *dma_alloc_pages(struct device *dev, size_t size, diff --git a/kernel/events/core.c b/kernel/events/core.c index 6b0a66ed2ae3..f0128c5ff278 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7550,7 +7550,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) { u64 size = 0; -#ifdef CONFIG_HAVE_FAST_GUP +#ifdef CONFIG_HAVE_GUP_FAST pgd_t *pgdp, pgd; p4d_t *p4dp, p4d; pud_t *pudp, pud; @@ -7598,7 +7598,7 @@ again: if (pte_present(pte)) size = pte_leaf_size(pte); pte_unmap(ptep); -#endif /* CONFIG_HAVE_FAST_GUP */ +#endif /* CONFIG_HAVE_GUP_FAST */ return size; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e4834d23e1d1..2c83ba776fc7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -18,7 +18,7 @@ #include <linux/sched/coredump.h> #include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ -#include <linux/mmu_notifier.h> /* set_pte_at_notify */ +#include <linux/mmu_notifier.h> #include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT; */ #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) -static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ +static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ @@ -195,8 +195,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); ptep_clear_flush(vma, addr, pvmw.pte); if (new_page) - set_pte_at_notify(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); + set_pte_at(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); folio_remove_rmap_pte(old_folio, old_page, vma); if (!folio_mapped(old_folio)) @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); uprobe = __find_uprobe(inode, offset); - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); return uprobe; } @@ -701,9 +701,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; - spin_lock(&uprobes_treelock); + write_lock(&uprobes_treelock); u = __insert_uprobe(uprobe); - spin_unlock(&uprobes_treelock); + write_unlock(&uprobes_treelock); return u; } @@ -935,9 +935,9 @@ static void delete_uprobe(struct uprobe *uprobe) if (WARN_ON(!uprobe_is_active(uprobe))) return; - spin_lock(&uprobes_treelock); + write_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); - spin_unlock(&uprobes_treelock); + write_unlock(&uprobes_treelock); RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ put_uprobe(uprobe); } @@ -1298,7 +1298,7 @@ static void build_probe_list(struct inode *inode, min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { @@ -1316,7 +1316,7 @@ static void build_probe_list(struct inode *inode, get_uprobe(u); } } - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); } /* @vma contains reference counter, not the probed instruction. */ @@ -1407,9 +1407,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); return !!n; } diff --git a/kernel/exit.c b/kernel/exit.c index 41a12630cbbc..cd3aa9042f1a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -94,7 +94,6 @@ static struct ctl_table kern_exit_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, - { } }; static __init int kernel_exit_sysctls_init(void) diff --git a/kernel/fork.c b/kernel/fork.c index aebb3e6c96dc..99076dbe27d8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1344,7 +1344,7 @@ static inline void __mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); - mm_put_huge_zero_page(mm); + mm_put_huge_zero_folio(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 1e78ef24321e..06a1f091be81 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -1150,7 +1150,7 @@ static int __init futex_init(void) unsigned int futex_shift; unsigned long i; -#if CONFIG_BASE_SMALL +#ifdef CONFIG_BASE_SMALL futex_hashsize = 16; #else futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b2fc2727d654..1d92016b0b3c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -314,7 +314,6 @@ static struct ctl_table hung_task_sysctls[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_NEG_ONE, }, - {} }; static void __init hung_task_sysctl_init(void) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 18edd57b5fe8..22ea19a36e6e 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -325,12 +325,6 @@ static unsigned long get_symbol_pos(unsigned long addr, unsigned long symbol_start = 0, symbol_end = 0; unsigned long i, low, high, mid; - /* This kernel should never had been booted. */ - if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) - BUG_ON(!kallsyms_addresses); - else - BUG_ON(!kallsyms_offsets); - /* Do a binary search on the sorted kallsyms_addresses array. */ low = 0; high = kallsyms_num_syms; diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h index 27fabdcc40f5..85480274fc8f 100644 --- a/kernel/kallsyms_internal.h +++ b/kernel/kallsyms_internal.h @@ -5,27 +5,21 @@ #include <linux/types.h> /* - * These will be re-linked against their real values - * during the second link stage. + * These will be re-linked against their real values during the second link + * stage. Preliminary values must be provided in the linker script using the + * PROVIDE() directive so that the first link stage can complete successfully. */ -extern const unsigned long kallsyms_addresses[] __weak; -extern const int kallsyms_offsets[] __weak; -extern const u8 kallsyms_names[] __weak; +extern const unsigned long kallsyms_addresses[]; +extern const int kallsyms_offsets[]; +extern const u8 kallsyms_names[]; -/* - * Tell the compiler that the count isn't in the small data section if the arch - * has one (eg: FRV). - */ -extern const unsigned int kallsyms_num_syms -__section(".rodata") __attribute__((weak)); - -extern const unsigned long kallsyms_relative_base -__section(".rodata") __attribute__((weak)); +extern const unsigned int kallsyms_num_syms; +extern const unsigned long kallsyms_relative_base; -extern const char kallsyms_token_table[] __weak; -extern const u16 kallsyms_token_index[] __weak; +extern const char kallsyms_token_table[]; +extern const u16 kallsyms_token_index[]; -extern const unsigned int kallsyms_markers[] __weak; -extern const u8 kallsyms_seqs_of_names[] __weak; +extern const unsigned int kallsyms_markers[]; +extern const u8 kallsyms_seqs_of_names[]; #endif // LINUX_KALLSYMS_INTERNAL_H_ diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index 8a689b4ff4f9..2f84896a7bcb 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -82,7 +82,7 @@ static struct test_item test_items[] = { ITEM_FUNC(kallsyms_test_func_static), ITEM_FUNC(kallsyms_test_func), ITEM_FUNC(kallsyms_test_func_weak), - ITEM_FUNC(vmalloc), + ITEM_FUNC(vmalloc_noprof), ITEM_FUNC(vfree), #ifdef CONFIG_KALLSYMS_ALL ITEM_DATA(kallsyms_test_var_bss_static), diff --git a/kernel/kexec.c b/kernel/kexec.c index bab542fc1463..a6b3f96bb50c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -135,8 +135,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, image->preserve_context = 1; #ifdef CONFIG_CRASH_HOTPLUG - if (flags & KEXEC_UPDATE_ELFCOREHDR) - image->update_elfcorehdr = 1; + if ((flags & KEXEC_ON_CRASH) && arch_crash_hotplug_support(image, flags)) + image->hotplug_support = 1; #endif ret = machine_kexec_prepare(image); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 0e96f6b24344..9112d69d68b0 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -948,7 +948,6 @@ static struct ctl_table kexec_core_sysctls[] = { .mode = 0644, .proc_handler = kexec_limit_handler, }, - { } }; static int __init kexec_core_sysctl_init(void) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 2d1db05fbf04..3d64290d24c9 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -376,6 +376,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; +#ifdef CONFIG_CRASH_HOTPLUG + if ((flags & KEXEC_FILE_ON_CRASH) && arch_crash_hotplug_support(image, flags)) + image->hotplug_support = 1; +#endif + ret = machine_kexec_prepare(image); if (ret) goto out; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 65adc815fc6e..6a76a8100073 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -26,7 +26,6 @@ #include <linux/slab.h> #include <linux/stddef.h> #include <linux/export.h> -#include <linux/moduleloader.h> #include <linux/kallsyms.h> #include <linux/freezer.h> #include <linux/seq_file.h> @@ -39,6 +38,7 @@ #include <linux/jump_label.h> #include <linux/static_call.h> #include <linux/perf_event.h> +#include <linux/execmem.h> #include <asm/sections.h> #include <asm/cacheflush.h> @@ -113,17 +113,17 @@ enum kprobe_slot_state { void __weak *alloc_insn_page(void) { /* - * Use module_alloc() so this page is within +/- 2GB of where the + * Use execmem_alloc() so this page is within +/- 2GB of where the * kernel image and loaded module images reside. This is required * for most of the architectures. * (e.g. x86-64 needs this to handle the %rip-relative fixups.) */ - return module_alloc(PAGE_SIZE); + return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); } static void free_insn_page(void *page) { - module_memfree(page); + execmem_free(page); } struct kprobe_insn_cache kprobe_insn_slots = { @@ -968,7 +968,6 @@ static struct ctl_table kprobe_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static void __init kprobe_sysctls_init(void) @@ -1068,6 +1067,7 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; +bool kprobe_ftrace_disabled; static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) @@ -1136,6 +1136,11 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops, ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } + +void kprobe_ftrace_kill(void) +{ + kprobe_ftrace_disabled = true; +} #else /* !CONFIG_KPROBES_ON_FTRACE */ static inline int arm_kprobe_ftrace(struct kprobe *p) { @@ -1588,7 +1593,7 @@ static int check_kprobe_address_safe(struct kprobe *p, } /* Get module refcount and reject __init functions for loaded modules. */ - if (*probed_mod) { + if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) { /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. @@ -1603,12 +1608,13 @@ static int check_kprobe_address_safe(struct kprobe *p, * kprobes in there. */ if (within_module_init((unsigned long)p->addr, *probed_mod) && - (*probed_mod)->state != MODULE_STATE_COMING) { + !module_is_coming(*probed_mod)) { module_put(*probed_mod); *probed_mod = NULL; ret = -ENOENT; } } + out: preempt_enable(); jump_label_unlock(); @@ -2488,24 +2494,6 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } -/* Remove all symbols in given area from kprobe blacklist */ -static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) -{ - struct kprobe_blacklist_entry *ent, *n; - - list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { - if (ent->start_addr < start || ent->start_addr >= end) - continue; - list_del(&ent->list); - kfree(ent); - } -} - -static void kprobe_remove_ksym_blacklist(unsigned long entry) -{ - kprobe_remove_area_blacklist(entry, entry + 1); -} - int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, char *type, char *sym) { @@ -2570,6 +2558,25 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return ret ? : arch_populate_kprobe_blacklist(); } +#ifdef CONFIG_MODULES +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(&ent->list); + kfree(ent); + } +} + +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + static void add_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; @@ -2672,6 +2679,17 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; +static int kprobe_register_module_notifier(void) +{ + return register_module_notifier(&kprobe_module_nb); +} +#else +static int kprobe_register_module_notifier(void) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + void kprobe_free_init_mem(void) { void *start = (void *)(&__init_begin); @@ -2731,7 +2749,7 @@ static int __init init_kprobes(void) if (!err) err = register_die_notifier(&kprobe_exceptions_nb); if (!err) - err = register_module_notifier(&kprobe_module_nb); + err = kprobe_register_module_notifier(); kprobes_initialized = (err == 0); kprobe_sysctls_init(); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 495b69a71a5d..07fb5987b42b 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -228,8 +228,8 @@ KERNEL_ATTR_RW(rcu_normal); /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ -extern const void __start_notes __weak; -extern const void __stop_notes __weak; +extern const void __start_notes; +extern const void __stop_notes; #define notes_size (&__stop_notes - &__start_notes) static ssize_t notes_read(struct file *filp, struct kobject *kobj, diff --git a/kernel/kthread.c b/kernel/kthread.c index c5e40830c1f2..f7be976ff88a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -315,6 +315,7 @@ void __noreturn kthread_exit(long result) kthread->result = result; do_exit(0); } +EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 781249098cb6..84c53285f499 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -85,7 +85,6 @@ static struct ctl_table latencytop_sysctl[] = { .mode = 0644, .proc_handler = sysctl_latencytop, }, - {} }; #endif diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index ecbc9b6aba3a..52426665eecc 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -973,7 +973,7 @@ static int __klp_disable_patch(struct klp_patch *patch) if (klp_transition_patch) return -EBUSY; - klp_init_transition(patch, KLP_UNPATCHED); + klp_init_transition(patch, KLP_TRANSITION_UNPATCHED); klp_for_each_object(patch, obj) if (obj->patched) @@ -1008,7 +1008,7 @@ static int __klp_enable_patch(struct klp_patch *patch) pr_notice("enabling patch '%s'\n", patch->mod->name); - klp_init_transition(patch, KLP_PATCHED); + klp_init_transition(patch, KLP_TRANSITION_PATCHED); /* * Enforce the order of the func->transition writes in diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 4152c71507e2..90408500e5a3 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -95,9 +95,9 @@ static void notrace klp_ftrace_handler(unsigned long ip, patch_state = current->patch_state; - WARN_ON_ONCE(patch_state == KLP_UNDEFINED); + WARN_ON_ONCE(patch_state == KLP_TRANSITION_IDLE); - if (patch_state == KLP_UNPATCHED) { + if (patch_state == KLP_TRANSITION_UNPATCHED) { /* * Use the previously patched version of the function. * If no previous patches exist, continue with the diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index e54c3d60a904..ba069459c101 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -23,7 +23,7 @@ static DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries); struct klp_patch *klp_transition_patch; -static int klp_target_state = KLP_UNDEFINED; +static int klp_target_state = KLP_TRANSITION_IDLE; static unsigned int klp_signals_cnt; @@ -96,16 +96,16 @@ static void klp_complete_transition(void) pr_debug("'%s': completing %s transition\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching"); - if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) { + if (klp_transition_patch->replace && klp_target_state == KLP_TRANSITION_PATCHED) { klp_unpatch_replaced_patches(klp_transition_patch); klp_discard_nops(klp_transition_patch); } - if (klp_target_state == KLP_UNPATCHED) { + if (klp_target_state == KLP_TRANSITION_UNPATCHED) { /* - * All tasks have transitioned to KLP_UNPATCHED so we can now + * All tasks have transitioned to KLP_TRANSITION_UNPATCHED so we can now * remove the new functions from the func_stack. */ klp_unpatch_objects(klp_transition_patch); @@ -123,36 +123,36 @@ static void klp_complete_transition(void) klp_for_each_func(obj, func) func->transition = false; - /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ - if (klp_target_state == KLP_PATCHED) + /* Prevent klp_ftrace_handler() from seeing KLP_TRANSITION_IDLE state */ + if (klp_target_state == KLP_TRANSITION_PATCHED) klp_synchronize_transition(); read_lock(&tasklist_lock); for_each_process_thread(g, task) { WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); - task->patch_state = KLP_UNDEFINED; + task->patch_state = KLP_TRANSITION_IDLE; } read_unlock(&tasklist_lock); for_each_possible_cpu(cpu) { task = idle_task(cpu); WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); - task->patch_state = KLP_UNDEFINED; + task->patch_state = KLP_TRANSITION_IDLE; } klp_for_each_object(klp_transition_patch, obj) { if (!klp_is_object_loaded(obj)) continue; - if (klp_target_state == KLP_PATCHED) + if (klp_target_state == KLP_TRANSITION_PATCHED) klp_post_patch_callback(obj); - else if (klp_target_state == KLP_UNPATCHED) + else if (klp_target_state == KLP_TRANSITION_UNPATCHED) klp_post_unpatch_callback(obj); } pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching"); - klp_target_state = KLP_UNDEFINED; + klp_target_state = KLP_TRANSITION_IDLE; klp_transition_patch = NULL; } @@ -164,13 +164,13 @@ static void klp_complete_transition(void) */ void klp_cancel_transition(void) { - if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED)) + if (WARN_ON_ONCE(klp_target_state != KLP_TRANSITION_PATCHED)) return; pr_debug("'%s': canceling patching transition, going to unpatch\n", klp_transition_patch->mod->name); - klp_target_state = KLP_UNPATCHED; + klp_target_state = KLP_TRANSITION_UNPATCHED; klp_complete_transition(); } @@ -218,7 +218,7 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, struct klp_ops *ops; int i; - if (klp_target_state == KLP_UNPATCHED) { + if (klp_target_state == KLP_TRANSITION_UNPATCHED) { /* * Check for the to-be-unpatched function * (the func itself). @@ -455,7 +455,7 @@ void klp_try_complete_transition(void) struct klp_patch *patch; bool complete = true; - WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); + WARN_ON_ONCE(klp_target_state == KLP_TRANSITION_IDLE); /* * Try to switch the tasks to the target patch state by walking their @@ -532,11 +532,11 @@ void klp_start_transition(void) struct task_struct *g, *task; unsigned int cpu; - WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); + WARN_ON_ONCE(klp_target_state == KLP_TRANSITION_IDLE); pr_notice("'%s': starting %s transition\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching"); /* * Mark all normal tasks as needing a patch state update. They'll @@ -578,7 +578,7 @@ void klp_init_transition(struct klp_patch *patch, int state) struct klp_func *func; int initial_state = !state; - WARN_ON_ONCE(klp_target_state != KLP_UNDEFINED); + WARN_ON_ONCE(klp_target_state != KLP_TRANSITION_IDLE); klp_transition_patch = patch; @@ -589,7 +589,7 @@ void klp_init_transition(struct klp_patch *patch, int state) klp_target_state = state; pr_debug("'%s': initializing %s transition\n", patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching"); /* * Initialize all tasks to the initial patch state to prepare them for @@ -597,7 +597,7 @@ void klp_init_transition(struct klp_patch *patch, int state) */ read_lock(&tasklist_lock); for_each_process_thread(g, task) { - WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); + WARN_ON_ONCE(task->patch_state != KLP_TRANSITION_IDLE); task->patch_state = initial_state; } read_unlock(&tasklist_lock); @@ -607,19 +607,19 @@ void klp_init_transition(struct klp_patch *patch, int state) */ for_each_possible_cpu(cpu) { task = idle_task(cpu); - WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); + WARN_ON_ONCE(task->patch_state != KLP_TRANSITION_IDLE); task->patch_state = initial_state; } /* * Enforce the order of the task->patch_state initializations and the * func->transition updates to ensure that klp_ftrace_handler() doesn't - * see a func in transition with a task->patch_state of KLP_UNDEFINED. + * see a func in transition with a task->patch_state of KLP_TRANSITION_IDLE. * * Also enforce the order of the klp_target_state write and future * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and * __klp_sched_try_switch() don't set a task->patch_state to - * KLP_UNDEFINED. + * KLP_TRANSITION_IDLE. */ smp_wmb(); @@ -652,7 +652,7 @@ void klp_reverse_transition(void) pr_debug("'%s': reversing transition from %s\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching to unpatching" : + klp_target_state == KLP_TRANSITION_PATCHED ? "patching to unpatching" : "unpatching to patching"); /* @@ -741,7 +741,7 @@ void klp_force_transition(void) klp_update_patch_state(idle_task(cpu)); /* Set forced flag for patches being removed. */ - if (klp_target_state == KLP_UNPATCHED) + if (klp_target_state == KLP_TRANSITION_UNPATCHED) klp_transition_patch->forced = true; else if (klp_transition_patch->replace) { klp_for_each_patch(patch) { diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index f3e0329337f6..4047b6d48255 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -2,6 +2,7 @@ menuconfig MODULES bool "Enable loadable module support" modules + select EXECMEM help Kernel modules are small pieces of compiled code which can be inserted in the running kernel, rather than being @@ -392,7 +393,7 @@ config UNUSED_KSYMS_WHITELIST exported at all times, even in absence of in-tree users. The value to set here is the path to a text file containing the list of symbols, one per line. The path can be absolute, or relative to the kernel - source tree. + source or obj tree. config MODULES_TREE_LOOKUP def_bool y diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index ef73ae7c8909..62fb57bb9f16 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -348,7 +348,7 @@ const char *module_address_lookup(unsigned long addr, } /* Make a copy in here where it's safe */ if (ret) { - strncpy(namebuf, ret, KSYM_NAME_LEN - 1); + strscpy(namebuf, ret, KSYM_NAME_LEN); ret = namebuf; } preempt_enable(); diff --git a/kernel/module/main.c b/kernel/module/main.c index e1e8a7a9d6c1..d18a94b973e1 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -56,7 +56,9 @@ #include <linux/dynamic_debug.h> #include <linux/audit.h> #include <linux/cfi.h> +#include <linux/codetag.h> #include <linux/debugfs.h> +#include <linux/execmem.h> #include <uapi/linux/module.h> #include "internal.h" @@ -1179,16 +1181,6 @@ resolve_symbol_wait(struct module *mod, return ksym; } -void __weak module_memfree(void *module_region) -{ - /* - * This memory may be RO, and freeing RO memory in an interrupt is not - * supported by vmalloc. - */ - WARN_ON(in_interrupt()); - vfree(module_region); -} - void __weak module_arch_cleanup(struct module *mod) { } @@ -1197,28 +1189,54 @@ void __weak module_arch_freeing_init(struct module *mod) { } -static bool mod_mem_use_vmalloc(enum mod_mem_type type) +static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { - return IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC) && - mod_mem_type_is_core_data(type); -} + unsigned int size = PAGE_ALIGN(mod->mem[type].size); + enum execmem_type execmem_type; + void *ptr; -static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) -{ - if (mod_mem_use_vmalloc(type)) - return vzalloc(size); - return module_alloc(size); + mod->mem[type].size = size; + + if (mod_mem_type_is_data(type)) + execmem_type = EXECMEM_MODULE_DATA; + else + execmem_type = EXECMEM_MODULE_TEXT; + + ptr = execmem_alloc(execmem_type, size); + if (!ptr) + return -ENOMEM; + + /* + * The pointer to these blocks of memory are stored on the module + * structure and we keep that around so long as the module is + * around. We only free that memory when we unload the module. + * Just mark them as not being a leak then. The .init* ELF + * sections *do* get freed after boot so we *could* treat them + * slightly differently with kmemleak_ignore() and only grey + * them out as they work as typical memory allocations which + * *do* eventually get freed, but let's just keep things simple + * and avoid *any* false positives. + */ + kmemleak_not_leak(ptr); + + memset(ptr, 0, size); + mod->mem[type].base = ptr; + + return 0; } -static void module_memory_free(void *ptr, enum mod_mem_type type) +static void module_memory_free(struct module *mod, enum mod_mem_type type, + bool unload_codetags) { - if (mod_mem_use_vmalloc(type)) - vfree(ptr); - else - module_memfree(ptr); + void *ptr = mod->mem[type].base; + + if (!unload_codetags && mod_mem_type_is_core_data(type)) + return; + + execmem_free(ptr); } -static void free_mod_mem(struct module *mod) +static void free_mod_mem(struct module *mod, bool unload_codetags) { for_each_mod_mem_type(type) { struct module_memory *mod_mem = &mod->mem[type]; @@ -1229,19 +1247,26 @@ static void free_mod_mem(struct module *mod) /* Free lock-classes; relies on the preceding sync_rcu(). */ lockdep_free_key_range(mod_mem->base, mod_mem->size); if (mod_mem->size) - module_memory_free(mod_mem->base, type); + module_memory_free(mod, type, unload_codetags); } /* MOD_DATA hosts mod, so free it at last */ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); - module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA); + module_memory_free(mod, MOD_DATA, unload_codetags); } /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { + bool unload_codetags; + trace_module_free(mod); + unload_codetags = codetag_unload_module(mod); + if (!unload_codetags) + pr_warn("%s: memory allocation(s) from the module still alive, cannot unload cleanly\n", + mod->name); + mod_sysfs_teardown(mod); /* @@ -1283,7 +1308,7 @@ static void free_module(struct module *mod) kfree(mod->args); percpu_modfree(mod); - free_mod_mem(mod); + free_mod_mem(mod, unload_codetags); } void *__symbol_get(const char *symbol) @@ -1610,13 +1635,6 @@ static void free_modinfo(struct module *mod) } } -void * __weak module_alloc(unsigned long size) -{ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - bool __weak module_init_section(const char *name) { return strstarts(name, ".init"); @@ -2225,7 +2243,6 @@ static int find_module_sections(struct module *mod, struct load_info *info) static int move_module(struct module *mod, struct load_info *info) { int i; - void *ptr; enum mod_mem_type t = 0; int ret = -ENOMEM; @@ -2234,26 +2251,12 @@ static int move_module(struct module *mod, struct load_info *info) mod->mem[type].base = NULL; continue; } - mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size); - ptr = module_memory_alloc(mod->mem[type].size, type); - /* - * The pointer to these blocks of memory are stored on the module - * structure and we keep that around so long as the module is - * around. We only free that memory when we unload the module. - * Just mark them as not being a leak then. The .init* ELF - * sections *do* get freed after boot so we *could* treat them - * slightly differently with kmemleak_ignore() and only grey - * them out as they work as typical memory allocations which - * *do* eventually get freed, but let's just keep things simple - * and avoid *any* false positives. - */ - kmemleak_not_leak(ptr); - if (!ptr) { + + ret = module_memory_alloc(mod, type); + if (ret) { t = type; goto out_enomem; } - memset(ptr, 0, mod->mem[type].size); - mod->mem[type].base = ptr; } /* Transfer each section which specifies SHF_ALLOC */ @@ -2296,7 +2299,7 @@ static int move_module(struct module *mod, struct load_info *info) return 0; out_enomem: for (t--; t >= 0; t--) - module_memory_free(mod->mem[t].base, t); + module_memory_free(mod, t, true); return ret; } @@ -2426,7 +2429,7 @@ static void module_deallocate(struct module *mod, struct load_info *info) percpu_modfree(mod); module_arch_freeing_init(mod); - free_mod_mem(mod); + free_mod_mem(mod, true); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -2482,9 +2485,9 @@ static void do_free_init(struct work_struct *w) llist_for_each_safe(pos, n, list) { initfree = container_of(pos, struct mod_initfree, node); - module_memfree(initfree->init_text); - module_memfree(initfree->init_data); - module_memfree(initfree->init_rodata); + execmem_free(initfree->init_text); + execmem_free(initfree->init_data); + execmem_free(initfree->init_rodata); kfree(initfree); } } @@ -2594,10 +2597,10 @@ static noinline int do_init_module(struct module *mod) * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we * call synchronize_rcu(), but we don't want to slow down the success - * path. module_memfree() cannot be called in an interrupt, so do the + * path. execmem_free() cannot be called in an interrupt, so do the * work and call synchronize_rcu() in a work queue. * - * Note that module_alloc() on most architectures creates W+X page + * Note that execmem_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. Any * code such as mark_rodata_ro() which depends on those mappings to * be cleaned up needs to sync with the queued work by invoking @@ -2995,6 +2998,8 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Get rid of temporary copy. */ free_copy(info, flags); + codetag_load_module(mod); + /* Done! */ trace_module_load(mod); diff --git a/kernel/panic.c b/kernel/panic.c index 747c3f3d289a..8bff183d6180 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -100,7 +100,6 @@ static struct ctl_table kern_panic_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, - { } }; static __init int kernel_panic_sysctls_init(void) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7ade20e95232..dc48fecfa1dc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -307,7 +307,6 @@ static struct ctl_table pid_ns_ctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &pid_max, }, - { } }; #endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h index 2ee41a3a1dfd..fe9fb991dc42 100644 --- a/kernel/pid_sysctl.h +++ b/kernel/pid_sysctl.h @@ -41,7 +41,6 @@ static struct ctl_table pid_ns_ctl_table_vm[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { } }; static inline void register_pid_ns_sysctl_table_vm(void) { diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 9e1c9aa399ea..927cc55ba0b3 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -674,23 +674,15 @@ void em_dev_unregister_perf_domain(struct device *dev) } EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); -/* - * Adjustment of CPU performance values after boot, when all CPUs capacites - * are correctly calculated. - */ -static void em_adjust_new_capacity(struct device *dev, - struct em_perf_domain *pd, - u64 max_cap) +static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) { struct em_perf_table __rcu *em_table; struct em_perf_state *ps, *new_ps; - int ret, ps_size; + int ps_size; em_table = em_table_alloc(pd); - if (!em_table) { - dev_warn(dev, "EM: allocation failed\n"); - return; - } + if (!em_table) + return NULL; new_ps = em_table->state; @@ -702,24 +694,52 @@ static void em_adjust_new_capacity(struct device *dev, rcu_read_unlock(); - em_init_performance(dev, pd, new_ps, pd->nr_perf_states); - ret = em_compute_costs(dev, new_ps, NULL, pd->nr_perf_states, + return em_table; +} + +static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, + struct em_perf_table __rcu *em_table) +{ + int ret; + + ret = em_compute_costs(dev, em_table->state, NULL, pd->nr_perf_states, pd->flags); - if (ret) { - dev_warn(dev, "EM: compute costs failed\n"); - return; - } + if (ret) + goto free_em_table; ret = em_dev_update_perf_domain(dev, em_table); if (ret) - dev_warn(dev, "EM: update failed %d\n", ret); + goto free_em_table; /* * This is one-time-update, so give up the ownership in this updater. * The EM framework has incremented the usage counter and from now * will keep the reference (then free the memory when needed). */ +free_em_table: em_table_free(em_table); + return ret; +} + +/* + * Adjustment of CPU performance values after boot, when all CPUs capacites + * are correctly calculated. + */ +static void em_adjust_new_capacity(struct device *dev, + struct em_perf_domain *pd, + u64 max_cap) +{ + struct em_perf_table __rcu *em_table; + + em_table = em_table_dup(pd); + if (!em_table) { + dev_warn(dev, "EM: allocation failed\n"); + return; + } + + em_init_performance(dev, pd, em_table->state, pd->nr_perf_states); + + em_recalc_and_update(dev, pd, em_table); } static void em_check_capacity_update(void) @@ -788,3 +808,51 @@ static void em_update_workfn(struct work_struct *work) { em_check_capacity_update(); } + +/** + * em_dev_update_chip_binning() - Update Energy Model after the new voltage + * information is present in the OPPs. + * @dev : Device for which the Energy Model has to be updated. + * + * This function allows to update easily the EM with new values available in + * the OPP framework and DT. It can be used after the chip has been properly + * verified by device drivers and the voltages adjusted for the 'chip binning'. + */ +int em_dev_update_chip_binning(struct device *dev) +{ + struct em_perf_table __rcu *em_table; + struct em_perf_domain *pd; + int i, ret; + + if (IS_ERR_OR_NULL(dev)) + return -EINVAL; + + pd = em_pd_get(dev); + if (!pd) { + dev_warn(dev, "Couldn't find Energy Model\n"); + return -EINVAL; + } + + em_table = em_table_dup(pd); + if (!em_table) { + dev_warn(dev, "EM: allocation failed\n"); + return -ENOMEM; + } + + /* Update power values which might change due to new voltage in OPPs */ + for (i = 0; i < pd->nr_perf_states; i++) { + unsigned long freq = em_table->state[i].frequency; + unsigned long power; + + ret = dev_pm_opp_calc_power(dev, &power, &freq); + if (ret) { + em_table_free(em_table); + return ret; + } + + em_table->state[i].power = power; + } + + return em_recalc_and_update(dev, pd, em_table); +} +EXPORT_SYMBOL_GPL(em_dev_update_chip_binning); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 43b1a82e800c..0a213f69a9e4 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1361,7 +1361,7 @@ static int __init resume_setup(char *str) if (noresume) return 1; - strncpy(resume_file, str, 255); + strscpy(resume_file, str); return 1; } diff --git a/kernel/power/process.c b/kernel/power/process.c index cae81a87cc91..66ac067d9ae6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -194,8 +194,6 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); - cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index adf99c05adca..3160d287d4cf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -178,9 +178,9 @@ static int __init control_devkmsg(char *str) * Set sysctl string accordingly: */ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) - strcpy(devkmsg_log_str, "on"); + strscpy(devkmsg_log_str, "on"); else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) - strcpy(devkmsg_log_str, "off"); + strscpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* @@ -209,7 +209,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, return -EINVAL; old = devkmsg_log; - strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE); + strscpy(old_str, devkmsg_log_str); } err = proc_dostring(table, write, buffer, lenp, ppos); @@ -227,7 +227,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, /* ... and restore old setting. */ devkmsg_log = old; - strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE); + strscpy(devkmsg_log_str, old_str); return -EINVAL; } @@ -2506,22 +2506,22 @@ static int __init console_setup(char *str) /* * Decode str into name, index, options. */ - if (str[0] >= '0' && str[0] <= '9') { - strcpy(buf, "ttyS"); - strncpy(buf + 4, str, sizeof(buf) - 5); - } else { - strncpy(buf, str, sizeof(buf) - 1); - } - buf[sizeof(buf) - 1] = 0; + if (isdigit(str[0])) + scnprintf(buf, sizeof(buf), "ttyS%s", str); + else + strscpy(buf, str); + options = strchr(str, ','); if (options) *(options++) = 0; + #ifdef __sparc__ if (!strcmp(str, "ttya")) - strcpy(buf, "ttyS0"); + strscpy(buf, "ttyS0"); if (!strcmp(str, "ttyb")) - strcpy(buf, "ttyS1"); + strscpy(buf, "ttyS1"); #endif + for (s = buf; *s; s++) if (isdigit(*s) || *s == ',') break; diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c index c228343eeb97..3e47dedce9e5 100644 --- a/kernel/printk/sysctl.c +++ b/kernel/printk/sysctl.c @@ -76,7 +76,6 @@ static struct ctl_table printk_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - {} }; void __init printk_sysctl_init(void) diff --git a/kernel/reboot.c b/kernel/reboot.c index 22c16e2564cc..f05dbde2c93f 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -1295,7 +1295,6 @@ static struct ctl_table kern_reboot_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static void __init kernel_reboot_sysctls_init(void) diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 991fc9002535..db68a964e34e 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -19,7 +19,6 @@ static struct ctl_table sched_autogroup_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static void __init sched_autogroup_sysctl_init(void) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f88f50552acb..bcf2c4cc0522 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4741,7 +4741,6 @@ static struct ctl_table sched_core_sysctls[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ - {} }; static int __init sched_core_sysctl_init(void) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a04a436af8cc..c75d1307d86d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,7 +43,6 @@ static struct ctl_table sched_dl_sysctls[] = { .proc_handler = proc_douintvec_minmax, .extra2 = (void *)&sysctl_sched_dl_period_max, }, - {} }; static int __init sched_dl_sysctl_init(void) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9744b5036dd8..8a5b1ae0aa55 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -151,7 +151,6 @@ static struct ctl_table sched_fair_sysctls[] = { .extra1 = SYSCTL_ZERO, }, #endif /* CONFIG_NUMA_BALANCING */ - {} }; static int __init sched_fair_sysctl_init(void) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3261b067b67e..aa4c1c874fa4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -56,7 +56,6 @@ static struct ctl_table sched_rt_sysctls[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, - {} }; static int __init sched_rt_sysctl_init(void) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 67a777b31743..329c82faca9b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -322,7 +322,6 @@ static struct ctl_table sched_energy_aware_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init sched_energy_aware_sysctl_init(void) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f70e031e06a8..e30b60b57614 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -2445,7 +2445,6 @@ static struct ctl_table seccomp_sysctl_table[] = { .mode = 0644, .proc_handler = seccomp_actions_logged_handler, }, - { } }; static int __init seccomp_sysctl_init(void) diff --git a/kernel/signal.c b/kernel/signal.c index 7bdbcf1b78d0..01c4c46a51a8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4840,7 +4840,6 @@ static struct ctl_table signal_debug_table[] = { .proc_handler = proc_dointvec }, #endif - { } }; static int __init init_signal_sysctls(void) diff --git a/kernel/stackleak.c b/kernel/stackleak.c index 59cdfaf5118e..0f9712584913 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -54,7 +54,6 @@ static struct ctl_table stackleak_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init stackleak_sysctls_init(void) diff --git a/kernel/sys.c b/kernel/sys.c index 8bb106a56b3a..f9c95410278c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -146,6 +146,12 @@ #ifndef RISCV_V_GET_CONTROL # define RISCV_V_GET_CONTROL() (-EINVAL) #endif +#ifndef PPC_GET_DEXCR_ASPECT +# define PPC_GET_DEXCR_ASPECT(a, b) (-EINVAL) +#endif +#ifndef PPC_SET_DEXCR_ASPECT +# define PPC_SET_DEXCR_ASPECT(a, b, c) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2726,6 +2732,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_MDWE: error = prctl_get_mdwe(arg2, arg3, arg4, arg5); break; + case PR_PPC_GET_DEXCR: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = PPC_GET_DEXCR_ASPECT(me, arg2); + break; + case PR_PPC_SET_DEXCR: + if (arg4 || arg5) + return -EINVAL; + error = PPC_SET_DEXCR_ASPECT(me, arg2, arg3); + break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 81cc974913bb..e0b917328cf9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2034,7 +2034,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_INT_MAX, }, #endif - { } }; static struct ctl_table vm_table[] = { @@ -2240,7 +2239,6 @@ static struct ctl_table vm_table[] = { .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif - { } }; int __init sysctl_init_bases(void) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e394d6d5b9b5..48288dd4a102 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -312,7 +312,6 @@ static struct ctl_table timer_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init timer_sysctl_init(void) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b3d7f62ac581..166ad5444eea 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -974,6 +974,19 @@ config FTRACE_RECORD_RECURSION_SIZE This file can be reset, but the limit can not change in size at runtime. +config FTRACE_VALIDATE_RCU_IS_WATCHING + bool "Validate RCU is on during ftrace execution" + depends on FUNCTION_TRACER + depends on ARCH_WANTS_NO_INSTR + help + All callbacks that attach to the function tracing have some sort of + protection against recursion. This option is only to verify that + ftrace (and other users of ftrace_test_recursion_trylock()) are not + called outside of RCU, as if they are, it can cause a race. But it + also has a noticeable overhead when enabled. + + If unsure, say N + config RING_BUFFER_RECORD_RECURSION bool "Record functions that recurse in the ring buffer" depends on FTRACE_RECORD_RECURSION diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9dc605f08a23..f5154c051d2c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1053,9 +1053,15 @@ static unsigned long get_entry_ip(unsigned long fentry_ip) { u32 instr; - /* Being extra safe in here in case entry ip is on the page-edge. */ - if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1)) - return fentry_ip; + /* We want to be extra safe in case entry ip is on the page edge, + * but otherwise we need to avoid get_kernel_nofault()'s overhead. + */ + if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { + if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) + return fentry_ip; + } else { + instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); + } if (is_endbr(instr)) fentry_ip -= ENDBR_INSN_SIZE; return fentry_ip; @@ -1182,9 +1188,6 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) { -#ifndef CONFIG_X86 - return -ENOENT; -#else static const u32 br_entry_size = sizeof(struct perf_branch_entry); u32 entry_cnt = size / br_entry_size; @@ -1197,7 +1200,6 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) return -ENOENT; return entry_cnt * br_entry_size; -#endif } static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { @@ -1525,8 +1527,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: @@ -1582,8 +1582,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_send_signal_thread_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; - case BPF_FUNC_get_ns_current_pid_tgid: - return &bpf_get_ns_current_pid_tgid_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: @@ -1633,6 +1631,17 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static bool is_kprobe_multi(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI || + prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; +} + +static inline bool is_kprobe_session(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; +} + static const struct bpf_func_proto * kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1648,13 +1657,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_override_return_proto; #endif case BPF_FUNC_get_func_ip: - if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + if (is_kprobe_multi(prog)) return &bpf_get_func_ip_proto_kprobe_multi; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) return &bpf_get_func_ip_proto_uprobe_multi; return &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: - if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + if (is_kprobe_multi(prog)) return &bpf_get_attach_cookie_proto_kmulti; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) return &bpf_get_attach_cookie_proto_umulti; @@ -2008,6 +2017,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_tracing; default: return bpf_tracing_func_proto(func_id, prog); } @@ -2070,6 +2081,9 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_func_arg_cnt: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; case BPF_FUNC_get_attach_cookie: + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_attach_cookie_proto_tracing; return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); @@ -2370,16 +2384,26 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) } static __always_inline -void __bpf_trace_run(struct bpf_prog *prog, u64 *args) +void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { + struct bpf_prog *prog = link->link.prog; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; + cant_sleep(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); goto out; } + + run_ctx.bpf_cookie = link->cookie; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); + + bpf_reset_run_ctx(old_run_ctx); out: this_cpu_dec(*(prog->active)); } @@ -2408,12 +2432,12 @@ out: #define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 #define BPF_TRACE_DEFN_x(x) \ - void bpf_trace_run##x(struct bpf_prog *prog, \ + void bpf_trace_run##x(struct bpf_raw_tp_link *link, \ REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ { \ u64 args[x]; \ REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ - __bpf_trace_run(prog, args); \ + __bpf_trace_run(link, args); \ } \ EXPORT_SYMBOL_GPL(bpf_trace_run##x) BPF_TRACE_DEFN_x(1); @@ -2429,9 +2453,10 @@ BPF_TRACE_DEFN_x(10); BPF_TRACE_DEFN_x(11); BPF_TRACE_DEFN_x(12); -static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { struct tracepoint *tp = btp->tp; + struct bpf_prog *prog = link->link.prog; /* * check that program doesn't access arguments beyond what's @@ -2443,18 +2468,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_tp_access > btp->writable_size) return -EINVAL; - return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, - prog); + return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link); } -int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { - return __bpf_probe_register(btp, prog); -} - -int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) -{ - return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); + return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link); } int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, @@ -2577,6 +2596,12 @@ static int __init bpf_event_init(void) fs_initcall(bpf_event_init); #endif /* CONFIG_MODULES */ +struct bpf_session_run_ctx { + struct bpf_run_ctx run_ctx; + bool is_return; + void *data; +}; + #ifdef CONFIG_FPROBE struct bpf_kprobe_multi_link { struct bpf_link link; @@ -2590,7 +2615,7 @@ struct bpf_kprobe_multi_link { }; struct bpf_kprobe_multi_run_ctx { - struct bpf_run_ctx run_ctx; + struct bpf_session_run_ctx session_ctx; struct bpf_kprobe_multi_link *link; unsigned long entry_ip; }; @@ -2769,7 +2794,8 @@ static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) if (WARN_ON_ONCE(!ctx)) return 0; - run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, + session_ctx.run_ctx); link = run_ctx->link; if (!link->cookies) return 0; @@ -2786,15 +2812,21 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { struct bpf_kprobe_multi_run_ctx *run_ctx; - run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, + session_ctx.run_ctx); return run_ctx->entry_ip; } static int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, - unsigned long entry_ip, struct pt_regs *regs) + unsigned long entry_ip, struct pt_regs *regs, + bool is_return, void *data) { struct bpf_kprobe_multi_run_ctx run_ctx = { + .session_ctx = { + .is_return = is_return, + .data = data, + }, .link = link, .entry_ip = entry_ip, }; @@ -2809,7 +2841,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, migrate_disable(); rcu_read_lock(); - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); @@ -2826,10 +2858,11 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, void *data) { struct bpf_kprobe_multi_link *link; + int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); - return 0; + err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false, data); + return is_kprobe_session(link->link.prog) ? err : 0; } static void @@ -2840,7 +2873,7 @@ kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); + kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true, data); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) @@ -2973,7 +3006,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; - if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI) + if (!is_kprobe_multi(prog)) return -EINVAL; flags = attr->link_create.kprobe_multi.flags; @@ -3054,10 +3087,12 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (err) goto error; - if (flags & BPF_F_KPROBE_MULTI_RETURN) - link->fp.exit_handler = kprobe_multi_link_exit_handler; - else + if (!(flags & BPF_F_KPROBE_MULTI_RETURN)) link->fp.entry_handler = kprobe_multi_link_handler; + if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog)) + link->fp.exit_handler = kprobe_multi_link_exit_handler; + if (is_kprobe_session(prog)) + link->fp.entry_data_size = sizeof(u64); link->addrs = addrs; link->cookies = cookies; @@ -3483,3 +3518,54 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) return 0; } #endif /* CONFIG_UPROBES */ + +#ifdef CONFIG_FPROBE +__bpf_kfunc_start_defs(); + +__bpf_kfunc bool bpf_session_is_return(void) +{ + struct bpf_session_run_ctx *session_ctx; + + session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); + return session_ctx->is_return; +} + +__bpf_kfunc __u64 *bpf_session_cookie(void) +{ + struct bpf_session_run_ctx *session_ctx; + + session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); + return session_ctx->data; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids) +BTF_ID_FLAGS(func, bpf_session_is_return) +BTF_ID_FLAGS(func, bpf_session_cookie) +BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids) + +static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id)) + return 0; + + if (!is_kprobe_session(prog)) + return -EACCES; + + return 0; +} + +static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = { + .owner = THIS_MODULE, + .set = &kprobe_multi_kfunc_set_ids, + .filter = bpf_kprobe_multi_filter, +}; + +static int __init bpf_kprobe_multi_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set); +} + +late_initcall(bpf_kprobe_multi_kfuncs_init); +#endif diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index c83c005e654e..a130b2d898f7 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -125,17 +125,6 @@ int function_graph_enter(unsigned long ret, unsigned long func, { struct ftrace_graph_ent trace; -#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS - /* - * Skip graph tracing if the return location is served by direct trampoline, - * since call sequence and return addresses are unpredictable anyway. - * Ex: BPF trampoline may call original function and may skip frame - * depending on type of BPF programs attached. - */ - if (ftrace_direct_func_count && - ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE)) - return -EBUSY; -#endif trace.func = func; trace.depth = ++current->curr_ret_depth; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c96b30f3d63..65208d3b5ed9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1595,12 +1595,15 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) unsigned long ftrace_location_range(unsigned long start, unsigned long end) { struct dyn_ftrace *rec; + unsigned long ip = 0; + rcu_read_lock(); rec = lookup_rec(start, end); if (rec) - return rec->ip; + ip = rec->ip; + rcu_read_unlock(); - return 0; + return ip; } /** @@ -1614,25 +1617,22 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) */ unsigned long ftrace_location(unsigned long ip) { - struct dyn_ftrace *rec; + unsigned long loc; unsigned long offset; unsigned long size; - rec = lookup_rec(ip, ip); - if (!rec) { + loc = ftrace_location_range(ip, ip); + if (!loc) { if (!kallsyms_lookup_size_offset(ip, &size, &offset)) goto out; /* map sym+0 to __fentry__ */ if (!offset) - rec = lookup_rec(ip, ip + size - 1); + loc = ftrace_location_range(ip, ip + size - 1); } - if (rec) - return rec->ip; - out: - return 0; + return loc; } /** @@ -2538,7 +2538,6 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec) /* Protected by rcu_tasks for reading, and direct_mutex for writing */ static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH; static DEFINE_MUTEX(direct_mutex); -int ftrace_direct_func_count; /* * Search the direct_functions hash to see if the given instruction pointer @@ -4201,12 +4200,12 @@ static int add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, int clear_filter) { - long index = simple_strtoul(func_g->search, NULL, 0); + long index; struct ftrace_page *pg; struct dyn_ftrace *rec; /* The index starts at 1 */ - if (--index < 0) + if (kstrtoul(func_g->search, 0, &index) || --index < 0) return 0; do_for_each_ftrace_rec(pg, rec) { @@ -5317,14 +5316,6 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -struct ftrace_direct_func { - struct list_head next; - unsigned long addr; - int count; -}; - -static LIST_HEAD(ftrace_direct_funcs); - static int register_ftrace_function_nolock(struct ftrace_ops *ops); /* @@ -5365,6 +5356,13 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long } } +static void register_ftrace_direct_cb(struct rcu_head *rhp) +{ + struct ftrace_hash *fhp = container_of(rhp, struct ftrace_hash, rcu); + + free_ftrace_hash(fhp); +} + /** * register_ftrace_direct - Call a custom trampoline directly * for multiple functions registered in @ops @@ -5463,10 +5461,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) out_unlock: mutex_unlock(&direct_mutex); - if (free_hash && free_hash != EMPTY_HASH) { - synchronize_rcu_tasks(); - free_ftrace_hash(free_hash); - } + if (free_hash && free_hash != EMPTY_HASH) + call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb); if (new_hash) free_ftrace_hash(new_hash); @@ -5816,9 +5812,8 @@ __setup("ftrace_graph_notrace=", set_graph_notrace_function); static int __init set_graph_max_depth_function(char *str) { - if (!str) + if (!str || kstrtouint(str, 0, &fgraph_max_depth)) return 0; - fgraph_max_depth = simple_strtoul(str, NULL, 0); return 1; } __setup("ftrace_graph_max_depth=", set_graph_max_depth_function); @@ -6595,6 +6590,8 @@ static int ftrace_process_locs(struct module *mod, /* We should have used all pages unless we skipped some */ if (pg_unuse) { WARN_ON(!skipped); + /* Need to synchronize with ftrace_location_range() */ + synchronize_rcu(); ftrace_free_pages(pg_unuse); } return ret; @@ -6808,6 +6805,9 @@ void ftrace_release_mod(struct module *mod) out_unlock: mutex_unlock(&ftrace_lock); + /* Need to synchronize with ftrace_location_range() */ + if (tmp_page) + synchronize_rcu(); for (pg = tmp_page; pg; pg = tmp_page) { /* Needs to be called outside of ftrace_lock */ @@ -7141,6 +7141,7 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) unsigned long start = (unsigned long)(start_ptr); unsigned long end = (unsigned long)(end_ptr); struct ftrace_page **last_pg = &ftrace_pages_start; + struct ftrace_page *tmp_page = NULL; struct ftrace_page *pg; struct dyn_ftrace *rec; struct dyn_ftrace key; @@ -7182,12 +7183,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) ftrace_update_tot_cnt--; if (!pg->index) { *last_pg = pg->next; - if (pg->records) { - free_pages((unsigned long)pg->records, pg->order); - ftrace_number_of_pages -= 1 << pg->order; - } - ftrace_number_of_groups--; - kfree(pg); + pg->next = tmp_page; + tmp_page = pg; pg = container_of(last_pg, struct ftrace_page, next); if (!(*last_pg)) ftrace_pages = pg; @@ -7204,6 +7201,11 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) clear_func_from_hashes(func); kfree(func); } + /* Need to synchronize with ftrace_location_range() */ + if (tmp_page) { + synchronize_rcu(); + ftrace_free_pages(tmp_page); + } } void __init ftrace_free_init_mem(void) @@ -7894,6 +7896,7 @@ void ftrace_kill(void) ftrace_disabled = 1; ftrace_enabled = 0; ftrace_trace_function = ftrace_stub; + kprobe_ftrace_kill(); } /** @@ -8269,7 +8272,6 @@ static struct ctl_table ftrace_sysctls[] = { .mode = 0644, .proc_handler = ftrace_enable_sysctl, }, - {} }; static int __init ftrace_sysctl_init(void) diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index fa03094e9e69..30d224946881 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -166,6 +166,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh) if (unlikely(!handler)) return NULL; +#if defined(CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING) || defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) /* * This expects the caller will set up a rethook on a function entry. * When the function returns, the rethook will eventually be reclaimed @@ -174,6 +175,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh) */ if (unlikely(!rcu_is_watching())) return NULL; +#endif return (struct rethook_node *)objpool_pop(&rh->pool); } @@ -248,7 +250,7 @@ unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame if (WARN_ON_ONCE(!cur)) return 0; - if (WARN_ON_ONCE(tsk != current && task_is_running(tsk))) + if (tsk != current && task_is_running(tsk)) return 0; do { diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6511dc3a00da..7345a8b625fb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -9,6 +9,7 @@ #include <linux/ring_buffer.h> #include <linux/trace_clock.h> #include <linux/sched/clock.h> +#include <linux/cacheflush.h> #include <linux/trace_seq.h> #include <linux/spinlock.h> #include <linux/irq_work.h> @@ -26,6 +27,7 @@ #include <linux/list.h> #include <linux/cpu.h> #include <linux/oom.h> +#include <linux/mm.h> #include <asm/local64.h> #include <asm/local.h> @@ -312,6 +314,8 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) +#define RB_MISSED_MASK (3 << 30) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -338,6 +342,7 @@ struct buffer_page { local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ unsigned order; /* order of the page */ + u32 id; /* ID for external mapping */ struct buffer_data_page *page; /* Actual data page */ }; @@ -484,6 +489,12 @@ struct ring_buffer_per_cpu { u64 read_stamp; /* pages removed since last reset */ unsigned long pages_removed; + + unsigned int mapped; + struct mutex mapping_lock; + unsigned long *subbuf_ids; /* ID to subbuf VA */ + struct trace_buffer_meta *meta_page; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -1524,7 +1535,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, pages); page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - mflags | __GFP_ZERO, + mflags | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto free_pages; @@ -1599,6 +1610,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); + mutex_init(&cpu_buffer->mapping_lock); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1609,7 +1621,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO, + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto fail_free_reader; @@ -1789,8 +1801,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) return buffer->time_stamp_abs; } -static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); - static inline unsigned long rb_page_entries(struct buffer_page *bpage) { return local_read(&bpage->entries) & RB_WRITE_MASK; @@ -2318,7 +2328,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) /* Size is determined by what has been committed */ static __always_inline unsigned rb_page_size(struct buffer_page *bpage) { - return rb_page_commit(bpage); + return rb_page_commit(bpage) & ~RB_MISSED_MASK; } static __always_inline unsigned @@ -3945,7 +3955,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) return true; /* Reader should exhaust content in reader page */ - if (reader->read != rb_page_commit(reader)) + if (reader->read != rb_page_size(reader)) return false; /* @@ -4416,7 +4426,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) return ((iter->head_page == commit_page && iter->head >= commit) || (iter->head_page == reader && commit_page == head_page && head_page->read == commit && - iter->head == rb_page_commit(cpu_buffer->reader_page))); + iter->head == rb_page_size(cpu_buffer->reader_page))); } EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); @@ -5211,6 +5221,22 @@ static void rb_clear_buffer_page(struct buffer_page *page) page->read = 0; } +static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct trace_buffer_meta *meta = cpu_buffer->meta_page; + + meta->reader.read = cpu_buffer->reader_page->read; + meta->reader.id = cpu_buffer->reader_page->id; + meta->reader.lost_events = cpu_buffer->lost_events; + + meta->entries = local_read(&cpu_buffer->entries); + meta->overrun = local_read(&cpu_buffer->overrun); + meta->read = cpu_buffer->read; + + /* Some archs do not have data cache coherency between kernel and user-space */ + flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { @@ -5255,6 +5281,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; + if (cpu_buffer->mapped) + rb_update_meta_page(cpu_buffer); + rb_head_page_activate(cpu_buffer); cpu_buffer->pages_removed = 0; } @@ -5469,6 +5498,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; + /* It's up to the callers to not try to swap mapped buffers */ + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { + ret = -EBUSY; + goto out; + } + /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; @@ -5579,7 +5614,7 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) goto out; page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY | __GFP_ZERO, + GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) { kfree(bpage); @@ -5720,7 +5755,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, event = rb_reader_event(cpu_buffer); read = reader->read; - commit = rb_page_commit(reader); + commit = rb_page_size(reader); /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; @@ -5733,7 +5768,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, * Otherwise, we can simply swap the page with the one passed in. */ if (read || (len < (commit - read)) || - cpu_buffer->reader_page == cpu_buffer->commit_page) { + cpu_buffer->reader_page == cpu_buffer->commit_page || + cpu_buffer->mapped) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -5796,7 +5832,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); - cpu_buffer->read_bytes += rb_page_commit(reader); + cpu_buffer->read_bytes += rb_page_size(reader); /* swap the pages */ rb_init_page(bpage); @@ -5956,6 +5992,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) cpu_buffer = buffer->buffers[cpu]; + if (cpu_buffer->mapped) { + err = -EBUSY; + goto error; + } + /* Update the number of pages to match the new size */ nr_pages = old_size * buffer->buffers[cpu]->nr_pages; nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); @@ -6057,6 +6098,414 @@ error: } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); +static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct page *page; + + if (cpu_buffer->meta_page) + return 0; + + page = alloc_page(GFP_USER | __GFP_ZERO); + if (!page) + return -ENOMEM; + + cpu_buffer->meta_page = page_to_virt(page); + + return 0; +} + +static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long addr = (unsigned long)cpu_buffer->meta_page; + + free_page(addr); + cpu_buffer->meta_page = NULL; +} + +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long *subbuf_ids) +{ + struct trace_buffer_meta *meta = cpu_buffer->meta_page; + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; + struct buffer_page *first_subbuf, *subbuf; + int id = 0; + + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; + cpu_buffer->reader_page->id = id++; + + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); + do { + if (WARN_ON(id >= nr_subbufs)) + break; + + subbuf_ids[id] = (unsigned long)subbuf->page; + subbuf->id = id; + + rb_inc_page(&subbuf); + id++; + } while (subbuf != first_subbuf); + + /* install subbuf ID to kern VA translation */ + cpu_buffer->subbuf_ids = subbuf_ids; + + meta->meta_page_size = PAGE_SIZE; + meta->meta_struct_len = sizeof(*meta); + meta->nr_subbufs = nr_subbufs; + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; + + rb_update_meta_page(cpu_buffer); +} + +static struct ring_buffer_per_cpu * +rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return ERR_PTR(-EINVAL); + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (!cpu_buffer->mapped) { + mutex_unlock(&cpu_buffer->mapping_lock); + return ERR_PTR(-ENODEV); + } + + return cpu_buffer; +} + +static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + mutex_unlock(&cpu_buffer->mapping_lock); +} + +/* + * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need + * to be set-up or torn-down. + */ +static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, + bool inc) +{ + unsigned long flags; + + lockdep_assert_held(&cpu_buffer->mapping_lock); + + if (inc && cpu_buffer->mapped == UINT_MAX) + return -EBUSY; + + if (WARN_ON(!inc && cpu_buffer->mapped == 0)) + return -EINVAL; + + mutex_lock(&cpu_buffer->buffer->mutex); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + if (inc) + cpu_buffer->mapped++; + else + cpu_buffer->mapped--; + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + mutex_unlock(&cpu_buffer->buffer->mutex); + + return 0; +} + +/* + * +--------------+ pgoff == 0 + * | meta page | + * +--------------+ pgoff == 1 + * | subbuffer 0 | + * | | + * +--------------+ pgoff == (1 + (1 << subbuf_order)) + * | subbuffer 1 | + * | | + * ... + */ +#ifdef CONFIG_MMU +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, + struct vm_area_struct *vma) +{ + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff; + unsigned int subbuf_pages, subbuf_order; + struct page **pages; + int p = 0, s = 0; + int err; + + /* Refuse MP_PRIVATE or writable mappings */ + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC || + !(vma->vm_flags & VM_MAYSHARE)) + return -EPERM; + + /* + * Make sure the mapping cannot become writable later. Also tell the VM + * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). + */ + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP, + VM_MAYWRITE); + + lockdep_assert_held(&cpu_buffer->mapping_lock); + + subbuf_order = cpu_buffer->buffer->subbuf_order; + subbuf_pages = 1 << subbuf_order; + + nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ + nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */ + + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + if (!vma_pages || vma_pages > nr_pages) + return -EINVAL; + + nr_pages = vma_pages; + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + if (!pgoff) { + pages[p++] = virt_to_page(cpu_buffer->meta_page); + + /* + * TODO: Align sub-buffers on their size, once + * vm_insert_pages() supports the zero-page. + */ + } else { + /* Skip the meta-page */ + pgoff--; + + if (pgoff % subbuf_pages) { + err = -EINVAL; + goto out; + } + + s += pgoff / subbuf_pages; + } + + while (p < nr_pages) { + struct page *page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); + int off = 0; + + if (WARN_ON_ONCE(s >= nr_subbufs)) { + err = -EINVAL; + goto out; + } + + for (; off < (1 << (subbuf_order)); off++, page++) { + if (p >= nr_pages) + break; + + pages[p++] = page; + } + s++; + } + + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); + +out: + kfree(pages); + + return err; +} +#else +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, + struct vm_area_struct *vma) +{ + return -EOPNOTSUPP; +} +#endif + +int ring_buffer_map(struct trace_buffer *buffer, int cpu, + struct vm_area_struct *vma) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags, *subbuf_ids; + int err = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (cpu_buffer->mapped) { + err = __rb_map_vma(cpu_buffer, vma); + if (!err) + err = __rb_inc_dec_mapped(cpu_buffer, true); + mutex_unlock(&cpu_buffer->mapping_lock); + return err; + } + + /* prevent another thread from changing buffer/sub-buffer sizes */ + mutex_lock(&buffer->mutex); + + err = rb_alloc_meta_page(cpu_buffer); + if (err) + goto unlock; + + /* subbuf_ids include the reader while nr_pages does not */ + subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); + if (!subbuf_ids) { + rb_free_meta_page(cpu_buffer); + err = -ENOMEM; + goto unlock; + } + + atomic_inc(&cpu_buffer->resize_disabled); + + /* + * Lock all readers to block any subbuf swap until the subbuf IDs are + * assigned. + */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_setup_ids_meta_page(cpu_buffer, subbuf_ids); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + err = __rb_map_vma(cpu_buffer, vma); + if (!err) { + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + cpu_buffer->mapped = 1; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + } else { + kfree(cpu_buffer->subbuf_ids); + cpu_buffer->subbuf_ids = NULL; + rb_free_meta_page(cpu_buffer); + } + +unlock: + mutex_unlock(&buffer->mutex); + mutex_unlock(&cpu_buffer->mapping_lock); + + return err; +} + +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + int err = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (!cpu_buffer->mapped) { + err = -ENODEV; + goto out; + } else if (cpu_buffer->mapped > 1) { + __rb_inc_dec_mapped(cpu_buffer, false); + goto out; + } + + mutex_lock(&buffer->mutex); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + cpu_buffer->mapped = 0; + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + kfree(cpu_buffer->subbuf_ids); + cpu_buffer->subbuf_ids = NULL; + rb_free_meta_page(cpu_buffer); + atomic_dec(&cpu_buffer->resize_disabled); + + mutex_unlock(&buffer->mutex); + +out: + mutex_unlock(&cpu_buffer->mapping_lock); + + return err; +} + +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *reader; + unsigned long missed_events; + unsigned long reader_size; + unsigned long flags; + + cpu_buffer = rb_get_mapped_buffer(buffer, cpu); + if (IS_ERR(cpu_buffer)) + return (int)PTR_ERR(cpu_buffer); + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + +consume: + if (rb_per_cpu_empty(cpu_buffer)) + goto out; + + reader_size = rb_page_size(cpu_buffer->reader_page); + + /* + * There are data to be read on the current reader page, we can + * return to the caller. But before that, we assume the latter will read + * everything. Let's update the kernel reader accordingly. + */ + if (cpu_buffer->reader_page->read < reader_size) { + while (cpu_buffer->reader_page->read < reader_size) + rb_advance_reader(cpu_buffer); + goto out; + } + + reader = rb_get_reader_page(cpu_buffer); + if (WARN_ON(!reader)) + goto out; + + /* Check if any events were dropped */ + missed_events = cpu_buffer->lost_events; + + if (cpu_buffer->reader_page != cpu_buffer->commit_page) { + if (missed_events) { + struct buffer_data_page *bpage = reader->page; + unsigned int commit; + /* + * Use the real_end for the data size, + * This gives us a chance to store the lost events + * on the page. + */ + if (reader->real_end) + local_set(&bpage->commit, reader->real_end); + /* + * If there is room at the end of the page to save the + * missed events, then record it there. + */ + commit = rb_page_size(reader); + if (buffer->subbuf_size - commit >= sizeof(missed_events)) { + memcpy(&bpage->data[commit], &missed_events, + sizeof(missed_events)); + local_add(RB_MISSED_STORED, &bpage->commit); + } + local_add(RB_MISSED_EVENTS, &bpage->commit); + } + } else { + /* + * There really shouldn't be any missed events if the commit + * is on the reader page. + */ + WARN_ON_ONCE(missed_events); + } + + cpu_buffer->lost_events = 0; + + goto consume; + +out: + /* Some archs do not have data cache coherency between kernel and user-space */ + flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); + + rb_update_meta_page(cpu_buffer); + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + rb_put_mapped_buffer(cpu_buffer); + + return 0; +} + /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 233d1af39fff..578a49ff5c32 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1191,6 +1191,12 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr, return; } + if (tr->mapped) { + trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); + return; + } + local_irq_save(flags); update_max_tr(tr, current, smp_processor_id(), cond_data); local_irq_restore(flags); @@ -1323,7 +1329,7 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr) lockdep_assert_held(&trace_types_lock); spin_lock(&tr->snapshot_trigger_lock); - if (tr->snapshot == UINT_MAX) { + if (tr->snapshot == UINT_MAX || tr->mapped) { spin_unlock(&tr->snapshot_trigger_lock); return -EBUSY; } @@ -5540,7 +5546,7 @@ static const char readme_msg[] = "\t kernel return probes support: $retval, $arg<N>, $comm\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" - "\t symstr, <type>\\[<array-size>\\]\n" + "\t symstr, %pd/%pD, <type>\\[<array-size>\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: <stype> <name>;\n" "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" @@ -6068,7 +6074,7 @@ static void tracing_set_nop(struct trace_array *tr) { if (tr->current_trace == &nop_trace) return; - + tr->current_trace->enabled--; if (tr->current_trace->reset) @@ -8194,15 +8200,32 @@ out: return ret; } -/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; + int err; + + if (cmd == TRACE_MMAP_IOCTL_GET_READER) { + if (!(file->f_flags & O_NONBLOCK)) { + err = ring_buffer_wait(iter->array_buffer->buffer, + iter->cpu_file, + iter->tr->buffer_percent, + NULL, NULL); + if (err) + return err; + } - if (cmd) - return -ENOIOCTLCMD; + return ring_buffer_map_get_reader(iter->array_buffer->buffer, + iter->cpu_file); + } else if (cmd) { + return -ENOTTY; + } + /* + * An ioctl call with cmd 0 to the ring buffer file will wake up all + * waiters + */ mutex_lock(&trace_types_lock); /* Make sure the waiters see the new wait_index */ @@ -8214,6 +8237,76 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned return 0; } +#ifdef CONFIG_TRACER_MAX_TRACE +static int get_snapshot_map(struct trace_array *tr) +{ + int err = 0; + + /* + * Called with mmap_lock held. lockdep would be unhappy if we would now + * take trace_types_lock. Instead use the specific + * snapshot_trigger_lock. + */ + spin_lock(&tr->snapshot_trigger_lock); + + if (tr->snapshot || tr->mapped == UINT_MAX) + err = -EBUSY; + else + tr->mapped++; + + spin_unlock(&tr->snapshot_trigger_lock); + + /* Wait for update_max_tr() to observe iter->tr->mapped */ + if (tr->mapped == 1) + synchronize_rcu(); + + return err; + +} +static void put_snapshot_map(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->mapped)) + tr->mapped--; + spin_unlock(&tr->snapshot_trigger_lock); +} +#else +static inline int get_snapshot_map(struct trace_array *tr) { return 0; } +static inline void put_snapshot_map(struct trace_array *tr) { } +#endif + +static void tracing_buffers_mmap_close(struct vm_area_struct *vma) +{ + struct ftrace_buffer_info *info = vma->vm_file->private_data; + struct trace_iterator *iter = &info->iter; + + WARN_ON(ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file)); + put_snapshot_map(iter->tr); +} + +static const struct vm_operations_struct tracing_buffers_vmops = { + .close = tracing_buffers_mmap_close, +}; + +static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; + int ret = 0; + + ret = get_snapshot_map(iter->tr); + if (ret) + return ret; + + ret = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma); + if (ret) + put_snapshot_map(iter->tr); + + vma->vm_ops = &tracing_buffers_vmops; + + return ret; +} + static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, @@ -8223,6 +8316,7 @@ static const struct file_operations tracing_buffers_fops = { .splice_read = tracing_buffers_splice_read, .unlocked_ioctl = tracing_buffers_ioctl, .llseek = no_llseek, + .mmap = tracing_buffers_mmap, }; static ssize_t diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 64450615ca0c..749a182dab48 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -336,6 +336,7 @@ struct trace_array { bool allocated_snapshot; spinlock_t snapshot_trigger_lock; unsigned int snapshot; + unsigned int mapped; unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 811b08439406..e19c32f2a938 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -104,7 +104,7 @@ static void trace_do_benchmark(void) stddev = 0; delta = bm_total; - delta = div64_u64(delta, bm_cnt); + do_div(delta, (u32)bm_cnt); avg = delta; if (stddev > 0) { diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 70d428c394b6..3a2b46847c8b 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1990,6 +1990,80 @@ static int user_event_set_tp_name(struct user_event *user) } /* + * Counts how many ';' without a trailing space are in the args. + */ +static int count_semis_no_space(char *args) +{ + int count = 0; + + while ((args = strchr(args, ';'))) { + args++; + + if (!isspace(*args)) + count++; + } + + return count; +} + +/* + * Copies the arguments while ensuring all ';' have a trailing space. + */ +static char *insert_space_after_semis(char *args, int count) +{ + char *fixed, *pos; + int len; + + len = strlen(args) + count; + fixed = kmalloc(len + 1, GFP_KERNEL); + + if (!fixed) + return NULL; + + pos = fixed; + + /* Insert a space after ';' if there is no trailing space. */ + while (*args) { + *pos = *args++; + + if (*pos++ == ';' && !isspace(*args)) + *pos++ = ' '; + } + + *pos = '\0'; + + return fixed; +} + +static char **user_event_argv_split(char *args, int *argc) +{ + char **split; + char *fixed; + int count; + + /* Count how many ';' without a trailing space */ + count = count_semis_no_space(args); + + /* No fixup is required */ + if (!count) + return argv_split(GFP_KERNEL, args, argc); + + /* We must fixup 'field;field' to 'field; field' */ + fixed = insert_space_after_semis(args, count); + + if (!fixed) + return NULL; + + /* We do a normal split afterwards */ + split = argv_split(GFP_KERNEL, fixed, argc); + + /* We can free since argv_split makes a copy */ + kfree(fixed); + + return split; +} + +/* * Parses the event name, arguments and flags then registers if successful. * The name buffer lifetime is owned by this method for success cases only. * Upon success the returned user_event has its ref count increased by 1. @@ -2012,7 +2086,7 @@ static int user_event_parse(struct user_event_group *group, char *name, return -EPERM; if (args) { - argv = argv_split(GFP_KERNEL, args, &argc); + argv = user_event_argv_split(args, &argc); if (!argv) return -ENOMEM; @@ -2833,7 +2907,6 @@ static struct ctl_table user_event_sysctls[] = { .mode = 0644, .proc_handler = set_max_user_events_sysctl, }, - {} }; static int __init trace_events_user_init(void) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 4f4280815522..62e6a8f4aae9 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -994,6 +994,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) char gbuf[MAX_EVENT_NAME_LEN]; char sbuf[KSYM_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; + char *dbuf = NULL; bool is_tracepoint = false; struct tracepoint *tpoint = NULL; struct traceprobe_parse_context ctx = { @@ -1104,6 +1105,10 @@ static int __trace_fprobe_create(int argc, const char *argv[]) argv = new_argv; } + ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); + if (ret) + goto out; + /* setup a probe */ tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, argc, is_return); @@ -1154,6 +1159,7 @@ out: trace_probe_log_clear(); kfree(new_argv); kfree(symbol); + kfree(dbuf); return ret; parse_error: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 14099cc17fc9..16383247bdbf 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -111,6 +111,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; } +#ifdef CONFIG_MODULES static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) { char *p; @@ -129,6 +130,12 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) return ret; } +#else +static inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) +{ + return false; +} +#endif static bool trace_kprobe_is_busy(struct dyn_event *ev) { @@ -670,6 +677,7 @@ end: return ret; } +#ifdef CONFIG_MODULES /* Module notifier call back, checking event on the module */ static int trace_kprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -704,6 +712,16 @@ static struct notifier_block trace_kprobe_module_nb = { .notifier_call = trace_kprobe_module_callback, .priority = 1 /* Invoked after kprobe module callback */ }; +static int trace_kprobe_register_module_notifier(void) +{ + return register_module_notifier(&trace_kprobe_module_nb); +} +#else +static int trace_kprobe_register_module_notifier(void) +{ + return 0; +} +#endif /* CONFIG_MODULES */ static int count_symbols(void *data, unsigned long unused) { @@ -782,6 +800,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; + char *dbuf = NULL; struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; switch (argv[0][0]) { @@ -933,6 +952,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) argv = new_argv; } + ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); + if (ret) + goto out; + /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, argc, is_return); @@ -979,6 +1002,7 @@ out: trace_probe_log_clear(); kfree(new_argv); kfree(symbol); + kfree(dbuf); return ret; parse_error: @@ -1933,7 +1957,7 @@ static __init int init_kprobe_trace_early(void) if (ret) return ret; - if (register_module_notifier(&trace_kprobe_module_nb)) + if (trace_kprobe_register_module_notifier()) return -EINVAL; return 0; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 42bc0f362226..5e263c141574 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "trace_probe: " fmt #include <linux/bpf.h> +#include <linux/fs.h> #include "trace_btf.h" #include "trace_probe.h" @@ -1180,8 +1181,6 @@ parse_probe_arg(char *arg, const struct fetch_type *type, return ret; } -#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) - /* Bitfield type needs to be parsed into a fetch function */ static int __parse_bitfield_probe_arg(const char *bf, const struct fetch_type *t, @@ -1739,6 +1738,68 @@ error: return ERR_PTR(ret); } +/* @buf: *buf must be equal to NULL. Caller must to free *buf */ +int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) +{ + int i, used, ret; + const int bufsize = MAX_DENTRY_ARGS_LEN; + char *tmpbuf = NULL; + + if (*buf) + return -EINVAL; + + used = 0; + for (i = 0; i < argc; i++) { + char *tmp; + char *equal; + size_t arg_len; + + if (!glob_match("*:%p[dD]", argv[i])) + continue; + + if (!tmpbuf) { + tmpbuf = kmalloc(bufsize, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + } + + tmp = kstrdup(argv[i], GFP_KERNEL); + if (!tmp) + goto nomem; + + equal = strchr(tmp, '='); + if (equal) + *equal = '\0'; + arg_len = strlen(argv[i]); + tmp[arg_len - 4] = '\0'; + if (argv[i][arg_len - 1] == 'd') + ret = snprintf(tmpbuf + used, bufsize - used, + "%s%s+0x0(+0x%zx(%s)):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + equal ? equal + 1 : tmp); + else + ret = snprintf(tmpbuf + used, bufsize - used, + "%s%s+0x0(+0x%zx(+0x%zx(%s))):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + offsetof(struct file, f_path.dentry), + equal ? equal + 1 : tmp); + + kfree(tmp); + if (ret >= bufsize - used) + goto nomem; + argv[i] = tmpbuf + used; + used += ret + 1; + } + + *buf = tmpbuf; + return 0; +nomem: + kfree(tmpbuf); + return -ENOMEM; +} + void traceprobe_finish_parse(struct traceprobe_parse_context *ctx) { clear_btf_context(ctx); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index cef3a50628a3..5803e6a41570 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -34,6 +34,7 @@ #define MAX_ARRAY_LEN 64 #define MAX_ARG_NAME_LEN 32 #define MAX_BTF_ARGS_LEN 128 +#define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX #define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) @@ -428,6 +429,7 @@ extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char **traceprobe_expand_meta_args(int argc, const char *argv[], int *new_argc, char *buf, int bufsize, struct traceprobe_parse_context *ctx); +extern int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9e461362450a..8541fa1494ae 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -854,6 +854,7 @@ static const struct file_operations uprobe_profile_ops = { struct uprobe_cpu_buffer { struct mutex mutex; void *buf; + int dsize; }; static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; static int uprobe_buffer_refcnt; @@ -940,30 +941,56 @@ static struct uprobe_cpu_buffer *uprobe_buffer_get(void) static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) { + if (!ucb) + return; mutex_unlock(&ucb->mutex); } +static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, + struct pt_regs *regs, + struct uprobe_cpu_buffer **ucbp) +{ + struct uprobe_cpu_buffer *ucb; + int dsize, esize; + + if (*ucbp) + return *ucbp; + + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + dsize = __get_data_size(&tu->tp, regs, NULL); + + ucb = uprobe_buffer_get(); + ucb->dsize = tu->tp.size + dsize; + + store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); + + *ucbp = ucb; + return ucb; +} + static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize, + struct uprobe_cpu_buffer **ucbp, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; struct trace_event_buffer fbuffer; + struct uprobe_cpu_buffer *ucb; void *data; int size, esize; struct trace_event_call *call = trace_probe_event_call(&tu->tp); WARN_ON(call != trace_file->event_call); - if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) + ucb = prepare_uprobe_buffer(tu, regs, ucbp); + if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE)) return; if (trace_trigger_soft_disabled(trace_file)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = esize + tu->tp.size + dsize; + size = esize + ucb->dsize; entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); if (!entry) return; @@ -977,14 +1004,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - memcpy(data, ucb->buf, tu->tp.size + dsize); + memcpy(data, ucb->buf, ucb->dsize); trace_event_buffer_commit(&fbuffer); } /* uprobe handler */ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; @@ -993,7 +1020,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); + __uprobe_trace_func(tu, 0, regs, ucbp, link->file); rcu_read_unlock(); return 0; @@ -1001,13 +1028,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); + __uprobe_trace_func(tu, func, regs, ucbp, link->file); rcu_read_unlock(); } @@ -1199,9 +1226,6 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) { struct perf_event *event; - if (filter->nr_systemwide) - return true; - list_for_each_entry(event, &filter->perf_events, hw.tp_list) { if (event->hw.target->mm == mm) return true; @@ -1326,6 +1350,13 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, tu = container_of(uc, struct trace_uprobe, consumer); filter = tu->tp.event->filter; + /* + * speculative short-circuiting check to avoid unnecessarily taking + * filter->rwlock below, if the uprobe has system-wide consumer + */ + if (READ_ONCE(filter->nr_systemwide)) + return true; + read_lock(&filter->rwlock); ret = __uprobe_perf_filter(filter, mm); read_unlock(&filter->rwlock); @@ -1335,10 +1366,11 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { struct trace_event_call *call = trace_probe_event_call(&tu->tp); struct uprobe_trace_entry_head *entry; + struct uprobe_cpu_buffer *ucb; struct hlist_head *head; void *data; int size, esize; @@ -1356,7 +1388,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = esize + tu->tp.size + dsize; + ucb = prepare_uprobe_buffer(tu, regs, ucbp); + size = esize + ucb->dsize; size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; @@ -1379,13 +1412,10 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - memcpy(data, ucb->buf, tu->tp.size + dsize); - - if (size - esize > tu->tp.size + dsize) { - int len = tu->tp.size + dsize; + memcpy(data, ucb->buf, ucb->dsize); - memset(data + len, 0, size - esize - len); - } + if (size - esize > ucb->dsize) + memset(data + ucb->dsize, 0, size - esize - ucb->dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); @@ -1395,21 +1425,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, /* uprobe profile handler */ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - __uprobe_perf_func(tu, 0, regs, ucb, dsize); + __uprobe_perf_func(tu, 0, regs, ucbp); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { - __uprobe_perf_func(tu, func, regs, ucb, dsize); + __uprobe_perf_func(tu, func, regs, ucbp); } int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, @@ -1474,11 +1504,9 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; - struct uprobe_cpu_buffer *ucb; - int dsize, esize; + struct uprobe_cpu_buffer *ucb = NULL; int ret = 0; - tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; @@ -1490,18 +1518,12 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs, NULL); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - ret |= uprobe_trace_func(tu, regs, ucb, dsize); + ret |= uprobe_trace_func(tu, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - ret |= uprobe_perf_func(tu, regs, ucb, dsize); + ret |= uprobe_perf_func(tu, regs, &ucb); #endif uprobe_buffer_put(ucb); return ret; @@ -1512,8 +1534,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; - struct uprobe_cpu_buffer *ucb; - int dsize, esize; + struct uprobe_cpu_buffer *ucb = NULL; tu = container_of(con, struct trace_uprobe, consumer); @@ -1525,18 +1546,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs, NULL); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - uretprobe_trace_func(tu, func, regs, ucb, dsize); + uretprobe_trace_func(tu, func, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - uretprobe_perf_func(tu, func, regs, ucb, dsize); + uretprobe_perf_func(tu, func, regs, &ucb); #endif uprobe_buffer_put(ucb); return 0; diff --git a/kernel/ucount.c b/kernel/ucount.c index 4aa6166cb856..8c07714ff27d 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -38,7 +38,7 @@ static int set_is_seen(struct ctl_table_set *set) } static int set_permissions(struct ctl_table_header *head, - struct ctl_table *table) + const struct ctl_table *table) { struct user_namespace *user_ns = container_of(head->set, struct user_namespace, set); @@ -87,7 +87,6 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_fanotify_groups"), UCOUNT_ENTRY("max_fanotify_marks"), #endif - { } }; #endif /* CONFIG_SYSCTL */ @@ -96,7 +95,7 @@ bool setup_userns_sysctls(struct user_namespace *ns) #ifdef CONFIG_SYSCTL struct ctl_table *tbl; - BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS + 1); + BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS); setup_sysctl_set(&ns->set, &set_root, set_is_seen); tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); if (tbl) { @@ -119,7 +118,7 @@ bool setup_userns_sysctls(struct user_namespace *ns) void retire_userns_sysctls(struct user_namespace *ns) { #ifdef CONFIG_SYSCTL - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = ns->sysctls->ctl_table_arg; unregister_sysctl_table(ns->sysctls); diff --git a/kernel/umh.c b/kernel/umh.c index 1b13c5d34624..598b3ffe1522 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -560,7 +560,6 @@ static struct ctl_table usermodehelper_table[] = { .mode = 0600, .proc_handler = proc_cap_handler, }, - { } }; static int __init init_umh_sysctls(void) diff --git a/kernel/user.c b/kernel/user.c index 03cedc366dc9..aa1162deafe4 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -88,7 +88,7 @@ EXPORT_SYMBOL_GPL(init_user_ns); * when changing user ID's (ie setuid() and friends). */ -#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) +#define UIDHASH_BITS (IS_ENABLED(CONFIG_BASE_SMALL) ? 3 : 7) #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 019e3a1566cf..76a772072557 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -120,7 +120,6 @@ static struct ctl_table uts_kern_table[] = { .proc_handler = proc_do_uts_string, .poll = &domainname_poll, }, - {} }; #ifdef CONFIG_PROC_SYSCTL diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 23c125c2e243..1d5eadd9dd61 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -198,7 +198,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); VMCOREINFO_NUMBER(PG_swapbacked); - VMCOREINFO_NUMBER(PG_slab); +#define PAGE_SLAB_MAPCOUNT_VALUE (~PG_slab) + VMCOREINFO_NUMBER(PAGE_SLAB_MAPCOUNT_VALUE); #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d12ff74889ed..941236828de8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -1155,7 +1155,6 @@ static struct ctl_table watchdog_sysctls[] = { }, #endif /* CONFIG_SMP */ #endif - {} }; static struct ctl_table watchdog_hardlockup_sysctl[] = { @@ -1168,7 +1167,6 @@ static struct ctl_table watchdog_hardlockup_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static void __init watchdog_sysctl_init(void) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 80882ae43261..003474c9a77d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -99,6 +99,7 @@ enum worker_flags { enum work_cancel_flags { WORK_CANCEL_DELAYED = 1 << 0, /* canceling a delayed_work */ + WORK_CANCEL_DISABLE = 1 << 1, /* canceling to disable */ }; enum wq_internal_consts { @@ -392,6 +393,12 @@ struct wq_pod_type { int *cpu_pod; /* cpu -> pod */ }; +struct work_offq_data { + u32 pool_id; + u32 disable; + u32 flags; +}; + static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { [WQ_AFFN_DFL] = "default", [WQ_AFFN_CPU] = "cpu", @@ -490,12 +497,6 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; /* - * Used to synchronize multiple cancel_sync attempts on the same work item. See - * work_grab_pending() and __cancel_work_sync(). - */ -static DECLARE_WAIT_QUEUE_HEAD(wq_cancel_waitq); - -/* * I: kthread_worker to release pwq's. pwq release needs to be bounced to a * process context while holding a pool lock. Bounce to a dedicated kthread * worker to avoid A-A deadlocks. @@ -763,6 +764,11 @@ static int work_next_color(int color) return (color + 1) % WORK_NR_COLORS; } +static unsigned long pool_offq_flags(struct worker_pool *pool) +{ + return (pool->flags & POOL_BH) ? WORK_OFFQ_BH : 0; +} + /* * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data * contain the pointer to the queued pwq. Once execution starts, the flag @@ -776,11 +782,6 @@ static int work_next_color(int color) * corresponding to a work. Pool is available once the work has been * queued anywhere after initialization until it is sync canceled. pwq is * available only while the work item is queued. - * - * %WORK_OFFQ_CANCELING is used to mark a work item which is being - * canceled. While being canceled, a work item may have its PENDING set - * but stay off timer and worklist for arbitrarily long and nobody should - * try to steal the PENDING bit. */ static inline void set_work_data(struct work_struct *work, unsigned long data) { @@ -892,36 +893,26 @@ static struct worker_pool *get_work_pool(struct work_struct *work) return idr_find(&worker_pool_idr, pool_id); } -/** - * get_work_pool_id - return the worker pool ID a given work is associated with - * @work: the work item of interest - * - * Return: The worker_pool ID @work was last associated with. - * %WORK_OFFQ_POOL_NONE if none. - */ -static int get_work_pool_id(struct work_struct *work) +static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits) { - unsigned long data = atomic_long_read(&work->data); - - if (data & WORK_STRUCT_PWQ) - return work_struct_pwq(data)->pool->id; - - return data >> WORK_OFFQ_POOL_SHIFT; + return (v >> shift) & ((1 << bits) - 1); } -static void mark_work_canceling(struct work_struct *work) +static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data) { - unsigned long pool_id = get_work_pool_id(work); + WARN_ON_ONCE(data & WORK_STRUCT_PWQ); - pool_id <<= WORK_OFFQ_POOL_SHIFT; - set_work_data(work, pool_id | WORK_STRUCT_PENDING | WORK_OFFQ_CANCELING); + offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT, + WORK_OFFQ_POOL_BITS); + offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT, + WORK_OFFQ_DISABLE_BITS); + offqd->flags = data & WORK_OFFQ_FLAG_MASK; } -static bool work_is_canceling(struct work_struct *work) +static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd) { - unsigned long data = atomic_long_read(&work->data); - - return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING); + return ((unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT) | + ((unsigned long)offqd->flags); } /* @@ -2067,8 +2058,6 @@ out_put: * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry - * -ENOENT if someone else is canceling @work, this state may persist - * for arbitrarily long * ======== ================================================================ * * Note: @@ -2151,7 +2140,8 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, * this destroys work->data needed by the next step, stash it. */ work_data = *work_data_bits(work); - set_work_pool_and_keep_pending(work, pool->id, 0); + set_work_pool_and_keep_pending(work, pool->id, + pool_offq_flags(pool)); /* must be the last step, see the function comment */ pwq_dec_nr_in_flight(pwq, work_data); @@ -2164,26 +2154,9 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, fail: rcu_read_unlock(); local_irq_restore(*irq_flags); - if (work_is_canceling(work)) - return -ENOENT; - cpu_relax(); return -EAGAIN; } -struct cwt_wait { - wait_queue_entry_t wait; - struct work_struct *work; -}; - -static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) -{ - struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); - - if (cwait->work != key) - return 0; - return autoremove_wake_function(wait, mode, sync, key); -} - /** * work_grab_pending - steal work item from worklist and disable irq * @work: work item to steal @@ -2193,7 +2166,7 @@ static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *k * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer * or on worklist. * - * Must be called in process context. IRQ is disabled on return with IRQ state + * Can be called from any context. IRQ is disabled on return with IRQ state * stored in *@irq_flags. The caller is responsible for re-enabling it using * local_irq_restore(). * @@ -2202,41 +2175,14 @@ static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *k static bool work_grab_pending(struct work_struct *work, u32 cflags, unsigned long *irq_flags) { - struct cwt_wait cwait; int ret; - might_sleep(); -repeat: - ret = try_to_grab_pending(work, cflags, irq_flags); - if (likely(ret >= 0)) - return ret; - if (ret != -ENOENT) - goto repeat; - - /* - * Someone is already canceling. Wait for it to finish. flush_work() - * doesn't work for PREEMPT_NONE because we may get woken up between - * @work's completion and the other canceling task resuming and clearing - * CANCELING - flush_work() will return false immediately as @work is no - * longer busy, try_to_grab_pending() will return -ENOENT as @work is - * still being canceled and the other canceling task won't be able to - * clear CANCELING as we're hogging the CPU. - * - * Let's wait for completion using a waitqueue. As this may lead to the - * thundering herd problem, use a custom wake function which matches - * @work along with exclusive wait and wakeup. - */ - init_wait(&cwait.wait); - cwait.wait.func = cwt_wakefn; - cwait.work = work; - - prepare_to_wait_exclusive(&wq_cancel_waitq, &cwait.wait, - TASK_UNINTERRUPTIBLE); - if (work_is_canceling(work)) - schedule(); - finish_wait(&wq_cancel_waitq, &cwait.wait); - - goto repeat; + while (true) { + ret = try_to_grab_pending(work, cflags, irq_flags); + if (ret >= 0) + return ret; + cpu_relax(); + } } /** @@ -2422,6 +2368,21 @@ out: rcu_read_unlock(); } +static bool clear_pending_if_disabled(struct work_struct *work) +{ + unsigned long data = *work_data_bits(work); + struct work_offq_data offqd; + + if (likely((data & WORK_STRUCT_PWQ) || + !(data & WORK_OFFQ_DISABLE_MASK))) + return false; + + work_offqd_unpack(&offqd, data); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); + return true; +} + /** * queue_work_on - queue work on specific cpu * @cpu: CPU number to execute work on @@ -2444,7 +2405,8 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, local_irq_save(irq_flags); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !clear_pending_if_disabled(work)) { __queue_work(cpu, wq, work); ret = true; } @@ -2522,7 +2484,8 @@ bool queue_work_node(int node, struct workqueue_struct *wq, local_irq_save(irq_flags); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !clear_pending_if_disabled(work)) { int cpu = select_numa_node_cpu(node); __queue_work(cpu, wq, work); @@ -2604,7 +2567,8 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, /* read the comment in __queue_work() */ local_irq_save(irq_flags); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !clear_pending_if_disabled(work)) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } @@ -2636,19 +2600,14 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { unsigned long irq_flags; - int ret; + bool ret; - do { - ret = try_to_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, - &irq_flags); - } while (unlikely(ret == -EAGAIN)); + ret = work_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, &irq_flags); - if (likely(ret >= 0)) { + if (!clear_pending_if_disabled(&dwork->work)) __queue_delayed_work(cpu, wq, dwork, delay); - local_irq_restore(irq_flags); - } - /* -ENOENT from try_to_grab_pending() becomes %true */ + local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL_GPL(mod_delayed_work_on); @@ -2677,7 +2636,12 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) { struct work_struct *work = &rwork->work; - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + /* + * rcu_work can't be canceled or disabled. Warn if the user reached + * inside @rwork and disabled the inner work. + */ + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && + !WARN_ON_ONCE(clear_pending_if_disabled(work))) { rwork->wq = wq; call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); return true; @@ -2953,7 +2917,7 @@ static void idle_worker_timeout(struct timer_list *t) unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ - worker = list_entry(pool->idle_list.prev, struct worker, entry); + worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; do_cull = !time_before(jiffies, expires); @@ -2995,7 +2959,7 @@ static void idle_cull_fn(struct work_struct *work) struct worker *worker; unsigned long expires; - worker = list_entry(pool->idle_list.prev, struct worker, entry); + worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { @@ -3230,7 +3194,7 @@ __acquires(&pool->lock) * PENDING and queued state changes happen together while IRQ is * disabled. */ - set_work_pool_and_clear_pending(work, pool->id, 0); + set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool)); pwq->stats[PWQ_STAT_STARTED]++; raw_spin_unlock_irq(&pool->lock); @@ -3700,7 +3664,7 @@ void workqueue_softirq_dead(unsigned int cpu) if (!need_more_worker(pool)) continue; - INIT_WORK(&dead_work.work, drain_dead_softirq_workfn); + INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn); dead_work.pool = pool; init_completion(&dead_work.done); @@ -3710,6 +3674,7 @@ void workqueue_softirq_dead(unsigned int cpu) queue_work(system_bh_wq, &dead_work.work); wait_for_completion(&dead_work.done); + destroy_work_on_stack(&dead_work.work); } } @@ -4154,8 +4119,6 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, struct pool_workqueue *pwq; struct workqueue_struct *wq; - might_sleep(); - rcu_read_lock(); pool = get_work_pool(work); if (!pool) { @@ -4207,6 +4170,7 @@ already_gone: static bool __flush_work(struct work_struct *work, bool from_cancel) { struct wq_barrier barr; + unsigned long data; if (WARN_ON(!wq_online)) return false; @@ -4214,13 +4178,41 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!work->func)) return false; - if (start_flush_work(work, &barr, from_cancel)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else { + if (!start_flush_work(work, &barr, from_cancel)) return false; + + /* + * start_flush_work() returned %true. If @from_cancel is set, we know + * that @work must have been executing during start_flush_work() and + * can't currently be queued. Its data must contain OFFQ bits. If @work + * was queued on a BH workqueue, we also know that it was running in the + * BH context and thus can be busy-waited. + */ + data = *work_data_bits(work); + if (from_cancel && + !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) { + /* + * On RT, prevent a live lock when %current preempted soft + * interrupt processing or prevents ksoftirqd from running by + * keeping flipping BH. If the BH work item runs on a different + * CPU then this has no effect other than doing the BH + * disable/enable dance for nothing. This is copied from + * kernel/softirq.c::tasklet_unlock_spin_wait(). + */ + while (!try_wait_for_completion(&barr.done)) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + local_bh_disable(); + local_bh_enable(); + } else { + cpu_relax(); + } + } + } else { + wait_for_completion(&barr.done); } + + destroy_work_on_stack(&barr.work); + return true; } /** @@ -4236,6 +4228,7 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) */ bool flush_work(struct work_struct *work) { + might_sleep(); return __flush_work(work, false); } EXPORT_SYMBOL_GPL(flush_work); @@ -4282,32 +4275,53 @@ bool flush_rcu_work(struct rcu_work *rwork) } EXPORT_SYMBOL(flush_rcu_work); +static void work_offqd_disable(struct work_offq_data *offqd) +{ + const unsigned long max = (1lu << WORK_OFFQ_DISABLE_BITS) - 1; + + if (likely(offqd->disable < max)) + offqd->disable++; + else + WARN_ONCE(true, "workqueue: work disable count overflowed\n"); +} + +static void work_offqd_enable(struct work_offq_data *offqd) +{ + if (likely(offqd->disable > 0)) + offqd->disable--; + else + WARN_ONCE(true, "workqueue: work disable count underflowed\n"); +} + static bool __cancel_work(struct work_struct *work, u32 cflags) { + struct work_offq_data offqd; unsigned long irq_flags; int ret; - do { - ret = try_to_grab_pending(work, cflags, &irq_flags); - } while (unlikely(ret == -EAGAIN)); + ret = work_grab_pending(work, cflags, &irq_flags); - if (unlikely(ret < 0)) - return false; + work_offqd_unpack(&offqd, *work_data_bits(work)); - set_work_pool_and_clear_pending(work, get_work_pool_id(work), 0); + if (cflags & WORK_CANCEL_DISABLE) + work_offqd_disable(&offqd); + + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); return ret; } static bool __cancel_work_sync(struct work_struct *work, u32 cflags) { - unsigned long irq_flags; bool ret; - /* claim @work and tell other tasks trying to grab @work to back off */ - ret = work_grab_pending(work, cflags, &irq_flags); - mark_work_canceling(work); - local_irq_restore(irq_flags); + ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE); + + if (*work_data_bits(work) & WORK_OFFQ_BH) + WARN_ON_ONCE(in_hardirq()); + else + might_sleep(); /* * Skip __flush_work() during early boot when we know that @work isn't @@ -4316,15 +4330,8 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags) if (wq_online) __flush_work(work, true); - /* - * smp_mb() at the end of set_work_pool_and_clear_pending() is paired - * with prepare_to_wait() above so that either waitqueue_active() is - * visible here or !work_is_canceling() is visible there. - */ - set_work_pool_and_clear_pending(work, WORK_OFFQ_POOL_NONE, 0); - - if (waitqueue_active(&wq_cancel_waitq)) - __wake_up(&wq_cancel_waitq, TASK_NORMAL, 1, work); + if (!(cflags & WORK_CANCEL_DISABLE)) + enable_work(work); return ret; } @@ -4342,19 +4349,19 @@ EXPORT_SYMBOL(cancel_work); * cancel_work_sync - cancel a work and wait for it to finish * @work: the work to cancel * - * Cancel @work and wait for its execution to finish. This function - * can be used even if the work re-queues itself or migrates to - * another workqueue. On return from this function, @work is - * guaranteed to be not pending or executing on any CPU. + * Cancel @work and wait for its execution to finish. This function can be used + * even if the work re-queues itself or migrates to another workqueue. On return + * from this function, @work is guaranteed to be not pending or executing on any + * CPU as long as there aren't racing enqueues. * - * cancel_work_sync(&delayed_work->work) must not be used for - * delayed_work's. Use cancel_delayed_work_sync() instead. + * cancel_work_sync(&delayed_work->work) must not be used for delayed_work's. + * Use cancel_delayed_work_sync() instead. * - * The caller must ensure that the workqueue on which @work was last - * queued can't be destroyed before this function returns. + * Must be called from a sleepable context if @work was last queued on a non-BH + * workqueue. Can also be called from non-hardirq atomic contexts including BH + * if @work was last queued on a BH workqueue. * - * Return: - * %true if @work was pending, %false otherwise. + * Returns %true if @work was pending, %false otherwise. */ bool cancel_work_sync(struct work_struct *work) { @@ -4400,6 +4407,108 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** + * disable_work - Disable and cancel a work item + * @work: work item to disable + * + * Disable @work by incrementing its disable count and cancel it if currently + * pending. As long as the disable count is non-zero, any attempt to queue @work + * will fail and return %false. The maximum supported disable depth is 2 to the + * power of %WORK_OFFQ_DISABLE_BITS, currently 65536. + * + * Can be called from any context. Returns %true if @work was pending, %false + * otherwise. + */ +bool disable_work(struct work_struct *work) +{ + return __cancel_work(work, WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_work); + +/** + * disable_work_sync - Disable, cancel and drain a work item + * @work: work item to disable + * + * Similar to disable_work() but also wait for @work to finish if currently + * executing. + * + * Must be called from a sleepable context if @work was last queued on a non-BH + * workqueue. Can also be called from non-hardirq atomic contexts including BH + * if @work was last queued on a BH workqueue. + * + * Returns %true if @work was pending, %false otherwise. + */ +bool disable_work_sync(struct work_struct *work) +{ + return __cancel_work_sync(work, WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_work_sync); + +/** + * enable_work - Enable a work item + * @work: work item to enable + * + * Undo disable_work[_sync]() by decrementing @work's disable count. @work can + * only be queued if its disable count is 0. + * + * Can be called from any context. Returns %true if the disable count reached 0. + * Otherwise, %false. + */ +bool enable_work(struct work_struct *work) +{ + struct work_offq_data offqd; + unsigned long irq_flags; + + work_grab_pending(work, 0, &irq_flags); + + work_offqd_unpack(&offqd, *work_data_bits(work)); + work_offqd_enable(&offqd); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); + local_irq_restore(irq_flags); + + return !offqd.disable; +} +EXPORT_SYMBOL_GPL(enable_work); + +/** + * disable_delayed_work - Disable and cancel a delayed work item + * @dwork: delayed work item to disable + * + * disable_work() for delayed work items. + */ +bool disable_delayed_work(struct delayed_work *dwork) +{ + return __cancel_work(&dwork->work, + WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_delayed_work); + +/** + * disable_delayed_work_sync - Disable, cancel and drain a delayed work item + * @dwork: delayed work item to disable + * + * disable_work_sync() for delayed work items. + */ +bool disable_delayed_work_sync(struct delayed_work *dwork) +{ + return __cancel_work_sync(&dwork->work, + WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); +} +EXPORT_SYMBOL_GPL(disable_delayed_work_sync); + +/** + * enable_delayed_work - Enable a delayed work item + * @dwork: delayed work item to enable + * + * enable_work() for delayed work items. + */ +bool enable_delayed_work(struct delayed_work *dwork) +{ + return enable_work(&dwork->work); +} +EXPORT_SYMBOL_GPL(enable_delayed_work); + +/** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call * @@ -4530,6 +4639,8 @@ static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) { attrs->affn_scope = WQ_AFFN_NR_TYPES; attrs->ordered = false; + if (attrs->affn_strict) + cpumask_copy(attrs->cpumask, cpu_possible_mask); } /* hash value of the content of @attr */ @@ -4538,11 +4649,12 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) u32 hash = 0; hash = jhash_1word(attrs->nice, hash); - hash = jhash(cpumask_bits(attrs->cpumask), - BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + hash = jhash_1word(attrs->affn_strict, hash); hash = jhash(cpumask_bits(attrs->__pod_cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); - hash = jhash_1word(attrs->affn_strict, hash); + if (!attrs->affn_strict) + hash = jhash(cpumask_bits(attrs->cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); return hash; } @@ -4552,11 +4664,11 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, { if (a->nice != b->nice) return false; - if (!cpumask_equal(a->cpumask, b->cpumask)) + if (a->affn_strict != b->affn_strict) return false; if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) return false; - if (a->affn_strict != b->affn_strict) + if (!a->affn_strict && !cpumask_equal(a->cpumask, b->cpumask)) return false; return true; } @@ -7148,25 +7260,27 @@ static ssize_t __wq_cpumask_show(struct device *dev, return written; } -static ssize_t wq_unbound_cpumask_show(struct device *dev, +static ssize_t cpumask_requested_show(struct device *dev, struct device_attribute *attr, char *buf) { - return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); + return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); } +static DEVICE_ATTR_RO(cpumask_requested); -static ssize_t wq_requested_cpumask_show(struct device *dev, +static ssize_t cpumask_isolated_show(struct device *dev, struct device_attribute *attr, char *buf) { - return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); + return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); } +static DEVICE_ATTR_RO(cpumask_isolated); -static ssize_t wq_isolated_cpumask_show(struct device *dev, +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { - return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); + return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); } -static ssize_t wq_unbound_cpumask_store(struct device *dev, +static ssize_t cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { cpumask_var_t cpumask; @@ -7182,36 +7296,19 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev, free_cpumask_var(cpumask); return ret ? ret : count; } +static DEVICE_ATTR_RW(cpumask); -static struct device_attribute wq_sysfs_cpumask_attrs[] = { - __ATTR(cpumask, 0644, wq_unbound_cpumask_show, - wq_unbound_cpumask_store), - __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL), - __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL), - __ATTR_NULL, +static struct attribute *wq_sysfs_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + &dev_attr_cpumask_requested.attr, + &dev_attr_cpumask_isolated.attr, + NULL, }; +ATTRIBUTE_GROUPS(wq_sysfs_cpumask); static int __init wq_sysfs_init(void) { - struct device *dev_root; - int err; - - err = subsys_virtual_register(&wq_subsys, NULL); - if (err) - return err; - - dev_root = bus_get_dev_root(&wq_subsys); - if (dev_root) { - struct device_attribute *attr; - - for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) { - err = device_create_file(dev_root, attr); - if (err) - break; - } - put_device(dev_root); - } - return err; + return subsys_virtual_register(&wq_subsys, wq_sysfs_cpumask_groups); } core_initcall(wq_sysfs_init); |