summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/arraymap.c40
-rw-r--r--kernel/bpf/bloom_filter.c41
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c24
-rw-r--r--kernel/bpf/bpf_inode_storage.c61
-rw-r--r--kernel/bpf/bpf_iter.c70
-rw-r--r--kernel/bpf/bpf_local_storage.c371
-rw-r--r--kernel/bpf/bpf_struct_ops.c276
-rw-r--r--kernel/bpf/bpf_task_storage.c28
-rw-r--r--kernel/bpf/btf.c467
-rw-r--r--kernel/bpf/cgroup.c62
-rw-r--r--kernel/bpf/core.c11
-rw-r--r--kernel/bpf/cpumap.c18
-rw-r--r--kernel/bpf/cpumask.c87
-rw-r--r--kernel/bpf/devmap.c50
-rw-r--r--kernel/bpf/hashtab.c140
-rw-r--r--kernel/bpf/helpers.c509
-rw-r--r--kernel/bpf/local_storage.c13
-rw-r--r--kernel/bpf/log.c330
-rw-r--r--kernel/bpf/lpm_trie.c17
-rw-r--r--kernel/bpf/map_in_map.c15
-rw-r--r--kernel/bpf/memalloc.c59
-rw-r--r--kernel/bpf/offload.c6
-rw-r--r--kernel/bpf/queue_stack_maps.c32
-rw-r--r--kernel/bpf/reuseport_array.c10
-rw-r--r--kernel/bpf/ringbuf.c26
-rw-r--r--kernel/bpf/stackmap.c20
-rw-r--r--kernel/bpf/syscall.c170
-rw-r--r--kernel/bpf/trampoline.c40
-rw-r--r--kernel/bpf/verifier.c2370
-rw-r--r--kernel/cgroup/cgroup.c24
-rw-r--r--kernel/cgroup/cpuset.c178
-rw-r--r--kernel/cgroup/legacy_freezer.c7
-rw-r--r--kernel/cgroup/rstat.c4
-rw-r--r--kernel/configs/tiny.config1
-rw-r--r--kernel/dma/swiotlb.c6
-rw-r--r--kernel/entry/syscall_user_dispatch.c74
-rw-r--r--kernel/events/core.c14
-rw-r--r--kernel/fork.c127
-rw-r--r--kernel/irq/manage.c5
-rw-r--r--kernel/kcsan/core.c17
-rw-r--r--kernel/kthread.c33
-rw-r--r--kernel/livepatch/core.c6
-rw-r--r--kernel/locking/lockdep.c64
-rw-r--r--kernel/locking/locktorture.c188
-rw-r--r--kernel/locking/test-ww_mutex.c2
-rw-r--r--kernel/module/internal.h1
-rw-r--r--kernel/module/kallsyms.c16
-rw-r--r--kernel/module/livepatch.c10
-rw-r--r--kernel/nsproxy.c17
-rw-r--r--kernel/padata.c4
-rw-r--r--kernel/pid.c19
-rw-r--r--kernel/power/main.c59
-rw-r--r--kernel/printk/printk.c13
-rw-r--r--kernel/ptrace.c9
-rw-r--r--kernel/rcu/Kconfig3
-rw-r--r--kernel/rcu/rcu.h43
-rw-r--r--kernel/rcu/rcuscale.c9
-rw-r--r--kernel/rcu/rcutorture.c234
-rw-r--r--kernel/rcu/refscale.c2
-rw-r--r--kernel/rcu/srcutiny.c2
-rw-r--r--kernel/rcu/srcutree.c438
-rw-r--r--kernel/rcu/tasks.h33
-rw-r--r--kernel/rcu/tree.c45
-rw-r--r--kernel/rcu/tree_exp.h16
-rw-r--r--kernel/rcu/tree_nocb.h4
-rw-r--r--kernel/sched/fair.c10
-rw-r--r--kernel/signal.c21
-rw-r--r--kernel/softirq.c9
-rw-r--r--kernel/sys.c69
-rw-r--r--kernel/time/posix-cpu-timers.c81
-rw-r--r--kernel/time/posix-timers.c4
-rw-r--r--kernel/time/tick-common.c12
-rw-r--r--kernel/time/tick-sched.c151
-rw-r--r--kernel/time/tick-sched.h67
-rw-r--r--kernel/trace/Kconfig2
-rw-r--r--kernel/trace/bpf_trace.c4
-rw-r--r--kernel/trace/ftrace.c438
-rw-r--r--kernel/trace/ring_buffer.c13
-rw-r--r--kernel/trace/trace.c28
-rw-r--r--kernel/trace/trace_events_synth.c19
-rw-r--r--kernel/trace/trace_osnoise.c6
-rw-r--r--kernel/trace/trace_probe.c2
-rw-r--r--kernel/trace/trace_selftest.c19
-rw-r--r--kernel/vhost_task.c117
86 files changed, 5693 insertions, 2473 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 10ef068f598d..6fc72b3afbde 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -15,6 +15,7 @@ obj-y = fork.o exec_domain.o panic.o \
obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
+obj-$(CONFIG_VHOST_TASK) += vhost_task.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace internal ftrace files
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 02242614dcc7..1d3892168d32 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,8 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 484706959556..2058e89b5ddd 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -307,8 +307,8 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
}
/* Called from syscall or from eBPF program */
-static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
@@ -386,7 +386,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
}
/* Called from syscall or from eBPF program */
-static int array_map_delete_elem(struct bpf_map *map, void *key)
+static long array_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -686,8 +686,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};
-static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
- void *callback_ctx, u64 flags)
+static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
{
u32 i, key, num_elems = 0;
struct bpf_array *array;
@@ -721,6 +721,28 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_
return num_elems;
}
+static u64 array_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ u32 elem_size = array->elem_size;
+ u64 entries = map->max_entries;
+ u64 usage = sizeof(*array);
+
+ if (percpu) {
+ usage += entries * sizeof(void *);
+ usage += entries * elem_size * num_possible_cpus();
+ } else {
+ if (map->map_flags & BPF_F_MMAPABLE) {
+ usage = PAGE_ALIGN(usage);
+ usage += PAGE_ALIGN(entries * elem_size);
+ } else {
+ usage += entries * elem_size;
+ }
+ }
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array)
const struct bpf_map_ops array_map_ops = {
.map_meta_equal = array_map_meta_equal,
@@ -742,6 +764,7 @@ const struct bpf_map_ops array_map_ops = {
.map_update_batch = generic_map_update_batch,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_array_elem,
+ .map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
@@ -762,6 +785,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_update_batch = generic_map_update_batch,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_array_elem,
+ .map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
@@ -847,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
return 0;
}
-static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
+static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *old_ptr;
@@ -1156,6 +1180,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = prog_array_map_clear,
.map_seq_show_elem = prog_array_map_seq_show_elem,
+ .map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
};
@@ -1257,6 +1282,7 @@ const struct bpf_map_ops perf_event_array_map_ops = {
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
.map_release = perf_event_fd_array_release,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
};
@@ -1291,6 +1317,7 @@ const struct bpf_map_ops cgroup_array_map_ops = {
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
};
#endif
@@ -1379,5 +1406,6 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_lookup_batch = generic_map_lookup_batch,
.map_update_batch = generic_map_update_batch,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
};
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 48ee750849f2..540331b610a9 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -16,13 +16,6 @@ struct bpf_bloom_filter {
struct bpf_map map;
u32 bitset_mask;
u32 hash_seed;
- /* If the size of the values in the bloom filter is u32 aligned,
- * then it is more performant to use jhash2 as the underlying hash
- * function, else we use jhash. This tracks the number of u32s
- * in an u32-aligned value size. If the value size is not u32 aligned,
- * this will be 0.
- */
- u32 aligned_u32_count;
u32 nr_hash_funcs;
unsigned long bitset[];
};
@@ -32,16 +25,15 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value,
{
u32 h;
- if (bloom->aligned_u32_count)
- h = jhash2(value, bloom->aligned_u32_count,
- bloom->hash_seed + index);
+ if (likely(value_size % 4 == 0))
+ h = jhash2(value, value_size / 4, bloom->hash_seed + index);
else
h = jhash(value, value_size, bloom->hash_seed + index);
return h & bloom->bitset_mask;
}
-static int bloom_map_peek_elem(struct bpf_map *map, void *value)
+static long bloom_map_peek_elem(struct bpf_map *map, void *value)
{
struct bpf_bloom_filter *bloom =
container_of(map, struct bpf_bloom_filter, map);
@@ -56,7 +48,7 @@ static int bloom_map_peek_elem(struct bpf_map *map, void *value)
return 0;
}
-static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
{
struct bpf_bloom_filter *bloom =
container_of(map, struct bpf_bloom_filter, map);
@@ -73,12 +65,12 @@ static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
return 0;
}
-static int bloom_map_pop_elem(struct bpf_map *map, void *value)
+static long bloom_map_pop_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
}
-static int bloom_map_delete_elem(struct bpf_map *map, void *value)
+static long bloom_map_delete_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
}
@@ -152,11 +144,6 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
bloom->nr_hash_funcs = nr_hash_funcs;
bloom->bitset_mask = bitset_mask;
- /* Check whether the value size is u32-aligned */
- if ((attr->value_size & (sizeof(u32) - 1)) == 0)
- bloom->aligned_u32_count =
- attr->value_size / sizeof(u32);
-
if (!(attr->map_flags & BPF_F_ZERO_SEED))
bloom->hash_seed = get_random_u32();
@@ -177,8 +164,8 @@ static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)
return ERR_PTR(-EINVAL);
}
-static int bloom_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long bloom_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
/* The eBPF program should use map_push_elem instead */
return -EINVAL;
@@ -193,6 +180,17 @@ static int bloom_map_check_btf(const struct bpf_map *map,
return btf_type_is_void(key_type) ? 0 : -EINVAL;
}
+static u64 bloom_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_bloom_filter *bloom;
+ u64 bitset_bytes;
+
+ bloom = container_of(map, struct bpf_bloom_filter, map);
+ bitset_bytes = BITS_TO_BYTES((u64)bloom->bitset_mask + 1);
+ bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
+ return sizeof(*bloom) + bitset_bytes;
+}
+
BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter)
const struct bpf_map_ops bloom_filter_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -206,5 +204,6 @@ const struct bpf_map_ops bloom_filter_map_ops = {
.map_update_elem = bloom_map_update_elem,
.map_delete_elem = bloom_map_delete_elem,
.map_check_btf = bloom_map_check_btf,
+ .map_mem_usage = bloom_map_mem_usage,
.map_btf_id = &bpf_bloom_map_btf_ids[0],
};
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 6cdf6d9ed91d..d44fe8dd9732 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -46,8 +46,6 @@ static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
struct bpf_local_storage *local_storage;
- bool free_cgroup_storage = false;
- unsigned long flags;
rcu_read_lock();
local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
@@ -57,14 +55,9 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
}
bpf_cgrp_storage_lock();
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage);
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_local_storage_destroy(local_storage);
bpf_cgrp_storage_unlock();
rcu_read_unlock();
-
- if (free_cgroup_storage)
- kfree_rcu(local_storage, rcu);
}
static struct bpf_local_storage_data *
@@ -100,8 +93,8 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
return sdata ? sdata->data : NULL;
}
-static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
struct cgroup *cgroup;
@@ -128,11 +121,11 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata), true);
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
-static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
{
struct cgroup *cgroup;
int err, fd;
@@ -156,7 +149,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &cgroup_cache);
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
}
static void cgroup_storage_map_free(struct bpf_map *map)
@@ -221,6 +214,7 @@ const struct bpf_map_ops cgrp_storage_map_ops = {
.map_update_elem = bpf_cgrp_storage_update_elem,
.map_delete_elem = bpf_cgrp_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
.map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = cgroup_storage_ptr,
};
@@ -230,7 +224,7 @@ const struct bpf_func_proto bpf_cgrp_storage_get_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_cgroup_btf_id[0],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -241,6 +235,6 @@ const struct bpf_func_proto bpf_cgrp_storage_delete_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_cgroup_btf_id[0],
};
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 05f4c66c9089..b0ef45db207c 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -57,7 +57,6 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
void bpf_inode_storage_free(struct inode *inode)
{
struct bpf_local_storage *local_storage;
- bool free_inode_storage = false;
struct bpf_storage_blob *bsb;
bsb = bpf_inode(inode);
@@ -72,51 +71,40 @@ void bpf_inode_storage_free(struct inode *inode)
return;
}
- raw_spin_lock_bh(&local_storage->lock);
- free_inode_storage = bpf_local_storage_unlink_nolock(local_storage);
- raw_spin_unlock_bh(&local_storage->lock);
+ bpf_local_storage_destroy(local_storage);
rcu_read_unlock();
-
- if (free_inode_storage)
- kfree_rcu(local_storage, rcu);
}
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_local_storage_data *sdata;
- struct file *f;
- int fd;
+ struct fd f = fdget_raw(*(int *)key);
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (!f.file)
return ERR_PTR(-EBADF);
- sdata = inode_storage_lookup(f->f_inode, map, true);
- fput(f);
+ sdata = inode_storage_lookup(file_inode(f.file), map, true);
+ fdput(f);
return sdata ? sdata->data : NULL;
}
-static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
- struct file *f;
- int fd;
+ struct fd f = fdget_raw(*(int *)key);
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (!f.file)
return -EBADF;
- if (!inode_storage_ptr(f->f_inode)) {
- fput(f);
+ if (!inode_storage_ptr(file_inode(f.file))) {
+ fdput(f);
return -EBADF;
}
- sdata = bpf_local_storage_update(f->f_inode,
+ sdata = bpf_local_storage_update(file_inode(f.file),
(struct bpf_local_storage_map *)map,
value, map_flags, GFP_ATOMIC);
- fput(f);
+ fdput(f);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -128,23 +116,21 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata), true);
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
-static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
{
- struct file *f;
- int fd, err;
+ struct fd f = fdget_raw(*(int *)key);
+ int err;
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (!f.file)
return -EBADF;
- err = inode_storage_delete(f->f_inode, map);
- fput(f);
+ err = inode_storage_delete(file_inode(f.file), map);
+ fdput(f);
return err;
}
@@ -205,7 +191,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &inode_cache);
+ return bpf_local_storage_map_alloc(attr, &inode_cache, false);
}
static void inode_storage_map_free(struct bpf_map *map)
@@ -223,6 +209,7 @@ const struct bpf_map_ops inode_storage_map_ops = {
.map_update_elem = bpf_fd_inode_storage_update_elem,
.map_delete_elem = bpf_fd_inode_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
.map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = inode_storage_ptr,
};
@@ -234,7 +221,7 @@ const struct bpf_func_proto bpf_inode_storage_get_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_inode_storage_btf_ids[0],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -245,6 +232,6 @@ const struct bpf_func_proto bpf_inode_storage_delete_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_inode_storage_btf_ids[0],
};
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 5dc307bdeaeb..96856f130cbf 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -776,3 +776,73 @@ const struct bpf_func_proto bpf_loop_proto = {
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
+
+struct bpf_iter_num_kern {
+ int cur; /* current value, inclusive */
+ int end; /* final value, exclusive */
+} __aligned(8);
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
+
+ BTF_TYPE_EMIT(struct btf_iter_num);
+
+ /* start == end is legit, it's an empty range and we'll just get NULL
+ * on first (and any subsequent) bpf_iter_num_next() call
+ */
+ if (start > end) {
+ s->cur = s->end = 0;
+ return -EINVAL;
+ }
+
+ /* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
+ if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
+ s->cur = s->end = 0;
+ return -E2BIG;
+ }
+
+ /* user will call bpf_iter_num_next() first,
+ * which will set s->cur to exactly start value;
+ * underflow shouldn't matter
+ */
+ s->cur = start - 1;
+ s->end = end;
+
+ return 0;
+}
+
+__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ /* check failed initialization or if we are done (same behavior);
+ * need to be careful about overflow, so convert to s64 for checks,
+ * e.g., if s->cur == s->end == INT_MAX, we can't just do
+ * s->cur + 1 >= s->end
+ */
+ if ((s64)(s->cur + 1) >= s->end) {
+ s->cur = s->end = 0;
+ return NULL;
+ }
+
+ s->cur++;
+
+ return &s->cur;
+}
+
+__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ s->cur = s->end = 0;
+}
+
+__diag_pop();
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 35f4138a54dc..47d9948d768f 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -51,11 +51,21 @@ owner_storage(struct bpf_local_storage_map *smap, void *owner)
return map->ops->map_owner_storage_ptr(owner);
}
+static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed_lockless(&selem->snode);
+}
+
static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
{
return !hlist_unhashed(&selem->snode);
}
+static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed_lockless(&selem->map_node);
+}
+
static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
{
return !hlist_unhashed(&selem->map_node);
@@ -70,11 +80,28 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
if (charge_mem && mem_charge(smap, owner, smap->elem_size))
return NULL;
- selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
- gfp_flags | __GFP_NOWARN);
+ if (smap->bpf_ma) {
+ migrate_disable();
+ selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
+ migrate_enable();
+ if (selem)
+ /* Keep the original bpf_map_kzalloc behavior
+ * before started using the bpf_mem_cache_alloc.
+ *
+ * No need to use zero_map_value. The bpf_selem_free()
+ * only does bpf_mem_cache_free when there is
+ * no other bpf prog is using the selem.
+ */
+ memset(SDATA(selem)->data, 0, smap->map.value_size);
+ } else {
+ selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
+ gfp_flags | __GFP_NOWARN);
+ }
+
if (selem) {
if (value)
copy_map_value(&smap->map, SDATA(selem)->data, value);
+ /* No need to call check_and_init_map_value as memory is zero init */
return selem;
}
@@ -84,7 +111,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
}
-void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
@@ -98,7 +126,66 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu)
kfree_rcu(local_storage, rcu);
}
-static void bpf_selem_free_rcu(struct rcu_head *rcu)
+static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage *local_storage;
+
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ bpf_mem_cache_raw_free(local_storage);
+}
+
+static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_local_storage_free_rcu(rcu);
+ else
+ call_rcu(rcu, bpf_local_storage_free_rcu);
+}
+
+/* Handle bpf_ma == false */
+static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(local_storage, rcu);
+ else
+ call_rcu_tasks_trace(&local_storage->rcu,
+ __bpf_local_storage_free_trace_rcu);
+}
+
+static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *smap,
+ bool bpf_ma, bool reuse_now)
+{
+ if (!local_storage)
+ return;
+
+ if (!bpf_ma) {
+ __bpf_local_storage_free(local_storage, reuse_now);
+ return;
+ }
+
+ if (!reuse_now) {
+ call_rcu_tasks_trace(&local_storage->rcu,
+ bpf_local_storage_free_trace_rcu);
+ return;
+ }
+
+ if (smap) {
+ migrate_disable();
+ bpf_mem_cache_free(&smap->storage_ma, local_storage);
+ migrate_enable();
+ } else {
+ /* smap could be NULL if the selem that triggered
+ * this 'local_storage' creation had been long gone.
+ * In this case, directly do call_rcu().
+ */
+ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+ }
+}
+
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
@@ -109,13 +196,63 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
kfree_rcu(selem, rcu);
}
+/* Handle bpf_ma == false */
+static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(selem, rcu);
+ else
+ call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
+}
+
+static void bpf_selem_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ bpf_mem_cache_raw_free(selem);
+}
+
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_selem_free_rcu(rcu);
+ else
+ call_rcu(rcu, bpf_selem_free_rcu);
+}
+
+void bpf_selem_free(struct bpf_local_storage_elem *selem,
+ struct bpf_local_storage_map *smap,
+ bool reuse_now)
+{
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+
+ if (!smap->bpf_ma) {
+ __bpf_selem_free(selem, reuse_now);
+ return;
+ }
+
+ if (!reuse_now) {
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+ } else {
+ /* Instead of using the vanilla call_rcu(),
+ * bpf_mem_cache_free will be able to reuse selem
+ * immediately.
+ */
+ migrate_disable();
+ bpf_mem_cache_free(&smap->selem_ma, selem);
+ migrate_enable();
+ }
+}
+
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
*/
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem,
- bool uncharge_mem, bool use_trace_rcu)
+ bool uncharge_mem, bool reuse_now)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -159,40 +296,75 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
- if (use_trace_rcu)
- call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
- else
- kfree_rcu(selem, rcu);
+ bpf_selem_free(selem, smap, reuse_now);
+
+ if (rcu_access_pointer(local_storage->smap) == smap)
+ RCU_INIT_POINTER(local_storage->smap, NULL);
return free_local_storage;
}
-static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
- bool use_trace_rcu)
+static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *storage_smap,
+ struct bpf_local_storage_elem *selem)
+{
+
+ struct bpf_local_storage_map *selem_smap;
+
+ /* local_storage->smap may be NULL. If it is, get the bpf_ma
+ * from any selem in the local_storage->list. The bpf_ma of all
+ * local_storage and selem should have the same value
+ * for the same map type.
+ *
+ * If the local_storage->list is already empty, the caller will not
+ * care about the bpf_ma value also because the caller is not
+ * responsibile to free the local_storage.
+ */
+
+ if (storage_smap)
+ return storage_smap->bpf_ma;
+
+ if (!selem) {
+ struct hlist_node *n;
+
+ n = rcu_dereference_check(hlist_first_rcu(&local_storage->list),
+ bpf_rcu_lock_held());
+ if (!n)
+ return false;
+
+ selem = hlist_entry(n, struct bpf_local_storage_elem, snode);
+ }
+ selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
+ return selem_smap->bpf_ma;
+}
+
+static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
+ bool reuse_now)
{
+ struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage *local_storage;
- bool free_local_storage = false;
+ bool bpf_ma, free_local_storage = false;
unsigned long flags;
- if (unlikely(!selem_linked_to_storage(selem)))
+ if (unlikely(!selem_linked_to_storage_lockless(selem)))
/* selem has already been unlinked from sk */
return;
local_storage = rcu_dereference_check(selem->local_storage,
bpf_rcu_lock_held());
+ storage_smap = rcu_dereference_check(local_storage->smap,
+ bpf_rcu_lock_held());
+ bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem);
+
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true, use_trace_rcu);
+ local_storage, selem, true, reuse_now);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- if (free_local_storage) {
- if (use_trace_rcu)
- call_rcu_tasks_trace(&local_storage->rcu,
- bpf_local_storage_free_rcu);
- else
- kfree_rcu(local_storage, rcu);
- }
+ if (free_local_storage)
+ bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -202,13 +374,13 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
hlist_add_head_rcu(&selem->snode, &local_storage->list);
}
-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage_map *smap;
struct bpf_local_storage_map_bucket *b;
unsigned long flags;
- if (unlikely(!selem_linked_to_map(selem)))
+ if (unlikely(!selem_linked_to_map_lockless(selem)))
/* selem has already be unlinked from smap */
return;
@@ -232,14 +404,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
raw_spin_unlock_irqrestore(&b->lock, flags);
}
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
{
/* Always unlink from map before unlinking from local_storage
* because selem will be freed after successfully unlinked from
* the local_storage.
*/
bpf_selem_unlink_map(selem);
- __bpf_selem_unlink_storage(selem, use_trace_rcu);
+ bpf_selem_unlink_storage(selem, reuse_now);
}
/* If cacheit_lockit is false, this lookup function is lockless */
@@ -312,13 +484,21 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
- gfp_flags | __GFP_NOWARN);
+ if (smap->bpf_ma) {
+ migrate_disable();
+ storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
+ migrate_enable();
+ } else {
+ storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+ gfp_flags | __GFP_NOWARN);
+ }
+
if (!storage) {
err = -ENOMEM;
goto uncharge;
}
+ RCU_INIT_POINTER(storage->smap, smap);
INIT_HLIST_HEAD(&storage->list);
raw_spin_lock_init(&storage->lock);
storage->owner = owner;
@@ -358,7 +538,7 @@ int bpf_local_storage_alloc(void *owner,
return 0;
uncharge:
- kfree(storage);
+ bpf_local_storage_free(storage, smap, smap->bpf_ma, true);
mem_uncharge(smap, owner, sizeof(*storage));
return err;
}
@@ -402,7 +582,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
if (err) {
- kfree(selem);
+ bpf_selem_free(selem, smap, true);
mem_uncharge(smap, owner, smap->elem_size);
return ERR_PTR(err);
}
@@ -420,7 +600,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
err = check_flags(old_sdata, map_flags);
if (err)
return ERR_PTR(err);
- if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) {
+ if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
copy_map_value_locked(&smap->map, old_sdata->data,
value, false);
return old_sdata;
@@ -485,7 +665,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
- false, true);
+ false, false);
}
unlock:
@@ -496,7 +676,7 @@ unlock_err:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (selem) {
mem_uncharge(smap, owner, smap->elem_size);
- kfree(selem);
+ bpf_selem_free(selem, smap, true);
}
return ERR_PTR(err);
}
@@ -552,40 +732,6 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
return 0;
}
-static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr)
-{
- struct bpf_local_storage_map *smap;
- unsigned int i;
- u32 nbuckets;
-
- smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
- if (!smap)
- return ERR_PTR(-ENOMEM);
- bpf_map_init_from_attr(&smap->map, attr);
-
- nbuckets = roundup_pow_of_two(num_possible_cpus());
- /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
- nbuckets = max_t(u32, 2, nbuckets);
- smap->bucket_log = ilog2(nbuckets);
-
- smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
- nbuckets, GFP_USER | __GFP_NOWARN);
- if (!smap->buckets) {
- bpf_map_area_free(smap);
- return ERR_PTR(-ENOMEM);
- }
-
- for (i = 0; i < nbuckets; i++) {
- INIT_HLIST_HEAD(&smap->buckets[i].list);
- raw_spin_lock_init(&smap->buckets[i].lock);
- }
-
- smap->elem_size = offsetof(struct bpf_local_storage_elem,
- sdata.data[attr->value_size]);
-
- return smap;
-}
-
int bpf_local_storage_map_check_btf(const struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
@@ -603,11 +749,16 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
return 0;
}
-bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{
+ struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage_elem *selem;
- bool free_storage = false;
+ bool bpf_ma, free_storage = false;
struct hlist_node *n;
+ unsigned long flags;
+
+ storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
+ bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
/* Neither the bpf_prog nor the bpf_map's syscall
* could be modifying the local_storage->list now.
@@ -618,6 +769,7 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
* when unlinking elem from the local_storage->list and
* the map's bucket->list.
*/
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
/* Always unlink from map before unlinking from
* local_storage.
@@ -630,24 +782,89 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
* of the loop will set the free_cgroup_storage to true.
*/
free_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false, false);
+ local_storage, selem, false, true);
}
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- return free_storage;
+ if (free_storage)
+ bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
}
+u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
+ u64 usage = sizeof(*smap);
+
+ /* The dynamically callocated selems are not counted currently. */
+ usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
+ return usage;
+}
+
+/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
+ * A deadlock free allocator is useful for storage that the bpf prog can easily
+ * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
+ * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
+ * memory immediately. To be reuse-immediate safe, the owner destruction
+ * code path needs to go through a rcu grace period before calling
+ * bpf_local_storage_destroy().
+ *
+ * When bpf_ma == false, the kmalloc and kfree are used.
+ */
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
- struct bpf_local_storage_cache *cache)
+ struct bpf_local_storage_cache *cache,
+ bool bpf_ma)
{
struct bpf_local_storage_map *smap;
+ unsigned int i;
+ u32 nbuckets;
+ int err;
+
+ smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+ bpf_map_init_from_attr(&smap->map, attr);
+
+ nbuckets = roundup_pow_of_two(num_possible_cpus());
+ /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
+ nbuckets = max_t(u32, 2, nbuckets);
+ smap->bucket_log = ilog2(nbuckets);
+
+ smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
+ nbuckets, GFP_USER | __GFP_NOWARN);
+ if (!smap->buckets) {
+ err = -ENOMEM;
+ goto free_smap;
+ }
+
+ for (i = 0; i < nbuckets; i++) {
+ INIT_HLIST_HEAD(&smap->buckets[i].list);
+ raw_spin_lock_init(&smap->buckets[i].lock);
+ }
+
+ smap->elem_size = offsetof(struct bpf_local_storage_elem,
+ sdata.data[attr->value_size]);
- smap = __bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
+ smap->bpf_ma = bpf_ma;
+ if (bpf_ma) {
+ err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
+ if (err)
+ goto free_smap;
+
+ err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
+ if (err) {
+ bpf_mem_alloc_destroy(&smap->selem_ma);
+ goto free_smap;
+ }
+ }
smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
return &smap->map;
+
+free_smap:
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
+ return ERR_PTR(err);
}
void bpf_local_storage_map_free(struct bpf_map *map,
@@ -689,7 +906,7 @@ void bpf_local_storage_map_free(struct bpf_map *map,
migrate_disable();
this_cpu_inc(*busy_counter);
}
- bpf_selem_unlink(selem, false);
+ bpf_selem_unlink(selem, true);
if (busy_counter) {
this_cpu_dec(*busy_counter);
migrate_enable();
@@ -713,6 +930,10 @@ void bpf_local_storage_map_free(struct bpf_map *map,
*/
synchronize_rcu();
+ if (smap->bpf_ma) {
+ bpf_mem_alloc_destroy(&smap->selem_ma);
+ bpf_mem_alloc_destroy(&smap->storage_ma);
+ }
kvfree(smap->buckets);
bpf_map_area_free(smap);
}
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index ece9870cab68..d3f0a4825fa6 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -11,11 +11,13 @@
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/btf_ids.h>
+#include <linux/rcupdate_wait.h>
enum bpf_struct_ops_state {
BPF_STRUCT_OPS_STATE_INIT,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE,
+ BPF_STRUCT_OPS_STATE_READY,
};
#define BPF_STRUCT_OPS_COMMON_VALUE \
@@ -58,6 +60,13 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_value kvalue;
};
+struct bpf_struct_ops_link {
+ struct bpf_link link;
+ struct bpf_map __rcu *map;
+};
+
+static DEFINE_MUTEX(update_mutex);
+
#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
@@ -249,6 +258,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
struct bpf_struct_ops_value *uvalue, *kvalue;
enum bpf_struct_ops_state state;
+ s64 refcnt;
if (unlikely(*(u32 *)key != 0))
return -ENOENT;
@@ -267,7 +277,14 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
uvalue = value;
memcpy(uvalue, st_map->uvalue, map->value_size);
uvalue->state = state;
- refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+ /* This value offers the user space a general estimate of how
+ * many sockets are still utilizing this struct_ops for TCP
+ * congestion control. The number might not be exact, but it
+ * should sufficiently meet our present goals.
+ */
+ refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+ refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
return 0;
}
@@ -349,8 +366,8 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
model, flags, tlinks, NULL);
}
-static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
const struct bpf_struct_ops *st_ops = st_map->st_ops;
@@ -491,12 +508,29 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*(unsigned long *)(udata + moff) = prog->aux->id;
}
- refcount_set(&kvalue->refcnt, 1);
- bpf_map_inc(map);
+ if (st_map->map.map_flags & BPF_F_LINK) {
+ err = st_ops->validate(kdata);
+ if (err)
+ goto reset_unlock;
+ set_memory_rox((long)st_map->image, 1);
+ /* Let bpf_link handle registration & unregistration.
+ *
+ * Pair with smp_load_acquire() during lookup_elem().
+ */
+ smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+ goto unlock;
+ }
set_memory_rox((long)st_map->image, 1);
err = st_ops->reg(kdata);
if (likely(!err)) {
+ /* This refcnt increment on the map here after
+ * 'st_ops->reg()' is secure since the state of the
+ * map must be set to INIT at this moment, and thus
+ * bpf_struct_ops_map_delete_elem() can't unregister
+ * or transition it to TOBEFREE concurrently.
+ */
+ bpf_map_inc(map);
/* Pair with smp_load_acquire() during lookup_elem().
* It ensures the above udata updates (e.g. prog->aux->id)
* can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
@@ -512,7 +546,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*/
set_memory_nx((long)st_map->image, 1);
set_memory_rw((long)st_map->image, 1);
- bpf_map_put(map);
reset_unlock:
bpf_struct_ops_map_put_progs(st_map);
@@ -524,20 +557,22 @@ unlock:
return err;
}
-static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
{
enum bpf_struct_ops_state prev_state;
struct bpf_struct_ops_map *st_map;
st_map = (struct bpf_struct_ops_map *)map;
+ if (st_map->map.map_flags & BPF_F_LINK)
+ return -EOPNOTSUPP;
+
prev_state = cmpxchg(&st_map->kvalue.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops->unreg(&st_map->kvalue.data);
- if (refcount_dec_and_test(&st_map->kvalue.refcnt))
- bpf_map_put(map);
+ bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
return -EINPROGRESS;
@@ -570,7 +605,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
kfree(value);
}
-static void bpf_struct_ops_map_free(struct bpf_map *map)
+static void __bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
@@ -582,10 +617,32 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
bpf_map_area_free(st_map);
}
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0 which then free its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+ * to finish. bpf-tcp-cc prog is non sleepable.
+ * A rcu_tasks gp is to wait for the last few insn
+ * in the tramopline image to finish before releasing
+ * the trampoline image.
+ */
+ synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+
+ __bpf_struct_ops_map_free(map);
+}
+
static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
- attr->map_flags || !attr->btf_vmlinux_value_type_id)
+ (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
return -EINVAL;
return 0;
}
@@ -609,6 +666,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
if (attr->value_size != vt->size)
return ERR_PTR(-EINVAL);
+ if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update))
+ return ERR_PTR(-EOPNOTSUPP);
+
t = st_ops->type;
st_map_size = sizeof(*st_map) +
@@ -630,7 +690,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
NUMA_NO_NODE);
st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!st_map->uvalue || !st_map->links || !st_map->image) {
- bpf_struct_ops_map_free(map);
+ __bpf_struct_ops_map_free(map);
return ERR_PTR(-ENOMEM);
}
@@ -641,6 +701,21 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
return map;
}
+static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ const struct bpf_struct_ops *st_ops = st_map->st_ops;
+ const struct btf_type *vt = st_ops->value_type;
+ u64 usage;
+
+ usage = sizeof(*st_map) +
+ vt->size - sizeof(struct bpf_struct_ops_value);
+ usage += vt->size;
+ usage += btf_type_vlen(vt) * sizeof(struct bpf_links *);
+ usage += PAGE_SIZE;
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)
const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
@@ -651,6 +726,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_delete_elem = bpf_struct_ops_map_delete_elem,
.map_update_elem = bpf_struct_ops_map_update_elem,
.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
+ .map_mem_usage = bpf_struct_ops_map_mem_usage,
.map_btf_id = &bpf_struct_ops_map_btf_ids[0],
};
@@ -660,41 +736,175 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
bool bpf_struct_ops_get(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
- return refcount_inc_not_zero(&kvalue->refcnt);
+ map = __bpf_map_inc_not_zero(&st_map->map, false);
+ return !IS_ERR(map);
}
-static void bpf_struct_ops_put_rcu(struct rcu_head *head)
+void bpf_struct_ops_put(const void *kdata)
{
+ struct bpf_struct_ops_value *kvalue;
struct bpf_struct_ops_map *st_map;
- st_map = container_of(head, struct bpf_struct_ops_map, rcu);
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
bpf_map_put(&st_map->map);
}
-void bpf_struct_ops_put(const void *kdata)
+static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
- struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
- kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
- if (refcount_dec_and_test(&kvalue->refcnt)) {
- struct bpf_struct_ops_map *st_map;
+ return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
+ map->map_flags & BPF_F_LINK &&
+ /* Pair with smp_store_release() during map_update */
+ smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
+}
- st_map = container_of(kvalue, struct bpf_struct_ops_map,
- kvalue);
- /* The struct_ops's function may switch to another struct_ops.
- *
- * For example, bpf_tcp_cc_x->init() may switch to
- * another tcp_cc_y by calling
- * setsockopt(TCP_CONGESTION, "tcp_cc_y").
- * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
- * and its map->refcnt may reach 0 which then free its
- * trampoline image while tcp_cc_x is still running.
- *
- * Thus, a rcu grace period is needed here.
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_struct_ops_map *st_map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = (struct bpf_struct_ops_map *)
+ rcu_dereference_protected(st_link->map, true);
+ if (st_map) {
+ /* st_link->map can be NULL if
+ * bpf_struct_ops_link_create() fails to register.
*/
- call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
+ st_map->st_ops->unreg(&st_map->kvalue.data);
+ bpf_map_put(&st_map->map);
}
+ kfree(st_link);
+}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ seq_printf(seq, "map_id:\t%d\n", map->id);
+ rcu_read_unlock();
}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ info->struct_ops.map_id = map->id;
+ rcu_read_unlock();
+ return 0;
+}
+
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
+ struct bpf_map *expected_old_map)
+{
+ struct bpf_struct_ops_map *st_map, *old_st_map;
+ struct bpf_map *old_map;
+ struct bpf_struct_ops_link *st_link;
+ int err = 0;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+ if (!bpf_struct_ops_valid_to_reg(new_map))
+ return -EINVAL;
+
+ mutex_lock(&update_mutex);
+
+ old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (expected_old_map && old_map != expected_old_map) {
+ err = -EPERM;
+ goto err_out;
+ }
+
+ old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+ /* The new and old struct_ops must be the same type. */
+ if (st_map->st_ops != old_st_map->st_ops) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+ if (err)
+ goto err_out;
+
+ bpf_map_inc(new_map);
+ rcu_assign_pointer(st_link->map, new_map);
+ bpf_map_put(old_map);
+
+err_out:
+ mutex_unlock(&update_mutex);
+
+ return err;
+}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+ .dealloc = bpf_struct_ops_map_link_dealloc,
+ .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+ .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+ .update_map = bpf_struct_ops_map_link_update,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ struct bpf_struct_ops_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+ int err;
+
+ map = bpf_map_get(attr->link_create.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ st_map = (struct bpf_struct_ops_map *)map;
+
+ if (!bpf_struct_ops_valid_to_reg(map)) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto err_out;
+
+ err = st_map->st_ops->reg(st_map->kvalue.data);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ link = NULL;
+ goto err_out;
+ }
+ RCU_INIT_POINTER(link->map, map);
+
+ return bpf_link_settle(&link_primer);
+
+err_out:
+ bpf_map_put(map);
+ kfree(link);
+ return err;
+}
+
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 1e486055a523..adf6dfe0ba68 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -72,8 +72,6 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
void bpf_task_storage_free(struct task_struct *task)
{
struct bpf_local_storage *local_storage;
- bool free_task_storage = false;
- unsigned long flags;
rcu_read_lock();
@@ -84,14 +82,9 @@ void bpf_task_storage_free(struct task_struct *task)
}
bpf_task_storage_lock();
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- free_task_storage = bpf_local_storage_unlink_nolock(local_storage);
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_local_storage_destroy(local_storage);
bpf_task_storage_unlock();
rcu_read_unlock();
-
- if (free_task_storage)
- kfree_rcu(local_storage, rcu);
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
@@ -127,8 +120,8 @@ out:
return ERR_PTR(err);
}
-static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
struct task_struct *task;
@@ -175,12 +168,12 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
if (!nobusy)
return -EBUSY;
- bpf_selem_unlink(SELEM(sdata), true);
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
-static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
{
struct task_struct *task;
unsigned int f_flags;
@@ -316,7 +309,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &task_cache);
+ return bpf_local_storage_map_alloc(attr, &task_cache, true);
}
static void task_storage_map_free(struct bpf_map *map)
@@ -335,6 +328,7 @@ const struct bpf_map_ops task_storage_map_ops = {
.map_update_elem = bpf_pid_task_storage_update_elem,
.map_delete_elem = bpf_pid_task_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
.map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = task_storage_ptr,
};
@@ -344,7 +338,7 @@ const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -355,7 +349,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -366,7 +360,7 @@ const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
@@ -375,6 +369,6 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 73780748404c..6b682b8e4b50 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -25,6 +25,9 @@
#include <linux/bsearch.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+
#include <net/sock.h>
#include "../tools/lib/bpf/relo_core.h"
@@ -207,6 +210,12 @@ enum btf_kfunc_hook {
BTF_KFUNC_HOOK_TRACING,
BTF_KFUNC_HOOK_SYSCALL,
BTF_KFUNC_HOOK_FMODRET,
+ BTF_KFUNC_HOOK_CGROUP_SKB,
+ BTF_KFUNC_HOOK_SCHED_ACT,
+ BTF_KFUNC_HOOK_SK_SKB,
+ BTF_KFUNC_HOOK_SOCKET_FILTER,
+ BTF_KFUNC_HOOK_LWT,
+ BTF_KFUNC_HOOK_NETFILTER,
BTF_KFUNC_HOOK_MAX,
};
@@ -572,8 +581,8 @@ static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
*btf_p = btf;
return ret;
}
- spin_lock_bh(&btf_idr_lock);
btf_put(btf);
+ spin_lock_bh(&btf_idr_lock);
}
spin_unlock_bh(&btf_idr_lock);
return ret;
@@ -1661,10 +1670,8 @@ static void btf_struct_metas_free(struct btf_struct_metas *tab)
if (!tab)
return;
- for (i = 0; i < tab->cnt; i++) {
+ for (i = 0; i < tab->cnt; i++)
btf_record_free(tab->types[i].record);
- kfree(tab->types[i].field_offs);
- }
kfree(tab);
}
@@ -3226,12 +3233,6 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-enum btf_field_info_type {
- BTF_FIELD_SPIN_LOCK,
- BTF_FIELD_TIMER,
- BTF_FIELD_KPTR,
-};
-
enum {
BTF_FIELD_IGNORE = 0,
BTF_FIELD_FOUND = 1,
@@ -3283,9 +3284,9 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
/* Reject extra tags */
if (btf_type_is_type_tag(btf_type_by_id(btf, t->type)))
return -EINVAL;
- if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
+ if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off)))
type = BPF_KPTR_UNREF;
- else if (!strcmp("kptr_ref", __btf_name_by_offset(btf, t->name_off)))
+ else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
type = BPF_KPTR_REF;
else
return -EINVAL;
@@ -3394,6 +3395,7 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
field_mask_test_name(BPF_RB_NODE, "bpf_rb_node");
+ field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
/* Only return BPF_KPTR when all other types with matchable names fail */
if (field_mask & BPF_KPTR) {
@@ -3442,6 +3444,7 @@ static int btf_find_struct_field(const struct btf *btf,
case BPF_TIMER:
case BPF_LIST_NODE:
case BPF_RB_NODE:
+ case BPF_REFCOUNT:
ret = btf_find_struct(btf, member_type, off, sz, field_type,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
@@ -3507,6 +3510,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
case BPF_TIMER:
case BPF_LIST_NODE:
case BPF_RB_NODE:
+ case BPF_REFCOUNT:
ret = btf_find_struct(btf, var_type, off, sz, field_type,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
@@ -3557,7 +3561,10 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
{
struct module *mod = NULL;
const struct btf_type *t;
- struct btf *kernel_btf;
+ /* If a matching btf type is found in kernel or module BTFs, kptr_ref
+ * is that BTF, otherwise it's program BTF
+ */
+ struct btf *kptr_btf;
int ret;
s32 id;
@@ -3566,7 +3573,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
*/
t = btf_type_by_id(btf, info->kptr.type_id);
id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
- &kernel_btf);
+ &kptr_btf);
+ if (id == -ENOENT) {
+ /* btf_parse_kptr should only be called w/ btf = program BTF */
+ WARN_ON_ONCE(btf_is_kernel(btf));
+
+ /* Type exists only in program BTF. Assume that it's a MEM_ALLOC
+ * kptr allocated via bpf_obj_new
+ */
+ field->kptr.dtor = NULL;
+ id = info->kptr.type_id;
+ kptr_btf = (struct btf *)btf;
+ btf_get(kptr_btf);
+ goto found_dtor;
+ }
if (id < 0)
return id;
@@ -3583,20 +3603,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
* can be used as a referenced pointer and be stored in a map at
* the same time.
*/
- dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
+ dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id);
if (dtor_btf_id < 0) {
ret = dtor_btf_id;
goto end_btf;
}
- dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
+ dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id);
if (!dtor_func) {
ret = -ENOENT;
goto end_btf;
}
- if (btf_is_module(kernel_btf)) {
- mod = btf_try_get_module(kernel_btf);
+ if (btf_is_module(kptr_btf)) {
+ mod = btf_try_get_module(kptr_btf);
if (!mod) {
ret = -ENXIO;
goto end_btf;
@@ -3606,7 +3626,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
/* We already verified dtor_func to be btf_type_is_func
* in register_btf_id_dtor_kfuncs.
*/
- dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
+ dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off);
addr = kallsyms_lookup_name(dtor_func_name);
if (!addr) {
ret = -EINVAL;
@@ -3615,14 +3635,15 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
field->kptr.dtor = (void *)addr;
}
+found_dtor:
field->kptr.btf_id = id;
- field->kptr.btf = kernel_btf;
+ field->kptr.btf = kptr_btf;
field->kptr.module = mod;
return 0;
end_mod:
module_put(mod);
end_btf:
- btf_put(kernel_btf);
+ btf_put(kptr_btf);
return ret;
}
@@ -3684,12 +3705,24 @@ static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
__alignof__(struct bpf_rb_node));
}
+static int btf_field_cmp(const void *_a, const void *_b, const void *priv)
+{
+ const struct btf_field *a = (const struct btf_field *)_a;
+ const struct btf_field *b = (const struct btf_field *)_b;
+
+ if (a->offset < b->offset)
+ return -1;
+ else if (a->offset > b->offset)
+ return 1;
+ return 0;
+}
+
struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
u32 field_mask, u32 value_size)
{
struct btf_field_info info_arr[BTF_FIELDS_MAX];
+ u32 next_off = 0, field_type_size;
struct btf_record *rec;
- u32 next_off = 0;
int ret, i, cnt;
ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
@@ -3708,8 +3741,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
rec->spin_lock_off = -EINVAL;
rec->timer_off = -EINVAL;
+ rec->refcount_off = -EINVAL;
for (i = 0; i < cnt; i++) {
- if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) {
+ field_type_size = btf_field_type_size(info_arr[i].type);
+ if (info_arr[i].off + field_type_size > value_size) {
WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size);
ret = -EFAULT;
goto end;
@@ -3718,11 +3753,12 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
ret = -EEXIST;
goto end;
}
- next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type);
+ next_off = info_arr[i].off + field_type_size;
rec->field_mask |= info_arr[i].type;
rec->fields[i].offset = info_arr[i].off;
rec->fields[i].type = info_arr[i].type;
+ rec->fields[i].size = field_type_size;
switch (info_arr[i].type) {
case BPF_SPIN_LOCK:
@@ -3735,6 +3771,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
/* Cache offset for faster lookup at runtime */
rec->timer_off = rec->fields[i].offset;
break;
+ case BPF_REFCOUNT:
+ WARN_ON_ONCE(rec->refcount_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->refcount_off = rec->fields[i].offset;
+ break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
@@ -3768,30 +3809,16 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
goto end;
}
- /* need collection identity for non-owning refs before allowing this
- *
- * Consider a node type w/ both list and rb_node fields:
- * struct node {
- * struct bpf_list_node l;
- * struct bpf_rb_node r;
- * }
- *
- * Used like so:
- * struct node *n = bpf_obj_new(....);
- * bpf_list_push_front(&list_head, &n->l);
- * bpf_rbtree_remove(&rb_root, &n->r);
- *
- * It should not be possible to rbtree_remove the node since it hasn't
- * been added to a tree. But push_front converts n to a non-owning
- * reference, and rbtree_remove accepts the non-owning reference to
- * a type w/ bpf_rb_node field.
- */
- if (btf_record_has_field(rec, BPF_LIST_NODE) &&
+ if (rec->refcount_off < 0 &&
+ btf_record_has_field(rec, BPF_LIST_NODE) &&
btf_record_has_field(rec, BPF_RB_NODE)) {
ret = -EINVAL;
goto end;
}
+ sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp,
+ NULL, rec);
+
return rec;
end:
btf_record_free(rec);
@@ -3873,61 +3900,6 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
return 0;
}
-static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv)
-{
- const u32 a = *(const u32 *)_a;
- const u32 b = *(const u32 *)_b;
-
- if (a < b)
- return -1;
- else if (a > b)
- return 1;
- return 0;
-}
-
-static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv)
-{
- struct btf_field_offs *foffs = (void *)priv;
- u32 *off_base = foffs->field_off;
- u32 *a = _a, *b = _b;
- u8 *sz_a, *sz_b;
-
- sz_a = foffs->field_sz + (a - off_base);
- sz_b = foffs->field_sz + (b - off_base);
-
- swap(*a, *b);
- swap(*sz_a, *sz_b);
-}
-
-struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec)
-{
- struct btf_field_offs *foffs;
- u32 i, *off;
- u8 *sz;
-
- BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz));
- if (IS_ERR_OR_NULL(rec))
- return NULL;
-
- foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN);
- if (!foffs)
- return ERR_PTR(-ENOMEM);
-
- off = foffs->field_off;
- sz = foffs->field_sz;
- for (i = 0; i < rec->cnt; i++) {
- off[i] = rec->fields[i].offset;
- sz[i] = btf_field_type_size(rec->fields[i].type);
- }
- foffs->cnt = rec->cnt;
-
- if (foffs->cnt == 1)
- return foffs;
- sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]),
- btf_field_offs_cmp, btf_field_offs_swap, foffs);
- return foffs;
-}
-
static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)
@@ -5332,6 +5304,7 @@ static const char *alloc_obj_fields[] = {
"bpf_list_node",
"bpf_rb_root",
"bpf_rb_node",
+ "bpf_refcount",
};
static struct btf_struct_metas *
@@ -5370,7 +5343,6 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
for (i = 1; i < n; i++) {
struct btf_struct_metas *new_tab;
const struct btf_member *member;
- struct btf_field_offs *foffs;
struct btf_struct_meta *type;
struct btf_record *record;
const struct btf_type *t;
@@ -5406,23 +5378,13 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
type = &tab->types[tab->cnt];
type->btf_id = i;
record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
- BPF_RB_ROOT | BPF_RB_NODE, t->size);
+ BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
/* The record cannot be unset, treat it as an error if so */
if (IS_ERR_OR_NULL(record)) {
ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
goto free;
}
- foffs = btf_parse_field_offs(record);
- /* We need the field_offs to be valid for a valid record,
- * either both should be set or both should be unset.
- */
- if (IS_ERR_OR_NULL(foffs)) {
- btf_record_free(record);
- ret = -EFAULT;
- goto free;
- }
type->record = record;
- type->field_offs = foffs;
tab->cnt++;
}
return tab;
@@ -5489,38 +5451,45 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
return 0;
}
-static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
- u32 log_level, char __user *log_ubuf, u32 log_size)
+static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size)
+{
+ u32 log_true_size;
+ int err;
+
+ err = bpf_vlog_finalize(log, &log_true_size);
+
+ if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
+ &log_true_size, sizeof(log_true_size)))
+ err = -EFAULT;
+
+ return err;
+}
+
+static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
+ bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
+ char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
struct btf_struct_metas *struct_meta_tab;
struct btf_verifier_env *env = NULL;
- struct bpf_verifier_log *log;
struct btf *btf = NULL;
u8 *data;
- int err;
+ int err, ret;
- if (btf_data_size > BTF_MAX_SIZE)
+ if (attr->btf_size > BTF_MAX_SIZE)
return ERR_PTR(-E2BIG);
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
if (!env)
return ERR_PTR(-ENOMEM);
- log = &env->log;
- if (log_level || log_ubuf || log_size) {
- /* user requested verbose verifier output
- * and supplied buffer to store the verification trace
- */
- log->level = log_level;
- log->ubuf = log_ubuf;
- log->len_total = log_size;
-
- /* log attributes have to be sane */
- if (!bpf_verifier_log_attr_valid(log)) {
- err = -EINVAL;
- goto errout;
- }
- }
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ err = bpf_vlog_init(&env->log, attr->btf_log_level,
+ log_ubuf, attr->btf_log_size);
+ if (err)
+ goto errout_free;
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
@@ -5529,16 +5498,16 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
}
env->btf = btf;
- data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+ data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);
if (!data) {
err = -ENOMEM;
goto errout;
}
btf->data = data;
- btf->data_size = btf_data_size;
+ btf->data_size = attr->btf_size;
- if (copy_from_bpfptr(data, btf_data, btf_data_size)) {
+ if (copy_from_bpfptr(data, btf_data, attr->btf_size)) {
err = -EFAULT;
goto errout;
}
@@ -5561,7 +5530,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
if (err)
goto errout;
- struct_meta_tab = btf_parse_struct_metas(log, btf);
+ struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
if (IS_ERR(struct_meta_tab)) {
err = PTR_ERR(struct_meta_tab);
goto errout;
@@ -5578,10 +5547,9 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
}
}
- if (log->level && bpf_verifier_log_full(log)) {
- err = -ENOSPC;
- goto errout_meta;
- }
+ err = finalize_log(&env->log, uattr, uattr_size);
+ if (err)
+ goto errout_free;
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
@@ -5590,6 +5558,11 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
errout_meta:
btf_free_struct_meta_tab(btf);
errout:
+ /* overwrite err with -ENOSPC or -EFAULT */
+ ret = finalize_log(&env->log, uattr, uattr_size);
+ if (ret)
+ err = ret;
+errout_free:
btf_verifier_env_free(env);
if (btf)
btf_free(btf);
@@ -5684,6 +5657,10 @@ again:
* int socket_filter_bpf_prog(struct __sk_buff *skb)
* { // no fields of skb are ever used }
*/
+ if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
+ return ctx_type;
+ if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ return ctx_type;
if (strcmp(ctx_tname, tname)) {
/* bpf_user_pt_regs_t is a typedef, so resolve it to
* underlying struct and check name again
@@ -5891,12 +5868,8 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
{
- /* t comes in already as a pointer */
- t = btf_type_by_id(btf, t->type);
-
- /* allow const */
- if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
- t = btf_type_by_id(btf, t->type);
+ /* skip modifiers */
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
return btf_type_is_int(t);
}
@@ -6147,7 +6120,8 @@ enum bpf_struct_walk_result {
static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
- u32 *next_btf_id, enum bpf_type_flag *flag)
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
{
u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
const struct btf_type *mtype, *elem_type = NULL;
@@ -6155,6 +6129,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const char *tname, *mname, *tag_value;
u32 vlen, elem_id, mid;
+ *flag = 0;
again:
tname = __btf_name_by_offset(btf, t->name_off);
if (!btf_type_is_struct(t)) {
@@ -6186,11 +6161,13 @@ again:
if (off < moff)
goto error;
- /* Only allow structure for now, can be relaxed for
- * other types later.
- */
+ /* allow structure and integer */
t = btf_type_skip_modifiers(btf, array_elem->type,
NULL);
+
+ if (btf_type_is_int(t))
+ return WALK_SCALAR;
+
if (!btf_type_is_struct(t))
goto error;
@@ -6321,6 +6298,15 @@ error:
* of this field or inside of this struct
*/
if (btf_type_is_struct(mtype)) {
+ if (BTF_INFO_KIND(mtype->info) == BTF_KIND_UNION &&
+ btf_type_vlen(mtype) != 1)
+ /*
+ * walking unions yields untrusted pointers
+ * with exception of __bpf_md_ptr and other
+ * unions with a single member
+ */
+ *flag |= PTR_UNTRUSTED;
+
/* our field must be inside that union or struct */
t = mtype;
@@ -6365,7 +6351,9 @@ error:
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
if (btf_type_is_struct(stype)) {
*next_btf_id = id;
- *flag = tmp_flag;
+ *flag |= tmp_flag;
+ if (field_name)
+ *field_name = mname;
return WALK_PTR;
}
}
@@ -6392,7 +6380,8 @@ error:
int btf_struct_access(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
int off, int size, enum bpf_access_type atype __maybe_unused,
- u32 *next_btf_id, enum bpf_type_flag *flag)
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
{
const struct btf *btf = reg->btf;
enum bpf_type_flag tmp_flag = 0;
@@ -6424,7 +6413,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
t = btf_type_by_id(btf, id);
do {
- err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
+ err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name);
switch (err) {
case WALK_PTR:
@@ -6499,7 +6488,7 @@ again:
type = btf_type_by_id(btf, id);
if (!type)
return false;
- err = btf_struct_walk(log, btf, type, off, 1, &id, &flag);
+ err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL);
if (err != WALK_STRUCT)
return false;
@@ -7180,15 +7169,12 @@ static int __btf_new_fd(struct btf *btf)
return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
}
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
struct btf *btf;
int ret;
- btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel),
- attr->btf_size, attr->btf_log_level,
- u64_to_user_ptr(attr->btf_log_buf),
- attr->btf_log_size);
+ btf = btf_parse(attr, uattr, uattr_size);
if (IS_ERR(btf))
return PTR_ERR(btf);
@@ -7578,6 +7564,108 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE
+static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
+ const struct btf_type *func, u32 func_flags)
+{
+ u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+ const char *name, *sfx, *iter_name;
+ const struct btf_param *arg;
+ const struct btf_type *t;
+ char exp_name[128];
+ u32 nr_args;
+
+ /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
+ if (!flags || (flags & (flags - 1)))
+ return -EINVAL;
+
+ /* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */
+ nr_args = btf_type_vlen(func);
+ if (nr_args < 1)
+ return -EINVAL;
+
+ arg = &btf_params(func)[0];
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ if (!t || !__btf_type_is_struct(t))
+ return -EINVAL;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+ return -EINVAL;
+
+ /* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
+ * fit nicely in stack slots
+ */
+ if (t->size == 0 || (t->size % 8))
+ return -EINVAL;
+
+ /* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
+ * naming pattern
+ */
+ iter_name = name + sizeof(ITER_PREFIX) - 1;
+ if (flags & KF_ITER_NEW)
+ sfx = "new";
+ else if (flags & KF_ITER_NEXT)
+ sfx = "next";
+ else /* (flags & KF_ITER_DESTROY) */
+ sfx = "destroy";
+
+ snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx);
+ if (strcmp(func_name, exp_name))
+ return -EINVAL;
+
+ /* only iter constructor should have extra arguments */
+ if (!(flags & KF_ITER_NEW) && nr_args != 1)
+ return -EINVAL;
+
+ if (flags & KF_ITER_NEXT) {
+ /* bpf_iter_<type>_next() should return pointer */
+ t = btf_type_skip_modifiers(btf, func->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ }
+
+ if (flags & KF_ITER_DESTROY) {
+ /* bpf_iter_<type>_destroy() should return void */
+ t = btf_type_by_id(btf, func->type);
+ if (!t || !btf_type_is_void(t))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
+{
+ const struct btf_type *func;
+ const char *func_name;
+ int err;
+
+ /* any kfunc should be FUNC -> FUNC_PROTO */
+ func = btf_type_by_id(btf, func_id);
+ if (!func || !btf_type_is_func(func))
+ return -EINVAL;
+
+ /* sanity check kfunc name */
+ func_name = btf_name_by_offset(btf, func->name_off);
+ if (!func_name || !func_name[0])
+ return -EINVAL;
+
+ func = btf_type_by_id(btf, func->type);
+ if (!func || !btf_type_is_func_proto(func))
+ return -EINVAL;
+
+ if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) {
+ err = btf_check_iter_kfuncs(btf, func_name, func, func_flags);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
/* Kernel Function (kfunc) BTF ID set registration API */
static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
@@ -7705,6 +7793,21 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_TRACING;
case BPF_PROG_TYPE_SYSCALL:
return BTF_KFUNC_HOOK_SYSCALL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ return BTF_KFUNC_HOOK_CGROUP_SKB;
+ case BPF_PROG_TYPE_SCHED_ACT:
+ return BTF_KFUNC_HOOK_SCHED_ACT;
+ case BPF_PROG_TYPE_SK_SKB:
+ return BTF_KFUNC_HOOK_SK_SKB;
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ return BTF_KFUNC_HOOK_SOCKET_FILTER;
+ case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ return BTF_KFUNC_HOOK_LWT;
+ case BPF_PROG_TYPE_NETFILTER:
+ return BTF_KFUNC_HOOK_NETFILTER;
default:
return BTF_KFUNC_HOOK_MAX;
}
@@ -7741,7 +7844,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
const struct btf_kfunc_id_set *kset)
{
struct btf *btf;
- int ret;
+ int ret, i;
btf = btf_get_module_btf(kset->owner);
if (!btf) {
@@ -7758,7 +7861,15 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
if (IS_ERR(btf))
return PTR_ERR(btf);
+ for (i = 0; i < kset->set->cnt; i++) {
+ ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
+ kset->set->pairs[i].flags);
+ if (ret)
+ goto err_out;
+ }
+
ret = btf_populate_kfunc_set(btf, hook, kset->set);
+err_out:
btf_put(btf);
return ret;
}
@@ -8249,12 +8360,10 @@ check_modules:
btf_get(mod_btf);
spin_unlock_bh(&btf_idr_lock);
cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
- if (IS_ERR(cands)) {
- btf_put(mod_btf);
+ btf_put(mod_btf);
+ if (IS_ERR(cands))
return ERR_CAST(cands);
- }
spin_lock_bh(&btf_idr_lock);
- btf_put(mod_btf);
}
spin_unlock_bh(&btf_idr_lock);
/* cands is a pointer to kmalloced memory here if cands->cnt > 0
@@ -8336,16 +8445,15 @@ out:
bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
- int off)
+ const char *field_name, u32 btf_id, const char *suffix)
{
struct btf *btf = reg->btf;
const struct btf_type *walk_type, *safe_type;
const char *tname;
char safe_tname[64];
long ret, safe_id;
- const struct btf_member *member, *m_walk = NULL;
+ const struct btf_member *member;
u32 i;
- const char *walk_name;
walk_type = btf_type_by_id(btf, reg->btf_id);
if (!walk_type)
@@ -8353,7 +8461,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
tname = btf_name_by_offset(btf, walk_type->name_off);
- ret = snprintf(safe_tname, sizeof(safe_tname), "%s__safe_fields", tname);
+ ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);
if (ret < 0)
return false;
@@ -8365,30 +8473,17 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
if (!safe_type)
return false;
- for_each_member(i, walk_type, member) {
- u32 moff;
-
- /* We're looking for the PTR_TO_BTF_ID member in the struct
- * type we're walking which matches the specified offset.
- * Below, we'll iterate over the fields in the safe variant of
- * the struct and see if any of them has a matching type /
- * name.
- */
- moff = __btf_member_bit_offset(walk_type, member) / 8;
- if (off == moff) {
- m_walk = member;
- break;
- }
- }
- if (m_walk == NULL)
- return false;
-
- walk_name = __btf_name_by_offset(btf, m_walk->name_off);
for_each_member(i, safe_type, member) {
const char *m_name = __btf_name_by_offset(btf, member->name_off);
+ const struct btf_type *mtype = btf_type_by_id(btf, member->type);
+ u32 id;
+
+ if (!btf_type_is_ptr(mtype))
+ continue;
+ btf_type_skip_modifiers(btf, mtype->type, &id);
/* If we match on both type and name, the field is considered trusted. */
- if (m_walk->type == member->type && !strcmp(walk_name, m_name))
+ if (btf_id == id && !strcmp(field_name, m_name))
return true;
}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index bf2fdb33fb31..a06e118a9be5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1921,14 +1921,17 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
if (ret < 0)
goto out;
- if (ctx.optlen > max_optlen || ctx.optlen < 0) {
+ if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) {
ret = -EFAULT;
goto out;
}
if (ctx.optlen != 0) {
- if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
- put_user(ctx.optlen, optlen)) {
+ if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (put_user(ctx.optlen, optlen)) {
ret = -EFAULT;
goto out;
}
@@ -2223,10 +2226,12 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, ppos));
- *insn++ = BPF_STX_MEM(
- BPF_SIZEOF(u32), treg, si->src_reg,
+ *insn++ = BPF_RAW_INSN(
+ BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32),
+ treg, si->src_reg,
bpf_ctx_narrow_access_offset(
- 0, sizeof(u32), sizeof(loff_t)));
+ 0, sizeof(u32), sizeof(loff_t)),
+ si->imm);
*insn++ = BPF_LDX_MEM(
BPF_DW, treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, tmp_reg));
@@ -2376,10 +2381,17 @@ static bool cg_sockopt_is_valid_access(int off, int size,
return true;
}
-#define CG_SOCKOPT_ACCESS_FIELD(T, F) \
- T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
- si->dst_reg, si->src_reg, \
- offsetof(struct bpf_sockopt_kern, F))
+#define CG_SOCKOPT_READ_FIELD(F) \
+ BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F))
+
+#define CG_SOCKOPT_WRITE_FIELD(F) \
+ BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \
+ BPF_MEM | BPF_CLASS(si->code)), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F), \
+ si->imm)
static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
@@ -2391,25 +2403,25 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
switch (si->off) {
case offsetof(struct bpf_sockopt, sk):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+ *insn++ = CG_SOCKOPT_READ_FIELD(sk);
break;
case offsetof(struct bpf_sockopt, level):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(level);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+ *insn++ = CG_SOCKOPT_READ_FIELD(level);
break;
case offsetof(struct bpf_sockopt, optname):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(optname);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optname);
break;
case offsetof(struct bpf_sockopt, optlen):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optlen);
break;
case offsetof(struct bpf_sockopt, retval):
BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
@@ -2429,9 +2441,11 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
treg, treg,
offsetof(struct task_struct, bpf_ctx));
- *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
- treg, si->src_reg,
- offsetof(struct bpf_cg_run_ctx, retval));
+ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM |
+ BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ treg, si->src_reg,
+ offsetof(struct bpf_cg_run_ctx, retval),
+ si->imm);
*insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
offsetof(struct bpf_sockopt_kern, tmp_reg));
} else {
@@ -2447,10 +2461,10 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
}
break;
case offsetof(struct bpf_sockopt, optval):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optval);
break;
case offsetof(struct bpf_sockopt, optval_end):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
break;
}
@@ -2529,10 +2543,6 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_get_current_comm:
return &bpf_get_current_comm_proto;
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_get_current_ancestor_cgroup_id:
- return &bpf_get_current_ancestor_cgroup_id_proto;
#ifdef CONFIG_CGROUP_NET_CLASSID
case BPF_FUNC_get_cgroup_classid:
return &bpf_get_cgroup_classid_curr_proto;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index e2d256c82072..7421487422d4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1187,6 +1187,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
s16 off = insn->off;
s32 imm = insn->imm;
u8 *addr;
+ int err;
*func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
if (!*func_addr_fixed) {
@@ -1201,6 +1202,11 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
addr = (u8 *)prog->aux->func[off]->bpf_func;
else
return -EINVAL;
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ bpf_jit_supports_far_kfunc_call()) {
+ err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);
+ if (err)
+ return err;
} else {
/* Address of a BPF helper call. Since part of the core
* kernel, it's always at a fixed location. __bpf_call_base
@@ -2732,6 +2738,11 @@ bool __weak bpf_jit_supports_kfunc_call(void)
return false;
}
+bool __weak bpf_jit_supports_far_kfunc_call(void)
+{
+ return false;
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index d2110c1f6fa6..8ec18faa74ac 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -540,7 +540,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
}
}
-static int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static long cpu_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
u32 key_cpu = *(u32 *)key;
@@ -553,8 +553,8 @@ static int cpu_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
-static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
struct bpf_cpumap_val cpumap_value = {};
@@ -667,12 +667,21 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
{
return __bpf_xdp_redirect_map(map, index, flags, 0,
__cpu_map_lookup_elem);
}
+static u64 cpu_map_mem_usage(const struct bpf_map *map)
+{
+ u64 usage = sizeof(struct bpf_cpu_map);
+
+ /* Currently the dynamically allocated elements are not counted */
+ usage += (u64)map->max_entries * sizeof(struct bpf_cpu_map_entry *);
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map)
const struct bpf_map_ops cpu_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -683,6 +692,7 @@ const struct bpf_map_ops cpu_map_ops = {
.map_lookup_elem = cpu_map_lookup_elem,
.map_get_next_key = cpu_map_get_next_key,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = cpu_map_mem_usage,
.map_btf_id = &cpu_map_btf_ids[0],
.map_redirect = cpu_map_redirect,
};
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 52b981512a35..7efdf5d770ca 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -9,6 +9,7 @@
/**
* struct bpf_cpumask - refcounted BPF cpumask wrapper structure
* @cpumask: The actual cpumask embedded in the struct.
+ * @rcu: The RCU head used to free the cpumask with RCU safety.
* @usage: Object reference counter. When the refcount goes to 0, the
* memory is released back to the BPF allocator, which provides
* RCU safety.
@@ -24,6 +25,7 @@
*/
struct bpf_cpumask {
cpumask_t cpumask;
+ struct rcu_head rcu;
refcount_t usage;
};
@@ -55,7 +57,7 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
/* cpumask must be the first element so struct bpf_cpumask be cast to struct cpumask. */
BUILD_BUG_ON(offsetof(struct bpf_cpumask, cpumask) != 0);
- cpumask = bpf_mem_alloc(&bpf_cpumask_ma, sizeof(*cpumask));
+ cpumask = bpf_mem_cache_alloc(&bpf_cpumask_ma);
if (!cpumask)
return NULL;
@@ -80,32 +82,14 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
return cpumask;
}
-/**
- * bpf_cpumask_kptr_get() - Attempt to acquire a reference to a BPF cpumask
- * stored in a map.
- * @cpumaskp: A pointer to a BPF cpumask map value.
- *
- * Attempts to acquire a reference to a BPF cpumask stored in a map value. The
- * cpumask returned by this function must either be embedded in a map as a
- * kptr, or freed with bpf_cpumask_release(). This function may return NULL if
- * no BPF cpumask was found in the specified map value.
- */
-__bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp)
+static void cpumask_free_cb(struct rcu_head *head)
{
struct bpf_cpumask *cpumask;
- /* The BPF memory allocator frees memory backing its caches in an RCU
- * callback. Thus, we can safely use RCU to ensure that the cpumask is
- * safe to read.
- */
- rcu_read_lock();
-
- cpumask = READ_ONCE(*cpumaskp);
- if (cpumask && !refcount_inc_not_zero(&cpumask->usage))
- cpumask = NULL;
-
- rcu_read_unlock();
- return cpumask;
+ cpumask = container_of(head, struct bpf_cpumask, rcu);
+ migrate_disable();
+ bpf_mem_cache_free(&bpf_cpumask_ma, cpumask);
+ migrate_enable();
}
/**
@@ -118,14 +102,8 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumas
*/
__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
{
- if (!cpumask)
- return;
-
- if (refcount_dec_and_test(&cpumask->usage)) {
- migrate_disable();
- bpf_mem_free(&bpf_cpumask_ma, cpumask);
- migrate_enable();
- }
+ if (refcount_dec_and_test(&cpumask->usage))
+ call_rcu(&cpumask->rcu, cpumask_free_cb);
}
/**
@@ -424,29 +402,28 @@ __diag_pop();
BTF_SET8_START(cpumask_kfunc_btf_ids)
BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_cpumask_first, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_and, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_or, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_full, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_any, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_and, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_or, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_RCU)
BTF_SET8_END(cpumask_kfunc_btf_ids)
static const struct btf_kfunc_id_set cpumask_kfunc_set = {
@@ -468,7 +445,7 @@ static int __init cpumask_kfunc_init(void)
},
};
- ret = bpf_mem_alloc_init(&bpf_cpumask_ma, 0, false);
+ ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors,
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2675fefc6cb6..802692fa3905 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -809,7 +809,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
kfree(dev);
}
-static int dev_map_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
@@ -819,12 +819,14 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
- if (old_dev)
+ if (old_dev) {
call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ atomic_dec((atomic_t *)&dtab->items);
+ }
return 0;
}
-static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
@@ -895,8 +897,8 @@ err_out:
return ERR_PTR(-EINVAL);
}
-static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
- void *key, void *value, u64 map_flags)
+static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
@@ -931,19 +933,21 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
if (old_dev)
call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ else
+ atomic_inc((atomic_t *)&dtab->items);
return 0;
}
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return __dev_map_update_elem(current->nsproxy->net_ns,
map, key, value, map_flags);
}
-static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
- void *key, void *value, u64 map_flags)
+static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
@@ -995,27 +999,41 @@ out_err:
return err;
}
-static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return __dev_map_hash_update_elem(current->nsproxy->net_ns,
map, key, value, map_flags);
}
-static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
__dev_map_lookup_elem);
}
-static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
__dev_map_hash_lookup_elem);
}
+static u64 dev_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u64 usage = sizeof(struct bpf_dtab);
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
+ usage += (u64)dtab->n_buckets * sizeof(struct hlist_head);
+ else
+ usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *);
+ usage += atomic_read((atomic_t *)&dtab->items) *
+ (u64)sizeof(struct bpf_dtab_netdev);
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -1026,6 +1044,7 @@ const struct bpf_map_ops dev_map_ops = {
.map_update_elem = dev_map_update_elem,
.map_delete_elem = dev_map_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = dev_map_mem_usage,
.map_btf_id = &dev_map_btf_ids[0],
.map_redirect = dev_map_redirect,
};
@@ -1039,6 +1058,7 @@ const struct bpf_map_ops dev_map_hash_ops = {
.map_update_elem = dev_map_hash_update_elem,
.map_delete_elem = dev_map_hash_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = dev_map_mem_usage,
.map_btf_id = &dev_map_btf_ids[0],
.map_redirect = dev_hash_map_redirect,
};
@@ -1109,9 +1129,11 @@ static int dev_map_notification(struct notifier_block *notifier,
if (!dev || netdev != dev->dev)
continue;
odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
- if (dev == odev)
+ if (dev == odev) {
call_rcu(&dev->rcu,
__dev_map_entry_free);
+ atomic_dec((atomic_t *)&dtab->items);
+ }
}
}
rcu_read_unlock();
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 5dfcb5ad0d06..00c253b84bf5 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -249,7 +249,18 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ if (htab_is_percpu(htab)) {
+ void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+ cond_resched();
+ }
+ } else {
+ bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ cond_resched();
+ }
cond_resched();
}
}
@@ -596,6 +607,8 @@ free_htab:
static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
+ if (likely(key_len % 4 == 0))
+ return jhash2(key, key_len / 4, hashrnd);
return jhash(key, key_len, hashrnd);
}
@@ -759,9 +772,17 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
static void check_and_free_fields(struct bpf_htab *htab,
struct htab_elem *elem)
{
- void *map_value = elem->key + round_up(htab->map.key_size, 8);
+ if (htab_is_percpu(htab)) {
+ void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+ } else {
+ void *map_value = elem->key + round_up(htab->map.key_size, 8);
- bpf_obj_free_fields(htab->map.record, map_value);
+ bpf_obj_free_fields(htab->map.record, map_value);
+ }
}
/* It is called from the bpf_lru_list when the LRU needs to delete
@@ -858,9 +879,9 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
+ check_and_free_fields(htab, l);
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
- check_and_free_fields(htab, l);
bpf_mem_cache_free(&htab->ma, l);
}
@@ -918,14 +939,13 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
{
if (!onallcpus) {
/* copy true value_size bytes */
- memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+ copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
} else {
u32 size = round_up(htab->map.value_size, 8);
int off = 0, cpu;
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
- value + off, size);
+ copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
off += size;
}
}
@@ -940,16 +960,14 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
* (onallcpus=false always when coming from bpf prog).
*/
if (!onallcpus) {
- u32 size = round_up(htab->map.value_size, 8);
int current_cpu = raw_smp_processor_id();
int cpu;
for_each_possible_cpu(cpu) {
if (cpu == current_cpu)
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
- size);
- else
- memset(per_cpu_ptr(pptr, cpu), 0, size);
+ copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
+ else /* Since elem is preallocated, we cannot touch special fields */
+ zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
}
} else {
pcpu_copy_value(htab, pptr, value, onallcpus);
@@ -1057,8 +1075,8 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
}
/* Called from syscall or from eBPF program */
-static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1159,8 +1177,8 @@ static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
bpf_lru_push_free(&htab->lru, &elem->lru_node);
}
-static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old = NULL;
@@ -1226,9 +1244,9 @@ err:
return ret;
}
-static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags,
- bool onallcpus)
+static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1281,9 +1299,9 @@ err:
return ret;
}
-static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags,
- bool onallcpus)
+static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1348,21 +1366,21 @@ err:
return ret;
}
-static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}
-static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
false);
}
/* Called from syscall or from eBPF program */
-static int htab_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -1398,7 +1416,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
return ret;
}
-static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -1575,9 +1593,8 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
pptr = htab_elem_get_ptr(l, key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off,
- per_cpu_ptr(pptr, cpu),
- roundup_value_size);
+ copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, value + off);
off += roundup_value_size;
}
} else {
@@ -1772,8 +1789,8 @@ again_nocopy:
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(dst_val + off,
- per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, dst_val + off);
off += size;
}
} else {
@@ -2046,9 +2063,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
roundup_value_size = round_up(map->value_size, 8);
pptr = htab_elem_get_ptr(elem, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(info->percpu_value_buf + off,
- per_cpu_ptr(pptr, cpu),
- roundup_value_size);
+ copy_map_value_long(map, info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, info->percpu_value_buf + off);
off += roundup_value_size;
}
ctx.value = info->percpu_value_buf;
@@ -2119,8 +2136,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};
-static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
- void *callback_ctx, u64 flags)
+static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -2175,6 +2192,44 @@ out:
return num_elems;
}
+static u64 htab_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 value_size = round_up(htab->map.value_size, 8);
+ bool prealloc = htab_is_prealloc(htab);
+ bool percpu = htab_is_percpu(htab);
+ bool lru = htab_is_lru(htab);
+ u64 num_entries;
+ u64 usage = sizeof(struct bpf_htab);
+
+ usage += sizeof(struct bucket) * htab->n_buckets;
+ usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
+ if (prealloc) {
+ num_entries = map->max_entries;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ usage += htab->elem_size * num_entries;
+
+ if (percpu)
+ usage += value_size * num_possible_cpus() * num_entries;
+ else if (!lru)
+ usage += sizeof(struct htab_elem *) * num_possible_cpus();
+ } else {
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+ num_entries = htab->use_percpu_counter ?
+ percpu_counter_sum(&htab->pcount) :
+ atomic_read(&htab->count);
+ usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
+ if (percpu) {
+ usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
+ usage += value_size * num_possible_cpus() * num_entries;
+ }
+ }
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
const struct bpf_map_ops htab_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -2191,6 +2246,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab),
.map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
@@ -2212,6 +2268,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru),
.map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
@@ -2292,8 +2349,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
*/
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off,
- per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value + off);
off += size;
}
ret = 0;
@@ -2363,6 +2420,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_percpu),
.map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
@@ -2382,6 +2440,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru_percpu),
.map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
@@ -2519,6 +2578,7 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = htab_of_map_gen_lookup,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab),
.map_btf_id = &htab_map_btf_ids[0],
};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 5b278a38ae58..8d368fa353f9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -18,6 +18,7 @@
#include <linux/pid_namespace.h>
#include <linux/poison.h>
#include <linux/proc_ns.h>
+#include <linux/sched/task.h>
#include <linux/security.h>
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>
@@ -257,7 +258,7 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
goto err_clear;
/* Verifier guarantees that size > 0 */
- strscpy(buf, task->comm, size);
+ strscpy_pad(buf, task->comm, size);
return 0;
err_clear:
memset(buf, 0, size);
@@ -571,7 +572,7 @@ static const struct bpf_func_proto bpf_strncmp_proto = {
.func = bpf_strncmp,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_PTR_TO_CONST_STR,
};
@@ -1264,10 +1265,11 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
{
struct bpf_hrtimer *t;
int ret = 0;
+ enum hrtimer_mode mode;
if (in_nmi())
return -EOPNOTSUPP;
- if (flags)
+ if (flags > BPF_F_TIMER_ABS)
return -EINVAL;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
@@ -1275,7 +1277,13 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
ret = -EINVAL;
goto out;
}
- hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
+
+ if (flags & BPF_F_TIMER_ABS)
+ mode = HRTIMER_MODE_ABS_SOFT;
+ else
+ mode = HRTIMER_MODE_REL_SOFT;
+
+ hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
return ret;
@@ -1420,11 +1428,21 @@ static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
return ptr->size & DYNPTR_RDONLY_BIT;
}
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
+{
+ ptr->size |= DYNPTR_RDONLY_BIT;
+}
+
static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
{
ptr->size |= type << DYNPTR_TYPE_SHIFT;
}
+static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
+{
+ return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
+}
+
u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_SIZE_MASK;
@@ -1497,6 +1515,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
u32, offset, u64, flags)
{
+ enum bpf_dynptr_type type;
int err;
if (!src->data || flags)
@@ -1506,13 +1525,25 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
if (err)
return err;
- /* Source and destination may possibly overlap, hence use memmove to
- * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
- * pointing to overlapping PTR_TO_MAP_VALUE regions.
- */
- memmove(dst, src->data + src->offset + offset, len);
+ type = bpf_dynptr_get_type(src);
- return 0;
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst, src->data + src->offset + offset, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_XDP:
+ return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
}
static const struct bpf_func_proto bpf_dynptr_read_proto = {
@@ -1529,22 +1560,40 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
u32, len, u64, flags)
{
+ enum bpf_dynptr_type type;
int err;
- if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
+ if (!dst->data || bpf_dynptr_is_rdonly(dst))
return -EINVAL;
err = bpf_dynptr_check_off_len(dst, offset, len);
if (err)
return err;
- /* Source and destination may possibly overlap, hence use memmove to
- * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
- * pointing to overlapping PTR_TO_MAP_VALUE regions.
- */
- memmove(dst->data + dst->offset + offset, src, len);
+ type = bpf_dynptr_get_type(dst);
- return 0;
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ if (flags)
+ return -EINVAL;
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst->data + dst->offset + offset, src, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
+ flags);
+ case BPF_DYNPTR_TYPE_XDP:
+ if (flags)
+ return -EINVAL;
+ return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
}
static const struct bpf_func_proto bpf_dynptr_write_proto = {
@@ -1560,6 +1609,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
{
+ enum bpf_dynptr_type type;
int err;
if (!ptr->data)
@@ -1572,7 +1622,20 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
if (bpf_dynptr_is_rdonly(ptr))
return 0;
- return (unsigned long)(ptr->data + ptr->offset + offset);
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return (unsigned long)(ptr->data + ptr->offset + offset);
+ case BPF_DYNPTR_TYPE_SKB:
+ case BPF_DYNPTR_TYPE_XDP:
+ /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
+ return 0;
+ default:
+ WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
+ return 0;
+ }
}
static const struct bpf_func_proto bpf_dynptr_data_proto = {
@@ -1693,6 +1756,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_cgrp_storage_get_proto;
case BPF_FUNC_cgrp_storage_delete:
return &bpf_cgrp_storage_delete_proto;
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
#endif
default:
break;
@@ -1731,6 +1798,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
}
}
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock)
{
@@ -1761,13 +1830,8 @@ unlock:
/* The contained type can also have resources, including a
* bpf_list_head which needs to be freed.
*/
- bpf_obj_free_fields(field->graph_root.value_rec, obj);
- /* bpf_mem_free requires migrate_disable(), since we can be
- * called from map free path as well apart from BPF program (as
- * part of map ops doing bpf_obj_free_fields).
- */
migrate_disable();
- bpf_mem_free(&bpf_global_ma, obj);
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
migrate_enable();
}
}
@@ -1804,10 +1868,9 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
obj = pos;
obj -= field->graph_root.node_offset;
- bpf_obj_free_fields(field->graph_root.value_rec, obj);
migrate_disable();
- bpf_mem_free(&bpf_global_ma, obj);
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
migrate_enable();
}
}
@@ -1826,45 +1889,96 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
if (!p)
return NULL;
if (meta)
- bpf_obj_init(meta->field_offs, p);
+ bpf_obj_init(meta->record, p);
return p;
}
+/* Must be called under migrate_disable(), as required by bpf_mem_free */
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+{
+ if (rec && rec->refcount_off >= 0 &&
+ !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
+ /* Object is refcounted and refcount_dec didn't result in 0
+ * refcount. Return without freeing the object
+ */
+ return;
+ }
+
+ if (rec)
+ bpf_obj_free_fields(rec, p);
+ bpf_mem_free(&bpf_global_ma, p);
+}
+
__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
{
struct btf_struct_meta *meta = meta__ign;
void *p = p__alloc;
- if (meta)
- bpf_obj_free_fields(meta->record, p);
- bpf_mem_free(&bpf_global_ma, p);
+ __bpf_obj_drop_impl(p, meta ? meta->record : NULL);
}
-static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail)
+__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ struct bpf_refcount *ref;
+
+ /* Could just cast directly to refcount_t *, but need some code using
+ * bpf_refcount type so that it is emitted in vmlinux BTF
+ */
+ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off);
+
+ refcount_inc((refcount_t *)ref);
+ return (void *)p__refcounted_kptr;
+}
+
+static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head,
+ bool tail, struct btf_record *rec, u64 off)
{
struct list_head *n = (void *)node, *h = (void *)head;
+ /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+ * called on its fields, so init here
+ */
if (unlikely(!h->next))
INIT_LIST_HEAD(h);
- if (unlikely(!n->next))
- INIT_LIST_HEAD(n);
+ if (!list_empty(n)) {
+ /* Only called from BPF prog, no need to migrate_disable */
+ __bpf_obj_drop_impl(n - off, rec);
+ return -EINVAL;
+ }
+
tail ? list_add_tail(n, h) : list_add(n, h);
+
+ return 0;
}
-__bpf_kfunc void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node)
+__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ void *meta__ign, u64 off)
{
- return __bpf_list_add(node, head, false);
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_add(node, head, false,
+ meta ? meta->record : NULL, off);
}
-__bpf_kfunc void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node)
+__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ void *meta__ign, u64 off)
{
- return __bpf_list_add(node, head, true);
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_add(node, head, true,
+ meta ? meta->record : NULL, off);
}
static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
{
struct list_head *n, *h = (void *)head;
+ /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+ * called on its fields, so init here
+ */
if (unlikely(!h->next))
INIT_LIST_HEAD(h);
if (list_empty(h))
@@ -1890,6 +2004,9 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
struct rb_root_cached *r = (struct rb_root_cached *)root;
struct rb_node *n = (struct rb_node *)node;
+ if (RB_EMPTY_NODE(n))
+ return NULL;
+
rb_erase_cached(n, r);
RB_CLEAR_NODE(n);
return (struct bpf_rb_node *)n;
@@ -1898,14 +2015,20 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
* program
*/
-static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
- void *less)
+static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ void *less, struct btf_record *rec, u64 off)
{
struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
+ struct rb_node *parent = NULL, *n = (struct rb_node *)node;
bpf_callback_t cb = (bpf_callback_t)less;
- struct rb_node *parent = NULL;
bool leftmost = true;
+ if (!RB_EMPTY_NODE(n)) {
+ /* Only called from BPF prog, no need to migrate_disable */
+ __bpf_obj_drop_impl(n - off, rec);
+ return -EINVAL;
+ }
+
while (*link) {
parent = *link;
if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
@@ -1916,15 +2039,18 @@ static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
}
}
- rb_link_node((struct rb_node *)node, parent, link);
- rb_insert_color_cached((struct rb_node *)node,
- (struct rb_root_cached *)root, leftmost);
+ rb_link_node(n, parent, link);
+ rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
+ return 0;
}
-__bpf_kfunc void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
- bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b))
+__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+ void *meta__ign, u64 off)
{
- __bpf_rbtree_add(root, node, (void *)less);
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_rbtree_add(root, node, (void *)less, meta ? meta->record : NULL, off);
}
__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
@@ -1942,73 +2068,8 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
*/
__bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
{
- return get_task_struct(p);
-}
-
-/**
- * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task
- * acquired by this kfunc which is not stored in a map as a kptr, must be
- * released by calling bpf_task_release().
- * @p: The task on which a reference is being acquired.
- */
-__bpf_kfunc struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
-{
- /* For the time being this function returns NULL, as it's not currently
- * possible to safely acquire a reference to a task with RCU protection
- * using get_task_struct() and put_task_struct(). This is due to the
- * slightly odd mechanics of p->rcu_users, and how task RCU protection
- * works.
- *
- * A struct task_struct is refcounted by two different refcount_t
- * fields:
- *
- * 1. p->usage: The "true" refcount field which tracks a task's
- * lifetime. The task is freed as soon as this
- * refcount drops to 0.
- *
- * 2. p->rcu_users: An "RCU users" refcount field which is statically
- * initialized to 2, and is co-located in a union with
- * a struct rcu_head field (p->rcu). p->rcu_users
- * essentially encapsulates a single p->usage
- * refcount, and when p->rcu_users goes to 0, an RCU
- * callback is scheduled on the struct rcu_head which
- * decrements the p->usage refcount.
- *
- * There are two important implications to this task refcounting logic
- * described above. The first is that
- * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as
- * after the refcount goes to 0, the RCU callback being scheduled will
- * cause the memory backing the refcount to again be nonzero due to the
- * fields sharing a union. The other is that we can't rely on RCU to
- * guarantee that a task is valid in a BPF program. This is because a
- * task could have already transitioned to being in the TASK_DEAD
- * state, had its rcu_users refcount go to 0, and its rcu callback
- * invoked in which it drops its single p->usage reference. At this
- * point the task will be freed as soon as the last p->usage reference
- * goes to 0, without waiting for another RCU gp to elapse. The only
- * way that a BPF program can guarantee that a task is valid is in this
- * scenario is to hold a p->usage refcount itself.
- *
- * Until we're able to resolve this issue, either by pulling
- * p->rcu_users and p->rcu out of the union, or by getting rid of
- * p->usage and just using p->rcu_users for refcounting, we'll just
- * return NULL here.
- */
- return NULL;
-}
-
-/**
- * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task
- * kptr acquired by this kfunc which is not subsequently stored in a map, must
- * be released by calling bpf_task_release().
- * @pp: A pointer to a task kptr on which a reference is being acquired.
- */
-__bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
-{
- /* We must return NULL here until we have clarity on how to properly
- * leverage RCU for ensuring a task's lifetime. See the comment above
- * in bpf_task_acquire_not_zero() for more details.
- */
+ if (refcount_inc_not_zero(&p->rcu_users))
+ return p;
return NULL;
}
@@ -2018,10 +2079,7 @@ __bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
*/
__bpf_kfunc void bpf_task_release(struct task_struct *p)
{
- if (!p)
- return;
-
- put_task_struct(p);
+ put_task_struct_rcu_user(p);
}
#ifdef CONFIG_CGROUPS
@@ -2033,39 +2091,7 @@ __bpf_kfunc void bpf_task_release(struct task_struct *p)
*/
__bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
{
- cgroup_get(cgrp);
- return cgrp;
-}
-
-/**
- * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. A cgroup
- * kptr acquired by this kfunc which is not subsequently stored in a map, must
- * be released by calling bpf_cgroup_release().
- * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired.
- */
-__bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
-{
- struct cgroup *cgrp;
-
- rcu_read_lock();
- /* Another context could remove the cgroup from the map and release it
- * at any time, including after we've done the lookup above. This is
- * safe because we're in an RCU read region, so the cgroup is
- * guaranteed to remain valid until at least the rcu_read_unlock()
- * below.
- */
- cgrp = READ_ONCE(*cgrpp);
-
- if (cgrp && !cgroup_tryget(cgrp))
- /* If the cgroup had been removed from the map and freed as
- * described above, cgroup_tryget() will return false. The
- * cgroup will be freed at some point after the current RCU gp
- * has ended, so just return NULL to the user.
- */
- cgrp = NULL;
- rcu_read_unlock();
-
- return cgrp;
+ return cgroup_tryget(cgrp) ? cgrp : NULL;
}
/**
@@ -2077,9 +2103,6 @@ __bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
*/
__bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
{
- if (!cgrp)
- return;
-
cgroup_put(cgrp);
}
@@ -2097,10 +2120,28 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
if (level > cgrp->level || level < 0)
return NULL;
+ /* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
ancestor = cgrp->ancestors[level];
- cgroup_get(ancestor);
+ if (!cgroup_tryget(ancestor))
+ return NULL;
return ancestor;
}
+
+/**
+ * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this
+ * kfunc which is not subsequently stored in a map, must be released by calling
+ * bpf_cgroup_release().
+ * @cgid: cgroup id.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
+{
+ struct cgroup *cgrp;
+
+ cgrp = cgroup_get_from_id(cgid);
+ if (IS_ERR(cgrp))
+ return NULL;
+ return cgrp;
+}
#endif /* CONFIG_CGROUPS */
/**
@@ -2116,12 +2157,146 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
rcu_read_lock();
p = find_task_by_pid_ns(pid, &init_pid_ns);
if (p)
- bpf_task_acquire(p);
+ p = bpf_task_acquire(p);
rcu_read_unlock();
return p;
}
+/**
+ * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * If the intention is to write to the data slice, please use
+ * bpf_dynptr_slice_rdwr.
+ *
+ * The user must check that the returned pointer is not null before using it.
+ *
+ * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
+ void *buffer, u32 buffer__szk)
+{
+ enum bpf_dynptr_type type;
+ u32 len = buffer__szk;
+ int err;
+
+ if (!ptr->data)
+ return NULL;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, len);
+ if (err)
+ return NULL;
+
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return ptr->data + ptr->offset + offset;
+ case BPF_DYNPTR_TYPE_SKB:
+ return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
+ case BPF_DYNPTR_TYPE_XDP:
+ {
+ void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
+ if (xdp_ptr)
+ return xdp_ptr;
+
+ bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false);
+ return buffer;
+ }
+ default:
+ WARN_ONCE(true, "unknown dynptr type %d\n", type);
+ return NULL;
+ }
+}
+
+/**
+ * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * The returned pointer is writable and may point to either directly the dynptr
+ * data at the requested offset or to the buffer if unable to obtain a direct
+ * data pointer to (example: the requested slice is to the paged area of an skb
+ * packet). In the case where the returned pointer is to the buffer, the user
+ * is responsible for persisting writes through calling bpf_dynptr_write(). This
+ * usually looks something like this pattern:
+ *
+ * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
+ * if (!eth)
+ * return TC_ACT_SHOT;
+ *
+ * // mutate eth header //
+ *
+ * if (eth == buffer)
+ * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
+ *
+ * Please note that, as in the example above, the user must check that the
+ * returned pointer is not null before using it.
+ *
+ * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * Return: NULL if the call failed (eg invalid dynptr), pointer to a
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
+ void *buffer, u32 buffer__szk)
+{
+ if (!ptr->data || bpf_dynptr_is_rdonly(ptr))
+ return NULL;
+
+ /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
+ *
+ * For skb-type dynptrs, it is safe to write into the returned pointer
+ * if the bpf program allows skb data writes. There are two possiblities
+ * that may occur when calling bpf_dynptr_slice_rdwr:
+ *
+ * 1) The requested slice is in the head of the skb. In this case, the
+ * returned pointer is directly to skb data, and if the skb is cloned, the
+ * verifier will have uncloned it (see bpf_unclone_prologue()) already.
+ * The pointer can be directly written into.
+ *
+ * 2) Some portion of the requested slice is in the paged buffer area.
+ * In this case, the requested data will be copied out into the buffer
+ * and the returned pointer will be a pointer to the buffer. The skb
+ * will not be pulled. To persist the write, the user will need to call
+ * bpf_dynptr_write(), which will pull the skb and commit the write.
+ *
+ * Similarly for xdp programs, if the requested slice is not across xdp
+ * fragments, then a direct pointer will be returned, otherwise the data
+ * will be copied out into the buffer and the user will need to call
+ * bpf_dynptr_write() to commit changes.
+ */
+ return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk);
+}
+
__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
{
return obj;
@@ -2150,23 +2325,22 @@ BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_list_push_front)
-BTF_ID_FLAGS(func, bpf_list_push_back)
+BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE)
+BTF_ID_FLAGS(func, bpf_list_push_front_impl)
+BTF_ID_FLAGS(func, bpf_list_push_back_impl)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE)
-BTF_ID_FLAGS(func, bpf_rbtree_add)
+BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
#ifdef CONFIG_CGROUPS
-BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
BTF_SET8_END(generic_btf_ids)
@@ -2190,6 +2364,11 @@ BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
BTF_ID_FLAGS(func, bpf_rdonly_cast)
BTF_ID_FLAGS(func, bpf_rcu_read_lock)
BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
+BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
BTF_SET8_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index e90d9f63edc5..4c7bbec4a9e4 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -141,8 +141,8 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)
return &READ_ONCE(storage->buf)->data[0];
}
-static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
struct bpf_cgroup_storage *storage;
struct bpf_storage_buffer *new;
@@ -348,7 +348,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map)
bpf_map_area_free(map);
}
-static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -446,6 +446,12 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+static u64 cgroup_storage_map_usage(const struct bpf_map *map)
+{
+ /* Currently the dynamically allocated elements are not counted. */
+ return sizeof(struct bpf_cgroup_storage_map);
+}
+
BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct,
bpf_cgroup_storage_map)
const struct bpf_map_ops cgroup_storage_map_ops = {
@@ -457,6 +463,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
.map_delete_elem = cgroup_storage_delete_elem,
.map_check_btf = cgroup_storage_check_btf,
.map_seq_show_elem = cgroup_storage_seq_show_elem,
+ .map_mem_usage = cgroup_storage_map_usage,
.map_btf_id = &cgroup_storage_map_btf_ids[0],
};
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
new file mode 100644
index 000000000000..046ddff37a76
--- /dev/null
+++ b/kernel/bpf/log.c
@@ -0,0 +1,330 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+ */
+#include <uapi/linux/btf.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/math64.h>
+
+static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+{
+ /* ubuf and len_total should both be specified (or not) together */
+ if (!!log->ubuf != !!log->len_total)
+ return false;
+ /* log buf without log_level is meaningless */
+ if (log->ubuf && log->level == 0)
+ return false;
+ if (log->level & ~BPF_LOG_MASK)
+ return false;
+ if (log->len_total > UINT_MAX >> 2)
+ return false;
+ return true;
+}
+
+int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
+ char __user *log_buf, u32 log_size)
+{
+ log->level = log_level;
+ log->ubuf = log_buf;
+ log->len_total = log_size;
+
+ /* log attributes have to be sane */
+ if (!bpf_verifier_log_attr_valid(log))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
+{
+ /* add_len includes terminal \0, so no need for +1. */
+ u64 len = log->end_pos + add_len;
+
+ /* log->len_max could be larger than our current len due to
+ * bpf_vlog_reset() calls, so we maintain the max of any length at any
+ * previous point
+ */
+ if (len > UINT_MAX)
+ log->len_max = UINT_MAX;
+ else if (len > log->len_max)
+ log->len_max = len;
+}
+
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+ va_list args)
+{
+ u64 cur_pos;
+ u32 new_n, n;
+
+ n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
+
+ WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+ "verifier log line truncated - local buffer too short\n");
+
+ if (log->level == BPF_LOG_KERNEL) {
+ bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+
+ pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
+ return;
+ }
+
+ n += 1; /* include terminating zero */
+ bpf_vlog_update_len_max(log, n);
+
+ if (log->level & BPF_LOG_FIXED) {
+ /* check if we have at least something to put into user buf */
+ new_n = 0;
+ if (log->end_pos < log->len_total) {
+ new_n = min_t(u32, log->len_total - log->end_pos, n);
+ log->kbuf[new_n - 1] = '\0';
+ }
+
+ cur_pos = log->end_pos;
+ log->end_pos += n - 1; /* don't count terminating '\0' */
+
+ if (log->ubuf && new_n &&
+ copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n))
+ goto fail;
+ } else {
+ u64 new_end, new_start;
+ u32 buf_start, buf_end, new_n;
+
+ new_end = log->end_pos + n;
+ if (new_end - log->start_pos >= log->len_total)
+ new_start = new_end - log->len_total;
+ else
+ new_start = log->start_pos;
+
+ log->start_pos = new_start;
+ log->end_pos = new_end - 1; /* don't count terminating '\0' */
+
+ if (!log->ubuf)
+ return;
+
+ new_n = min(n, log->len_total);
+ cur_pos = new_end - new_n;
+ div_u64_rem(cur_pos, log->len_total, &buf_start);
+ div_u64_rem(new_end, log->len_total, &buf_end);
+ /* new_end and buf_end are exclusive indices, so if buf_end is
+ * exactly zero, then it actually points right to the end of
+ * ubuf and there is no wrap around
+ */
+ if (buf_end == 0)
+ buf_end = log->len_total;
+
+ /* if buf_start > buf_end, we wrapped around;
+ * if buf_start == buf_end, then we fill ubuf completely; we
+ * can't have buf_start == buf_end to mean that there is
+ * nothing to write, because we always write at least
+ * something, even if terminal '\0'
+ */
+ if (buf_start < buf_end) {
+ /* message fits within contiguous chunk of ubuf */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ buf_end - buf_start))
+ goto fail;
+ } else {
+ /* message wraps around the end of ubuf, copy in two chunks */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ log->len_total - buf_start))
+ goto fail;
+ if (copy_to_user(log->ubuf,
+ log->kbuf + n - buf_end,
+ buf_end))
+ goto fail;
+ }
+ }
+
+ return;
+fail:
+ log->ubuf = NULL;
+}
+
+void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
+{
+ char zero = 0;
+ u32 pos;
+
+ if (WARN_ON_ONCE(new_pos > log->end_pos))
+ return;
+
+ if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL)
+ return;
+
+ /* if position to which we reset is beyond current log window,
+ * then we didn't preserve any useful content and should adjust
+ * start_pos to end up with an empty log (start_pos == end_pos)
+ */
+ log->end_pos = new_pos;
+ if (log->end_pos < log->start_pos)
+ log->start_pos = log->end_pos;
+
+ if (!log->ubuf)
+ return;
+
+ if (log->level & BPF_LOG_FIXED)
+ pos = log->end_pos + 1;
+ else
+ div_u64_rem(new_pos, log->len_total, &pos);
+
+ if (pos < log->len_total && put_user(zero, log->ubuf + pos))
+ log->ubuf = NULL;
+}
+
+static void bpf_vlog_reverse_kbuf(char *buf, int len)
+{
+ int i, j;
+
+ for (i = 0, j = len - 1; i < j; i++, j--)
+ swap(buf[i], buf[j]);
+}
+
+static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end)
+{
+ /* we split log->kbuf into two equal parts for both ends of array */
+ int n = sizeof(log->kbuf) / 2, nn;
+ char *lbuf = log->kbuf, *rbuf = log->kbuf + n;
+
+ /* Read ubuf's section [start, end) two chunks at a time, from left
+ * and right side; within each chunk, swap all the bytes; after that
+ * reverse the order of lbuf and rbuf and write result back to ubuf.
+ * This way we'll end up with swapped contents of specified
+ * [start, end) ubuf segment.
+ */
+ while (end - start > 1) {
+ nn = min(n, (end - start ) / 2);
+
+ if (copy_from_user(lbuf, log->ubuf + start, nn))
+ return -EFAULT;
+ if (copy_from_user(rbuf, log->ubuf + end - nn, nn))
+ return -EFAULT;
+
+ bpf_vlog_reverse_kbuf(lbuf, nn);
+ bpf_vlog_reverse_kbuf(rbuf, nn);
+
+ /* we write lbuf to the right end of ubuf, while rbuf to the
+ * left one to end up with properly reversed overall ubuf
+ */
+ if (copy_to_user(log->ubuf + start, rbuf, nn))
+ return -EFAULT;
+ if (copy_to_user(log->ubuf + end - nn, lbuf, nn))
+ return -EFAULT;
+
+ start += nn;
+ end -= nn;
+ }
+
+ return 0;
+}
+
+int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual)
+{
+ u32 sublen;
+ int err;
+
+ *log_size_actual = 0;
+ if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL)
+ return 0;
+
+ if (!log->ubuf)
+ goto skip_log_rotate;
+ /* If we never truncated log, there is nothing to move around. */
+ if (log->start_pos == 0)
+ goto skip_log_rotate;
+
+ /* Otherwise we need to rotate log contents to make it start from the
+ * buffer beginning and be a continuous zero-terminated string. Note
+ * that if log->start_pos != 0 then we definitely filled up entire log
+ * buffer with no gaps, and we just need to shift buffer contents to
+ * the left by (log->start_pos % log->len_total) bytes.
+ *
+ * Unfortunately, user buffer could be huge and we don't want to
+ * allocate temporary kernel memory of the same size just to shift
+ * contents in a straightforward fashion. Instead, we'll be clever and
+ * do in-place array rotation. This is a leetcode-style problem, which
+ * could be solved by three rotations.
+ *
+ * Let's say we have log buffer that has to be shifted left by 7 bytes
+ * (spaces and vertical bar is just for demonstrative purposes):
+ * E F G H I J K | A B C D
+ *
+ * First, we reverse entire array:
+ * D C B A | K J I H G F E
+ *
+ * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes
+ * (KJIHGFE), resulting in a properly rotated array:
+ * A B C D | E F G H I J K
+ *
+ * We'll utilize log->kbuf to read user memory chunk by chunk, swap
+ * bytes, and write them back. Doing it byte-by-byte would be
+ * unnecessarily inefficient. Altogether we are going to read and
+ * write each byte twice, for total 4 memory copies between kernel and
+ * user space.
+ */
+
+ /* length of the chopped off part that will be the beginning;
+ * len(ABCD) in the example above
+ */
+ div_u64_rem(log->start_pos, log->len_total, &sublen);
+ sublen = log->len_total - sublen;
+
+ err = bpf_vlog_reverse_ubuf(log, 0, log->len_total);
+ err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen);
+ err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
+ if (err)
+ log->ubuf = NULL;
+
+skip_log_rotate:
+ *log_size_actual = log->len_max;
+
+ /* properly initialized log has either both ubuf!=NULL and len_total>0
+ * or ubuf==NULL and len_total==0, so if this condition doesn't hold,
+ * we got a fault somewhere along the way, so report it back
+ */
+ if (!!log->ubuf != !!log->len_total)
+ return -EFAULT;
+
+ /* did truncation actually happen? */
+ if (log->ubuf && log->len_max > log->len_total)
+ return -ENOSPC;
+
+ return 0;
+}
+
+/* log_level controls verbosity level of eBPF verifier.
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
+ */
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(&env->log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_log);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index d833496e9e42..e0d3ddf2037a 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -300,8 +300,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
}
/* Called from syscall or from eBPF program */
-static int trie_update_elem(struct bpf_map *map,
- void *_key, void *value, u64 flags)
+static long trie_update_elem(struct bpf_map *map,
+ void *_key, void *value, u64 flags)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
@@ -431,7 +431,7 @@ out:
}
/* Called from syscall or from eBPF program */
-static int trie_delete_elem(struct bpf_map *map, void *_key)
+static long trie_delete_elem(struct bpf_map *map, void *_key)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct bpf_lpm_trie_key *key = _key;
@@ -720,6 +720,16 @@ static int trie_check_btf(const struct bpf_map *map,
-EINVAL : 0;
}
+static u64 trie_mem_usage(const struct bpf_map *map)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ u64 elem_size;
+
+ elem_size = sizeof(struct lpm_trie_node) + trie->data_size +
+ trie->map.value_size;
+ return elem_size * READ_ONCE(trie->n_entries);
+}
+
BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)
const struct bpf_map_ops trie_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -733,5 +743,6 @@ const struct bpf_map_ops trie_map_ops = {
.map_update_batch = generic_map_update_batch,
.map_delete_batch = generic_map_delete_batch,
.map_check_btf = trie_check_btf,
+ .map_mem_usage = trie_mem_usage,
.map_btf_id = &trie_map_btf_ids[0],
};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 38136ec4e095..2c5c64c2a53b 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -56,18 +56,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
ret = PTR_ERR(inner_map_meta->record);
goto free;
}
- if (inner_map_meta->record) {
- struct btf_field_offs *field_offs;
- /* If btf_record is !IS_ERR_OR_NULL, then field_offs is always
- * valid.
- */
- field_offs = kmemdup(inner_map->field_offs, sizeof(*inner_map->field_offs), GFP_KERNEL | __GFP_NOWARN);
- if (!field_offs) {
- ret = -ENOMEM;
- goto free_rec;
- }
- inner_map_meta->field_offs = field_offs;
- }
/* Note: We must use the same BTF, as we also used btf_record_dup above
* which relies on BTF being same for both maps, as some members like
* record->fields.list_head have pointers like value_rec pointing into
@@ -88,8 +76,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
fdput(f);
return inner_map_meta;
-free_rec:
- btf_record_free(inner_map_meta->record);
free:
kfree(inner_map_meta);
put:
@@ -99,7 +85,6 @@ put:
void bpf_map_meta_free(struct bpf_map *map_meta)
{
- kfree(map_meta->field_offs);
bpf_map_free_record(map_meta);
btf_put(map_meta->btf);
kfree(map_meta);
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 5fcdacbb8439..410637c225fb 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -121,15 +121,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head)
return entry;
}
-static void *__alloc(struct bpf_mem_cache *c, int node)
+static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
- /* Allocate, but don't deplete atomic reserves that typical
- * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
- * will allocate from the current numa node which is what we
- * want here.
- */
- gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
-
if (c->percpu_size) {
void **obj = kmalloc_node(c->percpu_size, flags, node);
void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
@@ -185,7 +178,12 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
*/
obj = __llist_del_first(&c->free_by_rcu);
if (!obj) {
- obj = __alloc(c, node);
+ /* Allocate, but don't deplete atomic reserves that typical
+ * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+ * will allocate from the current numa node which is what we
+ * want here.
+ */
+ obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
if (!obj)
break;
}
@@ -676,3 +674,46 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
unit_free(this_cpu_ptr(ma->cache), ptr);
}
+
+/* Directly does a kfree() without putting 'ptr' back to the free_llist
+ * for reuse and without waiting for a rcu_tasks_trace gp.
+ * The caller must first go through the rcu_tasks_trace gp for 'ptr'
+ * before calling bpf_mem_cache_raw_free().
+ * It could be used when the rcu_tasks_trace callback does not have
+ * a hold on the original bpf_mem_alloc object that allocated the
+ * 'ptr'. This should only be used in the uncommon code path.
+ * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
+ * and may affect performance.
+ */
+void bpf_mem_cache_raw_free(void *ptr)
+{
+ if (!ptr)
+ return;
+
+ kfree(ptr - LLIST_NODE_SZ);
+}
+
+/* When flags == GFP_KERNEL, it signals that the caller will not cause
+ * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
+ * kmalloc if the free_llist is empty.
+ */
+void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
+{
+ struct bpf_mem_cache *c;
+ void *ret;
+
+ c = this_cpu_ptr(ma->cache);
+
+ ret = unit_alloc(c);
+ if (!ret && flags == GFP_KERNEL) {
+ struct mem_cgroup *memcg, *old_memcg;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+ }
+
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 0c85e06f7ea7..d9c9f45e3529 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -563,6 +563,12 @@ void bpf_map_offload_map_free(struct bpf_map *map)
bpf_map_area_free(offmap);
}
+u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map)
+{
+ /* The memory dynamically allocated in netdev dev_ops is not counted */
+ return sizeof(struct bpf_offloaded_map);
+}
+
int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
{
struct bpf_offloaded_map *offmap = map_to_offmap(map);
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 8a5e060de63b..601609164ef3 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -95,7 +95,7 @@ static void queue_stack_map_free(struct bpf_map *map)
bpf_map_area_free(qs);
}
-static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
+static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long flags;
@@ -124,7 +124,7 @@ out:
}
-static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
+static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long flags;
@@ -156,32 +156,32 @@ out:
}
/* Called from syscall or from eBPF program */
-static int queue_map_peek_elem(struct bpf_map *map, void *value)
+static long queue_map_peek_elem(struct bpf_map *map, void *value)
{
return __queue_map_get(map, value, false);
}
/* Called from syscall or from eBPF program */
-static int stack_map_peek_elem(struct bpf_map *map, void *value)
+static long stack_map_peek_elem(struct bpf_map *map, void *value)
{
return __stack_map_get(map, value, false);
}
/* Called from syscall or from eBPF program */
-static int queue_map_pop_elem(struct bpf_map *map, void *value)
+static long queue_map_pop_elem(struct bpf_map *map, void *value)
{
return __queue_map_get(map, value, true);
}
/* Called from syscall or from eBPF program */
-static int stack_map_pop_elem(struct bpf_map *map, void *value)
+static long stack_map_pop_elem(struct bpf_map *map, void *value)
{
return __stack_map_get(map, value, true);
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
- u64 flags)
+static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
+ u64 flags)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long irq_flags;
@@ -227,14 +227,14 @@ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long queue_stack_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
return -EINVAL;
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_delete_elem(struct bpf_map *map, void *key)
+static long queue_stack_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -246,6 +246,14 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
return -EINVAL;
}
+static u64 queue_stack_map_mem_usage(const struct bpf_map *map)
+{
+ u64 usage = sizeof(struct bpf_queue_stack);
+
+ usage += ((u64)map->max_entries + 1) * map->value_size;
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack)
const struct bpf_map_ops queue_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -259,6 +267,7 @@ const struct bpf_map_ops queue_map_ops = {
.map_pop_elem = queue_map_pop_elem,
.map_peek_elem = queue_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
+ .map_mem_usage = queue_stack_map_mem_usage,
.map_btf_id = &queue_map_btf_ids[0],
};
@@ -274,5 +283,6 @@ const struct bpf_map_ops stack_map_ops = {
.map_pop_elem = stack_map_pop_elem,
.map_peek_elem = stack_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
+ .map_mem_usage = queue_stack_map_mem_usage,
.map_btf_id = &queue_map_btf_ids[0],
};
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 82c61612f382..cbf2d8d784b8 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -59,7 +59,7 @@ static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall only */
-static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+static long reuseport_array_delete_elem(struct bpf_map *map, void *key)
{
struct reuseport_array *array = reuseport_array(map);
u32 index = *(u32 *)key;
@@ -335,6 +335,13 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
return 0;
}
+static u64 reuseport_array_mem_usage(const struct bpf_map *map)
+{
+ struct reuseport_array *array;
+
+ return struct_size(array, ptrs, map->max_entries);
+}
+
BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array)
const struct bpf_map_ops reuseport_array_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -344,5 +351,6 @@ const struct bpf_map_ops reuseport_array_ops = {
.map_lookup_elem = reuseport_array_lookup_elem,
.map_get_next_key = reuseport_array_get_next_key,
.map_delete_elem = reuseport_array_delete_elem,
+ .map_mem_usage = reuseport_array_mem_usage,
.map_btf_id = &reuseport_array_map_btf_ids[0],
};
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 8732e0aadf36..875ac9b698d9 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -19,6 +19,7 @@
(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* consumer page and producer page */
#define RINGBUF_POS_PAGES 2
+#define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES)
#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)
@@ -96,7 +97,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
__GFP_NOWARN | __GFP_ZERO;
- int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
+ int nr_meta_pages = RINGBUF_NR_META_PAGES;
int nr_data_pages = data_sz >> PAGE_SHIFT;
int nr_pages = nr_meta_pages + nr_data_pages;
struct page **pages, *page;
@@ -241,13 +242,13 @@ static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
return ERR_PTR(-ENOTSUPP);
}
-static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 flags)
+static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 flags)
{
return -ENOTSUPP;
}
-static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
+static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
return -ENOTSUPP;
}
@@ -336,6 +337,21 @@ static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
return 0;
}
+static u64 ringbuf_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_ringbuf *rb;
+ int nr_data_pages;
+ int nr_meta_pages;
+ u64 usage = sizeof(struct bpf_ringbuf_map);
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+ usage += (u64)rb->nr_pages << PAGE_SHIFT;
+ nr_meta_pages = RINGBUF_NR_META_PAGES;
+ nr_data_pages = map->max_entries >> PAGE_SHIFT;
+ usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *);
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -347,6 +363,7 @@ const struct bpf_map_ops ringbuf_map_ops = {
.map_update_elem = ringbuf_map_update_elem,
.map_delete_elem = ringbuf_map_delete_elem,
.map_get_next_key = ringbuf_map_get_next_key,
+ .map_mem_usage = ringbuf_map_mem_usage,
.map_btf_id = &ringbuf_map_btf_ids[0],
};
@@ -361,6 +378,7 @@ const struct bpf_map_ops user_ringbuf_map_ops = {
.map_update_elem = ringbuf_map_update_elem,
.map_delete_elem = ringbuf_map_delete_elem,
.map_get_next_key = ringbuf_map_get_next_key,
+ .map_mem_usage = ringbuf_map_mem_usage,
.map_btf_id = &user_ringbuf_map_btf_ids[0],
};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index aecea7451b61..b25fce425b2c 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -618,14 +618,14 @@ static int stack_map_get_next_key(struct bpf_map *map, void *key,
return 0;
}
-static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return -EINVAL;
}
/* Called from syscall or from eBPF program */
-static int stack_map_delete_elem(struct bpf_map *map, void *key)
+static long stack_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *old_bucket;
@@ -654,6 +654,19 @@ static void stack_map_free(struct bpf_map *map)
put_callchain_buffers();
}
+static u64 stack_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ u64 value_size = map->value_size;
+ u64 n_buckets = smap->n_buckets;
+ u64 enties = map->max_entries;
+ u64 usage = sizeof(*smap);
+
+ usage += n_buckets * sizeof(struct stack_map_bucket *);
+ usage += enties * (sizeof(struct stack_map_bucket) + value_size);
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map)
const struct bpf_map_ops stack_trace_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -664,5 +677,6 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = stack_map_mem_usage,
.map_btf_id = &stack_trace_map_btf_ids[0],
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index adc83cb82f37..14f39c1e573e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,6 +35,7 @@
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
+#include <net/netfilter/nf_bpf_link.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -105,6 +106,7 @@ const struct bpf_map_ops bpf_map_offload_ops = {
.map_alloc = bpf_map_offload_map_alloc,
.map_free = bpf_map_offload_map_free,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = bpf_map_offload_map_mem_usage,
};
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -128,6 +130,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
}
if (attr->map_ifindex)
ops = &bpf_map_offload_ops;
+ if (!ops->map_mem_usage)
+ return ERR_PTR(-EINVAL);
map = ops->map_alloc(attr);
if (IS_ERR(map))
return map;
@@ -517,14 +521,14 @@ static int btf_field_cmp(const void *a, const void *b)
}
struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
- enum btf_field_type type)
+ u32 field_mask)
{
struct btf_field *field;
- if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type))
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
return NULL;
field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
- if (!field || !(field->type & type))
+ if (!field || !(field->type & field_mask))
return NULL;
return field;
}
@@ -549,6 +553,7 @@ void btf_record_free(struct btf_record *rec)
case BPF_RB_NODE:
case BPF_SPIN_LOCK:
case BPF_TIMER:
+ case BPF_REFCOUNT:
/* Nothing to release */
break;
default:
@@ -596,6 +601,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_RB_NODE:
case BPF_SPIN_LOCK:
case BPF_TIMER:
+ case BPF_REFCOUNT:
/* Nothing to acquire */
break;
default:
@@ -647,6 +653,8 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
bpf_timer_cancel_and_free(obj + rec->timer_off);
}
+extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -656,8 +664,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
return;
fields = rec->fields;
for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *pointee_struct_meta;
const struct btf_field *field = &fields[i];
void *field_ptr = obj + field->offset;
+ void *xchgd_field;
switch (fields[i].type) {
case BPF_SPIN_LOCK:
@@ -669,7 +679,22 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
WRITE_ONCE(*(u64 *)field_ptr, 0);
break;
case BPF_KPTR_REF:
- field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
+ xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
+ if (!xchgd_field)
+ break;
+
+ if (!btf_is_kernel(field->kptr.btf)) {
+ pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
+ field->kptr.btf_id);
+ WARN_ON_ONCE(!pointee_struct_meta);
+ migrate_disable();
+ __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
+ pointee_struct_meta->record :
+ NULL);
+ migrate_enable();
+ } else {
+ field->kptr.dtor(xchgd_field);
+ }
break;
case BPF_LIST_HEAD:
if (WARN_ON_ONCE(rec->spin_lock_off < 0))
@@ -683,6 +708,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
break;
case BPF_LIST_NODE:
case BPF_RB_NODE:
+ case BPF_REFCOUNT:
break;
default:
WARN_ON_ONCE(1);
@@ -695,14 +721,13 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
- struct btf_field_offs *foffs = map->field_offs;
struct btf_record *rec = map->record;
security_bpf_map_free(map);
bpf_map_release_memcg(map);
/* implementation dependent freeing */
map->ops->map_free(map);
- /* Delay freeing of field_offs and btf_record for maps, as map_free
+ /* Delay freeing of btf_record for maps, as map_free
* callback usually needs access to them. It is better to do it here
* than require each callback to do the free itself manually.
*
@@ -711,7 +736,6 @@ static void bpf_map_free_deferred(struct work_struct *work)
* eventually calls bpf_map_free_meta, since inner_map_meta is only a
* template bpf_map struct used during verification.
*/
- kfree(foffs);
btf_record_free(rec);
}
@@ -771,17 +795,10 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
}
#ifdef CONFIG_PROC_FS
-/* Provides an approximation of the map's memory footprint.
- * Used only to provide a backward compatibility and display
- * a reasonable "memlock" info.
- */
-static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
+/* Show the memory usage of a bpf map */
+static u64 bpf_map_memory_usage(const struct bpf_map *map)
{
- unsigned long size;
-
- size = round_up(map->key_size + bpf_map_value_size(map), 8);
-
- return round_up(map->max_entries * size, PAGE_SIZE);
+ return map->ops->map_mem_usage(map);
}
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
@@ -803,7 +820,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
"map_extra:\t%#llx\n"
- "memlock:\t%lu\n"
+ "memlock:\t%llu\n"
"map_id:\t%u\n"
"frozen:\t%u\n",
map->map_type,
@@ -812,7 +829,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->max_entries,
map->map_flags,
(unsigned long long)map->map_extra,
- bpf_map_memory_footprint(map),
+ bpf_map_memory_usage(map),
map->id,
READ_ONCE(map->frozen));
if (type) {
@@ -1019,7 +1036,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
map->record = btf_parse_fields(btf, value_type,
BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
- BPF_RB_ROOT,
+ BPF_RB_ROOT | BPF_REFCOUNT,
map->value_size);
if (!IS_ERR_OR_NULL(map->record)) {
int i;
@@ -1058,10 +1075,17 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_REFCOUNT:
if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY &&
- map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
+ map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
ret = -EOPNOTSUPP;
goto free_map_tab;
}
@@ -1104,7 +1128,6 @@ free_map_tab:
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
- struct btf_field_offs *foffs;
struct bpf_map *map;
int f_flags;
int err;
@@ -1184,17 +1207,9 @@ static int map_create(union bpf_attr *attr)
attr->btf_vmlinux_value_type_id;
}
-
- foffs = btf_parse_field_offs(map->record);
- if (IS_ERR(foffs)) {
- err = PTR_ERR(foffs);
- goto free_map;
- }
- map->field_offs = foffs;
-
err = security_bpf_map_alloc(map);
if (err)
- goto free_map_field_offs;
+ goto free_map;
err = bpf_map_alloc_id(map);
if (err)
@@ -1218,8 +1233,6 @@ static int map_create(union bpf_attr *attr)
free_map_sec:
security_bpf_map_free(map);
-free_map_field_offs:
- kfree(map->field_offs);
free_map:
btf_put(map->btf);
map->ops->map_free(map);
@@ -1285,8 +1298,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
return map;
}
-/* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+/* map_idr_lock should have been held or the map should have been
+ * protected by rcu read lock.
+ */
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
int refold;
@@ -2049,6 +2064,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
bpf_prog_kallsyms_del_all(prog);
btf_put(prog->aux->btf);
+ module_put(prog->aux->mod);
kvfree(prog->aux->jited_linfo);
kvfree(prog->aux->linfo);
kfree(prog->aux->kfunc_tab);
@@ -2439,7 +2455,6 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_LWT_SEG6LOCAL:
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
- case BPF_PROG_TYPE_LIRC_MODE2:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -2448,6 +2463,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_EXT: /* extends any prog */
+ case BPF_PROG_TYPE_NETFILTER:
return true;
case BPF_PROG_TYPE_CGROUP_SKB:
/* always unpriv */
@@ -2477,9 +2493,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
+#define BPF_PROG_LOAD_LAST_FIELD log_true_size
-static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
@@ -2629,7 +2645,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
goto free_prog_sec;
/* run eBPF verifier */
- err = bpf_check(&prog, attr, uattr);
+ err = bpf_check(&prog, attr, uattr, uattr_size);
if (err < 0)
goto free_used_maps;
@@ -2804,16 +2820,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
const struct bpf_prog *prog = link->prog;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
- "link_id:\t%u\n"
- "prog_tag:\t%s\n"
- "prog_id:\t%u\n",
+ "link_id:\t%u\n",
bpf_link_type_strs[link->type],
- link->id,
- prog_tag,
- prog->aux->id);
+ link->id);
+ if (prog) {
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+ seq_printf(m,
+ "prog_tag:\t%s\n"
+ "prog_id:\t%u\n",
+ prog_tag,
+ prog->aux->id);
+ }
if (link->ops->show_fdinfo)
link->ops->show_fdinfo(link, m);
}
@@ -3095,6 +3114,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
if (err)
goto out_unlock;
+ if (tgt_info.tgt_mod) {
+ module_put(prog->aux->mod);
+ prog->aux->mod = tgt_info.tgt_mod;
+ }
+
tr = bpf_trampoline_get(key, &tgt_info);
if (!tr) {
err = -ENOMEM;
@@ -4288,7 +4312,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
info.type = link->type;
info.id = link->id;
- info.prog_id = link->prog->aux->id;
+ if (link->prog)
+ info.prog_id = link->prog->aux->id;
if (link->ops->fill_link_info) {
err = link->ops->fill_link_info(link, &info);
@@ -4338,9 +4363,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
return err;
}
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
-static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
@@ -4348,7 +4373,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
if (!bpf_capable())
return -EPERM;
- return btf_new_fd(attr, uattr);
+ return btf_new_fd(attr, uattr, uattr_size);
}
#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
@@ -4551,6 +4576,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
+ if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+ return bpf_struct_ops_link_create(attr);
+
prog = bpf_prog_get(attr->link_create.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -4562,6 +4590,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
switch (prog->type) {
case BPF_PROG_TYPE_EXT:
+ case BPF_PROG_TYPE_NETFILTER:
break;
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_TRACEPOINT:
@@ -4628,6 +4657,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_XDP:
ret = bpf_xdp_link_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_NETFILTER:
+ ret = bpf_nf_link_attach(attr, prog);
+ break;
#endif
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_TRACEPOINT:
@@ -4649,6 +4681,35 @@ out:
return ret;
}
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+ struct bpf_map *new_map, *old_map = NULL;
+ int ret;
+
+ new_map = bpf_map_get(attr->link_update.new_map_fd);
+ if (IS_ERR(new_map))
+ return PTR_ERR(new_map);
+
+ if (attr->link_update.flags & BPF_F_REPLACE) {
+ old_map = bpf_map_get(attr->link_update.old_map_fd);
+ if (IS_ERR(old_map)) {
+ ret = PTR_ERR(old_map);
+ goto out_put;
+ }
+ } else if (attr->link_update.old_map_fd) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ ret = link->ops->update_map(link, new_map, old_map);
+
+ if (old_map)
+ bpf_map_put(old_map);
+out_put:
+ bpf_map_put(new_map);
+ return ret;
+}
+
#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
static int link_update(union bpf_attr *attr)
@@ -4669,6 +4730,11 @@ static int link_update(union bpf_attr *attr)
if (IS_ERR(link))
return PTR_ERR(link);
+ if (link->ops->update_map) {
+ ret = link_update_map(link, attr);
+ goto out_put_link;
+ }
+
new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
if (IS_ERR(new_prog)) {
ret = PTR_ERR(new_prog);
@@ -4989,7 +5055,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = map_freeze(&attr);
break;
case BPF_PROG_LOAD:
- err = bpf_prog_load(&attr, uattr);
+ err = bpf_prog_load(&attr, uattr, size);
break;
case BPF_OBJ_PIN:
err = bpf_obj_pin(&attr);
@@ -5034,7 +5100,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = bpf_raw_tracepoint_open(&attr);
break;
case BPF_BTF_LOAD:
- err = bpf_btf_load(&attr, uattr);
+ err = bpf_btf_load(&attr, uattr, size);
break;
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d0ed7d6f5eec..ac021bc43a66 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -9,7 +9,6 @@
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
-#include <linux/module.h>
#include <linux/static_call.h>
#include <linux/bpf_verifier.h>
#include <linux/bpf_lsm.h>
@@ -45,8 +44,8 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd
lockdep_assert_held_once(&tr->mutex);
/* Instead of updating the trampoline here, we propagate
- * -EAGAIN to register_ftrace_direct_multi(). Then we can
- * retry register_ftrace_direct_multi() after updating the
+ * -EAGAIN to register_ftrace_direct(). Then we can
+ * retry register_ftrace_direct() after updating the
* trampoline.
*/
if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
@@ -172,38 +171,16 @@ out:
return tr;
}
-static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
-{
- struct module *mod;
- int err = 0;
-
- preempt_disable();
- mod = __module_text_address((unsigned long) tr->func.addr);
- if (mod && !try_module_get(mod))
- err = -ENOENT;
- preempt_enable();
- tr->mod = mod;
- return err;
-}
-
-static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
-{
- module_put(tr->mod);
- tr->mod = NULL;
-}
-
static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
void *ip = tr->func.addr;
int ret;
if (tr->func.ftrace_managed)
- ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr);
+ ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
- if (!ret)
- bpf_trampoline_module_put(tr);
return ret;
}
@@ -215,9 +192,9 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
if (tr->func.ftrace_managed) {
if (lock_direct_mutex)
- ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr);
+ ret = modify_ftrace_direct(tr->fops, (long)new_addr);
else
- ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr);
+ ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
} else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
}
@@ -238,18 +215,13 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
tr->func.ftrace_managed = true;
}
- if (bpf_trampoline_module_get(tr))
- return -ENOENT;
-
if (tr->func.ftrace_managed) {
ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
- ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
+ ret = register_ftrace_direct(tr->fops, (long)new_addr);
} else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
}
- if (ret)
- bpf_trampoline_module_put(tr);
return ret;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d517d13878cf..fbcf5a4e2fcd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -24,6 +24,7 @@
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/poison.h>
+#include <linux/module.h>
#include "disasm.h"
@@ -194,6 +195,8 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
static int ref_set_non_owning(struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
+static void specialize_kfunc(struct bpf_verifier_env *env,
+ u32 func_id, u16 offset, unsigned long *addr);
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
@@ -268,7 +271,50 @@ struct bpf_call_arg_meta {
u32 ret_btf_id;
u32 subprogno;
struct btf_field *kptr_field;
- u8 uninit_dynptr_regno;
+};
+
+struct btf_and_id {
+ struct btf *btf;
+ u32 btf_id;
+};
+
+struct bpf_kfunc_call_arg_meta {
+ /* In parameters */
+ struct btf *btf;
+ u32 func_id;
+ u32 kfunc_flags;
+ const struct btf_type *func_proto;
+ const char *func_name;
+ /* Out parameters */
+ u32 ref_obj_id;
+ u8 release_regno;
+ bool r0_rdonly;
+ u32 ret_btf_id;
+ u64 r0_size;
+ u32 subprogno;
+ struct {
+ u64 value;
+ bool found;
+ } arg_constant;
+ union {
+ struct btf_and_id arg_obj_drop;
+ struct btf_and_id arg_refcount_acquire;
+ };
+ struct {
+ struct btf_field *field;
+ } arg_list_head;
+ struct {
+ struct btf_field *field;
+ } arg_rbtree_root;
+ struct {
+ enum bpf_dynptr_type type;
+ u32 id;
+ } initialized_dynptr;
+ struct {
+ u8 spi;
+ u8 frameno;
+ } iter;
+ u64 mem_size;
};
struct btf *btf_vmlinux;
@@ -296,61 +342,6 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
return &linfo[i - 1];
}
-void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
- va_list args)
-{
- unsigned int n;
-
- n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
-
- WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
- "verifier log line truncated - local buffer too short\n");
-
- if (log->level == BPF_LOG_KERNEL) {
- bool newline = n > 0 && log->kbuf[n - 1] == '\n';
-
- pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
- return;
- }
-
- n = min(log->len_total - log->len_used - 1, n);
- log->kbuf[n] = '\0';
- if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
- log->len_used += n;
- else
- log->ubuf = NULL;
-}
-
-static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
-{
- char zero = 0;
-
- if (!bpf_verifier_log_needed(log))
- return;
-
- log->len_used = new_pos;
- if (put_user(zero, log->ubuf + new_pos))
- log->ubuf = NULL;
-}
-
-/* log_level controls verbosity level of eBPF verifier.
- * bpf_verifier_log_write() is used to dump the verification trace to the log,
- * so the user can figure out what's wrong with the program
- */
-__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
- const char *fmt, ...)
-{
- va_list args;
-
- if (!bpf_verifier_log_needed(&env->log))
- return;
-
- va_start(args, fmt);
- bpf_verifier_vlog(&env->log, fmt, args);
- va_end(args);
-}
-EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
-
__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
{
struct bpf_verifier_env *env = private_data;
@@ -364,20 +355,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
va_end(args);
}
-__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
- const char *fmt, ...)
-{
- va_list args;
-
- if (!bpf_verifier_log_needed(log))
- return;
-
- va_start(args, fmt);
- bpf_verifier_vlog(log, fmt, args);
- va_end(args);
-}
-EXPORT_SYMBOL_GPL(bpf_log);
-
static const char *ltrim(const char *s)
{
while (isspace(*s))
@@ -447,13 +424,23 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
type == PTR_TO_XDP_SOCK;
}
+static bool type_may_be_null(u32 type)
+{
+ return type & PTR_MAYBE_NULL;
+}
+
static bool reg_type_not_null(enum bpf_reg_type type)
{
+ if (type_may_be_null(type))
+ return false;
+
+ type = base_type(type);
return type == PTR_TO_SOCKET ||
type == PTR_TO_TCP_SOCK ||
type == PTR_TO_MAP_VALUE ||
type == PTR_TO_MAP_KEY ||
- type == PTR_TO_SOCK_COMMON;
+ type == PTR_TO_SOCK_COMMON ||
+ type == PTR_TO_MEM;
}
static bool type_is_ptr_alloc_obj(u32 type)
@@ -491,11 +478,6 @@ static bool type_is_rdonly_mem(u32 type)
return type & MEM_RDONLY;
}
-static bool type_may_be_null(u32 type)
-{
- return type & PTR_MAYBE_NULL;
-}
-
static bool is_acquire_function(enum bpf_func_id func_id,
const struct bpf_map *map)
{
@@ -633,6 +615,7 @@ static char slot_type_char[] = {
[STACK_MISC] = 'm',
[STACK_ZERO] = '0',
[STACK_DYNPTR] = 'd',
+ [STACK_ITER] = 'i',
};
static void print_liveness(struct bpf_verifier_env *env,
@@ -675,37 +658,91 @@ static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_sl
return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
}
-static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ const char *obj_kind, int nr_slots)
{
int off, spi;
if (!tnum_is_const(reg->var_off)) {
- verbose(env, "dynptr has to be at a constant offset\n");
+ verbose(env, "%s has to be at a constant offset\n", obj_kind);
return -EINVAL;
}
off = reg->off + reg->var_off.value;
if (off % BPF_REG_SIZE) {
- verbose(env, "cannot pass in dynptr at an offset=%d\n", off);
+ verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
return -EINVAL;
}
spi = __get_spi(off);
- if (spi < 1) {
- verbose(env, "cannot pass in dynptr at an offset=%d\n", off);
+ if (spi + 1 < nr_slots) {
+ verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
return -EINVAL;
}
- if (!is_spi_bounds_valid(func(env, reg), spi, BPF_DYNPTR_NR_SLOTS))
+ if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots))
return -ERANGE;
return spi;
}
-static const char *kernel_type_name(const struct btf* btf, u32 id)
+static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);
+}
+
+static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
+{
+ return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
+}
+
+static const char *btf_type_name(const struct btf *btf, u32 id)
{
return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
}
+static const char *dynptr_type_str(enum bpf_dynptr_type type)
+{
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return "local";
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return "ringbuf";
+ case BPF_DYNPTR_TYPE_SKB:
+ return "skb";
+ case BPF_DYNPTR_TYPE_XDP:
+ return "xdp";
+ case BPF_DYNPTR_TYPE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown dynptr type %d\n", type);
+ return "<unknown>";
+ }
+}
+
+static const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+ if (!btf || btf_id == 0)
+ return "<invalid>";
+
+ /* we already validated that type is valid and has conforming name */
+ return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+static const char *iter_state_str(enum bpf_iter_state state)
+{
+ switch (state) {
+ case BPF_ITER_STATE_ACTIVE:
+ return "active";
+ case BPF_ITER_STATE_DRAINED:
+ return "drained";
+ case BPF_ITER_STATE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown iter state %d\n", state);
+ return "<unknown>";
+ }
+}
+
static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
{
env->scratched_regs |= 1U << regno;
@@ -751,11 +788,31 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
return BPF_DYNPTR_TYPE_LOCAL;
case DYNPTR_TYPE_RINGBUF:
return BPF_DYNPTR_TYPE_RINGBUF;
+ case DYNPTR_TYPE_SKB:
+ return BPF_DYNPTR_TYPE_SKB;
+ case DYNPTR_TYPE_XDP:
+ return BPF_DYNPTR_TYPE_XDP;
default:
return BPF_DYNPTR_TYPE_INVALID;
}
}
+static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
+{
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return DYNPTR_TYPE_LOCAL;
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return DYNPTR_TYPE_RINGBUF;
+ case BPF_DYNPTR_TYPE_SKB:
+ return DYNPTR_TYPE_SKB;
+ case BPF_DYNPTR_TYPE_XDP:
+ return DYNPTR_TYPE_XDP;
+ default:
+ return 0;
+ }
+}
+
static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
{
return type == BPF_DYNPTR_TYPE_RINGBUF;
@@ -895,6 +952,14 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
static void __mark_reg_unknown(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
+static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ if (!env->allow_ptr_leaks)
+ __mark_reg_not_init(env, reg);
+ else
+ __mark_reg_unknown(env, reg);
+}
+
static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
struct bpf_func_state *state, int spi)
{
@@ -934,12 +999,8 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
/* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
continue;
- if (dreg->dynptr_id == dynptr_id) {
- if (!env->allow_ptr_leaks)
- __mark_reg_not_init(env, dreg);
- else
- __mark_reg_unknown(env, dreg);
- }
+ if (dreg->dynptr_id == dynptr_id)
+ mark_reg_invalid(env, dreg);
}));
/* Do not release reference state, we are destroying dynptr on stack,
@@ -955,39 +1016,49 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
return 0;
}
-static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- int spi)
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
+ int spi;
+
if (reg->type == CONST_PTR_TO_DYNPTR)
return false;
- /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
- * will do check_mem_access to check and update stack bounds later, so
- * return true for that case.
+ spi = dynptr_get_spi(env, reg);
+
+ /* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an
+ * error because this just means the stack state hasn't been updated yet.
+ * We will do check_mem_access to check and update stack bounds later.
*/
- if (spi < 0)
- return spi == -ERANGE;
- /* We allow overwriting existing unreferenced STACK_DYNPTR slots, see
- * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to
- * ensure dynptr objects at the slots we are touching are completely
- * destructed before we reinitialize them for a new one. For referenced
- * ones, destroy_if_dynptr_stack_slot returns an error early instead of
- * delaying it until the end where the user will get "Unreleased
+ if (spi < 0 && spi != -ERANGE)
+ return false;
+
+ /* We don't need to check if the stack slots are marked by previous
+ * dynptr initializations because we allow overwriting existing unreferenced
+ * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls
+ * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are
+ * touching are completely destructed before we reinitialize them for a new
+ * one. For referenced ones, destroy_if_dynptr_stack_slot returns an error early
+ * instead of delaying it until the end where the user will get "Unreleased
* reference" error.
*/
return true;
}
-static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- int spi)
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
- int i;
+ int i, spi;
- /* This already represents first slot of initialized bpf_dynptr */
+ /* This already represents first slot of initialized bpf_dynptr.
+ *
+ * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
+ * check_func_arg_reg_off's logic, so we don't need to check its
+ * offset and alignment.
+ */
if (reg->type == CONST_PTR_TO_DYNPTR)
return true;
+ spi = dynptr_get_spi(env, reg);
if (spi < 0)
return false;
if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
@@ -1024,6 +1095,157 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
}
}
+static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+
+static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int insn_idx,
+ struct btf *btf, u32 btf_id, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j, id;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ id = acquire_reference_state(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ __mark_reg_known_zero(st);
+ st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ st->live |= REG_LIVE_WRITTEN;
+ st->ref_obj_id = i == 0 ? id : 0;
+ st->iter.btf = btf;
+ st->iter.btf_id = btf_id;
+ st->iter.state = BPF_ITER_STATE_ACTIVE;
+ st->iter.depth = 0;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_ITER;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ if (i == 0)
+ WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+
+ __mark_reg_not_init(env, st);
+
+ /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+ st->live |= REG_LIVE_WRITTEN;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_INVALID;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi == -ERANGE)
+ return true;
+ if (spi < 0)
+ return false;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] == STACK_ITER)
+ return false;
+ }
+
+ return true;
+}
+
+static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ struct btf *btf, u32 btf_id, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return false;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ /* only main (first) slot has ref_obj_id set */
+ if (i == 0 && !st->ref_obj_id)
+ return false;
+ if (i != 0 && st->ref_obj_id)
+ return false;
+ if (st->iter.btf != btf || st->iter.btf_id != btf_id)
+ return false;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] != STACK_ITER)
+ return false;
+ }
+
+ return true;
+}
+
+/* Check if given stack slot is "special":
+ * - spilled register state (STACK_SPILL);
+ * - dynptr state (STACK_DYNPTR);
+ * - iter state (STACK_ITER).
+ */
+static bool is_stack_slot_special(const struct bpf_stack_state *stack)
+{
+ enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
+
+ switch (type) {
+ case STACK_SPILL:
+ case STACK_DYNPTR:
+ case STACK_ITER:
+ return true;
+ case STACK_INVALID:
+ case STACK_MISC:
+ case STACK_ZERO:
+ return false;
+ default:
+ WARN_ONCE(1, "unknown stack slot type %d\n", type);
+ return true;
+ }
+}
+
/* The reg state of a pointer or a bounded scalar was saved when
* it was spilled to the stack.
*/
@@ -1070,7 +1292,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
verbose(env, "%s", reg_type_str(env, t));
if (base_type(t) == PTR_TO_BTF_ID)
- verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
+ verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
verbose(env, "(");
/*
* _a stands for append, was shortened to avoid multiline statements below.
@@ -1143,26 +1365,62 @@ static void print_verifier_state(struct bpf_verifier_env *env,
for (j = 0; j < BPF_REG_SIZE; j++) {
if (state->stack[i].slot_type[j] != STACK_INVALID)
valid = true;
- types_buf[j] = slot_type_char[
- state->stack[i].slot_type[j]];
+ types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
}
types_buf[BPF_REG_SIZE] = 0;
if (!valid)
continue;
if (!print_all && !stack_slot_scratched(env, i))
continue;
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, state->stack[i].spilled_ptr.live);
- if (is_spilled_reg(&state->stack[i])) {
+ switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
reg = &state->stack[i].spilled_ptr;
t = reg->type;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
verbose(env, "%lld", reg->var_off.value + reg->off);
- } else {
+ break;
+ case STACK_DYNPTR:
+ i += BPF_DYNPTR_NR_SLOTS - 1;
+ reg = &state->stack[i].spilled_ptr;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type));
+ if (reg->ref_obj_id)
+ verbose(env, "(ref_id=%d)", reg->ref_obj_id);
+ break;
+ case STACK_ITER:
+ /* only main slot has ref_obj_id set; skip others */
+ reg = &state->stack[i].spilled_ptr;
+ if (!reg->ref_obj_id)
+ continue;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ iter_type_str(reg->iter.btf, reg->iter.btf_id),
+ reg->ref_obj_id, iter_state_str(reg->iter.state),
+ reg->iter.depth);
+ break;
+ case STACK_MISC:
+ case STACK_ZERO:
+ default:
+ reg = &state->stack[i].spilled_ptr;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
+ types_buf[BPF_REG_SIZE] = 0;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
verbose(env, "=%s", types_buf);
+ break;
}
}
if (state->acquired_refs && state->refs[0].id) {
@@ -1188,10 +1446,10 @@ static inline u32 vlog_alignment(u32 pos)
static void print_insn_state(struct bpf_verifier_env *env,
const struct bpf_func_state *state)
{
- if (env->prev_log_len && env->prev_log_len == env->log.len_used) {
+ if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
/* remove new line character */
- bpf_vlog_reset(&env->log, env->prev_log_len - 1);
- verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' ');
+ bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
+ verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
} else {
verbose(env, "%d:", env->insn_idx);
}
@@ -1499,7 +1757,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
- elem->log_pos = env->log.len_used;
+ elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
err = copy_verifier_state(&elem->st, cur);
@@ -1664,6 +1922,12 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
reg->type == PTR_TO_PACKET_END;
}
+static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
+{
+ return base_type(reg->type) == PTR_TO_MEM &&
+ (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP);
+}
+
/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
enum bpf_reg_type which)
@@ -1823,9 +2087,9 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
struct tnum var64_off = tnum_intersect(reg->var_off,
tnum_range(reg->umin_value,
reg->umax_value));
- struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off),
- tnum_range(reg->u32_min_value,
- reg->u32_max_value));
+ struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off),
+ tnum_range(reg->u32_min_value,
+ reg->u32_max_value));
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
@@ -2029,7 +2293,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
- elem->log_pos = env->log.len_used;
+ elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
@@ -2117,6 +2381,7 @@ struct bpf_kfunc_desc {
u32 func_id;
s32 imm;
u16 offset;
+ unsigned long addr;
};
struct bpf_kfunc_btf {
@@ -2126,6 +2391,11 @@ struct bpf_kfunc_btf {
};
struct bpf_kfunc_desc_tab {
+ /* Sorted by func_id (BTF ID) and offset (fd_array offset) during
+ * verification. JITs do lookups by bpf_insn, where func_id may not be
+ * available, therefore at the end of verification do_misc_fixups()
+ * sorts this by imm and offset.
+ */
struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
u32 nr_descs;
};
@@ -2166,6 +2436,19 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off);
}
+int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
+ u16 btf_fd_idx, u8 **func_addr)
+{
+ const struct bpf_kfunc_desc *desc;
+
+ desc = find_kfunc_desc(prog, func_id, btf_fd_idx);
+ if (!desc)
+ return -EFAULT;
+
+ *func_addr = (u8 *)desc->addr;
+ return 0;
+}
+
static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
s16 offset)
{
@@ -2345,13 +2628,18 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
func_name);
return -EINVAL;
}
+ specialize_kfunc(env, func_id, offset, &addr);
- call_imm = BPF_CALL_IMM(addr);
- /* Check whether or not the relative offset overflows desc->imm */
- if ((unsigned long)(s32)call_imm != call_imm) {
- verbose(env, "address of kernel function %s is out of range\n",
- func_name);
- return -EINVAL;
+ if (bpf_jit_supports_far_kfunc_call()) {
+ call_imm = func_id;
+ } else {
+ call_imm = BPF_CALL_IMM(addr);
+ /* Check whether the relative offset overflows desc->imm */
+ if ((unsigned long)(s32)call_imm != call_imm) {
+ verbose(env, "address of kernel function %s is out of range\n",
+ func_name);
+ return -EINVAL;
+ }
}
if (bpf_dev_bound_kfunc_id(func_id)) {
@@ -2364,6 +2652,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
desc->func_id = func_id;
desc->imm = call_imm;
desc->offset = offset;
+ desc->addr = addr;
err = btf_distill_func_proto(&env->log, desc_btf,
func_proto, func_name,
&desc->func_model);
@@ -2373,19 +2662,19 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return err;
}
-static int kfunc_desc_cmp_by_imm(const void *a, const void *b)
+static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
{
const struct bpf_kfunc_desc *d0 = a;
const struct bpf_kfunc_desc *d1 = b;
- if (d0->imm > d1->imm)
- return 1;
- else if (d0->imm < d1->imm)
- return -1;
+ if (d0->imm != d1->imm)
+ return d0->imm < d1->imm ? -1 : 1;
+ if (d0->offset != d1->offset)
+ return d0->offset < d1->offset ? -1 : 1;
return 0;
}
-static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
+static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog)
{
struct bpf_kfunc_desc_tab *tab;
@@ -2394,7 +2683,7 @@ static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
return;
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
- kfunc_desc_cmp_by_imm, NULL);
+ kfunc_desc_cmp_by_imm_off, NULL);
}
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
@@ -2408,13 +2697,14 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
{
const struct bpf_kfunc_desc desc = {
.imm = insn->imm,
+ .offset = insn->off,
};
const struct bpf_kfunc_desc *res;
struct bpf_kfunc_desc_tab *tab;
tab = prog->aux->kfunc_tab;
res = bsearch(&desc, tab->descs, tab->nr_descs,
- sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm);
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
return res ? &res->func_model : NULL;
}
@@ -2475,8 +2765,8 @@ static int check_subprogs(struct bpf_verifier_env *env)
u8 code = insn[i].code;
if (code == (BPF_JMP | BPF_CALL) &&
- insn[i].imm == BPF_FUNC_tail_call &&
- insn[i].src_reg != BPF_PSEUDO_CALL)
+ insn[i].src_reg == 0 &&
+ insn[i].imm == BPF_FUNC_tail_call)
subprog[cur_subprog].has_tail_call = true;
if (BPF_CLASS(code) == BPF_LD &&
(BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
@@ -2587,6 +2877,25 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *
state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
}
+static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int err, i;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
+
+ err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+ if (err)
+ return err;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
/* This function is supposed to be used by the following 32-bit optimization
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
@@ -2967,6 +3276,21 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
}
} else if (opcode == BPF_EXIT) {
return -ENOTSUPP;
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ if (!(*reg_mask & (dreg | sreg)))
+ return 0;
+ /* dreg <cond> sreg
+ * Both dreg and sreg need precision before
+ * this insn. If only sreg was marked precise
+ * before it would be equally necessary to
+ * propagate it to dreg.
+ */
+ *reg_mask |= (sreg | dreg);
+ /* else dreg <cond> K
+ * Only dreg still needs precision before
+ * this insn, so for the K-based conditional
+ * there is nothing new to be marked.
+ */
}
} else if (class == BPF_LD) {
if (!(*reg_mask & dreg))
@@ -3568,8 +3892,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
/* regular write of data into stack destroys any spilled ptr */
state->stack[spi].spilled_ptr.type = NOT_INIT;
- /* Mark slots as STACK_MISC if they belonged to spilled ptr. */
- if (is_spilled_reg(&state->stack[spi]))
+ /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+ if (is_stack_slot_special(&state->stack[spi]))
for (i = 0; i < BPF_REG_SIZE; i++)
scrub_spilled_slot(&state->stack[spi].slot_type[i]);
@@ -3962,17 +4286,13 @@ static int check_stack_read(struct bpf_verifier_env *env,
}
/* Variable offset is prohibited for unprivileged mode for simplicity
* since it requires corresponding support in Spectre masking for stack
- * ALU. See also retrieve_ptr_limit().
+ * ALU. See also retrieve_ptr_limit(). The check in
+ * check_stack_access_for_ptr_arithmetic() called by
+ * adjust_ptr_min_max_vals() prevents users from creating stack pointers
+ * with variable offsets, therefore no check is required here. Further,
+ * just checking it here would be insufficient as speculative stack
+ * writes could still lead to unsafe speculative behaviour.
*/
- if (!env->bypass_spec_v1 && var_off) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
- ptr_regno, tn_buf);
- return -EACCES;
- }
-
if (!var_off) {
off += reg->var_off.value;
err = check_stack_read_fixed_off(env, state, off, size,
@@ -4178,8 +4498,8 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
struct btf_field *kptr_field,
struct bpf_reg_state *reg, u32 regno)
{
- const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
- int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED;
+ const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
+ int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
const char *reg_name = "";
/* Only unreferenced case accepts untrusted pointers */
@@ -4194,7 +4514,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
return -EINVAL;
}
/* We need to verify reg->type and reg->btf, before accessing reg->btf */
- reg_name = kernel_type_name(reg->btf, reg->btf_id);
+ reg_name = btf_type_name(reg->btf, reg->btf_id);
/* For ref_ptr case, release function check should ensure we get one
* referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
@@ -4246,6 +4566,36 @@ bad_type:
return -EINVAL;
}
+/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
+ * can dereference RCU protected pointers and result is PTR_TRUSTED.
+ */
+static bool in_rcu_cs(struct bpf_verifier_env *env)
+{
+ return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable;
+}
+
+/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
+BTF_SET_START(rcu_protected_types)
+BTF_ID(struct, prog_test_ref_kfunc)
+BTF_ID(struct, cgroup)
+BTF_ID(struct, bpf_cpumask)
+BTF_ID(struct, task_struct)
+BTF_SET_END(rcu_protected_types)
+
+static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
+{
+ if (!btf_is_kernel(btf))
+ return false;
+ return btf_id_set_contains(&rcu_protected_types, btf_id);
+}
+
+static bool rcu_safe_kptr(const struct btf_field *field)
+{
+ const struct btf_field_kptr *kptr = &field->kptr;
+
+ return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
+}
+
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
int value_regno, int insn_idx,
struct btf_field *kptr_field)
@@ -4280,7 +4630,10 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
* value from map as PTR_TO_BTF_ID, with the correct type.
*/
mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
- kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
+ kptr_field->kptr.btf_id,
+ rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ?
+ PTR_MAYBE_NULL | MEM_RCU :
+ PTR_MAYBE_NULL | PTR_UNTRUSTED);
/* For mark_ptr_or_null_reg */
val_reg->id = ++env->id_gen;
} else if (class == BPF_STX) {
@@ -4600,6 +4953,11 @@ static bool is_rcu_reg(const struct bpf_reg_state *reg)
return reg->type & MEM_RCU;
}
+static void clear_trusted_flags(enum bpf_type_flag *flag)
+{
+ *flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU);
+}
+
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size, bool strict)
@@ -5003,23 +5361,110 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
return 0;
}
-#define BTF_TYPE_SAFE_NESTED(__type) __PASTE(__type, __safe_fields)
+#define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu)
+#define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null)
+#define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted)
+
+/*
+ * Allow list few fields as RCU trusted or full trusted.
+ * This logic doesn't allow mix tagging and will be removed once GCC supports
+ * btf_type_tag.
+ */
-BTF_TYPE_SAFE_NESTED(struct task_struct) {
+/* RCU trusted: these fields are trusted in RCU CS and never NULL */
+BTF_TYPE_SAFE_RCU(struct task_struct) {
const cpumask_t *cpus_ptr;
+ struct css_set __rcu *cgroups;
+ struct task_struct __rcu *real_parent;
+ struct task_struct *group_leader;
};
-static bool nested_ptr_is_trusted(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg,
- int off)
+BTF_TYPE_SAFE_RCU(struct cgroup) {
+ /* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */
+ struct kernfs_node *kn;
+};
+
+BTF_TYPE_SAFE_RCU(struct css_set) {
+ struct cgroup *dfl_cgrp;
+};
+
+/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
+ struct file __rcu *exe_file;
+};
+
+/* skb->sk, req->sk are not RCU protected, but we mark them as such
+ * because bpf prog accessible sockets are SOCK_RCU_FREE.
+ */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) {
+ struct sock *sk;
+};
+
+BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) {
+ struct sock *sk;
+};
+
+/* full trusted: these fields are trusted even outside of RCU CS and never NULL */
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
+ struct seq_file *seq;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
+ struct file *file;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct file) {
+ struct inode *f_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct dentry) {
+ /* no negative dentry-s in places where bpf can see it */
+ struct inode *d_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct socket) {
+ struct sock *sk;
+};
+
+static bool type_is_rcu(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
{
- /* If its parent is not trusted, it can't regain its trusted status. */
- if (!is_trusted_reg(reg))
- return false;
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
+}
+
+static bool type_is_rcu_or_null(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null");
+}
- BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct));
+static bool type_is_trusted(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
- return btf_nested_type_is_trusted(&env->log, reg, off);
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
}
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
@@ -5031,8 +5476,9 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *reg = regs + regno;
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
const char *tname = btf_name_by_offset(reg->btf, t->name_off);
+ const char *field_name = NULL;
enum bpf_type_flag flag = 0;
- u32 btf_id;
+ u32 btf_id = 0;
int ret;
if (!env->allow_ptr_leaks) {
@@ -5077,12 +5523,12 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
- if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) {
+ if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) {
if (!btf_is_kernel(reg->btf)) {
verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
return -EFAULT;
}
- ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
+ ret = env->ops->btf_struct_access(&env->log, reg, off, size);
} else {
/* Writes are permitted with default btf_struct_access for
* program allocated objects (which always have ref_obj_id > 0),
@@ -5099,47 +5545,63 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EFAULT;
}
- ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
+ ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name);
}
if (ret < 0)
return ret;
- /* If this is an untrusted pointer, all pointers formed by walking it
- * also inherit the untrusted flag.
- */
- if (type_flag(reg->type) & PTR_UNTRUSTED)
- flag |= PTR_UNTRUSTED;
+ if (ret != PTR_TO_BTF_ID) {
+ /* just mark; */
- /* By default any pointer obtained from walking a trusted pointer is no
- * longer trusted, unless the field being accessed has explicitly been
- * marked as inheriting its parent's state of trust.
- *
- * An RCU-protected pointer can also be deemed trusted if we are in an
- * RCU read region. This case is handled below.
- */
- if (nested_ptr_is_trusted(env, reg, off))
- flag |= PTR_TRUSTED;
- else
- flag &= ~PTR_TRUSTED;
-
- if (flag & MEM_RCU) {
- /* Mark value register as MEM_RCU only if it is protected by
- * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU
- * itself can already indicate trustedness inside the rcu
- * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since
- * it could be null in some cases.
+ } else if (type_flag(reg->type) & PTR_UNTRUSTED) {
+ /* If this is an untrusted pointer, all pointers formed by walking it
+ * also inherit the untrusted flag.
*/
- if (!env->cur_state->active_rcu_lock ||
- !(is_trusted_reg(reg) || is_rcu_reg(reg)))
- flag &= ~MEM_RCU;
- else
- flag |= PTR_MAYBE_NULL;
- } else if (reg->type & MEM_RCU) {
- /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged
- * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively.
+ flag = PTR_UNTRUSTED;
+
+ } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) {
+ /* By default any pointer obtained from walking a trusted pointer is no
+ * longer trusted, unless the field being accessed has explicitly been
+ * marked as inheriting its parent's state of trust (either full or RCU).
+ * For example:
+ * 'cgroups' pointer is untrusted if task->cgroups dereference
+ * happened in a sleepable program outside of bpf_rcu_read_lock()
+ * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU).
+ * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED.
+ *
+ * A regular RCU-protected pointer with __rcu tag can also be deemed
+ * trusted if we are in an RCU CS. Such pointer can be NULL.
*/
- flag |= PTR_UNTRUSTED;
+ if (type_is_trusted(env, reg, field_name, btf_id)) {
+ flag |= PTR_TRUSTED;
+ } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
+ if (type_is_rcu(env, reg, field_name, btf_id)) {
+ /* ignore __rcu tag and mark it MEM_RCU */
+ flag |= MEM_RCU;
+ } else if (flag & MEM_RCU ||
+ type_is_rcu_or_null(env, reg, field_name, btf_id)) {
+ /* __rcu tagged pointers can be NULL */
+ flag |= MEM_RCU | PTR_MAYBE_NULL;
+ } else if (flag & (MEM_PERCPU | MEM_USER)) {
+ /* keep as-is */
+ } else {
+ /* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */
+ clear_trusted_flags(&flag);
+ }
+ } else {
+ /*
+ * If not in RCU CS or MEM_RCU pointer can be NULL then
+ * aggressively mark as untrusted otherwise such
+ * pointers will be plain PTR_TO_BTF_ID without flags
+ * and will be allowed to be passed into helpers for
+ * compat reasons.
+ */
+ flag = PTR_UNTRUSTED;
+ }
+ } else {
+ /* Old compat. Deprecated */
+ clear_trusted_flags(&flag);
}
if (atype == BPF_READ && value_regno >= 0)
@@ -5198,7 +5660,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
/* Simulate access to a PTR_TO_BTF_ID */
memset(&map_reg, 0, sizeof(map_reg));
mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
- ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag);
+ ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);
if (ret < 0)
return ret;
@@ -5864,6 +6326,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
env,
regno, reg->off, access_size,
zero_size_allowed, ACCESS_HELPER, meta);
+ case PTR_TO_BTF_ID:
+ return check_ptr_to_btf_access(env, regs, regno, reg->off,
+ access_size, BPF_READ, -1);
case PTR_TO_CTX:
/* in case the function doesn't know how to access the context,
* (because we are in a program of type SYSCALL for example), we
@@ -6211,11 +6676,11 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
* Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
* type, and declare it as 'const struct bpf_dynptr *' in their prototype.
*/
-int process_dynptr_func(struct bpf_verifier_env *env, int regno,
- enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)
+static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
+ enum bpf_arg_type arg_type)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- int spi = 0;
+ int err;
/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
* ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
@@ -6224,15 +6689,6 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
return -EFAULT;
}
- /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
- * check_func_arg_reg_off's logic. We only need to check offset
- * and its alignment for PTR_TO_STACK.
- */
- if (reg->type == PTR_TO_STACK) {
- spi = dynptr_get_spi(env, reg);
- if (spi < 0 && spi != -ERANGE)
- return spi;
- }
/* MEM_UNINIT - Points to memory that is an appropriate candidate for
* constructing a mutable bpf_dynptr object.
@@ -6250,30 +6706,30 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
* to.
*/
if (arg_type & MEM_UNINIT) {
- if (!is_dynptr_reg_valid_uninit(env, reg, spi)) {
+ int i;
+
+ if (!is_dynptr_reg_valid_uninit(env, reg)) {
verbose(env, "Dynptr has to be an uninitialized dynptr\n");
return -EINVAL;
}
- /* We only support one dynptr being uninitialized at the moment,
- * which is sufficient for the helper functions we have right now.
- */
- if (meta->uninit_dynptr_regno) {
- verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
- return -EFAULT;
+ /* we write BPF_DW bits (8 bytes) at a time */
+ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false);
+ if (err)
+ return err;
}
- meta->uninit_dynptr_regno = regno;
+ err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx);
} else /* MEM_RDONLY and None case from above */ {
- int err;
-
/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
return -EINVAL;
}
- if (!is_dynptr_reg_valid_init(env, reg, spi)) {
+ if (!is_dynptr_reg_valid_init(env, reg)) {
verbose(env,
"Expected an initialized dynptr as arg #%d\n",
regno);
@@ -6282,29 +6738,211 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
/* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
- const char *err_extra = "";
-
- switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
- case DYNPTR_TYPE_LOCAL:
- err_extra = "local";
- break;
- case DYNPTR_TYPE_RINGBUF:
- err_extra = "ringbuf";
- break;
- default:
- err_extra = "<unknown>";
- break;
- }
verbose(env,
"Expected a dynptr of type %s as arg #%d\n",
- err_extra, regno);
+ dynptr_type_str(arg_to_dynptr_type(arg_type)), regno);
return -EINVAL;
}
err = mark_dynptr_read(env, reg);
+ }
+ return err;
+}
+
+static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
+{
+ struct bpf_func_state *state = func(env, reg);
+
+ return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+}
+
+static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEW;
+}
+
+static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEXT;
+}
+
+static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_DESTROY;
+}
+
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
+{
+ /* btf_check_iter_kfuncs() guarantees that first argument of any iter
+ * kfunc is iter state pointer
+ */
+ return arg == 0 && is_iter_kfunc(meta);
+}
+
+static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ const struct btf_type *t;
+ const struct btf_param *arg;
+ int spi, err, i, nr_slots;
+ u32 btf_id;
+
+ /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
+ arg = &btf_params(meta->func_proto)[0];
+ t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */
+ t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */
+ nr_slots = t->size / BPF_REG_SIZE;
+
+ if (is_iter_new_kfunc(meta)) {
+ /* bpf_iter_<type>_new() expects pointer to uninit iter state */
+ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
+ verbose(env, "expected uninitialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false);
+ if (err)
+ return err;
+ }
+
+ err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+ if (err)
+ return err;
+ } else {
+ /* iter_next() or iter_destroy() expect initialized iter state*/
+ if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+ verbose(env, "expected an initialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno);
+ return -EINVAL;
+ }
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ err = mark_iter_read(env, reg, spi, nr_slots);
if (err)
return err;
+
+ /* remember meta->iter info for process_iter_next_call() */
+ meta->iter.spi = spi;
+ meta->iter.frameno = reg->frameno;
+ meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+
+ if (is_iter_destroy_kfunc(meta)) {
+ err = unmark_stack_slots_iter(env, reg, nr_slots);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+/* process_iter_next_call() is called when verifier gets to iterator's next
+ * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
+ * to it as just "iter_next()" in comments below.
+ *
+ * BPF verifier relies on a crucial contract for any iter_next()
+ * implementation: it should *eventually* return NULL, and once that happens
+ * it should keep returning NULL. That is, once iterator exhausts elements to
+ * iterate, it should never reset or spuriously return new elements.
+ *
+ * With the assumption of such contract, process_iter_next_call() simulates
+ * a fork in the verifier state to validate loop logic correctness and safety
+ * without having to simulate infinite amount of iterations.
+ *
+ * In current state, we first assume that iter_next() returned NULL and
+ * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
+ * conditions we should not form an infinite loop and should eventually reach
+ * exit.
+ *
+ * Besides that, we also fork current state and enqueue it for later
+ * verification. In a forked state we keep iterator state as ACTIVE
+ * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
+ * also bump iteration depth to prevent erroneous infinite loop detection
+ * later on (see iter_active_depths_differ() comment for details). In this
+ * state we assume that we'll eventually loop back to another iter_next()
+ * calls (it could be in exactly same location or in some other instruction,
+ * it doesn't matter, we don't make any unnecessary assumptions about this,
+ * everything revolves around iterator state in a stack slot, not which
+ * instruction is calling iter_next()). When that happens, we either will come
+ * to iter_next() with equivalent state and can conclude that next iteration
+ * will proceed in exactly the same way as we just verified, so it's safe to
+ * assume that loop converges. If not, we'll go on another iteration
+ * simulation with a different input state, until all possible starting states
+ * are validated or we reach maximum number of instructions limit.
+ *
+ * This way, we will either exhaustively discover all possible input states
+ * that iterator loop can start with and eventually will converge, or we'll
+ * effectively regress into bounded loop simulation logic and either reach
+ * maximum number of instructions if loop is not provably convergent, or there
+ * is some statically known limit on number of iterations (e.g., if there is
+ * an explicit `if n > 100 then break;` statement somewhere in the loop).
+ *
+ * One very subtle but very important aspect is that we *always* simulate NULL
+ * condition first (as the current state) before we simulate non-NULL case.
+ * This has to do with intricacies of scalar precision tracking. By simulating
+ * "exit condition" of iter_next() returning NULL first, we make sure all the
+ * relevant precision marks *that will be set **after** we exit iterator loop*
+ * are propagated backwards to common parent state of NULL and non-NULL
+ * branches. Thanks to that, state equivalence checks done later in forked
+ * state, when reaching iter_next() for ACTIVE iterator, can assume that
+ * precision marks are finalized and won't change. Because simulating another
+ * ACTIVE iterator iteration won't change them (because given same input
+ * states we'll end up with exactly same output states which we are currently
+ * comparing; and verification after the loop already propagated back what
+ * needs to be **additionally** tracked as precise). It's subtle, grok
+ * precision tracking for more intuitive understanding.
+ */
+static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+ struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
+ struct bpf_reg_state *cur_iter, *queued_iter;
+ int iter_frameno = meta->iter.frameno;
+ int iter_spi = meta->iter.spi;
+
+ BTF_TYPE_EMIT(struct bpf_iter);
+
+ cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+
+ if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
+ cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
+ verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n",
+ cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
+ return -EFAULT;
+ }
+
+ if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+ /* branch out active iter state */
+ queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
+ if (!queued_st)
+ return -ENOMEM;
+
+ queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+ queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
+ queued_iter->iter.depth++;
+
+ queued_fr = queued_st->frame[queued_st->curframe];
+ mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
}
+
+ /* switch to DRAINED state, but keep the depth unchanged */
+ /* mark current iter state as drained and assume returned NULL */
+ cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
+ __mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]);
+
return 0;
}
@@ -6402,6 +7040,7 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_MEM,
PTR_TO_MEM | MEM_RINGBUF,
PTR_TO_BUF,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
},
};
@@ -6511,6 +7150,9 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
if (arg_type & PTR_MAYBE_NULL)
type &= ~PTR_MAYBE_NULL;
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC)
+ type &= ~MEM_ALLOC;
+
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
expected = compatible->types[i];
if (expected == NOT_INIT)
@@ -6527,7 +7169,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
found:
- if (reg->type == PTR_TO_BTF_ID || reg->type & PTR_TRUSTED) {
+ if (base_type(reg->type) != PTR_TO_BTF_ID)
+ return 0;
+
+ if (compatible == &mem_types) {
+ if (!(arg_type & MEM_RDONLY)) {
+ verbose(env,
+ "%s() may write into memory pointed by R%d type=%s\n",
+ func_id_name(meta->func_id),
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ return 0;
+ }
+
+ switch ((int)reg->type) {
+ case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
+ {
/* For bpf_sk_release, it needs to match against first member
* 'struct sock_common', hence make an exception for it. This
* allows bpf_sk_release to work for multiple socket types.
@@ -6535,6 +7197,12 @@ found:
bool strict_type_match = arg_type_is_release(arg_type) &&
meta->func_id != BPF_FUNC_sk_release;
+ if (type_may_be_null(reg->type) &&
+ (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) {
+ verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
+ return -EACCES;
+ }
+
if (!arg_btf_id) {
if (!compatible->btf_id) {
verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
@@ -6558,18 +7226,29 @@ found:
btf_vmlinux, *arg_btf_id,
strict_type_match)) {
verbose(env, "R%d is of type %s but %s is expected\n",
- regno, kernel_type_name(reg->btf, reg->btf_id),
- kernel_type_name(btf_vmlinux, *arg_btf_id));
+ regno, btf_type_name(reg->btf, reg->btf_id),
+ btf_type_name(btf_vmlinux, *arg_btf_id));
return -EACCES;
}
}
- } else if (type_is_alloc(reg->type)) {
- if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) {
+ break;
+ }
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
+ meta->func_id != BPF_FUNC_kptr_xchg) {
verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
return -EFAULT;
}
+ /* Handled by helper specific checks */
+ break;
+ case PTR_TO_BTF_ID | MEM_PERCPU:
+ case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
+ /* Handled by helper specific checks */
+ break;
+ default:
+ verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n");
+ return -EFAULT;
}
-
return 0;
}
@@ -6619,7 +7298,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
verbose(env, "R%d must have zero offset when passed to release func\n",
regno);
verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno,
- kernel_type_name(reg->btf, reg->btf_id), reg->off);
+ btf_type_name(reg->btf, reg->btf_id), reg->off);
return -EINVAL;
}
@@ -6656,7 +7335,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
case PTR_TO_BTF_ID | MEM_ALLOC:
case PTR_TO_BTF_ID | PTR_TRUSTED:
case PTR_TO_BTF_ID | MEM_RCU:
- case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
/* When referenced PTR_TO_BTF_ID is passed to release function,
* its fixed offset must be 0. In the other cases, fixed offset
@@ -6671,6 +7349,28 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
}
}
+static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env,
+ const struct bpf_func_proto *fn,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_reg_state *state = NULL;
+ int i;
+
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+ if (arg_type_is_dynptr(fn->arg_type[i])) {
+ if (state) {
+ verbose(env, "verifier internal error: multiple dynptr args\n");
+ return NULL;
+ }
+ state = &regs[BPF_REG_1 + i];
+ }
+
+ if (!state)
+ verbose(env, "verifier internal error: no dynptr arg found\n");
+
+ return state;
+}
+
static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
@@ -6697,9 +7397,28 @@ static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state
return state->stack[spi].spilled_ptr.ref_obj_id;
}
+static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->dynptr.type;
+
+ spi = __get_spi(reg->off);
+ if (spi < 0) {
+ verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
+ return BPF_DYNPTR_TYPE_INVALID;
+ }
+
+ return state->stack[spi].spilled_ptr.dynptr.type;
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
- const struct bpf_func_proto *fn)
+ const struct bpf_func_proto *fn,
+ int insn_idx)
{
u32 regno = BPF_REG_1 + arg;
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
@@ -6912,7 +7631,7 @@ skip_type_check:
err = check_mem_size_reg(env, reg, regno, true, meta);
break;
case ARG_PTR_TO_DYNPTR:
- err = process_dynptr_func(env, regno, arg_type, meta);
+ err = process_dynptr_func(env, regno, insn_idx, arg_type);
if (err)
return err;
break;
@@ -7131,22 +7850,26 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
break;
case BPF_MAP_TYPE_SK_STORAGE:
if (func_id != BPF_FUNC_sk_storage_get &&
- func_id != BPF_FUNC_sk_storage_delete)
+ func_id != BPF_FUNC_sk_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_INODE_STORAGE:
if (func_id != BPF_FUNC_inode_storage_get &&
- func_id != BPF_FUNC_inode_storage_delete)
+ func_id != BPF_FUNC_inode_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_TASK_STORAGE:
if (func_id != BPF_FUNC_task_storage_get &&
- func_id != BPF_FUNC_task_storage_delete)
+ func_id != BPF_FUNC_task_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_CGRP_STORAGE:
if (func_id != BPF_FUNC_cgrp_storage_get &&
- func_id != BPF_FUNC_cgrp_storage_delete)
+ func_id != BPF_FUNC_cgrp_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_BLOOM_FILTER:
@@ -7360,6 +8083,9 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
* are now invalid, so turn them into unknown SCALAR_VALUE.
+ *
+ * This also applies to dynptr slices belonging to skb and xdp dynptrs,
+ * since these slices point to packet data.
*/
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
@@ -7367,8 +8093,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
struct bpf_reg_state *reg;
bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
- if (reg_is_pkt_pointer_any(reg))
- __mark_reg_unknown(env, reg);
+ if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg))
+ mark_reg_invalid(env, reg);
}));
}
@@ -7413,12 +8139,8 @@ static int release_reference(struct bpf_verifier_env *env,
return err;
bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
- if (reg->ref_obj_id == ref_obj_id) {
- if (!env->allow_ptr_leaks)
- __mark_reg_not_init(env, reg);
- else
- __mark_reg_unknown(env, reg);
- }
+ if (reg->ref_obj_id == ref_obj_id)
+ mark_reg_invalid(env, reg);
}));
return 0;
@@ -7431,7 +8153,7 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env)
bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
if (type_is_non_owning_ref(reg->type))
- __mark_reg_unknown(env, reg);
+ mark_reg_invalid(env, reg);
}));
}
@@ -7793,10 +8515,10 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
struct bpf_func_state *callee,
int insn_idx)
{
- /* void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ /* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
* bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b));
*
- * 'struct bpf_rb_node *node' arg to bpf_rbtree_add is the same PTR_TO_BTF_ID w/ offset
+ * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset
* that 'less' callback args will be receiving. However, 'node' arg was release_reference'd
* by this point, so look at 'root'
*/
@@ -8202,7 +8924,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
meta.func_id = func_id;
/* check args */
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
- err = check_func_arg(env, i, &meta, fn);
+ err = check_func_arg(env, i, &meta, fn, insn_idx);
if (err)
return err;
}
@@ -8227,30 +8949,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs = cur_regs(env);
- /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
- * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr
- * is safe to do directly.
- */
- if (meta.uninit_dynptr_regno) {
- if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) {
- verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n");
- return -EFAULT;
- }
- /* we write BPF_DW bits (8 bytes) at a time */
- for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
- err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno,
- i, BPF_DW, BPF_WRITE, -1, false);
- if (err)
- return err;
- }
-
- err = mark_stack_slots_dynptr(env, &regs[meta.uninit_dynptr_regno],
- fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1],
- insn_idx);
- if (err)
- return err;
- }
-
if (meta.release_regno) {
err = -EINVAL;
/* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
@@ -8335,43 +9033,62 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
break;
case BPF_FUNC_dynptr_data:
- for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
- if (arg_type_is_dynptr(fn->arg_type[i])) {
- struct bpf_reg_state *reg = &regs[BPF_REG_1 + i];
- int id, ref_obj_id;
+ {
+ struct bpf_reg_state *reg;
+ int id, ref_obj_id;
- if (meta.dynptr_id) {
- verbose(env, "verifier internal error: meta.dynptr_id already set\n");
- return -EFAULT;
- }
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
+ return -EFAULT;
- if (meta.ref_obj_id) {
- verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
- return -EFAULT;
- }
- id = dynptr_id(env, reg);
- if (id < 0) {
- verbose(env, "verifier internal error: failed to obtain dynptr id\n");
- return id;
- }
+ if (meta.dynptr_id) {
+ verbose(env, "verifier internal error: meta.dynptr_id already set\n");
+ return -EFAULT;
+ }
+ if (meta.ref_obj_id) {
+ verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
+ return -EFAULT;
+ }
- ref_obj_id = dynptr_ref_obj_id(env, reg);
- if (ref_obj_id < 0) {
- verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
- return ref_obj_id;
- }
+ id = dynptr_id(env, reg);
+ if (id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ return id;
+ }
- meta.dynptr_id = id;
- meta.ref_obj_id = ref_obj_id;
- break;
- }
+ ref_obj_id = dynptr_ref_obj_id(env, reg);
+ if (ref_obj_id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
+ return ref_obj_id;
}
- if (i == MAX_BPF_FUNC_REG_ARGS) {
- verbose(env, "verifier internal error: no dynptr in bpf_dynptr_data()\n");
+
+ meta.dynptr_id = id;
+ meta.ref_obj_id = ref_obj_id;
+
+ break;
+ }
+ case BPF_FUNC_dynptr_write:
+ {
+ enum bpf_dynptr_type dynptr_type;
+ struct bpf_reg_state *reg;
+
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
return -EFAULT;
- }
+
+ dynptr_type = dynptr_get_type(env, reg);
+ if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
+ return -EFAULT;
+
+ if (dynptr_type == BPF_DYNPTR_TYPE_SKB)
+ /* this will trigger clear_all_pkt_pointers(), which will
+ * invalidate all dynptr slices associated with the skb
+ */
+ changes_data = true;
+
break;
+ }
case BPF_FUNC_user_ringbuf_drain:
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_user_ringbuf_callback_state);
@@ -8484,6 +9201,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (func_id == BPF_FUNC_kptr_xchg) {
ret_btf = meta.kptr_field->kptr.btf;
ret_btf_id = meta.kptr_field->kptr.btf_id;
+ if (!btf_is_kernel(ret_btf))
+ regs[BPF_REG_0].type |= MEM_ALLOC;
} else {
if (fn->ret_btf_id == BPF_PTR_POISON) {
verbose(env, "verifier internal error:");
@@ -8600,36 +9319,6 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
}
}
-struct bpf_kfunc_call_arg_meta {
- /* In parameters */
- struct btf *btf;
- u32 func_id;
- u32 kfunc_flags;
- const struct btf_type *func_proto;
- const char *func_name;
- /* Out parameters */
- u32 ref_obj_id;
- u8 release_regno;
- bool r0_rdonly;
- u32 ret_btf_id;
- u64 r0_size;
- u32 subprogno;
- struct {
- u64 value;
- bool found;
- } arg_constant;
- struct {
- struct btf *btf;
- u32 btf_id;
- } arg_obj_drop;
- struct {
- struct btf_field *field;
- } arg_list_head;
- struct {
- struct btf_field *field;
- } arg_rbtree_root;
-};
-
static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
{
return meta->kfunc_flags & KF_ACQUIRE;
@@ -8647,7 +9336,7 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
{
- return meta->kfunc_flags & KF_TRUSTED_ARGS;
+ return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta);
}
static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
@@ -8665,11 +9354,6 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_RCU;
}
-static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg)
-{
- return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET);
-}
-
static bool __kfunc_param_match_suffix(const struct btf *btf,
const struct btf_param *arg,
const char *suffix)
@@ -8701,6 +9385,19 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
return __kfunc_param_match_suffix(btf, arg, "__sz");
}
+static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ return __kfunc_param_match_suffix(btf, arg, "__szk");
+}
+
static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
{
return __kfunc_param_match_suffix(btf, arg, "__k");
@@ -8716,6 +9413,16 @@ static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param
return __kfunc_param_match_suffix(btf, arg, "__alloc");
}
+static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__uninit");
+}
+
+static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr");
+}
+
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
const struct btf_param *arg,
const char *name)
@@ -8855,14 +9562,15 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_CTX,
- KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
- KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */
+ KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
+ KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */
KF_ARG_PTR_TO_DYNPTR,
+ KF_ARG_PTR_TO_ITER,
KF_ARG_PTR_TO_LIST_HEAD,
KF_ARG_PTR_TO_LIST_NODE,
- KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
+ KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
KF_ARG_PTR_TO_MEM,
- KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */
+ KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */
KF_ARG_PTR_TO_CALLBACK,
KF_ARG_PTR_TO_RB_ROOT,
KF_ARG_PTR_TO_RB_NODE,
@@ -8871,8 +9579,9 @@ enum kfunc_ptr_arg_type {
enum special_kfunc_type {
KF_bpf_obj_new_impl,
KF_bpf_obj_drop_impl,
- KF_bpf_list_push_front,
- KF_bpf_list_push_back,
+ KF_bpf_refcount_acquire_impl,
+ KF_bpf_list_push_front_impl,
+ KF_bpf_list_push_back_impl,
KF_bpf_list_pop_front,
KF_bpf_list_pop_back,
KF_bpf_cast_to_kern_ctx,
@@ -8880,29 +9589,39 @@ enum special_kfunc_type {
KF_bpf_rcu_read_lock,
KF_bpf_rcu_read_unlock,
KF_bpf_rbtree_remove,
- KF_bpf_rbtree_add,
+ KF_bpf_rbtree_add_impl,
KF_bpf_rbtree_first,
+ KF_bpf_dynptr_from_skb,
+ KF_bpf_dynptr_from_xdp,
+ KF_bpf_dynptr_slice,
+ KF_bpf_dynptr_slice_rdwr,
};
BTF_SET_START(special_kfunc_set)
BTF_ID(func, bpf_obj_new_impl)
BTF_ID(func, bpf_obj_drop_impl)
-BTF_ID(func, bpf_list_push_front)
-BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_refcount_acquire_impl)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
BTF_ID(func, bpf_list_pop_front)
BTF_ID(func, bpf_list_pop_back)
BTF_ID(func, bpf_cast_to_kern_ctx)
BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rbtree_remove)
-BTF_ID(func, bpf_rbtree_add)
+BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
BTF_ID(func, bpf_obj_new_impl)
BTF_ID(func, bpf_obj_drop_impl)
-BTF_ID(func, bpf_list_push_front)
-BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_refcount_acquire_impl)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
BTF_ID(func, bpf_list_pop_front)
BTF_ID(func, bpf_list_pop_back)
BTF_ID(func, bpf_cast_to_kern_ctx)
@@ -8910,8 +9629,12 @@ BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rcu_read_lock)
BTF_ID(func, bpf_rcu_read_unlock)
BTF_ID(func, bpf_rbtree_remove)
-BTF_ID(func, bpf_rbtree_add)
+BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -8949,24 +9672,15 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_ALLOC_BTF_ID;
- if (is_kfunc_arg_kptr_get(meta, argno)) {
- if (!btf_type_is_ptr(ref_t)) {
- verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n");
- return -EINVAL;
- }
- ref_t = btf_type_by_id(meta->btf, ref_t->type);
- ref_tname = btf_name_by_offset(meta->btf, ref_t->name_off);
- if (!btf_type_is_struct(ref_t)) {
- verbose(env, "kernel function %s args#0 pointer type %s %s is not supported\n",
- meta->func_name, btf_type_str(ref_t), ref_tname);
- return -EINVAL;
- }
- return KF_ARG_PTR_TO_KPTR;
- }
+ if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_REFCOUNTED_KPTR;
if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_DYNPTR;
+ if (is_kfunc_arg_iter(meta, argno))
+ return KF_ARG_PTR_TO_ITER;
+
if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_LIST_HEAD;
@@ -8991,7 +9705,10 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
return KF_ARG_PTR_TO_CALLBACK;
- if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))
+
+ if (argno + 1 < nargs &&
+ (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
+ is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
arg_mem_size = true;
/* This is the catch all argument type of register types supported by
@@ -9071,40 +9788,6 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
return 0;
}
-static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg,
- const struct btf_type *ref_t,
- const char *ref_tname,
- struct bpf_kfunc_call_arg_meta *meta,
- int argno)
-{
- struct btf_field *kptr_field;
-
- /* check_func_arg_reg_off allows var_off for
- * PTR_TO_MAP_VALUE, but we need fixed offset to find
- * off_desc.
- */
- if (!tnum_is_const(reg->var_off)) {
- verbose(env, "arg#0 must have constant offset\n");
- return -EINVAL;
- }
-
- kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR);
- if (!kptr_field || kptr_field->type != BPF_KPTR_REF) {
- verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n",
- reg->off + reg->var_off.value);
- return -EINVAL;
- }
-
- if (!btf_struct_ids_match(&env->log, meta->btf, ref_t->type, 0, kptr_field->kptr.btf,
- kptr_field->kptr.btf_id, true)) {
- verbose(env, "kernel function %s args#%d expected pointer to %s %s\n",
- meta->func_name, argno, btf_type_str(ref_t), ref_tname);
- return -EINVAL;
- }
- return 0;
-}
-
static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -9211,7 +9894,6 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
ptr = reg->map_ptr;
break;
case PTR_TO_BTF_ID | MEM_ALLOC:
- case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
ptr = reg->btf;
break;
default:
@@ -9232,27 +9914,28 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
static bool is_bpf_list_api_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_list_push_front] ||
- btf_id == special_kfunc_list[KF_bpf_list_push_back] ||
+ return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
btf_id == special_kfunc_list[KF_bpf_list_pop_back];
}
static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_rbtree_add] ||
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
btf_id == special_kfunc_list[KF_bpf_rbtree_first];
}
static bool is_bpf_graph_api_kfunc(u32 btf_id)
{
- return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id);
+ return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) ||
+ btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
}
static bool is_callback_calling_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_rbtree_add];
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
}
static bool is_rbtree_lock_required_kfunc(u32 btf_id)
@@ -9293,12 +9976,12 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
switch (node_field_type) {
case BPF_LIST_NODE:
- ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front] ||
- kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back]);
+ ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]);
break;
case BPF_RB_NODE:
ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
- kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add]);
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]);
break;
default:
verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
@@ -9460,11 +10143,13 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
&meta->arg_rbtree_root.field);
}
-static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta)
+static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
+ int insn_idx)
{
const char *func_name = meta->func_name, *ref_tname;
const struct btf *btf = meta->btf;
const struct btf_param *args;
+ struct btf_record *rec;
u32 i, nargs;
int ret;
@@ -9543,7 +10228,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
- if (is_kfunc_trusted_args(meta) &&
+ if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
(register_is_null(reg) || type_may_be_null(reg->type))) {
verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
return -EACCES;
@@ -9590,8 +10275,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
/* Trusted arguments have the same offset checks as release arguments */
arg_type |= OBJ_RELEASE;
break;
- case KF_ARG_PTR_TO_KPTR:
case KF_ARG_PTR_TO_DYNPTR:
+ case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
case KF_ARG_PTR_TO_LIST_NODE:
case KF_ARG_PTR_TO_RB_ROOT:
@@ -9599,6 +10284,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_MEM:
case KF_ARG_PTR_TO_MEM_SIZE:
case KF_ARG_PTR_TO_CALLBACK:
+ case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
/* Trusted by default */
break;
default:
@@ -9641,23 +10327,46 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
meta->arg_obj_drop.btf_id = reg->btf_id;
}
break;
- case KF_ARG_PTR_TO_KPTR:
- if (reg->type != PTR_TO_MAP_VALUE) {
- verbose(env, "arg#0 expected pointer to map value\n");
- return -EINVAL;
- }
- ret = process_kf_arg_ptr_to_kptr(env, reg, ref_t, ref_tname, meta, i);
- if (ret < 0)
- return ret;
- break;
case KF_ARG_PTR_TO_DYNPTR:
+ {
+ enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
+
if (reg->type != PTR_TO_STACK &&
reg->type != CONST_PTR_TO_DYNPTR) {
verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
return -EINVAL;
}
- ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL);
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ dynptr_arg_type |= MEM_RDONLY;
+
+ if (is_kfunc_arg_uninit(btf, &args[i]))
+ dynptr_arg_type |= MEM_UNINIT;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb])
+ dynptr_arg_type |= DYNPTR_TYPE_SKB;
+ else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp])
+ dynptr_arg_type |= DYNPTR_TYPE_XDP;
+
+ ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
+ if (ret < 0)
+ return ret;
+
+ if (!(dynptr_arg_type & MEM_UNINIT)) {
+ int id = dynptr_id(env, reg);
+
+ if (id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ return id;
+ }
+ meta->initialized_dynptr.id = id;
+ meta->initialized_dynptr.type = dynptr_get_type(env, reg);
+ }
+
+ break;
+ }
+ case KF_ARG_PTR_TO_ITER:
+ ret = process_iter_arg(env, regno, insn_idx, meta);
if (ret < 0)
return ret;
break;
@@ -9754,17 +10463,59 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return ret;
break;
case KF_ARG_PTR_TO_MEM_SIZE:
- ret = check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1);
+ {
+ struct bpf_reg_state *size_reg = &regs[regno + 1];
+ const struct btf_param *size_arg = &args[i + 1];
+
+ ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
if (ret < 0) {
verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
return ret;
}
- /* Skip next '__sz' argument */
+
+ if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
+ if (meta->arg_constant.found) {
+ verbose(env, "verifier internal error: only one constant argument permitted\n");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(size_reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno + 1);
+ return -EINVAL;
+ }
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = size_reg->var_off.value;
+ }
+
+ /* Skip next '__sz' or '__szk' argument */
i++;
break;
+ }
case KF_ARG_PTR_TO_CALLBACK:
meta->subprogno = reg->subprogno;
break;
+ case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
+ if (!type_is_ptr_alloc_obj(reg->type) && !type_is_non_owning_ref(reg->type)) {
+ verbose(env, "arg#%d is neither owning or non-owning ref\n", i);
+ return -EINVAL;
+ }
+
+ rec = reg_btf_record(reg);
+ if (!rec) {
+ verbose(env, "verifier internal error: Couldn't find btf_record\n");
+ return -EFAULT;
+ }
+
+ if (rec->refcount_off < 0) {
+ verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i);
+ return -EINVAL;
+ }
+ if (rec->refcount_off >= 0) {
+ verbose(env, "bpf_refcount_acquire calls are disabled for now\n");
+ return -EINVAL;
+ }
+ meta->arg_refcount_acquire.btf = reg->btf;
+ meta->arg_refcount_acquire.btf_id = reg->btf_id;
+ break;
}
}
@@ -9777,24 +10528,21 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return 0;
}
-static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx_p)
+static int fetch_kfunc_meta(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const char **kfunc_name)
{
- const struct btf_type *t, *func, *func_proto, *ptr_type;
- u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id;
- struct bpf_reg_state *regs = cur_regs(env);
- const char *func_name, *ptr_type_name;
- bool sleepable, rcu_lock, rcu_unlock;
- struct bpf_kfunc_call_arg_meta meta;
- int err, insn_idx = *insn_idx_p;
- const struct btf_param *args;
- const struct btf_type *ret_t;
+ const struct btf_type *func, *func_proto;
+ u32 func_id, *kfunc_flags;
+ const char *func_name;
struct btf *desc_btf;
- u32 *kfunc_flags;
- /* skip for now, but return error when we find this in fixup_kfunc_call */
+ if (kfunc_name)
+ *kfunc_name = NULL;
+
if (!insn->imm)
- return 0;
+ return -EINVAL;
desc_btf = find_kfunc_desc_btf(env, insn->off);
if (IS_ERR(desc_btf))
@@ -9803,22 +10551,53 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
func_id = insn->imm;
func = btf_type_by_id(desc_btf, func_id);
func_name = btf_name_by_offset(desc_btf, func->name_off);
+ if (kfunc_name)
+ *kfunc_name = func_name;
func_proto = btf_type_by_id(desc_btf, func->type);
kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id);
if (!kfunc_flags) {
- verbose(env, "calling kernel function %s is not allowed\n",
- func_name);
return -EACCES;
}
- /* Prepare kfunc call metadata */
- memset(&meta, 0, sizeof(meta));
- meta.btf = desc_btf;
- meta.func_id = func_id;
- meta.kfunc_flags = *kfunc_flags;
- meta.func_proto = func_proto;
- meta.func_name = func_name;
+ memset(meta, 0, sizeof(*meta));
+ meta->btf = desc_btf;
+ meta->func_id = func_id;
+ meta->kfunc_flags = *kfunc_flags;
+ meta->func_proto = func_proto;
+ meta->func_name = func_name;
+
+ return 0;
+}
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
+{
+ const struct btf_type *t, *ptr_type;
+ u32 i, nargs, ptr_type_id, release_ref_obj_id;
+ struct bpf_reg_state *regs = cur_regs(env);
+ const char *func_name, *ptr_type_name;
+ bool sleepable, rcu_lock, rcu_unlock;
+ struct bpf_kfunc_call_arg_meta meta;
+ struct bpf_insn_aux_data *insn_aux;
+ int err, insn_idx = *insn_idx_p;
+ const struct btf_param *args;
+ const struct btf_type *ret_t;
+ struct btf *desc_btf;
+
+ /* skip for now, but return error when we find this in fixup_kfunc_call */
+ if (!insn->imm)
+ return 0;
+
+ err = fetch_kfunc_meta(env, insn, &meta, &func_name);
+ if (err == -EACCES && func_name)
+ verbose(env, "calling kernel function %s is not allowed\n", func_name);
+ if (err)
+ return err;
+ desc_btf = meta.btf;
+ insn_aux = &env->insn_aux_data[insn_idx];
+
+ insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
@@ -9833,10 +10612,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
- if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) {
- verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name);
- return -EACCES;
- }
if (env->cur_state->active_rcu_lock) {
struct bpf_func_state *state;
@@ -9865,7 +10640,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
/* Check the arguments */
- err = check_kfunc_args(env, &meta);
+ err = check_kfunc_args(env, &meta, insn_idx);
if (err < 0)
return err;
/* In case of release function, we get register number of refcounted
@@ -9875,36 +10650,37 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
err = release_reference(env, regs[meta.release_regno].ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
- func_name, func_id);
+ func_name, meta.func_id);
return err;
}
}
- if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front] ||
- meta.func_id == special_kfunc_list[KF_bpf_list_push_back] ||
- meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
+ insn_aux->insert_off = regs[BPF_REG_2].off;
err = ref_convert_owning_non_owning(env, release_ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
- func_name, func_id);
+ func_name, meta.func_id);
return err;
}
err = release_reference(env, release_ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
- func_name, func_id);
+ func_name, meta.func_id);
return err;
}
}
- if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_rbtree_add_callback_state);
if (err) {
verbose(env, "kfunc %s#%d failed callback verification\n",
- func_name, func_id);
+ func_name, meta.func_id);
return err;
}
}
@@ -9913,11 +10689,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
mark_reg_not_init(env, regs, caller_saved[i]);
/* Check return type */
- t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
+ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
/* Only exception is bpf_obj_new_impl */
- if (meta.btf != btf_vmlinux || meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl]) {
+ if (meta.btf != btf_vmlinux ||
+ (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
+ meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
return -EINVAL;
}
@@ -9962,13 +10740,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].btf = ret_btf;
regs[BPF_REG_0].btf_id = ret_btf_id;
- env->insn_aux_data[insn_idx].obj_new_size = ret_t->size;
- env->insn_aux_data[insn_idx].kptr_struct_meta =
+ insn_aux->obj_new_size = ret_t->size;
+ insn_aux->kptr_struct_meta =
btf_find_struct_meta(ret_btf, ret_btf_id);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
- env->insn_aux_data[insn_idx].kptr_struct_meta =
- btf_find_struct_meta(meta.arg_obj_drop.btf,
- meta.arg_obj_drop.btf_id);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = meta.arg_refcount_acquire.btf;
+ regs[BPF_REG_0].btf_id = meta.arg_refcount_acquire.btf_id;
+
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_refcount_acquire.btf,
+ meta.arg_refcount_acquire.btf_id);
} else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
struct btf_field *field = meta.arg_list_head.field;
@@ -9996,6 +10779,42 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
regs[BPF_REG_0].btf = desc_btf;
regs[BPF_REG_0].btf_id = meta.arg_constant.value;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
+ meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
+ enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type);
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+
+ if (!meta.arg_constant.found) {
+ verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
+ return -EFAULT;
+ }
+
+ regs[BPF_REG_0].mem_size = meta.arg_constant.value;
+
+ /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
+ regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+ } else {
+ /* this will set env->seen_direct_write to true */
+ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
+ verbose(env, "the prog does not allow writes to packet data\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!meta.initialized_dynptr.id) {
+ verbose(env, "verifier internal error: no dynptr id\n");
+ return -EFAULT;
+ }
+ regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id;
+
+ /* we don't need to set BPF_REG_0's ref obj id
+ * because packet slices are not refcounted (see
+ * dynptr_type_refcounted)
+ */
} else {
verbose(env, "kernel function %s unhandled dynamic return type\n",
meta.func_name);
@@ -10003,6 +10822,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
} else if (!__btf_type_is_struct(ptr_type)) {
if (!meta.r0_size) {
+ __u32 sz;
+
+ if (!IS_ERR(btf_resolve_size(desc_btf, ptr_type, &sz))) {
+ meta.r0_size = sz;
+ meta.r0_rdonly = true;
+ }
+ }
+ if (!meta.r0_size) {
ptr_type_name = btf_name_by_offset(desc_btf,
ptr_type->name_off);
verbose(env,
@@ -10048,15 +10875,20 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
ref_set_non_owning(env, &regs[BPF_REG_0]);
}
- if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove])
- invalidate_non_owning_refs(env);
-
if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
regs[BPF_REG_0].id = ++env->id_gen;
- } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
+ } else if (btf_type_is_void(t)) {
+ if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_obj_drop.btf,
+ meta.arg_obj_drop.btf_id);
+ }
+ }
+ }
- nargs = btf_type_vlen(func_proto);
- args = (const struct btf_param *)(func_proto + 1);
+ nargs = btf_type_vlen(meta.func_proto);
+ args = (const struct btf_param *)(meta.func_proto + 1);
for (i = 0; i < nargs; i++) {
u32 regno = i + 1;
@@ -10068,6 +10900,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
mark_btf_func_reg_size(env, regno, t->size);
}
+ if (is_iter_next_kfunc(&meta)) {
+ err = process_iter_next_call(env, insn_idx, &meta);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -11601,12 +12439,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
+ bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX;
+
+ if (is_src_reg_u32 && !src_reg->id)
+ src_reg->id = ++env->id_gen;
copy_register_state(dst_reg, src_reg);
- /* Make sure ID is cleared otherwise
+ /* Make sure ID is cleared if src_reg is not in u32 range otherwise
* dst_reg min/max could be incorrectly
* propagated into src_reg by find_equal_scalars()
*/
- dst_reg->id = 0;
+ if (!is_src_reg_u32)
+ dst_reg->id = 0;
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = env->insn_idx + 1;
} else {
@@ -11774,10 +12617,14 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
case BPF_JEQ:
if (tnum_is_const(subreg))
return !!tnum_equals_const(subreg, val);
+ else if (val < reg->u32_min_value || val > reg->u32_max_value)
+ return 0;
break;
case BPF_JNE:
if (tnum_is_const(subreg))
return !tnum_equals_const(subreg, val);
+ else if (val < reg->u32_min_value || val > reg->u32_max_value)
+ return 1;
break;
case BPF_JSET:
if ((~subreg.mask & subreg.value) & val)
@@ -11847,10 +12694,14 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
case BPF_JEQ:
if (tnum_is_const(reg->var_off))
return !!tnum_equals_const(reg->var_off, val);
+ else if (val < reg->umin_value || val > reg->umax_value)
+ return 0;
break;
case BPF_JNE:
if (tnum_is_const(reg->var_off))
return !tnum_equals_const(reg->var_off, val);
+ else if (val < reg->umin_value || val > reg->umax_value)
+ return 1;
break;
case BPF_JSET:
if ((~reg->var_off.mask & reg->var_off.value) & val)
@@ -12471,6 +13322,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
src_reg->var_off.value,
opcode,
is_jmp32);
+ } else if (dst_reg->type == SCALAR_VALUE &&
+ is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) {
+ pred = is_branch_taken(src_reg,
+ tnum_subreg(dst_reg->var_off).value,
+ flip_opcode(opcode),
+ is_jmp32);
+ } else if (dst_reg->type == SCALAR_VALUE &&
+ !is_jmp32 && tnum_is_const(dst_reg->var_off)) {
+ pred = is_branch_taken(src_reg,
+ dst_reg->var_off.value,
+ flip_opcode(opcode),
+ is_jmp32);
} else if (reg_is_pkt_pointer_any(dst_reg) &&
reg_is_pkt_pointer_any(src_reg) &&
!is_jmp32) {
@@ -12971,6 +13834,9 @@ static int check_return_code(struct bpf_verifier_env *env)
}
break;
+ case BPF_PROG_TYPE_NETFILTER:
+ range = tnum_range(NF_DROP, NF_ACCEPT);
+ break;
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
* depends on the to-be-replaced kernel func or bpf program.
@@ -13065,6 +13931,17 @@ static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
return env->insn_aux_data[insn_idx].prune_point;
}
+static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].force_checkpoint = true;
+}
+
+static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].force_checkpoint;
+}
+
+
enum {
DONE_EXPLORING = 0,
KEEP_EXPLORING = 1,
@@ -13157,44 +14034,63 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
*/
static int visit_insn(int t, struct bpf_verifier_env *env)
{
- struct bpf_insn *insns = env->prog->insnsi;
+ struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
int ret;
- if (bpf_pseudo_func(insns + t))
+ if (bpf_pseudo_func(insn))
return visit_func_call_insn(t, insns, env, true);
/* All non-branch instructions have a single fall-through edge. */
- if (BPF_CLASS(insns[t].code) != BPF_JMP &&
- BPF_CLASS(insns[t].code) != BPF_JMP32)
+ if (BPF_CLASS(insn->code) != BPF_JMP &&
+ BPF_CLASS(insn->code) != BPF_JMP32)
return push_insn(t, t + 1, FALLTHROUGH, env, false);
- switch (BPF_OP(insns[t].code)) {
+ switch (BPF_OP(insn->code)) {
case BPF_EXIT:
return DONE_EXPLORING;
case BPF_CALL:
- if (insns[t].imm == BPF_FUNC_timer_set_callback)
+ if (insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback)
/* Mark this call insn as a prune point to trigger
* is_state_visited() check before call itself is
* processed by __check_func_call(). Otherwise new
* async state will be pushed for further exploration.
*/
mark_prune_point(env, t);
- return visit_func_call_insn(t, insns, env,
- insns[t].src_reg == BPF_PSEUDO_CALL);
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ struct bpf_kfunc_call_arg_meta meta;
+
+ ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+ if (ret == 0 && is_iter_next_kfunc(&meta)) {
+ mark_prune_point(env, t);
+ /* Checking and saving state checkpoints at iter_next() call
+ * is crucial for fast convergence of open-coded iterator loop
+ * logic, so we need to force it. If we don't do that,
+ * is_state_visited() might skip saving a checkpoint, causing
+ * unnecessarily long sequence of not checkpointed
+ * instructions and jumps, leading to exhaustion of jump
+ * history buffer, and potentially other undesired outcomes.
+ * It is expected that with correct open-coded iterators
+ * convergence will happen quickly, so we don't run a risk of
+ * exhausting memory.
+ */
+ mark_force_checkpoint(env, t);
+ }
+ }
+ return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
- if (BPF_SRC(insns[t].code) != BPF_K)
+ if (BPF_SRC(insn->code) != BPF_K)
return -EINVAL;
/* unconditional jump with single edge */
- ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env,
+ ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env,
true);
if (ret)
return ret;
- mark_prune_point(env, t + insns[t].off + 1);
- mark_jmp_point(env, t + insns[t].off + 1);
+ mark_prune_point(env, t + insn->off + 1);
+ mark_jmp_point(env, t + insn->off + 1);
return ret;
@@ -13206,7 +14102,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
if (ret)
return ret;
- return push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
+ return push_insn(t, t + insn->off + 1, BRANCH, env, true);
}
}
@@ -13827,7 +14723,7 @@ static bool regs_exact(const struct bpf_reg_state *rold,
const struct bpf_reg_state *rcur,
struct bpf_id_pair *idmap)
{
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
check_ids(rold->id, rcur->id, idmap) &&
check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
}
@@ -13882,13 +14778,17 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
tnum_in(rold->var_off, rcur->var_off);
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_BUF:
+ case PTR_TO_TP_BUFFER:
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
*/
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off) &&
- check_ids(rold->id, rcur->id, idmap);
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
/* We must have at least as much range as the old ptr
@@ -13930,6 +14830,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
* didn't use them
*/
for (i = 0; i < old->allocated_stack; i++) {
+ struct bpf_reg_state *old_reg, *cur_reg;
+
spi = i / BPF_REG_SIZE;
if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
@@ -13986,9 +14888,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
return false;
break;
case STACK_DYNPTR:
- {
- const struct bpf_reg_state *old_reg, *cur_reg;
-
old_reg = &old->stack[spi].spilled_ptr;
cur_reg = &cur->stack[spi].spilled_ptr;
if (old_reg->dynptr.type != cur_reg->dynptr.type ||
@@ -13996,7 +14895,22 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
return false;
break;
- }
+ case STACK_ITER:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ /* iter.depth is not compared between states as it
+ * doesn't matter for correctness and would otherwise
+ * prevent convergence; we maintain it only to prevent
+ * infinite loop check triggering, see
+ * iter_active_depths_differ()
+ */
+ if (old_reg->iter.btf != cur_reg->iter.btf ||
+ old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+ old_reg->iter.state != cur_reg->iter.state ||
+ /* ignore {old_reg,cur_reg}->iter.depth, see above */
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
case STACK_MISC:
case STACK_ZERO:
case STACK_INVALID:
@@ -14210,10 +15124,11 @@ static int propagate_precision(struct bpf_verifier_env *env,
state_reg = state->regs;
for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
+ !state_reg->precise ||
+ !(state_reg->live & REG_LIVE_READ))
continue;
if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "frame %d: propagating r%d\n", i, fr);
+ verbose(env, "frame %d: propagating r%d\n", fr, i);
err = mark_chain_precision_frame(env, fr, i);
if (err < 0)
return err;
@@ -14224,11 +15139,12 @@ static int propagate_precision(struct bpf_verifier_env *env,
continue;
state_reg = &state->stack[i].spilled_ptr;
if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
+ !state_reg->precise ||
+ !(state_reg->live & REG_LIVE_READ))
continue;
if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "frame %d: propagating fp%d\n",
- (-i - 1) * BPF_REG_SIZE, fr);
+ fr, (-i - 1) * BPF_REG_SIZE);
err = mark_chain_precision_stack_frame(env, fr, i);
if (err < 0)
return err;
@@ -14255,6 +15171,92 @@ static bool states_maybe_looping(struct bpf_verifier_state *old,
return true;
}
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].is_iter_next;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "ininite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ * 0: again: ; set up iter_next() call args
+ * 1: r1 = &it ; <CHECKPOINT HERE>
+ * 2: call bpf_iter_num_next ; this is iter_next() call
+ * 3: if r0 == 0 goto done
+ * 4: ... something useful here ...
+ * 5: goto again ; another iteration
+ * 6: done:
+ * 7: r1 = &it
+ * 8: call bpf_iter_num_destroy ; clean up iter state
+ * 9: exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in a way).
+ *
+ * When we first time come to 1:, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows to keep infinite loop heuristic even in the face of
+ * active iterator. E.g., C snippet below is and will be detected as
+ * inifintely looping:
+ *
+ * struct bpf_iter_num it;
+ * int *p, x;
+ *
+ * bpf_iter_num_new(&it, 0, 10);
+ * while ((p = bpf_iter_num_next(&t))) {
+ * x = p;
+ * while (x--) {} // <<-- infinite loop here
+ * }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
+{
+ struct bpf_reg_state *slot, *cur_slot;
+ struct bpf_func_state *state;
+ int i, fr;
+
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_ITER)
+ continue;
+
+ slot = &state->stack[i].spilled_ptr;
+ if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+ continue;
+
+ cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+ if (cur_slot->iter.depth != slot->iter.depth)
+ return true;
+ }
+ }
+ return false;
+}
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
@@ -14262,7 +15264,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new;
int i, j, err, states_cnt = 0;
- bool add_new_state = env->test_state_freq ? true : false;
+ bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
+ bool add_new_state = force_new_state;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -14302,8 +15305,46 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* Since the verifier still needs to catch infinite loops
* inside async callbacks.
*/
- } else if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur)) {
+ goto skip_inf_loop_check;
+ }
+ /* BPF open-coded iterators loop detection is special.
+ * states_maybe_looping() logic is too simplistic in detecting
+ * states that *might* be equivalent, because it doesn't know
+ * about ID remapping, so don't even perform it.
+ * See process_iter_next_call() and iter_active_depths_differ()
+ * for overview of the logic. When current and one of parent
+ * states are detected as equivalent, it's a good thing: we prove
+ * convergence and can stop simulating further iterations.
+ * It's safe to assume that iterator loop will finish, taking into
+ * account iter_next() contract of eventually returning
+ * sticky NULL result.
+ */
+ if (is_iter_next_insn(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur)) {
+ struct bpf_func_state *cur_frame;
+ struct bpf_reg_state *iter_state, *iter_reg;
+ int spi;
+
+ cur_frame = cur->frame[cur->curframe];
+ /* btf_check_iter_kfuncs() enforces that
+ * iter state pointer is always the first arg
+ */
+ iter_reg = &cur_frame->regs[BPF_REG_1];
+ /* current state is valid due to states_equal(),
+ * so we can assume valid iter and reg state,
+ * no need for extra (re-)validations
+ */
+ spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+ iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+ if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+ goto hit;
+ }
+ goto skip_inf_loop_check;
+ }
+ /* attempt to detect infinite loop to avoid unnecessary doomed work */
+ if (states_maybe_looping(&sl->state, cur) &&
+ states_equal(env, &sl->state, cur) &&
+ !iter_active_depths_differ(&sl->state, cur)) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
return -EINVAL;
@@ -14320,12 +15361,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* This threshold shouldn't be too high either, since states
* at the end of the loop are likely to be useful in pruning.
*/
- if (env->jmps_processed - env->prev_jmps_processed < 20 &&
+skip_inf_loop_check:
+ if (!force_new_state &&
+ env->jmps_processed - env->prev_jmps_processed < 20 &&
env->insn_processed - env->prev_insn_processed < 100)
add_new_state = false;
goto miss;
}
if (states_equal(env, &sl->state, cur)) {
+hit:
sl->hit_cnt++;
/* reached equivalent register/stack state,
* prune the search.
@@ -14509,6 +15553,44 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
!reg_type_mismatch_ok(prev));
}
+static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
+ bool allow_trust_missmatch)
+{
+ enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type;
+
+ if (*prev_type == NOT_INIT) {
+ /* Saw a valid insn
+ * dst_reg = *(u32 *)(src_reg + off)
+ * save type to validate intersecting paths
+ */
+ *prev_type = type;
+ } else if (reg_type_mismatch(type, *prev_type)) {
+ /* Abuser program is trying to use the same insn
+ * dst_reg = *(u32*) (src_reg + off)
+ * with different pointer types:
+ * src_reg == ctx in one branch and
+ * src_reg == stack|map in some other branch.
+ * Reject it.
+ */
+ if (allow_trust_missmatch &&
+ base_type(type) == PTR_TO_BTF_ID &&
+ base_type(*prev_type) == PTR_TO_BTF_ID) {
+ /*
+ * Have to support a use case when one path through
+ * the program yields TRUSTED pointer while another
+ * is UNTRUSTED. Fallback to UNTRUSTED to generate
+ * BPF_PROBE_MEM.
+ */
+ *prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ } else {
+ verbose(env, "same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int do_check(struct bpf_verifier_env *env)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
@@ -14594,11 +15676,11 @@ static int do_check(struct bpf_verifier_env *env)
print_insn_state(env, state->frame[state->curframe]);
verbose_linfo(env, env->insn_idx, "; ");
- env->prev_log_len = env->log.len_used;
+ env->prev_log_pos = env->log.end_pos;
verbose(env, "%d: ", env->insn_idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
- env->prev_insn_print_len = env->log.len_used - env->prev_log_len;
- env->prev_log_len = env->log.len_used;
+ env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
+ env->prev_log_pos = env->log.end_pos;
}
if (bpf_prog_is_offloaded(env->prog->aux)) {
@@ -14618,7 +15700,7 @@ static int do_check(struct bpf_verifier_env *env)
return err;
} else if (class == BPF_LDX) {
- enum bpf_reg_type *prev_src_type, src_reg_type;
+ enum bpf_reg_type src_reg_type;
/* check for reserved fields is already done */
@@ -14642,29 +15724,11 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type;
-
- if (*prev_src_type == NOT_INIT) {
- /* saw a valid insn
- * dst_reg = *(u32 *)(src_reg + off)
- * save type to validate intersecting paths
- */
- *prev_src_type = src_reg_type;
-
- } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) {
- /* ABuser program is trying to use the same insn
- * dst_reg = *(u32*) (src_reg + off)
- * with different pointer types:
- * src_reg == ctx in one branch and
- * src_reg == stack|map in some other branch.
- * Reject it.
- */
- verbose(env, "same insn cannot be used with different pointers\n");
- return -EINVAL;
- }
-
+ err = save_aux_ptr_type(env, src_reg_type, true);
+ if (err)
+ return err;
} else if (class == BPF_STX) {
- enum bpf_reg_type *prev_dst_type, dst_reg_type;
+ enum bpf_reg_type dst_reg_type;
if (BPF_MODE(insn->code) == BPF_ATOMIC) {
err = check_atomic(env, env->insn_idx, insn);
@@ -14697,16 +15761,12 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type;
-
- if (*prev_dst_type == NOT_INIT) {
- *prev_dst_type = dst_reg_type;
- } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) {
- verbose(env, "same insn cannot be used with different pointers\n");
- return -EINVAL;
- }
-
+ err = save_aux_ptr_type(env, dst_reg_type, false);
+ if (err)
+ return err;
} else if (class == BPF_ST) {
+ enum bpf_reg_type dst_reg_type;
+
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_ST uses reserved fields\n");
@@ -14717,12 +15777,7 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- if (is_ctx_reg(env, insn->dst_reg)) {
- verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
- insn->dst_reg,
- reg_type_str(env, reg_state(env, insn->dst_reg)->type));
- return -EACCES;
- }
+ dst_reg_type = regs[insn->dst_reg].type;
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
@@ -14731,6 +15786,9 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
+ err = save_aux_ptr_type(env, dst_reg_type, false);
+ if (err)
+ return err;
} else if (class == BPF_JMP || class == BPF_JMP32) {
u8 opcode = BPF_OP(insn->code);
@@ -14765,6 +15823,8 @@ static int do_check(struct bpf_verifier_env *env)
err = check_helper_call(env, insn, &env->insn_idx);
if (err)
return err;
+
+ mark_reg_scratched(env, BPF_REG_0);
} else if (opcode == BPF_JA) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
@@ -14939,8 +15999,8 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
goto err_put;
}
- if (!btf_type_is_var(t)) {
- verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id);
+ if (!btf_type_is_var(t) && !btf_type_is_func(t)) {
+ verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);
err = -EINVAL;
goto err_put;
}
@@ -14953,6 +16013,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
err = -ENOENT;
goto err_put;
}
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+
+ if (btf_type_is_func(t)) {
+ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
+ aux->btf_var.mem_size = 0;
+ goto check_btf;
+ }
datasec_id = find_btf_percpu_datasec(btf);
if (datasec_id > 0) {
@@ -14965,9 +16033,6 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
}
}
- insn[0].imm = (u32)addr;
- insn[1].imm = addr >> 32;
-
type = t->type;
t = btf_type_skip_modifiers(btf, type, NULL);
if (percpu) {
@@ -14995,7 +16060,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
}
-
+check_btf:
/* check whether we recorded this BTF (and maybe module) already */
for (i = 0; i < env->used_btf_cnt; i++) {
if (env->used_btfs[i].btf == btf) {
@@ -15839,14 +16904,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
bpf_convert_ctx_access_t convert_ctx_access;
- bool ctx_access;
if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {
type = BPF_READ;
- ctx_access = true;
} else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
@@ -15856,7 +16919,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
type = BPF_WRITE;
- ctx_access = BPF_CLASS(insn->code) == BPF_STX;
} else {
continue;
}
@@ -15879,9 +16941,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
continue;
}
- if (!ctx_access)
- continue;
-
switch ((int)env->insn_aux_data[i + delta].ptr_type) {
case PTR_TO_CTX:
if (!ops->convert_ctx_access)
@@ -16272,11 +17331,62 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
+/* replace a generic kfunc with a specialized version if necessary */
+static void specialize_kfunc(struct bpf_verifier_env *env,
+ u32 func_id, u16 offset, unsigned long *addr)
+{
+ struct bpf_prog *prog = env->prog;
+ bool seen_direct_write;
+ void *xdp_kfunc;
+ bool is_rdonly;
+
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
+ if (xdp_kfunc) {
+ *addr = (unsigned long)xdp_kfunc;
+ return;
+ }
+ /* fallback to default kfunc when not supported by netdev */
+ }
+
+ if (offset)
+ return;
+
+ if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ seen_direct_write = env->seen_direct_write;
+ is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
+
+ if (is_rdonly)
+ *addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
+
+ /* restore env->seen_direct_write to its original value, since
+ * may_access_direct_pkt_data mutates it
+ */
+ env->seen_direct_write = seen_direct_write;
+ }
+}
+
+static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
+ u16 struct_meta_reg,
+ u16 node_offset_reg,
+ struct bpf_insn *insn,
+ struct bpf_insn *insn_buf,
+ int *cnt)
+{
+ struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
+ insn_buf[3] = *insn;
+ *cnt = 4;
+}
+
static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
const struct bpf_kfunc_desc *desc;
- void *xdp_kfunc;
if (!insn->imm) {
verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
@@ -16285,18 +17395,9 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
*cnt = 0;
- if (bpf_dev_bound_kfunc_id(insn->imm)) {
- xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm);
- if (xdp_kfunc) {
- insn->imm = BPF_CALL_IMM(xdp_kfunc);
- return 0;
- }
-
- /* fallback to default kfunc when not supported by netdev */
- }
-
- /* insn->imm has the btf func_id. Replace it with
- * an address (relative to __bpf_call_base).
+ /* insn->imm has the btf func_id. Replace it with an offset relative to
+ * __bpf_call_base, unless the JIT needs to call functions that are
+ * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
*/
desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
if (!desc) {
@@ -16305,7 +17406,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EFAULT;
}
- insn->imm = desc->imm;
+ if (!bpf_jit_supports_far_kfunc_call())
+ insn->imm = BPF_CALL_IMM(desc->addr);
if (insn->off)
return 0;
if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
@@ -16318,7 +17420,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
insn_buf[2] = addr[1];
insn_buf[3] = *insn;
*cnt = 4;
- } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
@@ -16326,6 +17429,20 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
insn_buf[1] = addr[1];
insn_buf[2] = *insn;
*cnt = 3;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ int struct_meta_reg = BPF_REG_3;
+ int node_offset_reg = BPF_REG_4;
+
+ /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
+ if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ struct_meta_reg = BPF_REG_4;
+ node_offset_reg = BPF_REG_5;
+ }
+
+ __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
+ node_offset_reg, insn, insn_buf, cnt);
} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
@@ -16665,21 +17782,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
- (int (*)(struct bpf_map *map, void *key))NULL));
+ (long (*)(struct bpf_map *map, void *key))NULL));
BUILD_BUG_ON(!__same_type(ops->map_update_elem,
- (int (*)(struct bpf_map *map, void *key, void *value,
+ (long (*)(struct bpf_map *map, void *key, void *value,
u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_push_elem,
- (int (*)(struct bpf_map *map, void *value,
+ (long (*)(struct bpf_map *map, void *value,
u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
- (int (*)(struct bpf_map *map, void *value))NULL));
+ (long (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
- (int (*)(struct bpf_map *map, void *value))NULL));
+ (long (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_redirect,
- (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
+ (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
- (int (*)(struct bpf_map *map,
+ (long (*)(struct bpf_map *map,
bpf_callback_t callback_fn,
void *callback_ctx,
u64 flags))NULL));
@@ -16859,7 +17976,7 @@ patch_call_imm:
}
}
- sort_kfunc_descs_by_imm(env->prog);
+ sort_kfunc_descs_by_imm_off(env->prog);
return 0;
}
@@ -17287,6 +18404,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
const char *tname;
struct btf *btf;
long addr = 0;
+ struct module *mod = NULL;
if (!btf_id) {
bpf_log(log, "Tracing programs must provide btf_id\n");
@@ -17460,8 +18578,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
else
addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
} else {
- addr = kallsyms_lookup_name(tname);
+ if (btf_is_module(btf)) {
+ mod = btf_try_get_module(btf);
+ if (mod)
+ addr = find_kallsyms_symbol_value(mod, tname);
+ else
+ addr = 0;
+ } else {
+ addr = kallsyms_lookup_name(tname);
+ }
if (!addr) {
+ module_put(mod);
bpf_log(log,
"The address of function %s cannot be found\n",
tname);
@@ -17501,11 +18628,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
break;
}
if (ret) {
+ module_put(mod);
bpf_log(log, "%s is not sleepable\n", tname);
return ret;
}
} else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
if (tgt_prog) {
+ module_put(mod);
bpf_log(log, "can't modify return codes of BPF programs\n");
return -EINVAL;
}
@@ -17514,6 +18643,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
!check_attach_modify_return(addr, tname))
ret = 0;
if (ret) {
+ module_put(mod);
bpf_log(log, "%s() is not modifiable\n", tname);
return ret;
}
@@ -17524,6 +18654,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
tgt_info->tgt_addr = addr;
tgt_info->tgt_name = tname;
tgt_info->tgt_type = t;
+ tgt_info->tgt_mod = mod;
return 0;
}
@@ -17536,6 +18667,14 @@ BTF_ID(func, migrate_enable)
#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
BTF_ID(func, rcu_read_unlock_strict)
#endif
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
+BTF_ID(func, preempt_count_add)
+BTF_ID(func, preempt_count_sub)
+#endif
+#ifdef CONFIG_PREEMPT_RCU
+BTF_ID(func, __rcu_read_lock)
+BTF_ID(func, __rcu_read_unlock)
+#endif
BTF_SET_END(btf_id_deny)
static bool can_be_sleepable(struct bpf_prog *prog)
@@ -17603,6 +18742,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
/* store info about the attachment target that will be used later */
prog->aux->attach_func_proto = tgt_info.tgt_type;
prog->aux->attach_func_name = tgt_info.tgt_name;
+ prog->aux->mod = tgt_info.tgt_mod;
if (tgt_prog) {
prog->aux->saved_dst_prog_type = tgt_prog->type;
@@ -17647,12 +18787,12 @@ struct btf *bpf_get_btf_vmlinux(void)
return btf_vmlinux;
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
- struct bpf_verifier_log *log;
- int i, len, ret = -EINVAL;
+ int i, len, ret = -EINVAL, err;
+ u32 log_true_size;
bool is_priv;
/* no program is valid */
@@ -17665,7 +18805,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
- log = &env->log;
len = (*prog)->len;
env->insn_aux_data =
@@ -17686,20 +18825,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
if (!is_priv)
mutex_lock(&bpf_verifier_lock);
- if (attr->log_level || attr->log_buf || attr->log_size) {
- /* user requested verbose verifier output
- * and supplied buffer to store the verification trace
- */
- log->level = attr->log_level;
- log->ubuf = (char __user *) (unsigned long) attr->log_buf;
- log->len_total = attr->log_size;
-
- /* log attributes have to be sane */
- if (!bpf_verifier_log_attr_valid(log)) {
- ret = -EINVAL;
- goto err_unlock;
- }
- }
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ ret = bpf_vlog_init(&env->log, attr->log_level,
+ (char __user *) (unsigned long) attr->log_buf,
+ attr->log_size);
+ if (ret)
+ goto err_unlock;
mark_verifier_state_clean(env);
@@ -17721,8 +18854,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
env->bypass_spec_v1 = bpf_bypass_spec_v1();
env->bypass_spec_v4 = bpf_bypass_spec_v4();
env->bpf_capable = bpf_capable();
- env->rcu_tag_supported = btf_vmlinux &&
- btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0;
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
@@ -17815,9 +18946,14 @@ skip_full_check:
print_verification_stats(env);
env->prog->aux->verified_insns = env->insn_processed;
- if (log->level && bpf_verifier_log_full(log))
- ret = -ENOSPC;
- if (log->level && !log->ubuf) {
+ /* preserve original error even if log finalization is successful */
+ err = bpf_vlog_finalize(&env->log, &log_true_size);
+ if (err)
+ ret = err;
+
+ if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
+ &log_true_size, sizeof(log_true_size))) {
ret = -EFAULT;
goto err_release_maps;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 935e8121b21e..ae518166d70a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1465,8 +1465,18 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
{
struct css_set *cset;
- cset = current->nsproxy->cgroup_ns->root_cset;
- return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+ if (current->nsproxy) {
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+ } else {
+ /*
+ * NOTE: This function may be called from bpf_cgroup_from_id()
+ * on a task which has already passed exit_task_namespaces() and
+ * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
+ * cgroups visible for lookups.
+ */
+ return &cgrp_dfl_root.cgrp;
+ }
}
/* look up cgroup associated with given css_set on the specified hierarchy */
@@ -6856,14 +6866,12 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path);
struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
struct cgroup *cgrp;
- struct file *f;
-
- f = fget_raw(fd);
- if (!f)
+ struct fd f = fdget_raw(fd);
+ if (!f.file)
return ERR_PTR(-EBADF);
- cgrp = cgroup_v1v2_get_from_file(f);
- fput(f);
+ cgrp = cgroup_v1v2_get_from_file(f.file);
+ fdput(f);
return cgrp;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 636f1c682ac0..505d86b16642 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1513,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
spin_unlock_irq(&callback_lock);
if (adding || deleting)
- update_tasks_cpumask(parent, tmp->new_cpus);
+ update_tasks_cpumask(parent, tmp->addmask);
/*
* Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
@@ -1770,10 +1770,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
/*
* Use the cpumasks in trialcs for tmpmasks when they are pointers
* to allocated cpumasks.
+ *
+ * Note that update_parent_subparts_cpumask() uses only addmask &
+ * delmask, but not new_cpus.
*/
tmp.addmask = trialcs->subparts_cpus;
tmp.delmask = trialcs->effective_cpus;
- tmp.new_cpus = trialcs->cpus_allowed;
+ tmp.new_cpus = NULL;
#endif
retval = validate_change(cs, trialcs);
@@ -1838,6 +1841,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
}
spin_unlock_irq(&callback_lock);
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ /* Now trialcs->cpus_allowed is available */
+ tmp.new_cpus = trialcs->cpus_allowed;
+#endif
+
/* effective_cpus will be updated here */
update_cpumasks_hier(cs, &tmp, false);
@@ -2445,6 +2453,20 @@ static int fmeter_getrate(struct fmeter *fmp)
static struct cpuset *cpuset_attach_old_cs;
+/*
+ * Check to see if a cpuset can accept a new task
+ * For v1, cpus_allowed and mems_allowed can't be empty.
+ * For v2, effective_cpus can't be empty.
+ * Note that in v1, effective_cpus = cpus_allowed.
+ */
+static int cpuset_can_attach_check(struct cpuset *cs)
+{
+ if (cpumask_empty(cs->effective_cpus) ||
+ (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
+ return -ENOSPC;
+ return 0;
+}
+
/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
@@ -2459,16 +2481,9 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
percpu_down_write(&cpuset_rwsem);
- /* allow moving tasks into an empty cpuset if on default hierarchy */
- ret = -ENOSPC;
- if (!is_in_v2_mode() &&
- (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
- goto out_unlock;
-
- /*
- * Task cannot be moved to a cpuset with empty effective cpus.
- */
- if (cpumask_empty(cs->effective_cpus))
+ /* Check to see if task is allowed in the cpuset */
+ ret = cpuset_can_attach_check(cs);
+ if (ret)
goto out_unlock;
cgroup_taskset_for_each(task, css, tset) {
@@ -2485,7 +2500,6 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
* changes which zero cpus/mems_allowed.
*/
cs->attach_in_progress++;
- ret = 0;
out_unlock:
percpu_up_write(&cpuset_rwsem);
return ret;
@@ -2494,25 +2508,47 @@ out_unlock:
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
+ struct cpuset *cs;
cgroup_taskset_first(tset, &css);
+ cs = css_cs(css);
percpu_down_write(&cpuset_rwsem);
- css_cs(css)->attach_in_progress--;
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
}
/*
- * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_to;
+
+static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
+{
+ percpu_rwsem_assert_held(&cpuset_rwsem);
+
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(task, cpus_attach);
+ else
+ cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
+ cs->subparts_cpus);
+ /*
+ * can_attach beforehand should guarantee that this doesn't
+ * fail. TODO: have a better way to handle failure here
+ */
+ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+ cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+ cpuset_update_task_spread_flags(cs, task);
+}
static void cpuset_attach(struct cgroup_taskset *tset)
{
- /* static buf protected by cpuset_rwsem */
- static nodemask_t cpuset_attach_nodemask_to;
struct task_struct *task;
struct task_struct *leader;
struct cgroup_subsys_state *css;
@@ -2543,20 +2579,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
- cgroup_taskset_for_each(task, css, tset) {
- if (cs != &top_cpuset)
- guarantee_online_cpus(task, cpus_attach);
- else
- cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
- /*
- * can_attach beforehand should guarantee that this doesn't
- * fail. TODO: have a better way to handle failure here
- */
- WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
-
- cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flags(cs, task);
- }
+ cgroup_taskset_for_each(task, css, tset)
+ cpuset_attach_task(cs, task);
/*
* Change mm for all threadgroup leaders. This is expensive and may
@@ -3248,17 +3272,101 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
}
/*
+ * In case the child is cloned into a cpuset different from its parent,
+ * additional checks are done to see if the move is allowed.
+ */
+static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+ bool same_cs;
+ int ret;
+
+ rcu_read_lock();
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs)
+ return 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+ percpu_down_write(&cpuset_rwsem);
+
+ /* Check to see if task is allowed in the cpuset */
+ ret = cpuset_can_attach_check(cs);
+ if (ret)
+ goto out_unlock;
+
+ ret = task_can_attach(task, cs->effective_cpus);
+ if (ret)
+ goto out_unlock;
+
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+out_unlock:
+ percpu_up_write(&cpuset_rwsem);
+ return ret;
+}
+
+static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+ bool same_cs;
+
+ rcu_read_lock();
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs)
+ return;
+
+ percpu_down_write(&cpuset_rwsem);
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+ percpu_up_write(&cpuset_rwsem);
+}
+
+/*
* Make sure the new task conform to the current state of its parent,
* which could have been changed by cpuset just after it inherits the
* state from the parent and before it sits on the cgroup's task list.
*/
static void cpuset_fork(struct task_struct *task)
{
- if (task_css_is_root(task, cpuset_cgrp_id))
+ struct cpuset *cs;
+ bool same_cs;
+
+ rcu_read_lock();
+ cs = task_cs(task);
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs) {
+ if (cs == &top_cpuset)
+ return;
+
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
+ task->mems_allowed = current->mems_allowed;
return;
+ }
+
+ /* CLONE_INTO_CGROUP */
+ percpu_down_write(&cpuset_rwsem);
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ cpuset_attach_task(cs, task);
+
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
- set_cpus_allowed_ptr(task, current->cpus_ptr);
- task->mems_allowed = current->mems_allowed;
+ percpu_up_write(&cpuset_rwsem);
}
struct cgroup_subsys cpuset_cgrp_subsys = {
@@ -3271,6 +3379,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.attach = cpuset_attach,
.post_attach = cpuset_post_attach,
.bind = cpuset_bind,
+ .can_fork = cpuset_can_fork,
+ .cancel_fork = cpuset_cancel_fork,
.fork = cpuset_fork,
.legacy_cftypes = legacy_files,
.dfl_cftypes = dfl_files,
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 1b6b21851e9d..936473203a6b 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -22,6 +22,7 @@
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>
+#include <linux/cpu.h>
/*
* A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -350,7 +351,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
if (freeze) {
if (!(freezer->state & CGROUP_FREEZING))
- static_branch_inc(&freezer_active);
+ static_branch_inc_cpuslocked(&freezer_active);
freezer->state |= state;
freeze_cgroup(freezer);
} else {
@@ -361,7 +362,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
if (!(freezer->state & CGROUP_FREEZING)) {
freezer->state &= ~CGROUP_FROZEN;
if (was_freezing)
- static_branch_dec(&freezer_active);
+ static_branch_dec_cpuslocked(&freezer_active);
unfreeze_cgroup(freezer);
}
}
@@ -379,6 +380,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
{
struct cgroup_subsys_state *pos;
+ cpus_read_lock();
/*
* Update all its descendants in pre-order traversal. Each
* descendant will try to inherit its parent's FREEZING state as
@@ -407,6 +409,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
}
rcu_read_unlock();
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
}
static ssize_t freezer_write(struct kernfs_open_file *of,
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 831f1f472bb8..0a2b4967e333 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -457,9 +457,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
struct task_cputime *cputime = &bstat->cputime;
int i;
- cputime->stime = 0;
- cputime->utime = 0;
- cputime->sum_exec_runtime = 0;
+ memset(bstat, 0, sizeof(*bstat));
for_each_possible_cpu(i) {
struct kernel_cpustat kcpustat;
u64 *cpustat = kcpustat.cpustat;
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index c2f9c912df1c..144b2bd86b14 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -7,6 +7,5 @@ CONFIG_KERNEL_XZ=y
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
# CONFIG_SLAB is not set
-# CONFIG_SLOB_DEPRECATED is not set
CONFIG_SLUB=y
CONFIG_SLUB_TINY=y
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 5b919ef832b6..dac42a2ad588 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -623,7 +623,7 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
unsigned int iotlb_align_mask =
- dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
+ dma_get_min_align_mask(dev) | alloc_align_mask;
unsigned int nslots = nr_slots(alloc_size), stride;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
unsigned int index, slots_checked, count = 0, i;
@@ -639,8 +639,8 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
* allocations.
*/
if (alloc_size >= PAGE_SIZE)
- iotlb_align_mask &= PAGE_MASK;
- iotlb_align_mask &= alloc_align_mask;
+ iotlb_align_mask |= ~PAGE_MASK;
+ iotlb_align_mask &= ~(IO_TLB_SIZE - 1);
/*
* For mappings with an alignment requirement don't bother looping to
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index 0b6379adff6b..5340c5aa89e7 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
#include <linux/prctl.h>
+#include <linux/ptrace.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/uaccess.h>
#include <linux/signal.h>
@@ -68,8 +69,9 @@ bool syscall_user_dispatch(struct pt_regs *regs)
return true;
}
-int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
- unsigned long len, char __user *selector)
+static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned long mode,
+ unsigned long offset, unsigned long len,
+ char __user *selector)
{
switch (mode) {
case PR_SYS_DISPATCH_OFF:
@@ -86,7 +88,16 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
if (offset && offset + len <= offset)
return -EINVAL;
- if (selector && !access_ok(selector, sizeof(*selector)))
+ /*
+ * access_ok() will clear memory tags for tagged addresses
+ * if current has memory tagging enabled.
+
+ * To enable a tracer to set a tracees selector the
+ * selector address must be untagged for access_ok(),
+ * otherwise an untagged tracer will always fail to set a
+ * tagged tracees selector.
+ */
+ if (selector && !access_ok(untagged_addr(selector), sizeof(*selector)))
return -EFAULT;
break;
@@ -94,15 +105,60 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
return -EINVAL;
}
- current->syscall_dispatch.selector = selector;
- current->syscall_dispatch.offset = offset;
- current->syscall_dispatch.len = len;
- current->syscall_dispatch.on_dispatch = false;
+ task->syscall_dispatch.selector = selector;
+ task->syscall_dispatch.offset = offset;
+ task->syscall_dispatch.len = len;
+ task->syscall_dispatch.on_dispatch = false;
if (mode == PR_SYS_DISPATCH_ON)
- set_syscall_work(SYSCALL_USER_DISPATCH);
+ set_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+ else
+ clear_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+
+ return 0;
+}
+
+int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
+ unsigned long len, char __user *selector)
+{
+ return task_set_syscall_user_dispatch(current, mode, offset, len, selector);
+}
+
+int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct syscall_user_dispatch *sd = &task->syscall_dispatch;
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (test_task_syscall_work(task, SYSCALL_USER_DISPATCH))
+ cfg.mode = PR_SYS_DISPATCH_ON;
else
- clear_syscall_work(SYSCALL_USER_DISPATCH);
+ cfg.mode = PR_SYS_DISPATCH_OFF;
+
+ cfg.offset = sd->offset;
+ cfg.len = sd->len;
+ cfg.selector = (__u64)(uintptr_t)sd->selector;
+
+ if (copy_to_user(data, &cfg, sizeof(cfg)))
+ return -EFAULT;
return 0;
}
+
+int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (copy_from_user(&cfg, data, sizeof(cfg)))
+ return -EFAULT;
+
+ return task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len,
+ (char __user *)(uintptr_t)cfg.selector);
+}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fb3e436bcd4a..435815d3be3f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12173,7 +12173,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
/*
* If its not a per-cpu rb, it must be the same task.
*/
- if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
goto out;
/*
@@ -12893,12 +12893,14 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
- /*
- * Wait for the events to quiesce before re-instating them.
- */
- synchronize_rcu();
+ if (!list_empty(&events)) {
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
+ synchronize_rcu();
- __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+ __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+ }
mutex_unlock(&dst_ctx->mutex);
mutex_unlock(&src_ctx->mutex);
diff --git a/kernel/fork.c b/kernel/fork.c
index c0257cbee093..bfe73db1c26c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -617,6 +617,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (retval)
goto out;
+ mt_clear_in_rcu(vmi.mas.tree);
for_each_vma(old_vmi, mpnt) {
struct file *file;
@@ -700,6 +701,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
+ if (!retval)
+ mt_set_in_rcu(vmi.mas.tree);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -1171,6 +1174,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
fail_pcpu:
while (i > 0)
percpu_counter_destroy(&mm->rss_stat[--i]);
+ destroy_context(mm);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
@@ -1622,7 +1626,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+ int no_files)
{
struct files_struct *oldf, *newf;
int error = 0;
@@ -1634,6 +1639,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
if (!oldf)
goto out;
+ if (no_files) {
+ tsk->files = NULL;
+ goto out;
+ }
+
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
@@ -1951,6 +1961,91 @@ const struct file_operations pidfd_fops = {
#endif
};
+/**
+ * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+
+ * The helper doesn't perform checks on @pid which makes it useful for pidfds
+ * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
+ * pidfd file are prepared.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ int pidfd;
+ struct file *pidfd_file;
+
+ if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+ return -EINVAL;
+
+ pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (pidfd < 0)
+ return pidfd;
+
+ pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+ flags | O_RDWR | O_CLOEXEC);
+ if (IS_ERR(pidfd_file)) {
+ put_unused_fd(pidfd);
+ return PTR_ERR(pidfd_file);
+ }
+ get_pid(pid); /* held by pidfd_file now */
+ *ret = pidfd_file;
+ return pidfd;
+}
+
+/**
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper verifies that @pid is used as a thread group leader.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ return -EINVAL;
+
+ return __pidfd_prepare(pid, flags, ret);
+}
+
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
@@ -2005,7 +2100,7 @@ static void rv_task_fork(struct task_struct *p)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static __latent_entropy struct task_struct *copy_process(
+__latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
@@ -2098,6 +2193,8 @@ static __latent_entropy struct task_struct *copy_process(
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
+ if (args->user_worker)
+ p->flags |= PF_USER_WORKER;
if (args->io_thread) {
/*
* Mark us an IO worker, and block any signal that isn't
@@ -2107,6 +2204,9 @@ static __latent_entropy struct task_struct *copy_process(
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
+ if (args->name)
+ strscpy_pad(p->comm, args->name, sizeof(p->comm));
+
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
@@ -2249,7 +2349,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
- retval = copy_files(clone_flags, p);
+ retval = copy_files(clone_flags, p, args->no_files);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
@@ -2274,6 +2374,9 @@ static __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_cleanup_io;
+ if (args->ignore_signals)
+ ignore_signals(p);
+
stackleak_task_init(p);
if (pid != &init_struct_pid) {
@@ -2291,21 +2394,12 @@ static __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ /* Note that no task has been attached to @pid yet. */
+ retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
-
pidfd = retval;
- pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- O_RDWR | O_CLOEXEC);
- if (IS_ERR(pidfile)) {
- put_unused_fd(pidfd);
- retval = PTR_ERR(pidfile);
- goto bad_fork_free_pid;
- }
- get_pid(pid); /* held by pidfile now */
-
retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
@@ -2622,6 +2716,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
+ .user_worker = 1,
};
return copy_process(NULL, 0, node, &args);
@@ -2725,7 +2820,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
/*
* Create a kernel thread.
*/
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+ unsigned long flags)
{
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
@@ -2733,6 +2829,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.fn = fn,
.fn_arg = arg,
+ .name = name,
.kthread = 1,
};
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8ce75495e04f..d2742af0f0fd 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -189,9 +189,12 @@ void irq_set_thread_affinity(struct irq_desc *desc)
{
struct irqaction *action;
- for_each_action_of_desc(desc, action)
+ for_each_action_of_desc(desc, action) {
if (action->thread)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ if (action->secondary && action->secondary->thread)
+ set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags);
+ }
}
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 54d077e1a2dc..5a60cc52adc0 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -337,11 +337,20 @@ static void delay_access(int type)
*/
static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size)
{
+ /*
+ * In the below we don't necessarily need the read of the location to
+ * be atomic, and we don't use READ_ONCE(), since all we need for race
+ * detection is to observe 2 different values.
+ *
+ * Furthermore, on certain architectures (such as arm64), READ_ONCE()
+ * may turn into more complex instructions than a plain load that cannot
+ * do unaligned accesses.
+ */
switch (size) {
- case 1: return READ_ONCE(*(const u8 *)ptr);
- case 2: return READ_ONCE(*(const u16 *)ptr);
- case 4: return READ_ONCE(*(const u32 *)ptr);
- case 8: return READ_ONCE(*(const u64 *)ptr);
+ case 1: return *(const volatile u8 *)ptr;
+ case 2: return *(const volatile u16 *)ptr;
+ case 4: return *(const volatile u32 *)ptr;
+ case 8: return *(const volatile u64 *)ptr;
default: return 0; /* Ignore; we do not diff the values. */
}
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 7e6751b29101..4bc6e0971ec9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -38,6 +38,7 @@ struct task_struct *kthreadd_task;
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
+ char *full_name;
int (*threadfn)(void *data);
void *data;
int node;
@@ -343,10 +344,12 @@ static int kthread(void *_create)
/* Release the structure when caller killed by a fatal signal. */
done = xchg(&create->done, NULL);
if (!done) {
+ kfree(create->full_name);
kfree(create);
kthread_exit(-EINTR);
}
+ self->full_name = create->full_name;
self->threadfn = threadfn;
self->data = data;
@@ -396,11 +399,13 @@ static void create_kthread(struct kthread_create_info *create)
current->pref_node_fork = create->node;
#endif
/* We want our own signal handler (we take no signals by default). */
- pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+ pid = kernel_thread(kthread, create, create->full_name,
+ CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
/* Release the structure when caller killed by a fatal signal. */
struct completion *done = xchg(&create->done, NULL);
+ kfree(create->full_name);
if (!done) {
kfree(create);
return;
@@ -427,6 +432,11 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
create->data = data;
create->node = node;
create->done = &done;
+ create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
+ if (!create->full_name) {
+ task = ERR_PTR(-ENOMEM);
+ goto free_create;
+ }
spin_lock(&kthread_create_lock);
list_add_tail(&create->list, &kthread_create_list);
@@ -453,26 +463,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
wait_for_completion(&done);
}
task = create->result;
- if (!IS_ERR(task)) {
- char name[TASK_COMM_LEN];
- va_list aq;
- int len;
-
- /*
- * task is already visible to other tasks, so updating
- * COMM must be protected.
- */
- va_copy(aq, args);
- len = vsnprintf(name, sizeof(name), namefmt, aq);
- va_end(aq);
- if (len >= TASK_COMM_LEN) {
- struct kthread *kthread = to_kthread(task);
-
- /* leave it truncated when out of memory. */
- kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
- }
- set_task_comm(task, name);
- }
+free_create:
kfree(create);
return task;
}
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 4bd2d5e10f20..feaf90c5e537 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -596,7 +596,7 @@ static void klp_kobj_release_patch(struct kobject *kobj)
complete(&patch->finish);
}
-static struct kobj_type klp_ktype_patch = {
+static const struct kobj_type klp_ktype_patch = {
.release = klp_kobj_release_patch,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = klp_patch_groups,
@@ -612,7 +612,7 @@ static void klp_kobj_release_object(struct kobject *kobj)
klp_free_object_dynamic(obj);
}
-static struct kobj_type klp_ktype_object = {
+static const struct kobj_type klp_ktype_object = {
.release = klp_kobj_release_object,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = klp_object_groups,
@@ -628,7 +628,7 @@ static void klp_kobj_release_func(struct kobject *kobj)
klp_free_func_nop(func);
}
-static struct kobj_type klp_ktype_func = {
+static const struct kobj_type klp_ktype_func = {
.release = klp_kobj_release_func,
.sysfs_ops = &kobj_sysfs_ops,
};
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 50d4863974e7..dcd1d5bfc1e0 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1881,6 +1881,8 @@ print_circular_lock_scenario(struct held_lock *src,
struct lock_class *source = hlock_class(src);
struct lock_class *target = hlock_class(tgt);
struct lock_class *parent = prt->class;
+ int src_read = src->read;
+ int tgt_read = tgt->read;
/*
* A direct locking problem where unsafe_class lock is taken
@@ -1908,7 +1910,10 @@ print_circular_lock_scenario(struct held_lock *src,
printk(" Possible unsafe locking scenario:\n\n");
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
- printk(" lock(");
+ if (tgt_read != 0)
+ printk(" rlock(");
+ else
+ printk(" lock(");
__print_lock_name(target);
printk(KERN_CONT ");\n");
printk(" lock(");
@@ -1917,7 +1922,12 @@ print_circular_lock_scenario(struct held_lock *src,
printk(" lock(");
__print_lock_name(target);
printk(KERN_CONT ");\n");
- printk(" lock(");
+ if (src_read != 0)
+ printk(" rlock(");
+ else if (src->sync)
+ printk(" sync(");
+ else
+ printk(" lock(");
__print_lock_name(source);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
@@ -4531,7 +4541,13 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
return 0;
}
}
- if (!hlock->hardirqs_off) {
+
+ /*
+ * For lock_sync(), don't mark the ENABLED usage, since lock_sync()
+ * creates no critical section and no extra dependency can be introduced
+ * by interrupts
+ */
+ if (!hlock->hardirqs_off && !hlock->sync) {
if (hlock->read) {
if (!mark_lock(curr, hlock,
LOCK_ENABLED_HARDIRQ_READ))
@@ -4910,7 +4926,7 @@ static int __lock_is_held(const struct lockdep_map *lock, int read);
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
struct lockdep_map *nest_lock, unsigned long ip,
- int references, int pin_count)
+ int references, int pin_count, int sync)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
@@ -4961,7 +4977,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes;
- if (depth) { /* we're holding locks */
+ if (depth && !sync) {
+ /* we're holding locks and the new held lock is not a sync */
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (!references)
@@ -4995,6 +5012,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
+ hlock->sync = !!sync;
hlock->hardirqs_off = !!hardirqs_off;
hlock->references = references;
#ifdef CONFIG_LOCK_STAT
@@ -5056,6 +5074,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!validate_chain(curr, hlock, chain_head, chain_key))
return 0;
+ /* For lock_sync(), we are done here since no actual critical section */
+ if (hlock->sync)
+ return 1;
+
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -5197,7 +5219,7 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
hlock->read, hlock->check,
hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count)) {
+ hlock->references, hlock->pin_count, 0)) {
case 0:
return 1;
case 1:
@@ -5667,7 +5689,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
lockdep_recursion_inc();
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
@@ -5693,6 +5715,34 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_release);
+/*
+ * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API.
+ *
+ * No actual critical section is created by the APIs annotated with this: these
+ * APIs are used to wait for one or multiple critical sections (on other CPUs
+ * or threads), and it means that calling these APIs inside these critical
+ * sections is potential deadlock.
+ */
+void lock_sync(struct lockdep_map *lock, unsigned subclass, int read,
+ int check, struct lockdep_map *nest_lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(!lockdep_enabled()))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ lockdep_recursion_inc();
+ __lock_acquire(lock, subclass, 0, read, check,
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1);
+ check_chain_key(current);
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_sync);
+
noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
{
unsigned long flags;
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f04b1978899d..153ddc4c47ef 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -51,8 +51,11 @@ torture_param(int, rt_boost, 2,
torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
torture_param(int, verbose, 1,
"Enable verbose debugging printk()s");
+torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
+#define MAX_NESTED_LOCKS 8
-static char *torture_type = "spin_lock";
+static char *torture_type = IS_ENABLED(CONFIG_PREEMPT_RT) ? "raw_spin_lock" : "spin_lock";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type,
"Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
@@ -79,10 +82,12 @@ static void lock_torture_cleanup(void);
struct lock_torture_ops {
void (*init)(void);
void (*exit)(void);
+ int (*nested_lock)(int tid, u32 lockset);
int (*writelock)(int tid);
void (*write_delay)(struct torture_random_state *trsp);
void (*task_boost)(struct torture_random_state *trsp);
void (*writeunlock)(int tid);
+ void (*nested_unlock)(int tid, u32 lockset);
int (*readlock)(int tid);
void (*read_delay)(struct torture_random_state *trsp);
void (*readunlock)(int tid);
@@ -252,6 +257,59 @@ static struct lock_torture_ops spin_lock_irq_ops = {
.name = "spin_lock_irq"
};
+static DEFINE_RAW_SPINLOCK(torture_raw_spinlock);
+
+static int torture_raw_spin_lock_write_lock(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ raw_spin_lock(&torture_raw_spinlock);
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock(&torture_raw_spinlock);
+}
+
+static struct lock_torture_ops raw_spin_lock_ops = {
+ .writelock = torture_raw_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock"
+};
+
+static int torture_raw_spin_lock_write_lock_irq(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&torture_raw_spinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock_irq(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock_irqrestore(&torture_raw_spinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops raw_spin_lock_irq_ops = {
+ .writelock = torture_raw_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock_irq"
+};
+
static DEFINE_RWLOCK(torture_rwlock);
static int torture_rwlock_write_lock(int tid __maybe_unused)
@@ -365,6 +423,28 @@ static struct lock_torture_ops rw_lock_irq_ops = {
};
static DEFINE_MUTEX(torture_mutex);
+static struct mutex torture_nested_mutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_mutex_keys[MAX_NESTED_LOCKS];
+
+static void torture_mutex_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __mutex_init(&torture_nested_mutexes[i], __func__,
+ &nested_mutex_keys[i]);
+}
+
+static int torture_mutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ mutex_lock(&torture_nested_mutexes[i]);
+ return 0;
+}
static int torture_mutex_lock(int tid __maybe_unused)
__acquires(torture_mutex)
@@ -393,11 +473,24 @@ __releases(torture_mutex)
mutex_unlock(&torture_mutex);
}
+static void torture_mutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ mutex_unlock(&torture_nested_mutexes[i]);
+}
+
static struct lock_torture_ops mutex_lock_ops = {
+ .init = torture_mutex_init,
+ .nested_lock = torture_mutex_nested_lock,
.writelock = torture_mutex_lock,
.write_delay = torture_mutex_delay,
.task_boost = torture_rt_boost,
.writeunlock = torture_mutex_unlock,
+ .nested_unlock = torture_mutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -504,6 +597,28 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
#ifdef CONFIG_RT_MUTEXES
static DEFINE_RT_MUTEX(torture_rtmutex);
+static struct rt_mutex torture_nested_rtmutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_rtmutex_keys[MAX_NESTED_LOCKS];
+
+static void torture_rtmutex_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __rt_mutex_init(&torture_nested_rtmutexes[i], __func__,
+ &nested_rtmutex_keys[i]);
+}
+
+static int torture_rtmutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ rt_mutex_lock(&torture_nested_rtmutexes[i]);
+ return 0;
+}
static int torture_rtmutex_lock(int tid __maybe_unused)
__acquires(torture_rtmutex)
@@ -545,11 +660,24 @@ static void torture_rt_boost_rtmutex(struct torture_random_state *trsp)
__torture_rt_boost(trsp);
}
+static void torture_rtmutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ rt_mutex_unlock(&torture_nested_rtmutexes[i]);
+}
+
static struct lock_torture_ops rtmutex_lock_ops = {
+ .init = torture_rtmutex_init,
+ .nested_lock = torture_rtmutex_nested_lock,
.writelock = torture_rtmutex_lock,
.write_delay = torture_rtmutex_delay,
.task_boost = torture_rt_boost_rtmutex,
.writeunlock = torture_rtmutex_unlock,
+ .nested_unlock = torture_rtmutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -684,6 +812,8 @@ static int lock_torture_writer(void *arg)
struct lock_stress_stats *lwsp = arg;
int tid = lwsp - cxt.lwsa;
DEFINE_TORTURE_RANDOM(rand);
+ u32 lockset_mask;
+ bool skip_main_lock;
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
set_user_nice(current, MAX_NICE);
@@ -692,19 +822,40 @@ static int lock_torture_writer(void *arg)
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
- cxt.cur_ops->task_boost(&rand);
- cxt.cur_ops->writelock(tid);
- if (WARN_ON_ONCE(lock_is_write_held))
- lwsp->n_lock_fail++;
- lock_is_write_held = true;
- if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
- lwsp->n_lock_fail++; /* rare, but... */
+ lockset_mask = torture_random(&rand);
+ /*
+ * When using nested_locks, we want to occasionally
+ * skip the main lock so we can avoid always serializing
+ * the lock chains on that central lock. By skipping the
+ * main lock occasionally, we can create different
+ * contention patterns (allowing for multiple disjoint
+ * blocked trees)
+ */
+ skip_main_lock = (nested_locks &&
+ !(torture_random(&rand) % 100));
- lwsp->n_lock_acquired++;
+ cxt.cur_ops->task_boost(&rand);
+ if (cxt.cur_ops->nested_lock)
+ cxt.cur_ops->nested_lock(tid, lockset_mask);
+
+ if (!skip_main_lock) {
+ cxt.cur_ops->writelock(tid);
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_lock_fail++;
+ lock_is_write_held = true;
+ if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
+ lwsp->n_lock_fail++; /* rare, but... */
+
+ lwsp->n_lock_acquired++;
+ }
cxt.cur_ops->write_delay(&rand);
- lock_is_write_held = false;
- WRITE_ONCE(last_lock_release, jiffies);
- cxt.cur_ops->writeunlock(tid);
+ if (!skip_main_lock) {
+ lock_is_write_held = false;
+ WRITE_ONCE(last_lock_release, jiffies);
+ cxt.cur_ops->writeunlock(tid);
+ }
+ if (cxt.cur_ops->nested_unlock)
+ cxt.cur_ops->nested_unlock(tid, lockset_mask);
stutter_wait("lock_torture_writer");
} while (!torture_must_stop());
@@ -845,11 +996,11 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
const char *tag)
{
pr_alert("%s" TORTURE_FLAG
- "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
torture_type, tag, cxt.debug_lock ? " [debug]": "",
- cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
- verbose, shuffle_interval, stutter, shutdown_secs,
- onoff_interval, onoff_holdoff);
+ cxt.nrealwriters_stress, cxt.nrealreaders_stress,
+ nested_locks, stat_interval, verbose, shuffle_interval,
+ stutter, shutdown_secs, onoff_interval, onoff_holdoff);
}
static void lock_torture_cleanup(void)
@@ -919,6 +1070,7 @@ static int __init lock_torture_init(void)
static struct lock_torture_ops *torture_ops[] = {
&lock_busted_ops,
&spin_lock_ops, &spin_lock_irq_ops,
+ &raw_spin_lock_ops, &raw_spin_lock_irq_ops,
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
&ww_mutex_lock_ops,
@@ -1068,6 +1220,10 @@ static int __init lock_torture_init(void)
}
}
+ /* cap nested_locks to MAX_NESTED_LOCKS */
+ if (nested_locks > MAX_NESTED_LOCKS)
+ nested_locks = MAX_NESTED_LOCKS;
+
if (cxt.cur_ops->readlock) {
reader_tasks = kcalloc(cxt.nrealreaders_stress,
sizeof(reader_tasks[0]),
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 29dc253d03af..93cca6e69860 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -659,7 +659,7 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
+ ret = stress(2047, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
if (ret)
return ret;
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 2e2bf236f558..1c877561a7d2 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -246,7 +246,6 @@ static inline void kmemleak_load_module(const struct module *mod,
void init_build_id(struct module *mod, const struct load_info *info);
void layout_symtab(struct module *mod, struct load_info *info);
void add_kallsyms(struct module *mod, const struct load_info *info);
-unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
static inline bool sect_empty(const Elf_Shdr *sect)
{
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index ab2376a1be88..bdc911dbcde5 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -442,7 +442,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
}
/* Given a module and name of symbol, find and return the symbol's value */
-unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
{
unsigned int i;
struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
@@ -466,7 +466,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
if (colon) {
mod = find_module_all(name, colon - name, false);
if (mod)
- return find_kallsyms_symbol_value(mod, colon + 1);
+ return __find_kallsyms_symbol_value(mod, colon + 1);
return 0;
}
@@ -475,7 +475,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- ret = find_kallsyms_symbol_value(mod, name);
+ ret = __find_kallsyms_symbol_value(mod, name);
if (ret)
return ret;
}
@@ -494,6 +494,16 @@ unsigned long module_kallsyms_lookup_name(const char *name)
return ret;
}
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+ unsigned long ret;
+
+ preempt_disable();
+ ret = __find_kallsyms_symbol_value(mod, name);
+ preempt_enable();
+ return ret;
+}
+
int module_kallsyms_on_each_symbol(const char *modname,
int (*fn)(void *, const char *,
struct module *, unsigned long),
diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c
index 486d4ff92719..a89f01e1d6b7 100644
--- a/kernel/module/livepatch.c
+++ b/kernel/module/livepatch.c
@@ -11,7 +11,7 @@
#include "internal.h"
/*
- * Persist Elf information about a module. Copy the Elf header,
+ * Persist ELF information about a module. Copy the ELF header,
* section header table, section string table, and symtab section
* index from info to mod->klp_info.
*/
@@ -25,11 +25,11 @@ int copy_module_elf(struct module *mod, struct load_info *info)
if (!mod->klp_info)
return -ENOMEM;
- /* Elf header */
+ /* ELF header */
size = sizeof(mod->klp_info->hdr);
memcpy(&mod->klp_info->hdr, info->hdr, size);
- /* Elf section header table */
+ /* ELF section header table */
size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
if (!mod->klp_info->sechdrs) {
@@ -37,7 +37,7 @@ int copy_module_elf(struct module *mod, struct load_info *info)
goto free_info;
}
- /* Elf section name string table */
+ /* ELF section name string table */
size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
if (!mod->klp_info->secstrings) {
@@ -45,7 +45,7 @@ int copy_module_elf(struct module *mod, struct load_info *info)
goto free_sechdrs;
}
- /* Elf symbol section index */
+ /* ELF symbol section index */
symndx = info->index.sym;
mod->klp_info->symndx = symndx;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a487ff24129b..80d9c6d77a45 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -545,21 +545,20 @@ static void commit_nsset(struct nsset *nsset)
SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
- struct file *file;
+ struct fd f = fdget(fd);
struct ns_common *ns = NULL;
struct nsset nsset = {};
int err = 0;
- file = fget(fd);
- if (!file)
+ if (!f.file)
return -EBADF;
- if (proc_ns_file(file)) {
- ns = get_proc_ns(file_inode(file));
+ if (proc_ns_file(f.file)) {
+ ns = get_proc_ns(file_inode(f.file));
if (flags && (ns->ops->type != flags))
err = -EINVAL;
flags = ns->ops->type;
- } else if (!IS_ERR(pidfd_pid(file))) {
+ } else if (!IS_ERR(pidfd_pid(f.file))) {
err = check_setns_flags(flags);
} else {
err = -EINVAL;
@@ -571,17 +570,17 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (err)
goto out;
- if (proc_ns_file(file))
+ if (proc_ns_file(f.file))
err = validate_ns(&nsset, ns);
else
- err = validate_nsset(&nsset, file->private_data);
+ err = validate_nsset(&nsset, f.file->private_data);
if (!err) {
commit_nsset(&nsset);
perf_event_namespaces(current);
}
put_nsset(&nsset);
out:
- fput(file);
+ fdput(f);
return err;
}
diff --git a/kernel/padata.c b/kernel/padata.c
index e007b8a4b738..222d60195de6 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -491,7 +491,7 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
return;
/* Ensure at least one thread when size < min_chunk. */
- nworks = max(job->size / job->min_chunk, 1ul);
+ nworks = max(job->size / max(job->min_chunk, job->align), 1ul);
nworks = min(nworks, job->max_threads);
if (nworks == 1) {
@@ -967,7 +967,7 @@ static const struct sysfs_ops padata_sysfs_ops = {
.store = padata_sysfs_store,
};
-static struct kobj_type padata_attr_type = {
+static const struct kobj_type padata_attr_type = {
.sysfs_ops = &padata_sysfs_ops,
.default_groups = padata_default_groups,
.release = padata_sysfs_release,
diff --git a/kernel/pid.c b/kernel/pid.c
index 3fbc5e46b721..f93954a0384d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -594,20 +594,15 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
*/
int pidfd_create(struct pid *pid, unsigned int flags)
{
- int fd;
-
- if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
- return -EINVAL;
+ int pidfd;
+ struct file *pidfd_file;
- if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
- return -EINVAL;
-
- fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
- flags | O_RDWR | O_CLOEXEC);
- if (fd < 0)
- put_pid(pid);
+ pidfd = pidfd_prepare(pid, flags, &pidfd_file);
+ if (pidfd < 0)
+ return pidfd;
- return fd;
+ fd_install(pidfd, pidfd_file);
+ return pidfd;
}
/**
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 31ec4a9b9d70..3113ec2f1db4 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -6,6 +6,7 @@
* Copyright (c) 2003 Open Source Development Lab
*/
+#include <linux/acpi.h>
#include <linux/export.h>
#include <linux/kobject.h>
#include <linux/string.h>
@@ -83,6 +84,19 @@ int unregister_pm_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
+void pm_report_hw_sleep_time(u64 t)
+{
+ suspend_stats.last_hw_sleep = t;
+ suspend_stats.total_hw_sleep += t;
+}
+EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time);
+
+void pm_report_max_hw_sleep(u64 t)
+{
+ suspend_stats.max_hw_sleep = t;
+}
+EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep);
+
int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down)
{
int ret;
@@ -314,24 +328,27 @@ static char *suspend_step_name(enum suspend_stat_step step)
}
}
-#define suspend_attr(_name) \
+#define suspend_attr(_name, format_str) \
static ssize_t _name##_show(struct kobject *kobj, \
struct kobj_attribute *attr, char *buf) \
{ \
- return sprintf(buf, "%d\n", suspend_stats._name); \
+ return sprintf(buf, format_str, suspend_stats._name); \
} \
static struct kobj_attribute _name = __ATTR_RO(_name)
-suspend_attr(success);
-suspend_attr(fail);
-suspend_attr(failed_freeze);
-suspend_attr(failed_prepare);
-suspend_attr(failed_suspend);
-suspend_attr(failed_suspend_late);
-suspend_attr(failed_suspend_noirq);
-suspend_attr(failed_resume);
-suspend_attr(failed_resume_early);
-suspend_attr(failed_resume_noirq);
+suspend_attr(success, "%d\n");
+suspend_attr(fail, "%d\n");
+suspend_attr(failed_freeze, "%d\n");
+suspend_attr(failed_prepare, "%d\n");
+suspend_attr(failed_suspend, "%d\n");
+suspend_attr(failed_suspend_late, "%d\n");
+suspend_attr(failed_suspend_noirq, "%d\n");
+suspend_attr(failed_resume, "%d\n");
+suspend_attr(failed_resume_early, "%d\n");
+suspend_attr(failed_resume_noirq, "%d\n");
+suspend_attr(last_hw_sleep, "%llu\n");
+suspend_attr(total_hw_sleep, "%llu\n");
+suspend_attr(max_hw_sleep, "%llu\n");
static ssize_t last_failed_dev_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -391,12 +408,30 @@ static struct attribute *suspend_attrs[] = {
&last_failed_dev.attr,
&last_failed_errno.attr,
&last_failed_step.attr,
+ &last_hw_sleep.attr,
+ &total_hw_sleep.attr,
+ &max_hw_sleep.attr,
NULL,
};
+static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx)
+{
+ if (attr != &last_hw_sleep.attr &&
+ attr != &total_hw_sleep.attr &&
+ attr != &max_hw_sleep.attr)
+ return 0444;
+
+#ifdef CONFIG_ACPI
+ if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0)
+ return 0444;
+#endif
+ return 0;
+}
+
static const struct attribute_group suspend_attr_group = {
.name = "suspend_stats",
.attrs = suspend_attrs,
+ .is_visible = suspend_attr_is_visible,
};
#ifdef CONFIG_DEBUG_FS
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index fd0c9f913940..9644f6e5bf15 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -730,7 +730,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
size_t len = iov_iter_count(from);
ssize_t ret = len;
- if (!user || len > PRINTKRB_RECORD_MAX)
+ if (len > PRINTKRB_RECORD_MAX)
return -EINVAL;
/* Ignore when user logging is disabled. */
@@ -792,9 +792,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
};
ssize_t ret;
- if (!user)
- return -EBADF;
-
ret = mutex_lock_interruptible(&user->lock);
if (ret)
return ret;
@@ -859,8 +856,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
struct devkmsg_user *user = file->private_data;
loff_t ret = 0;
- if (!user)
- return -EBADF;
if (offset)
return -ESPIPE;
@@ -893,9 +888,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
struct printk_info info;
__poll_t ret = 0;
- if (!user)
- return EPOLLERR|EPOLLNVAL;
-
poll_wait(file, &log_wait, wait);
if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
@@ -944,9 +936,6 @@ static int devkmsg_release(struct inode *inode, struct file *file)
{
struct devkmsg_user *user = file->private_data;
- if (!user)
- return 0;
-
ratelimit_state_exit(&user->rs);
mutex_destroy(&user->lock);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0786450074c1..443057bee87c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -32,6 +32,7 @@
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/minmax.h>
+#include <linux/syscall_user_dispatch.h>
#include <asm/syscall.h> /* for syscall_get_* */
@@ -1259,6 +1260,14 @@ int ptrace_request(struct task_struct *child, long request,
break;
#endif
+ case PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_set_config(child, addr, datavp);
+ break;
+
+ case PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_get_config(child, addr, datavp);
+ break;
+
default:
break;
}
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index ab62074174c3..9071182b1284 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -53,9 +53,6 @@ config RCU_EXPERT
Say N if you are unsure.
-config SRCU
- def_bool y
-
config TINY_SRCU
bool
default y if TINY_RCU
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 115616ac3bfa..4a1b9622598b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -14,6 +14,43 @@
/*
* Grace-period counter management.
+ *
+ * The two least significant bits contain the control flags.
+ * The most significant bits contain the grace-period sequence counter.
+ *
+ * When both control flags are zero, no grace period is in progress.
+ * When either bit is non-zero, a grace period has started and is in
+ * progress. When the grace period completes, the control flags are reset
+ * to 0 and the grace-period sequence counter is incremented.
+ *
+ * However some specific RCU usages make use of custom values.
+ *
+ * SRCU special control values:
+ *
+ * SRCU_SNP_INIT_SEQ : Invalid/init value set when SRCU node
+ * is initialized.
+ *
+ * SRCU_STATE_IDLE : No SRCU gp is in progress
+ *
+ * SRCU_STATE_SCAN1 : State set by rcu_seq_start(). Indicates
+ * we are scanning the readers on the slot
+ * defined as inactive (there might well
+ * be pending readers that will use that
+ * index, but their number is bounded).
+ *
+ * SRCU_STATE_SCAN2 : State set manually via rcu_seq_set_state()
+ * Indicates we are flipping the readers
+ * index and then scanning the readers on the
+ * slot newly designated as inactive (again,
+ * the number of pending readers that will use
+ * this inactive index is bounded).
+ *
+ * RCU polled GP special control value:
+ *
+ * RCU_GET_STATE_COMPLETED : State value indicating an already-completed
+ * polled GP has completed. This value covers
+ * both the state and the counter of the
+ * grace-period sequence number.
*/
#define RCU_SEQ_CTR_SHIFT 2
@@ -341,11 +378,13 @@ extern void rcu_init_geometry(void);
* specified state structure (for SRCU) or the only rcu_state structure
* (for RCU).
*/
-#define srcu_for_each_node_breadth_first(sp, rnp) \
+#define _rcu_for_each_node_breadth_first(sp, rnp) \
for ((rnp) = &(sp)->node[0]; \
(rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++)
#define rcu_for_each_node_breadth_first(rnp) \
- srcu_for_each_node_breadth_first(&rcu_state, rnp)
+ _rcu_for_each_node_breadth_first(&rcu_state, rnp)
+#define srcu_for_each_node_breadth_first(ssp, rnp) \
+ _rcu_for_each_node_breadth_first(ssp->srcu_sup, rnp)
/*
* Scan the leaves of the rcu_node hierarchy for the rcu_state structure.
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 91fb5905a008..e82ec9f9a5d8 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -631,8 +631,7 @@ static int compute_real(int n)
static int
rcu_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq,
- atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
+ wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
smp_mb(); /* Wake before output. */
rcu_scale_cleanup();
kernel_power_off();
@@ -716,7 +715,7 @@ kfree_scale_thread(void *arg)
// is tested.
if ((kfree_rcu_test_single && !kfree_rcu_test_double) ||
(kfree_rcu_test_both && torture_random(&tr) & 0x800))
- kfree_rcu(alloc_ptr);
+ kfree_rcu_mightsleep(alloc_ptr);
else
kfree_rcu(alloc_ptr, rh);
}
@@ -771,8 +770,8 @@ kfree_scale_cleanup(void)
static int
kfree_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq,
- atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
+ wait_event_idle(shutdown_wq,
+ atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
smp_mb(); /* Wake before output. */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8e6c023212cb..147551c23baf 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -119,7 +119,9 @@ torture_param(int, stutter, 5, "Number of seconds to run/halt test");
torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
+torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable.");
torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
+torture_param(int, test_srcu_lockdep, 0, "Test specified SRCU deadlock scenario.");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
static char *torture_type = "rcu";
@@ -179,7 +181,6 @@ static atomic_t n_rcu_torture_mbchk_tries;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_barrier_error;
static long n_rcu_torture_boost_ktrerror;
-static long n_rcu_torture_boost_rterror;
static long n_rcu_torture_boost_failure;
static long n_rcu_torture_boosts;
static atomic_long_t n_rcu_torture_timers;
@@ -2194,12 +2195,11 @@ rcu_torture_stats_print(void)
atomic_read(&n_rcu_torture_alloc),
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free));
- pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld rtbre: %ld ",
+ pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld ",
atomic_read(&n_rcu_torture_mberror),
atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries),
n_rcu_torture_barrier_error,
- n_rcu_torture_boost_ktrerror,
- n_rcu_torture_boost_rterror);
+ n_rcu_torture_boost_ktrerror);
pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
@@ -2217,15 +2217,13 @@ rcu_torture_stats_print(void)
if (atomic_read(&n_rcu_torture_mberror) ||
atomic_read(&n_rcu_torture_mbchk_fail) ||
n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror ||
- n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure ||
- i > 1) {
+ n_rcu_torture_boost_failure || i > 1) {
pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror));
WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail));
WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier()
WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread
- WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio
WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?)
WARN_ON_ONCE(i > 1); // Too-short grace period
}
@@ -2358,7 +2356,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d "
"read_exit_delay=%d read_exit_burst=%d "
- "nocbs_nthreads=%d nocbs_toggle=%d\n",
+ "nocbs_nthreads=%d nocbs_toggle=%d "
+ "test_nmis=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -2369,7 +2368,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
n_barrier_cbs,
onoff_interval, onoff_holdoff,
read_exit_delay, read_exit_burst,
- nocbs_nthreads, nocbs_toggle);
+ nocbs_nthreads, nocbs_toggle,
+ test_nmis);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -3273,6 +3273,29 @@ static void rcu_torture_read_exit_cleanup(void)
torture_stop_kthread(rcutorture_read_exit, read_exit_task);
}
+static void rcutorture_test_nmis(int n)
+{
+#if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ int cpu;
+ int dumpcpu;
+ int i;
+
+ for (i = 0; i < n; i++) {
+ preempt_disable();
+ cpu = smp_processor_id();
+ dumpcpu = cpu + 1;
+ if (dumpcpu >= nr_cpu_ids)
+ dumpcpu = 0;
+ pr_alert("%s: CPU %d invoking dump_cpu_task(%d)\n", __func__, cpu, dumpcpu);
+ dump_cpu_task(dumpcpu);
+ preempt_enable();
+ schedule_timeout_uninterruptible(15 * HZ);
+ }
+#else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ WARN_ONCE(n, "Non-zero rcutorture.test_nmis=%d permitted only when rcutorture is built in.\n", test_nmis);
+#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+}
+
static enum cpuhp_state rcutor_hp;
static void
@@ -3297,6 +3320,8 @@ rcu_torture_cleanup(void)
return;
}
+ rcutorture_test_nmis(test_nmis);
+
if (cur_ops->gp_kthread_dbg)
cur_ops->gp_kthread_dbg();
rcu_torture_read_exit_cleanup();
@@ -3463,6 +3488,188 @@ static void rcutorture_sync(void)
cur_ops->sync();
}
+static DEFINE_MUTEX(mut0);
+static DEFINE_MUTEX(mut1);
+static DEFINE_MUTEX(mut2);
+static DEFINE_MUTEX(mut3);
+static DEFINE_MUTEX(mut4);
+static DEFINE_MUTEX(mut5);
+static DEFINE_MUTEX(mut6);
+static DEFINE_MUTEX(mut7);
+static DEFINE_MUTEX(mut8);
+static DEFINE_MUTEX(mut9);
+
+static DECLARE_RWSEM(rwsem0);
+static DECLARE_RWSEM(rwsem1);
+static DECLARE_RWSEM(rwsem2);
+static DECLARE_RWSEM(rwsem3);
+static DECLARE_RWSEM(rwsem4);
+static DECLARE_RWSEM(rwsem5);
+static DECLARE_RWSEM(rwsem6);
+static DECLARE_RWSEM(rwsem7);
+static DECLARE_RWSEM(rwsem8);
+static DECLARE_RWSEM(rwsem9);
+
+DEFINE_STATIC_SRCU(srcu0);
+DEFINE_STATIC_SRCU(srcu1);
+DEFINE_STATIC_SRCU(srcu2);
+DEFINE_STATIC_SRCU(srcu3);
+DEFINE_STATIC_SRCU(srcu4);
+DEFINE_STATIC_SRCU(srcu5);
+DEFINE_STATIC_SRCU(srcu6);
+DEFINE_STATIC_SRCU(srcu7);
+DEFINE_STATIC_SRCU(srcu8);
+DEFINE_STATIC_SRCU(srcu9);
+
+static int srcu_lockdep_next(const char *f, const char *fl, const char *fs, const char *fu, int i,
+ int cyclelen, int deadlock)
+{
+ int j = i + 1;
+
+ if (j >= cyclelen)
+ j = deadlock ? 0 : -1;
+ if (j >= 0)
+ pr_info("%s: %s(%d), %s(%d), %s(%d)\n", f, fl, i, fs, j, fu, i);
+ else
+ pr_info("%s: %s(%d), %s(%d)\n", f, fl, i, fu, i);
+ return j;
+}
+
+// Test lockdep on SRCU-based deadlock scenarios.
+static void rcu_torture_init_srcu_lockdep(void)
+{
+ int cyclelen;
+ int deadlock;
+ bool err = false;
+ int i;
+ int j;
+ int idx;
+ struct mutex *muts[] = { &mut0, &mut1, &mut2, &mut3, &mut4,
+ &mut5, &mut6, &mut7, &mut8, &mut9 };
+ struct rw_semaphore *rwsems[] = { &rwsem0, &rwsem1, &rwsem2, &rwsem3, &rwsem4,
+ &rwsem5, &rwsem6, &rwsem7, &rwsem8, &rwsem9 };
+ struct srcu_struct *srcus[] = { &srcu0, &srcu1, &srcu2, &srcu3, &srcu4,
+ &srcu5, &srcu6, &srcu7, &srcu8, &srcu9 };
+ int testtype;
+
+ if (!test_srcu_lockdep)
+ return;
+
+ deadlock = test_srcu_lockdep / 1000;
+ testtype = (test_srcu_lockdep / 10) % 100;
+ cyclelen = test_srcu_lockdep % 10;
+ WARN_ON_ONCE(ARRAY_SIZE(muts) != ARRAY_SIZE(srcus));
+ if (WARN_ONCE(deadlock != !!deadlock,
+ "%s: test_srcu_lockdep=%d and deadlock digit %d must be zero or one.\n",
+ __func__, test_srcu_lockdep, deadlock))
+ err = true;
+ if (WARN_ONCE(cyclelen <= 0,
+ "%s: test_srcu_lockdep=%d and cycle-length digit %d must be greater than zero.\n",
+ __func__, test_srcu_lockdep, cyclelen))
+ err = true;
+ if (err)
+ goto err_out;
+
+ if (testtype == 0) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ j = srcu_lockdep_next(__func__, "srcu_read_lock", "synchronize_srcu",
+ "srcu_read_unlock", i, cyclelen, deadlock);
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+
+ if (testtype == 1) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/mutex %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), mutex_lock(%d), mutex_unlock(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ mutex_lock(muts[i]);
+ mutex_unlock(muts[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "mutex_lock", "synchronize_srcu",
+ "mutex_unlock", i, cyclelen, deadlock);
+ mutex_lock(muts[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ mutex_unlock(muts[i]);
+ }
+ return;
+ }
+
+ if (testtype == 2) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/rwsem %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), down_read(%d), up_read(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ down_read(rwsems[i]);
+ up_read(rwsems[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "down_write", "synchronize_srcu",
+ "up_write", i, cyclelen, deadlock);
+ down_write(rwsems[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ up_write(rwsems[i]);
+ }
+ return;
+ }
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (testtype == 3) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU and Tasks Trace RCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ char *fl = i == 0 ? "rcu_read_lock_trace" : "srcu_read_lock";
+ char *fs = i == cyclelen - 1 ? "synchronize_rcu_tasks_trace"
+ : "synchronize_srcu";
+ char *fu = i == 0 ? "rcu_read_unlock_trace" : "srcu_read_unlock";
+
+ j = srcu_lockdep_next(__func__, fl, fs, fu, i, cyclelen, deadlock);
+ if (i == 0)
+ rcu_read_lock_trace();
+ else
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0) {
+ if (i == cyclelen - 1)
+ synchronize_rcu_tasks_trace();
+ else
+ synchronize_srcu(srcus[j]);
+ }
+ if (i == 0)
+ rcu_read_unlock_trace();
+ else
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+#endif // #ifdef CONFIG_TASKS_TRACE_RCU
+
+err_out:
+ pr_info("%s: test_srcu_lockdep = %05d does nothing.\n", __func__, test_srcu_lockdep);
+ pr_info("%s: test_srcu_lockdep = DNNL.\n", __func__);
+ pr_info("%s: D: Deadlock if nonzero.\n", __func__);
+ pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem, 3=SRCU/Tasks Trace RCU.\n", __func__);
+ pr_info("%s: L: Cycle length.\n", __func__);
+ if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU))
+ pr_info("%s: NN=3 disallowed because kernel is built with CONFIG_TASKS_TRACE_RCU=n\n", __func__);
+}
+
static int __init
rcu_torture_init(void)
{
@@ -3501,9 +3708,17 @@ rcu_torture_init(void)
pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
fqs_duration = 0;
}
+ if (nocbs_nthreads != 0 && (cur_ops != &rcu_ops ||
+ !IS_ENABLED(CONFIG_RCU_NOCB_CPU))) {
+ pr_alert("rcu-torture types: %s and CONFIG_RCU_NOCB_CPU=%d, nocb toggle disabled.\n",
+ cur_ops->name, IS_ENABLED(CONFIG_RCU_NOCB_CPU));
+ nocbs_nthreads = 0;
+ }
if (cur_ops->init)
cur_ops->init();
+ rcu_torture_init_srcu_lockdep();
+
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
@@ -3540,7 +3755,6 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_error, 0);
n_rcu_torture_barrier_error = 0;
n_rcu_torture_boost_ktrerror = 0;
- n_rcu_torture_boost_rterror = 0;
n_rcu_torture_boost_failure = 0;
n_rcu_torture_boosts = 0;
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index afa3e1a2f690..1970ce5f22d4 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -1031,7 +1031,7 @@ ref_scale_cleanup(void)
static int
ref_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq, shutdown_start);
+ wait_event_idle(shutdown_wq, shutdown_start);
smp_mb(); // Wake before output.
ref_scale_cleanup();
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index b12fb0cec44d..336af24e0fe3 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -197,6 +197,8 @@ void synchronize_srcu(struct srcu_struct *ssp)
{
struct rcu_synchronize rs;
+ srcu_lock_sync(&ssp->dep_map);
+
RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index ab4ee58af84b..20d7a238d675 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -103,7 +103,7 @@ do { \
#define spin_trylock_irqsave_rcu_node(p, flags) \
({ \
- bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
\
if (___locked) \
smp_mb__after_unlock_lock(); \
@@ -135,8 +135,8 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
rcu_segcblist_init(&sdp->srcu_cblist);
sdp->srcu_cblist_invoking = false;
- sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
- sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq;
sdp->mynode = NULL;
sdp->cpu = cpu;
INIT_WORK(&sdp->work, srcu_invoke_callbacks);
@@ -173,14 +173,14 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
/* Initialize geometry if it has not already been initialized. */
rcu_init_geometry();
- ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags);
- if (!ssp->node)
+ ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags);
+ if (!ssp->srcu_sup->node)
return false;
/* Work out the overall tree geometry. */
- ssp->level[0] = &ssp->node[0];
+ ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0];
for (i = 1; i < rcu_num_lvls; i++)
- ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1];
+ ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1];
rcu_init_levelspread(levelspread, num_rcu_lvl);
/* Each pass through this loop initializes one srcu_node structure. */
@@ -195,17 +195,17 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
snp->grplo = -1;
snp->grphi = -1;
- if (snp == &ssp->node[0]) {
+ if (snp == &ssp->srcu_sup->node[0]) {
/* Root node, special case. */
snp->srcu_parent = NULL;
continue;
}
/* Non-root node. */
- if (snp == ssp->level[level + 1])
+ if (snp == ssp->srcu_sup->level[level + 1])
level++;
- snp->srcu_parent = ssp->level[level - 1] +
- (snp - ssp->level[level]) /
+ snp->srcu_parent = ssp->srcu_sup->level[level - 1] +
+ (snp - ssp->srcu_sup->level[level]) /
levelspread[level - 1];
}
@@ -214,7 +214,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
* leaves of the srcu_node tree.
*/
level = rcu_num_lvls - 1;
- snp_first = ssp->level[level];
+ snp_first = ssp->srcu_sup->level[level];
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
sdp->mynode = &snp_first[cpu / levelspread[level]];
@@ -225,7 +225,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
}
sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
}
- smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
return true;
}
@@ -236,36 +236,47 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
*/
static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
{
- ssp->srcu_size_state = SRCU_SIZE_SMALL;
- ssp->node = NULL;
- mutex_init(&ssp->srcu_cb_mutex);
- mutex_init(&ssp->srcu_gp_mutex);
+ if (!is_static)
+ ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL);
+ if (!ssp->srcu_sup)
+ return -ENOMEM;
+ if (!is_static)
+ spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
+ ssp->srcu_sup->node = NULL;
+ mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
+ mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
ssp->srcu_idx = 0;
- ssp->srcu_gp_seq = 0;
- ssp->srcu_barrier_seq = 0;
- mutex_init(&ssp->srcu_barrier_mutex);
- atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
- INIT_DELAYED_WORK(&ssp->work, process_srcu);
- ssp->sda_is_static = is_static;
+ ssp->srcu_sup->srcu_gp_seq = 0;
+ ssp->srcu_sup->srcu_barrier_seq = 0;
+ mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
+ INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
+ ssp->srcu_sup->sda_is_static = is_static;
if (!is_static)
ssp->sda = alloc_percpu(struct srcu_data);
- if (!ssp->sda)
+ if (!ssp->sda) {
+ if (!is_static)
+ kfree(ssp->srcu_sup);
return -ENOMEM;
+ }
init_srcu_struct_data(ssp);
- ssp->srcu_gp_seq_needed_exp = 0;
- ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
- if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
+ ssp->srcu_sup->srcu_gp_seq_needed_exp = 0;
+ ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
- if (!ssp->sda_is_static) {
+ if (!ssp->srcu_sup->sda_is_static) {
free_percpu(ssp->sda);
ssp->sda = NULL;
+ kfree(ssp->srcu_sup);
return -ENOMEM;
}
} else {
- WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG);
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
}
}
- smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
+ ssp->srcu_sup->srcu_ssp = ssp;
+ smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
return 0;
}
@@ -277,7 +288,6 @@ int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
lockdep_init_map(&ssp->dep_map, name, key, 0);
- spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -294,7 +304,6 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
*/
int init_srcu_struct(struct srcu_struct *ssp)
{
- spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -306,8 +315,8 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
*/
static void __srcu_transition_to_big(struct srcu_struct *ssp)
{
- lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
- smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC);
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
}
/*
@@ -318,15 +327,15 @@ static void srcu_transition_to_big(struct srcu_struct *ssp)
unsigned long flags;
/* Double-checked locking on ->srcu_size-state. */
- if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL)
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
return;
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) {
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
__srcu_transition_to_big(ssp);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -337,14 +346,14 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
{
unsigned long j;
- if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state)
+ if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state)
return;
j = jiffies;
- if (ssp->srcu_size_jiffies != j) {
- ssp->srcu_size_jiffies = j;
- ssp->srcu_n_lock_retries = 0;
+ if (ssp->srcu_sup->srcu_size_jiffies != j) {
+ ssp->srcu_sup->srcu_size_jiffies = j;
+ ssp->srcu_sup->srcu_n_lock_retries = 0;
}
- if (++ssp->srcu_n_lock_retries <= small_contention_lim)
+ if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim)
return;
__srcu_transition_to_big(ssp);
}
@@ -361,9 +370,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon
if (spin_trylock_irqsave_rcu_node(sdp, *flags))
return;
- spin_lock_irqsave_rcu_node(ssp, *flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
spin_lock_irqsave_check_contention(ssp);
- spin_unlock_irqrestore_rcu_node(ssp, *flags);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
spin_lock_irqsave_rcu_node(sdp, *flags);
}
@@ -375,9 +384,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon
*/
static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
{
- if (spin_trylock_irqsave_rcu_node(ssp, *flags))
+ if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
return;
- spin_lock_irqsave_rcu_node(ssp, *flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
spin_lock_irqsave_check_contention(ssp);
}
@@ -394,15 +403,15 @@ static void check_init_srcu_struct(struct srcu_struct *ssp)
unsigned long flags;
/* The smp_load_acquire() pairs with the smp_store_release(). */
- if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) {
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
init_srcu_struct_fields(ssp, true);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -607,17 +616,18 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
unsigned long gpstart;
unsigned long j;
unsigned long jbase = SRCU_INTERVAL;
+ struct srcu_usage *sup = ssp->srcu_sup;
- if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+ if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
jbase = 0;
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) {
+ if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
j = jiffies - 1;
- gpstart = READ_ONCE(ssp->srcu_gp_start);
+ gpstart = READ_ONCE(sup->srcu_gp_start);
if (time_after(j, gpstart))
jbase += j - gpstart;
if (!jbase) {
- WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
- if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
+ WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1);
+ if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
jbase = 1;
}
}
@@ -634,12 +644,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
int cpu;
+ struct srcu_usage *sup = ssp->srcu_sup;
if (WARN_ON(!srcu_get_delay(ssp)))
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
- flush_delayed_work(&ssp->work);
+ flush_delayed_work(&sup->work);
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
@@ -648,21 +659,23 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
return; /* Forgot srcu_barrier(), so just leak it! */
}
- if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
- WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) ||
+ if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) ||
WARN_ON(srcu_readers_active(ssp))) {
pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
- __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)),
- rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed);
+ __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)),
+ rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed);
return; /* Caller forgot to stop doing call_srcu()? */
}
- if (!ssp->sda_is_static) {
+ kfree(sup->node);
+ sup->node = NULL;
+ sup->srcu_size_state = SRCU_SIZE_SMALL;
+ if (!sup->sda_is_static) {
free_percpu(ssp->sda);
ssp->sda = NULL;
+ kfree(sup);
+ ssp->srcu_sup = NULL;
}
- kfree(ssp->node);
- ssp->node = NULL;
- ssp->srcu_size_state = SRCU_SIZE_SMALL;
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
@@ -760,23 +773,23 @@ static void srcu_gp_start(struct srcu_struct *ssp)
struct srcu_data *sdp;
int state;
- if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
sdp = this_cpu_ptr(ssp->sda);
- lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
- WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed));
spin_lock_rcu_node(sdp); /* Interrupts already disabled. */
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_gp_seq));
+ rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */
- WRITE_ONCE(ssp->srcu_gp_start, jiffies);
- WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0);
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
+ WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0);
smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
- rcu_seq_start(&ssp->srcu_gp_seq);
- state = rcu_seq_state(ssp->srcu_gp_seq);
+ rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq);
+ state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq);
WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
}
@@ -849,28 +862,29 @@ static void srcu_gp_end(struct srcu_struct *ssp)
unsigned long sgsne;
struct srcu_node *snp;
int ss_state;
+ struct srcu_usage *sup = ssp->srcu_sup;
/* Prevent more than one additional grace period. */
- mutex_lock(&ssp->srcu_cb_mutex);
+ mutex_lock(&sup->srcu_cb_mutex);
/* End the current grace period. */
- spin_lock_irq_rcu_node(ssp);
- idx = rcu_seq_state(ssp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(sup);
+ idx = rcu_seq_state(sup->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+ if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
cbdelay = 0;
- WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
- rcu_seq_end(&ssp->srcu_gp_seq);
- gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq);
- spin_unlock_irq_rcu_node(ssp);
- mutex_unlock(&ssp->srcu_gp_mutex);
+ WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns());
+ rcu_seq_end(&sup->srcu_gp_seq);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq);
+ spin_unlock_irq_rcu_node(sup);
+ mutex_unlock(&sup->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
/* Initiate callback invocation as needed. */
- ss_state = smp_load_acquire(&ssp->srcu_size_state);
+ ss_state = smp_load_acquire(&sup->srcu_size_state);
if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()),
cbdelay);
@@ -879,7 +893,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
srcu_for_each_node_breadth_first(ssp, snp) {
spin_lock_irq_rcu_node(snp);
cbs = false;
- last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
+ last_lvl = snp >= sup->level[rcu_num_lvls - 1];
if (last_lvl)
cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq;
snp->srcu_have_cbs[idx] = gpseq;
@@ -911,18 +925,18 @@ static void srcu_gp_end(struct srcu_struct *ssp)
}
/* Callback initiation done, allow grace periods after next. */
- mutex_unlock(&ssp->srcu_cb_mutex);
+ mutex_unlock(&sup->srcu_cb_mutex);
/* Start a new grace period if needed. */
- spin_lock_irq_rcu_node(ssp);
- gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(sup);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
if (!rcu_seq_state(gpseq) &&
- ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) {
+ ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) {
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(sup);
srcu_reschedule(ssp, 0);
} else {
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(sup);
}
/* Transition to big if needed. */
@@ -930,7 +944,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
if (ss_state == SRCU_SIZE_ALLOC)
init_srcu_struct_nodes(ssp, GFP_KERNEL);
else
- smp_store_release(&ssp->srcu_size_state, ss_state + 1);
+ smp_store_release(&sup->srcu_size_state, ss_state + 1);
}
}
@@ -950,7 +964,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
if (snp)
for (; snp != NULL; snp = snp->srcu_parent) {
sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
- if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) ||
+ if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) ||
(!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
return;
spin_lock_irqsave_rcu_node(snp, flags);
@@ -963,9 +977,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
spin_unlock_irqrestore_rcu_node(snp, flags);
}
spin_lock_irqsave_ssp_contention(ssp, &flags);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -990,9 +1004,10 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
struct srcu_node *snp;
struct srcu_node *snp_leaf;
unsigned long snp_seq;
+ struct srcu_usage *sup = ssp->srcu_sup;
/* Ensure that snp node tree is fully initialized before traversing it */
- if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
snp_leaf = NULL;
else
snp_leaf = sdp->mynode;
@@ -1000,7 +1015,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
if (snp_leaf)
/* Each pass through the loop does one level of the srcu_node tree. */
for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
- if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && snp != snp_leaf)
+ if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf)
return; /* GP already done and CBs recorded. */
spin_lock_irqsave_rcu_node(snp, flags);
snp_seq = snp->srcu_have_cbs[idx];
@@ -1027,20 +1042,20 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
/* Top of tree, must ensure the grace period will be started. */
spin_lock_irqsave_ssp_contention(ssp, &flags);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) {
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
* acquire setting up for initialization.
*/
- smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/
+ smp_store_release(&sup->srcu_gp_seq_needed, s); /*^^^*/
}
- if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
+ if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s);
/* If grace period not already in progress, start it. */
- if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) &&
- rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
- WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+ if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) &&
+ rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) {
+ WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed));
srcu_gp_start(ssp);
// And how can that list_add() in the "else" clause
@@ -1049,12 +1064,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
// can only be executed during early boot when there is only
// the one boot CPU running with interrupts still disabled.
if (likely(srcu_init_done))
- queue_delayed_work(rcu_gp_wq, &ssp->work,
+ queue_delayed_work(rcu_gp_wq, &sup->work,
!!srcu_get_delay(ssp));
- else if (list_empty(&ssp->work.work.entry))
- list_add(&ssp->work.work.entry, &srcu_boot_list);
+ else if (list_empty(&sup->work.work.entry))
+ list_add(&sup->work.work.entry, &srcu_boot_list);
}
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(sup, flags);
}
/*
@@ -1085,16 +1100,36 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
static void srcu_flip(struct srcu_struct *ssp)
{
/*
- * Ensure that if this updater saw a given reader's increment
- * from __srcu_read_lock(), that reader was using an old value
- * of ->srcu_idx. Also ensure that if a given reader sees the
- * new value of ->srcu_idx, this updater's earlier scans cannot
- * have seen that reader's increments (which is OK, because this
- * grace period need not wait on that reader).
+ * Because the flip of ->srcu_idx is executed only if the
+ * preceding call to srcu_readers_active_idx_check() found that
+ * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched
+ * and because that summing uses atomic_long_read(), there is
+ * ordering due to a control dependency between that summing and
+ * the WRITE_ONCE() in this call to srcu_flip(). This ordering
+ * ensures that if this updater saw a given reader's increment from
+ * __srcu_read_lock(), that reader was using a value of ->srcu_idx
+ * from before the previous call to srcu_flip(), which should be
+ * quite rare. This ordering thus helps forward progress because
+ * the grace period could otherwise be delayed by additional
+ * calls to __srcu_read_lock() using that old (soon to be new)
+ * value of ->srcu_idx.
+ *
+ * This sum-equality check and ordering also ensures that if
+ * a given call to __srcu_read_lock() uses the new value of
+ * ->srcu_idx, this updater's earlier scans cannot have seen
+ * that reader's increments, which is all to the good, because
+ * this grace period need not wait on that reader. After all,
+ * if those earlier scans had seen that reader, there would have
+ * been a sum mismatch and this code would not be reached.
+ *
+ * This means that the following smp_mb() is redundant, but
+ * it stays until either (1) Compilers learn about this sort of
+ * control dependency or (2) Some production workload running on
+ * a production system is unduly delayed by this slowpath smp_mb().
*/
smp_mb(); /* E */ /* Pairs with B and C. */
- WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter.
/*
* Ensure that if the updater misses an __srcu_read_unlock()
@@ -1154,18 +1189,18 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
/* First, see if enough time has passed since the last GP. */
t = ktime_get_mono_fast_ns();
- tlast = READ_ONCE(ssp->srcu_last_gp_end);
+ tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end);
if (exp_holdoff == 0 ||
time_in_range_open(t, tlast, tlast + exp_holdoff))
return false; /* Too soon after last GP. */
/* Next, check for probable idleness. */
- curseq = rcu_seq_current(&ssp->srcu_gp_seq);
+ curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
- if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed)))
+ if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed)))
return false; /* Grace period in progress, so not idle. */
smp_mb(); /* Order ->srcu_gp_seq with prior access. */
- if (curseq != rcu_seq_current(&ssp->srcu_gp_seq))
+ if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq))
return false; /* GP # changed, so not idle. */
return true; /* With reasonable probability, idle! */
}
@@ -1199,7 +1234,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
* sequence number cannot wrap around in the meantime.
*/
idx = __srcu_read_lock_nmisafe(ssp);
- ss_state = smp_load_acquire(&ssp->srcu_size_state);
+ ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
if (ss_state < SRCU_SIZE_WAIT_CALL)
sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
@@ -1208,8 +1243,8 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
if (rhp)
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
- s = rcu_seq_snap(&ssp->srcu_gp_seq);
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
sdp->srcu_gp_seq_needed = s;
@@ -1307,6 +1342,8 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
{
struct rcu_synchronize rcu;
+ srcu_lock_sync(&ssp->dep_map);
+
RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
@@ -1420,7 +1457,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
// Any prior manipulation of SRCU-protected data must happen
// before the load from ->srcu_gp_seq.
smp_mb();
- return rcu_seq_snap(&ssp->srcu_gp_seq);
+ return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
@@ -1467,7 +1504,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
*/
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
- if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie))
+ if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
return false;
// Ensure that the end of the SRCU grace period happens before
// any subsequent code that the caller might execute.
@@ -1486,8 +1523,8 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
ssp = sdp->ssp;
- if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
- complete(&ssp->srcu_barrier_completion);
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
}
/*
@@ -1501,13 +1538,13 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
{
spin_lock_irq_rcu_node(sdp);
- atomic_inc(&ssp->srcu_barrier_cpu_cnt);
+ atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
sdp->srcu_barrier_head.func = srcu_barrier_cb;
debug_rcu_head_queue(&sdp->srcu_barrier_head);
if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
&sdp->srcu_barrier_head)) {
debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
- atomic_dec(&ssp->srcu_barrier_cpu_cnt);
+ atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
}
spin_unlock_irq_rcu_node(sdp);
}
@@ -1520,23 +1557,23 @@ void srcu_barrier(struct srcu_struct *ssp)
{
int cpu;
int idx;
- unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);
+ unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq);
check_init_srcu_struct(ssp);
- mutex_lock(&ssp->srcu_barrier_mutex);
- if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) {
+ mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex);
+ if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) {
smp_mb(); /* Force ordering following return. */
- mutex_unlock(&ssp->srcu_barrier_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
return; /* Someone else did our work for us. */
}
- rcu_seq_start(&ssp->srcu_barrier_seq);
- init_completion(&ssp->srcu_barrier_completion);
+ rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq);
+ init_completion(&ssp->srcu_sup->srcu_barrier_completion);
/* Initial count prevents reaching zero until all CBs are posted. */
- atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1);
idx = __srcu_read_lock_nmisafe(ssp);
- if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id()));
else
for_each_possible_cpu(cpu)
@@ -1544,12 +1581,12 @@ void srcu_barrier(struct srcu_struct *ssp)
__srcu_read_unlock_nmisafe(ssp, idx);
/* Remove the initial count, at which point reaching zero can happen. */
- if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
- complete(&ssp->srcu_barrier_completion);
- wait_for_completion(&ssp->srcu_barrier_completion);
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
+ wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion);
- rcu_seq_end(&ssp->srcu_barrier_seq);
- mutex_unlock(&ssp->srcu_barrier_mutex);
+ rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq);
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(srcu_barrier);
@@ -1575,7 +1612,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
{
int idx;
- mutex_lock(&ssp->srcu_gp_mutex);
+ mutex_lock(&ssp->srcu_sup->srcu_gp_mutex);
/*
* Because readers might be delayed for an extended period after
@@ -1587,39 +1624,39 @@ static void srcu_advance_state(struct srcu_struct *ssp)
* The load-acquire ensures that we see the accesses performed
* by the prior grace period.
*/
- idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */
+ idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */
if (idx == SRCU_STATE_IDLE) {
- spin_lock_irq_rcu_node(ssp);
- if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
- WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq));
- spin_unlock_irq_rcu_node(ssp);
- mutex_unlock(&ssp->srcu_gp_mutex);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq));
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return;
}
- idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
+ idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq));
if (idx == SRCU_STATE_IDLE)
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (idx != SRCU_STATE_IDLE) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* Someone else started the grace period. */
}
}
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
idx = 1 ^ (ssp->srcu_idx & 1);
if (!try_check_zero(ssp, idx, 1)) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
}
srcu_flip(ssp);
- spin_lock_irq_rcu_node(ssp);
- rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
- ssp->srcu_n_exp_nodelay = 0;
- spin_unlock_irq_rcu_node(ssp);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2);
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
}
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
/*
* SRCU read-side critical sections are normally short,
@@ -1627,10 +1664,10 @@ static void srcu_advance_state(struct srcu_struct *ssp)
*/
idx = 1 ^ (ssp->srcu_idx & 1);
if (!try_check_zero(ssp, idx, 2)) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
}
- ssp->srcu_n_exp_nodelay = 0;
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */
}
}
@@ -1656,7 +1693,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
rcu_cblist_init(&ready_cbs);
spin_lock_irq_rcu_node(sdp);
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
spin_unlock_irq_rcu_node(sdp);
@@ -1684,7 +1721,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
spin_lock_irq_rcu_node(sdp);
rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_gp_seq));
+ rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
spin_unlock_irq_rcu_node(sdp);
@@ -1700,20 +1737,20 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
{
bool pushgp = true;
- spin_lock_irq_rcu_node(ssp);
- if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
- if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) {
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) {
/* All requests fulfilled, time to go idle. */
pushgp = false;
}
- } else if (!rcu_seq_state(ssp->srcu_gp_seq)) {
+ } else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) {
/* Outstanding request and no GP. Start one. */
srcu_gp_start(ssp);
}
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (pushgp)
- queue_delayed_work(rcu_gp_wq, &ssp->work, delay);
+ queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay);
}
/*
@@ -1724,22 +1761,24 @@ static void process_srcu(struct work_struct *work)
unsigned long curdelay;
unsigned long j;
struct srcu_struct *ssp;
+ struct srcu_usage *sup;
- ssp = container_of(work, struct srcu_struct, work.work);
+ sup = container_of(work, struct srcu_usage, work.work);
+ ssp = sup->srcu_ssp;
srcu_advance_state(ssp);
curdelay = srcu_get_delay(ssp);
if (curdelay) {
- WRITE_ONCE(ssp->reschedule_count, 0);
+ WRITE_ONCE(sup->reschedule_count, 0);
} else {
j = jiffies;
- if (READ_ONCE(ssp->reschedule_jiffies) == j) {
- WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
- if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
+ if (READ_ONCE(sup->reschedule_jiffies) == j) {
+ WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1);
+ if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay)
curdelay = 1;
} else {
- WRITE_ONCE(ssp->reschedule_count, 1);
- WRITE_ONCE(ssp->reschedule_jiffies, j);
+ WRITE_ONCE(sup->reschedule_count, 1);
+ WRITE_ONCE(sup->reschedule_jiffies, j);
}
}
srcu_reschedule(ssp, curdelay);
@@ -1752,7 +1791,7 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
if (test_type != SRCU_FLAVOR)
return;
*flags = 0;
- *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq);
+ *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
@@ -1774,14 +1813,14 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
int cpu;
int idx;
unsigned long s0 = 0, s1 = 0;
- int ss_state = READ_ONCE(ssp->srcu_size_state);
+ int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
int ss_state_idx = ss_state;
idx = ssp->srcu_idx & 0x1;
if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
- tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state,
+ tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state,
srcu_size_state_name[ss_state_idx]);
if (!ssp->sda) {
// Called after cleanup_srcu_struct(), perhaps.
@@ -1838,7 +1877,7 @@ early_initcall(srcu_bootup_announce);
void __init srcu_init(void)
{
- struct srcu_struct *ssp;
+ struct srcu_usage *sup;
/* Decide on srcu_struct-size strategy. */
if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) {
@@ -1858,12 +1897,13 @@ void __init srcu_init(void)
*/
srcu_init_done = true;
while (!list_empty(&srcu_boot_list)) {
- ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
+ sup = list_first_entry(&srcu_boot_list, struct srcu_usage,
work.work.entry);
- list_del_init(&ssp->work.work.entry);
- if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL)
- ssp->srcu_size_state = SRCU_SIZE_ALLOC;
- queue_work(rcu_gp_wq, &ssp->work.work);
+ list_del_init(&sup->work.work.entry);
+ if (SRCU_SIZING_IS(SRCU_SIZING_INIT) &&
+ sup->srcu_size_state == SRCU_SIZE_SMALL)
+ sup->srcu_size_state = SRCU_SIZE_ALLOC;
+ queue_work(rcu_gp_wq, &sup->work.work);
}
}
@@ -1873,13 +1913,14 @@ void __init srcu_init(void)
static int srcu_module_coming(struct module *mod)
{
int i;
+ struct srcu_struct *ssp;
struct srcu_struct **sspp = mod->srcu_struct_ptrs;
- int ret;
for (i = 0; i < mod->num_srcu_structs; i++) {
- ret = init_srcu_struct(*(sspp++));
- if (WARN_ON_ONCE(ret))
- return ret;
+ ssp = *(sspp++);
+ ssp->sda = alloc_percpu(struct srcu_data);
+ if (WARN_ON_ONCE(!ssp->sda))
+ return -ENOMEM;
}
return 0;
}
@@ -1888,10 +1929,17 @@ static int srcu_module_coming(struct module *mod)
static void srcu_module_going(struct module *mod)
{
int i;
+ struct srcu_struct *ssp;
struct srcu_struct **sspp = mod->srcu_struct_ptrs;
- for (i = 0; i < mod->num_srcu_structs; i++)
- cleanup_srcu_struct(*(sspp++));
+ for (i = 0; i < mod->num_srcu_structs; i++) {
+ ssp = *(sspp++);
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) &&
+ !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static))
+ cleanup_srcu_struct(ssp);
+ if (!WARN_ON(srcu_readers_active(ssp)))
+ free_percpu(ssp->sda);
+ }
}
/* Handle one module, either coming or going. */
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index bfb5e1549f2b..5f4fc8184dd0 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -136,8 +136,16 @@ static struct rcu_tasks rt_name = \
.kname = #rt_name, \
}
+#ifdef CONFIG_TASKS_RCU
/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
+#endif
+
+#ifdef CONFIG_TASKS_RCU
+/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
+static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
+#endif
/* Avoid IPIing CPUs early in the grace period. */
#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0)
@@ -830,6 +838,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
/* Processing between scanning taskslist and draining the holdout list. */
static void rcu_tasks_postscan(struct list_head *hop)
{
+ int rtsi = READ_ONCE(rcu_task_stall_info);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU)) {
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+ }
+
/*
* Exiting tasks may escape the tasklist scan. Those are vulnerable
* until their final schedule() with TASK_DEAD state. To cope with
@@ -848,6 +863,9 @@ static void rcu_tasks_postscan(struct list_head *hop)
* call to synchronize_rcu().
*/
synchronize_srcu(&tasks_rcu_exit_srcu);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU))
+ del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
}
/* See if tasks are still holding out, complain if so. */
@@ -923,6 +941,21 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
+{
+#ifndef CONFIG_TINY_RCU
+ int rtsi;
+
+ rtsi = READ_ONCE(rcu_task_stall_info);
+ pr_info("%s: %s grace period number %lu (since boot) gp_state: %s is %lu jiffies old.\n",
+ __func__, rcu_tasks.kname, rcu_tasks.tasks_gp_seq,
+ tasks_gp_state_getname(&rcu_tasks), jiffies - rcu_tasks.gp_jiffies);
+ pr_info("Please check any exiting tasks stuck between calls to exit_tasks_rcu_start() and exit_tasks_rcu_finish()\n");
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+#endif // #ifndef CONFIG_TINY_RCU
+}
+
/**
* call_rcu_tasks() - Queue an RCU for invocation task-based grace period
* @rhp: structure to be used for queueing the RCU updates.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8e880c09ab59..f52ff7241041 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -640,6 +640,7 @@ void __rcu_irq_enter_check_tick(void)
}
raw_spin_unlock_rcu_node(rdp->mynode);
}
+NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick);
#endif /* CONFIG_NO_HZ_FULL */
/*
@@ -1955,7 +1956,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
- bool needwake = false;
bool needacc = false;
struct rcu_node *rnp;
@@ -1987,7 +1987,12 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
* NOCB kthreads have their own way to deal with that...
*/
if (!rcu_rdp_is_offloaded(rdp)) {
- needwake = rcu_accelerate_cbs(rnp, rdp);
+ /*
+ * The current GP has not yet ended, so it
+ * should not be possible for rcu_accelerate_cbs()
+ * to return true. So complain, but don't awaken.
+ */
+ WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp));
} else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
/*
* ...but NOCB kthreads may miss or delay callbacks acceleration
@@ -1999,8 +2004,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
- if (needwake)
- rcu_gp_kthread_wake();
if (needacc) {
rcu_nocb_lock_irqsave(rdp, flags);
@@ -2131,6 +2134,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
break;
}
} else {
+ // In rcuoc context, so no worries about depriving
+ // other softirq vectors of CPU cycles.
local_bh_enable();
lockdep_assert_irqs_enabled();
cond_resched_tasks_rcu_qs();
@@ -3024,6 +3029,18 @@ need_offload_krc(struct kfree_rcu_cpu *krcp)
return !!READ_ONCE(krcp->head);
}
+static bool
+need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
+{
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krwp->bulk_head_free[i]))
+ return true;
+
+ return !!krwp->head_free;
+}
+
static int krc_count(struct kfree_rcu_cpu *krcp)
{
int sum = atomic_read(&krcp->head_count);
@@ -3107,15 +3124,14 @@ static void kfree_rcu_monitor(struct work_struct *work)
for (i = 0; i < KFREE_N_BATCHES; i++) {
struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
- // Try to detach bulk_head or head and attach it over any
- // available corresponding free channel. It can be that
- // a previous RCU batch is in progress, it means that
- // immediately to queue another one is not possible so
- // in that case the monitor work is rearmed.
- if ((!list_empty(&krcp->bulk_head[0]) && list_empty(&krwp->bulk_head_free[0])) ||
- (!list_empty(&krcp->bulk_head[1]) && list_empty(&krwp->bulk_head_free[1])) ||
- (READ_ONCE(krcp->head) && !krwp->head_free)) {
+ // Try to detach bulk_head or head and attach it, only when
+ // all channels are free. Any channel is not free means at krwp
+ // there is on-going rcu work to handle krwp's free business.
+ if (need_wait_for_krwp_work(krwp))
+ continue;
+ // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
+ if (need_offload_krc(krcp)) {
// Channel 1 corresponds to the SLAB-pointer bulk path.
// Channel 2 corresponds to vmalloc-pointer bulk path.
for (j = 0; j < FREE_N_CHANNELS; j++) {
@@ -4940,9 +4956,8 @@ void __init rcu_init(void)
else
qovld_calc = qovld;
- // Kick-start any polled grace periods that started early.
- if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))
- (void)start_poll_synchronize_rcu_expedited();
+ // Kick-start in case any polled grace periods started early.
+ (void)start_poll_synchronize_rcu_expedited();
rcu_test_sync_prims();
}
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 249c2967d9e6..3b7abb58157d 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -594,6 +594,7 @@ static void synchronize_rcu_expedited_wait(void)
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_node *rnp_root = rcu_get_root();
+ unsigned long flags;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
jiffies_stall = rcu_exp_jiffies_till_stall_check();
@@ -602,17 +603,17 @@ static void synchronize_rcu_expedited_wait(void)
if (synchronize_rcu_expedited_wait_once(1))
return;
rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
mask = READ_ONCE(rnp->expmask);
for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->rcu_forced_tick_exp)
continue;
rdp->rcu_forced_tick_exp = true;
- preempt_disable();
if (cpu_online(cpu))
tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
- preempt_enable();
}
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
j = READ_ONCE(jiffies_till_first_fqs);
if (synchronize_rcu_expedited_wait_once(j + HZ))
@@ -802,9 +803,11 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
int ndetected = 0;
struct task_struct *t;
- if (!READ_ONCE(rnp->exp_tasks))
- return 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!rnp->exp_tasks) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return 0;
+ }
t = list_entry(rnp->exp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
@@ -1065,9 +1068,10 @@ unsigned long start_poll_synchronize_rcu_expedited(void)
if (rcu_init_invoked())
raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
if (!poll_state_synchronize_rcu(s)) {
- rnp->exp_seq_poll_rq = s;
- if (rcu_init_invoked())
+ if (rcu_init_invoked()) {
+ rnp->exp_seq_poll_rq = s;
queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
+ }
}
if (rcu_init_invoked())
raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 9e1c8caec5ce..f2280616f9d5 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1312,6 +1312,7 @@ int rcu_nocb_cpu_offload(int cpu)
}
EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
+#ifdef CONFIG_RCU_LAZY
static unsigned long
lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
@@ -1360,6 +1361,7 @@ static struct shrinker lazy_rcu_shrinker = {
.batch = 0,
.seeks = DEFAULT_SEEKS,
};
+#endif // #ifdef CONFIG_RCU_LAZY
void __init rcu_init_nohz(void)
{
@@ -1391,8 +1393,10 @@ void __init rcu_init_nohz(void)
if (!rcu_state.nocb_is_setup)
return;
+#ifdef CONFIG_RCU_LAZY
if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy"))
pr_err("Failed to register lazy_rcu shrinker!\n");
+#endif // #ifdef CONFIG_RCU_LAZY
if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6986ea31c984..5f6587d94c1d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10238,6 +10238,16 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
sds->total_capacity;
+
+ /*
+ * If the local group is more loaded than the average system
+ * load, don't try to pull any tasks.
+ */
+ if (local->avg_load >= sds->avg_load) {
+ env->imbalance = 0;
+ return;
+ }
+
}
/*
diff --git a/kernel/signal.c b/kernel/signal.c
index 8cb28f1df294..8f6330f0e9ca 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1003,8 +1003,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
/*
* Now find a thread we can wake up to take the signal off the queue.
*
- * If the main thread wants the signal, it gets first crack.
- * Probably the least surprising to the average bear.
+ * Try the suggested task first (may or may not be the main thread).
*/
if (wants_signal(sig, p))
t = p;
@@ -1970,8 +1969,24 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
ret = -1;
rcu_read_lock();
+
+ /*
+ * This function is used by POSIX timers to deliver a timer signal.
+ * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID
+ * set), the signal must be delivered to the specific thread (queues
+ * into t->pending).
+ *
+ * Where type is not PIDTYPE_PID, signals must be delivered to the
+ * process. In this case, prefer to deliver to current if it is in
+ * the same thread group as the target process, which avoids
+ * unnecessarily waking up a potentially idle task.
+ */
t = pid_task(pid, type);
- if (!t || !likely(lock_task_sighand(t, &flags)))
+ if (!t)
+ goto ret;
+ if (type != PIDTYPE_PID && same_thread_group(t, current))
+ t = current;
+ if (!likely(lock_task_sighand(t, &flags)))
goto ret;
ret = 1; /* the signal is ignored */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c8a6913c067d..1b725510dd0f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -793,10 +793,15 @@ static void tasklet_action_common(struct softirq_action *a,
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
if (tasklet_clear_sched(t)) {
- if (t->use_callback)
+ if (t->use_callback) {
+ trace_tasklet_entry(t, t->callback);
t->callback(t);
- else
+ trace_tasklet_exit(t, t->callback);
+ } else {
+ trace_tasklet_entry(t, t->func);
t->func(t->data);
+ trace_tasklet_exit(t, t->func);
+ }
}
tasklet_unlock(t);
continue;
diff --git a/kernel/sys.c b/kernel/sys.c
index 495cd87d9bf4..351de7916302 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -664,6 +664,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
struct cred *new;
int retval;
kuid_t kruid, keuid, ksuid;
+ bool ruid_new, euid_new, suid_new;
kruid = make_kuid(ns, ruid);
keuid = make_kuid(ns, euid);
@@ -678,25 +679,29 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if ((suid != (uid_t) -1) && !uid_valid(ksuid))
return -EINVAL;
+ old = current_cred();
+
+ /* check for no-op */
+ if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) &&
+ (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) &&
+ uid_eq(keuid, old->fsuid))) &&
+ (suid == (uid_t) -1 || uid_eq(ksuid, old->suid)))
+ return 0;
+
+ ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
+ !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid);
+ euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
+ !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid);
+ suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
+ !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid);
+ if ((ruid_new || euid_new || suid_new) &&
+ !ns_capable_setid(old->user_ns, CAP_SETUID))
+ return -EPERM;
+
new = prepare_creds();
if (!new)
return -ENOMEM;
- old = current_cred();
-
- retval = -EPERM;
- if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
- if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
- !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
- goto error;
- if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
- !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
- goto error;
- if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
- !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
- goto error;
- }
-
if (ruid != (uid_t) -1) {
new->uid = kruid;
if (!uid_eq(kruid, old->uid)) {
@@ -761,6 +766,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
struct cred *new;
int retval;
kgid_t krgid, kegid, ksgid;
+ bool rgid_new, egid_new, sgid_new;
krgid = make_kgid(ns, rgid);
kegid = make_kgid(ns, egid);
@@ -773,23 +779,28 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
return -EINVAL;
+ old = current_cred();
+
+ /* check for no-op */
+ if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) &&
+ (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) &&
+ gid_eq(kegid, old->fsgid))) &&
+ (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid)))
+ return 0;
+
+ rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
+ !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid);
+ egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
+ !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid);
+ sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
+ !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid);
+ if ((rgid_new || egid_new || sgid_new) &&
+ !ns_capable_setid(old->user_ns, CAP_SETGID))
+ return -EPERM;
+
new = prepare_creds();
if (!new)
return -ENOMEM;
- old = current_cred();
-
- retval = -EPERM;
- if (!ns_capable_setid(old->user_ns, CAP_SETGID)) {
- if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
- !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
- goto error;
- if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
- !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
- goto error;
- if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
- !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
- goto error;
- }
if (rgid != (gid_t) -1)
new->gid = krgid;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2f5e9b34022c..e9c6f9d0e42c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -846,6 +846,8 @@ static u64 collect_timerqueue(struct timerqueue_head *head,
return expires;
ctmr->firing = 1;
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(ctmr->handling, current);
cpu_timer_dequeue(ctmr);
list_add_tail(&ctmr->elist, firing);
}
@@ -1161,7 +1163,49 @@ static void handle_posix_cpu_timers(struct task_struct *tsk);
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
static void posix_cpu_timers_work(struct callback_head *work)
{
+ struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);
+
+ mutex_lock(&cw->mutex);
handle_posix_cpu_timers(current);
+ mutex_unlock(&cw->mutex);
+}
+
+/*
+ * Invoked from the posix-timer core when a cancel operation failed because
+ * the timer is marked firing. The caller holds rcu_read_lock(), which
+ * protects the timer and the task which is expiring it from being freed.
+ */
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);
+
+ /* Has the handling task completed expiry already? */
+ if (!tsk)
+ return;
+
+ /* Ensure that the task cannot go away */
+ get_task_struct(tsk);
+ /* Now drop the RCU protection so the mutex can be locked */
+ rcu_read_unlock();
+ /* Wait on the expiry mutex */
+ mutex_lock(&tsk->posix_cputimers_work.mutex);
+ /* Release it immediately again. */
+ mutex_unlock(&tsk->posix_cputimers_work.mutex);
+ /* Drop the task reference. */
+ put_task_struct(tsk);
+ /* Relock RCU so the callsite is balanced */
+ rcu_read_lock();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ /* Ensure that timr->it.cpu.handling task cannot go away */
+ rcu_read_lock();
+ spin_unlock_irq(&timr->it_lock);
+ posix_cpu_timer_wait_running(timr);
+ rcu_read_unlock();
+ /* @timr is on stack and is valid */
+ spin_lock_irq(&timr->it_lock);
}
/*
@@ -1177,6 +1221,7 @@ void clear_posix_cputimers_work(struct task_struct *p)
sizeof(p->posix_cputimers_work.work));
init_task_work(&p->posix_cputimers_work.work,
posix_cpu_timers_work);
+ mutex_init(&p->posix_cputimers_work.mutex);
p->posix_cputimers_work.scheduled = false;
}
@@ -1255,6 +1300,18 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk)
lockdep_posixtimer_exit();
}
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ cpu_relax();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ spin_unlock_irq(&timr->it_lock);
+ cpu_relax();
+ spin_lock_irq(&timr->it_lock);
+}
+
static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
return false;
@@ -1363,6 +1420,8 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
*/
if (likely(cpu_firing >= 0))
cpu_timer_fire(timer);
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(timer->it.cpu.handling, NULL);
spin_unlock(&timer->it_lock);
}
}
@@ -1497,23 +1556,16 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
expires = cpu_timer_getexpires(&timer.it.cpu);
error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
if (!error) {
- /*
- * Timer is now unarmed, deletion can not fail.
- */
+ /* Timer is now unarmed, deletion can not fail. */
posix_cpu_timer_del(&timer);
+ } else {
+ while (error == TIMER_RETRY) {
+ posix_cpu_timer_wait_running_nsleep(&timer);
+ error = posix_cpu_timer_del(&timer);
+ }
}
- spin_unlock_irq(&timer.it_lock);
- while (error == TIMER_RETRY) {
- /*
- * We need to handle case when timer was or is in the
- * middle of firing. In other cases we already freed
- * resources.
- */
- spin_lock_irq(&timer.it_lock);
- error = posix_cpu_timer_del(&timer);
- spin_unlock_irq(&timer.it_lock);
- }
+ spin_unlock_irq(&timer.it_lock);
if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
/*
@@ -1623,6 +1675,7 @@ const struct k_clock clock_posix_cpu = {
.timer_del = posix_cpu_timer_del,
.timer_get = posix_cpu_timer_get,
.timer_rearm = posix_cpu_timer_rearm,
+ .timer_wait_running = posix_cpu_timer_wait_running,
};
const struct k_clock clock_process = {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0c8a87a11b39..808a247205a9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -846,6 +846,10 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer,
rcu_read_lock();
unlock_timer(timer, *flags);
+ /*
+ * kc->timer_wait_running() might drop RCU lock. So @timer
+ * cannot be touched anymore after the function returns!
+ */
if (!WARN_ON_ONCE(!kc->timer_wait_running))
kc->timer_wait_running(timer);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 46789356f856..65b8658da829 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -218,9 +218,19 @@ static void tick_setup_device(struct tick_device *td,
* this cpu:
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
+ ktime_t next_p;
+ u32 rem;
+
tick_do_timer_cpu = cpu;
- tick_next_period = ktime_get();
+ next_p = ktime_get();
+ div_u64_rem(next_p, TICK_NSEC, &rem);
+ if (rem) {
+ next_p -= rem;
+ next_p += TICK_NSEC;
+ }
+
+ tick_next_period = next_p;
#ifdef CONFIG_NO_HZ_FULL
/*
* The boot CPU may be nohz_full, in which case set
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b0e3c9205946..52254679ec48 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -281,6 +281,11 @@ static bool check_tick_dependency(atomic_t *dep)
return true;
}
+ if (val & TICK_DEP_MASK_RCU_EXP) {
+ trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP);
+ return true;
+ }
+
return false;
}
@@ -527,7 +532,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
tick_nohz_full_running = true;
}
-static int tick_nohz_cpu_down(unsigned int cpu)
+bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
{
/*
* The tick_do_timer_cpu CPU handles housekeeping duty (unbound
@@ -535,8 +540,13 @@ static int tick_nohz_cpu_down(unsigned int cpu)
* CPUs. It must remain online when nohz full is enabled.
*/
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
- return -EBUSY;
- return 0;
+ return false;
+ return true;
+}
+
+static int tick_nohz_cpu_down(unsigned int cpu)
+{
+ return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY;
}
void __init tick_nohz_init(void)
@@ -637,43 +647,67 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog_sched();
}
-/*
- * Updates the per-CPU time idle statistics counters
- */
-static void
-update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
ktime_t delta;
- if (ts->idle_active) {
- delta = ktime_sub(now, ts->idle_entrytime);
- if (nr_iowait_cpu(cpu) > 0)
- ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
- else
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
- ts->idle_entrytime = now;
- }
+ if (WARN_ON_ONCE(!ts->idle_active))
+ return;
- if (last_update_time)
- *last_update_time = ktime_to_us(now);
+ delta = ktime_sub(now, ts->idle_entrytime);
-}
+ write_seqcount_begin(&ts->idle_sleeptime_seq);
+ if (nr_iowait_cpu(smp_processor_id()) > 0)
+ ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+ else
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
-{
- update_ts_time_stats(smp_processor_id(), ts, now, NULL);
+ ts->idle_entrytime = now;
ts->idle_active = 0;
+ write_seqcount_end(&ts->idle_sleeptime_seq);
sched_clock_idle_wakeup_event();
}
static void tick_nohz_start_idle(struct tick_sched *ts)
{
+ write_seqcount_begin(&ts->idle_sleeptime_seq);
ts->idle_entrytime = ktime_get();
ts->idle_active = 1;
+ write_seqcount_end(&ts->idle_sleeptime_seq);
+
sched_clock_idle_sleep_event();
}
+static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
+ bool compute_delta, u64 *last_update_time)
+{
+ ktime_t now, idle;
+ unsigned int seq;
+
+ if (!tick_nohz_active)
+ return -1;
+
+ now = ktime_get();
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);
+
+ do {
+ seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
+
+ if (ts->idle_active && compute_delta) {
+ ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+ idle = ktime_add(*sleeptime, delta);
+ } else {
+ idle = *sleeptime;
+ }
+ } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
+
+ return ktime_to_us(idle);
+
+}
+
/**
* get_cpu_idle_time_us - get the total idle time of a CPU
* @cpu: CPU number to query
@@ -681,7 +715,10 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
* counters if NULL.
*
* Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
@@ -691,27 +728,9 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, idle;
-
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- idle = ts->idle_sleeptime;
- } else {
- if (ts->idle_active && !nr_iowait_cpu(cpu)) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
- idle = ktime_add(ts->idle_sleeptime, delta);
- } else {
- idle = ts->idle_sleeptime;
- }
- }
-
- return ktime_to_us(idle);
+ return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
+ !nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
@@ -722,7 +741,10 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
* counters if NULL.
*
* Return the cumulative iowait time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
@@ -732,26 +754,9 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, iowait;
-
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- iowait = ts->iowait_sleeptime;
- } else {
- if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
- iowait = ktime_add(ts->iowait_sleeptime, delta);
- } else {
- iowait = ts->iowait_sleeptime;
- }
- }
-
- return ktime_to_us(iowait);
+ return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
+ nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
@@ -1084,10 +1089,16 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return true;
}
-static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
+/**
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ */
+void tick_nohz_idle_stop_tick(void)
{
- ktime_t expires;
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
int cpu = smp_processor_id();
+ ktime_t expires;
/*
* If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
@@ -1119,16 +1130,6 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
}
}
-/**
- * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
- *
- * When the next event is more than a tick into the future, stop the idle tick
- */
-void tick_nohz_idle_stop_tick(void)
-{
- __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
-}
-
void tick_nohz_idle_retain_tick(void)
{
tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 504649513399..5ed5a9d41d5a 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -22,65 +22,82 @@ enum tick_nohz_mode {
/**
* struct tick_sched - sched tick emulation and no idle tick control/stats
- * @sched_timer: hrtimer to schedule the periodic tick in high
- * resolution mode
- * @check_clocks: Notification mechanism about clocksource changes
- * @nohz_mode: Mode - one state of tick_nohz_mode
+ *
* @inidle: Indicator that the CPU is in the tick idle mode
* @tick_stopped: Indicator that the idle tick has been stopped
* @idle_active: Indicator that the CPU is actively in the tick idle mode;
* it is reset during irq handling phases.
- * @do_timer_lst: CPU was the last one doing do_timer before going idle
+ * @do_timer_last: CPU was the last one doing do_timer before going idle
* @got_idle_tick: Tick timer function has run with @inidle set
+ * @stalled_jiffies: Number of stalled jiffies detected across ticks
+ * @last_tick_jiffies: Value of jiffies seen on last tick
+ * @sched_timer: hrtimer to schedule the periodic tick in high
+ * resolution mode
* @last_tick: Store the last tick expiry time when the tick
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
* @next_tick: Next tick to be fired when in dynticks mode.
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_entrytime: Time when the idle call was entered
+ * @nohz_mode: Mode - one state of tick_nohz_mode
+ * @last_jiffies: Base jiffies snapshot when next event was last computed
+ * @timer_expires_base: Base time clock monotonic for @timer_expires
+ * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
+ * @next_timer: Expiry time of next expiring timer for debugging purpose only
+ * @idle_expires: Next tick in idle, for debugging purpose only
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
- * @idle_entrytime: Time when the idle call was entered
- * @idle_waketime: Time when the idle was interrupted
* @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
* @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
- * @timer_expires_base: Base time clock monotonic for @timer_expires
- * @next_timer: Expiry time of next expiring timer for debugging purpose only
* @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
- * @last_tick_jiffies: Value of jiffies seen on last tick
- * @stalled_jiffies: Number of stalled jiffies detected across ticks
+ * @check_clocks: Notification mechanism about clocksource changes
*/
struct tick_sched {
- struct hrtimer sched_timer;
- unsigned long check_clocks;
- enum tick_nohz_mode nohz_mode;
-
+ /* Common flags */
unsigned int inidle : 1;
unsigned int tick_stopped : 1;
unsigned int idle_active : 1;
unsigned int do_timer_last : 1;
unsigned int got_idle_tick : 1;
+ /* Tick handling: jiffies stall check */
+ unsigned int stalled_jiffies;
+ unsigned long last_tick_jiffies;
+
+ /* Tick handling */
+ struct hrtimer sched_timer;
ktime_t last_tick;
ktime_t next_tick;
unsigned long idle_jiffies;
- unsigned long idle_calls;
- unsigned long idle_sleeps;
- ktime_t idle_entrytime;
ktime_t idle_waketime;
- ktime_t idle_exittime;
- ktime_t idle_sleeptime;
- ktime_t iowait_sleeptime;
+
+ /* Idle entry */
+ seqcount_t idle_sleeptime_seq;
+ ktime_t idle_entrytime;
+
+ /* Tick stop */
+ enum tick_nohz_mode nohz_mode;
unsigned long last_jiffies;
- u64 timer_expires;
u64 timer_expires_base;
+ u64 timer_expires;
u64 next_timer;
ktime_t idle_expires;
+ unsigned long idle_calls;
+ unsigned long idle_sleeps;
+
+ /* Idle exit */
+ ktime_t idle_exittime;
+ ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
+
+ /* Full dynticks handling */
atomic_t tick_dep_mask;
- unsigned long last_tick_jiffies;
- unsigned int stalled_jiffies;
+
+ /* Clocksource changes */
+ unsigned long check_clocks;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a856d4a34c67..5b1e7fa41ca8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -257,7 +257,7 @@ config DYNAMIC_FTRACE_WITH_REGS
config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
def_bool y
- depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
config DYNAMIC_FTRACE_WITH_CALL_OPS
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e8da032bb6fc..bcf91bc7bf71 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1453,10 +1453,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
NULL : &bpf_probe_read_compat_str_proto;
#endif
#ifdef CONFIG_CGROUPS
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_get_current_ancestor_cgroup_id:
- return &bpf_get_current_ancestor_cgroup_id_proto;
case BPF_FUNC_cgrp_storage_get:
return &bpf_cgrp_storage_get_proto;
case BPF_FUNC_cgrp_storage_delete:
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0feea145bb29..3b46dba3f69b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2583,28 +2583,13 @@ ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
- unsigned long addr;
+ unsigned long addr = READ_ONCE(ops->direct_call);
- addr = ftrace_find_rec_direct(ip);
if (!addr)
return;
arch_ftrace_set_direct_caller(fregs, addr);
}
-
-static struct ftrace_ops direct_ops = {
- .func = call_direct_funcs,
- .flags = FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
- | FTRACE_OPS_FL_PERMANENT,
- /*
- * By declaring the main trampoline as this trampoline
- * it will never have one allocated for it. Allocated
- * trampolines should not call direct functions.
- * The direct_ops should only be called by the builtin
- * ftrace_regs_caller trampoline.
- */
- .trampoline = FTRACE_REGS_ADDR,
-};
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
@@ -5301,388 +5286,9 @@ struct ftrace_direct_func {
static LIST_HEAD(ftrace_direct_funcs);
-/**
- * ftrace_find_direct_func - test an address if it is a registered direct caller
- * @addr: The address of a registered direct caller
- *
- * This searches to see if a ftrace direct caller has been registered
- * at a specific address, and if so, it returns a descriptor for it.
- *
- * This can be used by architecture code to see if an address is
- * a direct caller (trampoline) attached to a fentry/mcount location.
- * This is useful for the function_graph tracer, as it may need to
- * do adjustments if it traced a location that also has a direct
- * trampoline attached to it.
- */
-struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)
-{
- struct ftrace_direct_func *entry;
- bool found = false;
-
- /* May be called by fgraph trampoline (protected by rcu tasks) */
- list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) {
- if (entry->addr == addr) {
- found = true;
- break;
- }
- }
- if (found)
- return entry;
-
- return NULL;
-}
-
-static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr)
-{
- struct ftrace_direct_func *direct;
-
- direct = kmalloc(sizeof(*direct), GFP_KERNEL);
- if (!direct)
- return NULL;
- direct->addr = addr;
- direct->count = 0;
- list_add_rcu(&direct->next, &ftrace_direct_funcs);
- ftrace_direct_func_count++;
- return direct;
-}
-
static int register_ftrace_function_nolock(struct ftrace_ops *ops);
-/**
- * register_ftrace_direct - Call a custom trampoline directly
- * @ip: The address of the nop at the beginning of a function
- * @addr: The address of the trampoline to call at @ip
- *
- * This is used to connect a direct call from the nop location (@ip)
- * at the start of ftrace traced functions. The location that it calls
- * (@addr) must be able to handle a direct call, and save the parameters
- * of the function being traced, and restore them (or inject new ones
- * if needed), before returning.
- *
- * Returns:
- * 0 on success
- * -EBUSY - Another direct function is already attached (there can be only one)
- * -ENODEV - @ip does not point to a ftrace nop location (or not supported)
- * -ENOMEM - There was an allocation failure.
- */
-int register_ftrace_direct(unsigned long ip, unsigned long addr)
-{
- struct ftrace_direct_func *direct;
- struct ftrace_func_entry *entry;
- struct ftrace_hash *free_hash = NULL;
- struct dyn_ftrace *rec;
- int ret = -ENODEV;
-
- mutex_lock(&direct_mutex);
-
- ip = ftrace_location(ip);
- if (!ip)
- goto out_unlock;
-
- /* See if there's a direct function at @ip already */
- ret = -EBUSY;
- if (ftrace_find_rec_direct(ip))
- goto out_unlock;
-
- ret = -ENODEV;
- rec = lookup_rec(ip, ip);
- if (!rec)
- goto out_unlock;
-
- /*
- * Check if the rec says it has a direct call but we didn't
- * find one earlier?
- */
- if (WARN_ON(rec->flags & FTRACE_FL_DIRECT))
- goto out_unlock;
-
- /* Make sure the ip points to the exact record */
- if (ip != rec->ip) {
- ip = rec->ip;
- /* Need to check this ip for a direct. */
- if (ftrace_find_rec_direct(ip))
- goto out_unlock;
- }
-
- ret = -ENOMEM;
- direct = ftrace_find_direct_func(addr);
- if (!direct) {
- direct = ftrace_alloc_direct_func(addr);
- if (!direct)
- goto out_unlock;
- }
-
- entry = ftrace_add_rec_direct(ip, addr, &free_hash);
- if (!entry)
- goto out_unlock;
-
- ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
-
- if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
- ret = register_ftrace_function_nolock(&direct_ops);
- if (ret)
- ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
- }
-
- if (ret) {
- remove_hash_entry(direct_functions, entry);
- kfree(entry);
- if (!direct->count) {
- list_del_rcu(&direct->next);
- synchronize_rcu_tasks();
- kfree(direct);
- if (free_hash)
- free_ftrace_hash(free_hash);
- free_hash = NULL;
- ftrace_direct_func_count--;
- }
- } else {
- direct->count++;
- }
- out_unlock:
- mutex_unlock(&direct_mutex);
-
- if (free_hash) {
- synchronize_rcu_tasks();
- free_ftrace_hash(free_hash);
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(register_ftrace_direct);
-
-static struct ftrace_func_entry *find_direct_entry(unsigned long *ip,
- struct dyn_ftrace **recp)
-{
- struct ftrace_func_entry *entry;
- struct dyn_ftrace *rec;
-
- rec = lookup_rec(*ip, *ip);
- if (!rec)
- return NULL;
-
- entry = __ftrace_lookup_ip(direct_functions, rec->ip);
- if (!entry) {
- WARN_ON(rec->flags & FTRACE_FL_DIRECT);
- return NULL;
- }
-
- WARN_ON(!(rec->flags & FTRACE_FL_DIRECT));
-
- /* Passed in ip just needs to be on the call site */
- *ip = rec->ip;
-
- if (recp)
- *recp = rec;
-
- return entry;
-}
-
-int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
-{
- struct ftrace_direct_func *direct;
- struct ftrace_func_entry *entry;
- struct ftrace_hash *hash;
- int ret = -ENODEV;
-
- mutex_lock(&direct_mutex);
-
- ip = ftrace_location(ip);
- if (!ip)
- goto out_unlock;
-
- entry = find_direct_entry(&ip, NULL);
- if (!entry)
- goto out_unlock;
-
- hash = direct_ops.func_hash->filter_hash;
- if (hash->count == 1)
- unregister_ftrace_function(&direct_ops);
-
- ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
-
- WARN_ON(ret);
-
- remove_hash_entry(direct_functions, entry);
-
- direct = ftrace_find_direct_func(addr);
- if (!WARN_ON(!direct)) {
- /* This is the good path (see the ! before WARN) */
- direct->count--;
- WARN_ON(direct->count < 0);
- if (!direct->count) {
- list_del_rcu(&direct->next);
- synchronize_rcu_tasks();
- kfree(direct);
- kfree(entry);
- ftrace_direct_func_count--;
- }
- }
- out_unlock:
- mutex_unlock(&direct_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
-
-static struct ftrace_ops stub_ops = {
- .func = ftrace_stub,
-};
-
-/**
- * ftrace_modify_direct_caller - modify ftrace nop directly
- * @entry: The ftrace hash entry of the direct helper for @rec
- * @rec: The record representing the function site to patch
- * @old_addr: The location that the site at @rec->ip currently calls
- * @new_addr: The location that the site at @rec->ip should call
- *
- * An architecture may overwrite this function to optimize the
- * changing of the direct callback on an ftrace nop location.
- * This is called with the ftrace_lock mutex held, and no other
- * ftrace callbacks are on the associated record (@rec). Thus,
- * it is safe to modify the ftrace record, where it should be
- * currently calling @old_addr directly, to call @new_addr.
- *
- * This is called with direct_mutex locked.
- *
- * Safety checks should be made to make sure that the code at
- * @rec->ip is currently calling @old_addr. And this must
- * also update entry->direct to @new_addr.
- */
-int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
- struct dyn_ftrace *rec,
- unsigned long old_addr,
- unsigned long new_addr)
-{
- unsigned long ip = rec->ip;
- int ret;
-
- lockdep_assert_held(&direct_mutex);
-
- /*
- * The ftrace_lock was used to determine if the record
- * had more than one registered user to it. If it did,
- * we needed to prevent that from changing to do the quick
- * switch. But if it did not (only a direct caller was attached)
- * then this function is called. But this function can deal
- * with attached callers to the rec that we care about, and
- * since this function uses standard ftrace calls that take
- * the ftrace_lock mutex, we need to release it.
- */
- mutex_unlock(&ftrace_lock);
-
- /*
- * By setting a stub function at the same address, we force
- * the code to call the iterator and the direct_ops helper.
- * This means that @ip does not call the direct call, and
- * we can simply modify it.
- */
- ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0);
- if (ret)
- goto out_lock;
-
- ret = register_ftrace_function_nolock(&stub_ops);
- if (ret) {
- ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
- goto out_lock;
- }
-
- entry->direct = new_addr;
-
- /*
- * By removing the stub, we put back the direct call, calling
- * the @new_addr.
- */
- unregister_ftrace_function(&stub_ops);
- ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
-
- out_lock:
- mutex_lock(&ftrace_lock);
-
- return ret;
-}
-
-/**
- * modify_ftrace_direct - Modify an existing direct call to call something else
- * @ip: The instruction pointer to modify
- * @old_addr: The address that the current @ip calls directly
- * @new_addr: The address that the @ip should call
- *
- * This modifies a ftrace direct caller at an instruction pointer without
- * having to disable it first. The direct call will switch over to the
- * @new_addr without missing anything.
- *
- * Returns: zero on success. Non zero on error, which includes:
- * -ENODEV : the @ip given has no direct caller attached
- * -EINVAL : the @old_addr does not match the current direct caller
- */
-int modify_ftrace_direct(unsigned long ip,
- unsigned long old_addr, unsigned long new_addr)
-{
- struct ftrace_direct_func *direct, *new_direct = NULL;
- struct ftrace_func_entry *entry;
- struct dyn_ftrace *rec;
- int ret = -ENODEV;
-
- mutex_lock(&direct_mutex);
-
- mutex_lock(&ftrace_lock);
-
- ip = ftrace_location(ip);
- if (!ip)
- goto out_unlock;
-
- entry = find_direct_entry(&ip, &rec);
- if (!entry)
- goto out_unlock;
-
- ret = -EINVAL;
- if (entry->direct != old_addr)
- goto out_unlock;
-
- direct = ftrace_find_direct_func(old_addr);
- if (WARN_ON(!direct))
- goto out_unlock;
- if (direct->count > 1) {
- ret = -ENOMEM;
- new_direct = ftrace_alloc_direct_func(new_addr);
- if (!new_direct)
- goto out_unlock;
- direct->count--;
- new_direct->count++;
- } else {
- direct->addr = new_addr;
- }
-
- /*
- * If there's no other ftrace callback on the rec->ip location,
- * then it can be changed directly by the architecture.
- * If there is another caller, then we just need to change the
- * direct caller helper to point to @new_addr.
- */
- if (ftrace_rec_count(rec) == 1) {
- ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr);
- } else {
- entry->direct = new_addr;
- ret = 0;
- }
-
- if (unlikely(ret && new_direct)) {
- direct->count++;
- list_del_rcu(&new_direct->next);
- synchronize_rcu_tasks();
- kfree(new_direct);
- ftrace_direct_func_count--;
- }
-
- out_unlock:
- mutex_unlock(&ftrace_lock);
- mutex_unlock(&direct_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(modify_ftrace_direct);
-
-#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS)
+#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS)
static int check_direct_multi(struct ftrace_ops *ops)
{
@@ -5711,7 +5317,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
}
/**
- * register_ftrace_direct_multi - Call a custom trampoline directly
+ * register_ftrace_direct - Call a custom trampoline directly
* for multiple functions registered in @ops
* @ops: The address of the struct ftrace_ops object
* @addr: The address of the trampoline to call at @ops functions
@@ -5732,7 +5338,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
* -ENODEV - @ip does not point to a ftrace nop location (or not supported)
* -ENOMEM - There was an allocation failure.
*/
-int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
struct ftrace_hash *hash, *free_hash = NULL;
struct ftrace_func_entry *entry, *new;
@@ -5774,6 +5380,7 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
ops->func = call_direct_funcs;
ops->flags = MULTI_FLAGS;
ops->trampoline = FTRACE_REGS_ADDR;
+ ops->direct_call = addr;
err = register_ftrace_function_nolock(ops);
@@ -5790,11 +5397,11 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
}
return err;
}
-EXPORT_SYMBOL_GPL(register_ftrace_direct_multi);
+EXPORT_SYMBOL_GPL(register_ftrace_direct);
/**
- * unregister_ftrace_direct_multi - Remove calls to custom trampoline
- * previously registered by register_ftrace_direct_multi for @ops object.
+ * unregister_ftrace_direct - Remove calls to custom trampoline
+ * previously registered by register_ftrace_direct for @ops object.
* @ops: The address of the struct ftrace_ops object
*
* This is used to remove a direct calls to @addr from the nop locations
@@ -5805,7 +5412,8 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct_multi);
* 0 on success
* -EINVAL - The @ops object was not properly registered.
*/
-int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
+ bool free_filters)
{
struct ftrace_hash *hash = ops->func_hash->filter_hash;
int err;
@@ -5823,12 +5431,15 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
/* cleanup for possible another register call */
ops->func = NULL;
ops->trampoline = 0;
+
+ if (free_filters)
+ ftrace_free_filter(ops);
return err;
}
-EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi);
+EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
static int
-__modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+__modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
struct ftrace_hash *hash;
struct ftrace_func_entry *entry, *iter;
@@ -5844,6 +5455,7 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
/* Enable the tmp_ops to have the same functions as the direct ops */
ftrace_ops_init(&tmp_ops);
tmp_ops.func_hash = ops->func_hash;
+ tmp_ops.direct_call = addr;
err = register_ftrace_function_nolock(&tmp_ops);
if (err)
@@ -5865,6 +5477,8 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
entry->direct = addr;
}
}
+ /* Prevent store tearing if a trampoline concurrently accesses the value */
+ WRITE_ONCE(ops->direct_call, addr);
mutex_unlock(&ftrace_lock);
@@ -5875,7 +5489,7 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
}
/**
- * modify_ftrace_direct_multi_nolock - Modify an existing direct 'multi' call
+ * modify_ftrace_direct_nolock - Modify an existing direct 'multi' call
* to call something else
* @ops: The address of the struct ftrace_ops object
* @addr: The address of the new trampoline to call at @ops functions
@@ -5892,19 +5506,19 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
* Returns: zero on success. Non zero on error, which includes:
* -EINVAL - The @ops object was not properly registered.
*/
-int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr)
+int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr)
{
if (check_direct_multi(ops))
return -EINVAL;
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
return -EINVAL;
- return __modify_ftrace_direct_multi(ops, addr);
+ return __modify_ftrace_direct(ops, addr);
}
-EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock);
+EXPORT_SYMBOL_GPL(modify_ftrace_direct_nolock);
/**
- * modify_ftrace_direct_multi - Modify an existing direct 'multi' call
+ * modify_ftrace_direct - Modify an existing direct 'multi' call
* to call something else
* @ops: The address of the struct ftrace_ops object
* @addr: The address of the new trampoline to call at @ops functions
@@ -5918,7 +5532,7 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock);
* Returns: zero on success. Non zero on error, which includes:
* -EINVAL - The @ops object was not properly registered.
*/
-int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
int err;
@@ -5928,11 +5542,11 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
return -EINVAL;
mutex_lock(&direct_mutex);
- err = __modify_ftrace_direct_multi(ops, addr);
+ err = __modify_ftrace_direct(ops, addr);
mutex_unlock(&direct_mutex);
return err;
}
-EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi);
+EXPORT_SYMBOL_GPL(modify_ftrace_direct);
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c6f47b6cfd5f..76a2d91eecad 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3098,6 +3098,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
if (RB_WARN_ON(cpu_buffer,
rb_is_reader_page(cpu_buffer->tail_page)))
return;
+ /*
+ * No need for a memory barrier here, as the update
+ * of the tail_page did it for this page.
+ */
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(&cpu_buffer->commit_page);
@@ -3107,6 +3111,8 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
while (rb_commit_index(cpu_buffer) !=
rb_page_write(cpu_buffer->commit_page)) {
+ /* Make sure the readers see the content of what is committed. */
+ smp_wmb();
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
RB_WARN_ON(cpu_buffer,
@@ -4684,7 +4690,12 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
/*
* Make sure we see any padding after the write update
- * (see rb_reset_tail())
+ * (see rb_reset_tail()).
+ *
+ * In addition, a writer may be writing on the reader page
+ * if the page has not been fully filled, so the read barrier
+ * is also needed to make sure we see the content of what is
+ * committed by the writer (see rb_set_commit_to_write()).
*/
smp_rmb();
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 937e9676dfd4..36a6037823cd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1149,22 +1149,22 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr,
unsigned long flags;
if (in_nmi()) {
- internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
- internal_trace_puts("*** snapshot is being ignored ***\n");
+ trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
+ trace_array_puts(tr, "*** snapshot is being ignored ***\n");
return;
}
if (!tr->allocated_snapshot) {
- internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
- internal_trace_puts("*** stopping trace here! ***\n");
- tracing_off();
+ trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n");
+ trace_array_puts(tr, "*** stopping trace here! ***\n");
+ tracer_tracing_off(tr);
return;
}
/* Note, snapshot can not be used when the tracer uses it */
if (tracer->use_max_tr) {
- internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
- internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
+ trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
return;
}
@@ -9516,6 +9516,7 @@ static int __remove_instance(struct trace_array *tr)
tracefs_remove(tr->dir);
free_percpu(tr->last_func_repeats);
free_trace_buffers(tr);
+ clear_tracing_err_log(tr);
for (i = 0; i < tr->nr_topts; i++) {
kfree(tr->topts[i].topts);
@@ -10393,19 +10394,20 @@ out:
void __init ftrace_boot_snapshot(void)
{
+#ifdef CONFIG_TRACER_MAX_TRACE
struct trace_array *tr;
- if (snapshot_at_boot) {
- tracing_snapshot();
- internal_trace_puts("** Boot snapshot taken **\n");
- }
+ if (!snapshot_at_boot)
+ return;
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr == &global_trace)
+ if (!tr->allocated_snapshot)
continue;
- trace_array_puts(tr, "** Boot snapshot taken **\n");
+
tracing_snapshot_instance(tr);
+ trace_array_puts(tr, "** Boot snapshot taken **\n");
}
+#endif
}
void __init early_trace_init(void)
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 46d0abb32d0f..d6a70aff2410 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -44,14 +44,21 @@ enum { ERRORS };
static const char *err_text[] = { ERRORS };
+static DEFINE_MUTEX(lastcmd_mutex);
static char *last_cmd;
static int errpos(const char *str)
{
+ int ret = 0;
+
+ mutex_lock(&lastcmd_mutex);
if (!str || !last_cmd)
- return 0;
+ goto out;
- return err_pos(last_cmd, str);
+ ret = err_pos(last_cmd, str);
+ out:
+ mutex_unlock(&lastcmd_mutex);
+ return ret;
}
static void last_cmd_set(const char *str)
@@ -59,18 +66,22 @@ static void last_cmd_set(const char *str)
if (!str)
return;
+ mutex_lock(&lastcmd_mutex);
kfree(last_cmd);
-
last_cmd = kstrdup(str, GFP_KERNEL);
+ mutex_unlock(&lastcmd_mutex);
}
static void synth_err(u8 err_type, u16 err_pos)
{
+ mutex_lock(&lastcmd_mutex);
if (!last_cmd)
- return;
+ goto out;
tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
err_type, err_pos);
+ out:
+ mutex_unlock(&lastcmd_mutex);
}
static int create_synth_event(const char *raw_command);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 9176bb7a9bb4..efbbec2caff8 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -159,7 +159,7 @@ static void osnoise_unregister_instance(struct trace_array *tr)
if (!found)
return;
- kvfree_rcu(inst);
+ kvfree_rcu_mightsleep(inst);
}
/*
@@ -1296,7 +1296,7 @@ static void notify_new_max_latency(u64 latency)
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
tr = inst->tr;
- if (tr->max_latency < latency) {
+ if (tracer_tracing_is_on(tr) && tr->max_latency < latency) {
tr->max_latency = latency;
latency_fsnotify(tr);
}
@@ -1738,6 +1738,8 @@ static int timerlat_main(void *data)
trace_timerlat_sample(&s);
+ notify_new_max_latency(diff);
+
timerlat_dump_stack(time_to_us(diff));
tlat->tracing_thread = false;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 20d0c4a97633..2d2616678295 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1172,7 +1172,7 @@ int trace_probe_remove_file(struct trace_probe *tp,
return -ENOENT;
list_del_rcu(&link->list);
- kvfree_rcu(link);
+ kvfree_rcu_mightsleep(link);
if (list_empty(&tp->event->files))
trace_probe_clear_flag(tp, TP_FLAG_TRACE);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index ff0536cea968..a931d9aaea26 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -785,14 +785,7 @@ static struct fgraph_ops fgraph_ops __initdata = {
};
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-#ifndef CALL_DEPTH_ACCOUNT
-#define CALL_DEPTH_ACCOUNT ""
-#endif
-
-noinline __noclone static void trace_direct_tramp(void)
-{
- asm(CALL_DEPTH_ACCOUNT);
-}
+static struct ftrace_ops direct;
#endif
/*
@@ -870,8 +863,9 @@ trace_selftest_startup_function_graph(struct tracer *trace,
* Register direct function together with graph tracer
* and make sure we get graph trace.
*/
- ret = register_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME,
- (unsigned long) trace_direct_tramp);
+ ftrace_set_filter_ip(&direct, (unsigned long)DYN_FTRACE_TEST_NAME, 0, 0);
+ ret = register_ftrace_direct(&direct,
+ (unsigned long)ftrace_stub_direct_tramp);
if (ret)
goto out;
@@ -891,8 +885,9 @@ trace_selftest_startup_function_graph(struct tracer *trace,
unregister_ftrace_graph(&fgraph_ops);
- ret = unregister_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME,
- (unsigned long) trace_direct_tramp);
+ ret = unregister_ftrace_direct(&direct,
+ (unsigned long)ftrace_stub_direct_tramp,
+ true);
if (ret)
goto out;
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
new file mode 100644
index 000000000000..b7cbd66f889e
--- /dev/null
+++ b/kernel/vhost_task.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Oracle Corporation
+ */
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/sched/task.h>
+#include <linux/sched/vhost_task.h>
+#include <linux/sched/signal.h>
+
+enum vhost_task_flags {
+ VHOST_TASK_FLAGS_STOP,
+};
+
+static int vhost_task_fn(void *data)
+{
+ struct vhost_task *vtsk = data;
+ int ret;
+
+ ret = vtsk->fn(vtsk->data);
+ complete(&vtsk->exited);
+ do_exit(ret);
+}
+
+/**
+ * vhost_task_stop - stop a vhost_task
+ * @vtsk: vhost_task to stop
+ *
+ * Callers must call vhost_task_should_stop and return from their worker
+ * function when it returns true;
+ */
+void vhost_task_stop(struct vhost_task *vtsk)
+{
+ pid_t pid = vtsk->task->pid;
+
+ set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+ wake_up_process(vtsk->task);
+ /*
+ * Make sure vhost_task_fn is no longer accessing the vhost_task before
+ * freeing it below. If userspace crashed or exited without closing,
+ * then the vhost_task->task could already be marked dead so
+ * kernel_wait will return early.
+ */
+ wait_for_completion(&vtsk->exited);
+ /*
+ * If we are just closing/removing a device and the parent process is
+ * not exiting then reap the task.
+ */
+ kernel_wait4(pid, NULL, __WCLONE, NULL);
+ kfree(vtsk);
+}
+EXPORT_SYMBOL_GPL(vhost_task_stop);
+
+/**
+ * vhost_task_should_stop - should the vhost task return from the work function
+ * @vtsk: vhost_task to stop
+ */
+bool vhost_task_should_stop(struct vhost_task *vtsk)
+{
+ return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+}
+EXPORT_SYMBOL_GPL(vhost_task_should_stop);
+
+/**
+ * vhost_task_create - create a copy of a process to be used by the kernel
+ * @fn: thread stack
+ * @arg: data to be passed to fn
+ * @name: the thread's name
+ *
+ * This returns a specialized task for use by the vhost layer or NULL on
+ * failure. The returned task is inactive, and the caller must fire it up
+ * through vhost_task_start().
+ */
+struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
+ const char *name)
+{
+ struct kernel_clone_args args = {
+ .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM,
+ .exit_signal = 0,
+ .fn = vhost_task_fn,
+ .name = name,
+ .user_worker = 1,
+ .no_files = 1,
+ .ignore_signals = 1,
+ };
+ struct vhost_task *vtsk;
+ struct task_struct *tsk;
+
+ vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL);
+ if (!vtsk)
+ return NULL;
+ init_completion(&vtsk->exited);
+ vtsk->data = arg;
+ vtsk->fn = fn;
+
+ args.fn_arg = vtsk;
+
+ tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
+ if (IS_ERR(tsk)) {
+ kfree(vtsk);
+ return NULL;
+ }
+
+ vtsk->task = tsk;
+ return vtsk;
+}
+EXPORT_SYMBOL_GPL(vhost_task_create);
+
+/**
+ * vhost_task_start - start a vhost_task created with vhost_task_create
+ * @vtsk: vhost_task to wake up
+ */
+void vhost_task_start(struct vhost_task *vtsk)
+{
+ wake_up_new_task(vtsk->task);
+}
+EXPORT_SYMBOL_GPL(vhost_task_start);