diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/cgroup_iter.c | 65 | ||||
-rw-r--r-- | kernel/bpf/cpumap.c | 10 | ||||
-rw-r--r-- | kernel/bpf/devmap.c | 10 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 7 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 40 | ||||
-rw-r--r-- | kernel/bpf/memalloc.c | 42 | ||||
-rw-r--r-- | kernel/bpf/ringbuf.c | 3 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 36 | ||||
-rw-r--r-- | kernel/bpf/task_iter.c | 151 | ||||
-rw-r--r-- | kernel/bpf/tcx.c | 4 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 569 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 18 |
12 files changed, 845 insertions, 110 deletions
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index 810378f04fbc..209e5135f9fb 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -294,3 +294,68 @@ static int __init bpf_cgroup_iter_init(void) } late_initcall(bpf_cgroup_iter_init); + +struct bpf_iter_css { + __u64 __opaque[3]; +} __attribute__((aligned(8))); + +struct bpf_iter_css_kern { + struct cgroup_subsys_state *start; + struct cgroup_subsys_state *pos; + unsigned int flags; +} __attribute__((aligned(8))); + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it, + struct cgroup_subsys_state *start, unsigned int flags) +{ + struct bpf_iter_css_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) > sizeof(struct bpf_iter_css)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css)); + + kit->start = NULL; + switch (flags) { + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + case BPF_CGROUP_ITER_DESCENDANTS_POST: + case BPF_CGROUP_ITER_ANCESTORS_UP: + break; + default: + return -EINVAL; + } + + kit->start = start; + kit->pos = NULL; + kit->flags = flags; + return 0; +} + +__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) +{ + struct bpf_iter_css_kern *kit = (void *)it; + + if (!kit->start) + return NULL; + + switch (kit->flags) { + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + kit->pos = css_next_descendant_pre(kit->pos, kit->start); + break; + case BPF_CGROUP_ITER_DESCENDANTS_POST: + kit->pos = css_next_descendant_post(kit->pos, kit->start); + break; + case BPF_CGROUP_ITER_ANCESTORS_UP: + kit->pos = kit->pos ? kit->pos->parent : kit->start; + } + + return kit->pos; +} + +__bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it) +{ +} + +__diag_pop();
\ No newline at end of file diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e42a1bdb7f53..8a0bb80fe48a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -764,6 +764,16 @@ void __cpu_map_flush(void) } } +#ifdef CONFIG_DEBUG_NET +bool cpu_map_check_flush(void) +{ + if (list_empty(this_cpu_ptr(&cpu_map_flush_list))) + return false; + __cpu_map_flush(); + return true; +} +#endif + static int __init cpu_map_init(void) { int cpu; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 4d42f6ed6c11..a936c704d4e7 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -418,6 +418,16 @@ void __dev_flush(void) } } +#ifdef CONFIG_DEBUG_NET +bool dev_check_flush(void) +{ + if (list_empty(this_cpu_ptr(&dev_flush_list))) + return false; + __dev_flush(); + return true; +} +#endif + /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a8c7e1c5abfa..fd8d4b0addfc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -155,13 +155,15 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab, hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); preempt_disable(); + local_irq_save(flags); if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { __this_cpu_dec(*(htab->map_locked[hash])); + local_irq_restore(flags); preempt_enable(); return -EBUSY; } - raw_spin_lock_irqsave(&b->raw_lock, flags); + raw_spin_lock(&b->raw_lock); *pflags = flags; return 0; @@ -172,8 +174,9 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab, unsigned long flags) { hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); - raw_spin_unlock_irqrestore(&b->raw_lock, flags); + raw_spin_unlock(&b->raw_lock); __this_cpu_dec(*(htab->map_locked[hash])); + local_irq_restore(flags); preempt_enable(); } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 62a53ebfedf9..e46ac288a108 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1811,8 +1811,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) } } -void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); - void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { @@ -1844,7 +1842,7 @@ unlock: * bpf_list_head which needs to be freed. */ migrate_disable(); - __bpf_obj_drop_impl(obj, field->graph_root.value_rec); + __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); migrate_enable(); } } @@ -1883,7 +1881,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, migrate_disable(); - __bpf_obj_drop_impl(obj, field->graph_root.value_rec); + __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); migrate_enable(); } } @@ -1915,8 +1913,10 @@ __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) } /* Must be called under migrate_disable(), as required by bpf_mem_free */ -void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) +void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) { + struct bpf_mem_alloc *ma; + if (rec && rec->refcount_off >= 0 && !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) { /* Object is refcounted and refcount_dec didn't result in 0 @@ -1928,10 +1928,14 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) if (rec) bpf_obj_free_fields(rec, p); + if (percpu) + ma = &bpf_global_percpu_ma; + else + ma = &bpf_global_ma; if (rec && rec->refcount_off >= 0) - bpf_mem_free_rcu(&bpf_global_ma, p); + bpf_mem_free_rcu(ma, p); else - bpf_mem_free(&bpf_global_ma, p); + bpf_mem_free(ma, p); } __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) @@ -1939,7 +1943,7 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) struct btf_struct_meta *meta = meta__ign; void *p = p__alloc; - __bpf_obj_drop_impl(p, meta ? meta->record : NULL); + __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false); } __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) @@ -1983,7 +1987,7 @@ static int __bpf_list_add(struct bpf_list_node_kern *node, */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl((void *)n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } @@ -2082,7 +2086,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl((void *)n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } @@ -2215,7 +2219,12 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, struct cgroup *ancestor) { - return task_under_cgroup_hierarchy(task, ancestor); + long ret; + + rcu_read_lock(); + ret = task_under_cgroup_hierarchy(task, ancestor); + rcu_read_unlock(); + return ret; } #endif /* CONFIG_CGROUPS */ @@ -2555,6 +2564,15 @@ BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 39ea316c55e7..63b909d277d4 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -340,6 +340,7 @@ static void free_bulk(struct bpf_mem_cache *c) int cnt; WARN_ON_ONCE(tgt->unit_size != c->unit_size); + WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); do { inc_active(c, &flags); @@ -365,6 +366,9 @@ static void __free_by_rcu(struct rcu_head *head) struct bpf_mem_cache *tgt = c->tgt; struct llist_node *llnode; + WARN_ON_ONCE(tgt->unit_size != c->unit_size); + WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); + llnode = llist_del_all(&c->waiting_for_gp); if (!llnode) goto out; @@ -491,21 +495,17 @@ static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx) struct llist_node *first; unsigned int obj_size; - /* For per-cpu allocator, the size of free objects in free list doesn't - * match with unit_size and now there is no way to get the size of - * per-cpu pointer saved in free object, so just skip the checking. - */ - if (c->percpu_size) - return 0; - first = c->free_llist.first; if (!first) return 0; - obj_size = ksize(first); + if (c->percpu_size) + obj_size = pcpu_alloc_size(((void **)first)[1]); + else + obj_size = ksize(first); if (obj_size != c->unit_size) { - WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n", - idx, obj_size, c->unit_size); + WARN_ONCE(1, "bpf_mem_cache[%u]: percpu %d, unexpected object size %u, expect %u\n", + idx, c->percpu_size, obj_size, c->unit_size); return -EINVAL; } return 0; @@ -529,6 +529,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) /* room for llist_node and per-cpu pointer */ if (percpu) percpu_size = LLIST_NODE_SZ + sizeof(void *); + ma->percpu = percpu; if (size) { pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); @@ -878,6 +879,17 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) return !ret ? NULL : ret + LLIST_NODE_SZ; } +static notrace int bpf_mem_free_idx(void *ptr, bool percpu) +{ + size_t size; + + if (percpu) + size = pcpu_alloc_size(*((void **)ptr)); + else + size = ksize(ptr - LLIST_NODE_SZ); + return bpf_mem_cache_idx(size); +} + void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) { int idx; @@ -885,7 +897,7 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) if (!ptr) return; - idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); + idx = bpf_mem_free_idx(ptr, ma->percpu); if (idx < 0) return; @@ -899,7 +911,7 @@ void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) if (!ptr) return; - idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); + idx = bpf_mem_free_idx(ptr, ma->percpu); if (idx < 0) return; @@ -973,6 +985,12 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) return !ret ? NULL : ret + LLIST_NODE_SZ; } +/* The alignment of dynamic per-cpu area is 8, so c->unit_size and the + * actual size of dynamic per-cpu area will always be matched and there is + * no need to adjust size_index for per-cpu allocation. However for the + * simplicity of the implementation, use an unified size_index for both + * kmalloc and per-cpu allocation. + */ static __init int bpf_mem_cache_adjust_size(void) { unsigned int size; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index f045fde632e5..0ee653a936ea 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -770,8 +770,7 @@ schedule_work_return: /* Prevent the clearing of the busy-bit from being reordered before the * storing of any rb consumer or producer positions. */ - smp_mb__before_atomic(); - atomic_set(&rb->busy, 0); + atomic_set_release(&rb->busy, 0); if (flags & BPF_RB_FORCE_WAKEUP) irq_work_queue(&rb->work); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 341f8cb4405c..0ed286b8a0f0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,8 +35,9 @@ #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> #include <linux/trace_events.h> -#include <net/netfilter/nf_bpf_link.h> +#include <net/netfilter/nf_bpf_link.h> +#include <net/netkit.h> #include <net/tcx.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -626,8 +627,6 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(obj + rec->timer_off); } -extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); - void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -662,8 +661,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) field->kptr.btf_id); migrate_disable(); __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? - pointee_struct_meta->record : - NULL); + pointee_struct_meta->record : NULL, + fields[i].type == BPF_KPTR_PERCPU); migrate_enable(); } else { field->kptr.dtor(xchgd_field); @@ -3732,6 +3731,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_LSM; case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: return BPF_PROG_TYPE_SCHED_CLS; default: return BPF_PROG_TYPE_UNSPEC; @@ -3783,7 +3784,9 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, return 0; case BPF_PROG_TYPE_SCHED_CLS: if (attach_type != BPF_TCX_INGRESS && - attach_type != BPF_TCX_EGRESS) + attach_type != BPF_TCX_EGRESS && + attach_type != BPF_NETKIT_PRIMARY && + attach_type != BPF_NETKIT_PEER) return -EINVAL; return 0; default: @@ -3866,7 +3869,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; case BPF_PROG_TYPE_SCHED_CLS: - ret = tcx_prog_attach(attr, prog); + if (attr->attach_type == BPF_TCX_INGRESS || + attr->attach_type == BPF_TCX_EGRESS) + ret = tcx_prog_attach(attr, prog); + else + ret = netkit_prog_attach(attr, prog); break; default: ret = -EINVAL; @@ -3927,7 +3934,11 @@ static int bpf_prog_detach(const union bpf_attr *attr) ret = cgroup_bpf_prog_detach(attr, ptype); break; case BPF_PROG_TYPE_SCHED_CLS: - ret = tcx_prog_detach(attr, prog); + if (attr->attach_type == BPF_TCX_INGRESS || + attr->attach_type == BPF_TCX_EGRESS) + ret = tcx_prog_detach(attr, prog); + else + ret = netkit_prog_detach(attr, prog); break; default: ret = -EINVAL; @@ -3994,6 +4005,9 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: return tcx_prog_query(attr, uattr); + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: + return netkit_prog_query(attr, uattr); default: return -EINVAL; } @@ -4975,7 +4989,11 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_xdp_link_attach(attr, prog); break; case BPF_PROG_TYPE_SCHED_CLS: - ret = tcx_link_attach(attr, prog); + if (attr->link_create.attach_type == BPF_TCX_INGRESS || + attr->link_create.attach_type == BPF_TCX_EGRESS) + ret = tcx_link_attach(attr, prog); + else + ret = netkit_link_attach(attr, prog); break; case BPF_PROG_TYPE_NETFILTER: ret = bpf_nf_link_attach(attr, prog); diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index fef17628341f..59e747938bdb 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -894,6 +894,157 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __diag_pop(); +struct bpf_iter_css_task { + __u64 __opaque[1]; +} __attribute__((aligned(8))); + +struct bpf_iter_css_task_kern { + struct css_task_iter *css_it; +} __attribute__((aligned(8))); + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it, + struct cgroup_subsys_state *css, unsigned int flags) +{ + struct bpf_iter_css_task_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) != + __alignof__(struct bpf_iter_css_task)); + kit->css_it = NULL; + switch (flags) { + case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED: + case CSS_TASK_ITER_PROCS: + case 0: + break; + default: + return -EINVAL; + } + + kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter)); + if (!kit->css_it) + return -ENOMEM; + css_task_iter_start(css, flags, kit->css_it); + return 0; +} + +__bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) +{ + struct bpf_iter_css_task_kern *kit = (void *)it; + + if (!kit->css_it) + return NULL; + return css_task_iter_next(kit->css_it); +} + +__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) +{ + struct bpf_iter_css_task_kern *kit = (void *)it; + + if (!kit->css_it) + return; + css_task_iter_end(kit->css_it); + bpf_mem_free(&bpf_global_ma, kit->css_it); +} + +__diag_pop(); + +struct bpf_iter_task { + __u64 __opaque[3]; +} __attribute__((aligned(8))); + +struct bpf_iter_task_kern { + struct task_struct *task; + struct task_struct *pos; + unsigned int flags; +} __attribute__((aligned(8))); + +enum { + /* all process in the system */ + BPF_TASK_ITER_ALL_PROCS, + /* all threads in the system */ + BPF_TASK_ITER_ALL_THREADS, + /* all threads of a specific process */ + BPF_TASK_ITER_PROC_THREADS +}; + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, + struct task_struct *task__nullable, unsigned int flags) +{ + struct bpf_iter_task_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) != + __alignof__(struct bpf_iter_task)); + + kit->task = kit->pos = NULL; + switch (flags) { + case BPF_TASK_ITER_ALL_THREADS: + case BPF_TASK_ITER_ALL_PROCS: + break; + case BPF_TASK_ITER_PROC_THREADS: + if (!task__nullable) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (flags == BPF_TASK_ITER_PROC_THREADS) + kit->task = task__nullable; + else + kit->task = &init_task; + kit->pos = kit->task; + kit->flags = flags; + return 0; +} + +__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) +{ + struct bpf_iter_task_kern *kit = (void *)it; + struct task_struct *pos; + unsigned int flags; + + flags = kit->flags; + pos = kit->pos; + + if (!pos) + return pos; + + if (flags == BPF_TASK_ITER_ALL_PROCS) + goto get_next_task; + + kit->pos = next_thread(kit->pos); + if (kit->pos == kit->task) { + if (flags == BPF_TASK_ITER_PROC_THREADS) { + kit->pos = NULL; + return pos; + } + } else + return pos; + +get_next_task: + kit->pos = next_task(kit->pos); + kit->task = kit->pos; + if (kit->pos == &init_task) + kit->pos = NULL; + + return pos; +} + +__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it) +{ +} + +__diag_pop(); + DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); static void do_mmap_read_unlock(struct irq_work *entry) diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c index 1338a13a8b64..2e4885e7781f 100644 --- a/kernel/bpf/tcx.c +++ b/kernel/bpf/tcx.c @@ -250,7 +250,7 @@ static void tcx_link_dealloc(struct bpf_link *link) static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) { - const struct tcx_link *tcx = tcx_link_const(link); + const struct tcx_link *tcx = tcx_link(link); u32 ifindex = 0; rtnl_lock(); @@ -267,7 +267,7 @@ static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) static int tcx_link_fill_info(const struct bpf_link *link, struct bpf_link_info *info) { - const struct tcx_link *tcx = tcx_link_const(link); + const struct tcx_link *tcx = tcx_link(link); u32 ifindex = 0; rtnl_lock(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb58987e4844..857d76694517 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1173,7 +1173,12 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg static void __mark_reg_known_zero(struct bpf_reg_state *reg); +static bool in_rcu_cs(struct bpf_verifier_env *env); + +static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta); + static int mark_stack_slots_iter(struct bpf_verifier_env *env, + struct bpf_kfunc_call_arg_meta *meta, struct bpf_reg_state *reg, int insn_idx, struct btf *btf, u32 btf_id, int nr_slots) { @@ -1194,6 +1199,12 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ + if (is_kfunc_rcu_protected(meta)) { + if (in_rcu_cs(env)) + st->type |= MEM_RCU; + else + st->type |= PTR_UNTRUSTED; + } st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = i == 0 ? id : 0; st->iter.btf = btf; @@ -1268,7 +1279,7 @@ static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env, return true; } -static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, +static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, struct btf *btf, u32 btf_id, int nr_slots) { struct bpf_func_state *state = func(env, reg); @@ -1276,26 +1287,28 @@ static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_ spi = iter_get_spi(env, reg, nr_slots); if (spi < 0) - return false; + return -EINVAL; for (i = 0; i < nr_slots; i++) { struct bpf_stack_state *slot = &state->stack[spi - i]; struct bpf_reg_state *st = &slot->spilled_ptr; + if (st->type & PTR_UNTRUSTED) + return -EPROTO; /* only main (first) slot has ref_obj_id set */ if (i == 0 && !st->ref_obj_id) - return false; + return -EINVAL; if (i != 0 && st->ref_obj_id) - return false; + return -EINVAL; if (st->iter.btf != btf || st->iter.btf_id != btf_id) - return false; + return -EINVAL; for (j = 0; j < BPF_REG_SIZE; j++) if (slot->slot_type[j] != STACK_ITER) - return false; + return -EINVAL; } - return true; + return 0; } /* Check if given stack slot is "special": @@ -1789,6 +1802,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; + dst_state->dfs_depth = src->dfs_depth; + dst_state->used_as_loop_entry = src->used_as_loop_entry; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -1804,11 +1819,203 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return 0; } +static u32 state_htab_size(struct bpf_verifier_env *env) +{ + return env->prog->len; +} + +static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx) +{ + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_func_state *state = cur->frame[cur->curframe]; + + return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; +} + +static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b) +{ + int fr; + + if (a->curframe != b->curframe) + return false; + + for (fr = a->curframe; fr >= 0; fr--) + if (a->frame[fr]->callsite != b->frame[fr]->callsite) + return false; + + return true; +} + +/* Open coded iterators allow back-edges in the state graph in order to + * check unbounded loops that iterators. + * + * In is_state_visited() it is necessary to know if explored states are + * part of some loops in order to decide whether non-exact states + * comparison could be used: + * - non-exact states comparison establishes sub-state relation and uses + * read and precision marks to do so, these marks are propagated from + * children states and thus are not guaranteed to be final in a loop; + * - exact states comparison just checks if current and explored states + * are identical (and thus form a back-edge). + * + * Paper "A New Algorithm for Identifying Loops in Decompilation" + * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient + * algorithm for loop structure detection and gives an overview of + * relevant terminology. It also has helpful illustrations. + * + * [1] https://api.semanticscholar.org/CorpusID:15784067 + * + * We use a similar algorithm but because loop nested structure is + * irrelevant for verifier ours is significantly simpler and resembles + * strongly connected components algorithm from Sedgewick's textbook. + * + * Define topmost loop entry as a first node of the loop traversed in a + * depth first search starting from initial state. The goal of the loop + * tracking algorithm is to associate topmost loop entries with states + * derived from these entries. + * + * For each step in the DFS states traversal algorithm needs to identify + * the following situations: + * + * initial initial initial + * | | | + * V V V + * ... ... .---------> hdr + * | | | | + * V V | V + * cur .-> succ | .------... + * | | | | | | + * V | V | V V + * succ '-- cur | ... ... + * | | | + * | V V + * | succ <- cur + * | | + * | V + * | ... + * | | + * '----' + * + * (A) successor state of cur (B) successor state of cur or it's entry + * not yet traversed are in current DFS path, thus cur and succ + * are members of the same outermost loop + * + * initial initial + * | | + * V V + * ... ... + * | | + * V V + * .------... .------... + * | | | | + * V V V V + * .-> hdr ... ... ... + * | | | | | + * | V V V V + * | succ <- cur succ <- cur + * | | | + * | V V + * | ... ... + * | | | + * '----' exit + * + * (C) successor state of cur is a part of some loop but this loop + * does not include cur or successor state is not in a loop at all. + * + * Algorithm could be described as the following python code: + * + * traversed = set() # Set of traversed nodes + * entries = {} # Mapping from node to loop entry + * depths = {} # Depth level assigned to graph node + * path = set() # Current DFS path + * + * # Find outermost loop entry known for n + * def get_loop_entry(n): + * h = entries.get(n, None) + * while h in entries and entries[h] != h: + * h = entries[h] + * return h + * + * # Update n's loop entry if h's outermost entry comes + * # before n's outermost entry in current DFS path. + * def update_loop_entry(n, h): + * n1 = get_loop_entry(n) or n + * h1 = get_loop_entry(h) or h + * if h1 in path and depths[h1] <= depths[n1]: + * entries[n] = h1 + * + * def dfs(n, depth): + * traversed.add(n) + * path.add(n) + * depths[n] = depth + * for succ in G.successors(n): + * if succ not in traversed: + * # Case A: explore succ and update cur's loop entry + * # only if succ's entry is in current DFS path. + * dfs(succ, depth + 1) + * h = get_loop_entry(succ) + * update_loop_entry(n, h) + * else: + * # Case B or C depending on `h1 in path` check in update_loop_entry(). + * update_loop_entry(n, succ) + * path.remove(n) + * + * To adapt this algorithm for use with verifier: + * - use st->branch == 0 as a signal that DFS of succ had been finished + * and cur's loop entry has to be updated (case A), handle this in + * update_branch_counts(); + * - use st->branch > 0 as a signal that st is in the current DFS path; + * - handle cases B and C in is_state_visited(); + * - update topmost loop entry for intermediate states in get_loop_entry(). + */ +static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st) +{ + struct bpf_verifier_state *topmost = st->loop_entry, *old; + + while (topmost && topmost->loop_entry && topmost != topmost->loop_entry) + topmost = topmost->loop_entry; + /* Update loop entries for intermediate states to avoid this + * traversal in future get_loop_entry() calls. + */ + while (st && st->loop_entry != topmost) { + old = st->loop_entry; + st->loop_entry = topmost; + st = old; + } + return topmost; +} + +static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) +{ + struct bpf_verifier_state *cur1, *hdr1; + + cur1 = get_loop_entry(cur) ?: cur; + hdr1 = get_loop_entry(hdr) ?: hdr; + /* The head1->branches check decides between cases B and C in + * comment for get_loop_entry(). If hdr1->branches == 0 then + * head's topmost loop entry is not in current DFS path, + * hence 'cur' and 'hdr' are not in the same loop and there is + * no need to update cur->loop_entry. + */ + if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) { + cur->loop_entry = hdr; + hdr->used_as_loop_entry = true; + } +} + static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { while (st) { u32 br = --st->branches; + /* br == 0 signals that DFS exploration for 'st' is finished, + * thus it is necessary to update parent's loop entry if it + * turned out that st is a part of some loop. + * This is a part of 'case A' in get_loop_entry() comment. + */ + if (br == 0 && st->parent && st->loop_entry) + update_loop_entry(st->parent, st->loop_entry); + /* WARN_ON(br > 1) technically makes sense here, * but see comment in push_stack(), hence: */ @@ -7640,15 +7847,24 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id return err; } - err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots); + err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots); if (err) return err; } else { /* iter_next() or iter_destroy() expect initialized iter state*/ - if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) { + err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots); + switch (err) { + case 0: + break; + case -EINVAL: verbose(env, "expected an initialized iter_%s as arg #%d\n", iter_type_str(meta->btf, btf_id), regno); - return -EINVAL; + return err; + case -EPROTO: + verbose(env, "expected an RCU CS when using %s\n", meta->func_name); + return err; + default: + return err; } spi = iter_get_spi(env, reg, nr_slots); @@ -7674,6 +7890,81 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id return 0; } +/* Look for a previous loop entry at insn_idx: nearest parent state + * stopped at insn_idx with callsites matching those in cur->frame. + */ +static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env, + struct bpf_verifier_state *cur, + int insn_idx) +{ + struct bpf_verifier_state_list *sl; + struct bpf_verifier_state *st; + + /* Explored states are pushed in stack order, most recent states come first */ + sl = *explored_state(env, insn_idx); + for (; sl; sl = sl->next) { + /* If st->branches != 0 state is a part of current DFS verification path, + * hence cur & st for a loop. + */ + st = &sl->state; + if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) && + st->dfs_depth < cur->dfs_depth) + return st; + } + + return NULL; +} + +static void reset_idmap_scratch(struct bpf_verifier_env *env); +static bool regs_exact(const struct bpf_reg_state *rold, + const struct bpf_reg_state *rcur, + struct bpf_idmap *idmap); + +static void maybe_widen_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *rold, struct bpf_reg_state *rcur, + struct bpf_idmap *idmap) +{ + if (rold->type != SCALAR_VALUE) + return; + if (rold->type != rcur->type) + return; + if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap)) + return; + __mark_reg_unknown(env, rcur); +} + +static int widen_imprecise_scalars(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) +{ + struct bpf_func_state *fold, *fcur; + int i, fr; + + reset_idmap_scratch(env); + for (fr = old->curframe; fr >= 0; fr--) { + fold = old->frame[fr]; + fcur = cur->frame[fr]; + + for (i = 0; i < MAX_BPF_REG; i++) + maybe_widen_reg(env, + &fold->regs[i], + &fcur->regs[i], + &env->idmap_scratch); + + for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) { + if (!is_spilled_reg(&fold->stack[i]) || + !is_spilled_reg(&fcur->stack[i])) + continue; + + maybe_widen_reg(env, + &fold->stack[i].spilled_ptr, + &fcur->stack[i].spilled_ptr, + &env->idmap_scratch); + } + } + return 0; +} + /* process_iter_next_call() is called when verifier gets to iterator's next * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer * to it as just "iter_next()" in comments below. @@ -7715,25 +8006,47 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id * is some statically known limit on number of iterations (e.g., if there is * an explicit `if n > 100 then break;` statement somewhere in the loop). * - * One very subtle but very important aspect is that we *always* simulate NULL - * condition first (as the current state) before we simulate non-NULL case. - * This has to do with intricacies of scalar precision tracking. By simulating - * "exit condition" of iter_next() returning NULL first, we make sure all the - * relevant precision marks *that will be set **after** we exit iterator loop* - * are propagated backwards to common parent state of NULL and non-NULL - * branches. Thanks to that, state equivalence checks done later in forked - * state, when reaching iter_next() for ACTIVE iterator, can assume that - * precision marks are finalized and won't change. Because simulating another - * ACTIVE iterator iteration won't change them (because given same input - * states we'll end up with exactly same output states which we are currently - * comparing; and verification after the loop already propagated back what - * needs to be **additionally** tracked as precise). It's subtle, grok - * precision tracking for more intuitive understanding. + * Iteration convergence logic in is_state_visited() relies on exact + * states comparison, which ignores read and precision marks. + * This is necessary because read and precision marks are not finalized + * while in the loop. Exact comparison might preclude convergence for + * simple programs like below: + * + * i = 0; + * while(iter_next(&it)) + * i++; + * + * At each iteration step i++ would produce a new distinct state and + * eventually instruction processing limit would be reached. + * + * To avoid such behavior speculatively forget (widen) range for + * imprecise scalar registers, if those registers were not precise at the + * end of the previous iteration and do not match exactly. + * + * This is a conservative heuristic that allows to verify wide range of programs, + * however it precludes verification of programs that conjure an + * imprecise value on the first loop iteration and use it as precise on a second. + * For example, the following safe program would fail to verify: + * + * struct bpf_num_iter it; + * int arr[10]; + * int i = 0, a = 0; + * bpf_iter_num_new(&it, 0, 10); + * while (bpf_iter_num_next(&it)) { + * if (a == 0) { + * a = 1; + * i = 7; // Because i changed verifier would forget + * // it's range on second loop entry. + * } else { + * arr[i] = 42; // This would fail to verify. + * } + * } + * bpf_iter_num_destroy(&it); */ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_verifier_state *cur_st = env->cur_state, *queued_st; + struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st; struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr; struct bpf_reg_state *cur_iter, *queued_iter; int iter_frameno = meta->iter.frameno; @@ -7751,6 +8064,19 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, } if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) { + /* Because iter_next() call is a checkpoint is_state_visitied() + * should guarantee parent state with same call sites and insn_idx. + */ + if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx || + !same_callsites(cur_st->parent, cur_st)) { + verbose(env, "bug: bad parent state for iter next call"); + return -EFAULT; + } + /* Note cur_st->parent in the call below, it is necessary to skip + * checkpoint created for cur_st by is_state_visited() + * right at this instruction. + */ + prev_st = find_prev_entry(env, cur_st->parent, insn_idx); /* branch out active iter state */ queued_st = push_stack(env, insn_idx + 1, insn_idx, false); if (!queued_st) @@ -7759,6 +8085,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr; queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; queued_iter->iter.depth++; + if (prev_st) + widen_imprecise_scalars(env, prev_st, queued_st); queued_fr = queued_st->frame[queued_st->curframe]; mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]); @@ -10231,6 +10559,11 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RCU; } +static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_RCU_PROTECTED; +} + static bool __kfunc_param_match_suffix(const struct btf *btf, const struct btf_param *arg, const char *suffix) @@ -10305,6 +10638,11 @@ static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr"); } +static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__nullable"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -10447,6 +10785,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_CALLBACK, KF_ARG_PTR_TO_RB_ROOT, KF_ARG_PTR_TO_RB_NODE, + KF_ARG_PTR_TO_NULL, }; enum special_kfunc_type { @@ -10472,6 +10811,7 @@ enum special_kfunc_type { KF_bpf_percpu_obj_new_impl, KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, + KF_bpf_iter_css_task_new, }; BTF_SET_START(special_kfunc_set) @@ -10495,6 +10835,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +BTF_ID(func, bpf_iter_css_task_new) BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -10520,6 +10861,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +BTF_ID(func, bpf_iter_css_task_new) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -10600,6 +10942,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) return KF_ARG_PTR_TO_CALLBACK; + if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg)) + return KF_ARG_PTR_TO_NULL; if (argno + 1 < nargs && (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || @@ -11050,6 +11394,20 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, &meta->arg_rbtree_root.field); } +static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env) +{ + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + + switch (prog_type) { + case BPF_PROG_TYPE_LSM: + return true; + case BPF_TRACE_ITER: + return env->prog->aux->sleepable; + default: + return false; + } +} + static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, int insn_idx) { @@ -11136,7 +11494,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && - (register_is_null(reg) || type_may_be_null(reg->type))) { + (register_is_null(reg) || type_may_be_null(reg->type)) && + !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; } @@ -11161,6 +11520,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return kf_arg_type; switch (kf_arg_type) { + case KF_ARG_PTR_TO_NULL: + continue; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) @@ -11300,6 +11661,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; } case KF_ARG_PTR_TO_ITER: + if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) { + if (!check_css_task_iter_allowlist(env)) { + verbose(env, "css_task_iter is only allowed in bpf_lsm and bpf iter-s\n"); + return -EINVAL; + } + } ret = process_iter_arg(env, regno, insn_idx, meta); if (ret < 0) return ret; @@ -11559,6 +11926,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (env->cur_state->active_rcu_lock) { struct bpf_func_state *state; struct bpf_reg_state *reg; + u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); @@ -11569,7 +11937,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, verbose(env, "nested rcu read lock (kernel function %s)\n", func_name); return -EINVAL; } else if (rcu_unlock) { - bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({ if (reg->type & MEM_RCU) { reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); reg->type |= PTR_UNTRUSTED; @@ -13663,12 +14031,16 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) return !!tnum_equals_const(subreg, val); else if (val < reg->u32_min_value || val > reg->u32_max_value) return 0; + else if (sval < reg->s32_min_value || sval > reg->s32_max_value) + return 0; break; case BPF_JNE: if (tnum_is_const(subreg)) return !tnum_equals_const(subreg, val); else if (val < reg->u32_min_value || val > reg->u32_max_value) return 1; + else if (sval < reg->s32_min_value || sval > reg->s32_max_value) + return 1; break; case BPF_JSET: if ((~subreg.mask & subreg.value) & val) @@ -13740,12 +14112,16 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return !!tnum_equals_const(reg->var_off, val); else if (val < reg->umin_value || val > reg->umax_value) return 0; + else if (sval < reg->smin_value || sval > reg->smax_value) + return 0; break; case BPF_JNE: if (tnum_is_const(reg->var_off)) return !tnum_equals_const(reg->var_off, val); else if (val < reg->umin_value || val > reg->umax_value) return 1; + else if (sval < reg->smin_value || sval > reg->smax_value) + return 1; break; case BPF_JSET: if ((~reg->var_off.mask & reg->var_off.value) & val) @@ -14958,21 +15334,6 @@ enum { BRANCH = 2, }; -static u32 state_htab_size(struct bpf_verifier_env *env) -{ - return env->prog->len; -} - -static struct bpf_verifier_state_list **explored_state( - struct bpf_verifier_env *env, - int idx) -{ - struct bpf_verifier_state *cur = env->cur_state; - struct bpf_func_state *state = cur->frame[cur->curframe]; - - return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; -} - static void mark_prune_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].prune_point = true; @@ -15849,18 +16210,14 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { struct bpf_verifier_state_list *sl; - int i; sl = *explored_state(env, insn); while (sl) { if (sl->state.branches) goto next; if (sl->state.insn_idx != insn || - sl->state.curframe != cur->curframe) + !same_callsites(&sl->state, cur)) goto next; - for (i = 0; i <= cur->curframe; i++) - if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) - goto next; clean_verifier_state(env, &sl->state); next: sl = sl->next; @@ -15878,8 +16235,11 @@ static bool regs_exact(const struct bpf_reg_state *rold, /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, - struct bpf_reg_state *rcur, struct bpf_idmap *idmap) + struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact) { + if (exact) + return regs_exact(rold, rcur, idmap); + if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ return true; @@ -15996,7 +16356,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, struct bpf_idmap *idmap) + struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact) { int i, spi; @@ -16009,7 +16369,12 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, spi = i / BPF_REG_SIZE; - if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) { + if (exact && + old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + return false; + + if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) { i += BPF_REG_SIZE - 1; /* explored state didn't use this */ continue; @@ -16059,7 +16424,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, * return false to continue verification of this path */ if (!regsafe(env, &old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, idmap)) + &cur->stack[spi].spilled_ptr, idmap, exact)) return false; break; case STACK_DYNPTR: @@ -16141,16 +16506,16 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, * the current state will reach 'bpf_exit' instruction safely */ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur) + struct bpf_func_state *cur, bool exact) { int i; for (i = 0; i < MAX_BPF_REG; i++) if (!regsafe(env, &old->regs[i], &cur->regs[i], - &env->idmap_scratch)) + &env->idmap_scratch, exact)) return false; - if (!stacksafe(env, old, cur, &env->idmap_scratch)) + if (!stacksafe(env, old, cur, &env->idmap_scratch, exact)) return false; if (!refsafe(old, cur, &env->idmap_scratch)) @@ -16159,17 +16524,23 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat return true; } +static void reset_idmap_scratch(struct bpf_verifier_env *env) +{ + env->idmap_scratch.tmp_id_gen = env->id_gen; + memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); +} + static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *old, - struct bpf_verifier_state *cur) + struct bpf_verifier_state *cur, + bool exact) { int i; if (old->curframe != cur->curframe) return false; - env->idmap_scratch.tmp_id_gen = env->id_gen; - memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); + reset_idmap_scratch(env); /* Verification state from speculative execution simulation * must never prune a non-speculative execution one. @@ -16199,7 +16570,7 @@ static bool states_equal(struct bpf_verifier_env *env, for (i = 0; i <= old->curframe; i++) { if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(env, old->frame[i], cur->frame[i])) + if (!func_states_equal(env, old->frame[i], cur->frame[i], exact)) return false; } return true; @@ -16453,10 +16824,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl, **pprev; - struct bpf_verifier_state *cur = env->cur_state, *new; - int i, j, err, states_cnt = 0; + struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; + int i, j, n, err, states_cnt = 0; bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx); bool add_new_state = force_new_state; + bool force_exact; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -16509,9 +16881,33 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * It's safe to assume that iterator loop will finish, taking into * account iter_next() contract of eventually returning * sticky NULL result. + * + * Note, that states have to be compared exactly in this case because + * read and precision marks might not be finalized inside the loop. + * E.g. as in the program below: + * + * 1. r7 = -16 + * 2. r6 = bpf_get_prandom_u32() + * 3. while (bpf_iter_num_next(&fp[-8])) { + * 4. if (r6 != 42) { + * 5. r7 = -32 + * 6. r6 = bpf_get_prandom_u32() + * 7. continue + * 8. } + * 9. r0 = r10 + * 10. r0 += r7 + * 11. r8 = *(u64 *)(r0 + 0) + * 12. r6 = bpf_get_prandom_u32() + * 13. } + * + * Here verifier would first visit path 1-3, create a checkpoint at 3 + * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does + * not have read or precision mark for r7 yet, thus inexact states + * comparison would discard current state with r7=-32 + * => unsafe memory access at 11 would not be caught. */ if (is_iter_next_insn(env, insn_idx)) { - if (states_equal(env, &sl->state, cur)) { + if (states_equal(env, &sl->state, cur, true)) { struct bpf_func_state *cur_frame; struct bpf_reg_state *iter_state, *iter_reg; int spi; @@ -16527,17 +16923,23 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) */ spi = __get_spi(iter_reg->off + iter_reg->var_off.value); iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; - if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) + if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { + update_loop_entry(cur, &sl->state); goto hit; + } } goto skip_inf_loop_check; } /* attempt to detect infinite loop to avoid unnecessary doomed work */ if (states_maybe_looping(&sl->state, cur) && - states_equal(env, &sl->state, cur) && + states_equal(env, &sl->state, cur, false) && !iter_active_depths_differ(&sl->state, cur)) { verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); + verbose(env, "cur state:"); + print_verifier_state(env, cur->frame[cur->curframe], true); + verbose(env, "old state:"); + print_verifier_state(env, sl->state.frame[cur->curframe], true); return -EINVAL; } /* if the verifier is processing a loop, avoid adding new state @@ -16559,7 +16961,36 @@ skip_inf_loop_check: add_new_state = false; goto miss; } - if (states_equal(env, &sl->state, cur)) { + /* If sl->state is a part of a loop and this loop's entry is a part of + * current verification path then states have to be compared exactly. + * 'force_exact' is needed to catch the following case: + * + * initial Here state 'succ' was processed first, + * | it was eventually tracked to produce a + * V state identical to 'hdr'. + * .---------> hdr All branches from 'succ' had been explored + * | | and thus 'succ' has its .branches == 0. + * | V + * | .------... Suppose states 'cur' and 'succ' correspond + * | | | to the same instruction + callsites. + * | V V In such case it is necessary to check + * | ... ... if 'succ' and 'cur' are states_equal(). + * | | | If 'succ' and 'cur' are a part of the + * | V V same loop exact flag has to be set. + * | succ <- cur To check if that is the case, verify + * | | if loop entry of 'succ' is in current + * | V DFS path. + * | ... + * | | + * '----' + * + * Additional details are in the comment before get_loop_entry(). + */ + loop_entry = get_loop_entry(&sl->state); + force_exact = loop_entry && loop_entry->branches > 0; + if (states_equal(env, &sl->state, cur, force_exact)) { + if (force_exact) + update_loop_entry(cur, loop_entry); hit: sl->hit_cnt++; /* reached equivalent register/stack state, @@ -16598,13 +17029,18 @@ miss: * to keep checking from state equivalence point of view. * Higher numbers increase max_states_per_insn and verification time, * but do not meaningfully decrease insn_processed. + * 'n' controls how many times state could miss before eviction. + * Use bigger 'n' for checkpoints because evicting checkpoint states + * too early would hinder iterator convergence. */ - if (sl->miss_cnt > sl->hit_cnt * 3 + 3) { + n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3; + if (sl->miss_cnt > sl->hit_cnt * n + n) { /* the state is unlikely to be useful. Remove it to * speed up verification */ *pprev = sl->next; - if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE && + !sl->state.used_as_loop_entry) { u32 br = sl->state.branches; WARN_ONCE(br, @@ -16673,6 +17109,7 @@ next: cur->parent = new; cur->first_insn_idx = insn_idx; + cur->dfs_depth = new->dfs_depth + 1; clear_jmp_history(cur); new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1fb7f562289d..b6d64f3b8888 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4917,9 +4917,11 @@ repeat: void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { + unsigned long irqflags; + memset(it, 0, sizeof(*it)); - spin_lock_irq(&css_set_lock); + spin_lock_irqsave(&css_set_lock, irqflags); it->ss = css->ss; it->flags = flags; @@ -4933,7 +4935,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, css_task_iter_advance(it); - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, irqflags); } /** @@ -4946,12 +4948,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { + unsigned long irqflags; + if (it->cur_task) { put_task_struct(it->cur_task); it->cur_task = NULL; } - spin_lock_irq(&css_set_lock); + spin_lock_irqsave(&css_set_lock, irqflags); /* @it may be half-advanced by skips, finish advancing */ if (it->flags & CSS_TASK_ITER_SKIPPED) @@ -4964,7 +4968,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) css_task_iter_advance(it); } - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, irqflags); return it->cur_task; } @@ -4977,11 +4981,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) */ void css_task_iter_end(struct css_task_iter *it) { + unsigned long irqflags; + if (it->cur_cset) { - spin_lock_irq(&css_set_lock); + spin_lock_irqsave(&css_set_lock, irqflags); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, irqflags); } if (it->cur_dcset) |